├── examples ├── .gitignore ├── parallel_learning │ ├── mlist.txt │ ├── predict.conf │ ├── README.md │ └── train.conf ├── lambdarank │ ├── predict.conf │ ├── rank.test.query │ ├── README.md │ ├── rank.train.query │ └── train.conf ├── regression │ ├── predict.conf │ ├── README.md │ └── train.conf ├── binary_classification │ ├── predict.conf │ ├── README.md │ └── train.conf ├── multiclass_classification │ ├── predict.conf │ ├── README.md │ └── train.conf ├── README.md └── python-guide │ ├── sklearn_example.py │ ├── README.md │ ├── plot_example.py │ └── simple_example.py ├── R-package ├── tests │ ├── testthat.R │ └── testthat │ │ ├── test_custom_objective.R │ │ ├── test_dataset.R │ │ └── test_basic.R ├── data │ ├── agaricus.test.rda │ └── agaricus.train.rda ├── unix_build_package.sh ├── win_build_package.cmd ├── demo │ ├── README.md │ ├── 00Index │ ├── boost_from_prediction.R │ ├── cross_validation.R │ ├── early_stopping.R │ └── multiclass.R ├── src │ ├── Makevars │ ├── Makevars.win │ ├── Makevars_fullcode │ ├── Makevars_fullcode.win │ ├── lightgbm-fullcode.cpp │ ├── lightgbm-all.cpp │ └── R_object_helper.h ├── man │ ├── lgb.Dataset.construct.Rd │ ├── lgb.get.eval.result.Rd │ ├── lgb.Dataset.save.Rd │ ├── lgb.Dataset.set.categorical.Rd │ ├── slice.Rd │ ├── lgb.Dataset.set.reference.Rd │ ├── agaricus.test.Rd │ ├── agaricus.train.Rd │ ├── dim.Rd │ ├── lgb.Dataset.create.valid.Rd │ ├── lgb.load.Rd │ ├── lgb.dump.Rd │ ├── dimnames.lgb.Dataset.Rd │ ├── lgb.save.Rd │ ├── getinfo.Rd │ ├── readRDS.lgb.Booster.Rd │ ├── setinfo.Rd │ ├── lgb.Dataset.Rd │ ├── lgb.importance.Rd │ ├── lgb.interprete.Rd │ ├── lgb.plot.importance.Rd │ ├── lgb.unloader.Rd │ ├── lgb.model.dt.tree.Rd │ ├── lgb.plot.interpretation.Rd │ ├── saveRDS.lgb.Booster.Rd │ └── predict.lgb.Booster.Rd ├── NAMESPACE ├── LICENSE ├── DESCRIPTION ├── R │ ├── readRDS.lgb.Booster.R │ ├── lgb.importance.R │ ├── lgb.unloader.R │ ├── lgb.plot.importance.R │ ├── saveRDS.lgb.Booster.R │ └── lightgbm.R └── README.md ├── docs ├── Installation-Guide.md ├── Parallel-Learning-Guide.md ├── Readme.md ├── Parameters-tuning.md ├── FAQ.md └── development.md ├── .gitmodules ├── .github └── ISSUE_TEMPLATE.md ├── include └── LightGBM │ ├── export.h │ ├── meta.h │ ├── utils │ ├── threading.h │ ├── openmp_wrapper.h │ ├── pipeline_reader.h │ ├── log.h │ └── random.h │ ├── objective_function.h │ ├── application.h │ ├── tree_learner.h │ ├── dataset_loader.h │ └── metric.h ├── src ├── main.cpp ├── network │ └── linkers_mpi.cpp ├── treelearner │ ├── tree_learner.cpp │ ├── feature_parallel_tree_learner.cpp │ ├── split_info.hpp │ └── leaf_splits.hpp ├── metric │ ├── metric.cpp │ └── dcg_calculator.cpp ├── boosting │ ├── boosting.cpp │ └── score_updater.hpp ├── objective │ └── objective_function.cpp └── io │ ├── parser.hpp │ └── parser.cpp ├── docker ├── README.md └── dockerfile-python ├── python-package ├── lightgbm │ ├── __init__.py │ ├── libpath.py │ └── compat.py ├── setup.py └── README.rst ├── LICENSE ├── .travis └── amd_sdk.sh ├── windows └── LightGBM.sln ├── pmml └── README.md ├── tests └── python_package_test │ └── test_basic.py ├── .travis.yml └── CMakeLists.txt /examples/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | -------------------------------------------------------------------------------- /examples/parallel_learning/mlist.txt: -------------------------------------------------------------------------------- 1 | 192.168.1.101 12400 2 | 192.168.1.102 12400 3 | 
-------------------------------------------------------------------------------- /R-package/tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(lightgbm) 3 | 4 | test_check("lightgbm") 5 | -------------------------------------------------------------------------------- /docs/Installation-Guide.md: -------------------------------------------------------------------------------- 1 | Refer to https://github.com/Microsoft/LightGBM/wiki/Installation-Guide. 2 | -------------------------------------------------------------------------------- /docs/Parallel-Learning-Guide.md: -------------------------------------------------------------------------------- 1 | Refer to https://github.com/Microsoft/LightGBM/wiki/Parallel-Learning-Guide -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "include/boost/compute"] 2 | path = compute 3 | url = https://github.com/boostorg/compute 4 | -------------------------------------------------------------------------------- /examples/lambdarank/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = rank.test 5 | 6 | input_model= LightGBM_model.txt 7 | -------------------------------------------------------------------------------- /R-package/data/agaricus.test.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huanzhang12/lightgbm-gpu/HEAD/R-package/data/agaricus.test.rda -------------------------------------------------------------------------------- /R-package/data/agaricus.train.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huanzhang12/lightgbm-gpu/HEAD/R-package/data/agaricus.train.rda -------------------------------------------------------------------------------- /examples/regression/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = regression.test 5 | 6 | input_model= LightGBM_model.txt 7 | -------------------------------------------------------------------------------- /examples/binary_classification/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = binary.test 5 | 6 | input_model= LightGBM_model.txt 7 | -------------------------------------------------------------------------------- /examples/parallel_learning/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = binary.test 5 | 6 | input_model= LightGBM_model.txt 7 | 8 | -------------------------------------------------------------------------------- /examples/multiclass_classification/predict.conf: -------------------------------------------------------------------------------- 1 | task = predict 2 | 3 | data = multiclass.test 4 | 5 | input_model= LightGBM_model.txt 6 | -------------------------------------------------------------------------------- /R-package/unix_build_package.sh: -------------------------------------------------------------------------------- 1 | cp ../include ./src/include -rf 2 | cp ../src ./src/src -rf 3 | rm ./src/Makevars 4 | cp ./src/Makevars_fullcode ./src/Makevars -f 5 | R CMD build --no-build-vignettes 
. -------------------------------------------------------------------------------- /R-package/win_build_package.cmd: -------------------------------------------------------------------------------- 1 | xcopy ..\include src\include /e /i /y 2 | xcopy ..\src src\src /e /i /y 3 | del .\src\Makevars.win 4 | copy .\src\Makevars_fullcode.win .\src\Makevars.win /y 5 | R CMD build --no-build-vignettes . -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | Examples 2 | ===================== 3 | 4 | You can learn how to use LightGBM by these examples. If you have any questions, please refer to our [wiki](https://github.com/Microsoft/LightGBM/wiki). 5 | 6 | -------------------------------------------------------------------------------- /R-package/demo/README.md: -------------------------------------------------------------------------------- 1 | LightGBM R examples 2 | ==== 3 | * [Basic walkthrough of wrappers](basic_walkthrough.R) 4 | * [Boosting from existing prediction](boost_from_prediction.R) 5 | * [Early Stopping](early_stopping.R) 6 | * [Cross Validation](cross_validation.R) 7 | * [Multiclass Training/Prediction](multiclass.R) 8 | * [Leaf (in)Stability](leaf_stability.R) 9 | -------------------------------------------------------------------------------- /R-package/demo/00Index: -------------------------------------------------------------------------------- 1 | basic_walkthrough Basic feature walkthrough 2 | boost_from_prediction Boosting from existing prediction 3 | early_stopping Early Stop in training 4 | cross_validation Cross Validation 5 | multiclass Multiclass training/prediction 6 | leaf_stability Leaf (in)Stability example 7 | -------------------------------------------------------------------------------- /R-package/src/Makevars: -------------------------------------------------------------------------------- 1 | # package root 2 | PKGROOT=../../ 3 | 4 | ENABLE_STD_THREAD=1 5 | CXX_STD = CXX11 6 | 7 | LGBM_RFLAGS = -DUSE_SOCKET 8 | 9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS) -Wno-deprecated-declarations 10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11 11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) 12 | OBJECTS = ./lightgbm-all.o ./lightgbm_R.o 13 | -------------------------------------------------------------------------------- /R-package/src/Makevars.win: -------------------------------------------------------------------------------- 1 | # package root 2 | PKGROOT=../../ 3 | 4 | ENABLE_STD_THREAD=1 5 | CXX_STD = CXX11 6 | 7 | LGBM_RFLAGS = -DUSE_SOCKET 8 | 9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS) 10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11 11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -lws2_32 -liphlpapi 12 | OBJECTS = ./lightgbm-all.o ./lightgbm_R.o 13 | -------------------------------------------------------------------------------- /examples/lambdarank/rank.test.query: -------------------------------------------------------------------------------- 1 | 12 2 | 19 3 | 18 4 | 10 5 | 15 6 | 15 7 | 22 8 | 23 9 | 18 10 | 16 11 | 16 12 | 11 13 | 6 14 | 13 15 | 17 16 | 21 17 | 20 18 | 16 19 | 13 20 | 16 21 | 21 22 | 15 23 | 10 24 | 19 25 | 10 26 | 13 27 | 18 28 | 17 29 | 23 30 | 24 31 | 16 32 | 13 33 | 17 34 | 24 35 | 17 36 | 10 37 | 17 38 | 15 39 | 18 40 | 16 41 | 9 42 | 9 43 | 21 44 | 14 45 | 13 46 | 13 47 | 13 48 | 10 49 | 10 50 | 6 51 | 
--------------------------------------------------------------------------------
/R-package/src/Makevars_fullcode:
--------------------------------------------------------------------------------
1 | # package root
2 | PKGROOT=.
3 | 
4 | ENABLE_STD_THREAD=1
5 | CXX_STD = CXX11
6 | 
7 | LGBM_RFLAGS = -DUSE_SOCKET
8 | 
9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS) -Wno-deprecated-declarations
10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11
11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
12 | OBJECTS = ./lightgbm-fullcode.o ./lightgbm_R.o
13 | 
--------------------------------------------------------------------------------
/R-package/src/Makevars_fullcode.win:
--------------------------------------------------------------------------------
1 | # package root
2 | PKGROOT=.
3 | 
4 | ENABLE_STD_THREAD=1
5 | CXX_STD = CXX11
6 | 
7 | LGBM_RFLAGS = -DUSE_SOCKET
8 | 
9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS)
10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11
11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -lws2_32 -liphlpapi
12 | OBJECTS = ./lightgbm-fullcode.o ./lightgbm_R.o
13 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please search previous issues, Stack Overflow, and other search engines for your question before opening a new issue.
2 | 
3 | For bugs and unexpected behavior, please provide the following information so that we can reproduce the problem on our systems.
4 | 
5 | ## Environment info
6 | Operating System:
7 | CPU:
8 | C++/Python/R version:
9 | 
10 | ## Error Message:
11 | 
12 | ## Reproducible examples
13 | 
14 | ## Steps to reproduce
15 | 
16 | 1.
17 | 2.
18 | 3.
19 | 
--------------------------------------------------------------------------------
/include/LightGBM/export.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_EXPORT_H_
2 | #define LIGHTGBM_EXPORT_H_
3 | 
4 | /** Macros for exporting symbols in MSVC/GCC/CLANG **/
5 | 
6 | #ifdef __cplusplus
7 | #define LIGHTGBM_EXTERN_C extern "C"
8 | #else
9 | #define LIGHTGBM_EXTERN_C
10 | #endif
11 | 
12 | 
13 | #ifdef _MSC_VER
14 | #define LIGHTGBM_EXPORT __declspec(dllexport)
15 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
16 | #else
17 | #define LIGHTGBM_EXPORT
18 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
19 | #endif
20 | 
21 | #endif /** LIGHTGBM_EXPORT_H_ **/
22 | 
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <LightGBM/application.h>
3 | 
4 | int main(int argc, char** argv) {
5 |   try {
6 |     LightGBM::Application app(argc, argv);
7 |     app.Run();
8 |   }
9 |   catch (const std::exception& ex) {
10 |     std::cerr << "Caught exception:" << std::endl;
11 |     std::cerr << ex.what() << std::endl;
12 |     exit(-1);
13 |   }
14 |   catch (const std::string& ex) {
15 |     std::cerr << "Caught exception:" << std::endl;
16 |     std::cerr << ex << std::endl;
17 |     exit(-1);
18 |   }
19 |   catch (...)
{
20 |     std::cerr << "Unknown exception" << std::endl;
21 |     exit(-1);
22 |   }
23 | }
--------------------------------------------------------------------------------
/docs/Readme.md:
--------------------------------------------------------------------------------
1 | Documents
2 | =========
3 | * [Installation Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide)
4 | * [Quick Start](./Quick-Start.md)
5 | * [Python Quick Start](./Python-intro.md)
6 | * [Features](https://github.com/Microsoft/LightGBM/wiki/Features)
7 | * [Experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments)
8 | * [Parameters](./Parameters.md)
9 | * [Parameters Tuning](./Parameters-tuning.md)
10 | * [Python API Reference](./Python-API.md)
11 | * [Parallel Learning Guide](https://github.com/Microsoft/LightGBM/wiki/Parallel-Learning-Guide)
12 | * [FAQ](./FAQ.md)
13 | * [Development Guide](./development.md)
14 | 
15 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.construct.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.construct}
4 | \alias{lgb.Dataset.construct}
5 | \title{Construct Dataset explicitly}
6 | \usage{
7 | lgb.Dataset.construct(dataset)
8 | }
9 | \arguments{
10 | \item{dataset}{Object of class \code{lgb.Dataset}}
11 | }
12 | \description{
13 | Construct Dataset explicitly
14 | }
15 | \examples{
16 | \dontrun{
17 | library(lightgbm)
18 | data(agaricus.train, package = "lightgbm")
19 | train <- agaricus.train
20 | dtrain <- lgb.Dataset(train$data, label = train$label)
21 | lgb.Dataset.construct(dtrain)
22 | }
23 | 
24 | }
25 | 
26 | 
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Using LightGBM via Docker
2 | 
3 | This directory contains a `Dockerfile` to make it easy to build and run LightGBM via [Docker](http://www.docker.com/).
4 | 
5 | ## Installing Docker
6 | 
7 | Follow the general installation instructions
8 | [on the Docker site](https://docs.docker.com/installation/):
9 | 
10 | * [OSX](https://docs.docker.com/installation/mac/): [docker toolbox](https://www.docker.com/toolbox)
11 | * [Ubuntu](https://docs.docker.com/installation/ubuntulinux/)
12 | 
13 | ## Running the container
14 | 
15 | Build the container (for Python users):
16 | 
17 |     $ docker build -t lightgbm -f dockerfile-python .
18 | 
19 | After the build finishes, run the container:
20 | 
21 |     $ docker run --rm -it lightgbm
22 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.get.eval.result.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.get.eval.result}
4 | \alias{lgb.get.eval.result}
5 | \title{Get record evaluation result from booster}
6 | \usage{
7 | lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
8 |   is_err = FALSE)
9 | }
10 | \arguments{
11 | \item{booster}{Object of class \code{lgb.Booster}}
12 | 
13 | \item{data_name}{name of the dataset}
14 | 
15 | \item{eval_name}{name of the evaluation metric}
16 | 
17 | \item{iters}{iterations; NULL will return all}
18 | 
19 | \item{is_err}{TRUE will return the evaluation error instead}
20 | }
21 | \value{
22 | vector of evaluation results
23 | }
24 | \description{
25 | Get record evaluation result from booster
26 | }
27 | 
28 | 
--------------------------------------------------------------------------------
/examples/lambdarank/README.md:
--------------------------------------------------------------------------------
1 | LambdaRank Example
2 | =====================
3 | Here is an example of using LightGBM for a lambdarank task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
--------------------------------------------------------------------------------
/examples/regression/README.md:
--------------------------------------------------------------------------------
1 | Regression Example
2 | =====================
3 | Here is an example of using LightGBM for a regression task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.save}
4 | \alias{lgb.Dataset.save}
5 | \title{Save \code{lgb.Dataset} to a binary file}
6 | \usage{
7 | lgb.Dataset.save(dataset, fname)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 | 
12 | \item{fname}{filename of the output file}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | Save \code{lgb.Dataset} to a binary file
19 | }
20 | \examples{
21 | 
22 | \dontrun{
23 | library(lightgbm)
24 | data(agaricus.train, package = "lightgbm")
25 | train <- agaricus.train
26 | dtrain <- lgb.Dataset(train$data, label = train$label)
27 | lgb.Dataset.save(dtrain, "data.bin")
28 | }
29 | 
30 | }
31 | 
32 | 
--------------------------------------------------------------------------------
/examples/binary_classification/README.md:
--------------------------------------------------------------------------------
1 | Binary Classification Example
2 | =====================
3 | Here is an example of using LightGBM for a binary classification task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
35 | 
--------------------------------------------------------------------------------
/examples/multiclass_classification/README.md:
--------------------------------------------------------------------------------
1 | Multiclass Classification Example
2 | =====================
3 | Here is an example of using LightGBM for a multiclass classification task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
--------------------------------------------------------------------------------
/include/LightGBM/meta.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_META_H_
2 | #define LIGHTGBM_META_H_
3 | 
4 | #include <cstdint>
5 | 
6 | #include <limits>
7 | #include <vector>
8 | #include <utility>
9 | #include <functional>
10 | 
11 | namespace LightGBM {
12 | 
13 | /*! \brief Type of data size; it is better to use a signed type */
14 | typedef int32_t data_size_t;
15 | /*! \brief Type of scores and gradients */
16 | typedef float score_t;
17 | 
18 | const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
19 | 
20 | const score_t kEpsilon = 1e-15f;
21 | 
22 | using ReduceFunction = std::function<void(const char*, char*, int)>;
23 | 
24 | using PredictFunction =
25 |   std::function<void(const std::vector<std::pair<int, double>>&, double* output)>;
26 | 
27 | #define NO_SPECIFIC (-1)
28 | 
29 | }  // namespace LightGBM
30 | 
31 | #endif   // LightGBM_META_H_
32 | 
--------------------------------------------------------------------------------
/examples/parallel_learning/README.md:
--------------------------------------------------------------------------------
1 | Parallel Learning Example
2 | =====================
3 | Here is an example of using LightGBM to perform parallel learning across 2 machines.
4 | 
5 | 1. Edit mlist.txt and write the IPs of the 2 machines that you want to run the application on:
6 | 
7 | ```
8 | machine1_ip 12400
9 | machine2_ip 12400
10 | ```
11 | 
12 | 2. Copy this folder and the executable file to both machines.
13 | 3. Run the following command in this folder on both machines:
14 | 
15 | For Windows: ```lightgbm.exe config=train.conf```
16 | 
17 | For Linux: ```./lightgbm config=train.conf```
18 | 
19 | This parallel learning example is socket-based. LightGBM also supports MPI-based parallel learning (a minimal launch sketch follows at the end of this README).
20 | 
21 | For more details about the usage of parallel learning, please refer to [this guide](https://github.com/Microsoft/LightGBM/wiki/Parallel-Learning-Guide).
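With MPI, the training job is launched once from a single machine instead of being started by hand on every machine. A minimal sketch, assuming LightGBM was compiled with MPI support (`cmake -DUSE_MPI=ON ..`) and a hypothetical `machinefile.txt` that lists one host per line (an MPI machine file, unlike `mlist.txt`, carries no port numbers):

```
mpiexec --machinefile machinefile.txt ./lightgbm config=train.conf
```

`mpiexec` then starts one worker process on every listed host, so the command does not need to be repeated on each machine.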
22 | -------------------------------------------------------------------------------- /R-package/man/lgb.Dataset.set.categorical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{lgb.Dataset.set.categorical} 4 | \alias{lgb.Dataset.set.categorical} 5 | \title{Set categorical feature of \code{lgb.Dataset}} 6 | \usage{ 7 | lgb.Dataset.set.categorical(dataset, categorical_feature) 8 | } 9 | \arguments{ 10 | \item{dataset}{object of class \code{lgb.Dataset}} 11 | 12 | \item{categorical_feature}{categorical features} 13 | } 14 | \value{ 15 | passed dataset 16 | } 17 | \description{ 18 | Set categorical feature of \code{lgb.Dataset} 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(agaricus.train, package = "lightgbm") 24 | train <- agaricus.train 25 | dtrain <- lgb.Dataset(train$data, label = train$label) 26 | lgb.Dataset.save(dtrain, "lgb.Dataset.data") 27 | dtrain <- lgb.Dataset("lgb.Dataset.data") 28 | lgb.Dataset.set.categorical(dtrain, 1:2) 29 | } 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/network/linkers_mpi.cpp: -------------------------------------------------------------------------------- 1 | #ifdef USE_MPI 2 | #include "linkers.h" 3 | 4 | namespace LightGBM { 5 | 6 | Linkers::Linkers(NetworkConfig config) { 7 | int argc = 0; 8 | char**argv = nullptr; 9 | int flag = 0; 10 | MPI_SAFE_CALL(MPI_Initialized(&flag)); // test if MPI has been initialized 11 | if (!flag) { // if MPI not started, start it 12 | MPI_SAFE_CALL(MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &flag)); 13 | } 14 | MPI_SAFE_CALL(MPI_Comm_size(MPI_COMM_WORLD, &num_machines_)); 15 | MPI_SAFE_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank_)); 16 | // wait for all client start up 17 | MPI_SAFE_CALL(MPI_Barrier(MPI_COMM_WORLD)); 18 | bruck_map_ = BruckMap::Construct(rank_, num_machines_); 19 | recursive_halving_map_ = RecursiveHalvingMap::Construct(rank_, num_machines_); 20 | } 21 | 22 | Linkers::~Linkers() { 23 | MPI_SAFE_CALL(MPI_Finalize()); 24 | } 25 | 26 | 27 | } // namespace LightGBM 28 | #endif // USE_MPI 29 | 30 | -------------------------------------------------------------------------------- /R-package/man/slice.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{slice} 4 | \alias{slice} 5 | \alias{slice.lgb.Dataset} 6 | \title{Slice a dataset} 7 | \usage{ 8 | slice(dataset, ...) 9 | 10 | \method{slice}{lgb.Dataset}(dataset, idxset, ...) 
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class "lgb.Dataset"}
14 | 
15 | \item{...}{other parameters (currently not used)}
16 | 
17 | \item{idxset}{an integer vector of indices of the rows needed}
18 | }
19 | \value{
20 | constructed sub dataset
21 | }
22 | \description{
23 | Get a new \code{lgb.Dataset} containing the specified rows of
24 | the original lgb.Dataset object
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | 
33 | dsub <- lightgbm::slice(dtrain, 1:42)
34 | labels <- lightgbm::getinfo(dsub, "label")
35 | }
36 | 
37 | }
38 | 
39 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.set.reference.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.set.reference}
4 | \alias{lgb.Dataset.set.reference}
5 | \title{Set reference of \code{lgb.Dataset}}
6 | \usage{
7 | lgb.Dataset.set.reference(dataset, reference)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 | 
12 | \item{reference}{object of class \code{lgb.Dataset}}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | If you want to use validation data, you should set its reference to the training data
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset(test$data, label = test$label)
29 | lgb.Dataset.set.reference(dtest, dtrain)
30 | }
31 | 
32 | }
33 | 
34 | 
--------------------------------------------------------------------------------
/R-package/man/agaricus.test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{agaricus.test}
5 | \alias{agaricus.test}
6 | \title{Test part from Mushroom Data Set}
7 | \format{A list containing a label vector, and a dgCMatrix object with 1611
8 | rows and 126 variables}
9 | \usage{
10 | data(agaricus.test)
11 | }
12 | \description{
13 | This data set is originally from the Mushroom data set,
14 | UCI Machine Learning Repository.
15 | }
16 | \details{
17 | This data set includes the following fields:
18 | 
19 | \itemize{
20 | \item \code{label} the label for each record
21 | \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
22 | }
23 | }
24 | \references{
25 | https://archive.ics.uci.edu/ml/datasets/Mushroom
26 | 
27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
29 | School of Information and Computer Science.
30 | } 31 | \keyword{datasets} 32 | 33 | -------------------------------------------------------------------------------- /R-package/man/agaricus.train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lightgbm.R 3 | \docType{data} 4 | \name{agaricus.train} 5 | \alias{agaricus.train} 6 | \title{Training part from Mushroom Data Set} 7 | \format{A list containing a label vector, and a dgCMatrix object with 6513 8 | rows and 127 variables} 9 | \usage{ 10 | data(agaricus.train) 11 | } 12 | \description{ 13 | This data set is originally from the Mushroom data set, 14 | UCI Machine Learning Repository. 15 | } 16 | \details{ 17 | This data set includes the following fields: 18 | 19 | \itemize{ 20 | \item \code{label} the label for each record 21 | \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns. 22 | } 23 | } 24 | \references{ 25 | https://archive.ics.uci.edu/ml/datasets/Mushroom 26 | 27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 29 | School of Information and Computer Science. 30 | } 31 | \keyword{datasets} 32 | 33 | -------------------------------------------------------------------------------- /python-package/lightgbm/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """LightGBM, Light Gradient Boosting Machine. 3 | 4 | Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors 5 | """ 6 | 7 | from __future__ import absolute_import 8 | 9 | from .basic import Booster, Dataset 10 | from .callback import (early_stopping, print_evaluation, record_evaluation, 11 | reset_parameter) 12 | from .engine import cv, train 13 | 14 | try: 15 | from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker 16 | except ImportError: 17 | pass 18 | try: 19 | from .plotting import plot_importance, plot_metric, plot_tree, create_tree_digraph 20 | except ImportError: 21 | pass 22 | 23 | 24 | __version__ = 0.1 25 | 26 | __all__ = ['Dataset', 'Booster', 27 | 'train', 'cv', 28 | 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 29 | 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 30 | 'plot_importance', 'plot_metric', 'plot_tree', 'create_tree_digraph'] 31 | -------------------------------------------------------------------------------- /R-package/man/dim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{dim.lgb.Dataset} 4 | \alias{dim.lgb.Dataset} 5 | \title{Dimensions of an lgb.Dataset} 6 | \usage{ 7 | \method{dim}{lgb.Dataset}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Object of class \code{lgb.Dataset}} 11 | 12 | \item{...}{other parameters} 13 | } 14 | \value{ 15 | a vector of numbers of rows and of columns 16 | } 17 | \description{ 18 | Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}. 19 | } 20 | \details{ 21 | Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also 22 | be directly used with an \code{lgb.Dataset} object. 
23 | } 24 | \examples{ 25 | \dontrun{ 26 | library(lightgbm) 27 | data(agaricus.train, package = "lightgbm") 28 | train <- agaricus.train 29 | dtrain <- lgb.Dataset(train$data, label = train$label) 30 | 31 | stopifnot(nrow(dtrain) == nrow(train$data)) 32 | stopifnot(ncol(dtrain) == ncol(train$data)) 33 | stopifnot(all(dim(dtrain) == dim(train$data))) 34 | } 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /R-package/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("dimnames<-",lgb.Dataset) 4 | S3method(dim,lgb.Dataset) 5 | S3method(dimnames,lgb.Dataset) 6 | S3method(getinfo,lgb.Dataset) 7 | S3method(predict,lgb.Booster) 8 | S3method(setinfo,lgb.Dataset) 9 | S3method(slice,lgb.Dataset) 10 | export(getinfo) 11 | export(lgb.Dataset) 12 | export(lgb.Dataset.construct) 13 | export(lgb.Dataset.create.valid) 14 | export(lgb.Dataset.save) 15 | export(lgb.Dataset.set.categorical) 16 | export(lgb.Dataset.set.reference) 17 | export(lgb.cv) 18 | export(lgb.dump) 19 | export(lgb.get.eval.result) 20 | export(lgb.importance) 21 | export(lgb.interprete) 22 | export(lgb.load) 23 | export(lgb.model.dt.tree) 24 | export(lgb.plot.importance) 25 | export(lgb.plot.interpretation) 26 | export(lgb.save) 27 | export(lgb.train) 28 | export(lgb.unloader) 29 | export(lightgbm) 30 | export(readRDS.lgb.Booster) 31 | export(saveRDS.lgb.Booster) 32 | export(setinfo) 33 | export(slice) 34 | import(methods) 35 | importFrom(R6,R6Class) 36 | importFrom(data.table,":=") 37 | importFrom(magrittr,"%>%") 38 | importFrom(magrittr,"%T>%") 39 | useDynLib(lightgbm) 40 | -------------------------------------------------------------------------------- /R-package/man/lgb.Dataset.create.valid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{lgb.Dataset.create.valid} 4 | \alias{lgb.Dataset.create.valid} 5 | \title{Construct validation data} 6 | \usage{ 7 | lgb.Dataset.create.valid(dataset, data, info = list(), ...) 
8 | } 9 | \arguments{ 10 | \item{dataset}{\code{lgb.Dataset} object, training data} 11 | 12 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} 13 | 14 | \item{info}{a list of information of the lgb.Dataset object} 15 | 16 | \item{...}{other information to pass to \code{info}.} 17 | } 18 | \value{ 19 | constructed dataset 20 | } 21 | \description{ 22 | Construct validation data according to training data 23 | } 24 | \examples{ 25 | \dontrun{ 26 | library(lightgbm) 27 | data(agaricus.train, package = "lightgbm") 28 | train <- agaricus.train 29 | dtrain <- lgb.Dataset(train$data, label = train$label) 30 | data(agaricus.test, package = "lightgbm") 31 | test <- agaricus.test 32 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 33 | } 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_custom_objective.R: -------------------------------------------------------------------------------- 1 | context('Test models with custom objective') 2 | 3 | data(agaricus.train, package='lightgbm') 4 | data(agaricus.test, package='lightgbm') 5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 7 | watchlist <- list(eval = dtest, train = dtrain) 8 | 9 | logregobj <- function(preds, dtrain) { 10 | labels <- getinfo(dtrain, "label") 11 | preds <- 1 / (1 + exp(-preds)) 12 | grad <- preds - labels 13 | hess <- preds * (1 - preds) 14 | return(list(grad = grad, hess = hess)) 15 | } 16 | 17 | evalerror <- function(preds, dtrain) { 18 | labels <- getinfo(dtrain, "label") 19 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels) 20 | return(list(name = "error", value = err, higher_better=FALSE)) 21 | } 22 | 23 | param <- list(num_leaves=8, learning_rate=1, 24 | objective=logregobj, metric="auc") 25 | num_round <- 10 26 | 27 | test_that("custom objective works", { 28 | bst <- lgb.train(param, dtrain, num_round, watchlist, eval = evalerror) 29 | expect_false(is.null(bst$record_evals)) 30 | }) 31 | -------------------------------------------------------------------------------- /python-package/setup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable=invalid-name, exec-used 3 | """Setup lightgbm package.""" 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import sys 8 | 9 | from setuptools import find_packages, setup 10 | 11 | sys.path.insert(0, '.') 12 | 13 | CURRENT_DIR = os.path.dirname(__file__) 14 | 15 | libpath_py = os.path.join(CURRENT_DIR, 'lightgbm/libpath.py') 16 | libpath = {'__file__': libpath_py} 17 | exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath) 18 | 19 | LIB_PATH = [os.path.relpath(path, CURRENT_DIR) for path in libpath['find_lib_path']()] 20 | print("Install lib_lightgbm from: %s" % LIB_PATH) 21 | 22 | 23 | setup(name='lightgbm', 24 | version=0.1, 25 | description="LightGBM Python Package", 26 | install_requires=[ 27 | 'numpy', 28 | 'scipy', 29 | ], 30 | maintainer='Guolin Ke', 31 | maintainer_email='guolin.ke@microsoft.com', 32 | zip_safe=False, 33 | packages=find_packages(), 34 | include_package_data=True, 35 | data_files=[('lightgbm', LIB_PATH)], 36 | url='https://github.com/Microsoft/LightGBM') 37 | -------------------------------------------------------------------------------- /R-package/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /R-package/man/lgb.load.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Booster.R 3 | \name{lgb.load} 4 | \alias{lgb.load} 5 | \title{Load LightGBM model} 6 | \usage{ 7 | lgb.load(filename) 8 | } 9 | \arguments{ 10 | \item{filename}{path of model file} 11 | } 12 | \value{ 13 | booster 14 | } 15 | \description{ 16 | Load LightGBM model from saved model file 17 | } 18 | \examples{ 19 | \dontrun{ 20 | library(lightgbm) 21 | data(agaricus.train, package = "lightgbm") 22 | train <- agaricus.train 23 | dtrain <- lgb.Dataset(train$data, label = train$label) 24 | data(agaricus.test, package = "lightgbm") 25 | test <- agaricus.test 26 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 27 | params <- list(objective = "regression", metric = "l2") 28 | valids <- list(test = dtest) 29 | model <- lgb.train(params, 30 | dtrain, 31 | 100, 32 | valids, 33 | min_data = 1, 34 | learning_rate = 1, 35 | early_stopping_rounds = 10) 36 | lgb.save(model, "model.txt") 37 | load_booster <- lgb.load("model.txt") 38 | } 39 | 40 | } 41 | 42 | -------------------------------------------------------------------------------- /R-package/src/lightgbm-fullcode.cpp: -------------------------------------------------------------------------------- 1 | // application 2 | #include "./src/application/application.cpp" 3 | 4 | // boosting 5 | #include "./src/boosting/boosting.cpp" 6 | #include "./src/boosting/gbdt.cpp" 7 | 8 | // io 9 | #include "./src/io/bin.cpp" 10 | #include "./src/io/config.cpp" 11 | #include "./src/io/dataset.cpp" 12 | #include "./src/io/dataset_loader.cpp" 13 | #include "./src/io/metadata.cpp" 14 | #include "./src/io/parser.cpp" 15 | #include "./src/io/tree.cpp" 16 | 17 | // metric 18 | #include "./src/metric/dcg_calculator.cpp" 19 | #include "./src/metric/metric.cpp" 20 | 21 | // network 22 | #include "./src/network/linker_topo.cpp" 23 | #include "./src/network/linkers_socket.cpp" 24 | #include "./src/network/network.cpp" 25 | 26 | // objective 27 | #include "./src/objective/objective_function.cpp" 28 | 29 | // treelearner 30 | #include "./src/treelearner/data_parallel_tree_learner.cpp" 31 | #include "./src/treelearner/feature_parallel_tree_learner.cpp" 32 | #include "./src/treelearner/serial_tree_learner.cpp" 33 | #include "./src/treelearner/tree_learner.cpp" 34 | #include "./src/treelearner/voting_parallel_tree_learner.cpp" 35 | 36 | // c_api 37 | #include "./src/c_api.cpp" 38 | -------------------------------------------------------------------------------- /R-package/src/lightgbm-all.cpp: -------------------------------------------------------------------------------- 1 | // application 2 | #include "../../src/application/application.cpp" 3 | 4 | // boosting 5 | #include "../../src/boosting/boosting.cpp" 6 | #include "../../src/boosting/gbdt.cpp" 7 | 8 | // io 9 | #include 
"../../src/io/bin.cpp" 10 | #include "../../src/io/config.cpp" 11 | #include "../../src/io/dataset.cpp" 12 | #include "../../src/io/dataset_loader.cpp" 13 | #include "../../src/io/metadata.cpp" 14 | #include "../../src/io/parser.cpp" 15 | #include "../../src/io/tree.cpp" 16 | 17 | // metric 18 | #include "../../src/metric/dcg_calculator.cpp" 19 | #include "../../src/metric/metric.cpp" 20 | 21 | // network 22 | #include "../../src/network/linker_topo.cpp" 23 | #include "../../src/network/linkers_socket.cpp" 24 | #include "../../src/network/network.cpp" 25 | 26 | // objective 27 | #include "../../src/objective/objective_function.cpp" 28 | 29 | // treelearner 30 | #include "../../src/treelearner/data_parallel_tree_learner.cpp" 31 | #include "../../src/treelearner/feature_parallel_tree_learner.cpp" 32 | #include "../../src/treelearner/serial_tree_learner.cpp" 33 | #include "../../src/treelearner/tree_learner.cpp" 34 | #include "../../src/treelearner/voting_parallel_tree_learner.cpp" 35 | 36 | // c_api 37 | #include "../../src/c_api.cpp" 38 | -------------------------------------------------------------------------------- /R-package/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: lightgbm 2 | Type: Package 3 | Title: Light Gradient Boosting Machine 4 | Version: 0.1 5 | Date: 2016-12-29 6 | Author: Guolin Ke 7 | Maintainer: Guolin Ke 8 | Description: LightGBM is a gradient boosting framework that uses tree based learning algorithms. 9 | It is designed to be distributed and efficient with the following advantages: 10 | 1.Faster training speed and higher efficiency. 11 | 2.Lower memory usage. 12 | 3.Better accuracy. 13 | 4.Parallel learning supported. 14 | 5. Capable of handling large-scale data. 15 | License: The MIT License (MIT) | file LICENSE 16 | URL: https://github.com/Microsoft/LightGBM 17 | BugReports: https://github.com/Microsoft/LightGBM/issues 18 | VignetteBuilder: knitr 19 | Suggests: 20 | knitr, 21 | rmarkdown, 22 | ggplot2 (>= 1.0.1), 23 | DiagrammeR (>= 0.8.1), 24 | Ckmeans.1d.dp (>= 3.3.1), 25 | vcd (>= 1.3), 26 | testthat, 27 | igraph (>= 1.0.1), 28 | stringi (>= 0.5.2) 29 | Depends: 30 | R (>= 3.0), 31 | R6 32 | Imports: 33 | methods, 34 | Matrix (>= 1.1-0), 35 | data.table (>= 1.9.6), 36 | magrittr (>= 1.5), 37 | jsonlite 38 | RoxygenNote: 5.0.1 39 | -------------------------------------------------------------------------------- /R-package/man/lgb.dump.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Booster.R 3 | \name{lgb.dump} 4 | \alias{lgb.dump} 5 | \title{Dump LightGBM model to json} 6 | \usage{ 7 | lgb.dump(booster, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{booster}{Object of class \code{lgb.Booster}} 11 | 12 | \item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration} 13 | } 14 | \value{ 15 | json format of model 16 | } 17 | \description{ 18 | Dump LightGBM model to json 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(agaricus.train, package = "lightgbm") 24 | train <- agaricus.train 25 | dtrain <- lgb.Dataset(train$data, label = train$label) 26 | data(agaricus.test, package = "lightgbm") 27 | test <- agaricus.test 28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 29 | params <- list(objective = "regression", metric = "l2") 30 | valids <- list(test = dtest) 31 | model <- 
lgb.train(params,
32 |                    dtrain,
33 |                    100,
34 |                    valids,
35 |                    min_data = 1,
36 |                    learning_rate = 1,
37 |                    early_stopping_rounds = 10)
38 | json_model <- lgb.dump(model)
39 | }
40 | 
41 | }
42 | 
43 | 
--------------------------------------------------------------------------------
/include/LightGBM/utils/threading.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_UTILS_THREADING_H_
2 | #define LIGHTGBM_UTILS_THREADING_H_
3 | 
4 | #include <LightGBM/utils/openmp_wrapper.h>
5 | 
6 | #include <functional>
7 | #include <vector>
8 | 
9 | namespace LightGBM {
10 | 
11 | class Threading {
12 | public:
13 | 
14 |   template <typename INDEX_T>
15 |   static inline void For(INDEX_T start, INDEX_T end, const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
16 |     int num_threads = 1;
17 | #pragma omp parallel
18 | #pragma omp master
19 |     {
20 |       num_threads = omp_get_num_threads();
21 |     }
22 |     INDEX_T num_inner = (end - start + num_threads - 1) / num_threads;
23 |     if (num_inner <= 0) { num_inner = 1; }
24 |     OMP_INIT_EX();
25 | #pragma omp parallel for schedule(static,1)
26 |     for (int i = 0; i < num_threads; ++i) {
27 |       OMP_LOOP_EX_BEGIN();
28 |       INDEX_T inner_start = start + num_inner * i;
29 |       INDEX_T inner_end = inner_start + num_inner;
30 |       if (inner_end > end) { inner_end = end; }
31 |       if (inner_start < end) {
32 |         inner_fun(i, inner_start, inner_end);
33 |       }
34 |       OMP_LOOP_EX_END();
35 |     }
36 |     OMP_THROW_EX();
37 |   }
38 | };
39 | 
40 | }  // namespace LightGBM
41 | 
42 | #endif   // LightGBM_UTILS_THREADING_H_
43 | 
--------------------------------------------------------------------------------
/R-package/man/dimnames.lgb.Dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{dimnames.lgb.Dataset}
4 | \alias{dimnames.lgb.Dataset}
5 | \alias{dimnames<-.lgb.Dataset}
6 | \title{Handling of column names of \code{lgb.Dataset}}
7 | \usage{
8 | \method{dimnames}{lgb.Dataset}(x)
9 | 
10 | \method{dimnames}{lgb.Dataset}(x) <- value
11 | }
12 | \arguments{
13 | \item{x}{object of class \code{lgb.Dataset}}
14 | 
15 | \item{value}{a list of two elements: the first one is ignored
16 | and the second one is column names}
17 | }
18 | \description{
19 | Only column names are supported for \code{lgb.Dataset}, so setting
20 | row names has no effect, and the returned row names are NULL.
21 | }
22 | \details{
23 | Generic \code{dimnames} methods are used by \code{colnames}.
24 | Since row names are irrelevant, it is recommended to use \code{colnames} directly.
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | lgb.Dataset.construct(dtrain)
33 | dimnames(dtrain)
34 | colnames(dtrain)
35 | colnames(dtrain) <- make.names(1:ncol(train$data))
36 | print(dtrain, verbose = TRUE)
37 | }
38 | 
39 | }
40 | 
41 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.save}
4 | \alias{lgb.save}
5 | \title{Save LightGBM model}
6 | \usage{
7 | lgb.save(booster, filename, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{booster}{Object of class \code{lgb.Booster}}
11 | 
12 | \item{filename}{filename to save the model to}
13 | 
14 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
15 | }
16 | \value{
17 | booster
18 | }
19 | \description{
20 | Save LightGBM model
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 |                    dtrain,
35 |                    100,
36 |                    valids,
37 |                    min_data = 1,
38 |                    learning_rate = 1,
39 |                    early_stopping_rounds = 10)
40 | lgb.save(model, "model.txt")
41 | }
42 | 
43 | }
44 | 
45 | 
--------------------------------------------------------------------------------
/docker/dockerfile-python:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 | 
3 | RUN apt-get update && \
4 |     apt-get install -y cmake build-essential gcc g++ git wget && \
5 | 
6 |     # open-mpi
7 |     cd /usr/local/src && mkdir openmpi && cd openmpi && \
8 |     wget https://www.open-mpi.org/software/ompi/v2.0/downloads/openmpi-2.0.1.tar.gz && \
9 |     tar -xzf openmpi-2.0.1.tar.gz && cd openmpi-2.0.1 && \
10 |     ./configure --prefix=/usr/local/openmpi && make && make install && \
11 |     export PATH="/usr/local/openmpi/bin:$PATH" && \
12 | 
13 |     # lightgbm
14 |     cd /usr/local/src && mkdir lightgbm && cd lightgbm && \
15 |     git clone --recursive https://github.com/Microsoft/LightGBM && \
16 |     cd LightGBM && mkdir build && cd build && cmake -DUSE_MPI=ON ..
&& make && \ 17 | 18 | # python-package 19 | # miniconda 20 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 21 | /bin/bash Miniconda3-latest-Linux-x86_64.sh -f -b -p /opt/conda && \ 22 | export PATH="/opt/conda/bin:$PATH" && \ 23 | # lightgbm 24 | conda install -y numpy scipy scikit-learn pandas && \ 25 | cd ../python-package && python setup.py install && \ 26 | 27 | # clean 28 | apt-get autoremove -y && apt-get clean && \ 29 | conda clean -i -l -t -y && \ 30 | rm -rf /usr/local/src/* 31 | 32 | ENV PATH /opt/conda/bin:$PATH 33 | -------------------------------------------------------------------------------- /python-package/lightgbm/libpath.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Find the path to lightgbm dynamic library files.""" 3 | import os 4 | import sys 5 | 6 | 7 | def find_lib_path(): 8 | """Find the path to LightGBM library files. 9 | Returns 10 | ------- 11 | lib_path: list(string) 12 | List of all found library path to LightGBM 13 | """ 14 | curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 15 | dll_path = [curr_path, os.path.join(curr_path, '../../lib/'), 16 | os.path.join(curr_path, '../../'), 17 | os.path.join(curr_path, './lib/'), 18 | os.path.join(sys.prefix, 'lightgbm')] 19 | if os.name == 'nt': 20 | dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/')) 21 | dll_path.append(os.path.join(curr_path, './windows/x64/Dll/')) 22 | dll_path.append(os.path.join(curr_path, '../../Release/')) 23 | dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path] 24 | else: 25 | dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path] 26 | lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] 27 | if not lib_path: 28 | dll_path = [os.path.realpath(p) for p in dll_path] 29 | raise Exception('Cannot find lightgbm Library in following paths: ' + ','.join(dll_path)) 30 | return lib_path 31 | -------------------------------------------------------------------------------- /R-package/man/getinfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{getinfo} 4 | \alias{getinfo} 5 | \alias{getinfo.lgb.Dataset} 6 | \title{Get information of an lgb.Dataset object} 7 | \usage{ 8 | getinfo(dataset, ...) 9 | 10 | \method{getinfo}{lgb.Dataset}(dataset, name, ...) 
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class \code{lgb.Dataset}}
14 | 
15 | \item{...}{other parameters}
16 | 
17 | \item{name}{the name of the information field to get (see details)}
18 | }
19 | \value{
20 | info data
21 | }
22 | \description{
23 | Get information of an lgb.Dataset object
24 | }
25 | \details{
26 | The \code{name} field can be one of the following:
27 | 
28 | \itemize{
29 | \item \code{label}: the label lightgbm learns from;
30 | \item \code{weight}: weights used to rescale each record;
31 | \item \code{group}: group sizes;
32 | \item \code{init_score}: the initial score, i.e. the base prediction lightgbm will boost from.
33 | }
34 | }
35 | \examples{
36 | \dontrun{
37 | library(lightgbm)
38 | data(agaricus.train, package = "lightgbm")
39 | train <- agaricus.train
40 | dtrain <- lgb.Dataset(train$data, label = train$label)
41 | lgb.Dataset.construct(dtrain)
42 | 
43 | labels <- lightgbm::getinfo(dtrain, "label")
44 | lightgbm::setinfo(dtrain, "label", 1 - labels)
45 | 
46 | labels2 <- lightgbm::getinfo(dtrain, "label")
47 | stopifnot(all(labels2 == 1 - labels))
48 | }
49 | 
50 | }
51 | 
52 | 
--------------------------------------------------------------------------------
/R-package/man/readRDS.lgb.Booster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readRDS.lgb.Booster.R
3 | \name{readRDS.lgb.Booster}
4 | \alias{readRDS.lgb.Booster}
5 | \title{readRDS for lgb.Booster models}
6 | \usage{
7 | readRDS.lgb.Booster(file = "", refhook = NULL)
8 | }
9 | \arguments{
10 | \item{file}{a connection or the name of the file where the R object is saved to or read from.}
11 | 
12 | \item{refhook}{a hook function for handling reference objects.}
13 | }
14 | \value{
15 | an R object.
16 | }
17 | \description{
18 | Attempts to load a model using RDS.
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
29 | params <- list(objective = "regression", metric = "l2")
30 | valids <- list(test = dtest)
31 | model <- lgb.train(params,
32 |                    dtrain,
33 |                    100,
34 |                    valids,
35 |                    min_data = 1,
36 |                    learning_rate = 1,
37 |                    early_stopping_rounds = 10)
38 | saveRDS.lgb.Booster(model, "model.rds")
39 | new_model <- readRDS.lgb.Booster("model.rds")
40 | }
41 | 
42 | }
43 | 
44 | 
--------------------------------------------------------------------------------
/R-package/man/setinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{setinfo}
4 | \alias{setinfo}
5 | \alias{setinfo.lgb.Dataset}
6 | \title{Set information of an lgb.Dataset object}
7 | \usage{
8 | setinfo(dataset, ...)
9 | 
10 | \method{setinfo}{lgb.Dataset}(dataset, name, info, ...)
11 | } 12 | \arguments{ 13 | \item{dataset}{Object of class "lgb.Dataset"} 14 | 15 | \item{...}{other parameters} 16 | 17 | \item{name}{the name of the field to set} 18 | 19 | \item{info}{the specific field of information to set} 20 | } 21 | \value{ 22 | passed object 23 | } 24 | \description{ 25 | Set information of an lgb.Dataset object 26 | } 27 | \details{ 28 | The \code{name} field can be one of the following: 29 | 30 | \itemize{ 31 | \item \code{label}: the label LightGBM learns from; 32 | \item \code{weight}: weights used to rescale each instance; 33 | \item \code{init_score}: the initial score, i.e. the base prediction LightGBM will boost from; 34 | \item \code{group}: group sizes. 35 | } 36 | } 37 | \examples{ 38 | \dontrun{ 39 | library(lightgbm) 40 | data(agaricus.train, package = "lightgbm") 41 | train <- agaricus.train 42 | dtrain <- lgb.Dataset(train$data, label = train$label) 43 | lgb.Dataset.construct(dtrain) 44 | 45 | labels <- lightgbm::getinfo(dtrain, "label") 46 | lightgbm::setinfo(dtrain, "label", 1 - labels) 47 | 48 | labels2 <- lightgbm::getinfo(dtrain, "label") 49 | stopifnot(all.equal(labels2, 1 - labels)) 50 | } 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /R-package/man/lgb.Dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{lgb.Dataset} 4 | \alias{lgb.Dataset} 5 | \title{Construct lgb.Dataset object} 6 | \usage{ 7 | lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL, 8 | categorical_feature = NULL, free_raw_data = TRUE, info = list(), ...) 9 | } 10 | \arguments{ 11 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} 12 | 13 | \item{params}{a list of parameters} 14 | 15 | \item{reference}{reference dataset} 16 | 17 | \item{colnames}{names of columns} 18 | 19 | \item{categorical_feature}{categorical features} 20 | 21 | \item{free_raw_data}{whether to free the raw data after constructing the Dataset; defaults to \code{TRUE}} 22 | 23 | \item{info}{a list of information of the lgb.Dataset object} 24 | 25 | \item{...}{other information to pass to \code{info} or parameters to pass to \code{params}} 26 | } 27 | \value{ 28 | constructed dataset 29 | } 30 | \description{ 31 | Construct lgb.Dataset object from dense matrix, sparse matrix 32 | or local file (that was created previously by saving an \code{lgb.Dataset}).
33 | } 34 | \examples{ 35 | \dontrun{ 36 | library(lightgbm) 37 | data(agaricus.train, package = "lightgbm") 38 | train <- agaricus.train 39 | dtrain <- lgb.Dataset(train$data, label = train$label) 40 | lgb.Dataset.save(dtrain, "lgb.Dataset.data") 41 | dtrain <- lgb.Dataset("lgb.Dataset.data") 42 | lgb.Dataset.construct(dtrain) 43 | } 44 | 45 | } 46 | 47 | -------------------------------------------------------------------------------- /R-package/demo/boost_from_prediction.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | require(methods) 3 | 4 | # Load in the agaricus dataset 5 | data(agaricus.train, package = "lightgbm") 6 | data(agaricus.test, package = "lightgbm") 7 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 8 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 9 | 10 | valids <- list(eval = dtest, train = dtrain) 11 | #--------------------Advanced features --------------------------- 12 | # advanced: start from an initial base prediction 13 | print("Start running example to start from an initial prediction") 14 | 15 | # Train lightgbm for 1 round 16 | param <- list(num_leaves = 4, 17 | learning_rate = 1, 18 | nthread = 2, 19 | silent = 1, 20 | objective = "binary") 21 | bst <- lgb.train(param, dtrain, 1, valids = valids) 22 | 23 | # Note: we need the margin value instead of the transformed prediction in set_init_score 24 | ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE) 25 | ptest <- predict(bst, agaricus.test$data, rawscore = TRUE) 26 | 27 | # set the init_score property of dtrain and dtest 28 | # base margin is the base prediction we will boost from 29 | setinfo(dtrain, "init_score", ptrain) 30 | setinfo(dtest, "init_score", ptest) 31 | 32 | print("This is the result of boosting from an initial prediction") 33 | bst <- lgb.train(params = param, 34 | data = dtrain, 35 | nrounds = 5, 36 | valids = valids) 37 | -------------------------------------------------------------------------------- /R-package/man/lgb.importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.importance.R 3 | \name{lgb.importance} 4 | \alias{lgb.importance} 5 | \title{Compute feature importance in a model} 6 | \usage{ 7 | lgb.importance(model, percentage = TRUE) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}.} 11 | 12 | \item{percentage}{whether to show importance in relative percentage.} 13 | } 14 | \value{ 15 | For a tree model, a \code{data.table} with the following columns: 16 | \itemize{ 17 | \item \code{Feature} Feature names in the model. 18 | \item \code{Gain} The total gain of this feature's splits. 19 | \item \code{Cover} The number of observations related to this feature. 20 | \item \code{Frequency} The number of times the feature is used to split in trees. 21 | } 22 | } 23 | \description{ 24 | Creates a \code{data.table} of feature importances in a model.
25 | } 26 | \examples{ 27 | \dontrun{ 28 | library(lightgbm) 29 | data(agaricus.train, package = "lightgbm") 30 | train <- agaricus.train 31 | dtrain <- lgb.Dataset(train$data, label = train$label) 32 | 33 | params = list(objective = "binary", 34 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 35 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 36 | model <- lgb.train(params, dtrain, 20) 37 | model <- lgb.train(params, dtrain, 20) 38 | 39 | tree_imp1 <- lgb.importance(model, percentage = TRUE) 40 | tree_imp2 <- lgb.importance(model, percentage = FALSE) 41 | } 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /.travis/amd_sdk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original script from https://github.com/gregvw/amd_sdk/ 4 | 5 | # Location from which to get the nonce and file name 6 | URL="http://developer.amd.com/tools-and-sdks/opencl-zone/opencl-tools-sdks/amd-accelerated-parallel-processing-app-sdk/" 7 | URLDOWN="http://developer.amd.com/amd-license-agreement-appsdk/" 8 | 9 | NONCE1_STRING='name="amd_developer_central_downloads_page_nonce"' 10 | FILE_STRING='name="f"' 11 | POSTID_STRING='name="post_id"' 12 | NONCE2_STRING='name="amd_developer_central_nonce"' 13 | 14 | #For newest FORM=`wget -qO - $URL | sed -n '/download-2/,/64-bit/p'` 15 | FORM=`wget -qO - $URL | sed -n '/download-5/,/64-bit/p'` 16 | 17 | # Get nonce from form 18 | NONCE1=`echo $FORM | awk -F ${NONCE1_STRING} '{print $2}'` 19 | NONCE1=`echo $NONCE1 | awk -F'"' '{print $2}'` 20 | echo $NONCE1 21 | 22 | # get the postid 23 | POSTID=`echo $FORM | awk -F ${POSTID_STRING} '{print $2}'` 24 | POSTID=`echo $POSTID | awk -F'"' '{print $2}'` 25 | echo $POSTID 26 | 27 | # get file name 28 | FILE=`echo $FORM | awk -F ${FILE_STRING} '{print $2}'` 29 | FILE=`echo $FILE | awk -F'"' '{print $2}'` 30 | echo $FILE 31 | 32 | FORM=`wget -qO - $URLDOWN --post-data "amd_developer_central_downloads_page_nonce=${NONCE1}&f=${FILE}&post_id=${POSTID}"` 33 | 34 | NONCE2=`echo $FORM | awk -F ${NONCE2_STRING} '{print $2}'` 35 | NONCE2=`echo $NONCE2 | awk -F'"' '{print $2}'` 36 | echo $NONCE2 37 | 38 | wget --content-disposition --trust-server-names $URLDOWN --post-data "amd_developer_central_nonce=${NONCE2}&f=${FILE}" -O AMD-SDK.tar.bz2; 39 | -------------------------------------------------------------------------------- /python-package/README.rst: -------------------------------------------------------------------------------- 1 | LightGBM Python Package 2 | ======================= 3 | 4 | Installation 5 | ------------ 6 | 7 | 1. Follow the `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build the library first. 8 | For Windows users, please change the build config to ``DLL``. 9 | 2. Install with ``cd python-package; python setup.py install`` 10 | 11 | Note: Make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__ installed. 12 | 13 | 14 | Examples 15 | -------- 16 | 17 | Refer to the walk-through examples in the `python-guide folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__ 18 | 19 | 20 | Troubleshooting 21 | --------------- 22 | 23 | Refer to `FAQ <https://github.com/Microsoft/LightGBM/blob/master/docs/FAQ.md>`__ 24 | 25 | Developments 26 | ------------ 27 | 28 | The code style of the python package follows `pep8 <https://www.python.org/dev/peps/pep-0008/>`__. If you would like to make a contribution and are not familiar with pep8, please check the pep8 style guide first. Otherwise, you won't pass the check.
You should be careful about: 29 | 30 | - E1 Indentation (check pep8 link above) 31 | - E202 whitespace before and after brackets 32 | - E225 missing whitespace around operator 33 | - E226 missing whitespace around arithmetic operator 34 | - E261 at least two spaces before inline comment 35 | - E301 expected 1 blank line in front of and at the end of a method 36 | - E302 expected 2 blank lines in front of and at the end of a function or a class 37 | 38 | You can ignore E501 (line too long). 39 | -------------------------------------------------------------------------------- /src/treelearner/tree_learner.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "serial_tree_learner.h" 4 | #include "gpu_tree_learner.h" 5 | #include "parallel_tree_learner.h" 6 | 7 | namespace LightGBM { 8 | 9 | TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, const std::string& device_type, const TreeConfig* tree_config) { 10 | if (device_type == std::string("cpu")) { 11 | if (learner_type == std::string("serial")) { 12 | return new SerialTreeLearner(tree_config); 13 | } else if (learner_type == std::string("feature")) { 14 | return new FeatureParallelTreeLearner(tree_config); 15 | } else if (learner_type == std::string("data")) { 16 | return new DataParallelTreeLearner(tree_config); 17 | } else if (learner_type == std::string("voting")) { 18 | return new VotingParallelTreeLearner(tree_config); 19 | } 20 | } 21 | else if (device_type == std::string("gpu")) { 22 | if (learner_type == std::string("serial")) { 23 | return new GPUTreeLearner(tree_config); 24 | } else if (learner_type == std::string("feature")) { 25 | return new FeatureParallelTreeLearner(tree_config); 26 | } else if (learner_type == std::string("data")) { 27 | return new DataParallelTreeLearner(tree_config); 28 | } else if (learner_type == std::string("voting")) { 29 | return new VotingParallelTreeLearner(tree_config); 30 | } 31 | } 32 | return nullptr; 33 | } 34 | 35 | } // namespace LightGBM 36 | -------------------------------------------------------------------------------- /windows/LightGBM.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug_mpi|x64 = Debug_mpi|x64 11 | Debug|x64 = Debug|x64 12 | DLL|x64 = DLL|x64 13 | Release_mpi|x64 = Release_mpi|x64 14 | Release|x64 = Release|x64 15 | EndGlobalSection 16 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 17 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64 18 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64 19 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64 20 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64 21 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.ActiveCfg = DLL|x64 22 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.Build.0 = DLL|x64 23 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64 24 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64 25 | 
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64 26 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64 27 | EndGlobalSection 28 | GlobalSection(SolutionProperties) = preSolution 29 | HideSolutionNode = FALSE 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /src/metric/metric.cpp: -------------------------------------------------------------------------------- 1 | #include <LightGBM/metric.h> 2 | #include "regression_metric.hpp" 3 | #include "binary_metric.hpp" 4 | #include "rank_metric.hpp" 5 | #include "map_metric.hpp" 6 | #include "multiclass_metric.hpp" 7 | 8 | namespace LightGBM { 9 | 10 | Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config) { 11 | if (type == std::string("l2") || type == std::string("mean_squared_error") || type == std::string("mse")) { 12 | return new L2Metric(config); 13 | } else if (type == std::string("l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) { 14 | return new L1Metric(config); 15 | } else if (type == std::string("huber")) { 16 | return new HuberLossMetric(config); 17 | } else if (type == std::string("fair")) { 18 | return new FairLossMetric(config); 19 | } else if (type == std::string("poisson")) { 20 | return new PoissonMetric(config); 21 | } else if (type == std::string("binary_logloss")) { 22 | return new BinaryLoglossMetric(config); 23 | } else if (type == std::string("binary_error")) { 24 | return new BinaryErrorMetric(config); 25 | } else if (type == std::string("auc")) { 26 | return new AUCMetric(config); 27 | } else if (type == std::string("ndcg")) { 28 | return new NDCGMetric(config); 29 | } else if (type == std::string("map")) { 30 | return new MapMetric(config); 31 | } else if (type == std::string("multi_logloss")) { 32 | return new MultiSoftmaxLoglossMetric(config); 33 | } else if (type == std::string("multi_error")) { 34 | return new MultiErrorMetric(config); 35 | } 36 | return nullptr; 37 | } 38 | 39 | } // namespace LightGBM 40 | -------------------------------------------------------------------------------- /examples/multiclass_classification/train.conf: -------------------------------------------------------------------------------- 1 | # task type, support train and predict 2 | task = train 3 | 4 | # boosting type, support gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, support the following applications 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # multiclass 12 | # alias: application, app 13 | objective = multiclass 14 | 15 | # eval metrics, support multiple metrics delimited by ',', support the following metrics 16 | # l1 17 | # l2 , default metric for regression 18 | # ndcg , default metric for lambdarank 19 | # auc 20 | # binary_logloss , default metric for binary 21 | # binary_error 22 | # multi_logloss 23 | # multi_error 24 | metric = multi_logloss 25 | 26 | # number of classes, for multiclass classification 27 | num_class = 5 28 | 29 | # frequency for metric output 30 | metric_freq = 1 31 | 32 | # true if metrics need to be output for training data, alias: training_metric, train_metric 33 | is_training_metric = true 34 | 35 | # number of bins for feature buckets; 255 is a recommended setting that saves memory and also gives good accuracy.
36 | max_bin = 255 37 | 38 | # training data 39 | # if a weight file exists, it should be named "multiclass.train.weight" 40 | # alias: train_data, train 41 | data = multiclass.train 42 | 43 | # valid data 44 | valid_data = multiclass.test 45 | 46 | # rounds for early stopping 47 | early_stopping = 10 48 | 49 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 50 | num_trees = 100 51 | 52 | # shrinkage rate, alias: shrinkage_rate 53 | learning_rate = 0.05 54 | 55 | # number of leaves for one tree, alias: num_leaf 56 | num_leaves = 31 57 | -------------------------------------------------------------------------------- /examples/python-guide/sklearn_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | from sklearn.metrics import mean_squared_error 6 | from sklearn.model_selection import GridSearchCV 7 | 8 | # load or create your dataset 9 | print('Load data...') 10 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 11 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 12 | 13 | y_train = df_train[0].values 14 | y_test = df_test[0].values 15 | X_train = df_train.drop(0, axis=1).values 16 | X_test = df_test.drop(0, axis=1).values 17 | 18 | print('Start training...') 19 | # train 20 | gbm = lgb.LGBMRegressor(objective='regression', 21 | num_leaves=31, 22 | learning_rate=0.05, 23 | n_estimators=20) 24 | gbm.fit(X_train, y_train, 25 | eval_set=[(X_test, y_test)], 26 | eval_metric='l1', 27 | early_stopping_rounds=5) 28 | 29 | print('Start predicting...') 30 | # predict 31 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 32 | # eval 33 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 34 | 35 | print('Calculate feature importances...') 36 | # feature importances 37 | print('Feature importances:', list(gbm.feature_importances_)) 38 | 39 | # other scikit-learn modules 40 | estimator = lgb.LGBMRegressor(num_leaves=31) 41 | 42 | param_grid = { 43 | 'learning_rate': [0.01, 0.1, 1], 44 | 'n_estimators': [20, 40] 45 | } 46 | 47 | gbm = GridSearchCV(estimator, param_grid) 48 | 49 | gbm.fit(X_train, y_train) 50 | 51 | print('Best parameters found by grid search are:', gbm.best_params_) 52 | -------------------------------------------------------------------------------- /examples/python-guide/README.md: -------------------------------------------------------------------------------- 1 | Python Package Example 2 | ===================== 3 | Here are examples of how to use the LightGBM Python package. 4 | 5 | ***You should install LightGBM (both the C++ library and the Python package) first.*** 6 | 7 | For the installation, check the wiki [here](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide). 8 | 9 | You also need scikit-learn, pandas and matplotlib (only for the plot example) to run the examples, but they are not required for the package itself.
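If you are unsure which of these optional dependencies are already present, a quick sketch to check is to import them by their standard module names (adjust the names if your environment differs):
```
# Sanity check: confirm the optional dependencies are importable.
# 'sklearn', 'pandas' and 'matplotlib' are the standard import names.
for name in ('sklearn', 'pandas', 'matplotlib'):
    try:
        module = __import__(name)
        print(name, getattr(module, '__version__', 'unknown version'))
    except ImportError:
        print(name, 'is missing')
```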
You can install them with pip: 10 | ``` 11 | pip install scikit-learn pandas matplotlib -U 12 | ``` 13 | 14 | Now you can run examples in this folder, for example: 15 | ``` 16 | python simple_example.py 17 | ``` 18 | Examples including: 19 | - [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py) 20 | - Construct Dataset 21 | - Basic train and predict 22 | - Eval during training 23 | - Early stopping 24 | - Save model to file 25 | - Dump model to json format 26 | - Feature importances 27 | - [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py) 28 | - Basic train and predict with sklearn interface 29 | - Feature importances with sklearn interface 30 | - [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py) 31 | - Set feature names 32 | - Directly use categorical features without one-hot encoding 33 | - Load model to predict 34 | - Dump and load model with pickle 35 | - Load model file to continue training 36 | - Change learning rates during training 37 | - Self-defined objective function 38 | - Self-defined eval metric 39 | - Callback function -------------------------------------------------------------------------------- /examples/lambdarank/rank.train.query: -------------------------------------------------------------------------------- 1 | 1 2 | 13 3 | 5 4 | 8 5 | 19 6 | 12 7 | 18 8 | 5 9 | 14 10 | 13 11 | 8 12 | 9 13 | 16 14 | 11 15 | 21 16 | 14 17 | 21 18 | 9 19 | 14 20 | 11 21 | 20 22 | 18 23 | 13 24 | 20 25 | 22 26 | 22 27 | 13 28 | 17 29 | 10 30 | 13 31 | 12 32 | 13 33 | 13 34 | 23 35 | 18 36 | 13 37 | 20 38 | 12 39 | 22 40 | 14 41 | 13 42 | 23 43 | 13 44 | 14 45 | 14 46 | 5 47 | 13 48 | 15 49 | 14 50 | 14 51 | 16 52 | 16 53 | 15 54 | 21 55 | 22 56 | 10 57 | 22 58 | 18 59 | 25 60 | 16 61 | 12 62 | 12 63 | 15 64 | 15 65 | 25 66 | 13 67 | 9 68 | 12 69 | 8 70 | 16 71 | 25 72 | 19 73 | 24 74 | 12 75 | 16 76 | 10 77 | 16 78 | 9 79 | 17 80 | 15 81 | 7 82 | 9 83 | 15 84 | 14 85 | 16 86 | 17 87 | 8 88 | 17 89 | 12 90 | 18 91 | 23 92 | 10 93 | 12 94 | 12 95 | 4 96 | 14 97 | 12 98 | 15 99 | 27 100 | 16 101 | 20 102 | 13 103 | 19 104 | 13 105 | 17 106 | 17 107 | 16 108 | 12 109 | 15 110 | 14 111 | 14 112 | 19 113 | 12 114 | 23 115 | 18 116 | 16 117 | 9 118 | 23 119 | 11 120 | 15 121 | 8 122 | 10 123 | 10 124 | 16 125 | 11 126 | 15 127 | 22 128 | 16 129 | 17 130 | 23 131 | 16 132 | 22 133 | 17 134 | 14 135 | 12 136 | 14 137 | 20 138 | 15 139 | 17 140 | 15 141 | 15 142 | 22 143 | 9 144 | 21 145 | 9 146 | 17 147 | 16 148 | 15 149 | 13 150 | 13 151 | 15 152 | 14 153 | 18 154 | 21 155 | 14 156 | 17 157 | 15 158 | 14 159 | 16 160 | 12 161 | 17 162 | 19 163 | 16 164 | 11 165 | 18 166 | 11 167 | 13 168 | 14 169 | 9 170 | 16 171 | 15 172 | 16 173 | 25 174 | 9 175 | 13 176 | 22 177 | 16 178 | 18 179 | 20 180 | 14 181 | 11 182 | 9 183 | 16 184 | 19 185 | 19 186 | 11 187 | 11 188 | 13 189 | 14 190 | 14 191 | 13 192 | 16 193 | 6 194 | 21 195 | 16 196 | 12 197 | 16 198 | 11 199 | 24 200 | 12 201 | 10 202 | -------------------------------------------------------------------------------- /examples/python-guide/plot_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | 6 | try: 7 | import matplotlib.pyplot as plt 8 | except ImportError: 9 | raise ImportError('You need to install 
matplotlib for plot_example.py.') 10 | 11 | # load or create your dataset 12 | print('Load data...') 13 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 14 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 15 | 16 | y_train = df_train[0].values 17 | y_test = df_test[0].values 18 | X_train = df_train.drop(0, axis=1).values 19 | X_test = df_test.drop(0, axis=1).values 20 | 21 | # create dataset for lightgbm 22 | lgb_train = lgb.Dataset(X_train, y_train) 23 | lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train) 24 | 25 | # specify your configurations as a dict 26 | params = { 27 | 'num_leaves': 5, 28 | 'metric': ('l1', 'l2'), 29 | 'verbose': 0 30 | } 31 | 32 | evals_result = {}  # to record eval results for plotting 33 | 34 | print('Start training...') 35 | # train 36 | gbm = lgb.train(params, 37 | lgb_train, 38 | num_boost_round=100, 39 | valid_sets=[lgb_train, lgb_test], 40 | feature_name=['f' + str(i + 1) for i in range(28)], 41 | categorical_feature=[21], 42 | evals_result=evals_result, 43 | verbose_eval=10) 44 | 45 | print('Plot metrics during training...') 46 | ax = lgb.plot_metric(evals_result, metric='l1') 47 | plt.show() 48 | 49 | print('Plot feature importances...') 50 | ax = lgb.plot_importance(gbm, max_num_features=10) 51 | plt.show() 52 | 53 | print('Plot 84th tree...')  # this tree uses a categorical feature to split 54 | ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) 55 | plt.show() 56 | -------------------------------------------------------------------------------- /docs/Parameters-tuning.md: -------------------------------------------------------------------------------- 1 | This page describes how to tune the parameters of LightGBM. 2 | 3 | ***List of other Helpful Links*** 4 | * [Parameters](./Parameters.md) 5 | * [Python API Reference](./Python-API.md) 6 | 7 | ## Convert parameters from XGBoost 8 | 9 | LightGBM uses the [leaf-wise](https://github.com/Microsoft/LightGBM/wiki/Features#optimization-in-accuracy) tree growth algorithm. But other popular tools, e.g. XGBoost, use depth-wise tree growth. So LightGBM uses ```num_leaves``` to control the complexity of the tree model, while other tools usually use ```max_depth```. The following table shows the correspondence between leaves and depths; the relation is ```num_leaves = 2^(max_depth)```. 10 | 11 | | max_depth | num_leaves | 12 | | --------- | ---------- | 13 | | 1 | 2 | 14 | | 2 | 4 | 15 | | 3 | 8 | 16 | | 7 | 128 | 17 | | 10 | 1024 | 18 | 19 | ## For faster speed 20 | 21 | * Use bagging by setting ```bagging_fraction``` and ```bagging_freq``` 22 | * Use feature sub-sampling by setting ```feature_fraction``` 23 | * Use small ```max_bin``` 24 | * Use ```save_binary``` to speed up data loading in future learning 25 | * Use parallel learning, refer to [parallel learning guide](./Parallel-Learning-Guide.md); a parameter sketch illustrating these speed settings is shown below.
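As a concrete illustration of the items above, here is a minimal sketch using the Python package; the values and the input file name are illustrative examples, not tuned recommendations:
```
import lightgbm as lgb

# Illustrative speed-oriented settings; values are examples only.
params = {
    'objective': 'regression',
    'max_bin': 63,             # small max_bin -> faster histogram construction
    'bagging_fraction': 0.8,   # bagging: sample 80% of the rows...
    'bagging_freq': 5,         # ...every 5 iterations
    'feature_fraction': 0.8,   # feature sub-sampling
}
train_data = lgb.Dataset('regression.train')   # hypothetical input file
train_data.save_binary('train.bin')            # speeds up data loading next time
booster = lgb.train(params, train_data, num_boost_round=100)
```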
26 | 27 | ## For better accuracy 28 | 29 | * Use large ```max_bin``` (may be slower) 30 | * Use small ```learning_rate``` with large ```num_iterations``` 31 | * Use large ```num_leaves``` (may cause over-fitting) 32 | * Use bigger training data 33 | * Try ```dart``` 34 | 35 | ## Deal with over-fitting 36 | 37 | * Use small ```max_bin``` 38 | * Use small ```num_leaves``` 39 | * Use ```min_data_in_leaf``` and ```min_sum_hessian_in_leaf``` 40 | * Use bagging by setting ```bagging_fraction``` and ```bagging_freq``` 41 | * Use feature sub-sampling by setting ```feature_fraction``` 42 | * Use bigger training data 43 | * Try ```lambda_l1```, ```lambda_l2``` and ```min_gain_to_split``` for regularization 44 | * Try ```max_depth``` to avoid growing deep trees 45 | -------------------------------------------------------------------------------- /R-package/man/lgb.interprete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.interprete.R 3 | \name{lgb.interprete} 4 | \alias{lgb.interprete} 5 | \title{Compute feature contributions of a prediction} 6 | \usage{ 7 | lgb.interprete(model, data, idxset, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}.} 11 | 12 | \item{data}{a matrix object or a dgCMatrix object.} 13 | 14 | \item{idxset}{an integer vector of indices of the rows needed.} 15 | 16 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration.} 17 | } 18 | \value{ 19 | For regression, binary classification and lambdarank models, a \code{list} of \code{data.table} with the following columns: 20 | \itemize{ 21 | \item \code{Feature} Feature names in the model. 22 | \item \code{Contribution} The total contribution of this feature's splits. 23 | } 24 | For multiclass classification, a \code{list} of \code{data.table} with the Feature column and Contribution columns for each class. 25 | } 26 | \description{ 27 | Computes the feature contribution components of a raw-score prediction.
28 | } 29 | \examples{ 30 | \dontrun{ 31 | library(lightgbm) 32 | Sigmoid <- function(x) 1 / (1 + exp(-x)) 33 | Logit <- function(x) log(x / (1 - x)) 34 | data(agaricus.train, package = "lightgbm") 35 | train <- agaricus.train 36 | dtrain <- lgb.Dataset(train$data, label = train$label) 37 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) 38 | data(agaricus.test, package = "lightgbm") 39 | test <- agaricus.test 40 | 41 | params = list(objective = "binary", 42 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 43 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 44 | model <- lgb.train(params, dtrain, 20) 45 | model <- lgb.train(params, dtrain, 20) 46 | 47 | tree_interpretation <- lgb.interprete(model, test$data, 1:5) 48 | } 49 | 50 | } 51 | 52 | -------------------------------------------------------------------------------- /R-package/man/lgb.plot.importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.plot.importance.R 3 | \name{lgb.plot.importance} 4 | \alias{lgb.plot.importance} 5 | \title{Plot feature importance as a bar graph} 6 | \usage{ 7 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain", 8 | left_margin = 10, cex = NULL) 9 | } 10 | \arguments{ 11 | \item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.} 12 | 13 | \item{top_n}{maximal number of top features to include into the plot.} 14 | 15 | \item{measure}{the name of importance measure to plot, can be "Gain", "Cover" or "Frequency".} 16 | 17 | \item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.} 18 | 19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.} 20 | } 21 | \value{ 22 | The \code{lgb.plot.importance} function creates a \code{barplot} 23 | and silently returns a processed data.table with \code{top_n} features sorted by defined importance. 24 | } 25 | \description{ 26 | Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph. 27 | } 28 | \details{ 29 | The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. 30 | Features are shown ranked in a decreasing importance order. 
31 | } 32 | \examples{ 33 | \dontrun{ 34 | data(agaricus.train, package = "lightgbm") 35 | train <- agaricus.train 36 | dtrain <- lgb.Dataset(train$data, label = train$label) 37 | 38 | params = list(objective = "binary", 39 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 40 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 41 | model <- lgb.train(params, dtrain, 20) 42 | model <- lgb.train(params, dtrain, 20) 43 | 44 | tree_imp <- lgb.importance(model, percentage = TRUE) 45 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain") 46 | } 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /R-package/demo/cross_validation.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | # load in the agaricus dataset 3 | data(agaricus.train, package = "lightgbm") 4 | data(agaricus.test, package = "lightgbm") 5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 7 | 8 | nrounds <- 2 9 | param <- list(num_leaves = 4, 10 | learning_rate = 1, 11 | objective = "binary") 12 | 13 | print("Running cross validation") 14 | # Do cross validation; this will print the result out as 15 | # [iteration] metric_name:mean_value+std_value 16 | # std_value is the standard deviation of the metric 17 | lgb.cv(param, 18 | dtrain, 19 | nrounds, 20 | nfold = 5, 21 | eval = "binary_error") 22 | 23 | print("Running cross validation, disable standard deviation display") 24 | # Do cross validation; this will print the result out as 25 | # [iteration] metric_name:mean_value+std_value 26 | # std_value is the standard deviation of the metric 27 | lgb.cv(param, 28 | dtrain, 29 | nrounds, 30 | nfold = 5, 31 | eval = "binary_error", 32 | showsd = FALSE) 33 | 34 | # You can also do cross validation with a customized loss function 35 | # See custom_objective.R 36 | print("Running cross validation, with customized loss function") 37 | 38 | logregobj <- function(preds, dtrain) { 39 | labels <- getinfo(dtrain, "label") 40 | preds <- 1 / (1 + exp(-preds)) 41 | grad <- preds - labels 42 | hess <- preds * (1 - preds) 43 | return(list(grad = grad, hess = hess)) 44 | } 45 | evalerror <- function(preds, dtrain) { 46 | labels <- getinfo(dtrain, "label") 47 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels) 48 | return(list(name = "error", value = err, higher_better = FALSE)) 49 | } 50 | 51 | # train with customized objective 52 | lgb.cv(params = param, 53 | data = dtrain, 54 | nrounds = nrounds, 55 | obj = logregobj, 56 | eval = evalerror, 57 | nfold = 5) 58 | -------------------------------------------------------------------------------- /R-package/R/readRDS.lgb.Booster.R: -------------------------------------------------------------------------------- 1 | #' readRDS for lgb.Booster models 2 | #' 3 | #' Attempts to load a model using RDS. 4 | #' 5 | #' @param file a connection or the name of the file where the R object is saved to or read from. 6 | #' @param refhook a hook function for handling reference objects. 7 | #' 8 | #' @return an R object.
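#' @details If the booster was saved with a raw model string (see \code{saveRDS.lgb.Booster}
#'   with \code{raw = TRUE}), the raw string is written to a temporary file and reloaded
#'   through \code{lgb.load}, so the returned booster has a valid handle (see the
#'   function body below).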
9 | #' 10 | #' @examples 11 | #' \dontrun{ 12 | #' library(lightgbm) 13 | #' data(agaricus.train, package = "lightgbm") 14 | #' train <- agaricus.train 15 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 16 | #' data(agaricus.test, package = "lightgbm") 17 | #' test <- agaricus.test 18 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 19 | #' params <- list(objective = "regression", metric = "l2") 20 | #' valids <- list(test = dtest) 21 | #' model <- lgb.train(params, 22 | #' dtrain, 23 | #' 100, 24 | #' valids, 25 | #' min_data = 1, 26 | #' learning_rate = 1, 27 | #' early_stopping_rounds = 10) 28 | #' saveRDS.lgb.Booster(model, "model.rds") 29 | #' new_model <- readRDS.lgb.Booster("model.rds") 30 | #' } 31 | #' 32 | #' @export 33 | readRDS.lgb.Booster <- function(file = "", refhook = NULL) { 34 | 35 | # Read RDS file 36 | object <- readRDS(file = file, refhook = refhook) 37 | 38 | # Check if object has the model stored 39 | if (!is.na(object$raw)) { 40 | 41 | # Create temporary file for the model loading 42 | temp <- tempfile() 43 | write(object$raw, temp) 44 | object2 <- lgb.load(temp) 45 | file.remove(temp) 46 | 47 | # Restore best iteration and recorded evaluations 48 | object2$best_iter <- object$best_iter 49 | object2$record_evals <- object$record_evals 50 | 51 | # Return newly loaded object 52 | return(object2) 53 | 54 | } else { 55 | 56 | # Return RDS loaded object 57 | return(object) 58 | 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /examples/python-guide/simple_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import json 4 | import lightgbm as lgb 5 | import pandas as pd 6 | from sklearn.metrics import mean_squared_error 7 | 8 | 9 | # load or create your dataset 10 | print('Load data...') 11 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 12 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 13 | 14 | y_train = df_train[0].values 15 | y_test = df_test[0].values 16 | X_train = df_train.drop(0, axis=1).values 17 | X_test = df_test.drop(0, axis=1).values 18 | 19 | # create dataset for lightgbm 20 | lgb_train = lgb.Dataset(X_train, y_train) 21 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 22 | 23 | # specify your configurations as a dict 24 | params = { 25 | 'task': 'train', 26 | 'boosting_type': 'gbdt', 27 | 'objective': 'regression', 28 | 'metric': {'l2', 'auc'}, 29 | 'num_leaves': 31, 30 | 'learning_rate': 0.05, 31 | 'feature_fraction': 0.9, 32 | 'bagging_fraction': 0.8, 33 | 'bagging_freq': 5, 34 | 'verbose': 0 35 | } 36 | 37 | print('Start training...') 38 | # train 39 | gbm = lgb.train(params, 40 | lgb_train, 41 | num_boost_round=20, 42 | valid_sets=lgb_eval, 43 | early_stopping_rounds=5) 44 | 45 | print('Save model...') 46 | # save model to file 47 | gbm.save_model('model.txt') 48 | 49 | print('Start predicting...') 50 | # predict 51 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 52 | # eval 53 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 54 | 55 | print('Dump model to JSON...') 56 | # dump model to json (and save to file) 57 | model_json = gbm.dump_model() 58 | 59 | with open('model.json', 'w+') as f: 60 | json.dump(model_json, f, indent=4) 61 | 62 | 63 | print('Feature names:', gbm.feature_name()) 64 | 65 | print('Calculate feature 
importances...') 66 | # feature importances 67 | print('Feature importances:', list(gbm.feature_importance())) 68 | -------------------------------------------------------------------------------- /R-package/man/lgb.unloader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.unloader.R 3 | \name{lgb.unloader} 4 | \alias{lgb.unloader} 5 | \title{LightGBM unloading error fix} 6 | \usage{ 7 | lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) 8 | } 9 | \arguments{ 10 | \item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.} 11 | 12 | \item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.} 13 | 14 | \item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.} 15 | } 16 | \value{ 17 | NULL invisibly. 18 | } 19 | \description{ 20 | Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | library(lightgbm) 25 | data(agaricus.train, package = "lightgbm") 26 | train <- agaricus.train 27 | dtrain <- lgb.Dataset(train$data, label = train$label) 28 | data(agaricus.test, package = "lightgbm") 29 | test <- agaricus.test 30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 31 | params <- list(objective = "regression", metric = "l2") 32 | valids <- list(test = dtest) 33 | model <- lgb.train(params, 34 | dtrain, 35 | 100, 36 | valids, 37 | min_data = 1, 38 | learning_rate = 1, 39 | early_stopping_rounds = 10) 40 | lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) 41 | rm(model, dtrain, dtest) # Not needed if wipe = TRUE 42 | gc() # Not needed if wipe = TRUE 43 | 44 | library(lightgbm) 45 | # Do whatever you want again with LightGBM without object clashing 46 | } 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /include/LightGBM/utils/openmp_wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_OPENMP_WRAPPER_H_ 2 | #define LIGHTGBM_OPENMP_WRAPPER_H_ 3 | #ifdef _OPENMP 4 | 5 | #include <omp.h> 6 | #include <exception> 7 | #include <stdexcept> 8 | #include <mutex> 9 | #include <memory> 10 | #include <vector> 11 | #include "log.h" 12 | 13 | class ThreadExceptionHelper { 14 | public: 15 | ThreadExceptionHelper() { 16 | ex_ptr_ = nullptr; 17 | } 18 | 19 | ~ThreadExceptionHelper() { 20 | ReThrow(); 21 | } 22 | void ReThrow() { 23 | if (ex_ptr_ != nullptr) { 24 | std::rethrow_exception(ex_ptr_); 25 | ex_ptr_ = nullptr; 26 | } 27 | } 28 | void CaptureException() { 29 | // only catch first exception.
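// Double-checked pattern: the unlocked read below is a cheap fast path;
// the check is repeated after acquiring the mutex so that only the first
// failing thread stores its exception.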
30 | if (ex_ptr_ != nullptr) { return; } 31 | std::unique_lock<std::mutex> guard(lock_); 32 | if (ex_ptr_ != nullptr) { return; } 33 | ex_ptr_ = std::current_exception(); 34 | } 35 | private: 36 | std::exception_ptr ex_ptr_; 37 | std::mutex lock_; 38 | }; 39 | 40 | #define OMP_INIT_EX() ThreadExceptionHelper omp_except_helper 41 | #define OMP_LOOP_EX_BEGIN() try { 42 | 43 | #define OMP_LOOP_EX_END() } \ 44 | catch(std::exception& ex) { Log::Warning(ex.what()); omp_except_helper.CaptureException(); } \ 45 | catch(...) { omp_except_helper.CaptureException(); } 46 | #define OMP_THROW_EX() omp_except_helper.ReThrow() 47 | 48 | #else 49 | 50 | #ifdef _MSC_VER 51 | #pragma warning( disable : 4068 ) // disable unknown pragma warning 52 | #endif 53 | 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | /** Fall here if no OPENMP support, so just 58 | simulate a single thread running. 59 | All #pragma omp should be ignored by the compiler **/ 60 | inline void omp_set_num_threads(int) {} 61 | inline int omp_get_num_threads() {return 1;} 62 | inline int omp_get_thread_num() {return 0;} 63 | #ifdef __cplusplus 64 | }; // extern "C" 65 | #endif 66 | 67 | #define OMP_INIT_EX() 68 | #define OMP_LOOP_EX_BEGIN() 69 | #define OMP_LOOP_EX_END() 70 | #define OMP_THROW_EX() 71 | 72 | #endif 73 | 74 | 75 | 76 | #endif /* LIGHTGBM_OPENMP_WRAPPER_H_ */ 77 | -------------------------------------------------------------------------------- /R-package/demo/early_stopping.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | require(methods) 3 | 4 | # Load in the agaricus dataset 5 | data(agaricus.train, package = "lightgbm") 6 | data(agaricus.test, package = "lightgbm") 7 | 8 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 9 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 10 | 11 | # Note: for a customized objective function, we leave objective as the default 12 | # Note: what we get in prediction is the margin value 13 | # You must know what you are doing 14 | param <- list(num_leaves = 4, 15 | learning_rate = 1) 16 | valids <- list(eval = dtest) 17 | num_round <- 20 18 | 19 | # User-defined objective function: given predictions, return the gradient and second-order gradient 20 | # This is log-likelihood loss 21 | logregobj <- function(preds, dtrain) { 22 | labels <- getinfo(dtrain, "label") 23 | preds <- 1 / (1 + exp(-preds)) 24 | grad <- preds - labels 25 | hess <- preds * (1 - preds) 26 | return(list(grad = grad, hess = hess)) 27 | } 28 | 29 | # User-defined evaluation function: returns a metric name, a result and a higher_better flag 30 | # NOTE: when you use a customized loss function, the default prediction value is the margin 31 | # This may make the built-in evaluation metrics not function properly 32 | # For example, with logistic loss the prediction is the score before the logistic transformation 33 | # The built-in evaluation error assumes the input is after the logistic transformation 34 | # Keep this in mind when you use customization; you may need to write a customized evaluation function 35 | evalerror <- function(preds, dtrain) { 36 | labels <- getinfo(dtrain, "label") 37 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels) 38 | return(list(name = "error", value = err, higher_better = FALSE)) 39 | } 40 | print("Start training with early stopping setting") 41 | 42 | bst <- lgb.train(param, 43 | dtrain, 44 | num_round, 45 | valids, 46 | objective = logregobj, 47 | eval = evalerror, 48 | early_stopping_round = 3) 49 |
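# A sketch of inspecting the early-stopping result. `best_iter` is the field
# used elsewhere in this package (e.g. in saveRDS.lgb.Booster); we assume it
# is populated on `bst` here as well.
print(paste("Best iteration:", bst$best_iter))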
-------------------------------------------------------------------------------- /R-package/man/lgb.model.dt.tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.model.dt.tree.R 3 | \name{lgb.model.dt.tree} 4 | \alias{lgb.model.dt.tree} 5 | \title{Parse a LightGBM model JSON dump} 6 | \usage{ 7 | lgb.model.dt.tree(model, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}} 11 | } 12 | \value{ 13 | A \code{data.table} with detailed information about the model trees' nodes and leaves. 14 | 15 | The columns of the \code{data.table} are: 16 | 17 | \itemize{ 18 | \item \code{tree_index}: ID of a tree in a model (integer) 19 | \item \code{split_index}: ID of a node in a tree (integer) 20 | \item \code{split_feature}: for a node, it's a feature name (character); 21 | for a leaf, it simply labels it as \code{"NA"} 22 | \item \code{node_parent}: ID of the parent node for current node (integer) 23 | \item \code{leaf_index}: ID of a leaf in a tree (integer) 24 | \item \code{leaf_parent}: ID of the parent node for current leaf (integer) 25 | \item \code{split_gain}: Split gain of a node 26 | \item \code{threshold}: Splitting threshold value of a node 27 | \item \code{decision_type}: Decision type of a node 28 | \item \code{internal_value}: Node value 29 | \item \code{internal_count}: The number of observations collected by a node 30 | \item \code{leaf_value}: Leaf value 31 | \item \code{leaf_count}: The number of observations collected by a leaf 32 | } 33 | } 34 | \description{ 35 | Parse a LightGBM model JSON dump into a \code{data.table} structure. 36 | } 37 | \examples{ 38 | \dontrun{ 39 | library(lightgbm) 40 | 41 | data(agaricus.train, package = "lightgbm") 42 | train <- agaricus.train 43 | dtrain <- lgb.Dataset(train$data, label = train$label) 44 | 45 | params = list(objective = "binary", 46 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 47 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 48 | model <- lgb.train(params, dtrain, 20) 49 | model <- lgb.train(params, dtrain, 20) 50 | 51 | tree_dt <- lgb.model.dt.tree(model) 52 | } 53 | 54 | } 55 | 56 | -------------------------------------------------------------------------------- /pmml/README.md: -------------------------------------------------------------------------------- 1 | PMML Generator 2 | ============== 3 | The script pmml.py can be used to translate the LightGBM models, found in LightGBM_model.txt, to the predictive model markup language (PMML). These models can then be imported by other analytics applications. The models that the language can describe include decision trees. The specification of PMML can be found at the Data Mining Group's [website](http://dmg.org/pmml/v4-3/GeneralStructure.html). 4 | 5 | In order to generate PMML files, do the following steps. 6 | ``` 7 | lightgbm config=train.conf 8 | python pmml.py LightGBM_model.txt 9 | ``` 10 | The Python script will create a file called **LightGBM_pmml.xml**. Inside the file you will find a `MiningModel` tag. In there you will find `TreeModel` tags. Each `TreeModel` tag contains the PMML translation of a decision tree inside the LightGBM_model.txt file. The model described by the **LightGBM_pmml.xml** file can be transferred to other analytics applications. For instance you can use the PMML file as an input to the jpmml-evaluator API.
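As a quick sanity check of the generated file, a sketch using only the Python standard library (assuming the output file name shown above) counts the `TreeModel` tags:
```
import xml.etree.ElementTree as ET

# Count TreeModel elements in the generated PMML, ignoring XML namespaces.
root = ET.parse('LightGBM_pmml.xml').getroot()
trees = [el for el in root.iter() if el.tag.split('}')[-1] == 'TreeModel']
print('found', len(trees), 'TreeModel elements')
```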
Follow the steps below to run a model described by **LightGBM_pmml.xml**. 11 | 12 | ##### Steps to run jpmml-evaluator 13 | 1. First, clone the repository 14 | ``` 15 | git clone https://github.com/jpmml/jpmml-evaluator.git 16 | ``` 17 | 2. Build using Maven 18 | ``` 19 | mvn clean install 20 | ``` 21 | 3. Run the EvaluationExample class on the model file using the following command 22 | ``` 23 | java -cp example-1.3-SNAPSHOT.jar org.jpmml.evaluator.EvaluationExample --model LightGBM_pmml.xml --input input.csv --output output.csv 24 | ``` 25 | Note: in order to run the model on the input.csv file, the input.csv file must have the same number of columns as specified by the `DataDictionary` field in the PMML file. Also, the column headers inside the input.csv file must be the same as the column names specified by the `MiningSchema` field. Inside output.csv you will find all the columns inside the input.csv file plus a new column. In the new column you will find the scores calculated by processing each row's data on the model. More information about jpmml-evaluator can be found at its [GitHub repository](https://github.com/jpmml/jpmml-evaluator). -------------------------------------------------------------------------------- /R-package/man/lgb.plot.interpretation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.plot.interpretation.R 3 | \name{lgb.plot.interpretation} 4 | \alias{lgb.plot.interpretation} 5 | \title{Plot feature contribution as a bar graph} 6 | \usage{ 7 | lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1, 8 | left_margin = 10, cex = NULL) 9 | } 10 | \arguments{ 11 | \item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.} 12 | 13 | \item{top_n}{maximal number of top features to include into the plot.} 14 | 15 | \item{cols}{the number of columns in the layout; used only for multiclass classification feature contributions.} 16 | 17 | \item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.} 18 | 19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.} 20 | } 21 | \value{ 22 | The \code{lgb.plot.interpretation} function creates a \code{barplot}. 23 | } 24 | \description{ 25 | Plot previously calculated feature contribution as a bar graph. 26 | } 27 | \details{ 28 | The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. 29 | Features are shown ranked in a decreasing contribution order.
30 | } 31 | \examples{ 32 | \dontrun{ 33 | library(lightgbm) 34 | Sigmoid <- function(x) {1 / (1 + exp(-x))} 35 | Logit <- function(x) {log(x / (1 - x))} 36 | data(agaricus.train, package = "lightgbm") 37 | train <- agaricus.train 38 | dtrain <- lgb.Dataset(train$data, label = train$label) 39 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) 40 | data(agaricus.test, package = "lightgbm") 41 | test <- agaricus.test 42 | 43 | params = list(objective = "binary", 44 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 45 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 46 | model <- lgb.train(params, dtrain, 20) 47 | model <- lgb.train(params, dtrain, 20) 48 | 49 | tree_interpretation <- lgb.interprete(model, test$data, 1:5) 50 | lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10) 51 | } 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /docs/FAQ.md: -------------------------------------------------------------------------------- 1 | LightGBM FAQ 2 | ======================= 3 | 4 | ### Catalog 5 | 6 | - [Python-package](FAQ.md#python-package) 7 | 8 | ### Python-package 9 | 10 | - **Question 1**: I see error messages like this when installing from GitHub using `python setup.py install`. 11 | 12 | ``` 13 | error: Error: setup script specifies an absolute path: 14 | 15 | /Users/Microsoft/LightGBM/python-package/lightgbm/../../lib_lightgbm.so 16 | 17 | setup() arguments must *always* be /-separated paths relative to the 18 | setup.py directory, *never* absolute paths. 19 | ``` 20 | 21 | - **Solution 1**: this error should be fixed in the latest version. If you still encounter it, try removing the lightgbm.egg-info folder in python-package and reinstalling, or check [this thread on Stack Overflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path). 22 | 23 | - **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset with code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`. 24 | 25 | - **Solution 2**: Because LightGBM constructs bin mappers to build trees, and because train and valid Datasets within one Booster share the same bin mappers, categorical features, feature names etc., the Dataset objects are constructed when a Booster is constructed. And if you set free_raw_data=True (the default), the raw data (held in Python data structures) will be freed.
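A minimal sketch of the workaround (the toy data is only there to make the snippet self-contained):
```
import numpy as np
import lightgbm as lgb

# Toy data just for illustration.
X_train = np.random.rand(100, 10)
y_train = np.random.randint(0, 2, 100)

# free_raw_data=False keeps the raw data alive, so setting the reference
# or categorical features after construction still works.
train = lgb.Dataset(X_train, y_train, free_raw_data=False)
```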
So, if you want to: 26 | 27 | + get label (or weight/init_score/group) before constructing the dataset: it's the same as getting `self.label` 28 | + set label (or weight/init_score/group) before constructing the dataset: it's the same as `self.label=some_label_array` 29 | + get num_data (or num_feature) before constructing the dataset: you can get the data with `self.data`, then, if your data is a `numpy.ndarray`, use code like `self.data.shape` 30 | + set predictor (or reference/categorical feature) after constructing the dataset: you should set free_raw_data=False or init a Dataset object with the same raw data 31 | -------------------------------------------------------------------------------- /tests/python_package_test/test_basic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import os 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgb 8 | import numpy as np 9 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file 10 | from sklearn.model_selection import train_test_split 11 | 12 | 13 | class TestBasic(unittest.TestCase): 14 | 15 | def test(self): 16 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 17 | train_data = lgb.Dataset(X_train, max_bin=255, label=y_train) 18 | valid_data = train_data.create_valid(X_test, label=y_test) 19 | 20 | params = { 21 | "objective": "binary", 22 | "metric": "auc", 23 | "min_data": 10, 24 | "num_leaves": 15, 25 | "verbose": -1 26 | } 27 | bst = lgb.Booster(params, train_data) 28 | bst.add_valid(valid_data, "valid_1") 29 | 30 | for i in range(30): 31 | bst.update() 32 | if i % 10 == 0: 33 | print(bst.eval_train(), bst.eval_valid()) 34 | bst.save_model("model.txt") 35 | pred_from_matr = bst.predict(X_test) 36 | with tempfile.NamedTemporaryFile() as f: 37 | tname = f.name 38 | with open(tname, "w+b") as f: 39 | dump_svmlight_file(X_test, y_test, f) 40 | pred_from_file = bst.predict(tname) 41 | os.remove(tname) 42 | self.assertEqual(len(pred_from_matr), len(pred_from_file)) 43 | for preds in zip(pred_from_matr, pred_from_file): 44 | self.assertAlmostEqual(*preds, places=15) 45 | # check saved model persistence 46 | bst = lgb.Booster(params, model_file="model.txt") 47 | pred_from_model_file = bst.predict(X_test) 48 | self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) 49 | for preds in zip(pred_from_matr, pred_from_model_file): 50 | self.assertEqual(*preds) 51 | # check pmml 52 | os.system('python ../../pmml/pmml.py model.txt') 53 | 54 | 55 | print("----------------------------------------------------------------------") 56 | print("running test_basic.py") 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /include/LightGBM/utils/pipeline_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ 2 | #define LIGHTGBM_UTILS_PIPELINE_READER_H_ 3 | 4 | #include <LightGBM/utils/log.h> 5 | 6 | #include <cstdio> 7 | 8 | #include <vector> 9 | #include <functional> 10 | #include <thread> 11 | #include <utility> 12 | 13 | namespace LightGBM { 14 | 15 | /*! 16 | * \brief A pipeline file reader that uses 2 threads: one reads blocks from the file, the other processes them 17 | */ 18 | class PipelineReader { 19 | public: 20 | /*!
21 | * \brief Read data from a file using the pipeline method 22 | * \param filename Filename of data 23 | * \param skip_bytes Number of bytes to skip at the beginning of the file \param process_fun Process function 24 | */ 25 | static size_t Read(const char* filename, int skip_bytes, const std::function<size_t(const char*, size_t)>& process_fun) { 26 | FILE* file; 27 | 28 | #ifdef _MSC_VER 29 | fopen_s(&file, filename, "rb"); 30 | #else 31 | file = fopen(filename, "rb"); 32 | #endif 33 | if (file == NULL) { 34 | return 0; 35 | } 36 | size_t cnt = 0; 37 | const size_t buffer_size = 16 * 1024 * 1024; 38 | // buffer used for the process_fun 39 | auto buffer_process = std::vector<char>(buffer_size); 40 | // buffer used for the file reading 41 | auto buffer_read = std::vector<char>(buffer_size); 42 | size_t read_cnt = 0; 43 | if (skip_bytes > 0) { 44 | // skip first k bytes 45 | read_cnt = fread(buffer_process.data(), 1, skip_bytes, file); 46 | } 47 | // read first block 48 | read_cnt = fread(buffer_process.data(), 1, buffer_size, file); 49 | size_t last_read_cnt = 0; 50 | while (read_cnt > 0) { 51 | // start read thread 52 | std::thread read_worker = std::thread( 53 | [file, &buffer_read, buffer_size, &last_read_cnt] { 54 | last_read_cnt = fread(buffer_read.data(), 1, buffer_size, file); 55 | } 56 | ); 57 | // start process 58 | cnt += process_fun(buffer_process.data(), read_cnt); 59 | // wait for read thread 60 | read_worker.join(); 61 | // exchange the buffer 62 | std::swap(buffer_process, buffer_read); 63 | read_cnt = last_read_cnt; 64 | } 65 | // close file 66 | fclose(file); 67 | return cnt; 68 | } 69 | 70 | }; 71 | 72 | } // namespace LightGBM 73 | 74 | #endif // LightGBM_UTILS_PIPELINE_READER_H_ 75 | -------------------------------------------------------------------------------- /R-package/man/saveRDS.lgb.Booster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/saveRDS.lgb.Booster.R 3 | \name{saveRDS.lgb.Booster} 4 | \alias{saveRDS.lgb.Booster} 5 | \title{saveRDS for lgb.Booster models} 6 | \usage{ 7 | saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL, 8 | compress = TRUE, refhook = NULL, raw = TRUE) 9 | } 10 | \arguments{ 11 | \item{object}{R object to serialize.} 12 | 13 | \item{file}{a connection or the name of the file where the R object is saved to or read from.} 14 | 15 | \item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.} 16 | 17 | \item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.} 18 | 19 | \item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.} 20 | 21 | \item{refhook}{a hook function for handling reference objects.} 22 | 23 | \item{raw}{whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.} 24 | } 25 | \value{ 26 | NULL invisibly. 27 | } 28 | \description{ 29 | Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
30 | }
31 | \examples{
32 | \dontrun{
33 | library(lightgbm)
34 | data(agaricus.train, package = "lightgbm")
35 | train <- agaricus.train
36 | dtrain <- lgb.Dataset(train$data, label = train$label)
37 | data(agaricus.test, package = "lightgbm")
38 | test <- agaricus.test
39 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
40 | params <- list(objective = "regression", metric = "l2")
41 | valids <- list(test = dtest)
42 | model <- lgb.train(params,
43 |                    dtrain,
44 |                    100,
45 |                    valids,
46 |                    min_data = 1,
47 |                    learning_rate = 1,
48 |                    early_stopping_rounds = 10)
49 | saveRDS.lgb.Booster(model, "model.rds")
50 | }
51 |
52 | }
53 |
54 |
-------------------------------------------------------------------------------- /include/LightGBM/objective_function.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_
2 | #define LIGHTGBM_OBJECTIVE_FUNCTION_H_
3 |
4 | #include <LightGBM/meta.h>
5 | #include <LightGBM/config.h>
6 | #include <LightGBM/dataset.h>
7 | #include <string>
8 |
9 | namespace LightGBM {
10 | /*!
11 | * \brief The interface of Objective Function.
12 | */
13 | class ObjectiveFunction {
14 | public:
15 |   /*! \brief virtual destructor */
16 |   virtual ~ObjectiveFunction() {}
17 |
18 |   /*!
19 |   * \brief Initialize
20 |   * \param metadata Label data
21 |   * \param num_data Number of data
22 |   */
23 |   virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
24 |
25 |   /*!
26 |   * \brief Calculate first- and second-order derivatives of the loss function
27 |   * \param score prediction score in this round
28 |   * \param gradients Output gradients
29 |   * \param hessians Output hessians
30 |   */
31 |   virtual void GetGradients(const double* score,
32 |     score_t* gradients, score_t* hessians) const = 0;
33 |
34 |   virtual const char* GetName() const = 0;
35 |
36 |   virtual bool IsConstantHessian() const { return false; }
37 |
38 |   virtual bool BoostFromAverage() const { return false; }
39 |
40 |   virtual bool SkipEmptyClass() const { return false; }
41 |
42 |   virtual int NumTreePerIteration() const { return 1; }
43 |
44 |   virtual int NumPredictOneRow() const { return 1; }
45 |
46 |   virtual void ConvertOutput(const double* input, double* output) const {
47 |     output[0] = input[0];
48 |   }
49 |
50 |   virtual std::string ToString() const = 0;
51 |
52 |   ObjectiveFunction() = default;
53 |   /*! \brief Disable copy */
54 |   ObjectiveFunction& operator=(const ObjectiveFunction&) = delete;
55 |   /*! \brief Disable copy */
56 |   ObjectiveFunction(const ObjectiveFunction&) = delete;
57 |
58 |   /*!
59 |   * \brief Create object of objective function
60 |   * \param type Specific type of objective function
61 |   * \param config Config for objective function
62 |   */
63 |   LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
64 |     const ObjectiveConfig& config);
65 |
66 |   /*!
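  * Usage sketch (illustrative): per the implementation in
  * src/objective/objective_function.cpp, the first space-separated token of
  * the model string is the objective name, so for example:
  *
  *   auto* obj = ObjectiveFunction::CreateObjectiveFunction(std::string("binary"));
  *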
67 |   * \brief Load objective function from string object
68 |   */
69 |   LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str);
70 | };
71 |
72 | } // namespace LightGBM
73 |
74 | #endif // LIGHTGBM_OBJECTIVE_FUNCTION_H_
75 |
-------------------------------------------------------------------------------- /src/boosting/boosting.cpp: --------------------------------------------------------------------------------
1 | #include <LightGBM/boosting.h>
2 | #include "gbdt.h"
3 | #include "dart.hpp"
4 | #include "goss.hpp"
5 |
6 | namespace LightGBM {
7 |
8 | std::string GetBoostingTypeFromModelFile(const char* filename) {
9 |   TextReader<size_t> model_reader(filename, true);
10 |   std::string type = model_reader.first_line();
11 |   return type;
12 | }
13 |
14 | bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
15 |   if (boosting != nullptr) {
16 |     TextReader<size_t> model_reader(filename, true);
17 |     model_reader.ReadAllLines();
18 |     std::stringstream str_buf;
19 |     for (auto& line : model_reader.Lines()) {
20 |       str_buf << line << '\n';
21 |     }
22 |     if (!boosting->LoadModelFromString(str_buf.str()))
23 |       return false;
24 |   }
25 |
26 |   return true;
27 | }
28 |
29 | Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) {
30 |   if (filename == nullptr || filename[0] == '\0') {
31 |     if (type == std::string("gbdt")) {
32 |       return new GBDT();
33 |     } else if (type == std::string("dart")) {
34 |       return new DART();
35 |     } else if (type == std::string("goss")) {
36 |       return new GOSS();
37 |     } else {
38 |       return nullptr;
39 |     }
40 |   } else {
41 |     std::unique_ptr<Boosting> ret;
42 |     auto type_in_file = GetBoostingTypeFromModelFile(filename);
43 |     if (type_in_file == std::string("tree")) {
44 |       if (type == std::string("gbdt")) {
45 |         ret.reset(new GBDT());
46 |       } else if (type == std::string("dart")) {
47 |         ret.reset(new DART());
48 |       } else if (type == std::string("goss")) {
49 |         ret.reset(new GOSS());
50 |       } else {
51 |         Log::Fatal("unknown boosting type %s", type.c_str());
52 |       }
53 |       LoadFileToBoosting(ret.get(), filename);
54 |     } else {
55 |       Log::Fatal("unknown submodel type in model file %s", filename);
56 |     }
57 |     return ret.release();
58 |   }
59 | }
60 |
61 | Boosting* Boosting::CreateBoosting(const char* filename) {
62 |   auto type = GetBoostingTypeFromModelFile(filename);
63 |   std::unique_ptr<Boosting> ret;
64 |   if (type == std::string("tree")) {
65 |     ret.reset(new GBDT());
66 |   } else {
67 |     Log::Fatal("unknown submodel type in model file %s", filename);
68 |   }
69 |   LoadFileToBoosting(ret.get(), filename);
70 |   return ret.release();
71 | }
72 |
73 | } // namespace LightGBM
74 |
-------------------------------------------------------------------------------- /docs/development.md: --------------------------------------------------------------------------------
1 | Development Guide
2 | ==================
3 |
4 | Algorithms
5 | ----------
6 |
7 | Refer to [Features](https://github.com/Microsoft/LightGBM/wiki/Features) for the important algorithms used in LightGBM.
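One of those core pieces, histogram-based split finding, is sketched below for orientation. This is a simplified stand-in written for this guide (the `BinEntry`, `BuildHistogram` and `BestSplitBin` names are ours, and regularization terms are omitted), not the library's actual code:

```cpp
#include <vector>
#include <cstddef>

// Per-bin accumulated statistics for one feature.
struct BinEntry { double sum_gradients = 0.0; double sum_hessians = 0.0; };

// Accumulate gradient/hessian statistics into the bins of one feature.
std::vector<BinEntry> BuildHistogram(const std::vector<int>& bin_of_row,
                                     const std::vector<double>& gradients,
                                     const std::vector<double>& hessians,
                                     int num_bins) {
  std::vector<BinEntry> hist(num_bins);
  for (size_t i = 0; i < bin_of_row.size(); ++i) {
    hist[bin_of_row[i]].sum_gradients += gradients[i];
    hist[bin_of_row[i]].sum_hessians += hessians[i];
  }
  return hist;
}

// Scan bins left to right; the best split maximizes the standard gain
// formula GL^2/HL + GR^2/HR - G^2/H (regularization omitted).
int BestSplitBin(const std::vector<BinEntry>& hist) {
  double total_g = 0.0, total_h = 0.0;
  for (const auto& b : hist) { total_g += b.sum_gradients; total_h += b.sum_hessians; }
  double left_g = 0.0, left_h = 0.0, best_gain = 0.0;
  int best_bin = -1;
  for (int b = 0; b + 1 < static_cast<int>(hist.size()); ++b) {
    left_g += hist[b].sum_gradients;
    left_h += hist[b].sum_hessians;
    double right_g = total_g - left_g, right_h = total_h - left_h;
    if (left_h <= 0.0 || right_h <= 0.0) continue;
    double gain = left_g * left_g / left_h + right_g * right_g / right_h
                - total_g * total_g / total_h;
    if (gain > best_gain) { best_gain = gain; best_bin = b; }
  }
  return best_bin;  // -1 means no useful split was found
}
```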
8 |
9 | Classes And Code Structure
10 | --------------------------
11 |
12 | ### Important Classes
13 |
14 | | Class | Description |
15 | | ----- | --------- |
16 | | `Application` | The entry point of the application, including training and prediction logic |
17 | | `Bin` | Data structure used to store discretized feature values (converted from float values) |
18 | | `Boosting` | Boosting interface; current implementations are GBDT and DART |
19 | | `Config` | Stores parameters and configurations |
20 | | `Dataset` | Stores information of the dataset |
21 | | `DatasetLoader` | Used to construct the dataset |
22 | | `Feature` | Stores one feature column |
23 | | `Metric` | Evaluation metrics |
24 | | `Network` | Network interfaces and communication algorithms |
25 | | `ObjectiveFunction` | Objective functions used for training |
26 | | `Tree` | Stores information of a tree model |
27 | | `TreeLearner` | Used to learn trees |
28 |
29 | ### Code Structure
30 |
31 | | Path | Description |
32 | | ----- | --------- |
33 | | ./include | header files |
34 | | ./include/utils | some common functions |
35 | | ./src/application | Implementations of training and prediction logic |
36 | | ./src/boosting | Implementations of Boosting |
37 | | ./src/io | Implementations of IO-related classes, including `Bin`, `Config`, `Dataset`, `DatasetLoader`, `Feature` and `Tree` |
38 | | ./src/metric | Implementations of metrics |
39 | | ./src/network | Implementations of network functions |
40 | | ./src/objective | Implementations of objective functions |
41 | | ./src/treelearner | Implementations of tree learners |
42 |
43 | ### API Documents
44 |
45 | LightGBM supports using [doxygen](http://www.stack.nl/~dimitri/doxygen/) to generate documentation for classes and functions.
46 |
47 | C API
48 | -----
49 | Refer to the comments in [c_api.h](https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/c_api.h).
50 |
51 | High-level Language Package
52 | ---------------------------
53 |
54 | Follow the implementation of [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package/lightgbm).
55 |
56 | Ask Questions
57 | -------------
58 | Feel free to open [issues](https://github.com/Microsoft/LightGBM/issues) if you run into problems.
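As a rough illustration of how the classes above cooperate during training, here is a pseudo-C++ sketch. The factory and training signatures follow the headers reproduced in this repository; `io_config`, `tree_config`, `objective_config`, `num_iterations`, the accessors `data->metadata()`/`data->num_data()`, and the literal values `"serial"`/`"cpu"`/`"binary"` are assumed surrounding context, and the real control flow lives in `Application` and the `Boosting` implementations:

```cpp
// Illustrative wiring of DatasetLoader / ObjectiveFunction / TreeLearner.
// A sketch under the assumptions above; error handling and score
// initialization are omitted.
PredictFunction no_predict;  // no initial model to boost from
DatasetLoader loader(io_config, no_predict, /*num_class=*/1, "train.txt");
std::unique_ptr<Dataset> data(loader.LoadFromFile("train.txt"));

std::unique_ptr<ObjectiveFunction> obj(
    ObjectiveFunction::CreateObjectiveFunction("binary", objective_config));
obj->Init(data->metadata(), data->num_data());

std::unique_ptr<TreeLearner> learner(
    TreeLearner::CreateTreeLearner("serial", "cpu", &tree_config));
learner->Init(data.get(), obj->IsConstantHessian());

std::vector<double> scores(data->num_data(), 0.0);
std::vector<score_t> gradients(data->num_data()), hessians(data->num_data());
for (int iter = 0; iter < num_iterations; ++iter) {
  // The objective turns current scores into first/second order derivatives,
  obj->GetGradients(scores.data(), gradients.data(), hessians.data());
  // the tree learner fits one tree to those derivatives,
  std::unique_ptr<Tree> tree(
      learner->Train(gradients.data(), hessians.data(), obj->IsConstantHessian()));
  // and the new tree's predictions are folded back into the scores.
  learner->AddPredictionToScore(tree.get(), scores.data());
}
```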
59 |
60 |
61 |
62 |
-------------------------------------------------------------------------------- /R-package/tests/testthat/test_dataset.R: --------------------------------------------------------------------------------
1 | require(lightgbm)
2 | require(Matrix)
3 |
4 | context("testing lgb.Dataset functionality")
5 |
6 | data(agaricus.test, package='lightgbm')
7 | test_data <- agaricus.test$data[1:100,]
8 | test_label <- agaricus.test$label[1:100]
9 |
10 | test_that("lgb.Dataset: basic construction, saving, loading", {
11 |   # from sparse matrix
12 |   dtest1 <- lgb.Dataset(test_data, label=test_label)
13 |   # from dense matrix
14 |   dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label)
15 |   expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
16 |
17 |   # save to a local file
18 |   tmp_file <- tempfile('lgb.Dataset_')
19 |   lgb.Dataset.save(dtest1, tmp_file)
20 |   # read from a local file
21 |   dtest3 <- lgb.Dataset(tmp_file)
22 |   lgb.Dataset.construct(dtest3)
23 |   unlink(tmp_file)
24 |   expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
25 | })
26 |
27 | test_that("lgb.Dataset: getinfo & setinfo", {
28 |   dtest <- lgb.Dataset(test_data)
29 |   setinfo(dtest, 'label', test_label)
30 |   labels <- getinfo(dtest, 'label')
31 |   expect_equal(test_label, getinfo(dtest, 'label'))
32 |
33 |   expect_true(length(getinfo(dtest, 'weight')) == 0)
34 |   expect_true(length(getinfo(dtest, 'init_score')) == 0)
35 |
36 |   # any other label should error
37 |   expect_error(setinfo(dtest, 'asdf', test_label))
38 | })
39 |
40 | test_that("lgb.Dataset: slice, dim", {
41 |   dtest <- lgb.Dataset(test_data, label=test_label)
42 |   lgb.Dataset.construct(dtest)
43 |   expect_equal(dim(dtest), dim(test_data))
44 |   dsub1 <- slice(dtest, 1:42)
45 |   lgb.Dataset.construct(dsub1)
46 |   expect_equal(nrow(dsub1), 42)
47 |   expect_equal(ncol(dsub1), ncol(test_data))
48 | })
49 |
50 | test_that("lgb.Dataset: colnames", {
51 |   dtest <- lgb.Dataset(test_data, label=test_label)
52 |   expect_equal(colnames(dtest), colnames(test_data))
53 |   lgb.Dataset.construct(dtest)
54 |   expect_equal(colnames(dtest), colnames(test_data))
55 |   expect_error( colnames(dtest) <- 'asdf')
56 |   new_names <- make.names(1:ncol(test_data))
57 |   expect_silent(colnames(dtest) <- new_names)
58 |   expect_equal(colnames(dtest), new_names)
59 | })
60 |
61 | test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
62 |   nr <- 1000
63 |   x <- rsparsematrix(nr, 100, density=0.0005)
64 |   # we want it very sparse, so that last rows are empty
65 |   expect_lt(max(x@i), nr)
66 |   dtest <- lgb.Dataset(x)
67 |   expect_equal(dim(dtest), dim(x))
68 | })
69 |
-------------------------------------------------------------------------------- /R-package/R/lgb.importance.R: --------------------------------------------------------------------------------
1 | #' Compute feature importance in a model
2 | #'
3 | #' Creates a \code{data.table} of feature importances in a model.
4 | #'
5 | #' @param model object of class \code{lgb.Booster}.
6 | #' @param percentage whether to show importance in relative percentage.
7 | #'
8 | #' @return
9 | #'
10 | #' For a tree model, a \code{data.table} with the following columns:
11 | #' \itemize{
12 | #'   \item \code{Feature} Feature names in the model.
13 | #'   \item \code{Gain} The total gain of this feature's splits.
14 | #'   \item \code{Cover} The number of observations related to this feature.
15 | #'   \item \code{Frequency} The number of times a feature is split on in trees.
16 | #' } 17 | #' 18 | #' @examples 19 | #' \dontrun{ 20 | #' library(lightgbm) 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' model <- lgb.train(params, dtrain, 20) 30 | #' 31 | #' tree_imp1 <- lgb.importance(model, percentage = TRUE) 32 | #' tree_imp2 <- lgb.importance(model, percentage = FALSE) 33 | #' } 34 | #' 35 | #' @importFrom magrittr %>% %T>% 36 | #' @importFrom data.table := 37 | #' @export 38 | lgb.importance <- function(model, percentage = TRUE) { 39 | 40 | # Check if model is a lightgbm model 41 | if (!any(class(model) == "lgb.Booster")) { 42 | stop("'model' has to be an object of class lgb.Booster") 43 | } 44 | 45 | # Setup importance 46 | tree_dt <- lgb.model.dt.tree(model) 47 | 48 | # Extract elements 49 | tree_imp <- tree_dt %>% 50 | magrittr::extract(., 51 | i = is.na(split_index) == FALSE, 52 | j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N), 53 | by = "split_feature") %T>% 54 | data.table::setnames(., old = "split_feature", new = "Feature") %>% 55 | magrittr::extract(., i = order(Gain, decreasing = TRUE)) 56 | 57 | # Check if relative values are requested 58 | if (percentage) { 59 | tree_imp[, ":="(Gain = Gain / sum(Gain), 60 | Cover = Cover / sum(Cover), 61 | Frequency = Frequency / sum(Frequency))] 62 | } 63 | 64 | # Return importance table 65 | return(tree_imp) 66 | 67 | } 68 | -------------------------------------------------------------------------------- /python-package/lightgbm/compat.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = C0103 3 | """Compatibility""" 4 | from __future__ import absolute_import 5 | 6 | import inspect 7 | import sys 8 | 9 | import numpy as np 10 | 11 | is_py3 = (sys.version_info[0] == 3) 12 | 13 | """compatibility between python2 and python3""" 14 | if is_py3: 15 | string_type = str 16 | numeric_types = (int, float, bool) 17 | integer_types = (int, ) 18 | range_ = range 19 | 20 | def argc_(func): 21 | """return number of arguments of a function""" 22 | return len(inspect.signature(func).parameters) 23 | else: 24 | string_type = basestring 25 | numeric_types = (int, long, float, bool) 26 | integer_types = (int, long) 27 | range_ = xrange 28 | 29 | def argc_(func): 30 | """return number of arguments of a function""" 31 | return len(inspect.getargspec(func).args) 32 | 33 | """json""" 34 | try: 35 | import simplejson as json 36 | except (ImportError, SyntaxError): 37 | # simplejson does not support Python 3.2, it throws a SyntaxError 38 | # because of u'...' Unicode literals. 
39 |     import json
40 |
41 |
42 | def json_default_with_numpy(obj):
43 |     if isinstance(obj, (np.integer, np.floating, np.bool_)):
44 |         return obj.item()
45 |     elif isinstance(obj, np.ndarray):
46 |         return obj.tolist()
47 |     else:
48 |         return obj
49 |
50 |
51 | """pandas"""
52 | try:
53 |     from pandas import Series, DataFrame
54 | except ImportError:
55 |     class Series(object):
56 |         pass
57 |
58 |     class DataFrame(object):
59 |         pass
60 |
61 | """sklearn"""
62 | try:
63 |     from sklearn.base import BaseEstimator
64 |     from sklearn.base import RegressorMixin, ClassifierMixin
65 |     from sklearn.preprocessing import LabelEncoder
66 |     from sklearn.utils import deprecated
67 |     try:
68 |         from sklearn.model_selection import StratifiedKFold
69 |     except ImportError:
70 |         from sklearn.cross_validation import StratifiedKFold
71 |     SKLEARN_INSTALLED = True
72 |     LGBMModelBase = BaseEstimator
73 |     LGBMRegressorBase = RegressorMixin
74 |     LGBMClassifierBase = ClassifierMixin
75 |     LGBMLabelEncoder = LabelEncoder
76 |     LGBMDeprecated = deprecated
77 |     LGBMStratifiedKFold = StratifiedKFold
78 | except ImportError:
79 |     SKLEARN_INSTALLED = False
80 |     LGBMModelBase = object
81 |     LGBMClassifierBase = object
82 |     LGBMRegressorBase = object
83 |     LGBMLabelEncoder = None
84 |     LGBMStratifiedKFold = None
85 |
-------------------------------------------------------------------------------- /R-package/man/predict.lgb.Booster.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{predict.lgb.Booster}
4 | \alias{predict.lgb.Booster}
5 | \title{Predict method for LightGBM model}
6 | \usage{
7 | \method{predict}{lgb.Booster}(object, data, num_iteration = NULL,
8 |   rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE)
9 | }
10 | \arguments{
11 | \item{object}{Object of class \code{lgb.Booster}}
12 |
13 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
14 |
15 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
16 |
17 | \item{rawscore}{whether the prediction should be returned in the form of the original untransformed
18 | sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
19 | logistic regression would result in predictions for log-odds instead of probabilities.}
20 |
21 | \item{predleaf}{whether to predict leaf indices instead.}
22 |
23 | \item{header}{only used for prediction from a text file. TRUE if the text file has a header}
24 |
25 | \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
26 | prediction outputs per case.}
27 | }
28 | \value{
29 | For regression or binary classification, it returns a vector of length \code{nrows(data)}.
30 | For multiclass classification, either a \code{num_class * nrows(data)} vector or
31 | a \code{(nrows(data), num_class)} dimension matrix is returned, depending on
32 | the \code{reshape} value.
33 |
34 | When \code{predleaf = TRUE}, the output is a matrix object with the
35 | number of columns corresponding to the number of trees.
36 | } 37 | \description{ 38 | Predicted values based on class \code{lgb.Booster} 39 | } 40 | \examples{ 41 | \dontrun{ 42 | library(lightgbm) 43 | data(agaricus.train, package = "lightgbm") 44 | train <- agaricus.train 45 | dtrain <- lgb.Dataset(train$data, label = train$label) 46 | data(agaricus.test, package = "lightgbm") 47 | test <- agaricus.test 48 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 49 | params <- list(objective = "regression", metric = "l2") 50 | valids <- list(test = dtest) 51 | model <- lgb.train(params, 52 | dtrain, 53 | 100, 54 | valids, 55 | min_data = 1, 56 | learning_rate = 1, 57 | early_stopping_rounds = 10) 58 | preds <- predict(model, test$data) 59 | } 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/objective/objective_function.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "regression_objective.hpp" 3 | #include "binary_objective.hpp" 4 | #include "rank_objective.hpp" 5 | #include "multiclass_objective.hpp" 6 | 7 | namespace LightGBM { 8 | 9 | ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const ObjectiveConfig& config) { 10 | if (type == std::string("regression") || type == std::string("regression_l2") 11 | || type == std::string("mean_squared_error") || type == std::string("mse")) { 12 | return new RegressionL2loss(config); 13 | } else if (type == std::string("regression_l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) { 14 | return new RegressionL1loss(config); 15 | } else if (type == std::string("huber")) { 16 | return new RegressionHuberLoss(config); 17 | } else if (type == std::string("fair")) { 18 | return new RegressionFairLoss(config); 19 | } else if (type == std::string("poisson")) { 20 | return new RegressionPoissonLoss(config); 21 | } else if (type == std::string("binary")) { 22 | return new BinaryLogloss(config); 23 | } else if (type == std::string("lambdarank")) { 24 | return new LambdarankNDCG(config); 25 | } else if (type == std::string("multiclass")) { 26 | return new MulticlassSoftmax(config); 27 | } else if (type == std::string("multiclassova")) { 28 | return new MulticlassOVA(config); 29 | } 30 | return nullptr; 31 | } 32 | 33 | ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& str) { 34 | auto strs = Common::Split(str.c_str(), " "); 35 | auto type = strs[0]; 36 | if (type == std::string("regression")) { 37 | return new RegressionL2loss(strs); 38 | } else if (type == std::string("regression_l1")) { 39 | return new RegressionL1loss(strs); 40 | } else if (type == std::string("huber")) { 41 | return new RegressionHuberLoss(strs); 42 | } else if (type == std::string("fair")) { 43 | return new RegressionFairLoss(strs); 44 | } else if (type == std::string("poisson")) { 45 | return new RegressionPoissonLoss(strs); 46 | } else if (type == std::string("binary")) { 47 | return new BinaryLogloss(strs); 48 | } else if (type == std::string("lambdarank")) { 49 | return new LambdarankNDCG(strs); 50 | } else if (type == std::string("multiclass")) { 51 | return new MulticlassSoftmax(strs); 52 | } else if (type == std::string("multiclassova")) { 53 | return new MulticlassOVA(strs); 54 | } 55 | return nullptr; 56 | } 57 | 58 | } // namespace LightGBM 59 | -------------------------------------------------------------------------------- /R-package/demo/multiclass.R: 
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 |
3 | # We load the default iris dataset shipped with R
4 | data(iris)
5 |
6 | # We must convert factors to numeric
7 | # They must start from 0 to use multiclass
8 | # For instance: 0, 1, 2, 3, 4, 5...
9 | iris$Species <- as.numeric(as.factor(iris$Species)) - 1
10 |
11 | # We cut the data set into 80% train and 20% validation
12 | # The 10 last samples of each class are for validation
13 |
14 | train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
15 | test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
16 | dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
17 | dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
18 | valids <- list(test = dtest)
19 |
20 | # Method 1 of training
21 | params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
22 | model <- lgb.train(params,
23 |                    dtrain,
24 |                    100,
25 |                    valids,
26 |                    min_data = 1,
27 |                    learning_rate = 1,
28 |                    early_stopping_rounds = 10)
29 |
30 | # We can predict on test data, outputs a 90-length vector
31 | # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
32 | my_preds <- predict(model, test[, 1:4])
33 |
34 | # Method 2 of training, identical
35 | model <- lgb.train(list(),
36 |                    dtrain,
37 |                    100,
38 |                    valids,
39 |                    min_data = 1,
40 |                    learning_rate = 1,
41 |                    early_stopping_rounds = 10,
42 |                    objective = "multiclass",
43 |                    metric = "multi_error",
44 |                    num_class = 3)
45 |
46 | # We can predict on test data, identical
47 | my_preds <- predict(model, test[, 1:4])
48 |
49 | # A (30x3) matrix with the predictions, use parameter reshape
50 | # class1 class2 class3
51 | #   obs1   obs1   obs1
52 | #   obs2   obs2   obs2
53 | #   ....   ....   ....
54 | my_preds <- predict(model, test[, 1:4], reshape = TRUE)
55 |
56 | # We can also get the predicted scores before the Sigmoid/Softmax application
57 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE)
58 |
59 | # Raw score predictions as matrix instead of vector
60 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
61 |
62 | # We can also get the leaf index
63 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
64 |
65 | # Predict leaf index as matrix instead of vector
66 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
67 |
-------------------------------------------------------------------------------- /R-package/R/lgb.unloader.R: --------------------------------------------------------------------------------
1 | #' LightGBM unloading error fix
2 | #'
3 | #' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
4 | #'
5 | #' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE}, which means automatically reload \code{LightGBM} once unloading is performed.
6 | #' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.
7 | #' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.
8 | #'
9 | #' @return NULL invisibly.
10 | #'
11 | #' @examples
12 | #' \dontrun{
13 | #' library(lightgbm)
14 | #' data(agaricus.train, package = "lightgbm")
15 | #' train <- agaricus.train
16 | #' dtrain <- lgb.Dataset(train$data, label = train$label)
17 | #' data(agaricus.test, package = "lightgbm")
18 | #' test <- agaricus.test
19 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
20 | #' params <- list(objective = "regression", metric = "l2")
21 | #' valids <- list(test = dtest)
22 | #' model <- lgb.train(params,
23 | #'                    dtrain,
24 | #'                    100,
25 | #'                    valids,
26 | #'                    min_data = 1,
27 | #'                    learning_rate = 1,
28 | #'                    early_stopping_rounds = 10)
29 | #' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
30 | #' rm(model, dtrain, dtest) # Not needed if wipe = TRUE
31 | #' gc() # Not needed if wipe = TRUE
32 | #'
33 | #' library(lightgbm)
34 | #' # Do whatever you want again with LightGBM without object clashing
35 | #' }
36 | #'
37 | #' @export
38 | lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
39 |
40 |   # Unload package
41 |   try(detach("package:lightgbm", unload = TRUE), silent = TRUE)
42 |
43 |   # Should we wipe variables? (lgb.Booster, lgb.Dataset)
44 |   if (wipe) {
45 |     rm(list = ls(envir = envir)[which(sapply(ls(.GlobalEnv), function(x) {"lgb.Booster" %in% class(get(x, envir = envir))}))], envir = envir)
46 |     rm(list = ls(envir = envir)[which(sapply(ls(.GlobalEnv), function(x) {"lgb.Dataset" %in% class(get(x, envir = envir))}))], envir = envir)
47 |     gc(verbose = FALSE)
48 |   }
49 |
50 |   # Load package back?
51 |   if (restore) {
52 |     library(lightgbm)
53 |   }
54 |
55 |   invisible()
56 |
57 | }
58 |
-------------------------------------------------------------------------------- /include/LightGBM/application.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_APPLICATION_H_
2 | #define LIGHTGBM_APPLICATION_H_
3 |
4 | #include <LightGBM/meta.h>
5 | #include <LightGBM/config.h>
6 |
7 | #include <vector>
8 | #include <memory>
9 |
10 | namespace LightGBM {
11 |
12 | class DatasetLoader;
13 | class Dataset;
14 | class Boosting;
15 | class ObjectiveFunction;
16 | class Metric;
17 |
18 | /*!
19 | * \brief The main entry point of LightGBM. This application has two tasks:
20 | *        Train and Predict.
21 | *        Train task will train a new model.
22 | *        Predict task will predict the scores of test data using an existing model,
23 | *        and save the scores to disk.
24 | */
25 | class Application {
26 | public:
27 |   Application(int argc, char** argv);
28 |
29 |   /*! \brief Destructor */
30 |   ~Application();
31 |
32 |   /*! \brief Call this function to run the application */
33 |   inline void Run();
34 |
35 | private:
36 |   /*!
37 |   * \brief Global sync by minimum; returns the minimal T across nodes
38 |   * \param local Local data
39 |   * \return Minimal value across nodes
40 |   */
41 |   template <typename T>
42 |   T GlobalSyncUpByMin(T& local);
43 |
44 |   /*! \brief Load parameters from command line and config file*/
45 |   void LoadParameters(int argc, char** argv);
46 |
47 |   /*! \brief Load data, including training data and validation data*/
48 |   void LoadData();
49 |
50 |   /*! \brief Initialization before training*/
51 |   void InitTrain();
52 |
53 |   /*! \brief Main Training logic */
54 |   void Train();
55 |
56 |   /*! \brief Initializations before prediction */
57 |   void InitPredict();
58 |
59 |   /*! \brief Main predicting logic */
60 |   void Predict();
61 |
62 |   /*! \brief All configs */
63 |   OverallConfig config_;
64 |   /*! \brief Training data */
65 |   std::unique_ptr<Dataset> train_data_;
66 |   /*!
\brief Validation data */
67 |   std::vector<std::unique_ptr<Dataset>> valid_datas_;
68 |   /*! \brief Metric for training data */
69 |   std::vector<std::unique_ptr<Metric>> train_metric_;
70 |   /*! \brief Metrics for validation data */
71 |   std::vector<std::vector<std::unique_ptr<Metric>>> valid_metrics_;
72 |   /*! \brief Boosting object */
73 |   std::unique_ptr<Boosting> boosting_;
74 |   /*! \brief Training objective function */
75 |   std::unique_ptr<ObjectiveFunction> objective_fun_;
76 | };
77 |
78 |
79 | inline void Application::Run() {
80 |   if (config_.task_type == TaskType::kPredict) {
81 |     InitPredict();
82 |     Predict();
83 |   } else {
84 |     InitTrain();
85 |     Train();
86 |   }
87 | }
88 |
89 | } // namespace LightGBM
90 |
91 | #endif // LIGHTGBM_APPLICATION_H_
92 |
-------------------------------------------------------------------------------- /R-package/README.md: --------------------------------------------------------------------------------
1 | LightGBM R Package
2 | ==================
3 |
4 | Installation
5 | ------------
6 |
7 | Windows users may need to run with administrator rights (either R or the command prompt, depending on the way you are installing this package). Rtools must be installed for Windows. Linux users might require the appropriate user write permissions for packages.
8 |
9 | You can use a command prompt to install via command line:
10 |
11 | ```
12 | cd R-package
13 | R CMD INSTALL --build .
14 | ```
15 |
16 | You can also install directly from R using the repository with `devtools`:
17 |
18 | ```r
19 | devtools::install_github("Microsoft/LightGBM", subdir = "R-package")
20 | ```
21 |
22 | For the `devtools` install scenario, you can safely ignore this message:
23 |
24 | ```r
25 | Warning message:
26 | GitHub repo contains submodules, may not function as expected!
27 | ```
28 |
29 | If you want to build the self-contained R package, you can run ```unix_build_package.sh``` (for UNIX) or ```win_build_package.cmd``` (for Windows). Then use ```R CMD INSTALL lightgbm_0.1.tar.gz``` to install.
30 |
31 | When your package installation is done, you can check quickly if your LightGBM R package is working by running the following:
32 |
33 | ```r
34 | library(lightgbm)
35 | data(agaricus.train, package='lightgbm')
36 | train <- agaricus.train
37 | dtrain <- lgb.Dataset(train$data, label=train$label)
38 | params <- list(objective="regression", metric="l2")
39 | model <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
40 | ```
41 | ### OSX installation
42 |
43 | The default installation cannot successfully complete on OSX because the default clang does not support OpenMP.
44 |
45 | You can use the following script to change the default compiler to gcc and then compile the LightGBM R package:
46 |
47 | ```bash
48 | brew install gcc --without-multilib
49 | mkdir -p ~/.R
50 | touch ~/.R/Makevars
51 | cat <<EOF >> ~/.R/Makevars
52 | CC=gcc-6
53 | CXX=g++-6
54 | CXX1X=g++-6
55 | LDFLAGS=-L/usr/local/Cellar/gcc/6.3.0/lib
56 | CPPFLAGS=-I/usr/local/Cellar/gcc/6.3.0/include
57 | SHLIB_OPENMP_CFLAGS = -fopenmp
58 | SHLIB_OPENMP_CXXFLAGS = -fopenmp
59 | SHLIB_OPENMP_FCFLAGS = -fopenmp
60 | SHLIB_OPENMP_FFLAGS = -fopenmp
61 | EOF
62 | ```
63 |
64 | Note: for `LDFLAGS=-L/usr/local/Cellar/gcc/6.3.0/lib` and `CPPFLAGS=-I/usr/local/Cellar/gcc/6.3.0/include`, you may need to change `6.3.0` to your gcc version.
65 |
66 | To check your LightGBM installation, the test is identical to the Linux/Windows versions (see the test provided just before the OSX installation section).
67 |
68 | Examples
69 | ------------
70 |
71 | * Please visit [demo](demo).
72 |
-------------------------------------------------------------------------------- /include/LightGBM/tree_learner.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_TREE_LEARNER_H_
2 | #define LIGHTGBM_TREE_LEARNER_H_
3 |
4 |
5 | #include <LightGBM/meta.h>
6 | #include <LightGBM/config.h>
7 |
8 | #include <string>
9 |
10 | namespace LightGBM {
11 |
12 | /*! \brief forward declaration */
13 | class Tree;
14 | class Dataset;
15 |
16 | /*!
17 | * \brief Interface for tree learner
18 | */
19 | class TreeLearner {
20 | public:
21 |   /*! \brief virtual destructor */
22 |   virtual ~TreeLearner() {}
23 |
24 |   /*!
25 |   * \brief Initialize tree learner with training dataset
26 |   * \param train_data The used training data
27 |   * \param is_constant_hessian True if all hessians share the same value
28 |   */
29 |   virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0;
30 |
31 |   virtual void ResetTrainingData(const Dataset* train_data) = 0;
32 |
33 |   /*!
34 |   * \brief Reset tree configs
35 |   * \param tree_config config of tree
36 |   */
37 |   virtual void ResetConfig(const TreeConfig* tree_config) = 0;
38 |
39 |   /*!
40 |   * \brief Train a tree model on the dataset
41 |   * \param gradients The first order derivatives
42 |   * \param hessians The second order derivatives
43 |   * \param is_constant_hessian True if all hessians share the same value
44 |   * \return A trained tree
45 |   */
46 |   virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian) = 0;
47 |
48 |   /*!
49 |   * \brief Use an existing tree to fit the new gradients and hessians.
50 |   */
51 |   virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0;
52 |
53 |   /*!
54 |   * \brief Set bagging data
55 |   * \param used_indices Used data indices
56 |   * \param num_data Number of used data
57 |   */
58 |   virtual void SetBaggingData(const data_size_t* used_indices,
59 |     data_size_t num_data) = 0;
60 |
61 |   /*!
62 |   * \brief Use the last trained tree to predict scores, then add them to out_score
63 |   * \param out_score output score
64 |   */
65 |   virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0;
66 |
67 |   TreeLearner() = default;
68 |   /*! \brief Disable copy */
69 |   TreeLearner& operator=(const TreeLearner&) = delete;
70 |   /*! \brief Disable copy */
71 |   TreeLearner(const TreeLearner&) = delete;
72 |
73 |   /*!
74 |   * \brief Create object of tree learner
75 |   * \param learner_type Type of tree learner
76 |   * \param device_type Type of device
77 |   * \param tree_config config of tree
78 |   */
79 |   static TreeLearner* CreateTreeLearner(const std::string& learner_type,
80 |     const std::string& device_type,
81 |     const TreeConfig* tree_config);
82 | };
83 |
84 | } // namespace LightGBM
85 |
86 | #endif // LIGHTGBM_TREE_LEARNER_H_
87 |
-------------------------------------------------------------------------------- /R-package/R/lgb.plot.importance.R: --------------------------------------------------------------------------------
1 | #' Plot feature importance as a bar graph
2 | #'
3 | #' Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.
4 | #'
5 | #' @param tree_imp a \code{data.table} returned by \code{\link{lgb.importance}}.
6 | #' @param top_n maximal number of top features to include in the plot.
7 | #' @param measure the name of importance measure to plot, can be "Gain", "Cover" or "Frequency".
8 | #' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names. 9 | #' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}. 10 | #' 11 | #' @details 12 | #' The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. 13 | #' Features are shown ranked in a decreasing importance order. 14 | #' 15 | #' @return 16 | #' The \code{lgb.plot.importance} function creates a \code{barplot} 17 | #' and silently returns a processed data.table with \code{top_n} features sorted by defined importance. 18 | #' 19 | #' @examples 20 | #' \dontrun{ 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' model <- lgb.train(params, dtrain, 20) 30 | #' 31 | #' tree_imp <- lgb.importance(model, percentage = TRUE) 32 | #' lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain") 33 | #' } 34 | #' 35 | #' @export 36 | lgb.plot.importance <- function(tree_imp, 37 | top_n = 10, 38 | measure = "Gain", 39 | left_margin = 10, 40 | cex = NULL) { 41 | 42 | # Check for measurement (column names) correctness 43 | measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE) 44 | 45 | # Get top N importance (defaults to 10) 46 | top_n <- min(top_n, nrow(tree_imp)) 47 | 48 | # Parse importance 49 | tree_imp <- tree_imp[order(abs(get(measure)), decreasing = TRUE),][1:top_n,] 50 | 51 | # Attempt to setup a correct cex 52 | if (is.null(cex)) { 53 | cex <- 2.5 / log2(1 + top_n) 54 | } 55 | 56 | # Refresh plot 57 | op <- par(no.readonly = TRUE) 58 | on.exit(par(op)) 59 | 60 | # Do some magic plotting 61 | par(mar = op$mar %>% magrittr::inset(., 2, left_margin)) 62 | 63 | # Do plot 64 | tree_imp[.N:1, 65 | barplot(height = get(measure), 66 | names.arg = Feature, 67 | horiz = TRUE, 68 | border = NA, 69 | main = "Feature Importance", 70 | xlab = measure, 71 | cex.names = cex, 72 | las = 1)] 73 | 74 | # Return invisibly 75 | invisible(tree_imp) 76 | 77 | } 78 | -------------------------------------------------------------------------------- /include/LightGBM/utils/log.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_LOG_H_ 2 | #define LIGHTGBM_UTILS_LOG_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace LightGBM { 13 | 14 | 15 | #ifndef CHECK 16 | #define CHECK(condition) \ 17 | if (!(condition)) Log::Fatal("Check failed: " #condition \ 18 | " at %s, line %d .\n", __FILE__, __LINE__); 19 | #endif 20 | 21 | #ifndef CHECK_NOTNULL 22 | #define CHECK_NOTNULL(pointer) \ 23 | if ((pointer) == nullptr) LightGBM::Log::Fatal(#pointer " Can't be NULL at %s, line %d .\n", __FILE__, __LINE__); 24 | #endif 25 | 26 | 27 | enum class LogLevel: int { 28 | Fatal = -1, 29 | Warning = 0, 30 | Info = 1, 31 | Debug = 2, 32 | }; 33 | 34 | 35 | /*! 36 | * \brief A static Log class 37 | */ 38 | class Log { 39 | public: 40 | /*! 41 | * \brief Resets the minimal log level. It is INFO by default. 42 | * \param level The new minimal log level. 
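 *
 * Usage sketch (illustrative; `pool_size` is a stand-in variable of our own):
 *
 *   Log::ResetLogLevel(LogLevel::Debug);  // also emit Debug-level messages
 *   Log::Debug("histogram pool size: %d", pool_size);
 *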
43 | */ 44 | static void ResetLogLevel(LogLevel level) { 45 | GetLevel() = level; 46 | } 47 | 48 | static void Debug(const char *format, ...) { 49 | va_list val; 50 | va_start(val, format); 51 | Write(LogLevel::Debug, "Debug", format, val); 52 | va_end(val); 53 | } 54 | static void Info(const char *format, ...) { 55 | va_list val; 56 | va_start(val, format); 57 | Write(LogLevel::Info, "Info", format, val); 58 | va_end(val); 59 | } 60 | static void Warning(const char *format, ...) { 61 | va_list val; 62 | va_start(val, format); 63 | Write(LogLevel::Warning, "Warning", format, val); 64 | va_end(val); 65 | } 66 | static void Fatal(const char *format, ...) { 67 | va_list val; 68 | char str_buf[1024]; 69 | va_start(val, format); 70 | #ifdef _MSC_VER 71 | vsprintf_s(str_buf, format, val); 72 | #else 73 | vsprintf(str_buf, format, val); 74 | #endif 75 | va_end(val); 76 | fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf); 77 | fflush(stderr); 78 | throw std::runtime_error(std::string(str_buf)); 79 | } 80 | 81 | private: 82 | 83 | static void Write(LogLevel level, const char* level_str, const char *format, va_list val) { 84 | if (level <= GetLevel()) { // omit the message with low level 85 | // write to STDOUT 86 | printf("[LightGBM] [%s] ", level_str); 87 | vprintf(format, val); 88 | printf("\n"); 89 | fflush(stdout); 90 | } 91 | } 92 | 93 | // a trick to use static variable in header file. 94 | // May be not good, but avoid to use an additional cpp file 95 | #if defined(_MSC_VER) 96 | static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; } 97 | #else 98 | static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; } 99 | #endif 100 | 101 | }; 102 | 103 | } // namespace LightGBM 104 | #endif // LightGBM_UTILS_LOG_H_ 105 | -------------------------------------------------------------------------------- /R-package/R/saveRDS.lgb.Booster.R: -------------------------------------------------------------------------------- 1 | #' saveRDS for lgb.Booster models 2 | #' 3 | #' Attemps to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 4 | #' 5 | #' @param object R object to serialize. 6 | #' @param file a connection or the name of the file where the R object is saved to or read from. 7 | #' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save. 8 | #' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions. 9 | #' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection. 10 | #' @param refhook a hook function for handling reference objects. 11 | #' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. 12 | #' 13 | #' @return NULL invisibly. 
14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' library(lightgbm) 18 | #' data(agaricus.train, package = "lightgbm") 19 | #' train <- agaricus.train 20 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 21 | #' data(agaricus.test, package = "lightgbm") 22 | #' test <- agaricus.test 23 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 24 | #' params <- list(objective = "regression", metric = "l2") 25 | #' valids <- list(test = dtest) 26 | #' model <- lgb.train(params, 27 | #' dtrain, 28 | #' 100, 29 | #' valids, 30 | #' min_data = 1, 31 | #' learning_rate = 1, 32 | #' early_stopping_rounds = 10) 33 | #' saveRDS.lgb.Booster(model, "model.rds") 34 | #' } 35 | #' 36 | #' @export 37 | saveRDS.lgb.Booster <- function(object, 38 | file = "", 39 | ascii = FALSE, 40 | version = NULL, 41 | compress = TRUE, 42 | refhook = NULL, 43 | raw = TRUE) { 44 | 45 | # Check if object has a raw value (and if the user wants to store the raw) 46 | if (is.na(object$raw) & (raw)) { 47 | 48 | # Save model 49 | object$save() 50 | 51 | # Save RDS 52 | saveRDS(object, 53 | file = file, 54 | ascii = ascii, 55 | version = version, 56 | compress = compress, 57 | refhook = refhook) 58 | 59 | # Free model from memory 60 | object$raw <- NA 61 | 62 | } else { 63 | 64 | # Save as usual 65 | saveRDS(object, 66 | file = file, 67 | ascii = ascii, 68 | version = version, 69 | compress = compress, 70 | refhook = refhook) 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: trusty 4 | 5 | before_install: 6 | - test -n $CC && unset CC 7 | - test -n $CXX && unset CXX 8 | - wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 9 | - chmod +x conda.sh 10 | - bash conda.sh -b -p $HOME/miniconda 11 | - export PATH="$HOME/miniconda/bin:$PATH" 12 | - conda config --set always_yes yes --set changeps1 no 13 | - conda update -q conda 14 | - sudo add-apt-repository ppa:george-edison55/cmake-3.x -y 15 | - sudo apt-get update -q 16 | - bash .travis/amd_sdk.sh; 17 | - tar -xjf AMD-SDK.tar.bz2; 18 | - AMDAPPSDK=${HOME}/AMDAPPSDK; 19 | - export OPENCL_VENDOR_PATH=${AMDAPPSDK}/etc/OpenCL/vendors; 20 | - mkdir -p ${OPENCL_VENDOR_PATH}; 21 | - sh AMD-APP-SDK*.sh --tar -xf -C ${AMDAPPSDK}; 22 | - echo libamdocl64.so > ${OPENCL_VENDOR_PATH}/amdocl64.icd; 23 | - export LD_LIBRARY_PATH=${AMDAPPSDK}/lib/x86_64:${LD_LIBRARY_PATH}; 24 | - chmod +x ${AMDAPPSDK}/bin/x86_64/clinfo; 25 | - ${AMDAPPSDK}/bin/x86_64/clinfo; 26 | - export LIBRARY_PATH="$HOME/miniconda/lib:$LIBRARY_PATH" 27 | - export LD_RUN_PATH="$HOME/miniconda/lib:$LD_RUN_PATH" 28 | - export CPLUS_INCLUDE_PATH="$HOME/miniconda/include:$AMDAPPSDK/include/:$CPLUS_INCLUDE_PATH" 29 | 30 | install: 31 | - sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential 32 | - sudo apt-get install -y cmake 33 | - conda install --yes atlas numpy scipy scikit-learn pandas matplotlib 34 | - conda install --yes -c conda-forge boost=1.63.0 35 | - pip install pep8 36 | 37 | script: 38 | - cd $TRAVIS_BUILD_DIR 39 | - mkdir build && cd build && cmake .. 
&& make -j
40 |   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
41 |   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
42 |   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
43 |   - cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute .
44 |   - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON .. && make -j
45 |   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
46 |   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
47 |   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
48 |   - cd $TRAVIS_BUILD_DIR
49 |   - rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ ..
50 |   - sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h
51 |   - make -j$(nproc)
52 |   - sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h
53 |   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
54 |   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
55 |   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
56 |
57 | notifications:
58 |   email: false
59 |
60 | matrix:
61 |   include:
62 |     - compiler: gcc
63 |     - compiler: clang
64 |
-------------------------------------------------------------------------------- /include/LightGBM/utils/random.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_UTILS_RANDOM_H_
2 | #define LIGHTGBM_UTILS_RANDOM_H_
3 |
4 | #include <cstdint>
5 |
6 | #include <random>
7 | #include <vector>
8 |
9 | namespace LightGBM {
10 |
11 | /*!
12 | * \brief A wrapper for random generator
13 | */
14 | class Random {
15 | public:
16 |   /*!
17 |   * \brief Constructor, with random seed
18 |   */
19 |   Random() {
20 |     std::random_device rd;
21 |     auto generator = std::mt19937(rd());
22 |     std::uniform_int_distribution<int> distribution(0, x);
23 |     x = distribution(generator);
24 |   }
25 |   /*!
26 |   * \brief Constructor, with specific seed
27 |   */
28 |   Random(int seed) {
29 |     x = seed;
30 |   }
31 |   /*!
32 |   * \brief Generate random integer; the underlying generator yields values in [0, 32768)
33 |   * \param lower_bound lower bound
34 |   * \param upper_bound upper bound
35 |   * \return The random integer between [lower_bound, upper_bound)
36 |   */
37 |   inline int NextShort(int lower_bound, int upper_bound) {
38 |     return (RandInt16()) % (upper_bound - lower_bound) + lower_bound;
39 |   }
40 |
41 |   /*!
42 |   * \brief Generate random integer, int32 range
43 |   * \param lower_bound lower bound
44 |   * \param upper_bound upper bound
45 |   * \return The random integer between [lower_bound, upper_bound)
46 |   */
47 |   inline int NextInt(int lower_bound, int upper_bound) {
48 |     return (RandInt32()) % (upper_bound - lower_bound) + lower_bound;
49 |   }
50 |
51 |   /*!
52 |   * \brief Generate random float data
53 |   * \return The random float between [0.0, 1.0)
54 |   */
55 |   inline float NextFloat() {
56 |     // get random float in [0,1)
57 |     return static_cast<float>(RandInt16()) / (32768.0f);
58 |   }
59 |   /*!
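  * Usage sketch (illustrative only):
  *
  *   Random rng(42);
  *   std::vector<int> indices = rng.Sample(100, 10);  // 10 ordered draws from {0,...,99}
  *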
60 |   * \brief Sample K data from {0,1,...,N-1}
61 |   * \param N Size of the range to sample from
62 |   * \param K Number of elements to sample
63 |   * \return K Ordered sampled data from {0,1,...,N-1}
64 |   */
65 |   inline std::vector<int> Sample(int N, int K) {
66 |     std::vector<int> ret;
67 |     ret.reserve(K);
68 |     if (K > N || K < 0) {
69 |       return ret;
70 |     } else if (K == N) {
71 |       for (int i = 0; i < N; ++i) {
72 |         ret.push_back(i);
73 |       }
74 |     } else if (K > N / 2) {
75 |       for (int i = 0; i < N; ++i) {
76 |         double prob = (K - ret.size()) / static_cast<double>(N - i);
77 |         if (NextFloat() < prob) {
78 |           ret.push_back(i);
79 |         }
80 |       }
81 |     } else {
82 |       int min_step = 1;
83 |       int avg_step = N / K;
84 |       int max_step = 2 * avg_step - min_step;
85 |       int start = -1;
86 |       for (int i = 0; i < K; ++i) {
87 |         int step = NextShort(min_step, max_step + 1);
88 |         start += step;
89 |         if (start >= N) { break; }
90 |         ret.push_back(start);
91 |       }
92 |     }
93 |     return ret;
94 |   }
95 | private:
96 |   inline int RandInt16() {
97 |     x = (214013 * x + 2531011);
98 |     return (x >> 16) & 0x7FFF;
99 |   }
100 |
101 |   inline int RandInt32() {
102 |     x = (214013 * x + 2531011);
103 |     return x & 0x7FFFFFFF;
104 |   }
105 |
106 |   int x = 123456789;
107 | };
108 |
109 |
110 | } // namespace LightGBM
111 |
112 | #endif // LIGHTGBM_UTILS_RANDOM_H_
113 |
-------------------------------------------------------------------------------- /src/io/parser.hpp: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_IO_PARSER_HPP_
2 | #define LIGHTGBM_IO_PARSER_HPP_
3 |
4 | #include <LightGBM/utils/common.h>
5 | #include <LightGBM/utils/log.h>
6 |
7 | #include <LightGBM/dataset.h>
8 |
9 | #include <cmath>
10 | #include <utility>
11 | #include <vector>
12 |
13 | namespace LightGBM {
14 |
15 | class CSVParser: public Parser {
16 | public:
17 |   explicit CSVParser(int label_idx)
18 |     :label_idx_(label_idx) {
19 |   }
20 |   inline void ParseOneLine(const char* str,
21 |     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
22 |     int idx = 0;
23 |     double val = 0.0f;
24 |     int bias = 0;
25 |     *out_label = 0.0f;
26 |     while (*str != '\0') {
27 |       str = Common::Atof(str, &val);
28 |       if (idx == label_idx_) {
29 |         *out_label = val;
30 |         bias = -1;
31 |       }
32 |       else if (fabs(val) > 1e-10) {
33 |         out_features->emplace_back(idx + bias, val);
34 |       }
35 |       ++idx;
36 |       if (*str == ',') {
37 |         ++str;
38 |       } else if (*str != '\0') {
39 |         Log::Fatal("Input format error when parsing as CSV");
40 |       }
41 |     }
42 |   }
43 | private:
44 |   int label_idx_ = 0;
45 | };
46 |
47 | class TSVParser: public Parser {
48 | public:
49 |   explicit TSVParser(int label_idx)
50 |     :label_idx_(label_idx) {
51 |   }
52 |   inline void ParseOneLine(const char* str,
53 |     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
54 |     int idx = 0;
55 |     double val = 0.0f;
56 |     int bias = 0;
57 |     while (*str != '\0') {
58 |       str = Common::Atof(str, &val);
59 |       if (idx == label_idx_) {
60 |         *out_label = val;
61 |         bias = -1;
62 |       } else if (fabs(val) > 1e-10) {
63 |         out_features->emplace_back(idx + bias, val);
64 |       }
65 |       ++idx;
66 |       if (*str == '\t') {
67 |         ++str;
68 |       } else if (*str != '\0') {
69 |         Log::Fatal("Input format error when parsing as TSV");
70 |       }
71 |     }
72 |   }
73 | private:
74 |   int label_idx_ = 0;
75 | };
76 |
77 | class LibSVMParser: public Parser {
78 | public:
79 |   explicit LibSVMParser(int label_idx)
80 |     :label_idx_(label_idx) {
81 |     if (label_idx > 0) {
82 |       Log::Fatal("Label should be the first column in a LibSVM file");
83 |     }
84 |   }
85 |   inline void ParseOneLine(const char* str,
86 |     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
87 |     int idx = 0;
88 |     double val = 0.0f;
89 |     if
(label_idx_ == 0) { 90 | str = Common::Atof(str, &val); 91 | *out_label = val; 92 | str = Common::SkipSpaceAndTab(str); 93 | } 94 | while (*str != '\0') { 95 | str = Common::Atoi(str, &idx); 96 | str = Common::SkipSpaceAndTab(str); 97 | if (*str == ':') { 98 | ++str; 99 | str = Common::Atof(str, &val); 100 | out_features->emplace_back(idx, val); 101 | } else { 102 | Log::Fatal("Input format error when parsing as LibSVM"); 103 | } 104 | str = Common::SkipSpaceAndTab(str); 105 | } 106 | } 107 | private: 108 | int label_idx_ = 0; 109 | }; 110 | 111 | } // namespace LightGBM 112 | #endif // LightGBM_IO_PARSER_HPP_ 113 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_basic.R: -------------------------------------------------------------------------------- 1 | context("basic functions") 2 | 3 | data(agaricus.train, package='lightgbm') 4 | data(agaricus.test, package='lightgbm') 5 | train <- agaricus.train 6 | test <- agaricus.test 7 | 8 | windows_flag = grepl('Windows', Sys.info()[['sysname']]) 9 | 10 | test_that("train and predict binary classification", { 11 | nrounds = 10 12 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5, 13 | nrounds = nrounds, objective = "binary", metric="binary_error") 14 | expect_false(is.null(bst$record_evals)) 15 | record_results <- lgb.get.eval.result(bst, "train", "binary_error") 16 | expect_lt(min(record_results), 0.02) 17 | 18 | pred <- predict(bst, test$data) 19 | expect_equal(length(pred), 1611) 20 | 21 | pred1 <- predict(bst, train$data, num_iteration = 1) 22 | expect_equal(length(pred1), 6513) 23 | err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label) 24 | err_log <- record_results[1] 25 | expect_lt(abs(err_pred1 - err_log), 10e-6) 26 | }) 27 | 28 | 29 | test_that("train and predict softmax", { 30 | lb <- as.numeric(iris$Species) - 1 31 | 32 | bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb, 33 | num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20, 34 | objective = "multiclass", metric="multi_error", num_class=3) 35 | 36 | expect_false(is.null(bst$record_evals)) 37 | record_results <- lgb.get.eval.result(bst, "train", "multi_error") 38 | expect_lt(min(record_results), 0.03) 39 | 40 | pred <- predict(bst, as.matrix(iris[, -5])) 41 | expect_equal(length(pred), nrow(iris) * 3) 42 | }) 43 | 44 | 45 | test_that("use of multiple eval metrics works", { 46 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4, 47 | learning_rate=1, nrounds = 10, objective = "binary", 48 | metric = list("binary_error","auc","binary_logloss") ) 49 | expect_false(is.null(bst$record_evals)) 50 | }) 51 | 52 | 53 | test_that("training continuation works", { 54 | dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE) 55 | watchlist = list(train=dtrain) 56 | param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1) 57 | 58 | # for the reference, use 10 iterations at once: 59 | bst <- lgb.train(param, dtrain, nrounds = 10, watchlist) 60 | err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10) 61 | # first 5 iterations: 62 | bst1 <- lgb.train(param, dtrain, nrounds = 5, watchlist) 63 | # test continuing from a model in file 64 | lgb.save(bst1, "lightgbm.model") 65 | # continue for 5 more: 66 | bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = bst1) 67 | err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10) 68 | expect_lt(abs(err_bst - 
err_bst2), 0.01)
69 |
70 |   bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = "lightgbm.model")
71 |   err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10)
72 |   expect_lt(abs(err_bst - err_bst2), 0.01)
73 | })
74 |
75 |
76 | test_that("cv works", {
77 |   dtrain <- lgb.Dataset(train$data, label=train$label)
78 |   params <- list(objective="regression", metric="l2,l1")
79 |   bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
80 |   expect_false(is.null(bst$record_evals))
81 | })
82 |
-------------------------------------------------------------------------------- /include/LightGBM/dataset_loader.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_DATASET_LOADER_H_
2 | #define LIGHTGBM_DATASET_LOADER_H_
3 |
4 | #include <LightGBM/dataset.h>
5 |
6 | namespace LightGBM {
7 |
8 | class DatasetLoader {
9 | public:
10 |
11 |   LIGHTGBM_EXPORT DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
12 |
13 |   LIGHTGBM_EXPORT ~DatasetLoader();
14 |
15 |   LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
16 |
17 |   LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
18 |     return LoadFromFile(filename, 0, 1);
19 |   }
20 |
21 |   LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
22 |
23 |   LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values,
24 |     int** sample_indices, int num_col, const int* num_per_col,
25 |     size_t total_sample_size, data_size_t num_data);
26 |
27 |   /*! \brief Disable copy */
28 |   DatasetLoader& operator=(const DatasetLoader&) = delete;
29 |   /*! \brief Disable copy */
30 |   DatasetLoader(const DatasetLoader&) = delete;
31 |
32 | private:
33 |
34 |   Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
35 |
36 |   void SetHeader(const char* filename);
37 |
38 |   void CheckDataset(const Dataset* dataset);
39 |
40 |   std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
41 |
42 |   std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);
43 |
44 |   std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
45 |
46 |   void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);
47 |
48 |   /*! \brief Extract local features from memory */
49 |   void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset);
50 |
51 |   /*! \brief Extract local features from file */
52 |   void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
53 |
54 |   /*! \brief Check can load from binary file */
55 |   std::string CheckCanLoadFromBin(const char* filename);
56 |
57 |   const IOConfig& io_config_;
58 |   /*! \brief Random generator*/
59 |   Random random_;
60 |   /*! \brief prediction function for initial model */
61 |   const PredictFunction& predict_fun_;
62 |   /*! \brief number of classes */
63 |   int num_class_;
64 |   /*! \brief index of label column */
65 |   int label_idx_;
66 |   /*! \brief index of weight column */
67 |   int weight_idx_;
68 |   /*!
-------------------------------------------------------------------------------- /R-package/R/lightgbm.R: -------------------------------------------------------------------------------- 1 | #' Simple interface for training a LightGBM model. 2 | #' Its documentation is combined with lgb.train. 3 | #' 4 | #' @rdname lgb.train 5 | #' @export 6 | lightgbm <- function(data, 7 | label = NULL, 8 | weight = NULL, 9 | params = list(), 10 | nrounds = 10, 11 | verbose = 1, 12 | eval_freq = 1L, 13 | early_stopping_rounds = NULL, 14 | save_name = "lightgbm.model", 15 | init_model = NULL, 16 | callbacks = list(), 17 | ...) { 18 | 19 | # Set data to a temporary variable 20 | dtrain <- data 21 | 22 | # Check whether data is an lgb.Dataset; if not, create one 23 | if (!lgb.is.Dataset(dtrain)) { 24 | dtrain <- lgb.Dataset(data, label = label, weight = weight) 25 | } 26 | 27 | # Use the training data as the validation set 28 | valids <- list() 29 | if (verbose > 0) { 30 | valids$train = dtrain 31 | } 32 | 33 | # Train the model via lgb.train 34 | bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq, 35 | early_stopping_rounds = early_stopping_rounds, 36 | init_model = init_model, callbacks = callbacks, ...) 37 | 38 | # Save the model under the given file name 39 | bst$save_model(save_name) 40 | 41 | # Return booster 42 | return(bst) 43 | } 44 | 45 | #' Training part from Mushroom Data Set 46 | #' 47 | #' This data set is originally from the Mushroom data set, 48 | #' UCI Machine Learning Repository. 49 | #' 50 | #' This data set includes the following fields: 51 | #' 52 | #' \itemize{ 53 | #' \item \code{label} the label for each record 54 | #' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns. 55 | #' } 56 | #' 57 | #' @references 58 | #' https://archive.ics.uci.edu/ml/datasets/Mushroom 59 | #' 60 | #' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 61 | #' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 62 | #' School of Information and Computer Science. 63 | #' 64 | #' @docType data 65 | #' @keywords datasets 66 | #' @name agaricus.train 67 | #' @usage data(agaricus.train) 68 | #' @format A list containing a label vector, and a dgCMatrix object with 6513 69 | #' rows and 127 variables 70 | NULL 71 | 
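# Editor's sketch (hypothetical session, mirroring the calls exercised in
# tests/testthat/test_basic.R): training the wrapper above on the bundled
# agaricus data. Kept entirely in comments so the package source stays free
# of side effects.
#   library(lightgbm)
#   data(agaricus.train, package = "lightgbm")
#   bst <- lightgbm(data = agaricus.train$data, label = agaricus.train$label,
#                   num_leaves = 31, nrounds = 10, objective = "binary")
#   pred <- predict(bst, agaricus.train$data)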
72 | #' Test part from Mushroom Data Set 73 | #' 74 | #' This data set is originally from the Mushroom data set, 75 | #' UCI Machine Learning Repository. 76 | #' 77 | #' This data set includes the following fields: 78 | #' 79 | #' \itemize{ 80 | #' \item \code{label} the label for each record 81 | #' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns. 82 | #' } 83 | #' 84 | #' @references 85 | #' https://archive.ics.uci.edu/ml/datasets/Mushroom 86 | #' 87 | #' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 88 | #' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 89 | #' School of Information and Computer Science. 90 | #' 91 | #' @docType data 92 | #' @keywords datasets 93 | #' @name agaricus.test 94 | #' @usage data(agaricus.test) 95 | #' @format A list containing a label vector, and a dgCMatrix object with 1611 96 | #' rows and 126 variables 97 | NULL 98 | 99 | # Various imports 100 | #' @import methods 101 | #' @importFrom R6 R6Class 102 | #' @useDynLib lightgbm 103 | NULL 104 | -------------------------------------------------------------------------------- /src/treelearner/feature_parallel_tree_learner.cpp: -------------------------------------------------------------------------------- 1 | #include "parallel_tree_learner.h" 2 | 3 | #include <cstring> 4 | 5 | #include <vector> 6 | 7 | namespace LightGBM { 8 | 9 | 10 | template <typename TREELEARNER_T> 11 | FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const TreeConfig* tree_config) 12 | :TREELEARNER_T(tree_config) { 13 | } 14 | 15 | template <typename TREELEARNER_T> 16 | FeatureParallelTreeLearner<TREELEARNER_T>::~FeatureParallelTreeLearner() { 17 | 18 | } 19 | 20 | template <typename TREELEARNER_T> 21 | void FeatureParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, bool is_constant_hessian) { 22 | TREELEARNER_T::Init(train_data, is_constant_hessian); 23 | rank_ = Network::rank(); 24 | num_machines_ = Network::num_machines(); 25 | input_buffer_.resize(sizeof(SplitInfo) * 2); 26 | output_buffer_.resize(sizeof(SplitInfo) * 2); 27 | } 28 | 29 | 30 | template <typename TREELEARNER_T> 31 | void FeatureParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { 32 | TREELEARNER_T::BeforeTrain(); 33 | // get feature partition 34 | std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>()); 35 | std::vector<int> num_bins_distributed(num_machines_, 0); 36 | for (int i = 0; i < this->train_data_->num_total_features(); ++i) { 37 | int inner_feature_index = this->train_data_->InnerFeatureIndex(i); 38 | if (inner_feature_index == -1) { continue; } 39 | if (this->is_feature_used_[inner_feature_index]) { 40 | int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed)); 41 | feature_distribution[cur_min_machine].push_back(inner_feature_index); 42 | num_bins_distributed[cur_min_machine] += this->train_data_->FeatureNumBin(inner_feature_index); 43 | this->is_feature_used_[inner_feature_index] = false; 44 | } 45 | } 46 | // get local used features 47 | for (auto fid : feature_distribution[rank_]) { 48 | this->is_feature_used_[fid] = true; 49 | } 50 | } 51 | 52 | template <typename TREELEARNER_T> 53 | void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsForLeaves() { 54 | SplitInfo smaller_best, larger_best; 55 | // get best split at smaller leaf 56 | smaller_best = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; 57 | // find local best split for larger leaf 58 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 59 | larger_best = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; 60 | } 61 | // sync global best info 62 | std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo)); 63 | std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo)); 64 | 65 | Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo), 66 | output_buffer_.data(), &SplitInfo::MaxReducer); 67 | // copy back 68 | std::memcpy(&smaller_best, output_buffer_.data(), sizeof(SplitInfo)); 69 | std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo)); 70 | // update best split 71 | this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best; 72 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 73 | this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best; 74 | } 75 | } 76 | 77 | 
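// Editor's sketch (not referenced by the build): what the Allreduce above does
// for one pair of serialized SplitInfo entries. Each machine contributes its
// local best split, and SplitInfo::MaxReducer keeps, block by block, whichever
// entry has the larger gain (ties break toward the smaller feature index).
static void ExampleMergeBestSplit(const SplitInfo& local_best, SplitInfo* global_best) {
  SplitInfo::MaxReducer(reinterpret_cast<const char*>(&local_best),
                        reinterpret_cast<char*>(global_best),
                        static_cast<int>(sizeof(SplitInfo)));
}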
// instantiate template classes, otherwise linker cannot find the code 78 | template class FeatureParallelTreeLearner<GPUTreeLearner>; 79 | template class FeatureParallelTreeLearner<SerialTreeLearner>; 80 | } // namespace LightGBM 81 | -------------------------------------------------------------------------------- /R-package/src/R_object_helper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * A simple wrapper for accessing data in an R object. 3 | * Due to license issues (GPLv2), we cannot include R's header file, so we use this simple wrapper instead. 4 | * However, if R changes the way it defines objects, this file will need to be updated as well. 5 | */ 6 | #ifndef R_OBJECT_HELPER_H_ 7 | #define R_OBJECT_HELPER_H_ 8 | 9 | #include <cstdint> 10 | 11 | #define TYPE_BITS 5 12 | struct sxpinfo_struct { 13 | unsigned int type : TYPE_BITS; 14 | unsigned int obj : 1; 15 | unsigned int named : 2; 16 | unsigned int gp : 16; 17 | unsigned int mark : 1; 18 | unsigned int debug : 1; 19 | unsigned int trace : 1; 20 | unsigned int spare : 1; 21 | unsigned int gcgen : 1; 22 | unsigned int gccls : 3; 23 | }; 24 | 25 | struct primsxp_struct { 26 | int offset; 27 | }; 28 | 29 | struct symsxp_struct { 30 | struct SEXPREC *pname; 31 | struct SEXPREC *value; 32 | struct SEXPREC *internal; 33 | }; 34 | 35 | struct listsxp_struct { 36 | struct SEXPREC *carval; 37 | struct SEXPREC *cdrval; 38 | struct SEXPREC *tagval; 39 | }; 40 | 41 | struct envsxp_struct { 42 | struct SEXPREC *frame; 43 | struct SEXPREC *enclos; 44 | struct SEXPREC *hashtab; 45 | }; 46 | 47 | struct closxp_struct { 48 | struct SEXPREC *formals; 49 | struct SEXPREC *body; 50 | struct SEXPREC *env; 51 | }; 52 | 53 | struct promsxp_struct { 54 | struct SEXPREC *value; 55 | struct SEXPREC *expr; 56 | struct SEXPREC *env; 57 | }; 58 | 59 | typedef struct SEXPREC { 60 | struct sxpinfo_struct sxpinfo; 61 | struct SEXPREC* attrib; 62 | struct SEXPREC* gengc_next_node, *gengc_prev_node; 63 | union { 64 | struct primsxp_struct primsxp; 65 | struct symsxp_struct symsxp; 66 | struct listsxp_struct listsxp; 67 | struct envsxp_struct envsxp; 68 | struct closxp_struct closxp; 69 | struct promsxp_struct promsxp; 70 | } u; 71 | } SEXPREC, *SEXP; 72 | 73 | struct vecsxp_struct { 74 | int length; 75 | int truelength; 76 | }; 77 | 78 | typedef struct VECTOR_SEXPREC { 79 | struct sxpinfo_struct sxpinfo; 80 | struct SEXPREC* attrib; 81 | struct SEXPREC* gengc_next_node, *gengc_prev_node; 82 | struct vecsxp_struct vecsxp; 83 | } VECTOR_SEXPREC, *VECSEXP; 84 | 85 | typedef union { VECTOR_SEXPREC s; double align; } SEXPREC_ALIGN; 86 | 87 | #define DATAPTR(x) (((SEXPREC_ALIGN *) (x)) + 1) 88 | 89 | #define R_CHAR_PTR(x) ((char *) DATAPTR(x)) 90 | 91 | #define R_INT_PTR(x) ((int *) DATAPTR(x)) 92 | 93 | #define R_REAL_PTR(x) ((double *) DATAPTR(x)) 94 | 95 | #define R_AS_INT(x) (*((int *) DATAPTR(x))) 96 | 97 | #define R_IS_NULL(x) ((*(SEXP)(x)).sxpinfo.type == 0) 98 | 99 | 100 | // 64bit pointer 101 | #if INTPTR_MAX == INT64_MAX 102 | 103 | #define R_ADDR(x) ((int64_t *) DATAPTR(x)) 104 | 105 | inline void R_SET_PTR(SEXP x, void* ptr) { 106 | if (ptr == nullptr) { 107 | R_ADDR(x)[0] = (int64_t)(NULL); 108 | } else { 109 | R_ADDR(x)[0] = (int64_t)(ptr); 110 | } 111 | } 112 | 113 | inline void* R_GET_PTR(SEXP x) { 114 | if (R_IS_NULL(x)) { 115 | return nullptr; 116 | } else { 117 | auto ret = (void *)(R_ADDR(x)[0]); 118 | if (ret == NULL) { 119 | ret = nullptr; 120 | } 121 | return ret; 122 | } 123 | } 124 | 125 | #else 126 | 127 | #define R_ADDR(x) ((int32_t *) DATAPTR(x)) 128 | 
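// Editor's note (a sketch, valid on both pointer widths): these helpers stash
// a raw C++ handle inside the payload of an R vector allocated on the R side:
//   R_SET_PTR(handle_slot, booster_ptr);   // store, e.g., a Booster*
//   void* p = R_GET_PTR(handle_slot);      // read it back (nullptr for R NULL)
// The R vector must be wide enough to hold a pointer on the target platform.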
129 | inline void R_SET_PTR(SEXP x, void* ptr) { 130 | if (ptr == nullptr) { 131 | R_ADDR(x)[0] = (int32_t)(NULL); 132 | } else { 133 | R_ADDR(x)[0] = (int32_t)(ptr); 134 | } 135 | } 136 | 137 | inline void* R_GET_PTR(SEXP x) { 138 | if (R_IS_NULL(x)) { 139 | return nullptr; 140 | } else { 141 | auto ret = (void *)(R_ADDR(x)[0]); 142 | if (ret == NULL) { 143 | ret = nullptr; 144 | } 145 | return ret; 146 | } 147 | } 148 | 149 | #endif 150 | 151 | #endif // R_OBJECT_HELPER_H_ 152 | -------------------------------------------------------------------------------- /examples/parallel_learning/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = binary 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = binary_logloss,auc 22 | 23 | # frequency of metric output 24 | metric_freq = 1 25 | 26 | # true to also output metrics on the training data, alias: training_metric, train_metric 27 | is_training_metric = true 28 | 29 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 30 | max_bin = 255 31 | 32 | # training data 33 | # if a weight file exists, it should be named "binary.train.weight" 34 | # alias: train_data, train 35 | data = binary.train 36 | 37 | # validation data; multiple validation sets are supported, separated by ',' 38 | # if a weight file exists, it should be named "binary.test.weight" 39 | # alias: valid, test, test_data, 40 | valid_data = binary.test 41 | 42 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 43 | num_trees = 100 44 | 45 | # shrinkage rate, alias: shrinkage_rate 46 | learning_rate = 0.1 47 | 48 | # number of leaves for one tree, alias: num_leaf 49 | num_leaves = 63 50 | 51 | # type of tree learner; the following types are supported: 52 | # serial , single machine version 53 | # feature , use feature parallel to train 54 | # data , use data parallel to train 55 | # voting , use voting based parallel to train 56 | # alias: tree 57 | tree_learner = feature 58 | 59 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 
60 | # num_threads = 8 61 | 62 | # feature sub-sampling: randomly select 80% of features to train on each iteration 63 | # alias: sub_feature 64 | feature_fraction = 0.8 65 | 66 | # bagging (data sub-sampling): perform bagging every 5 iterations 67 | bagging_freq = 5 68 | 69 | # bagging fraction: randomly select 80% of the data for each bagging 70 | # alias: sub_row 71 | bagging_fraction = 0.8 72 | 73 | # minimal number of data in one leaf; use this to deal with over-fitting 74 | # alias : min_data_per_leaf, min_data 75 | min_data_in_leaf = 50 76 | 77 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 78 | min_sum_hessian_in_leaf = 5.0 79 | 80 | # saves memory and runs faster for sparse features, alias: is_sparse 81 | is_enable_sparse = true 82 | 83 | # when the data is bigger than memory, set this to true; otherwise false gives faster loading speed 84 | # alias: two_round_loading, two_round 85 | use_two_round_loading = false 86 | 87 | # true to save the data to a binary file; the application will auto-load from the binary file next time 88 | # alias: is_save_binary, save_binary 89 | is_save_binary_file = false 90 | 91 | # output model file 92 | output_model = LightGBM_model.txt 93 | 94 | # continue training from a trained gbdt model 95 | # input_model= trained_model.txt 96 | 97 | # output prediction file for the predict task 98 | # output_result= prediction.txt 99 | 100 | # continue training from an initial score file 101 | # input_init_score= init_score.txt 102 | 103 | 104 | # number of machines in parallel training, alias: num_machine 105 | num_machines = 2 106 | 107 | # local listening port in parallel training, alias: local_port 108 | local_listen_port = 12400 109 | 110 | # machines list file for parallel training, alias: mlist 111 | machine_list_file = mlist.txt 112 | -------------------------------------------------------------------------------- /examples/regression/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = regression 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = l2 22 | 23 | # frequency of metric output 24 | metric_freq = 1 25 | 26 | # true to also output metrics on the training data, alias: training_metric, train_metric 27 | is_training_metric = true 28 | 29 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 
30 | max_bin = 255 31 | 32 | # training data 33 | # if a weight file exists, it should be named "regression.train.weight" 34 | # alias: train_data, train 35 | data = regression.train 36 | 37 | # validation data; multiple validation sets are supported, separated by ',' 38 | # if a weight file exists, it should be named "regression.test.weight" 39 | # alias: valid, test, test_data, 40 | valid_data = regression.test 41 | 42 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 43 | num_trees = 100 44 | 45 | # shrinkage rate, alias: shrinkage_rate 46 | learning_rate = 0.05 47 | 48 | # number of leaves for one tree, alias: num_leaf 49 | num_leaves = 31 50 | 51 | # type of tree learner; the following types are supported: 52 | # serial , single machine version 53 | # feature , use feature parallel to train 54 | # data , use data parallel to train 55 | # voting , use voting based parallel to train 56 | # alias: tree 57 | tree_learner = serial 58 | 59 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 60 | # num_threads = 8 61 | 62 | # feature sub-sampling: randomly select 90% of features to train on each iteration 63 | # alias: sub_feature 64 | feature_fraction = 0.9 65 | 66 | # bagging (data sub-sampling): perform bagging every 5 iterations 67 | bagging_freq = 5 68 | 69 | # bagging fraction: randomly select 80% of the data for each bagging 70 | # alias: sub_row 71 | bagging_fraction = 0.8 72 | 73 | # minimal number of data in one leaf; use this to deal with over-fitting 74 | # alias : min_data_per_leaf, min_data 75 | min_data_in_leaf = 100 76 | 77 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 78 | min_sum_hessian_in_leaf = 5.0 79 | 80 | # saves memory and runs faster for sparse features, alias: is_sparse 81 | is_enable_sparse = true 82 | 83 | # when the data is bigger than memory, set this to true; 
otherwise false gives faster loading speed 84 | # alias: two_round_loading, two_round 85 | use_two_round_loading = false 86 | 87 | # true to save the data to a binary file; the application will auto-load from the binary file next time 88 | # alias: is_save_binary, save_binary 89 | is_save_binary_file = false 90 | 91 | # output model file 92 | output_model = LightGBM_model.txt 93 | 94 | # continue training from a trained gbdt model 95 | # input_model= trained_model.txt 96 | 97 | # output prediction file for the predict task 98 | # output_result= prediction.txt 99 | 100 | # continue training from an initial score file 101 | # input_init_score= init_score.txt 102 | 103 | 104 | # number of machines in parallel training, alias: num_machine 105 | num_machines = 1 106 | 107 | # local listening port in parallel training, alias: local_port 108 | local_listen_port = 12400 109 | 110 | # machines list file for parallel training, alias: mlist 111 | machine_list_file = mlist.txt 112 | -------------------------------------------------------------------------------- /examples/binary_classification/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = binary 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = binary_logloss,auc 22 | 23 | # frequency of metric output 24 | metric_freq = 1 25 | 26 | # true to also output metrics on the training data, alias: training_metric, train_metric 27 | is_training_metric = true 28 | 29 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 30 | max_bin = 255 31 | 32 | # training data 33 | # if a weight file exists, it should be named "binary.train.weight" 34 | # alias: train_data, train 35 | data = binary.train 36 | 37 | # validation data; multiple validation sets are supported, separated by ',' 38 | # if a weight file exists, it should be named "binary.test.weight" 39 | # alias: valid, test, test_data, 40 | valid_data = binary.test 41 | 42 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 43 | num_trees = 100 44 | 45 | # shrinkage rate, alias: shrinkage_rate 46 | learning_rate = 0.1 47 | 48 | # number of leaves for one tree, alias: num_leaf 49 | num_leaves = 63 50 | 51 | # type of tree learner; the following types are supported: 52 | # serial , single machine version 53 | # feature , use feature parallel to train 54 | # data , use data parallel to train 55 | # voting , use voting based parallel to train 56 | # alias: tree 57 | tree_learner = serial 58 | 59 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 
60 | # num_threads = 8 61 | 62 | # feature sub-sampling: randomly select 80% of features to train on each iteration 63 | # alias: sub_feature 64 | feature_fraction = 0.8 65 | 66 | # bagging (data sub-sampling): perform bagging every 5 iterations 67 | bagging_freq = 5 68 | 69 | # bagging fraction: randomly select 80% of the data for each bagging 70 | # alias: sub_row 71 | bagging_fraction = 0.8 72 | 73 | # minimal number of data in one leaf; use this to deal with over-fitting 74 | # alias : min_data_per_leaf, min_data 75 | min_data_in_leaf = 50 76 | 77 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 78 | min_sum_hessian_in_leaf = 5.0 79 | 80 | # saves memory and runs faster for sparse features, alias: is_sparse 81 | is_enable_sparse = true 82 | 83 | # when the data is bigger than memory, set this to true; otherwise false gives faster loading speed 84 | # alias: two_round_loading, two_round 85 | use_two_round_loading = false 86 | 87 | # true to save the data to a binary file; the application will auto-load from the binary file next time 88 | # alias: is_save_binary, save_binary 89 | is_save_binary_file = false 90 | 91 | # output model file 92 | output_model = LightGBM_model.txt 93 | 94 | # continue training from a trained gbdt model 95 | # input_model= trained_model.txt 96 | 97 | # output prediction file for the predict task 98 | # output_result= prediction.txt 99 | 100 | # continue training from an initial score file 101 | # input_init_score= init_score.txt 102 | 103 | 104 | # number of machines in parallel training, alias: num_machine 105 | num_machines = 1 106 | 107 | # local listening port in parallel training, alias: local_port 108 | local_listen_port = 12400 109 | 110 | # machines list file for parallel training, alias: mlist 111 | machine_list_file = mlist.txt 112 | 
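These train/predict configs are consumed by the LightGBM command-line program. A typical session looks like the following sketch (the binary path is an assumption from the build output; any key in a config file can be overridden directly on the command line):

```
./lightgbm config=train.conf
./lightgbm config=train.conf num_trees=200 learning_rate=0.05
./lightgbm config=predict.conf
```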
-------------------------------------------------------------------------------- /src/treelearner/split_info.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ 2 | #define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ 3 | 4 | #include <LightGBM/meta.h> 5 | 6 | #include <cmath> 7 | #include <cstdint> 8 | #include <cstring> 9 | 10 | #include <limits> 11 | 12 | namespace LightGBM { 13 | 14 | /*! 15 | * \brief Used to store information about a candidate split point 16 | */ 17 | struct SplitInfo { 18 | public: 19 | /*! \brief Feature index */ 20 | int feature; 21 | /*! \brief Split threshold */ 22 | unsigned int threshold; 23 | /*! \brief Left output after split */ 24 | double left_output; 25 | /*! \brief Right output after split */ 26 | double right_output; 27 | /*! \brief Split gain */ 28 | double gain; 29 | /*! \brief Left number of data after split */ 30 | data_size_t left_count; 31 | /*! \brief Right number of data after split */ 32 | data_size_t right_count; 33 | /*! \brief Left sum gradient after split */ 34 | double left_sum_gradient; 35 | /*! \brief Left sum hessian after split */ 36 | double left_sum_hessian; 37 | /*! \brief Right sum gradient after split */ 38 | double right_sum_gradient; 39 | /*! \brief Right sum hessian after split */ 40 | double right_sum_hessian; 41 | 42 | SplitInfo() { 43 | // initialize with -1 and -inf gain 44 | feature = -1; 45 | gain = kMinScore; 46 | } 47 | 48 | inline void Reset() { 49 | // initialize with -1 and -inf gain 50 | feature = -1; 51 | gain = kMinScore; 52 | } 53 | 54 | inline bool operator > (const SplitInfo &si) const; 55 | 56 | inline bool operator == (const SplitInfo &si) const; 57 | 58 | inline static void MaxReducer(const char* src, char* dst, int len) { 59 | const int type_size = sizeof(SplitInfo); 60 | int used_size = 0; 61 | const SplitInfo* p1; 62 | SplitInfo* p2; 63 | while (used_size < len) { 64 | p1 = reinterpret_cast<const SplitInfo*>(src); 65 | p2 = reinterpret_cast<SplitInfo*>(dst); 66 | if (*p1 > *p2) { 67 | // copy 68 | std::memcpy(dst, src, type_size); 69 | } 70 | src += type_size; 71 | dst += type_size; 72 | used_size += type_size; 73 | } 74 | } 75 | }; 76 | 77 | 78 | 79 | inline bool SplitInfo::operator > (const SplitInfo& si) const { 80 | double local_gain = this->gain; 81 | double other_gain = si.gain; 82 | // replace nan with -inf (note: "gain == NAN" is always false, so std::isnan is required) 83 | if (std::isnan(local_gain)) { 84 | local_gain = kMinScore; 85 | } 86 | // replace nan with -inf 87 | if (std::isnan(other_gain)) { 88 | other_gain = kMinScore; 89 | } 90 | int local_feature = this->feature; 91 | int other_feature = si.feature; 92 | // replace -1 with max int 93 | if (local_feature == -1) { 94 | local_feature = INT32_MAX; 95 | } 96 | // replace -1 with max int 97 | if (other_feature == -1) { 98 | other_feature = INT32_MAX; 99 | } 100 | if (local_gain != other_gain) { 101 | return local_gain > other_gain; 102 | } else { 103 | // if same gain, use smaller feature 104 | return local_feature < other_feature; 105 | } 106 | } 107 | 108 | inline bool SplitInfo::operator == (const SplitInfo& si) const { 109 | double local_gain = this->gain; 110 | double other_gain = si.gain; 111 | // replace nan with -inf 112 | if (std::isnan(local_gain)) { 113 | local_gain = kMinScore; 114 | } 115 | // replace nan with -inf 116 | if (std::isnan(other_gain)) { 117 | other_gain = kMinScore; 118 | } 119 | int local_feature = this->feature; 120 | int other_feature = si.feature; 121 | // replace -1 with max int 122 | if (local_feature == -1) { 123 | local_feature = INT32_MAX; 124 | } 125 | // replace -1 with max int 126 | if (other_feature == -1) { 127 | other_feature = INT32_MAX; 128 | } 129 | if (local_gain != other_gain) { 130 | return local_gain == other_gain; 131 | } else { 132 | // if same gain, use smaller feature 133 | return local_feature == other_feature; 134 | } 135 | } 136 | 137 | } // namespace LightGBM 138 | #endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_ 139 | 
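The `std::isnan` fix above matters because ordered comparisons with NaN, including `==`, are always false; a minimal standalone demonstration:

```cpp
#include <cassert>
#include <cmath>

int main() {
  double gain = std::nan("");
  assert(!(gain == gain));   // NaN compares unequal to everything, itself included,
                             // so the old test `gain == NAN` could never fire
  assert(std::isnan(gain));  // the correct way to detect a NaN gain
  return 0;
}
```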
-------------------------------------------------------------------------------- /examples/lambdarank/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = lambdarank 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = ndcg 22 | 23 | # evaluation positions for the ndcg metric, alias: ndcg_at 24 | ndcg_eval_at = 1,3,5 25 | 26 | # frequency of metric output 27 | metric_freq = 1 28 | 29 | # true to also output metrics on the training data, alias: training_metric, train_metric 30 | is_training_metric = true 31 | 32 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 33 | max_bin = 255 34 | 35 | # training data 36 | # if a weight file exists, it should be named "rank.train.weight" 37 | # if a query file exists, it should be named "rank.train.query" 38 | # alias: train_data, train 39 | data = rank.train 40 | 41 | # validation data; multiple validation sets are supported, separated by ',' 42 | # if a weight file exists, it should be named "rank.test.weight" 43 | # if a query file exists, it should be named "rank.test.query" 44 | # alias: valid, test, test_data, 45 | valid_data = rank.test 46 | 47 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 48 | num_trees = 100 49 | 50 | # shrinkage rate, alias: shrinkage_rate 51 | learning_rate = 0.1 52 | 53 | # number of leaves for one tree, alias: num_leaf 54 | num_leaves = 31 55 | 56 | # type of tree learner; the following types are supported: 57 | # serial , single machine version 58 | # feature , use feature parallel to train 59 | # data , use data parallel to train 60 | # voting , use voting based parallel to train 61 | # alias: tree 62 | tree_learner = serial 63 | 64 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 65 | # num_threads = 8 66 | 67 | # feature sub-sampling; 1.0 means no sub-sampling 68 | # alias: sub_feature 69 | feature_fraction = 1.0 70 | 71 | # bagging (data sub-sampling): perform bagging every iteration 72 | bagging_freq = 1 73 | 74 | # bagging fraction: randomly select 90% of the data for each bagging 75 | # alias: sub_row 76 | bagging_fraction = 0.9 77 | 78 | # minimal number of data in one leaf; use this to deal with over-fitting 79 | # alias : min_data_per_leaf, min_data 80 | min_data_in_leaf = 50 81 | 82 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 83 | min_sum_hessian_in_leaf = 5.0 84 | 85 | # saves memory and runs faster for sparse features, alias: is_sparse 86 | is_enable_sparse = true 87 | 88 | # when the data is bigger than memory, set this to true; 
otherwise false gives faster loading speed 89 | # alias: two_round_loading, two_round 90 | use_two_round_loading = false 91 | 92 | # true to save the data to a binary file; the application will auto-load from the binary file next time 93 | # alias: is_save_binary, save_binary 94 | is_save_binary_file = false 95 | 96 | # output model file 97 | output_model = LightGBM_model.txt 98 | 99 | # continue training from a trained gbdt model 100 | # input_model= trained_model.txt 101 | 102 | # output prediction file for the predict task 103 | # output_result= prediction.txt 104 | 105 | # continue training from an initial score file 106 | # input_init_score= init_score.txt 107 | 108 | 109 | # number of machines in parallel training, alias: num_machine 110 | num_machines = 1 111 | 112 | # local listening port in parallel training, alias: local_port 113 | local_listen_port = 12400 114 | 115 | # machines list file for parallel training, alias: mlist 116 | machine_list_file = mlist.txt 117 | -------------------------------------------------------------------------------- /include/LightGBM/metric.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_METRIC_H_ 2 | #define LIGHTGBM_METRIC_H_ 3 | 4 | #include <LightGBM/meta.h> 5 | #include <LightGBM/config.h> 6 | #include <LightGBM/dataset.h> 7 | #include <LightGBM/objective_function.h> 8 | 9 | #include <vector> 10 | 11 | namespace LightGBM { 12 | 13 | /*! 14 | * \brief The interface of metric. 15 | * Metric is used to calculate the metric result 16 | */ 17 | class Metric { 18 | public: 19 | /*! \brief virtual destructor */ 20 | virtual ~Metric() {} 21 | 22 | /*! 23 | * \brief Initialize 24 | * \param metadata Label data 25 | * \param num_data Number of data 26 | */ 27 | 28 | virtual void Init(const Metadata& metadata, data_size_t num_data) = 0; 29 | 30 | virtual const std::vector<std::string>& GetName() const = 0; 31 | 32 | virtual double factor_to_bigger_better() const = 0; 33 | /*! 34 | * \brief Calculate and print the metric result 35 | * \param score Current prediction score 36 | */ 37 | virtual std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const = 0; 38 | 39 | Metric() = default; 40 | /*! \brief Disable copy */ 41 | Metric& operator=(const Metric&) = delete; 42 | /*! \brief Disable copy */ 43 | Metric(const Metric&) = delete; 44 | 45 | /*! 46 | * \brief Create object of metrics 47 | * \param type Specific type of metric 48 | * \param config Config for metric 49 | */ 50 | LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const MetricConfig& config); 51 | 52 | }; 53 | 54 | /*! 55 | * \brief Static class, used to calculate DCG score 56 | */ 57 | class DCGCalculator { 58 | public: 59 | /*! 60 | * \brief Initialization logic 61 | * \param label_gain Gain for labels, default is 2^i - 1 62 | */ 63 | static void Init(std::vector<double> label_gain); 64 | 65 | /*! 66 | * \brief Calculate the DCG score at position k 67 | * \param k The position to evaluate 68 | * \param label Pointer of label 69 | * \param score Pointer of score 70 | * \param num_data Number of data 71 | * \return The DCG score 72 | */ 73 | static double CalDCGAtK(data_size_t k, const float* label, 74 | const double* score, data_size_t num_data); 75 | 76 | /*!
77 | * \brief Calculate the DCG score at multiple positions 78 | * \param ks The positions to evaluate 79 | * \param label Pointer of label 80 | * \param score Pointer of score 81 | * \param num_data Number of data 82 | * \param out Output result 83 | */ 84 | static void CalDCG(const std::vector<data_size_t>& ks, 85 | const float* label, const double* score, 86 | data_size_t num_data, std::vector<double>* out); 87 | 88 | /*! 89 | * \brief Calculate the Max DCG score at position k 90 | * \param k The position to evaluate at 91 | * \param label Pointer of label 92 | * \param num_data Number of data 93 | * \return The max DCG score 94 | */ 95 | static double CalMaxDCGAtK(data_size_t k, 96 | const float* label, data_size_t num_data); 97 | 98 | /*! 99 | * \brief Calculate the Max DCG score at multiple positions 100 | * \param ks The positions to evaluate at 101 | * \param label Pointer of label 102 | * \param num_data Number of data 103 | * \param out Output result 104 | */ 105 | static void CalMaxDCG(const std::vector<data_size_t>& ks, 106 | const float* label, data_size_t num_data, std::vector<double>* out); 107 | 108 | /*! 109 | * \brief Get discount score of position k 110 | * \param k The position 111 | * \return The discount of this position 112 | */ 113 | inline static double GetDiscount(data_size_t k) { return discount_[k]; } 114 | 115 | private: 116 | /*! \brief store gains for different labels */ 117 | static std::vector<double> label_gain_; 118 | /*! \brief store discount scores for different positions */ 119 | static std::vector<double> discount_; 120 | /*! \brief max position for eval */ 121 | static const data_size_t kMaxPosition; 122 | }; 123 | 124 | 125 | } // namespace LightGBM 126 | 127 | 128 | #endif // LightGBM_METRIC_H_ 129 | -------------------------------------------------------------------------------- /src/io/parser.cpp: -------------------------------------------------------------------------------- 1 | #include "parser.hpp" 2 | 3 | #include <LightGBM/utils/common.h> 4 | #include <fstream> 5 | #include <memory> 6 | #include <string> 7 | 8 | namespace LightGBM { 9 | 10 | void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) { 11 | *comma_cnt = 0; 12 | *tab_cnt = 0; 13 | *colon_cnt = 0; 14 | for (int i = 0; str[i] != '\0'; ++i) { 15 | if (str[i] == ',') { 16 | ++(*comma_cnt); 17 | } else if (str[i] == '\t') { 18 | ++(*tab_cnt); 19 | } else if (str[i] == ':') { 20 | ++(*colon_cnt); 21 | } 22 | } 23 | } 24 | 25 | int GetLabelIdxForLibsvm(std::string& str, int num_features, int label_idx) { 26 | if (num_features <= 0) { 27 | return label_idx; 28 | } 29 | str = Common::Trim(str); 30 | auto pos_space = str.find_first_of(" \f\n\r\t\v"); 31 | auto pos_colon = str.find_first_of(":"); 32 | if (pos_space == std::string::npos || pos_space < pos_colon) { 33 | return label_idx; 34 | } else { 35 | return -1; 36 | } 37 | } 38 | 39 | int GetLabelIdxForTSV(std::string& str, int num_features, int label_idx) { 40 | if (num_features <= 0) { 41 | return label_idx; 42 | } 43 | str = Common::Trim(str); 44 | auto tokens = Common::Split(str.c_str(), '\t'); 45 | if (static_cast<int>(tokens.size()) == num_features) { 46 | return -1; 47 | } else { 48 | return label_idx; 49 | } 50 | } 51 | 52 | int GetLabelIdxForCSV(std::string& str, int num_features, int label_idx) { 53 | if (num_features <= 0) { 54 | return label_idx; 55 | } 56 | str = Common::Trim(str); 57 | auto tokens = Common::Split(str.c_str(), ','); 58 | if (static_cast<int>(tokens.size()) == num_features) { 59 | return -1; 60 | } else { 61 | return label_idx; 62 | } 63 | } 64 | 65 | enum DataType { 66 | INVALID, 67 | CSV, 68 | TSV, 69 | 
LIBSVM 70 | }; 71 | 72 | Parser* Parser::CreateParser(const char* filename, bool has_header, int num_features, int label_idx) { 73 | std::ifstream tmp_file; 74 | tmp_file.open(filename); 75 | if (!tmp_file.is_open()) { 76 | Log::Fatal("Data file %s doesn't exist", filename); 77 | } 78 | std::string line1, line2; 79 | if (has_header) { 80 | if (!tmp_file.eof()) { 81 | std::getline(tmp_file, line1); 82 | } 83 | } 84 | if (!tmp_file.eof()) { 85 | std::getline(tmp_file, line1); 86 | } else { 87 | Log::Fatal("Data file %s should have at least one line", filename); 88 | } 89 | if (!tmp_file.eof()) { 90 | std::getline(tmp_file, line2); 91 | } else { 92 | Log::Warning("Data file %s only has one line", filename); 93 | } 94 | tmp_file.close(); 95 | int comma_cnt = 0, comma_cnt2 = 0; 96 | int tab_cnt = 0, tab_cnt2 = 0; 97 | int colon_cnt = 0, colon_cnt2 = 0; 98 | // Gather statistics from the first two lines 99 | GetStatistic(line1.c_str(), &comma_cnt, &tab_cnt, &colon_cnt); 100 | GetStatistic(line2.c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2); 101 | 102 | 103 | 104 | DataType type = DataType::INVALID; 105 | if (line2.size() == 0) { 106 | // the file has only one line 107 | if (colon_cnt > 0) { 108 | type = DataType::LIBSVM; 109 | } else if (tab_cnt > 0) { 110 | type = DataType::TSV; 111 | } else if (comma_cnt > 0) { 112 | type = DataType::CSV; 113 | } 114 | } else { 115 | if (colon_cnt > 0 || colon_cnt2 > 0) { 116 | type = DataType::LIBSVM; 117 | } else if (tab_cnt == tab_cnt2 && tab_cnt > 0) { 118 | type = DataType::TSV; 119 | } else if (comma_cnt == comma_cnt2 && comma_cnt > 0) { 120 | type = DataType::CSV; 121 | } 122 | } 123 | if (type == DataType::INVALID) { 124 | Log::Fatal("Unknown format of training data"); 125 | } 126 | std::unique_ptr<Parser> ret; 127 | if (type == DataType::LIBSVM) { 128 | label_idx = GetLabelIdxForLibsvm(line1, num_features, label_idx); 129 | ret.reset(new LibSVMParser(label_idx)); 130 | } 131 | else if (type == DataType::TSV) { 132 | label_idx = GetLabelIdxForTSV(line1, num_features, label_idx); 133 | ret.reset(new TSVParser(label_idx)); 134 | } 135 | else if (type == DataType::CSV) { 136 | label_idx = GetLabelIdxForCSV(line1, num_features, label_idx); 137 | ret.reset(new CSVParser(label_idx)); 138 | } 139 | 140 | if (label_idx < 0) { 141 | Log::Info("Data file %s doesn't contain a label column", filename); 142 | } 143 | return ret.release(); 144 | } 145 | 146 | } // namespace LightGBM 147 | 
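A hedged, standalone sketch of the delimiter-counting heuristic above; it re-declares `GetStatistic` (which is defined at namespace scope in this file) and assumes the program is linked against this translation unit:

```cpp
#include <cstdio>

namespace LightGBM {
void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt);
}

int main() {
  int commas = 0, tabs = 0, colons = 0;
  // A LibSVM-style line, "<label> <index>:<value> ...", contains colons.
  LightGBM::GetStatistic("1 5:0.3 10:1.2", &commas, &tabs, &colons);
  // CreateParser sees colon_cnt > 0 on the sampled lines and picks LibSVMParser.
  std::printf("commas=%d tabs=%d colons=%d\n", commas, tabs, colons);
  return 0;
}
```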
-------------------------------------------------------------------------------- /src/boosting/score_updater.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ 2 | #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ 3 | 4 | 5 | #include <LightGBM/meta.h> 6 | #include <LightGBM/dataset.h> 7 | #include <LightGBM/tree.h> 8 | #include <LightGBM/tree_learner.h> 9 | #include <LightGBM/utils/log.h> 10 | 11 | #include <vector> 12 | 13 | namespace LightGBM { 14 | /*! 15 | * \brief Used to store and update scores for the data 16 | */ 17 | class ScoreUpdater { 18 | public: 19 | /*! 20 | * \brief Constructor, takes a const pointer to the dataset 21 | * \param data This class will bind to this data set 22 | */ 23 | ScoreUpdater(const Dataset* data, int num_tree_per_iteration) : data_(data) { 24 | num_data_ = data->num_data(); 25 | int64_t total_size = static_cast<int64_t>(num_data_) * num_tree_per_iteration; 26 | score_.resize(total_size); 27 | // default start score is zero 28 | #pragma omp parallel for schedule(static) 29 | for (int64_t i = 0; i < total_size; ++i) { 30 | score_[i] = 0.0f; 31 | } 32 | has_init_score_ = false; 33 | const double* init_score = data->metadata().init_score(); 34 | // if an initial score exists, start from it 35 | if (init_score != nullptr) { 36 | if ((data->metadata().num_init_score() % num_data_) != 0 37 | || (data->metadata().num_init_score() / num_data_) != num_tree_per_iteration) { 38 | Log::Fatal("Number of classes in the initial score does not match the data"); 39 | } 40 | has_init_score_ = true; 41 | #pragma omp parallel for schedule(static) 42 | for (int64_t i = 0; i < total_size; ++i) { 43 | score_[i] = init_score[i]; 44 | } 45 | } 46 | } 47 | /*! \brief Destructor */ 48 | ~ScoreUpdater() { 49 | 50 | } 51 | 52 | inline bool has_init_score() const { return has_init_score_; } 53 | 54 | inline void AddScore(double val, int cur_tree_id) { 55 | int64_t offset = cur_tree_id * num_data_; 56 | #pragma omp parallel for schedule(static) 57 | for (int64_t i = 0; i < num_data_; ++i) { 58 | score_[offset + i] += val; 59 | } 60 | } 61 | /*! 62 | * \brief Use the tree model to get predictions, then add them to the scores of all data 63 | * Note: this function is generally used on validation data too. 64 | * \param tree Trained tree model 65 | * \param cur_tree_id Current tree for multiclass training 66 | */ 67 | inline void AddScore(const Tree* tree, int cur_tree_id) { 68 | tree->AddPredictionToScore(data_, num_data_, score_.data() + cur_tree_id * num_data_); 69 | } 70 | /*! 71 | * \brief Add prediction scores; used only for training data. 72 | * The training data is already partitioned into tree leaves after training, 73 | * so predictions can be obtained quickly. 74 | * \param tree_learner 75 | * \param cur_tree_id Current tree for multiclass training 76 | */ 77 | inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { 78 | tree_learner->AddPredictionToScore(tree, score_.data() + cur_tree_id * num_data_); 79 | } 80 | /*! 81 | * \brief Use the tree model to get predictions, then add them to the scores of part of the data 82 | * Used for prediction of training out-of-bag data 83 | * \param tree Trained tree model 84 | * \param data_indices Indices of data that will be processed 85 | * \param data_cnt Number of data that will be processed 86 | * \param cur_tree_id Current tree for multiclass training 87 | */ 88 | inline void AddScore(const Tree* tree, const data_size_t* data_indices, 89 | data_size_t data_cnt, int cur_tree_id) { 90 | tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + cur_tree_id * num_data_); 91 | } 92 | /*! \brief Pointer of score */ 93 | inline const double* score() const { return score_.data(); } 94 | inline data_size_t num_data() const { return num_data_; } 95 | 96 | /*! \brief Disable copy */ 97 | ScoreUpdater& operator=(const ScoreUpdater&) = delete; 98 | /*! \brief Disable copy */ 99 | ScoreUpdater(const ScoreUpdater&) = delete; 100 | private: 101 | /*! \brief Number of total data */ 102 | data_size_t num_data_; 103 | /*! \brief Pointer of data set */ 104 | const Dataset* data_; 105 | /*!
\brief Scores for the data set */ 106 | std::vector<double> score_; 107 | bool has_init_score_; 108 | }; 109 | 110 | } // namespace LightGBM 111 | #endif // LightGBM_BOOSTING_SCORE_UPDATER_HPP_ 112 | 
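A standalone sketch of the buffer layout used above: `score_` holds one contiguous block of `num_data` scores per tree-per-iteration (for example, one block per class in multiclass training), so the score of data point `i` for block `k` lives at index `k * num_data + i`:

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int num_data = 4, num_tree_per_iteration = 3;  // e.g. 3 classes
  std::vector<double> score(static_cast<size_t>(num_data) * num_tree_per_iteration, 0.0);
  const int i = 2, k = 1;            // data index, tree/class index
  score[k * num_data + i] += 0.5;    // the per-element effect of AddScore(0.5, k)
  std::printf("score(block=%d, data=%d) = %f\n", k, i, score[k * num_data + i]);
  return 0;
}
```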
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | if(APPLE) 4 | SET(CMAKE_CXX_COMPILER "g++-6") 5 | SET(CMAKE_C_COMPILER "gcc-6") 6 | endif() 7 | 8 | PROJECT(lightgbm) 9 | 10 | OPTION(USE_MPI "MPI based parallel learning" OFF) 11 | OPTION(USE_OPENMP "Enable OpenMP" ON) 12 | OPTION(USE_GPU "Enable GPU-accelerated training (EXPERIMENTAL)" OFF) 13 | 14 | if(APPLE) 15 | OPTION(APPLE_OUTPUT_DYLIB "Output dylib shared library" OFF) 16 | endif() 17 | 18 | if(USE_MPI) 19 | find_package(MPI REQUIRED) 20 | ADD_DEFINITIONS(-DUSE_MPI) 21 | MESSAGE(${MPI_LIBRARIES}) 22 | MESSAGE(${MPI_CXX_LIBRARIES}) 23 | else() 24 | ADD_DEFINITIONS(-DUSE_SOCKET) 25 | endif(USE_MPI) 26 | 27 | if(USE_OPENMP) 28 | find_package(OpenMP REQUIRED) 29 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 30 | else() 31 | # Ignore unknown #pragma warning 32 | if( (CMAKE_CXX_COMPILER_ID MATCHES "[cC][lL][aA][nN][gG]") 33 | OR (CMAKE_CXX_COMPILER_ID MATCHES "[gG][nN][uU]")) 34 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") 35 | endif() 36 | endif(USE_OPENMP) 37 | 38 | if(USE_GPU) 39 | find_package(OpenCL REQUIRED) 40 | include_directories(${OpenCL_INCLUDE_DIRS}) 41 | MESSAGE(STATUS "OpenCL include directory:" ${OpenCL_INCLUDE_DIRS}) 42 | find_package(Boost 1.56.0 COMPONENTS filesystem system REQUIRED) 43 | include_directories(${Boost_INCLUDE_DIRS}) 44 | ADD_DEFINITIONS(-DUSE_GPU) 45 | endif(USE_GPU) 46 | 47 | if(UNIX OR MINGW OR CYGWIN) 48 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes") 49 | endif() 50 | 51 | if(MSVC) 52 | if(MSVC_VERSION LESS 1800) 53 | message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a newer MSVC.") 54 | endif() 55 | 56 | SET(variables 57 | CMAKE_C_FLAGS_DEBUG 58 | CMAKE_C_FLAGS_MINSIZEREL 59 | CMAKE_C_FLAGS_RELEASE 60 | CMAKE_C_FLAGS_RELWITHDEBINFO 61 | CMAKE_CXX_FLAGS_DEBUG 62 | CMAKE_CXX_FLAGS_MINSIZEREL 63 | CMAKE_CXX_FLAGS_RELEASE 64 | CMAKE_CXX_FLAGS_RELWITHDEBINFO 65 | ) 66 | foreach(variable ${variables}) 67 | if(${variable} MATCHES "/MD") 68 | string(REGEX REPLACE "/MD" "/MT" ${variable} "${${variable}}") 69 | endif() 70 | endforeach() 71 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /O2 /Ob2 /Oi /Ot /Oy /GL") 72 | else() 73 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") 74 | endif() 75 | 76 | 77 | SET(LightGBM_HEADER_DIR ${PROJECT_SOURCE_DIR}/include) 78 | SET(BOOST_COMPUTE_HEADER_DIR ${PROJECT_SOURCE_DIR}/compute/include) 79 | 80 | SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) 81 | SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) 82 | 83 | include_directories (${LightGBM_HEADER_DIR}) 84 | include_directories (${BOOST_COMPUTE_HEADER_DIR}) 85 | 86 | if(APPLE) 87 | if (APPLE_OUTPUT_DYLIB) 88 | SET(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") 89 | else() 90 | SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so") 91 | endif() 92 | endif(APPLE) 93 | 94 | if(USE_MPI) 95 | include_directories(${MPI_CXX_INCLUDE_PATH}) 96 | endif(USE_MPI) 97 | 98 | file(GLOB SOURCES 99 | src/application/*.cpp 100 | src/boosting/*.cpp 101 | src/io/*.cpp 102 | src/metric/*.cpp 103 | src/objective/*.cpp 104 | src/network/*.cpp 105 | src/treelearner/*.cpp 106 | ) 107 | 108 | add_executable(lightgbm src/main.cpp ${SOURCES}) 109 | add_library(_lightgbm SHARED src/c_api.cpp ${SOURCES}) 110 | 111 | if(MSVC) 112 | set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm") 113 | endif(MSVC) 114 | 115 | if(USE_MPI) 116 | TARGET_LINK_LIBRARIES(lightgbm ${MPI_CXX_LIBRARIES}) 117 | TARGET_LINK_LIBRARIES(_lightgbm ${MPI_CXX_LIBRARIES}) 118 | endif(USE_MPI) 119 | 120 | if(USE_GPU) 121 | TARGET_LINK_LIBRARIES(lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) 122 | TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) 123 | endif(USE_GPU) 124 | 125 | if(WIN32 AND (MINGW OR CYGWIN)) 126 | TARGET_LINK_LIBRARIES(lightgbm Ws2_32) 127 | TARGET_LINK_LIBRARIES(_lightgbm Ws2_32) 128 | TARGET_LINK_LIBRARIES(lightgbm IPHLPAPI) 129 | TARGET_LINK_LIBRARIES(_lightgbm IPHLPAPI) 130 | endif() 131 | 132 | install(TARGETS lightgbm _lightgbm 133 | RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin 134 | LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib 135 | ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) 136 | 137 | install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include) 138 | 
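A typical out-of-source build against this file (a sketch; the generator, compiler, and option values vary by platform, and the GPU path additionally needs OpenCL and Boost as declared above):

```
mkdir build && cd build
cmake -DUSE_GPU=1 ..   # or -DUSE_MPI=ON for MPI-based parallel learning
make -j4
```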
-------------------------------------------------------------------------------- /src/treelearner/leaf_splits.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ 2 | #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ 3 | 4 | #include <LightGBM/meta.h> 5 | #include "data_partition.hpp" 6 | 7 | #include <vector> 8 | 9 | namespace LightGBM { 10 | 11 | /*! 12 | * \brief used to find split candidates for a leaf 13 | */ 14 | class LeafSplits { 15 | public: 16 | LeafSplits(data_size_t num_data) 17 | :num_data_in_leaf_(num_data), num_data_(num_data), 18 | data_indices_(nullptr) { 19 | } 20 | void ResetNumData(data_size_t num_data) { 21 | num_data_ = num_data; 22 | num_data_in_leaf_ = num_data; 23 | } 24 | ~LeafSplits() { 25 | } 26 | 27 | /*! 28 | 29 | * \brief Init splits on the current leaf using partial data. 30 | * \param leaf Index of current leaf 31 | * \param data_partition current data partition 32 | * \param sum_gradients 33 | * \param sum_hessians 34 | */ 35 | void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { 36 | leaf_index_ = leaf; 37 | data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); 38 | sum_gradients_ = sum_gradients; 39 | sum_hessians_ = sum_hessians; 40 | } 41 | 42 | /*! 43 | * \brief Init splits on the current leaf; traverses all data to sum up the gradients and hessians 44 | * \param gradients 45 | * \param hessians 46 | */ 47 | void Init(const score_t* gradients, const score_t* hessians) { 48 | num_data_in_leaf_ = num_data_; 49 | leaf_index_ = 0; 50 | data_indices_ = nullptr; 51 | double tmp_sum_gradients = 0.0f; 52 | double tmp_sum_hessians = 0.0f; 53 | #pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) 54 | for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { 55 | tmp_sum_gradients += gradients[i]; 56 | tmp_sum_hessians += hessians[i]; 57 | } 58 | sum_gradients_ = tmp_sum_gradients; 59 | sum_hessians_ = tmp_sum_hessians; 60 | } 61 | 62 | /*! 63 | * \brief Init splits on the current leaf using partial data. 64 | * \param leaf Index of current leaf 65 | * \param data_partition current data partition 66 | * \param gradients 67 | * \param hessians 68 | */ 69 | void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { 70 | leaf_index_ = leaf; 71 | data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); 72 | double tmp_sum_gradients = 0.0f; 73 | double tmp_sum_hessians = 0.0f; 74 | #pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) 75 | for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { 76 | data_size_t idx = data_indices_[i]; 77 | tmp_sum_gradients += gradients[idx]; 78 | tmp_sum_hessians += hessians[idx]; 79 | } 80 | sum_gradients_ = tmp_sum_gradients; 81 | sum_hessians_ = tmp_sum_hessians; 82 | } 83 | 84 | 85 | /*! 86 | * \brief Init splits on the current leaf, only updating sum_gradients and sum_hessians 87 | * \param sum_gradients 88 | * \param sum_hessians 89 | */ 90 | void Init(double sum_gradients, double sum_hessians) { 91 | leaf_index_ = 0; 92 | sum_gradients_ = sum_gradients; 93 | sum_hessians_ = sum_hessians; 94 | } 95 | 96 | /*! 97 | * \brief Init splits on the current leaf 98 | */ 99 | void Init() { 100 | leaf_index_ = -1; 101 | data_indices_ = nullptr; 102 | num_data_in_leaf_ = 0; 103 | } 104 | 105 | 106 | /*! \brief Get current leaf index */ 107 | int LeafIndex() const { return leaf_index_; } 108 | 109 | /*! \brief Get number of data in current leaf */ 110 | data_size_t num_data_in_leaf() const { return num_data_in_leaf_; } 111 | 112 | /*! \brief Get sum of gradients of current leaf */ 113 | double sum_gradients() const { return sum_gradients_; } 114 | 115 | /*! \brief Get sum of hessians of current leaf */ 116 | double sum_hessians() const { return sum_hessians_; } 117 | 118 | /*! \brief Get indices of data of current leaf */ 119 | const data_size_t* data_indices() const { return data_indices_; } 120 | 121 | 122 | private: 123 | /*! \brief current leaf index */ 124 | int leaf_index_; 125 | /*! \brief number of data on current leaf */ 126 | data_size_t num_data_in_leaf_; 127 | /*! \brief number of all training data */ 128 | data_size_t num_data_; 129 | /*! \brief sum of gradients of current leaf */ 130 | double sum_gradients_; 131 | /*!
\brief sum of hessians of current leaf */ 132 | double sum_hessians_; 133 | /*! \brief indices of data of current leaf */ 134 | const data_size_t* data_indices_; 135 | }; 136 | 137 | } // namespace LightGBM 138 | #endif // LightGBM_TREELEARNER_LEAF_SPLITS_HPP_ 139 | -------------------------------------------------------------------------------- /src/metric/dcg_calculator.cpp: -------------------------------------------------------------------------------- 1 | #include <LightGBM/metric.h> 2 | 3 | #include <LightGBM/utils/log.h> 4 | 5 | #include <algorithm> 6 | 7 | #include <cmath> 8 | #include <vector> 9 | 10 | namespace LightGBM { 11 | 12 | /*! \brief Declaration for some static members */ 13 | std::vector<double> DCGCalculator::label_gain_; 14 | std::vector<double> DCGCalculator::discount_; 15 | const data_size_t DCGCalculator::kMaxPosition = 10000; 16 | 17 | void DCGCalculator::Init(std::vector<double> input_label_gain) { 18 | label_gain_.resize(input_label_gain.size()); 19 | for (size_t i = 0; i < input_label_gain.size(); ++i) { 20 | label_gain_[i] = static_cast<double>(input_label_gain[i]); 21 | } 22 | discount_.resize(kMaxPosition); 23 | for (data_size_t i = 0; i < kMaxPosition; ++i) { 24 | discount_[i] = 1.0f / std::log2(2.0f + i); 25 | } 26 | } 27 | 28 | double DCGCalculator::CalMaxDCGAtK(data_size_t k, const float* label, data_size_t num_data) { 29 | double ret = 0.0f; 30 | // counts for all labels 31 | std::vector<data_size_t> label_cnt(label_gain_.size(), 0); 32 | for (data_size_t i = 0; i < num_data; ++i) { 33 | ++label_cnt[static_cast<int>(label[i])]; 34 | } 35 | int top_label = static_cast<int>(label_gain_.size()) - 1; 36 | 37 | if (k > num_data) { k = num_data; } 38 | // start from top label, and accumulate DCG 39 | for (data_size_t j = 0; j < k; ++j) { 40 | while (top_label > 0 && label_cnt[top_label] <= 0) { 41 | top_label -= 1; 42 | } 43 | if (top_label < 0) { 44 | break; 45 | } 46 | ret += discount_[j] * label_gain_[top_label]; 47 | label_cnt[top_label] -= 1; 48 | } 49 | return ret; 50 | } 51 | 52 | void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks, 53 | const float* label, 54 | data_size_t num_data, 55 | std::vector<double>* out) { 56 | std::vector<data_size_t> label_cnt(label_gain_.size(), 0); 57 | // counts for all labels 58 | for (data_size_t i = 0; i < num_data; ++i) { 59 | if (static_cast<size_t>(label[i]) >= label_cnt.size()) { Log::Fatal("Label %f exceeds the number of configured label gains", label[i]); } 60 | ++label_cnt[static_cast<int>(label[i])]; 61 | } 62 | double cur_result = 0.0f; 63 | data_size_t cur_left = 0; 64 | int top_label = static_cast<int>(label_gain_.size()) - 1; 65 | // calculate k Max DCG by one pass 66 | for (size_t i = 0; i < ks.size(); ++i) { 67 | data_size_t cur_k = ks[i]; 68 | if (cur_k > num_data) { cur_k = num_data; } 69 | for (data_size_t j = cur_left; j < cur_k; ++j) { 70 | while (top_label > 0 && label_cnt[top_label] <= 0) { 71 | top_label -= 1; 72 | } 73 | if (top_label < 0) { 74 | break; 75 | } 76 | cur_result += discount_[j] * label_gain_[top_label]; 77 | label_cnt[top_label] -= 1; 78 | } 79 | (*out)[i] = cur_result; 80 | cur_left = cur_k; 81 | } 82 | } 83 | 84 | 85 | double DCGCalculator::CalDCGAtK(data_size_t k, const float* label, 86 | const double* score, data_size_t num_data) { 87 | // get sorted indices by score 88 | std::vector<data_size_t> sorted_idx; 89 | for (data_size_t i = 0; i < num_data; ++i) { 90 | sorted_idx.emplace_back(i); 91 | } 92 | std::sort(sorted_idx.begin(), sorted_idx.end(), 93 | [score](data_size_t a, data_size_t b) {return score[a] > score[b]; }); 94 | 95 | if (k > num_data) { k = num_data; } 96 | double dcg = 0.0f; 97 | // calculate dcg 98 | for (data_size_t i = 0; i < k; ++i) { 99 | data_size_t idx = sorted_idx[i]; 
100 | dcg += label_gain_[static_cast<int>(label[idx])] * discount_[i]; 101 | } 102 | return dcg; 103 | } 104 | 105 | void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const float* label, 106 | const double* score, data_size_t num_data, std::vector<double>* out) { 107 | // get sorted indices by score 108 | std::vector<data_size_t> sorted_idx; 109 | for (data_size_t i = 0; i < num_data; ++i) { 110 | sorted_idx.emplace_back(i); 111 | } 112 | std::sort(sorted_idx.begin(), sorted_idx.end(), 113 | [score](data_size_t a, data_size_t b) {return score[a] > score[b]; }); 114 | 115 | double cur_result = 0.0f; 116 | data_size_t cur_left = 0; 117 | // calculate multi dcg by one pass 118 | for (size_t i = 0; i < ks.size(); ++i) { 119 | data_size_t cur_k = ks[i]; 120 | if (cur_k > num_data) { cur_k = num_data; } 121 | for (data_size_t j = cur_left; j < cur_k; ++j) { 122 | data_size_t idx = sorted_idx[j]; 123 | cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j]; 124 | } 125 | (*out)[i] = cur_result; 126 | cur_left = cur_k; 127 | } 128 | } 129 | 130 | } // namespace LightGBM 131 | --------------------------------------------------------------------------------
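The two primitives above combine into NDCG; a hedged usage sketch (it assumes DCGCalculator::Init has already been called with the label gains, and treating a query with zero max DCG as perfectly ranked is one common convention, not necessarily the one LightGBM's NDCG metric uses):

```cpp
#include <LightGBM/metric.h>

// NDCG@k for one query: DCG of the predicted order divided by the best
// achievable DCG for these labels (data_size_t is an int typedef in meta.h).
double NdcgAtK(int k, const float* label, const double* score, int num_data) {
  double max_dcg = LightGBM::DCGCalculator::CalMaxDCGAtK(k, label, num_data);
  if (max_dcg <= 0.0) {
    return 1.0;  // assumption: a query with no positive labels counts as perfect
  }
  double dcg = LightGBM::DCGCalculator::CalDCGAtK(k, label, score, num_data);
  return dcg / max_dcg;
}
```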