├── examples ├── .gitignore ├── parallel_learning │ ├── mlist.txt │ ├── predict.conf │ ├── README.md │ └── train.conf ├── lambdarank │ ├── predict.conf │ ├── rank.test.query │ ├── README.md │ ├── rank.train.query │ └── train.conf ├── regression │ ├── predict.conf │ ├── README.md │ └── train.conf ├── binary_classification │ ├── predict.conf │ ├── README.md │ └── train.conf ├── multiclass_classification │ ├── predict.conf │ ├── README.md │ └── train.conf ├── README.md └── python-guide │ ├── sklearn_example.py │ ├── README.md │ ├── plot_example.py │ └── simple_example.py ├── R-package ├── tests │ ├── testthat.R │ └── testthat │ │ ├── test_custom_objective.R │ │ ├── test_dataset.R │ │ └── test_basic.R ├── data │ ├── agaricus.test.rda │ └── agaricus.train.rda ├── unix_build_package.sh ├── win_build_package.cmd ├── demo │ ├── README.md │ ├── 00Index │ ├── boost_from_prediction.R │ ├── cross_validation.R │ ├── early_stopping.R │ └── multiclass.R ├── src │ ├── Makevars │ ├── Makevars.win │ ├── Makevars_fullcode │ ├── Makevars_fullcode.win │ ├── lightgbm-fullcode.cpp │ ├── lightgbm-all.cpp │ └── R_object_helper.h ├── man │ ├── lgb.Dataset.construct.Rd │ ├── lgb.get.eval.result.Rd │ ├── lgb.Dataset.save.Rd │ ├── lgb.Dataset.set.categorical.Rd │ ├── slice.Rd │ ├── lgb.Dataset.set.reference.Rd │ ├── agaricus.test.Rd │ ├── agaricus.train.Rd │ ├── dim.Rd │ ├── lgb.Dataset.create.valid.Rd │ ├── lgb.load.Rd │ ├── lgb.dump.Rd │ ├── dimnames.lgb.Dataset.Rd │ ├── lgb.save.Rd │ ├── getinfo.Rd │ ├── readRDS.lgb.Booster.Rd │ ├── setinfo.Rd │ ├── lgb.Dataset.Rd │ ├── lgb.importance.Rd │ ├── lgb.interprete.Rd │ ├── lgb.plot.importance.Rd │ ├── lgb.unloader.Rd │ ├── lgb.model.dt.tree.Rd │ ├── lgb.plot.interpretation.Rd │ ├── saveRDS.lgb.Booster.Rd │ └── predict.lgb.Booster.Rd ├── NAMESPACE ├── LICENSE ├── DESCRIPTION ├── R │ ├── readRDS.lgb.Booster.R │ ├── lgb.importance.R │ ├── lgb.unloader.R │ ├── lgb.plot.importance.R │ ├── saveRDS.lgb.Booster.R │ └── lightgbm.R └── README.md ├── docs ├── Installation-Guide.md ├── Parallel-Learning-Guide.md ├── Readme.md ├── Parameters-tuning.md ├── FAQ.md └── development.md ├── .gitmodules ├── .github └── ISSUE_TEMPLATE.md ├── include └── LightGBM │ ├── export.h │ ├── meta.h │ ├── utils │ ├── threading.h │ ├── openmp_wrapper.h │ ├── pipeline_reader.h │ ├── log.h │ └── random.h │ ├── objective_function.h │ ├── application.h │ ├── tree_learner.h │ ├── dataset_loader.h │ └── metric.h ├── src ├── main.cpp ├── network │ └── linkers_mpi.cpp ├── treelearner │ ├── tree_learner.cpp │ ├── feature_parallel_tree_learner.cpp │ ├── split_info.hpp │ └── leaf_splits.hpp ├── metric │ ├── metric.cpp │ └── dcg_calculator.cpp ├── boosting │ ├── boosting.cpp │ └── score_updater.hpp ├── objective │ └── objective_function.cpp └── io │ ├── parser.hpp │ └── parser.cpp ├── docker ├── README.md └── dockerfile-python ├── python-package ├── lightgbm │ ├── __init__.py │ ├── libpath.py │ └── compat.py ├── setup.py └── README.rst ├── LICENSE ├── .travis └── amd_sdk.sh ├── windows └── LightGBM.sln ├── pmml └── README.md ├── tests └── python_package_test │ └── test_basic.py ├── .travis.yml └── CMakeLists.txt /examples/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | -------------------------------------------------------------------------------- /examples/parallel_learning/mlist.txt: -------------------------------------------------------------------------------- 1 | 192.168.1.101 12400 2 | 192.168.1.102 12400 3 | 
-------------------------------------------------------------------------------- /R-package/tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(lightgbm) 3 | 4 | test_check("lightgbm") 5 | -------------------------------------------------------------------------------- /docs/Installation-Guide.md: -------------------------------------------------------------------------------- 1 | Refer to https://github.com/Microsoft/LightGBM/wiki/Installation-Guide. 2 | -------------------------------------------------------------------------------- /docs/Parallel-Learning-Guide.md: -------------------------------------------------------------------------------- 1 | Refer to https://github.com/Microsoft/LightGBM/wiki/Parallel-Learning-Guide -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "include/boost/compute"] 2 | path = compute 3 | url = https://github.com/boostorg/compute 4 | -------------------------------------------------------------------------------- /examples/lambdarank/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = rank.test 5 | 6 | input_model= LightGBM_model.txt 7 | -------------------------------------------------------------------------------- /R-package/data/agaricus.test.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huanzhang12/lightgbm-gpu/HEAD/R-package/data/agaricus.test.rda -------------------------------------------------------------------------------- /R-package/data/agaricus.train.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huanzhang12/lightgbm-gpu/HEAD/R-package/data/agaricus.train.rda -------------------------------------------------------------------------------- /examples/regression/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = regression.test 5 | 6 | input_model= LightGBM_model.txt 7 | -------------------------------------------------------------------------------- /examples/binary_classification/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = binary.test 5 | 6 | input_model= LightGBM_model.txt 7 | -------------------------------------------------------------------------------- /examples/parallel_learning/predict.conf: -------------------------------------------------------------------------------- 1 | 2 | task = predict 3 | 4 | data = binary.test 5 | 6 | input_model= LightGBM_model.txt 7 | 8 | -------------------------------------------------------------------------------- /examples/multiclass_classification/predict.conf: -------------------------------------------------------------------------------- 1 | task = predict 2 | 3 | data = multiclass.test 4 | 5 | input_model= LightGBM_model.txt 6 | -------------------------------------------------------------------------------- /R-package/unix_build_package.sh: -------------------------------------------------------------------------------- 1 | cp ../include ./src/include -rf 2 | cp ../src ./src/src -rf 3 | rm ./src/Makevars 4 | cp ./src/Makevars_fullcode ./src/Makevars -f 5 | R CMD build --no-build-vignettes 
. -------------------------------------------------------------------------------- /R-package/win_build_package.cmd: -------------------------------------------------------------------------------- 1 | xcopy ..\include src\include /e /i /y 2 | xcopy ..\src src\src /e /i /y 3 | del .\src\Makevars.win 4 | copy .\src\Makevars_fullcode.win .\src\Makevars.win /y 5 | R CMD build --no-build-vignettes . -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | Examples 2 | ===================== 3 | 4 | You can learn how to use LightGBM by these examples. If you have any questions, please refer to our [wiki](https://github.com/Microsoft/LightGBM/wiki). 5 | 6 | -------------------------------------------------------------------------------- /R-package/demo/README.md: -------------------------------------------------------------------------------- 1 | LightGBM R examples 2 | ==== 3 | * [Basic walkthrough of wrappers](basic_walkthrough.R) 4 | * [Boosting from existing prediction](boost_from_prediction.R) 5 | * [Early Stopping](early_stopping.R) 6 | * [Cross Validation](cross_validation.R) 7 | * [Multiclass Training/Prediction](multiclass.R) 8 | * [Leaf (in)Stability](leaf_stability.R) 9 | -------------------------------------------------------------------------------- /R-package/demo/00Index: -------------------------------------------------------------------------------- 1 | basic_walkthrough Basic feature walkthrough 2 | boost_from_prediction Boosting from existing prediction 3 | early_stopping Early Stop in training 4 | cross_validation Cross Validation 5 | multiclass Multiclass training/prediction 6 | leaf_stability Leaf (in)Stability example 7 | -------------------------------------------------------------------------------- /R-package/src/Makevars: -------------------------------------------------------------------------------- 1 | # package root 2 | PKGROOT=../../ 3 | 4 | ENABLE_STD_THREAD=1 5 | CXX_STD = CXX11 6 | 7 | LGBM_RFLAGS = -DUSE_SOCKET 8 | 9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS) -Wno-deprecated-declarations 10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11 11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) 12 | OBJECTS = ./lightgbm-all.o ./lightgbm_R.o 13 | -------------------------------------------------------------------------------- /R-package/src/Makevars.win: -------------------------------------------------------------------------------- 1 | # package root 2 | PKGROOT=../../ 3 | 4 | ENABLE_STD_THREAD=1 5 | CXX_STD = CXX11 6 | 7 | LGBM_RFLAGS = -DUSE_SOCKET 8 | 9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS) 10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11 11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -lws2_32 -liphlpapi 12 | OBJECTS = ./lightgbm-all.o ./lightgbm_R.o 13 | -------------------------------------------------------------------------------- /examples/lambdarank/rank.test.query: -------------------------------------------------------------------------------- 1 | 12 2 | 19 3 | 18 4 | 10 5 | 15 6 | 15 7 | 22 8 | 23 9 | 18 10 | 16 11 | 16 12 | 11 13 | 6 14 | 13 15 | 17 16 | 21 17 | 20 18 | 16 19 | 13 20 | 16 21 | 21 22 | 15 23 | 10 24 | 19 25 | 10 26 | 13 27 | 18 28 | 17 29 | 23 30 | 24 31 | 16 32 | 13 33 | 17 34 | 24 35 | 17 36 | 10 37 | 17 38 | 15 39 | 18 40 | 16 41 | 9 42 | 9 43 | 21 44 | 14 45 | 13 46 | 13 47 | 13 48 | 10 49 | 10 50 | 6 51 | 
--------------------------------------------------------------------------------
/R-package/src/Makevars_fullcode:
--------------------------------------------------------------------------------
1 | # package root
2 | PKGROOT=.
3 | 
4 | ENABLE_STD_THREAD=1
5 | CXX_STD = CXX11
6 | 
7 | LGBM_RFLAGS = -DUSE_SOCKET
8 | 
9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS) -Wno-deprecated-declarations
10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11
11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
12 | OBJECTS = ./lightgbm-fullcode.o ./lightgbm_R.o
13 | 
--------------------------------------------------------------------------------
/R-package/src/Makevars_fullcode.win:
--------------------------------------------------------------------------------
1 | # package root
2 | PKGROOT=.
3 | 
4 | ENABLE_STD_THREAD=1
5 | CXX_STD = CXX11
6 | 
7 | LGBM_RFLAGS = -DUSE_SOCKET
8 | 
9 | PKG_CPPFLAGS= -I$(PKGROOT)/include $(LGBM_RFLAGS)
10 | PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -std=c++11
11 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) -lws2_32 -liphlpapi
12 | OBJECTS = ./lightgbm-fullcode.o ./lightgbm_R.o
13 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please search previous issues, Stack Overflow, and other search engines for your question before opening a new issue.
2 | 
3 | For bugs and unexpected behavior, please provide the following information so that we can reproduce the problem on our systems.
4 | 
5 | ## Environment info
6 | Operating System:
7 | CPU:
8 | C++/Python/R version:
9 | 
10 | ## Error Message:
11 | 
12 | ## Reproducible examples
13 | 
14 | ## Steps to reproduce
15 | 
16 | 1.
17 | 2.
18 | 3.
19 | 
--------------------------------------------------------------------------------
/include/LightGBM/export.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_EXPORT_H_
2 | #define LIGHTGBM_EXPORT_H_
3 | 
4 | /** Macros for exporting symbols in MSVC/GCC/CLANG **/
5 | 
6 | #ifdef __cplusplus
7 | #define LIGHTGBM_EXTERN_C extern "C"
8 | #else
9 | #define LIGHTGBM_EXTERN_C
10 | #endif
11 | 
12 | 
13 | #ifdef _MSC_VER
14 | #define LIGHTGBM_EXPORT __declspec(dllexport)
15 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
16 | #else
17 | #define LIGHTGBM_EXPORT
18 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
19 | #endif
20 | 
21 | #endif /** LIGHTGBM_EXPORT_H_ **/
22 | 
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <LightGBM/application.h>
3 | 
4 | int main(int argc, char** argv) {
5 |   try {
6 |     LightGBM::Application app(argc, argv);
7 |     app.Run();
8 |   }
9 |   catch (const std::exception& ex) {
10 |     std::cerr << "Caught exception:" << std::endl;
11 |     std::cerr << ex.what() << std::endl;
12 |     exit(-1);
13 |   }
14 |   catch (const std::string& ex) {
15 |     std::cerr << "Caught exception:" << std::endl;
16 |     std::cerr << ex << std::endl;
17 |     exit(-1);
18 |   }
19 |   catch (...)
{
20 |     std::cerr << "Unknown exception" << std::endl;
21 |     exit(-1);
22 |   }
23 | }
--------------------------------------------------------------------------------
/docs/Readme.md:
--------------------------------------------------------------------------------
1 | Documents
2 | =========
3 | * [Installation Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide)
4 | * [Quick Start](./Quick-Start.md)
5 | * [Python Quick Start](./Python-intro.md)
6 | * [Features](https://github.com/Microsoft/LightGBM/wiki/Features)
7 | * [Experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments)
8 | * [Parameters](./Parameters.md)
9 | * [Parameters Tuning](./Parameters-tuning.md)
10 | * [Python API Reference](./Python-API.md)
11 | * [Parallel Learning Guide](https://github.com/Microsoft/LightGBM/wiki/Parallel-Learning-Guide)
12 | * [FAQ](./FAQ.md)
13 | * [Development Guide](./development.md)
14 | 
15 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.construct.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.construct}
4 | \alias{lgb.Dataset.construct}
5 | \title{Construct Dataset explicitly}
6 | \usage{
7 | lgb.Dataset.construct(dataset)
8 | }
9 | \arguments{
10 | \item{dataset}{Object of class \code{lgb.Dataset}}
11 | }
12 | \description{
13 | Construct Dataset explicitly
14 | }
15 | \examples{
16 | \dontrun{
17 | library(lightgbm)
18 | data(agaricus.train, package = "lightgbm")
19 | train <- agaricus.train
20 | dtrain <- lgb.Dataset(train$data, label = train$label)
21 | lgb.Dataset.construct(dtrain)
22 | }
23 | 
24 | }
25 | 
26 | 
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Using LightGBM via Docker
2 | 
3 | This directory contains a `Dockerfile` to make it easy to build and run LightGBM via [Docker](http://www.docker.com/).
4 | 
5 | ## Installing Docker
6 | 
7 | Follow the general installation instructions
8 | [on the Docker site](https://docs.docker.com/installation/):
9 | 
10 | * [OSX](https://docs.docker.com/installation/mac/): [docker toolbox](https://www.docker.com/toolbox)
11 | * [Ubuntu](https://docs.docker.com/installation/ubuntulinux/)
12 | 
13 | ## Running the container
14 | 
15 | Build the container (for Python users):
16 | 
17 |     $ docker build -t lightgbm -f dockerfile-python .
18 | 
19 | After the build finishes, run the container:
20 | 
21 |     $ docker run --rm -it lightgbm
22 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.get.eval.result.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.get.eval.result}
4 | \alias{lgb.get.eval.result}
5 | \title{Get record evaluation result from booster}
6 | \usage{
7 | lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
8 |   is_err = FALSE)
9 | }
10 | \arguments{
11 | \item{booster}{Object of class \code{lgb.Booster}}
12 | 
13 | \item{data_name}{name of the dataset}
14 | 
15 | \item{eval_name}{name of the evaluation metric}
16 | 
17 | \item{iters}{iterations; NULL will return all}
18 | 
19 | \item{is_err}{TRUE will return the evaluation error instead}
20 | }
21 | \value{
22 | vector of evaluation results
23 | }
24 | \description{
25 | Get record evaluation result from booster
26 | }
27 | 
28 | 
--------------------------------------------------------------------------------
/examples/lambdarank/README.md:
--------------------------------------------------------------------------------
1 | LambdaRank Example
2 | =====================
3 | Here is an example of using LightGBM for a lambdarank task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
--------------------------------------------------------------------------------
/examples/regression/README.md:
--------------------------------------------------------------------------------
1 | Regression Example
2 | =====================
3 | Here is an example of using LightGBM for a regression task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.save}
4 | \alias{lgb.Dataset.save}
5 | \title{Save \code{lgb.Dataset} to a binary file}
6 | \usage{
7 | lgb.Dataset.save(dataset, fname)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 | 
12 | \item{fname}{filename of the output file}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | Save \code{lgb.Dataset} to a binary file
19 | }
20 | \examples{
21 | 
22 | \dontrun{
23 | library(lightgbm)
24 | data(agaricus.train, package = "lightgbm")
25 | train <- agaricus.train
26 | dtrain <- lgb.Dataset(train$data, label = train$label)
27 | lgb.Dataset.save(dtrain, "data.bin")
28 | }
29 | 
30 | }
31 | 
32 | 
--------------------------------------------------------------------------------
/examples/binary_classification/README.md:
--------------------------------------------------------------------------------
1 | Binary Classification Example
2 | =====================
3 | Here is an example of using LightGBM for a binary classification task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
35 | 
--------------------------------------------------------------------------------
/examples/multiclass_classification/README.md:
--------------------------------------------------------------------------------
1 | Multiclass Classification Example
2 | =====================
3 | Here is an example of using LightGBM for a multiclass classification task.
4 | 
5 | ***You should copy the executable file to this folder first.***
6 | 
7 | #### Training
8 | 
9 | For Windows, run the following command in this folder:
10 | ```
11 | lightgbm.exe config=train.conf
12 | ```
13 | 
14 | 
15 | For Linux, run the following command in this folder:
16 | ```
17 | ./lightgbm config=train.conf
18 | ```
19 | 
20 | #### Prediction
21 | 
22 | You should finish training first.
23 | 
24 | For Windows, run the following command in this folder:
25 | ```
26 | lightgbm.exe config=predict.conf
27 | ```
28 | 
29 | For Linux, run the following command in this folder:
30 | ```
31 | ./lightgbm config=predict.conf
32 | ```
33 | 
34 | 
--------------------------------------------------------------------------------
/include/LightGBM/meta.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_META_H_
2 | #define LIGHTGBM_META_H_
3 | 
4 | #include <cstdint>
5 | 
6 | #include <limits>
7 | #include <vector>
8 | #include <utility>
9 | #include <functional>
10 | 
11 | namespace LightGBM {
12 | 
13 | /*! \brief Type of data size; it is better to use a signed type */
14 | typedef int32_t data_size_t;
15 | /*! \brief Type of scores and gradients */
16 | typedef float score_t;
17 | 
18 | const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
19 | 
20 | const score_t kEpsilon = 1e-15f;
21 | 
22 | using ReduceFunction = std::function<void(const char*, char*, int)>;
23 | 
24 | using PredictFunction =
25 |   std::function<void(const std::vector<std::pair<int, double>>&, double* output)>;
26 | 
27 | #define NO_SPECIFIC (-1)
28 | 
29 | }  // namespace LightGBM
30 | 
31 | #endif   // LightGBM_META_H_
32 | 
--------------------------------------------------------------------------------
/examples/parallel_learning/README.md:
--------------------------------------------------------------------------------
1 | Parallel Learning Example
2 | =====================
3 | Here is an example of using LightGBM to perform parallel learning across 2 machines.
4 | 
5 | 1. Edit mlist.txt and write the IPs of the 2 machines that you want to run the application on:
6 | 
7 | ```
8 | machine1_ip 12400
9 | machine2_ip 12400
10 | ```
11 | 
12 | 2. Copy this folder and the executable file to both machines.
13 | 3. Run the following command in this folder on both machines:
14 | 
15 | For Windows: ```lightgbm.exe config=train.conf```
16 | 
17 | For Linux: ```./lightgbm config=train.conf```
18 | 
19 | This parallel learning example is socket-based. LightGBM also supports MPI-based parallel learning (a minimal launch sketch follows at the end of this README).
20 | 
21 | For more details about the usage of parallel learning, please refer to [this guide](https://github.com/Microsoft/LightGBM/wiki/Parallel-Learning-Guide).
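With MPI, the training job is launched once from a single machine instead of being started by hand on every machine. A minimal sketch, assuming LightGBM was compiled with MPI support (`cmake -DUSE_MPI=ON ..`) and a hypothetical `machinefile.txt` that lists one host per line (an MPI machine file, unlike `mlist.txt`, carries no port numbers):

```
mpiexec --machinefile machinefile.txt ./lightgbm config=train.conf
```

`mpiexec` then starts one worker process on every listed host, so the command does not need to be repeated on each machine.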
22 | -------------------------------------------------------------------------------- /R-package/man/lgb.Dataset.set.categorical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{lgb.Dataset.set.categorical} 4 | \alias{lgb.Dataset.set.categorical} 5 | \title{Set categorical feature of \code{lgb.Dataset}} 6 | \usage{ 7 | lgb.Dataset.set.categorical(dataset, categorical_feature) 8 | } 9 | \arguments{ 10 | \item{dataset}{object of class \code{lgb.Dataset}} 11 | 12 | \item{categorical_feature}{categorical features} 13 | } 14 | \value{ 15 | passed dataset 16 | } 17 | \description{ 18 | Set categorical feature of \code{lgb.Dataset} 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(agaricus.train, package = "lightgbm") 24 | train <- agaricus.train 25 | dtrain <- lgb.Dataset(train$data, label = train$label) 26 | lgb.Dataset.save(dtrain, "lgb.Dataset.data") 27 | dtrain <- lgb.Dataset("lgb.Dataset.data") 28 | lgb.Dataset.set.categorical(dtrain, 1:2) 29 | } 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/network/linkers_mpi.cpp: -------------------------------------------------------------------------------- 1 | #ifdef USE_MPI 2 | #include "linkers.h" 3 | 4 | namespace LightGBM { 5 | 6 | Linkers::Linkers(NetworkConfig config) { 7 | int argc = 0; 8 | char**argv = nullptr; 9 | int flag = 0; 10 | MPI_SAFE_CALL(MPI_Initialized(&flag)); // test if MPI has been initialized 11 | if (!flag) { // if MPI not started, start it 12 | MPI_SAFE_CALL(MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &flag)); 13 | } 14 | MPI_SAFE_CALL(MPI_Comm_size(MPI_COMM_WORLD, &num_machines_)); 15 | MPI_SAFE_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank_)); 16 | // wait for all client start up 17 | MPI_SAFE_CALL(MPI_Barrier(MPI_COMM_WORLD)); 18 | bruck_map_ = BruckMap::Construct(rank_, num_machines_); 19 | recursive_halving_map_ = RecursiveHalvingMap::Construct(rank_, num_machines_); 20 | } 21 | 22 | Linkers::~Linkers() { 23 | MPI_SAFE_CALL(MPI_Finalize()); 24 | } 25 | 26 | 27 | } // namespace LightGBM 28 | #endif // USE_MPI 29 | 30 | -------------------------------------------------------------------------------- /R-package/man/slice.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{slice} 4 | \alias{slice} 5 | \alias{slice.lgb.Dataset} 6 | \title{Slice a dataset} 7 | \usage{ 8 | slice(dataset, ...) 9 | 10 | \method{slice}{lgb.Dataset}(dataset, idxset, ...) 
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class "lgb.Dataset"}
14 | 
15 | \item{...}{other parameters (currently not used)}
16 | 
17 | \item{idxset}{an integer vector of indices of the rows needed}
18 | }
19 | \value{
20 | constructed sub dataset
21 | }
22 | \description{
23 | Get a new \code{lgb.Dataset} containing the specified rows of
24 | the original lgb.Dataset object
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | 
33 | dsub <- lightgbm::slice(dtrain, 1:42)
34 | labels <- lightgbm::getinfo(dsub, "label")
35 | }
36 | 
37 | }
38 | 
39 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.set.reference.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.set.reference}
4 | \alias{lgb.Dataset.set.reference}
5 | \title{Set reference of \code{lgb.Dataset}}
6 | \usage{
7 | lgb.Dataset.set.reference(dataset, reference)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 | 
12 | \item{reference}{object of class \code{lgb.Dataset}}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | If you want to use validation data, you should set its reference to the training data
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset(test$data, label = test$label)
29 | lgb.Dataset.set.reference(dtest, dtrain)
30 | }
31 | 
32 | }
33 | 
34 | 
--------------------------------------------------------------------------------
/R-package/man/agaricus.test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{agaricus.test}
5 | \alias{agaricus.test}
6 | \title{Test part from Mushroom Data Set}
7 | \format{A list containing a label vector, and a dgCMatrix object with 1611
8 | rows and 126 variables}
9 | \usage{
10 | data(agaricus.test)
11 | }
12 | \description{
13 | This data set is originally from the Mushroom data set,
14 | UCI Machine Learning Repository.
15 | }
16 | \details{
17 | This data set includes the following fields:
18 | 
19 | \itemize{
20 | \item \code{label} the label for each record
21 | \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
22 | }
23 | }
24 | \references{
25 | https://archive.ics.uci.edu/ml/datasets/Mushroom
26 | 
27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
29 | School of Information and Computer Science.
30 | } 31 | \keyword{datasets} 32 | 33 | -------------------------------------------------------------------------------- /R-package/man/agaricus.train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lightgbm.R 3 | \docType{data} 4 | \name{agaricus.train} 5 | \alias{agaricus.train} 6 | \title{Training part from Mushroom Data Set} 7 | \format{A list containing a label vector, and a dgCMatrix object with 6513 8 | rows and 127 variables} 9 | \usage{ 10 | data(agaricus.train) 11 | } 12 | \description{ 13 | This data set is originally from the Mushroom data set, 14 | UCI Machine Learning Repository. 15 | } 16 | \details{ 17 | This data set includes the following fields: 18 | 19 | \itemize{ 20 | \item \code{label} the label for each record 21 | \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns. 22 | } 23 | } 24 | \references{ 25 | https://archive.ics.uci.edu/ml/datasets/Mushroom 26 | 27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 29 | School of Information and Computer Science. 30 | } 31 | \keyword{datasets} 32 | 33 | -------------------------------------------------------------------------------- /python-package/lightgbm/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """LightGBM, Light Gradient Boosting Machine. 3 | 4 | Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors 5 | """ 6 | 7 | from __future__ import absolute_import 8 | 9 | from .basic import Booster, Dataset 10 | from .callback import (early_stopping, print_evaluation, record_evaluation, 11 | reset_parameter) 12 | from .engine import cv, train 13 | 14 | try: 15 | from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker 16 | except ImportError: 17 | pass 18 | try: 19 | from .plotting import plot_importance, plot_metric, plot_tree, create_tree_digraph 20 | except ImportError: 21 | pass 22 | 23 | 24 | __version__ = 0.1 25 | 26 | __all__ = ['Dataset', 'Booster', 27 | 'train', 'cv', 28 | 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 29 | 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 30 | 'plot_importance', 'plot_metric', 'plot_tree', 'create_tree_digraph'] 31 | -------------------------------------------------------------------------------- /R-package/man/dim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{dim.lgb.Dataset} 4 | \alias{dim.lgb.Dataset} 5 | \title{Dimensions of an lgb.Dataset} 6 | \usage{ 7 | \method{dim}{lgb.Dataset}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Object of class \code{lgb.Dataset}} 11 | 12 | \item{...}{other parameters} 13 | } 14 | \value{ 15 | a vector of numbers of rows and of columns 16 | } 17 | \description{ 18 | Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}. 19 | } 20 | \details{ 21 | Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also 22 | be directly used with an \code{lgb.Dataset} object. 
23 | } 24 | \examples{ 25 | \dontrun{ 26 | library(lightgbm) 27 | data(agaricus.train, package = "lightgbm") 28 | train <- agaricus.train 29 | dtrain <- lgb.Dataset(train$data, label = train$label) 30 | 31 | stopifnot(nrow(dtrain) == nrow(train$data)) 32 | stopifnot(ncol(dtrain) == ncol(train$data)) 33 | stopifnot(all(dim(dtrain) == dim(train$data))) 34 | } 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /R-package/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("dimnames<-",lgb.Dataset) 4 | S3method(dim,lgb.Dataset) 5 | S3method(dimnames,lgb.Dataset) 6 | S3method(getinfo,lgb.Dataset) 7 | S3method(predict,lgb.Booster) 8 | S3method(setinfo,lgb.Dataset) 9 | S3method(slice,lgb.Dataset) 10 | export(getinfo) 11 | export(lgb.Dataset) 12 | export(lgb.Dataset.construct) 13 | export(lgb.Dataset.create.valid) 14 | export(lgb.Dataset.save) 15 | export(lgb.Dataset.set.categorical) 16 | export(lgb.Dataset.set.reference) 17 | export(lgb.cv) 18 | export(lgb.dump) 19 | export(lgb.get.eval.result) 20 | export(lgb.importance) 21 | export(lgb.interprete) 22 | export(lgb.load) 23 | export(lgb.model.dt.tree) 24 | export(lgb.plot.importance) 25 | export(lgb.plot.interpretation) 26 | export(lgb.save) 27 | export(lgb.train) 28 | export(lgb.unloader) 29 | export(lightgbm) 30 | export(readRDS.lgb.Booster) 31 | export(saveRDS.lgb.Booster) 32 | export(setinfo) 33 | export(slice) 34 | import(methods) 35 | importFrom(R6,R6Class) 36 | importFrom(data.table,":=") 37 | importFrom(magrittr,"%>%") 38 | importFrom(magrittr,"%T>%") 39 | useDynLib(lightgbm) 40 | -------------------------------------------------------------------------------- /R-package/man/lgb.Dataset.create.valid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{lgb.Dataset.create.valid} 4 | \alias{lgb.Dataset.create.valid} 5 | \title{Construct validation data} 6 | \usage{ 7 | lgb.Dataset.create.valid(dataset, data, info = list(), ...) 
8 | } 9 | \arguments{ 10 | \item{dataset}{\code{lgb.Dataset} object, training data} 11 | 12 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} 13 | 14 | \item{info}{a list of information of the lgb.Dataset object} 15 | 16 | \item{...}{other information to pass to \code{info}.} 17 | } 18 | \value{ 19 | constructed dataset 20 | } 21 | \description{ 22 | Construct validation data according to training data 23 | } 24 | \examples{ 25 | \dontrun{ 26 | library(lightgbm) 27 | data(agaricus.train, package = "lightgbm") 28 | train <- agaricus.train 29 | dtrain <- lgb.Dataset(train$data, label = train$label) 30 | data(agaricus.test, package = "lightgbm") 31 | test <- agaricus.test 32 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 33 | } 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_custom_objective.R: -------------------------------------------------------------------------------- 1 | context('Test models with custom objective') 2 | 3 | data(agaricus.train, package='lightgbm') 4 | data(agaricus.test, package='lightgbm') 5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 7 | watchlist <- list(eval = dtest, train = dtrain) 8 | 9 | logregobj <- function(preds, dtrain) { 10 | labels <- getinfo(dtrain, "label") 11 | preds <- 1 / (1 + exp(-preds)) 12 | grad <- preds - labels 13 | hess <- preds * (1 - preds) 14 | return(list(grad = grad, hess = hess)) 15 | } 16 | 17 | evalerror <- function(preds, dtrain) { 18 | labels <- getinfo(dtrain, "label") 19 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels) 20 | return(list(name = "error", value = err, higher_better=FALSE)) 21 | } 22 | 23 | param <- list(num_leaves=8, learning_rate=1, 24 | objective=logregobj, metric="auc") 25 | num_round <- 10 26 | 27 | test_that("custom objective works", { 28 | bst <- lgb.train(param, dtrain, num_round, watchlist, eval = evalerror) 29 | expect_false(is.null(bst$record_evals)) 30 | }) 31 | -------------------------------------------------------------------------------- /python-package/setup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable=invalid-name, exec-used 3 | """Setup lightgbm package.""" 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import sys 8 | 9 | from setuptools import find_packages, setup 10 | 11 | sys.path.insert(0, '.') 12 | 13 | CURRENT_DIR = os.path.dirname(__file__) 14 | 15 | libpath_py = os.path.join(CURRENT_DIR, 'lightgbm/libpath.py') 16 | libpath = {'__file__': libpath_py} 17 | exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath) 18 | 19 | LIB_PATH = [os.path.relpath(path, CURRENT_DIR) for path in libpath['find_lib_path']()] 20 | print("Install lib_lightgbm from: %s" % LIB_PATH) 21 | 22 | 23 | setup(name='lightgbm', 24 | version=0.1, 25 | description="LightGBM Python Package", 26 | install_requires=[ 27 | 'numpy', 28 | 'scipy', 29 | ], 30 | maintainer='Guolin Ke', 31 | maintainer_email='guolin.ke@microsoft.com', 32 | zip_safe=False, 33 | packages=find_packages(), 34 | include_package_data=True, 35 | data_files=[('lightgbm', LIB_PATH)], 36 | url='https://github.com/Microsoft/LightGBM') 37 | -------------------------------------------------------------------------------- /R-package/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /R-package/man/lgb.load.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Booster.R 3 | \name{lgb.load} 4 | \alias{lgb.load} 5 | \title{Load LightGBM model} 6 | \usage{ 7 | lgb.load(filename) 8 | } 9 | \arguments{ 10 | \item{filename}{path of model file} 11 | } 12 | \value{ 13 | booster 14 | } 15 | \description{ 16 | Load LightGBM model from saved model file 17 | } 18 | \examples{ 19 | \dontrun{ 20 | library(lightgbm) 21 | data(agaricus.train, package = "lightgbm") 22 | train <- agaricus.train 23 | dtrain <- lgb.Dataset(train$data, label = train$label) 24 | data(agaricus.test, package = "lightgbm") 25 | test <- agaricus.test 26 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 27 | params <- list(objective = "regression", metric = "l2") 28 | valids <- list(test = dtest) 29 | model <- lgb.train(params, 30 | dtrain, 31 | 100, 32 | valids, 33 | min_data = 1, 34 | learning_rate = 1, 35 | early_stopping_rounds = 10) 36 | lgb.save(model, "model.txt") 37 | load_booster <- lgb.load("model.txt") 38 | } 39 | 40 | } 41 | 42 | -------------------------------------------------------------------------------- /R-package/src/lightgbm-fullcode.cpp: -------------------------------------------------------------------------------- 1 | // application 2 | #include "./src/application/application.cpp" 3 | 4 | // boosting 5 | #include "./src/boosting/boosting.cpp" 6 | #include "./src/boosting/gbdt.cpp" 7 | 8 | // io 9 | #include "./src/io/bin.cpp" 10 | #include "./src/io/config.cpp" 11 | #include "./src/io/dataset.cpp" 12 | #include "./src/io/dataset_loader.cpp" 13 | #include "./src/io/metadata.cpp" 14 | #include "./src/io/parser.cpp" 15 | #include "./src/io/tree.cpp" 16 | 17 | // metric 18 | #include "./src/metric/dcg_calculator.cpp" 19 | #include "./src/metric/metric.cpp" 20 | 21 | // network 22 | #include "./src/network/linker_topo.cpp" 23 | #include "./src/network/linkers_socket.cpp" 24 | #include "./src/network/network.cpp" 25 | 26 | // objective 27 | #include "./src/objective/objective_function.cpp" 28 | 29 | // treelearner 30 | #include "./src/treelearner/data_parallel_tree_learner.cpp" 31 | #include "./src/treelearner/feature_parallel_tree_learner.cpp" 32 | #include "./src/treelearner/serial_tree_learner.cpp" 33 | #include "./src/treelearner/tree_learner.cpp" 34 | #include "./src/treelearner/voting_parallel_tree_learner.cpp" 35 | 36 | // c_api 37 | #include "./src/c_api.cpp" 38 | -------------------------------------------------------------------------------- /R-package/src/lightgbm-all.cpp: -------------------------------------------------------------------------------- 1 | // application 2 | #include "../../src/application/application.cpp" 3 | 4 | // boosting 5 | #include "../../src/boosting/boosting.cpp" 6 | #include "../../src/boosting/gbdt.cpp" 7 | 8 | // io 9 | #include 
"../../src/io/bin.cpp" 10 | #include "../../src/io/config.cpp" 11 | #include "../../src/io/dataset.cpp" 12 | #include "../../src/io/dataset_loader.cpp" 13 | #include "../../src/io/metadata.cpp" 14 | #include "../../src/io/parser.cpp" 15 | #include "../../src/io/tree.cpp" 16 | 17 | // metric 18 | #include "../../src/metric/dcg_calculator.cpp" 19 | #include "../../src/metric/metric.cpp" 20 | 21 | // network 22 | #include "../../src/network/linker_topo.cpp" 23 | #include "../../src/network/linkers_socket.cpp" 24 | #include "../../src/network/network.cpp" 25 | 26 | // objective 27 | #include "../../src/objective/objective_function.cpp" 28 | 29 | // treelearner 30 | #include "../../src/treelearner/data_parallel_tree_learner.cpp" 31 | #include "../../src/treelearner/feature_parallel_tree_learner.cpp" 32 | #include "../../src/treelearner/serial_tree_learner.cpp" 33 | #include "../../src/treelearner/tree_learner.cpp" 34 | #include "../../src/treelearner/voting_parallel_tree_learner.cpp" 35 | 36 | // c_api 37 | #include "../../src/c_api.cpp" 38 | -------------------------------------------------------------------------------- /R-package/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: lightgbm 2 | Type: Package 3 | Title: Light Gradient Boosting Machine 4 | Version: 0.1 5 | Date: 2016-12-29 6 | Author: Guolin Ke 7 | Maintainer: Guolin Ke 8 | Description: LightGBM is a gradient boosting framework that uses tree based learning algorithms. 9 | It is designed to be distributed and efficient with the following advantages: 10 | 1.Faster training speed and higher efficiency. 11 | 2.Lower memory usage. 12 | 3.Better accuracy. 13 | 4.Parallel learning supported. 14 | 5. Capable of handling large-scale data. 15 | License: The MIT License (MIT) | file LICENSE 16 | URL: https://github.com/Microsoft/LightGBM 17 | BugReports: https://github.com/Microsoft/LightGBM/issues 18 | VignetteBuilder: knitr 19 | Suggests: 20 | knitr, 21 | rmarkdown, 22 | ggplot2 (>= 1.0.1), 23 | DiagrammeR (>= 0.8.1), 24 | Ckmeans.1d.dp (>= 3.3.1), 25 | vcd (>= 1.3), 26 | testthat, 27 | igraph (>= 1.0.1), 28 | stringi (>= 0.5.2) 29 | Depends: 30 | R (>= 3.0), 31 | R6 32 | Imports: 33 | methods, 34 | Matrix (>= 1.1-0), 35 | data.table (>= 1.9.6), 36 | magrittr (>= 1.5), 37 | jsonlite 38 | RoxygenNote: 5.0.1 39 | -------------------------------------------------------------------------------- /R-package/man/lgb.dump.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Booster.R 3 | \name{lgb.dump} 4 | \alias{lgb.dump} 5 | \title{Dump LightGBM model to json} 6 | \usage{ 7 | lgb.dump(booster, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{booster}{Object of class \code{lgb.Booster}} 11 | 12 | \item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration} 13 | } 14 | \value{ 15 | json format of model 16 | } 17 | \description{ 18 | Dump LightGBM model to json 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(agaricus.train, package = "lightgbm") 24 | train <- agaricus.train 25 | dtrain <- lgb.Dataset(train$data, label = train$label) 26 | data(agaricus.test, package = "lightgbm") 27 | test <- agaricus.test 28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 29 | params <- list(objective = "regression", metric = "l2") 30 | valids <- list(test = dtest) 31 | model <- 
lgb.train(params,
32 |                    dtrain,
33 |                    100,
34 |                    valids,
35 |                    min_data = 1,
36 |                    learning_rate = 1,
37 |                    early_stopping_rounds = 10)
38 | json_model <- lgb.dump(model)
39 | }
40 | 
41 | }
42 | 
43 | 
--------------------------------------------------------------------------------
/include/LightGBM/utils/threading.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_UTILS_THREADING_H_
2 | #define LIGHTGBM_UTILS_THREADING_H_
3 | 
4 | #include <LightGBM/utils/openmp_wrapper.h>
5 | 
6 | #include <functional>
7 | #include <vector>
8 | 
9 | namespace LightGBM {
10 | 
11 | class Threading {
12 | public:
13 | 
14 |   template <typename INDEX_T>
15 |   static inline void For(INDEX_T start, INDEX_T end, const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
16 |     int num_threads = 1;
17 | #pragma omp parallel
18 | #pragma omp master
19 |     {
20 |       num_threads = omp_get_num_threads();
21 |     }
22 |     INDEX_T num_inner = (end - start + num_threads - 1) / num_threads;
23 |     if (num_inner <= 0) { num_inner = 1; }
24 |     OMP_INIT_EX();
25 | #pragma omp parallel for schedule(static,1)
26 |     for (int i = 0; i < num_threads; ++i) {
27 |       OMP_LOOP_EX_BEGIN();
28 |       INDEX_T inner_start = start + num_inner * i;
29 |       INDEX_T inner_end = inner_start + num_inner;
30 |       if (inner_end > end) { inner_end = end; }
31 |       if (inner_start < end) {
32 |         inner_fun(i, inner_start, inner_end);
33 |       }
34 |       OMP_LOOP_EX_END();
35 |     }
36 |     OMP_THROW_EX();
37 |   }
38 | };
39 | 
40 | }  // namespace LightGBM
41 | 
42 | #endif   // LightGBM_UTILS_THREADING_H_
43 | 
--------------------------------------------------------------------------------
/R-package/man/dimnames.lgb.Dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{dimnames.lgb.Dataset}
4 | \alias{dimnames.lgb.Dataset}
5 | \alias{dimnames<-.lgb.Dataset}
6 | \title{Handling of column names of \code{lgb.Dataset}}
7 | \usage{
8 | \method{dimnames}{lgb.Dataset}(x)
9 | 
10 | \method{dimnames}{lgb.Dataset}(x) <- value
11 | }
12 | \arguments{
13 | \item{x}{object of class \code{lgb.Dataset}}
14 | 
15 | \item{value}{a list of two elements: the first one is ignored
16 | and the second one is column names}
17 | }
18 | \description{
19 | Only column names are supported for \code{lgb.Dataset}, so setting
20 | row names has no effect, and the returned row names are NULL.
21 | }
22 | \details{
23 | Generic \code{dimnames} methods are used by \code{colnames}.
24 | Since row names are irrelevant, it is recommended to use \code{colnames} directly.
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | lgb.Dataset.construct(dtrain)
33 | dimnames(dtrain)
34 | colnames(dtrain)
35 | colnames(dtrain) <- make.names(1:ncol(train$data))
36 | print(dtrain, verbose = TRUE)
37 | }
38 | 
39 | }
40 | 
41 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.save}
4 | \alias{lgb.save}
5 | \title{Save LightGBM model}
6 | \usage{
7 | lgb.save(booster, filename, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{booster}{Object of class \code{lgb.Booster}}
11 | 
12 | \item{filename}{filename to save the model to}
13 | 
14 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
15 | }
16 | \value{
17 | booster
18 | }
19 | \description{
20 | Save LightGBM model
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 |                    dtrain,
35 |                    100,
36 |                    valids,
37 |                    min_data = 1,
38 |                    learning_rate = 1,
39 |                    early_stopping_rounds = 10)
40 | lgb.save(model, "model.txt")
41 | }
42 | 
43 | }
44 | 
45 | 
--------------------------------------------------------------------------------
/docker/dockerfile-python:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 | 
3 | RUN apt-get update && \
4 |     apt-get install -y cmake build-essential gcc g++ git wget && \
5 | 
6 |     # open-mpi
7 |     cd /usr/local/src && mkdir openmpi && cd openmpi && \
8 |     wget https://www.open-mpi.org/software/ompi/v2.0/downloads/openmpi-2.0.1.tar.gz && \
9 |     tar -xzf openmpi-2.0.1.tar.gz && cd openmpi-2.0.1 && \
10 |     ./configure --prefix=/usr/local/openmpi && make && make install && \
11 |     export PATH="/usr/local/openmpi/bin:$PATH" && \
12 | 
13 |     # lightgbm
14 |     cd /usr/local/src && mkdir lightgbm && cd lightgbm && \
15 |     git clone --recursive https://github.com/Microsoft/LightGBM && \
16 |     cd LightGBM && mkdir build && cd build && cmake -DUSE_MPI=ON ..
&& make && \ 17 | 18 | # python-package 19 | # miniconda 20 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 21 | /bin/bash Miniconda3-latest-Linux-x86_64.sh -f -b -p /opt/conda && \ 22 | export PATH="/opt/conda/bin:$PATH" && \ 23 | # lightgbm 24 | conda install -y numpy scipy scikit-learn pandas && \ 25 | cd ../python-package && python setup.py install && \ 26 | 27 | # clean 28 | apt-get autoremove -y && apt-get clean && \ 29 | conda clean -i -l -t -y && \ 30 | rm -rf /usr/local/src/* 31 | 32 | ENV PATH /opt/conda/bin:$PATH 33 | -------------------------------------------------------------------------------- /python-package/lightgbm/libpath.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Find the path to lightgbm dynamic library files.""" 3 | import os 4 | import sys 5 | 6 | 7 | def find_lib_path(): 8 | """Find the path to LightGBM library files. 9 | Returns 10 | ------- 11 | lib_path: list(string) 12 | List of all found library path to LightGBM 13 | """ 14 | curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 15 | dll_path = [curr_path, os.path.join(curr_path, '../../lib/'), 16 | os.path.join(curr_path, '../../'), 17 | os.path.join(curr_path, './lib/'), 18 | os.path.join(sys.prefix, 'lightgbm')] 19 | if os.name == 'nt': 20 | dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/')) 21 | dll_path.append(os.path.join(curr_path, './windows/x64/Dll/')) 22 | dll_path.append(os.path.join(curr_path, '../../Release/')) 23 | dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path] 24 | else: 25 | dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path] 26 | lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] 27 | if not lib_path: 28 | dll_path = [os.path.realpath(p) for p in dll_path] 29 | raise Exception('Cannot find lightgbm Library in following paths: ' + ','.join(dll_path)) 30 | return lib_path 31 | -------------------------------------------------------------------------------- /R-package/man/getinfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{getinfo} 4 | \alias{getinfo} 5 | \alias{getinfo.lgb.Dataset} 6 | \title{Get information of an lgb.Dataset object} 7 | \usage{ 8 | getinfo(dataset, ...) 9 | 10 | \method{getinfo}{lgb.Dataset}(dataset, name, ...) 
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class \code{lgb.Dataset}}
14 | 
15 | \item{...}{other parameters}
16 | 
17 | \item{name}{the name of the information field to get (see details)}
18 | }
19 | \value{
20 | info data
21 | }
22 | \description{
23 | Get information of an lgb.Dataset object
24 | }
25 | \details{
26 | The \code{name} field can be one of the following:
27 | 
28 | \itemize{
29 | \item \code{label}: the label lightgbm learns from;
30 | \item \code{weight}: weights used to rescale each record;
31 | \item \code{group}: group sizes;
32 | \item \code{init_score}: the initial score, i.e. the base prediction lightgbm will boost from.
33 | }
34 | }
35 | \examples{
36 | \dontrun{
37 | library(lightgbm)
38 | data(agaricus.train, package = "lightgbm")
39 | train <- agaricus.train
40 | dtrain <- lgb.Dataset(train$data, label = train$label)
41 | lgb.Dataset.construct(dtrain)
42 | 
43 | labels <- lightgbm::getinfo(dtrain, "label")
44 | lightgbm::setinfo(dtrain, "label", 1 - labels)
45 | 
46 | labels2 <- lightgbm::getinfo(dtrain, "label")
47 | stopifnot(all(labels2 == 1 - labels))
48 | }
49 | 
50 | }
51 | 
52 | 
--------------------------------------------------------------------------------
/R-package/man/readRDS.lgb.Booster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readRDS.lgb.Booster.R
3 | \name{readRDS.lgb.Booster}
4 | \alias{readRDS.lgb.Booster}
5 | \title{readRDS for lgb.Booster models}
6 | \usage{
7 | readRDS.lgb.Booster(file = "", refhook = NULL)
8 | }
9 | \arguments{
10 | \item{file}{a connection or the name of the file where the R object is saved to or read from.}
11 | 
12 | \item{refhook}{a hook function for handling reference objects.}
13 | }
14 | \value{
15 | an R object.
16 | }
17 | \description{
18 | Attempts to load a model using RDS.
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
29 | params <- list(objective = "regression", metric = "l2")
30 | valids <- list(test = dtest)
31 | model <- lgb.train(params,
32 |                    dtrain,
33 |                    100,
34 |                    valids,
35 |                    min_data = 1,
36 |                    learning_rate = 1,
37 |                    early_stopping_rounds = 10)
38 | saveRDS.lgb.Booster(model, "model.rds")
39 | new_model <- readRDS.lgb.Booster("model.rds")
40 | }
41 | 
42 | }
43 | 
44 | 
--------------------------------------------------------------------------------
/R-package/man/setinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{setinfo}
4 | \alias{setinfo}
5 | \alias{setinfo.lgb.Dataset}
6 | \title{Set information of an lgb.Dataset object}
7 | \usage{
8 | setinfo(dataset, ...)
9 | 
10 | \method{setinfo}{lgb.Dataset}(dataset, name, info, ...)
11 | } 12 | \arguments{ 13 | \item{dataset}{Object of class "lgb.Dataset"} 14 | 15 | \item{...}{other parameters} 16 | 17 | \item{name}{the name of the field to set} 18 | 19 | \item{info}{the specific field of information to set} 20 | } 21 | \value{ 22 | passed object 23 | } 24 | \description{ 25 | Set information of an lgb.Dataset object 26 | } 27 | \details{ 28 | The \code{name} field can be one of the following: 29 | 30 | \itemize{ 31 | \item \code{label}: the label LightGBM learns from; 32 | \item \code{weight}: weights used to rescale each instance; 33 | \item \code{init_score}: the initial score, i.e. the base prediction LightGBM will boost from; 34 | \item \code{group}: group sizes. 35 | } 36 | } 37 | \examples{ 38 | \dontrun{ 39 | library(lightgbm) 40 | data(agaricus.train, package = "lightgbm") 41 | train <- agaricus.train 42 | dtrain <- lgb.Dataset(train$data, label = train$label) 43 | lgb.Dataset.construct(dtrain) 44 | 45 | labels <- lightgbm::getinfo(dtrain, "label") 46 | lightgbm::setinfo(dtrain, "label", 1 - labels) 47 | 48 | labels2 <- lightgbm::getinfo(dtrain, "label") 49 | stopifnot(all.equal(labels2, 1 - labels)) 50 | } 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /R-package/man/lgb.Dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Dataset.R 3 | \name{lgb.Dataset} 4 | \alias{lgb.Dataset} 5 | \title{Construct lgb.Dataset object} 6 | \usage{ 7 | lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL, 8 | categorical_feature = NULL, free_raw_data = TRUE, info = list(), ...) 9 | } 10 | \arguments{ 11 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} 12 | 13 | \item{params}{a list of parameters} 14 | 15 | \item{reference}{reference dataset} 16 | 17 | \item{colnames}{names of columns} 18 | 19 | \item{categorical_feature}{categorical features} 20 | 21 | \item{free_raw_data}{whether to free the raw data after constructing the Dataset; defaults to \code{TRUE}} 22 | 23 | \item{info}{a list of information of the lgb.Dataset object} 24 | 25 | \item{...}{other information to pass to \code{info} or parameters to pass to \code{params}} 26 | } 27 | \value{ 28 | constructed dataset 29 | } 30 | \description{ 31 | Construct lgb.Dataset object from dense matrix, sparse matrix 32 | or local file (that was created previously by saving an \code{lgb.Dataset}).
33 | } 34 | \examples{ 35 | \dontrun{ 36 | library(lightgbm) 37 | data(agaricus.train, package = "lightgbm") 38 | train <- agaricus.train 39 | dtrain <- lgb.Dataset(train$data, label = train$label) 40 | lgb.Dataset.save(dtrain, "lgb.Dataset.data") 41 | dtrain <- lgb.Dataset("lgb.Dataset.data") 42 | lgb.Dataset.construct(dtrain) 43 | } 44 | 45 | } 46 | 47 | -------------------------------------------------------------------------------- /R-package/demo/boost_from_prediction.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | require(methods) 3 | 4 | # Load in the agaricus dataset 5 | data(agaricus.train, package = "lightgbm") 6 | data(agaricus.test, package = "lightgbm") 7 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 8 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 9 | 10 | valids <- list(eval = dtest, train = dtrain) 11 | #--------------------Advanced features --------------------------- 12 | # advanced: start from an initial base prediction 13 | print("Start running example to start from an initial prediction") 14 | 15 | # Train lightgbm for 1 round 16 | param <- list(num_leaves = 4, 17 | learning_rate = 1, 18 | nthread = 2, 19 | silent = 1, 20 | objective = "binary") 21 | bst <- lgb.train(param, dtrain, 1, valids = valids) 22 | 23 | # Note: we need the margin value instead of the transformed prediction in set_init_score 24 | ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE) 25 | ptest <- predict(bst, agaricus.test$data, rawscore = TRUE) 26 | 27 | # set the init_score property of dtrain and dtest 28 | # base margin is the base prediction we will boost from 29 | setinfo(dtrain, "init_score", ptrain) 30 | setinfo(dtest, "init_score", ptest) 31 | 32 | print("This is the result of boosting from an initial prediction") 33 | bst <- lgb.train(params = param, 34 | data = dtrain, 35 | nrounds = 5, 36 | valids = valids) 37 | -------------------------------------------------------------------------------- /R-package/man/lgb.importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.importance.R 3 | \name{lgb.importance} 4 | \alias{lgb.importance} 5 | \title{Compute feature importance in a model} 6 | \usage{ 7 | lgb.importance(model, percentage = TRUE) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}.} 11 | 12 | \item{percentage}{whether to show importance in relative percentage.} 13 | } 14 | \value{ 15 | For a tree model, a \code{data.table} with the following columns: 16 | \itemize{ 17 | \item \code{Feature} Feature names in the model. 18 | \item \code{Gain} The total gain of this feature's splits. 19 | \item \code{Cover} The number of observations related to this feature. 20 | \item \code{Frequency} The number of times the feature is used to split in trees. 21 | } 22 | } 23 | \description{ 24 | Creates a \code{data.table} of feature importances in a model.
25 | } 26 | \examples{ 27 | \dontrun{ 28 | library(lightgbm) 29 | data(agaricus.train, package = "lightgbm") 30 | train <- agaricus.train 31 | dtrain <- lgb.Dataset(train$data, label = train$label) 32 | 33 | params = list(objective = "binary", 34 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 35 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 36 | model <- lgb.train(params, dtrain, 20) 37 | model <- lgb.train(params, dtrain, 20) 38 | 39 | tree_imp1 <- lgb.importance(model, percentage = TRUE) 40 | tree_imp2 <- lgb.importance(model, percentage = FALSE) 41 | } 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /.travis/amd_sdk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original script from https://github.com/gregvw/amd_sdk/ 4 | 5 | # Location from which to get the nonce and file name 6 | URL="http://developer.amd.com/tools-and-sdks/opencl-zone/opencl-tools-sdks/amd-accelerated-parallel-processing-app-sdk/" 7 | URLDOWN="http://developer.amd.com/amd-license-agreement-appsdk/" 8 | 9 | NONCE1_STRING='name="amd_developer_central_downloads_page_nonce"' 10 | FILE_STRING='name="f"' 11 | POSTID_STRING='name="post_id"' 12 | NONCE2_STRING='name="amd_developer_central_nonce"' 13 | 14 | #For newest FORM=`wget -qO - $URL | sed -n '/download-2/,/64-bit/p'` 15 | FORM=`wget -qO - $URL | sed -n '/download-5/,/64-bit/p'` 16 | 17 | # Get nonce from form 18 | NONCE1=`echo $FORM | awk -F ${NONCE1_STRING} '{print $2}'` 19 | NONCE1=`echo $NONCE1 | awk -F'"' '{print $2}'` 20 | echo $NONCE1 21 | 22 | # get the postid 23 | POSTID=`echo $FORM | awk -F ${POSTID_STRING} '{print $2}'` 24 | POSTID=`echo $POSTID | awk -F'"' '{print $2}'` 25 | echo $POSTID 26 | 27 | # get file name 28 | FILE=`echo $FORM | awk -F ${FILE_STRING} '{print $2}'` 29 | FILE=`echo $FILE | awk -F'"' '{print $2}'` 30 | echo $FILE 31 | 32 | FORM=`wget -qO - $URLDOWN --post-data "amd_developer_central_downloads_page_nonce=${NONCE1}&f=${FILE}&post_id=${POSTID}"` 33 | 34 | NONCE2=`echo $FORM | awk -F ${NONCE2_STRING} '{print $2}'` 35 | NONCE2=`echo $NONCE2 | awk -F'"' '{print $2}'` 36 | echo $NONCE2 37 | 38 | wget --content-disposition --trust-server-names $URLDOWN --post-data "amd_developer_central_nonce=${NONCE2}&f=${FILE}" -O AMD-SDK.tar.bz2; 39 | -------------------------------------------------------------------------------- /python-package/README.rst: -------------------------------------------------------------------------------- 1 | LightGBM Python Package 2 | ======================= 3 | 4 | Installation 5 | ------------ 6 | 7 | 1. Follow the `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build the library first. 8 | For Windows users, please change the build config to ``DLL``. 9 | 2. Install with ``cd python-package; python setup.py install`` 10 | 11 | Note: Make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__ installed. 12 | 13 | 14 | Examples 15 | -------- 16 | 17 | Refer to the walk-through examples in the `python-guide folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__ 18 | 19 | 20 | Troubleshooting 21 | --------------- 22 | 23 | Refer to `FAQ <https://github.com/Microsoft/LightGBM/blob/master/docs/FAQ.md>`__ 24 | 25 | Developments 26 | ------------ 27 | 28 | The code style of the python package follows `pep8 <https://www.python.org/dev/peps/pep-0008/>`__. If you would like to make a contribution and are not familiar with pep8, please check the pep8 style guide first. Otherwise, you won't pass the check.
You should be careful about: 29 | 30 | - E1 Indentation (check pep8 link above) 31 | - E202 whitespace before and after brackets 32 | - E225 missing whitespace around operator 33 | - E226 missing whitespace around arithmetic operator 34 | - E261 at least two spaces before inline comment 35 | - E301 expected 1 blank line in front of and at the end of a method 36 | - E302 expected 2 blank lines in front of and at the end of a function or a class 37 | 38 | You can ignore E501 (line too long). 39 | -------------------------------------------------------------------------------- /src/treelearner/tree_learner.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "serial_tree_learner.h" 4 | #include "gpu_tree_learner.h" 5 | #include "parallel_tree_learner.h" 6 | 7 | namespace LightGBM { 8 | 9 | TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, const std::string& device_type, const TreeConfig* tree_config) { 10 | if (device_type == std::string("cpu")) { 11 | if (learner_type == std::string("serial")) { 12 | return new SerialTreeLearner(tree_config); 13 | } else if (learner_type == std::string("feature")) { 14 | return new FeatureParallelTreeLearner(tree_config); 15 | } else if (learner_type == std::string("data")) { 16 | return new DataParallelTreeLearner(tree_config); 17 | } else if (learner_type == std::string("voting")) { 18 | return new VotingParallelTreeLearner(tree_config); 19 | } 20 | } 21 | else if (device_type == std::string("gpu")) { 22 | if (learner_type == std::string("serial")) { 23 | return new GPUTreeLearner(tree_config); 24 | } else if (learner_type == std::string("feature")) { 25 | return new FeatureParallelTreeLearner(tree_config); 26 | } else if (learner_type == std::string("data")) { 27 | return new DataParallelTreeLearner(tree_config); 28 | } else if (learner_type == std::string("voting")) { 29 | return new VotingParallelTreeLearner(tree_config); 30 | } 31 | } 32 | return nullptr; 33 | } 34 | 35 | } // namespace LightGBM 36 | -------------------------------------------------------------------------------- /windows/LightGBM.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug_mpi|x64 = Debug_mpi|x64 11 | Debug|x64 = Debug|x64 12 | DLL|x64 = DLL|x64 13 | Release_mpi|x64 = Release_mpi|x64 14 | Release|x64 = Release|x64 15 | EndGlobalSection 16 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 17 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64 18 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64 19 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64 20 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64 21 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.ActiveCfg = DLL|x64 22 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.Build.0 = DLL|x64 23 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64 24 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64 25 | 
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64 26 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64 27 | EndGlobalSection 28 | GlobalSection(SolutionProperties) = preSolution 29 | HideSolutionNode = FALSE 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /src/metric/metric.cpp: -------------------------------------------------------------------------------- 1 | #include <LightGBM/metric.h> 2 | #include "regression_metric.hpp" 3 | #include "binary_metric.hpp" 4 | #include "rank_metric.hpp" 5 | #include "map_metric.hpp" 6 | #include "multiclass_metric.hpp" 7 | 8 | namespace LightGBM { 9 | 10 | Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config) { 11 | if (type == std::string("l2") || type == std::string("mean_squared_error") || type == std::string("mse")) { 12 | return new L2Metric(config); 13 | } else if (type == std::string("l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) { 14 | return new L1Metric(config); 15 | } else if (type == std::string("huber")) { 16 | return new HuberLossMetric(config); 17 | } else if (type == std::string("fair")) { 18 | return new FairLossMetric(config); 19 | } else if (type == std::string("poisson")) { 20 | return new PoissonMetric(config); 21 | } else if (type == std::string("binary_logloss")) { 22 | return new BinaryLoglossMetric(config); 23 | } else if (type == std::string("binary_error")) { 24 | return new BinaryErrorMetric(config); 25 | } else if (type == std::string("auc")) { 26 | return new AUCMetric(config); 27 | } else if (type == std::string("ndcg")) { 28 | return new NDCGMetric(config); 29 | } else if (type == std::string("map")) { 30 | return new MapMetric(config); 31 | } else if (type == std::string("multi_logloss")) { 32 | return new MultiSoftmaxLoglossMetric(config); 33 | } else if (type == std::string("multi_error")) { 34 | return new MultiErrorMetric(config); 35 | } 36 | return nullptr; 37 | } 38 | 39 | } // namespace LightGBM 40 | -------------------------------------------------------------------------------- /examples/multiclass_classification/train.conf: -------------------------------------------------------------------------------- 1 | # task type, support train and predict 2 | task = train 3 | 4 | # boosting type, support gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, support the following applications 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # multiclass 12 | # alias: application, app 13 | objective = multiclass 14 | 15 | # eval metrics, support multiple metrics delimited by ',', support the following metrics 16 | # l1 17 | # l2 , default metric for regression 18 | # ndcg , default metric for lambdarank 19 | # auc 20 | # binary_logloss , default metric for binary 21 | # binary_error 22 | # multi_logloss 23 | # multi_error 24 | metric = multi_logloss 25 | 26 | # number of classes, for multiclass classification 27 | num_class = 5 28 | 29 | # frequency for metric output 30 | metric_freq = 1 31 | 32 | # true if metrics need to be output for training data, alias: training_metric, train_metric 33 | is_training_metric = true 34 | 35 | # number of bins for feature buckets; 255 is a recommended setting that saves memory and also gives good accuracy.
36 | max_bin = 255 37 | 38 | # training data 39 | # if a weight file exists, it should be named "multiclass.train.weight" 40 | # alias: train_data, train 41 | data = multiclass.train 42 | 43 | # valid data 44 | valid_data = multiclass.test 45 | 46 | # rounds for early stopping 47 | early_stopping = 10 48 | 49 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 50 | num_trees = 100 51 | 52 | # shrinkage rate, alias: shrinkage_rate 53 | learning_rate = 0.05 54 | 55 | # number of leaves for one tree, alias: num_leaf 56 | num_leaves = 31 57 | -------------------------------------------------------------------------------- /examples/python-guide/sklearn_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | from sklearn.metrics import mean_squared_error 6 | from sklearn.model_selection import GridSearchCV 7 | 8 | # load or create your dataset 9 | print('Load data...') 10 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 11 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 12 | 13 | y_train = df_train[0].values 14 | y_test = df_test[0].values 15 | X_train = df_train.drop(0, axis=1).values 16 | X_test = df_test.drop(0, axis=1).values 17 | 18 | print('Start training...') 19 | # train 20 | gbm = lgb.LGBMRegressor(objective='regression', 21 | num_leaves=31, 22 | learning_rate=0.05, 23 | n_estimators=20) 24 | gbm.fit(X_train, y_train, 25 | eval_set=[(X_test, y_test)], 26 | eval_metric='l1', 27 | early_stopping_rounds=5) 28 | 29 | print('Start predicting...') 30 | # predict 31 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 32 | # eval 33 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 34 | 35 | print('Calculate feature importances...') 36 | # feature importances 37 | print('Feature importances:', list(gbm.feature_importances_)) 38 | 39 | # other scikit-learn modules 40 | estimator = lgb.LGBMRegressor(num_leaves=31) 41 | 42 | param_grid = { 43 | 'learning_rate': [0.01, 0.1, 1], 44 | 'n_estimators': [20, 40] 45 | } 46 | 47 | gbm = GridSearchCV(estimator, param_grid) 48 | 49 | gbm.fit(X_train, y_train) 50 | 51 | print('Best parameters found by grid search are:', gbm.best_params_) 52 | -------------------------------------------------------------------------------- /examples/python-guide/README.md: -------------------------------------------------------------------------------- 1 | Python Package Example 2 | ===================== 3 | Here are examples of how to use the LightGBM Python package. 4 | 5 | ***You should install LightGBM (both the C++ library and the Python package) first.*** 6 | 7 | For the installation, check the wiki [here](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide). 8 | 9 | You also need scikit-learn, pandas and matplotlib (only for the plot example) to run the examples, but they are not required for the package itself.
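If you are unsure which of these optional dependencies are already present, a quick sketch to check is to import them by their standard module names (adjust the names if your environment differs):
```
# Sanity check: confirm the optional dependencies are importable.
# 'sklearn', 'pandas' and 'matplotlib' are the standard import names.
for name in ('sklearn', 'pandas', 'matplotlib'):
    try:
        module = __import__(name)
        print(name, getattr(module, '__version__', 'unknown version'))
    except ImportError:
        print(name, 'is missing')
```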
You can install them with pip: 10 | ``` 11 | pip install scikit-learn pandas matplotlib -U 12 | ``` 13 | 14 | Now you can run examples in this folder, for example: 15 | ``` 16 | python simple_example.py 17 | ``` 18 | Examples including: 19 | - [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py) 20 | - Construct Dataset 21 | - Basic train and predict 22 | - Eval during training 23 | - Early stopping 24 | - Save model to file 25 | - Dump model to json format 26 | - Feature importances 27 | - [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py) 28 | - Basic train and predict with sklearn interface 29 | - Feature importances with sklearn interface 30 | - [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py) 31 | - Set feature names 32 | - Directly use categorical features without one-hot encoding 33 | - Load model to predict 34 | - Dump and load model with pickle 35 | - Load model file to continue training 36 | - Change learning rates during training 37 | - Self-defined objective function 38 | - Self-defined eval metric 39 | - Callback function -------------------------------------------------------------------------------- /examples/lambdarank/rank.train.query: -------------------------------------------------------------------------------- 1 | 1 2 | 13 3 | 5 4 | 8 5 | 19 6 | 12 7 | 18 8 | 5 9 | 14 10 | 13 11 | 8 12 | 9 13 | 16 14 | 11 15 | 21 16 | 14 17 | 21 18 | 9 19 | 14 20 | 11 21 | 20 22 | 18 23 | 13 24 | 20 25 | 22 26 | 22 27 | 13 28 | 17 29 | 10 30 | 13 31 | 12 32 | 13 33 | 13 34 | 23 35 | 18 36 | 13 37 | 20 38 | 12 39 | 22 40 | 14 41 | 13 42 | 23 43 | 13 44 | 14 45 | 14 46 | 5 47 | 13 48 | 15 49 | 14 50 | 14 51 | 16 52 | 16 53 | 15 54 | 21 55 | 22 56 | 10 57 | 22 58 | 18 59 | 25 60 | 16 61 | 12 62 | 12 63 | 15 64 | 15 65 | 25 66 | 13 67 | 9 68 | 12 69 | 8 70 | 16 71 | 25 72 | 19 73 | 24 74 | 12 75 | 16 76 | 10 77 | 16 78 | 9 79 | 17 80 | 15 81 | 7 82 | 9 83 | 15 84 | 14 85 | 16 86 | 17 87 | 8 88 | 17 89 | 12 90 | 18 91 | 23 92 | 10 93 | 12 94 | 12 95 | 4 96 | 14 97 | 12 98 | 15 99 | 27 100 | 16 101 | 20 102 | 13 103 | 19 104 | 13 105 | 17 106 | 17 107 | 16 108 | 12 109 | 15 110 | 14 111 | 14 112 | 19 113 | 12 114 | 23 115 | 18 116 | 16 117 | 9 118 | 23 119 | 11 120 | 15 121 | 8 122 | 10 123 | 10 124 | 16 125 | 11 126 | 15 127 | 22 128 | 16 129 | 17 130 | 23 131 | 16 132 | 22 133 | 17 134 | 14 135 | 12 136 | 14 137 | 20 138 | 15 139 | 17 140 | 15 141 | 15 142 | 22 143 | 9 144 | 21 145 | 9 146 | 17 147 | 16 148 | 15 149 | 13 150 | 13 151 | 15 152 | 14 153 | 18 154 | 21 155 | 14 156 | 17 157 | 15 158 | 14 159 | 16 160 | 12 161 | 17 162 | 19 163 | 16 164 | 11 165 | 18 166 | 11 167 | 13 168 | 14 169 | 9 170 | 16 171 | 15 172 | 16 173 | 25 174 | 9 175 | 13 176 | 22 177 | 16 178 | 18 179 | 20 180 | 14 181 | 11 182 | 9 183 | 16 184 | 19 185 | 19 186 | 11 187 | 11 188 | 13 189 | 14 190 | 14 191 | 13 192 | 16 193 | 6 194 | 21 195 | 16 196 | 12 197 | 16 198 | 11 199 | 24 200 | 12 201 | 10 202 | -------------------------------------------------------------------------------- /examples/python-guide/plot_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | 6 | try: 7 | import matplotlib.pyplot as plt 8 | except ImportError: 9 | raise ImportError('You need to install 
matplotlib for plot_example.py.') 10 | 11 | # load or create your dataset 12 | print('Load data...') 13 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 14 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 15 | 16 | y_train = df_train[0].values 17 | y_test = df_test[0].values 18 | X_train = df_train.drop(0, axis=1).values 19 | X_test = df_test.drop(0, axis=1).values 20 | 21 | # create dataset for lightgbm 22 | lgb_train = lgb.Dataset(X_train, y_train) 23 | lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train) 24 | 25 | # specify your configurations as a dict 26 | params = { 27 | 'num_leaves': 5, 28 | 'metric': ('l1', 'l2'), 29 | 'verbose': 0 30 | } 31 | 32 | evals_result = {}  # to record eval results for plotting 33 | 34 | print('Start training...') 35 | # train 36 | gbm = lgb.train(params, 37 | lgb_train, 38 | num_boost_round=100, 39 | valid_sets=[lgb_train, lgb_test], 40 | feature_name=['f' + str(i + 1) for i in range(28)], 41 | categorical_feature=[21], 42 | evals_result=evals_result, 43 | verbose_eval=10) 44 | 45 | print('Plot metrics during training...') 46 | ax = lgb.plot_metric(evals_result, metric='l1') 47 | plt.show() 48 | 49 | print('Plot feature importances...') 50 | ax = lgb.plot_importance(gbm, max_num_features=10) 51 | plt.show() 52 | 53 | print('Plot 84th tree...')  # this tree uses a categorical feature to split 54 | ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) 55 | plt.show() 56 | -------------------------------------------------------------------------------- /docs/Parameters-tuning.md: -------------------------------------------------------------------------------- 1 | This page describes how to tune the parameters of LightGBM. 2 | 3 | ***List of other Helpful Links*** 4 | * [Parameters](./Parameters.md) 5 | * [Python API Reference](./Python-API.md) 6 | 7 | ## Convert parameters from XGBoost 8 | 9 | LightGBM uses the [leaf-wise](https://github.com/Microsoft/LightGBM/wiki/Features#optimization-in-accuracy) tree growth algorithm. But other popular tools, e.g. XGBoost, use depth-wise tree growth. So LightGBM uses ```num_leaves``` to control the complexity of the tree model, while other tools usually use ```max_depth```. The following table shows the correspondence between leaves and depths; the relation is ```num_leaves = 2^(max_depth)```. 10 | 11 | | max_depth | num_leaves | 12 | | --------- | ---------- | 13 | | 1 | 2 | 14 | | 2 | 4 | 15 | | 3 | 8 | 16 | | 7 | 128 | 17 | | 10 | 1024 | 18 | 19 | ## For faster speed 20 | 21 | * Use bagging by setting ```bagging_fraction``` and ```bagging_freq``` 22 | * Use feature sub-sampling by setting ```feature_fraction``` 23 | * Use small ```max_bin``` 24 | * Use ```save_binary``` to speed up data loading in future learning 25 | * Use parallel learning, refer to [parallel learning guide](./Parallel-Learning-Guide.md); a parameter sketch illustrating these speed settings is shown below.
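As a concrete illustration of the items above, here is a minimal sketch using the Python package; the values and the input file name are illustrative examples, not tuned recommendations:
```
import lightgbm as lgb

# Illustrative speed-oriented settings; values are examples only.
params = {
    'objective': 'regression',
    'max_bin': 63,             # small max_bin -> faster histogram construction
    'bagging_fraction': 0.8,   # bagging: sample 80% of the rows...
    'bagging_freq': 5,         # ...every 5 iterations
    'feature_fraction': 0.8,   # feature sub-sampling
}
train_data = lgb.Dataset('regression.train')   # hypothetical input file
train_data.save_binary('train.bin')            # speeds up data loading next time
booster = lgb.train(params, train_data, num_boost_round=100)
```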
26 | 27 | ## For better accuracy 28 | 29 | * Use large ```max_bin``` (may be slower) 30 | * Use small ```learning_rate``` with large ```num_iterations``` 31 | * Use large ```num_leaves``` (may cause over-fitting) 32 | * Use bigger training data 33 | * Try ```dart``` 34 | 35 | ## Deal with over-fitting 36 | 37 | * Use small ```max_bin``` 38 | * Use small ```num_leaves``` 39 | * Use ```min_data_in_leaf``` and ```min_sum_hessian_in_leaf``` 40 | * Use bagging by setting ```bagging_fraction``` and ```bagging_freq``` 41 | * Use feature sub-sampling by setting ```feature_fraction``` 42 | * Use bigger training data 43 | * Try ```lambda_l1```, ```lambda_l2``` and ```min_gain_to_split``` for regularization 44 | * Try ```max_depth``` to avoid growing deep trees 45 | -------------------------------------------------------------------------------- /R-package/man/lgb.interprete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.interprete.R 3 | \name{lgb.interprete} 4 | \alias{lgb.interprete} 5 | \title{Compute feature contributions of a prediction} 6 | \usage{ 7 | lgb.interprete(model, data, idxset, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}.} 11 | 12 | \item{data}{a matrix object or a dgCMatrix object.} 13 | 14 | \item{idxset}{an integer vector of indices of the rows needed.} 15 | 16 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration.} 17 | } 18 | \value{ 19 | For regression, binary classification and lambdarank models, a \code{list} of \code{data.table} with the following columns: 20 | \itemize{ 21 | \item \code{Feature} Feature names in the model. 22 | \item \code{Contribution} The total contribution of this feature's splits. 23 | } 24 | For multiclass classification, a \code{list} of \code{data.table} with the Feature column and Contribution columns for each class. 25 | } 26 | \description{ 27 | Computes the feature contribution components of a raw-score prediction.
28 | } 29 | \examples{ 30 | \dontrun{ 31 | library(lightgbm) 32 | Sigmoid <- function(x) 1 / (1 + exp(-x)) 33 | Logit <- function(x) log(x / (1 - x)) 34 | data(agaricus.train, package = "lightgbm") 35 | train <- agaricus.train 36 | dtrain <- lgb.Dataset(train$data, label = train$label) 37 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) 38 | data(agaricus.test, package = "lightgbm") 39 | test <- agaricus.test 40 | 41 | params = list(objective = "binary", 42 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 43 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 44 | model <- lgb.train(params, dtrain, 20) 45 | model <- lgb.train(params, dtrain, 20) 46 | 47 | tree_interpretation <- lgb.interprete(model, test$data, 1:5) 48 | } 49 | 50 | } 51 | 52 | -------------------------------------------------------------------------------- /R-package/man/lgb.plot.importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.plot.importance.R 3 | \name{lgb.plot.importance} 4 | \alias{lgb.plot.importance} 5 | \title{Plot feature importance as a bar graph} 6 | \usage{ 7 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain", 8 | left_margin = 10, cex = NULL) 9 | } 10 | \arguments{ 11 | \item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.} 12 | 13 | \item{top_n}{maximal number of top features to include into the plot.} 14 | 15 | \item{measure}{the name of importance measure to plot, can be "Gain", "Cover" or "Frequency".} 16 | 17 | \item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.} 18 | 19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.} 20 | } 21 | \value{ 22 | The \code{lgb.plot.importance} function creates a \code{barplot} 23 | and silently returns a processed data.table with \code{top_n} features sorted by defined importance. 24 | } 25 | \description{ 26 | Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph. 27 | } 28 | \details{ 29 | The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. 30 | Features are shown ranked in a decreasing importance order. 
31 | } 32 | \examples{ 33 | \dontrun{ 34 | data(agaricus.train, package = "lightgbm") 35 | train <- agaricus.train 36 | dtrain <- lgb.Dataset(train$data, label = train$label) 37 | 38 | params = list(objective = "binary", 39 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 40 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 41 | model <- lgb.train(params, dtrain, 20) 42 | model <- lgb.train(params, dtrain, 20) 43 | 44 | tree_imp <- lgb.importance(model, percentage = TRUE) 45 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain") 46 | } 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /R-package/demo/cross_validation.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | # load in the agaricus dataset 3 | data(agaricus.train, package = "lightgbm") 4 | data(agaricus.test, package = "lightgbm") 5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 7 | 8 | nrounds <- 2 9 | param <- list(num_leaves = 4, 10 | learning_rate = 1, 11 | objective = "binary") 12 | 13 | print("Running cross validation") 14 | # Do cross validation; this will print the result out as 15 | # [iteration] metric_name:mean_value+std_value 16 | # std_value is the standard deviation of the metric 17 | lgb.cv(param, 18 | dtrain, 19 | nrounds, 20 | nfold = 5, 21 | eval = "binary_error") 22 | 23 | print("Running cross validation, disable standard deviation display") 24 | # Do cross validation; this will print the result out as 25 | # [iteration] metric_name:mean_value+std_value 26 | # std_value is the standard deviation of the metric 27 | lgb.cv(param, 28 | dtrain, 29 | nrounds, 30 | nfold = 5, 31 | eval = "binary_error", 32 | showsd = FALSE) 33 | 34 | # You can also do cross validation with a customized loss function 35 | # See custom_objective.R 36 | print("Running cross validation, with customized loss function") 37 | 38 | logregobj <- function(preds, dtrain) { 39 | labels <- getinfo(dtrain, "label") 40 | preds <- 1 / (1 + exp(-preds)) 41 | grad <- preds - labels 42 | hess <- preds * (1 - preds) 43 | return(list(grad = grad, hess = hess)) 44 | } 45 | evalerror <- function(preds, dtrain) { 46 | labels <- getinfo(dtrain, "label") 47 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels) 48 | return(list(name = "error", value = err, higher_better = FALSE)) 49 | } 50 | 51 | # train with customized objective 52 | lgb.cv(params = param, 53 | data = dtrain, 54 | nrounds = nrounds, 55 | obj = logregobj, 56 | eval = evalerror, 57 | nfold = 5) 58 | -------------------------------------------------------------------------------- /R-package/R/readRDS.lgb.Booster.R: -------------------------------------------------------------------------------- 1 | #' readRDS for lgb.Booster models 2 | #' 3 | #' Attempts to load a model using RDS. 4 | #' 5 | #' @param file a connection or the name of the file where the R object is saved to or read from. 6 | #' @param refhook a hook function for handling reference objects. 7 | #' 8 | #' @return an R object.
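#' @details If the booster was saved with a raw model string (see \code{saveRDS.lgb.Booster}
#'   with \code{raw = TRUE}), the raw string is written to a temporary file and reloaded
#'   through \code{lgb.load}, so the returned booster has a valid handle (see the
#'   function body below).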
9 | #' 10 | #' @examples 11 | #' \dontrun{ 12 | #' library(lightgbm) 13 | #' data(agaricus.train, package = "lightgbm") 14 | #' train <- agaricus.train 15 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 16 | #' data(agaricus.test, package = "lightgbm") 17 | #' test <- agaricus.test 18 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 19 | #' params <- list(objective = "regression", metric = "l2") 20 | #' valids <- list(test = dtest) 21 | #' model <- lgb.train(params, 22 | #' dtrain, 23 | #' 100, 24 | #' valids, 25 | #' min_data = 1, 26 | #' learning_rate = 1, 27 | #' early_stopping_rounds = 10) 28 | #' saveRDS.lgb.Booster(model, "model.rds") 29 | #' new_model <- readRDS.lgb.Booster("model.rds") 30 | #' } 31 | #' 32 | #' @export 33 | readRDS.lgb.Booster <- function(file = "", refhook = NULL) { 34 | 35 | # Read RDS file 36 | object <- readRDS(file = file, refhook = refhook) 37 | 38 | # Check if object has the model stored 39 | if (!is.na(object$raw)) { 40 | 41 | # Create temporary file for the model loading 42 | temp <- tempfile() 43 | write(object$raw, temp) 44 | object2 <- lgb.load(temp) 45 | file.remove(temp) 46 | 47 | # Restore best iteration and recorded evaluations 48 | object2$best_iter <- object$best_iter 49 | object2$record_evals <- object$record_evals 50 | 51 | # Return newly loaded object 52 | return(object2) 53 | 54 | } else { 55 | 56 | # Return RDS loaded object 57 | return(object) 58 | 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /examples/python-guide/simple_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import json 4 | import lightgbm as lgb 5 | import pandas as pd 6 | from sklearn.metrics import mean_squared_error 7 | 8 | 9 | # load or create your dataset 10 | print('Load data...') 11 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 12 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 13 | 14 | y_train = df_train[0].values 15 | y_test = df_test[0].values 16 | X_train = df_train.drop(0, axis=1).values 17 | X_test = df_test.drop(0, axis=1).values 18 | 19 | # create dataset for lightgbm 20 | lgb_train = lgb.Dataset(X_train, y_train) 21 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 22 | 23 | # specify your configurations as a dict 24 | params = { 25 | 'task': 'train', 26 | 'boosting_type': 'gbdt', 27 | 'objective': 'regression', 28 | 'metric': {'l2', 'auc'}, 29 | 'num_leaves': 31, 30 | 'learning_rate': 0.05, 31 | 'feature_fraction': 0.9, 32 | 'bagging_fraction': 0.8, 33 | 'bagging_freq': 5, 34 | 'verbose': 0 35 | } 36 | 37 | print('Start training...') 38 | # train 39 | gbm = lgb.train(params, 40 | lgb_train, 41 | num_boost_round=20, 42 | valid_sets=lgb_eval, 43 | early_stopping_rounds=5) 44 | 45 | print('Save model...') 46 | # save model to file 47 | gbm.save_model('model.txt') 48 | 49 | print('Start predicting...') 50 | # predict 51 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 52 | # eval 53 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 54 | 55 | print('Dump model to JSON...') 56 | # dump model to json (and save to file) 57 | model_json = gbm.dump_model() 58 | 59 | with open('model.json', 'w+') as f: 60 | json.dump(model_json, f, indent=4) 61 | 62 | 63 | print('Feature names:', gbm.feature_name()) 64 | 65 | print('Calculate feature 
importances...') 66 | # feature importances 67 | print('Feature importances:', list(gbm.feature_importance())) 68 | -------------------------------------------------------------------------------- /R-package/man/lgb.unloader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.unloader.R 3 | \name{lgb.unloader} 4 | \alias{lgb.unloader} 5 | \title{LightGBM unloading error fix} 6 | \usage{ 7 | lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) 8 | } 9 | \arguments{ 10 | \item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.} 11 | 12 | \item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.} 13 | 14 | \item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.} 15 | } 16 | \value{ 17 | NULL invisibly. 18 | } 19 | \description{ 20 | Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | library(lightgbm) 25 | data(agaricus.train, package = "lightgbm") 26 | train <- agaricus.train 27 | dtrain <- lgb.Dataset(train$data, label = train$label) 28 | data(agaricus.test, package = "lightgbm") 29 | test <- agaricus.test 30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 31 | params <- list(objective = "regression", metric = "l2") 32 | valids <- list(test = dtest) 33 | model <- lgb.train(params, 34 | dtrain, 35 | 100, 36 | valids, 37 | min_data = 1, 38 | learning_rate = 1, 39 | early_stopping_rounds = 10) 40 | lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) 41 | rm(model, dtrain, dtest) # Not needed if wipe = TRUE 42 | gc() # Not needed if wipe = TRUE 43 | 44 | library(lightgbm) 45 | # Do whatever you want again with LightGBM without object clashing 46 | } 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /include/LightGBM/utils/openmp_wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_OPENMP_WRAPPER_H_ 2 | #define LIGHTGBM_OPENMP_WRAPPER_H_ 3 | #ifdef _OPENMP 4 | 5 | #include <omp.h> 6 | #include <exception> 7 | #include <stdexcept> 8 | #include <mutex> 9 | #include <memory> 10 | #include <vector> 11 | #include "log.h" 12 | 13 | class ThreadExceptionHelper { 14 | public: 15 | ThreadExceptionHelper() { 16 | ex_ptr_ = nullptr; 17 | } 18 | 19 | ~ThreadExceptionHelper() { 20 | ReThrow(); 21 | } 22 | void ReThrow() { 23 | if (ex_ptr_ != nullptr) { 24 | std::rethrow_exception(ex_ptr_); 25 | ex_ptr_ = nullptr; 26 | } 27 | } 28 | void CaptureException() { 29 | // only catch first exception.
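// Double-checked pattern: the unlocked read below is a cheap fast path;
// the check is repeated after acquiring the mutex so that only the first
// failing thread stores its exception.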
30 | if (ex_ptr_ != nullptr) { return; } 31 | std::unique_lock<std::mutex> guard(lock_); 32 | if (ex_ptr_ != nullptr) { return; } 33 | ex_ptr_ = std::current_exception(); 34 | } 35 | private: 36 | std::exception_ptr ex_ptr_; 37 | std::mutex lock_; 38 | }; 39 | 40 | #define OMP_INIT_EX() ThreadExceptionHelper omp_except_helper 41 | #define OMP_LOOP_EX_BEGIN() try { 42 | 43 | #define OMP_LOOP_EX_END() } \ 44 | catch(std::exception& ex) { Log::Warning(ex.what()); omp_except_helper.CaptureException(); } \ 45 | catch(...) { omp_except_helper.CaptureException(); } 46 | #define OMP_THROW_EX() omp_except_helper.ReThrow() 47 | 48 | #else 49 | 50 | #ifdef _MSC_VER 51 | #pragma warning( disable : 4068 ) // disable unknown pragma warning 52 | #endif 53 | 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | /** Fall here if no OPENMP support, so just 58 | simulate a single thread running. 59 | All #pragma omp should be ignored by the compiler **/ 60 | inline void omp_set_num_threads(int) {} 61 | inline int omp_get_num_threads() {return 1;} 62 | inline int omp_get_thread_num() {return 0;} 63 | #ifdef __cplusplus 64 | }; // extern "C" 65 | #endif 66 | 67 | #define OMP_INIT_EX() 68 | #define OMP_LOOP_EX_BEGIN() 69 | #define OMP_LOOP_EX_END() 70 | #define OMP_THROW_EX() 71 | 72 | #endif 73 | 74 | 75 | 76 | #endif /* LIGHTGBM_OPENMP_WRAPPER_H_ */ 77 | -------------------------------------------------------------------------------- /R-package/demo/early_stopping.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | require(methods) 3 | 4 | # Load in the agaricus dataset 5 | data(agaricus.train, package = "lightgbm") 6 | data(agaricus.test, package = "lightgbm") 7 | 8 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) 9 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) 10 | 11 | # Note: for a customized objective function, we leave objective as the default 12 | # Note: what we get in prediction is the margin value 13 | # You must know what you are doing 14 | param <- list(num_leaves = 4, 15 | learning_rate = 1) 16 | valids <- list(eval = dtest) 17 | num_round <- 20 18 | 19 | # User-defined objective function: given predictions, return the gradient and second-order gradient 20 | # This is log-likelihood loss 21 | logregobj <- function(preds, dtrain) { 22 | labels <- getinfo(dtrain, "label") 23 | preds <- 1 / (1 + exp(-preds)) 24 | grad <- preds - labels 25 | hess <- preds * (1 - preds) 26 | return(list(grad = grad, hess = hess)) 27 | } 28 | 29 | # User-defined evaluation function: returns a metric name, a result and a higher_better flag 30 | # NOTE: when you use a customized loss function, the default prediction value is the margin 31 | # This may make the built-in evaluation metrics not function properly 32 | # For example, with logistic loss the prediction is the score before the logistic transformation 33 | # The built-in evaluation error assumes the input is after the logistic transformation 34 | # Keep this in mind when you use customization; you may need to write a customized evaluation function 35 | evalerror <- function(preds, dtrain) { 36 | labels <- getinfo(dtrain, "label") 37 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels) 38 | return(list(name = "error", value = err, higher_better = FALSE)) 39 | } 40 | print("Start training with early stopping setting") 41 | 42 | bst <- lgb.train(param, 43 | dtrain, 44 | num_round, 45 | valids, 46 | objective = logregobj, 47 | eval = evalerror, 48 | early_stopping_round = 3) 49 |
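# A sketch of inspecting the early-stopping result. `best_iter` is the field
# used elsewhere in this package (e.g. in saveRDS.lgb.Booster); we assume it
# is populated on `bst` here as well.
print(paste("Best iteration:", bst$best_iter))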
-------------------------------------------------------------------------------- /R-package/man/lgb.model.dt.tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.model.dt.tree.R 3 | \name{lgb.model.dt.tree} 4 | \alias{lgb.model.dt.tree} 5 | \title{Parse a LightGBM model JSON dump} 6 | \usage{ 7 | lgb.model.dt.tree(model, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}} 11 | } 12 | \value{ 13 | A \code{data.table} with detailed information about the model trees' nodes and leaves. 14 | 15 | The columns of the \code{data.table} are: 16 | 17 | \itemize{ 18 | \item \code{tree_index}: ID of a tree in a model (integer) 19 | \item \code{split_index}: ID of a node in a tree (integer) 20 | \item \code{split_feature}: for a node, it's a feature name (character); 21 | for a leaf, it simply labels it as \code{"NA"} 22 | \item \code{node_parent}: ID of the parent node for current node (integer) 23 | \item \code{leaf_index}: ID of a leaf in a tree (integer) 24 | \item \code{leaf_parent}: ID of the parent node for current leaf (integer) 25 | \item \code{split_gain}: Split gain of a node 26 | \item \code{threshold}: Splitting threshold value of a node 27 | \item \code{decision_type}: Decision type of a node 28 | \item \code{internal_value}: Node value 29 | \item \code{internal_count}: The number of observations collected by a node 30 | \item \code{leaf_value}: Leaf value 31 | \item \code{leaf_count}: The number of observations collected by a leaf 32 | } 33 | } 34 | \description{ 35 | Parse a LightGBM model JSON dump into a \code{data.table} structure. 36 | } 37 | \examples{ 38 | \dontrun{ 39 | library(lightgbm) 40 | 41 | data(agaricus.train, package = "lightgbm") 42 | train <- agaricus.train 43 | dtrain <- lgb.Dataset(train$data, label = train$label) 44 | 45 | params = list(objective = "binary", 46 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 47 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 48 | model <- lgb.train(params, dtrain, 20) 49 | model <- lgb.train(params, dtrain, 20) 50 | 51 | tree_dt <- lgb.model.dt.tree(model) 52 | } 53 | 54 | } 55 | 56 | -------------------------------------------------------------------------------- /pmml/README.md: -------------------------------------------------------------------------------- 1 | PMML Generator 2 | ============== 3 | The script pmml.py can be used to translate the LightGBM models, found in LightGBM_model.txt, to the predictive model markup language (PMML). These models can then be imported by other analytics applications. The models that the language can describe include decision trees. The specification of PMML can be found at the Data Mining Group's [website](http://dmg.org/pmml/v4-3/GeneralStructure.html). 4 | 5 | In order to generate PMML files, do the following steps. 6 | ``` 7 | lightgbm config=train.conf 8 | python pmml.py LightGBM_model.txt 9 | ``` 10 | The Python script will create a file called **LightGBM_pmml.xml**. Inside the file you will find a `MiningModel` tag. In there you will find `TreeModel` tags. Each `TreeModel` tag contains the PMML translation of a decision tree inside the LightGBM_model.txt file. The model described by the **LightGBM_pmml.xml** file can be transferred to other analytics applications. For instance you can use the PMML file as an input to the jpmml-evaluator API.
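As a quick sanity check of the generated file, a sketch using only the Python standard library (assuming the output file name shown above) counts the `TreeModel` tags:
```
import xml.etree.ElementTree as ET

# Count TreeModel elements in the generated PMML, ignoring XML namespaces.
root = ET.parse('LightGBM_pmml.xml').getroot()
trees = [el for el in root.iter() if el.tag.split('}')[-1] == 'TreeModel']
print('found', len(trees), 'TreeModel elements')
```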
Follow the steps below to run a model described by **LightGBM_pmml.xml**. 11 | 12 | ##### Steps to run jpmml-evaluator 13 | 1. First, clone the repository 14 | ``` 15 | git clone https://github.com/jpmml/jpmml-evaluator.git 16 | ``` 17 | 2. Build using Maven 18 | ``` 19 | mvn clean install 20 | ``` 21 | 3. Run the EvaluationExample class on the model file using the following command 22 | ``` 23 | java -cp example-1.3-SNAPSHOT.jar org.jpmml.evaluator.EvaluationExample --model LightGBM_pmml.xml --input input.csv --output output.csv 24 | ``` 25 | Note: in order to run the model on the input.csv file, the input.csv file must have the same number of columns as specified by the `DataDictionary` field in the PMML file. Also, the column headers inside the input.csv file must be the same as the column names specified by the `MiningSchema` field. Inside output.csv you will find all the columns inside the input.csv file plus a new column. In the new column you will find the scores calculated by processing each row's data on the model. More information about jpmml-evaluator can be found at its [GitHub repository](https://github.com/jpmml/jpmml-evaluator). -------------------------------------------------------------------------------- /R-package/man/lgb.plot.interpretation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.plot.interpretation.R 3 | \name{lgb.plot.interpretation} 4 | \alias{lgb.plot.interpretation} 5 | \title{Plot feature contribution as a bar graph} 6 | \usage{ 7 | lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1, 8 | left_margin = 10, cex = NULL) 9 | } 10 | \arguments{ 11 | \item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.} 12 | 13 | \item{top_n}{maximal number of top features to include into the plot.} 14 | 15 | \item{cols}{the number of columns in the layout; used only for multiclass classification feature contributions.} 16 | 17 | \item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.} 18 | 19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.} 20 | } 21 | \value{ 22 | The \code{lgb.plot.interpretation} function creates a \code{barplot}. 23 | } 24 | \description{ 25 | Plot previously calculated feature contribution as a bar graph. 26 | } 27 | \details{ 28 | The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. 29 | Features are shown ranked in a decreasing contribution order.
30 | } 31 | \examples{ 32 | \dontrun{ 33 | library(lightgbm) 34 | Sigmoid <- function(x) {1 / (1 + exp(-x))} 35 | Logit <- function(x) {log(x / (1 - x))} 36 | data(agaricus.train, package = "lightgbm") 37 | train <- agaricus.train 38 | dtrain <- lgb.Dataset(train$data, label = train$label) 39 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) 40 | data(agaricus.test, package = "lightgbm") 41 | test <- agaricus.test 42 | 43 | params = list(objective = "binary", 44 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 45 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 46 | model <- lgb.train(params, dtrain, 20) 47 | model <- lgb.train(params, dtrain, 20) 48 | 49 | tree_interpretation <- lgb.interprete(model, test$data, 1:5) 50 | lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10) 51 | } 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /docs/FAQ.md: -------------------------------------------------------------------------------- 1 | LightGBM FAQ 2 | ======================= 3 | 4 | ### Catalog 5 | 6 | - [Python-package](FAQ.md#python-package) 7 | 8 | ### Python-package 9 | 10 | - **Question 1**: I see error messages like this when installing from GitHub using `python setup.py install`. 11 | 12 | ``` 13 | error: Error: setup script specifies an absolute path: 14 | 15 | /Users/Microsoft/LightGBM/python-package/lightgbm/../../lib_lightgbm.so 16 | 17 | setup() arguments must *always* be /-separated paths relative to the 18 | setup.py directory, *never* absolute paths. 19 | ``` 20 | 21 | - **Solution 1**: this error should be fixed in the latest version. If you still encounter it, try removing the lightgbm.egg-info folder in python-package and reinstalling, or check [this thread on Stack Overflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path). 22 | 23 | - **Question 2**: I see error messages like `Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset`, but I have already constructed the dataset with code like `train = lightgbm.Dataset(X_train, y_train)`, or error messages like `Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.`. 24 | 25 | - **Solution 2**: Because LightGBM constructs bin mappers to build trees, and because train and valid Datasets within one Booster share the same bin mappers, categorical features, feature names etc., the Dataset objects are constructed when a Booster is constructed. And if you set free_raw_data=True (the default), the raw data (held in Python data structures) will be freed.
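A minimal sketch of the workaround (the toy data is only there to make the snippet self-contained):
```
import numpy as np
import lightgbm as lgb

# Toy data just for illustration.
X_train = np.random.rand(100, 10)
y_train = np.random.randint(0, 2, 100)

# free_raw_data=False keeps the raw data alive, so setting the reference
# or categorical features after construction still works.
train = lgb.Dataset(X_train, y_train, free_raw_data=False)
```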
So, if you want to: 26 | 27 | + get label (or weight/init_score/group) before constructing the dataset: it's the same as getting `self.label` 28 | + set label (or weight/init_score/group) before constructing the dataset: it's the same as `self.label=some_label_array` 29 | + get num_data (or num_feature) before constructing the dataset: you can get the data with `self.data`, then, if your data is a `numpy.ndarray`, use code like `self.data.shape` 30 | + set predictor (or reference/categorical feature) after constructing the dataset: you should set free_raw_data=False or init a Dataset object with the same raw data 31 | -------------------------------------------------------------------------------- /tests/python_package_test/test_basic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import os 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgb 8 | import numpy as np 9 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file 10 | from sklearn.model_selection import train_test_split 11 | 12 | 13 | class TestBasic(unittest.TestCase): 14 | 15 | def test(self): 16 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 17 | train_data = lgb.Dataset(X_train, max_bin=255, label=y_train) 18 | valid_data = train_data.create_valid(X_test, label=y_test) 19 | 20 | params = { 21 | "objective": "binary", 22 | "metric": "auc", 23 | "min_data": 10, 24 | "num_leaves": 15, 25 | "verbose": -1 26 | } 27 | bst = lgb.Booster(params, train_data) 28 | bst.add_valid(valid_data, "valid_1") 29 | 30 | for i in range(30): 31 | bst.update() 32 | if i % 10 == 0: 33 | print(bst.eval_train(), bst.eval_valid()) 34 | bst.save_model("model.txt") 35 | pred_from_matr = bst.predict(X_test) 36 | with tempfile.NamedTemporaryFile() as f: 37 | tname = f.name 38 | with open(tname, "w+b") as f: 39 | dump_svmlight_file(X_test, y_test, f) 40 | pred_from_file = bst.predict(tname) 41 | os.remove(tname) 42 | self.assertEqual(len(pred_from_matr), len(pred_from_file)) 43 | for preds in zip(pred_from_matr, pred_from_file): 44 | self.assertAlmostEqual(*preds, places=15) 45 | # check saved model persistence 46 | bst = lgb.Booster(params, model_file="model.txt") 47 | pred_from_model_file = bst.predict(X_test) 48 | self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) 49 | for preds in zip(pred_from_matr, pred_from_model_file): 50 | self.assertEqual(*preds) 51 | # check pmml 52 | os.system('python ../../pmml/pmml.py model.txt') 53 | 54 | 55 | print("----------------------------------------------------------------------") 56 | print("running test_basic.py") 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /include/LightGBM/utils/pipeline_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ 2 | #define LIGHTGBM_UTILS_PIPELINE_READER_H_ 3 | 4 | #include <LightGBM/utils/log.h> 5 | 6 | #include <cstdio> 7 | 8 | #include <vector> 9 | #include <functional> 10 | #include <thread> 11 | #include <utility> 12 | 13 | namespace LightGBM { 14 | 15 | /*! 16 | * \brief A pipeline file reader that uses 2 threads: one reads blocks from the file, the other processes them 17 | */ 18 | class PipelineReader { 19 | public: 20 | /*!
21 | * \brief Read data from a file using the pipeline method 22 | * \param filename Filename of data 23 | * \param skip_bytes Number of bytes to skip at the beginning of the file \param process_fun Process function 24 | */ 25 | static size_t Read(const char* filename, int skip_bytes, const std::function<size_t(const char*, size_t)>& process_fun) { 26 | FILE* file; 27 | 28 | #ifdef _MSC_VER 29 | fopen_s(&file, filename, "rb"); 30 | #else 31 | file = fopen(filename, "rb"); 32 | #endif 33 | if (file == NULL) { 34 | return 0; 35 | } 36 | size_t cnt = 0; 37 | const size_t buffer_size = 16 * 1024 * 1024; 38 | // buffer used for the process_fun 39 | auto buffer_process = std::vector<char>(buffer_size); 40 | // buffer used for the file reading 41 | auto buffer_read = std::vector<char>(buffer_size); 42 | size_t read_cnt = 0; 43 | if (skip_bytes > 0) { 44 | // skip first k bytes 45 | read_cnt = fread(buffer_process.data(), 1, skip_bytes, file); 46 | } 47 | // read first block 48 | read_cnt = fread(buffer_process.data(), 1, buffer_size, file); 49 | size_t last_read_cnt = 0; 50 | while (read_cnt > 0) { 51 | // start read thread 52 | std::thread read_worker = std::thread( 53 | [file, &buffer_read, buffer_size, &last_read_cnt] { 54 | last_read_cnt = fread(buffer_read.data(), 1, buffer_size, file); 55 | } 56 | ); 57 | // start process 58 | cnt += process_fun(buffer_process.data(), read_cnt); 59 | // wait for read thread 60 | read_worker.join(); 61 | // exchange the buffer 62 | std::swap(buffer_process, buffer_read); 63 | read_cnt = last_read_cnt; 64 | } 65 | // close file 66 | fclose(file); 67 | return cnt; 68 | } 69 | 70 | }; 71 | 72 | } // namespace LightGBM 73 | 74 | #endif // LightGBM_UTILS_PIPELINE_READER_H_ 75 | -------------------------------------------------------------------------------- /R-package/man/saveRDS.lgb.Booster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/saveRDS.lgb.Booster.R 3 | \name{saveRDS.lgb.Booster} 4 | \alias{saveRDS.lgb.Booster} 5 | \title{saveRDS for lgb.Booster models} 6 | \usage{ 7 | saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL, 8 | compress = TRUE, refhook = NULL, raw = TRUE) 9 | } 10 | \arguments{ 11 | \item{object}{R object to serialize.} 12 | 13 | \item{file}{a connection or the name of the file where the R object is saved to or read from.} 14 | 15 | \item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.} 16 | 17 | \item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.} 18 | 19 | \item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.} 20 | 21 | \item{refhook}{a hook function for handling reference objects.} 22 | 23 | \item{raw}{whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.} 24 | } 25 | \value{ 26 | NULL invisibly. 27 | } 28 | \description{ 29 | Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
30 | }
31 | \examples{
32 | \dontrun{
33 | library(lightgbm)
34 | data(agaricus.train, package = "lightgbm")
35 | train <- agaricus.train
36 | dtrain <- lgb.Dataset(train$data, label = train$label)
37 | data(agaricus.test, package = "lightgbm")
38 | test <- agaricus.test
39 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
40 | params <- list(objective = "regression", metric = "l2")
41 | valids <- list(test = dtest)
42 | model <- lgb.train(params,
43 |                    dtrain,
44 |                    100,
45 |                    valids,
46 |                    min_data = 1,
47 |                    learning_rate = 1,
48 |                    early_stopping_rounds = 10)
49 | saveRDS.lgb.Booster(model, "model.rds")
50 | }
51 |
52 | }
53 |
54 |
-------------------------------------------------------------------------------- /include/LightGBM/objective_function.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_
2 | #define LIGHTGBM_OBJECTIVE_FUNCTION_H_
3 |
4 | #include <LightGBM/meta.h>
5 | #include <LightGBM/config.h>
6 | #include <LightGBM/dataset.h>
7 | #include <string>
8 |
9 | namespace LightGBM {
10 | /*!
11 | * \brief The interface of Objective Function.
12 | */
13 | class ObjectiveFunction {
14 | public:
15 |   /*! \brief virtual destructor */
16 |   virtual ~ObjectiveFunction() {}
17 |
18 |   /*!
19 |   * \brief Initialize
20 |   * \param metadata Label data
21 |   * \param num_data Number of data
22 |   */
23 |   virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
24 |
25 |   /*!
26 |   * \brief Calculate first- and second-order derivatives of the loss function
27 |   * \param score prediction score in this round
28 |   * \param gradients Output gradients
29 |   * \param hessians Output hessians
30 |   */
31 |   virtual void GetGradients(const double* score,
32 |     score_t* gradients, score_t* hessians) const = 0;
33 |
34 |   virtual const char* GetName() const = 0;
35 |
36 |   virtual bool IsConstantHessian() const { return false; }
37 |
38 |   virtual bool BoostFromAverage() const { return false; }
39 |
40 |   virtual bool SkipEmptyClass() const { return false; }
41 |
42 |   virtual int NumTreePerIteration() const { return 1; }
43 |
44 |   virtual int NumPredictOneRow() const { return 1; }
45 |
46 |   virtual void ConvertOutput(const double* input, double* output) const {
47 |     output[0] = input[0];
48 |   }
49 |
50 |   virtual std::string ToString() const = 0;
51 |
52 |   ObjectiveFunction() = default;
53 |   /*! \brief Disable copy */
54 |   ObjectiveFunction& operator=(const ObjectiveFunction&) = delete;
55 |   /*! \brief Disable copy */
56 |   ObjectiveFunction(const ObjectiveFunction&) = delete;
57 |
58 |   /*!
59 |   * \brief Create object of objective function
60 |   * \param type Specific type of objective function
61 |   * \param config Config for objective function
62 |   */
63 |   LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
64 |     const ObjectiveConfig& config);
65 |
66 |   /*!
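  * Usage sketch (illustrative): per the implementation in
  * src/objective/objective_function.cpp, the first space-separated token of
  * the model string is the objective name, so for example:
  *
  *   auto* obj = ObjectiveFunction::CreateObjectiveFunction(std::string("binary"));
  *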
67 |   * \brief Load objective function from string object
68 |   */
69 |   LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str);
70 | };
71 |
72 | } // namespace LightGBM
73 |
74 | #endif // LIGHTGBM_OBJECTIVE_FUNCTION_H_
75 |
-------------------------------------------------------------------------------- /src/boosting/boosting.cpp: --------------------------------------------------------------------------------
1 | #include <LightGBM/boosting.h>
2 | #include "gbdt.h"
3 | #include "dart.hpp"
4 | #include "goss.hpp"
5 |
6 | namespace LightGBM {
7 |
8 | std::string GetBoostingTypeFromModelFile(const char* filename) {
9 |   TextReader<size_t> model_reader(filename, true);
10 |   std::string type = model_reader.first_line();
11 |   return type;
12 | }
13 |
14 | bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
15 |   if (boosting != nullptr) {
16 |     TextReader<size_t> model_reader(filename, true);
17 |     model_reader.ReadAllLines();
18 |     std::stringstream str_buf;
19 |     for (auto& line : model_reader.Lines()) {
20 |       str_buf << line << '\n';
21 |     }
22 |     if (!boosting->LoadModelFromString(str_buf.str()))
23 |       return false;
24 |   }
25 |
26 |   return true;
27 | }
28 |
29 | Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) {
30 |   if (filename == nullptr || filename[0] == '\0') {
31 |     if (type == std::string("gbdt")) {
32 |       return new GBDT();
33 |     } else if (type == std::string("dart")) {
34 |       return new DART();
35 |     } else if (type == std::string("goss")) {
36 |       return new GOSS();
37 |     } else {
38 |       return nullptr;
39 |     }
40 |   } else {
41 |     std::unique_ptr<Boosting> ret;
42 |     auto type_in_file = GetBoostingTypeFromModelFile(filename);
43 |     if (type_in_file == std::string("tree")) {
44 |       if (type == std::string("gbdt")) {
45 |         ret.reset(new GBDT());
46 |       } else if (type == std::string("dart")) {
47 |         ret.reset(new DART());
48 |       } else if (type == std::string("goss")) {
49 |         ret.reset(new GOSS());
50 |       } else {
51 |         Log::Fatal("unknown boosting type %s", type.c_str());
52 |       }
53 |       LoadFileToBoosting(ret.get(), filename);
54 |     } else {
55 |       Log::Fatal("unknown submodel type in model file %s", filename);
56 |     }
57 |     return ret.release();
58 |   }
59 | }
60 |
61 | Boosting* Boosting::CreateBoosting(const char* filename) {
62 |   auto type = GetBoostingTypeFromModelFile(filename);
63 |   std::unique_ptr<Boosting> ret;
64 |   if (type == std::string("tree")) {
65 |     ret.reset(new GBDT());
66 |   } else {
67 |     Log::Fatal("unknown submodel type in model file %s", filename);
68 |   }
69 |   LoadFileToBoosting(ret.get(), filename);
70 |   return ret.release();
71 | }
72 |
73 | } // namespace LightGBM
74 |
-------------------------------------------------------------------------------- /docs/development.md: --------------------------------------------------------------------------------
1 | Development Guide
2 | ==================
3 |
4 | Algorithms
5 | ----------
6 |
7 | Refer to [Features](https://github.com/Microsoft/LightGBM/wiki/Features) for the important algorithms used in LightGBM.
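One of those core pieces, histogram-based split finding, is sketched below for orientation. This is a simplified stand-in written for this guide (the `BinEntry`, `BuildHistogram` and `BestSplitBin` names are ours, and regularization terms are omitted), not the library's actual code:

```cpp
#include <vector>
#include <cstddef>

// Per-bin accumulated statistics for one feature.
struct BinEntry { double sum_gradients = 0.0; double sum_hessians = 0.0; };

// Accumulate gradient/hessian statistics into the bins of one feature.
std::vector<BinEntry> BuildHistogram(const std::vector<int>& bin_of_row,
                                     const std::vector<double>& gradients,
                                     const std::vector<double>& hessians,
                                     int num_bins) {
  std::vector<BinEntry> hist(num_bins);
  for (size_t i = 0; i < bin_of_row.size(); ++i) {
    hist[bin_of_row[i]].sum_gradients += gradients[i];
    hist[bin_of_row[i]].sum_hessians += hessians[i];
  }
  return hist;
}

// Scan bins left to right; the best split maximizes the standard gain
// formula GL^2/HL + GR^2/HR - G^2/H (regularization omitted).
int BestSplitBin(const std::vector<BinEntry>& hist) {
  double total_g = 0.0, total_h = 0.0;
  for (const auto& b : hist) { total_g += b.sum_gradients; total_h += b.sum_hessians; }
  double left_g = 0.0, left_h = 0.0, best_gain = 0.0;
  int best_bin = -1;
  for (int b = 0; b + 1 < static_cast<int>(hist.size()); ++b) {
    left_g += hist[b].sum_gradients;
    left_h += hist[b].sum_hessians;
    double right_g = total_g - left_g, right_h = total_h - left_h;
    if (left_h <= 0.0 || right_h <= 0.0) continue;
    double gain = left_g * left_g / left_h + right_g * right_g / right_h
                - total_g * total_g / total_h;
    if (gain > best_gain) { best_gain = gain; best_bin = b; }
  }
  return best_bin;  // -1 means no useful split was found
}
```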
8 |
9 | Classes And Code Structure
10 | --------------------------
11 |
12 | ### Important Classes
13 |
14 | | Class | Description |
15 | | ----- | --------- |
16 | | `Application` | The entry point of the application, including training and prediction logic |
17 | | `Bin` | Data structure used to store discretized feature values (converted from float values) |
18 | | `Boosting` | Boosting interface; current implementations are GBDT and DART |
19 | | `Config` | Stores parameters and configurations |
20 | | `Dataset` | Stores information of the dataset |
21 | | `DatasetLoader` | Used to construct the dataset |
22 | | `Feature` | Stores one feature column |
23 | | `Metric` | Evaluation metrics |
24 | | `Network` | Network interfaces and communication algorithms |
25 | | `ObjectiveFunction` | Objective functions used for training |
26 | | `Tree` | Stores information of a tree model |
27 | | `TreeLearner` | Used to learn trees |
28 |
29 | ### Code Structure
30 |
31 | | Path | Description |
32 | | ----- | --------- |
33 | | ./include | header files |
34 | | ./include/utils | some common functions |
35 | | ./src/application | Implementations of training and prediction logic |
36 | | ./src/boosting | Implementations of Boosting |
37 | | ./src/io | Implementations of IO-related classes, including `Bin`, `Config`, `Dataset`, `DatasetLoader`, `Feature` and `Tree` |
38 | | ./src/metric | Implementations of metrics |
39 | | ./src/network | Implementations of network functions |
40 | | ./src/objective | Implementations of objective functions |
41 | | ./src/treelearner | Implementations of tree learners |
42 |
43 | ### API Documents
44 |
45 | LightGBM supports using [doxygen](http://www.stack.nl/~dimitri/doxygen/) to generate documentation for classes and functions.
46 |
47 | C API
48 | -----
49 | Refer to the comments in [c_api.h](https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/c_api.h).
50 |
51 | High-level Language Package
52 | ---------------------------
53 |
54 | Follow the implementation of [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package/lightgbm).
55 |
56 | Ask Questions
57 | -------------
58 | Feel free to open [issues](https://github.com/Microsoft/LightGBM/issues) if you run into problems.
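As a rough illustration of how the classes above cooperate during training, here is a pseudo-C++ sketch. The factory and training signatures follow the headers reproduced in this repository; `io_config`, `tree_config`, `objective_config`, `num_iterations`, the accessors `data->metadata()`/`data->num_data()`, and the literal values `"serial"`/`"cpu"`/`"binary"` are assumed surrounding context, and the real control flow lives in `Application` and the `Boosting` implementations:

```cpp
// Illustrative wiring of DatasetLoader / ObjectiveFunction / TreeLearner.
// A sketch under the assumptions above; error handling and score
// initialization are omitted.
PredictFunction no_predict;  // no initial model to boost from
DatasetLoader loader(io_config, no_predict, /*num_class=*/1, "train.txt");
std::unique_ptr<Dataset> data(loader.LoadFromFile("train.txt"));

std::unique_ptr<ObjectiveFunction> obj(
    ObjectiveFunction::CreateObjectiveFunction("binary", objective_config));
obj->Init(data->metadata(), data->num_data());

std::unique_ptr<TreeLearner> learner(
    TreeLearner::CreateTreeLearner("serial", "cpu", &tree_config));
learner->Init(data.get(), obj->IsConstantHessian());

std::vector<double> scores(data->num_data(), 0.0);
std::vector<score_t> gradients(data->num_data()), hessians(data->num_data());
for (int iter = 0; iter < num_iterations; ++iter) {
  // The objective turns current scores into first/second order derivatives,
  obj->GetGradients(scores.data(), gradients.data(), hessians.data());
  // the tree learner fits one tree to those derivatives,
  std::unique_ptr<Tree> tree(
      learner->Train(gradients.data(), hessians.data(), obj->IsConstantHessian()));
  // and the new tree's predictions are folded back into the scores.
  learner->AddPredictionToScore(tree.get(), scores.data());
}
```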
59 |
60 |
61 |
62 |
-------------------------------------------------------------------------------- /R-package/tests/testthat/test_dataset.R: --------------------------------------------------------------------------------
1 | require(lightgbm)
2 | require(Matrix)
3 |
4 | context("testing lgb.Dataset functionality")
5 |
6 | data(agaricus.test, package='lightgbm')
7 | test_data <- agaricus.test$data[1:100,]
8 | test_label <- agaricus.test$label[1:100]
9 |
10 | test_that("lgb.Dataset: basic construction, saving, loading", {
11 |   # from sparse matrix
12 |   dtest1 <- lgb.Dataset(test_data, label=test_label)
13 |   # from dense matrix
14 |   dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label)
15 |   expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
16 |
17 |   # save to a local file
18 |   tmp_file <- tempfile('lgb.Dataset_')
19 |   lgb.Dataset.save(dtest1, tmp_file)
20 |   # read from a local file
21 |   dtest3 <- lgb.Dataset(tmp_file)
22 |   lgb.Dataset.construct(dtest3)
23 |   unlink(tmp_file)
24 |   expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
25 | })
26 |
27 | test_that("lgb.Dataset: getinfo & setinfo", {
28 |   dtest <- lgb.Dataset(test_data)
29 |   setinfo(dtest, 'label', test_label)
30 |   labels <- getinfo(dtest, 'label')
31 |   expect_equal(test_label, getinfo(dtest, 'label'))
32 |
33 |   expect_true(length(getinfo(dtest, 'weight')) == 0)
34 |   expect_true(length(getinfo(dtest, 'init_score')) == 0)
35 |
36 |   # any other label should error
37 |   expect_error(setinfo(dtest, 'asdf', test_label))
38 | })
39 |
40 | test_that("lgb.Dataset: slice, dim", {
41 |   dtest <- lgb.Dataset(test_data, label=test_label)
42 |   lgb.Dataset.construct(dtest)
43 |   expect_equal(dim(dtest), dim(test_data))
44 |   dsub1 <- slice(dtest, 1:42)
45 |   lgb.Dataset.construct(dsub1)
46 |   expect_equal(nrow(dsub1), 42)
47 |   expect_equal(ncol(dsub1), ncol(test_data))
48 | })
49 |
50 | test_that("lgb.Dataset: colnames", {
51 |   dtest <- lgb.Dataset(test_data, label=test_label)
52 |   expect_equal(colnames(dtest), colnames(test_data))
53 |   lgb.Dataset.construct(dtest)
54 |   expect_equal(colnames(dtest), colnames(test_data))
55 |   expect_error( colnames(dtest) <- 'asdf')
56 |   new_names <- make.names(1:ncol(test_data))
57 |   expect_silent(colnames(dtest) <- new_names)
58 |   expect_equal(colnames(dtest), new_names)
59 | })
60 |
61 | test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
62 |   nr <- 1000
63 |   x <- rsparsematrix(nr, 100, density=0.0005)
64 |   # we want it very sparse, so that last rows are empty
65 |   expect_lt(max(x@i), nr)
66 |   dtest <- lgb.Dataset(x)
67 |   expect_equal(dim(dtest), dim(x))
68 | })
69 |
-------------------------------------------------------------------------------- /R-package/R/lgb.importance.R: --------------------------------------------------------------------------------
1 | #' Compute feature importance in a model
2 | #'
3 | #' Creates a \code{data.table} of feature importances in a model.
4 | #'
5 | #' @param model object of class \code{lgb.Booster}.
6 | #' @param percentage whether to show importance in relative percentage.
7 | #'
8 | #' @return
9 | #'
10 | #' For a tree model, a \code{data.table} with the following columns:
11 | #' \itemize{
12 | #'   \item \code{Feature} Feature names in the model.
13 | #'   \item \code{Gain} The total gain of this feature's splits.
14 | #'   \item \code{Cover} The number of observations related to this feature.
15 | #'   \item \code{Frequency} The number of times a feature is split on in trees.
16 | #' } 17 | #' 18 | #' @examples 19 | #' \dontrun{ 20 | #' library(lightgbm) 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' model <- lgb.train(params, dtrain, 20) 30 | #' 31 | #' tree_imp1 <- lgb.importance(model, percentage = TRUE) 32 | #' tree_imp2 <- lgb.importance(model, percentage = FALSE) 33 | #' } 34 | #' 35 | #' @importFrom magrittr %>% %T>% 36 | #' @importFrom data.table := 37 | #' @export 38 | lgb.importance <- function(model, percentage = TRUE) { 39 | 40 | # Check if model is a lightgbm model 41 | if (!any(class(model) == "lgb.Booster")) { 42 | stop("'model' has to be an object of class lgb.Booster") 43 | } 44 | 45 | # Setup importance 46 | tree_dt <- lgb.model.dt.tree(model) 47 | 48 | # Extract elements 49 | tree_imp <- tree_dt %>% 50 | magrittr::extract(., 51 | i = is.na(split_index) == FALSE, 52 | j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N), 53 | by = "split_feature") %T>% 54 | data.table::setnames(., old = "split_feature", new = "Feature") %>% 55 | magrittr::extract(., i = order(Gain, decreasing = TRUE)) 56 | 57 | # Check if relative values are requested 58 | if (percentage) { 59 | tree_imp[, ":="(Gain = Gain / sum(Gain), 60 | Cover = Cover / sum(Cover), 61 | Frequency = Frequency / sum(Frequency))] 62 | } 63 | 64 | # Return importance table 65 | return(tree_imp) 66 | 67 | } 68 | -------------------------------------------------------------------------------- /python-package/lightgbm/compat.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = C0103 3 | """Compatibility""" 4 | from __future__ import absolute_import 5 | 6 | import inspect 7 | import sys 8 | 9 | import numpy as np 10 | 11 | is_py3 = (sys.version_info[0] == 3) 12 | 13 | """compatibility between python2 and python3""" 14 | if is_py3: 15 | string_type = str 16 | numeric_types = (int, float, bool) 17 | integer_types = (int, ) 18 | range_ = range 19 | 20 | def argc_(func): 21 | """return number of arguments of a function""" 22 | return len(inspect.signature(func).parameters) 23 | else: 24 | string_type = basestring 25 | numeric_types = (int, long, float, bool) 26 | integer_types = (int, long) 27 | range_ = xrange 28 | 29 | def argc_(func): 30 | """return number of arguments of a function""" 31 | return len(inspect.getargspec(func).args) 32 | 33 | """json""" 34 | try: 35 | import simplejson as json 36 | except (ImportError, SyntaxError): 37 | # simplejson does not support Python 3.2, it throws a SyntaxError 38 | # because of u'...' Unicode literals. 
39 |     import json
40 |
41 |
42 | def json_default_with_numpy(obj):
43 |     if isinstance(obj, (np.integer, np.floating, np.bool_)):
44 |         return obj.item()
45 |     elif isinstance(obj, np.ndarray):
46 |         return obj.tolist()
47 |     else:
48 |         return obj
49 |
50 |
51 | """pandas"""
52 | try:
53 |     from pandas import Series, DataFrame
54 | except ImportError:
55 |     class Series(object):
56 |         pass
57 |
58 |     class DataFrame(object):
59 |         pass
60 |
61 | """sklearn"""
62 | try:
63 |     from sklearn.base import BaseEstimator
64 |     from sklearn.base import RegressorMixin, ClassifierMixin
65 |     from sklearn.preprocessing import LabelEncoder
66 |     from sklearn.utils import deprecated
67 |     try:
68 |         from sklearn.model_selection import StratifiedKFold
69 |     except ImportError:
70 |         from sklearn.cross_validation import StratifiedKFold
71 |     SKLEARN_INSTALLED = True
72 |     LGBMModelBase = BaseEstimator
73 |     LGBMRegressorBase = RegressorMixin
74 |     LGBMClassifierBase = ClassifierMixin
75 |     LGBMLabelEncoder = LabelEncoder
76 |     LGBMDeprecated = deprecated
77 |     LGBMStratifiedKFold = StratifiedKFold
78 | except ImportError:
79 |     SKLEARN_INSTALLED = False
80 |     LGBMModelBase = object
81 |     LGBMClassifierBase = object
82 |     LGBMRegressorBase = object
83 |     LGBMLabelEncoder = None
84 |     LGBMStratifiedKFold = None
85 |
-------------------------------------------------------------------------------- /R-package/man/predict.lgb.Booster.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{predict.lgb.Booster}
4 | \alias{predict.lgb.Booster}
5 | \title{Predict method for LightGBM model}
6 | \usage{
7 | \method{predict}{lgb.Booster}(object, data, num_iteration = NULL,
8 |   rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE)
9 | }
10 | \arguments{
11 | \item{object}{Object of class \code{lgb.Booster}}
12 |
13 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
14 |
15 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
16 |
17 | \item{rawscore}{whether the prediction should be returned in the form of the original untransformed
18 | sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
19 | logistic regression would result in predictions for log-odds instead of probabilities.}
20 |
21 | \item{predleaf}{whether to predict leaf indices instead.}
22 |
23 | \item{header}{only used for prediction from a text file. TRUE if the text file has a header}
24 |
25 | \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
26 | prediction outputs per case.}
27 | }
28 | \value{
29 | For regression or binary classification, it returns a vector of length \code{nrows(data)}.
30 | For multiclass classification, either a \code{num_class * nrows(data)} vector or
31 | a \code{(nrows(data), num_class)} dimension matrix is returned, depending on
32 | the \code{reshape} value.
33 |
34 | When \code{predleaf = TRUE}, the output is a matrix object with the
35 | number of columns corresponding to the number of trees.
36 | } 37 | \description{ 38 | Predicted values based on class \code{lgb.Booster} 39 | } 40 | \examples{ 41 | \dontrun{ 42 | library(lightgbm) 43 | data(agaricus.train, package = "lightgbm") 44 | train <- agaricus.train 45 | dtrain <- lgb.Dataset(train$data, label = train$label) 46 | data(agaricus.test, package = "lightgbm") 47 | test <- agaricus.test 48 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 49 | params <- list(objective = "regression", metric = "l2") 50 | valids <- list(test = dtest) 51 | model <- lgb.train(params, 52 | dtrain, 53 | 100, 54 | valids, 55 | min_data = 1, 56 | learning_rate = 1, 57 | early_stopping_rounds = 10) 58 | preds <- predict(model, test$data) 59 | } 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/objective/objective_function.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "regression_objective.hpp" 3 | #include "binary_objective.hpp" 4 | #include "rank_objective.hpp" 5 | #include "multiclass_objective.hpp" 6 | 7 | namespace LightGBM { 8 | 9 | ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const ObjectiveConfig& config) { 10 | if (type == std::string("regression") || type == std::string("regression_l2") 11 | || type == std::string("mean_squared_error") || type == std::string("mse")) { 12 | return new RegressionL2loss(config); 13 | } else if (type == std::string("regression_l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) { 14 | return new RegressionL1loss(config); 15 | } else if (type == std::string("huber")) { 16 | return new RegressionHuberLoss(config); 17 | } else if (type == std::string("fair")) { 18 | return new RegressionFairLoss(config); 19 | } else if (type == std::string("poisson")) { 20 | return new RegressionPoissonLoss(config); 21 | } else if (type == std::string("binary")) { 22 | return new BinaryLogloss(config); 23 | } else if (type == std::string("lambdarank")) { 24 | return new LambdarankNDCG(config); 25 | } else if (type == std::string("multiclass")) { 26 | return new MulticlassSoftmax(config); 27 | } else if (type == std::string("multiclassova")) { 28 | return new MulticlassOVA(config); 29 | } 30 | return nullptr; 31 | } 32 | 33 | ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& str) { 34 | auto strs = Common::Split(str.c_str(), " "); 35 | auto type = strs[0]; 36 | if (type == std::string("regression")) { 37 | return new RegressionL2loss(strs); 38 | } else if (type == std::string("regression_l1")) { 39 | return new RegressionL1loss(strs); 40 | } else if (type == std::string("huber")) { 41 | return new RegressionHuberLoss(strs); 42 | } else if (type == std::string("fair")) { 43 | return new RegressionFairLoss(strs); 44 | } else if (type == std::string("poisson")) { 45 | return new RegressionPoissonLoss(strs); 46 | } else if (type == std::string("binary")) { 47 | return new BinaryLogloss(strs); 48 | } else if (type == std::string("lambdarank")) { 49 | return new LambdarankNDCG(strs); 50 | } else if (type == std::string("multiclass")) { 51 | return new MulticlassSoftmax(strs); 52 | } else if (type == std::string("multiclassova")) { 53 | return new MulticlassOVA(strs); 54 | } 55 | return nullptr; 56 | } 57 | 58 | } // namespace LightGBM 59 | -------------------------------------------------------------------------------- /R-package/demo/multiclass.R: 
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 |
3 | # We load the default iris dataset shipped with R
4 | data(iris)
5 |
6 | # We must convert factors to numeric
7 | # They must start from 0 to use multiclass
8 | # For instance: 0, 1, 2, 3, 4, 5...
9 | iris$Species <- as.numeric(as.factor(iris$Species)) - 1
10 |
11 | # We cut the data set into 80% train and 20% validation
12 | # The 10 last samples of each class are for validation
13 |
14 | train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
15 | test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
16 | dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
17 | dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
18 | valids <- list(test = dtest)
19 |
20 | # Method 1 of training
21 | params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
22 | model <- lgb.train(params,
23 |                    dtrain,
24 |                    100,
25 |                    valids,
26 |                    min_data = 1,
27 |                    learning_rate = 1,
28 |                    early_stopping_rounds = 10)
29 |
30 | # We can predict on test data, outputs a 90-length vector
31 | # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
32 | my_preds <- predict(model, test[, 1:4])
33 |
34 | # Method 2 of training, identical
35 | model <- lgb.train(list(),
36 |                    dtrain,
37 |                    100,
38 |                    valids,
39 |                    min_data = 1,
40 |                    learning_rate = 1,
41 |                    early_stopping_rounds = 10,
42 |                    objective = "multiclass",
43 |                    metric = "multi_error",
44 |                    num_class = 3)
45 |
46 | # We can predict on test data, identical
47 | my_preds <- predict(model, test[, 1:4])
48 |
49 | # A (30x3) matrix with the predictions, use parameter reshape
50 | # class1 class2 class3
51 | #   obs1   obs1   obs1
52 | #   obs2   obs2   obs2
53 | #   ....   ....   ....
54 | my_preds <- predict(model, test[, 1:4], reshape = TRUE)
55 |
56 | # We can also get the predicted scores before the Sigmoid/Softmax application
57 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE)
58 |
59 | # Raw score predictions as matrix instead of vector
60 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
61 |
62 | # We can also get the leaf index
63 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
64 |
65 | # Predict leaf index as matrix instead of vector
66 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
67 |
-------------------------------------------------------------------------------- /R-package/R/lgb.unloader.R: --------------------------------------------------------------------------------
1 | #' LightGBM unloading error fix
2 | #'
3 | #' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
4 | #'
5 | #' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE}, which means automatically reload \code{LightGBM} once unloading is performed.
6 | #' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.
7 | #' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.
8 | #'
9 | #' @return NULL invisibly.
10 | #'
11 | #' @examples
12 | #' \dontrun{
13 | #' library(lightgbm)
14 | #' data(agaricus.train, package = "lightgbm")
15 | #' train <- agaricus.train
16 | #' dtrain <- lgb.Dataset(train$data, label = train$label)
17 | #' data(agaricus.test, package = "lightgbm")
18 | #' test <- agaricus.test
19 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
20 | #' params <- list(objective = "regression", metric = "l2")
21 | #' valids <- list(test = dtest)
22 | #' model <- lgb.train(params,
23 | #'                    dtrain,
24 | #'                    100,
25 | #'                    valids,
26 | #'                    min_data = 1,
27 | #'                    learning_rate = 1,
28 | #'                    early_stopping_rounds = 10)
29 | #' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
30 | #' rm(model, dtrain, dtest) # Not needed if wipe = TRUE
31 | #' gc() # Not needed if wipe = TRUE
32 | #'
33 | #' library(lightgbm)
34 | #' # Do whatever you want again with LightGBM without object clashing
35 | #' }
36 | #'
37 | #' @export
38 | lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
39 |
40 |   # Unload package
41 |   try(detach("package:lightgbm", unload = TRUE), silent = TRUE)
42 |
43 |   # Should we wipe variables? (lgb.Booster, lgb.Dataset)
44 |   if (wipe) {
45 |     rm(list = ls(envir = envir)[which(sapply(ls(.GlobalEnv), function(x) {"lgb.Booster" %in% class(get(x, envir = envir))}))], envir = envir)
46 |     rm(list = ls(envir = envir)[which(sapply(ls(.GlobalEnv), function(x) {"lgb.Dataset" %in% class(get(x, envir = envir))}))], envir = envir)
47 |     gc(verbose = FALSE)
48 |   }
49 |
50 |   # Load package back?
51 |   if (restore) {
52 |     library(lightgbm)
53 |   }
54 |
55 |   invisible()
56 |
57 | }
58 |
-------------------------------------------------------------------------------- /include/LightGBM/application.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_APPLICATION_H_
2 | #define LIGHTGBM_APPLICATION_H_
3 |
4 | #include <LightGBM/meta.h>
5 | #include <LightGBM/config.h>
6 |
7 | #include <vector>
8 | #include <memory>
9 |
10 | namespace LightGBM {
11 |
12 | class DatasetLoader;
13 | class Dataset;
14 | class Boosting;
15 | class ObjectiveFunction;
16 | class Metric;
17 |
18 | /*!
19 | * \brief The main entry point of LightGBM. This application has two tasks:
20 | *        Train and Predict.
21 | *        Train task will train a new model.
22 | *        Predict task will predict the scores of test data using an existing model,
23 | *        and save the scores to disk.
24 | */
25 | class Application {
26 | public:
27 |   Application(int argc, char** argv);
28 |
29 |   /*! \brief Destructor */
30 |   ~Application();
31 |
32 |   /*! \brief Call this function to run the application */
33 |   inline void Run();
34 |
35 | private:
36 |   /*!
37 |   * \brief Global sync by minimum; returns the minimal T across nodes
38 |   * \param local Local data
39 |   * \return Minimal value across nodes
40 |   */
41 |   template <typename T>
42 |   T GlobalSyncUpByMin(T& local);
43 |
44 |   /*! \brief Load parameters from command line and config file*/
45 |   void LoadParameters(int argc, char** argv);
46 |
47 |   /*! \brief Load data, including training data and validation data*/
48 |   void LoadData();
49 |
50 |   /*! \brief Initialization before training*/
51 |   void InitTrain();
52 |
53 |   /*! \brief Main Training logic */
54 |   void Train();
55 |
56 |   /*! \brief Initializations before prediction */
57 |   void InitPredict();
58 |
59 |   /*! \brief Main predicting logic */
60 |   void Predict();
61 |
62 |   /*! \brief All configs */
63 |   OverallConfig config_;
64 |   /*! \brief Training data */
65 |   std::unique_ptr<Dataset> train_data_;
66 |   /*!
\brief Validation data */
67 |   std::vector<std::unique_ptr<Dataset>> valid_datas_;
68 |   /*! \brief Metric for training data */
69 |   std::vector<std::unique_ptr<Metric>> train_metric_;
70 |   /*! \brief Metrics for validation data */
71 |   std::vector<std::vector<std::unique_ptr<Metric>>> valid_metrics_;
72 |   /*! \brief Boosting object */
73 |   std::unique_ptr<Boosting> boosting_;
74 |   /*! \brief Training objective function */
75 |   std::unique_ptr<ObjectiveFunction> objective_fun_;
76 | };
77 |
78 |
79 | inline void Application::Run() {
80 |   if (config_.task_type == TaskType::kPredict) {
81 |     InitPredict();
82 |     Predict();
83 |   } else {
84 |     InitTrain();
85 |     Train();
86 |   }
87 | }
88 |
89 | } // namespace LightGBM
90 |
91 | #endif // LIGHTGBM_APPLICATION_H_
92 |
-------------------------------------------------------------------------------- /R-package/README.md: --------------------------------------------------------------------------------
1 | LightGBM R Package
2 | ==================
3 |
4 | Installation
5 | ------------
6 |
7 | Windows users may need to run with administrator rights (either R or the command prompt, depending on the way you are installing this package). Rtools must be installed for Windows. Linux users might require the appropriate user write permissions for packages.
8 |
9 | You can use a command prompt to install via command line:
10 |
11 | ```
12 | cd R-package
13 | R CMD INSTALL --build .
14 | ```
15 |
16 | You can also install directly from R using the repository with `devtools`:
17 |
18 | ```r
19 | devtools::install_github("Microsoft/LightGBM", subdir = "R-package")
20 | ```
21 |
22 | For the `devtools` install scenario, you can safely ignore this message:
23 |
24 | ```r
25 | Warning message:
26 | GitHub repo contains submodules, may not function as expected!
27 | ```
28 |
29 | If you want to build the self-contained R package, you can run ```unix_build_package.sh``` (for UNIX) or ```win_build_package.cmd``` (for Windows). Then use ```R CMD INSTALL lightgbm_0.1.tar.gz``` to install.
30 |
31 | When your package installation is done, you can check quickly if your LightGBM R package is working by running the following:
32 |
33 | ```r
34 | library(lightgbm)
35 | data(agaricus.train, package='lightgbm')
36 | train <- agaricus.train
37 | dtrain <- lgb.Dataset(train$data, label=train$label)
38 | params <- list(objective="regression", metric="l2")
39 | model <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
40 | ```
41 | ### OSX installation
42 |
43 | The default installation cannot successfully complete on OSX because the default clang does not support OpenMP.
44 |
45 | You can use the following script to change the default compiler to gcc and then compile the LightGBM R package:
46 |
47 | ```bash
48 | brew install gcc --without-multilib
49 | mkdir -p ~/.R
50 | touch ~/.R/Makevars
51 | cat <<EOF >> ~/.R/Makevars
52 | CC=gcc-6
53 | CXX=g++-6
54 | CXX1X=g++-6
55 | LDFLAGS=-L/usr/local/Cellar/gcc/6.3.0/lib
56 | CPPFLAGS=-I/usr/local/Cellar/gcc/6.3.0/include
57 | SHLIB_OPENMP_CFLAGS = -fopenmp
58 | SHLIB_OPENMP_CXXFLAGS = -fopenmp
59 | SHLIB_OPENMP_FCFLAGS = -fopenmp
60 | SHLIB_OPENMP_FFLAGS = -fopenmp
61 | EOF
62 | ```
63 |
64 | Note: for `LDFLAGS=-L/usr/local/Cellar/gcc/6.3.0/lib` and `CPPFLAGS=-I/usr/local/Cellar/gcc/6.3.0/include`, you may need to change `6.3.0` to your gcc version.
65 |
66 | To check your LightGBM installation, the test is identical to the Linux/Windows versions (see the test provided just before the OSX installation section).
67 |
68 | Examples
69 | ------------
70 |
71 | * Please visit [demo](demo).
72 |
-------------------------------------------------------------------------------- /include/LightGBM/tree_learner.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_TREE_LEARNER_H_
2 | #define LIGHTGBM_TREE_LEARNER_H_
3 |
4 |
5 | #include <LightGBM/meta.h>
6 | #include <LightGBM/config.h>
7 |
8 | #include <string>
9 |
10 | namespace LightGBM {
11 |
12 | /*! \brief forward declaration */
13 | class Tree;
14 | class Dataset;
15 |
16 | /*!
17 | * \brief Interface for tree learner
18 | */
19 | class TreeLearner {
20 | public:
21 |   /*! \brief virtual destructor */
22 |   virtual ~TreeLearner() {}
23 |
24 |   /*!
25 |   * \brief Initialize tree learner with training dataset
26 |   * \param train_data The used training data
27 |   * \param is_constant_hessian True if all hessians share the same value
28 |   */
29 |   virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0;
30 |
31 |   virtual void ResetTrainingData(const Dataset* train_data) = 0;
32 |
33 |   /*!
34 |   * \brief Reset tree configs
35 |   * \param tree_config config of tree
36 |   */
37 |   virtual void ResetConfig(const TreeConfig* tree_config) = 0;
38 |
39 |   /*!
40 |   * \brief Train a tree model on the dataset
41 |   * \param gradients The first order derivatives
42 |   * \param hessians The second order derivatives
43 |   * \param is_constant_hessian True if all hessians share the same value
44 |   * \return A trained tree
45 |   */
46 |   virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian) = 0;
47 |
48 |   /*!
49 |   * \brief Use an existing tree to fit the new gradients and hessians.
50 |   */
51 |   virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0;
52 |
53 |   /*!
54 |   * \brief Set bagging data
55 |   * \param used_indices Used data indices
56 |   * \param num_data Number of used data
57 |   */
58 |   virtual void SetBaggingData(const data_size_t* used_indices,
59 |     data_size_t num_data) = 0;
60 |
61 |   /*!
62 |   * \brief Use the last trained tree to predict scores, then add them to out_score
63 |   * \param out_score output score
64 |   */
65 |   virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0;
66 |
67 |   TreeLearner() = default;
68 |   /*! \brief Disable copy */
69 |   TreeLearner& operator=(const TreeLearner&) = delete;
70 |   /*! \brief Disable copy */
71 |   TreeLearner(const TreeLearner&) = delete;
72 |
73 |   /*!
74 |   * \brief Create object of tree learner
75 |   * \param learner_type Type of tree learner
76 |   * \param device_type Type of device
77 |   * \param tree_config config of tree
78 |   */
79 |   static TreeLearner* CreateTreeLearner(const std::string& learner_type,
80 |     const std::string& device_type,
81 |     const TreeConfig* tree_config);
82 | };
83 |
84 | } // namespace LightGBM
85 |
86 | #endif // LIGHTGBM_TREE_LEARNER_H_
87 |
-------------------------------------------------------------------------------- /R-package/R/lgb.plot.importance.R: --------------------------------------------------------------------------------
1 | #' Plot feature importance as a bar graph
2 | #'
3 | #' Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.
4 | #'
5 | #' @param tree_imp a \code{data.table} returned by \code{\link{lgb.importance}}.
6 | #' @param top_n maximal number of top features to include in the plot.
7 | #' @param measure the name of importance measure to plot, can be "Gain", "Cover" or "Frequency".
8 | #' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names. 9 | #' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}. 10 | #' 11 | #' @details 12 | #' The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. 13 | #' Features are shown ranked in a decreasing importance order. 14 | #' 15 | #' @return 16 | #' The \code{lgb.plot.importance} function creates a \code{barplot} 17 | #' and silently returns a processed data.table with \code{top_n} features sorted by defined importance. 18 | #' 19 | #' @examples 20 | #' \dontrun{ 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' model <- lgb.train(params, dtrain, 20) 30 | #' 31 | #' tree_imp <- lgb.importance(model, percentage = TRUE) 32 | #' lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain") 33 | #' } 34 | #' 35 | #' @export 36 | lgb.plot.importance <- function(tree_imp, 37 | top_n = 10, 38 | measure = "Gain", 39 | left_margin = 10, 40 | cex = NULL) { 41 | 42 | # Check for measurement (column names) correctness 43 | measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE) 44 | 45 | # Get top N importance (defaults to 10) 46 | top_n <- min(top_n, nrow(tree_imp)) 47 | 48 | # Parse importance 49 | tree_imp <- tree_imp[order(abs(get(measure)), decreasing = TRUE),][1:top_n,] 50 | 51 | # Attempt to setup a correct cex 52 | if (is.null(cex)) { 53 | cex <- 2.5 / log2(1 + top_n) 54 | } 55 | 56 | # Refresh plot 57 | op <- par(no.readonly = TRUE) 58 | on.exit(par(op)) 59 | 60 | # Do some magic plotting 61 | par(mar = op$mar %>% magrittr::inset(., 2, left_margin)) 62 | 63 | # Do plot 64 | tree_imp[.N:1, 65 | barplot(height = get(measure), 66 | names.arg = Feature, 67 | horiz = TRUE, 68 | border = NA, 69 | main = "Feature Importance", 70 | xlab = measure, 71 | cex.names = cex, 72 | las = 1)] 73 | 74 | # Return invisibly 75 | invisible(tree_imp) 76 | 77 | } 78 | -------------------------------------------------------------------------------- /include/LightGBM/utils/log.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_LOG_H_ 2 | #define LIGHTGBM_UTILS_LOG_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace LightGBM { 13 | 14 | 15 | #ifndef CHECK 16 | #define CHECK(condition) \ 17 | if (!(condition)) Log::Fatal("Check failed: " #condition \ 18 | " at %s, line %d .\n", __FILE__, __LINE__); 19 | #endif 20 | 21 | #ifndef CHECK_NOTNULL 22 | #define CHECK_NOTNULL(pointer) \ 23 | if ((pointer) == nullptr) LightGBM::Log::Fatal(#pointer " Can't be NULL at %s, line %d .\n", __FILE__, __LINE__); 24 | #endif 25 | 26 | 27 | enum class LogLevel: int { 28 | Fatal = -1, 29 | Warning = 0, 30 | Info = 1, 31 | Debug = 2, 32 | }; 33 | 34 | 35 | /*! 36 | * \brief A static Log class 37 | */ 38 | class Log { 39 | public: 40 | /*! 41 | * \brief Resets the minimal log level. It is INFO by default. 42 | * \param level The new minimal log level. 
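 *
 * Usage sketch (illustrative; `pool_size` is a stand-in variable of our own):
 *
 *   Log::ResetLogLevel(LogLevel::Debug);  // also emit Debug-level messages
 *   Log::Debug("histogram pool size: %d", pool_size);
 *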
43 | */ 44 | static void ResetLogLevel(LogLevel level) { 45 | GetLevel() = level; 46 | } 47 | 48 | static void Debug(const char *format, ...) { 49 | va_list val; 50 | va_start(val, format); 51 | Write(LogLevel::Debug, "Debug", format, val); 52 | va_end(val); 53 | } 54 | static void Info(const char *format, ...) { 55 | va_list val; 56 | va_start(val, format); 57 | Write(LogLevel::Info, "Info", format, val); 58 | va_end(val); 59 | } 60 | static void Warning(const char *format, ...) { 61 | va_list val; 62 | va_start(val, format); 63 | Write(LogLevel::Warning, "Warning", format, val); 64 | va_end(val); 65 | } 66 | static void Fatal(const char *format, ...) { 67 | va_list val; 68 | char str_buf[1024]; 69 | va_start(val, format); 70 | #ifdef _MSC_VER 71 | vsprintf_s(str_buf, format, val); 72 | #else 73 | vsprintf(str_buf, format, val); 74 | #endif 75 | va_end(val); 76 | fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf); 77 | fflush(stderr); 78 | throw std::runtime_error(std::string(str_buf)); 79 | } 80 | 81 | private: 82 | 83 | static void Write(LogLevel level, const char* level_str, const char *format, va_list val) { 84 | if (level <= GetLevel()) { // omit the message with low level 85 | // write to STDOUT 86 | printf("[LightGBM] [%s] ", level_str); 87 | vprintf(format, val); 88 | printf("\n"); 89 | fflush(stdout); 90 | } 91 | } 92 | 93 | // a trick to use static variable in header file. 94 | // May be not good, but avoid to use an additional cpp file 95 | #if defined(_MSC_VER) 96 | static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; } 97 | #else 98 | static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; } 99 | #endif 100 | 101 | }; 102 | 103 | } // namespace LightGBM 104 | #endif // LightGBM_UTILS_LOG_H_ 105 | -------------------------------------------------------------------------------- /R-package/R/saveRDS.lgb.Booster.R: -------------------------------------------------------------------------------- 1 | #' saveRDS for lgb.Booster models 2 | #' 3 | #' Attemps to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 4 | #' 5 | #' @param object R object to serialize. 6 | #' @param file a connection or the name of the file where the R object is saved to or read from. 7 | #' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save. 8 | #' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions. 9 | #' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection. 10 | #' @param refhook a hook function for handling reference objects. 11 | #' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. 12 | #' 13 | #' @return NULL invisibly. 
14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' library(lightgbm) 18 | #' data(agaricus.train, package = "lightgbm") 19 | #' train <- agaricus.train 20 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 21 | #' data(agaricus.test, package = "lightgbm") 22 | #' test <- agaricus.test 23 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 24 | #' params <- list(objective = "regression", metric = "l2") 25 | #' valids <- list(test = dtest) 26 | #' model <- lgb.train(params, 27 | #' dtrain, 28 | #' 100, 29 | #' valids, 30 | #' min_data = 1, 31 | #' learning_rate = 1, 32 | #' early_stopping_rounds = 10) 33 | #' saveRDS.lgb.Booster(model, "model.rds") 34 | #' } 35 | #' 36 | #' @export 37 | saveRDS.lgb.Booster <- function(object, 38 | file = "", 39 | ascii = FALSE, 40 | version = NULL, 41 | compress = TRUE, 42 | refhook = NULL, 43 | raw = TRUE) { 44 | 45 | # Check if object has a raw value (and if the user wants to store the raw) 46 | if (is.na(object$raw) & (raw)) { 47 | 48 | # Save model 49 | object$save() 50 | 51 | # Save RDS 52 | saveRDS(object, 53 | file = file, 54 | ascii = ascii, 55 | version = version, 56 | compress = compress, 57 | refhook = refhook) 58 | 59 | # Free model from memory 60 | object$raw <- NA 61 | 62 | } else { 63 | 64 | # Save as usual 65 | saveRDS(object, 66 | file = file, 67 | ascii = ascii, 68 | version = version, 69 | compress = compress, 70 | refhook = refhook) 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: trusty 4 | 5 | before_install: 6 | - test -n $CC && unset CC 7 | - test -n $CXX && unset CXX 8 | - wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 9 | - chmod +x conda.sh 10 | - bash conda.sh -b -p $HOME/miniconda 11 | - export PATH="$HOME/miniconda/bin:$PATH" 12 | - conda config --set always_yes yes --set changeps1 no 13 | - conda update -q conda 14 | - sudo add-apt-repository ppa:george-edison55/cmake-3.x -y 15 | - sudo apt-get update -q 16 | - bash .travis/amd_sdk.sh; 17 | - tar -xjf AMD-SDK.tar.bz2; 18 | - AMDAPPSDK=${HOME}/AMDAPPSDK; 19 | - export OPENCL_VENDOR_PATH=${AMDAPPSDK}/etc/OpenCL/vendors; 20 | - mkdir -p ${OPENCL_VENDOR_PATH}; 21 | - sh AMD-APP-SDK*.sh --tar -xf -C ${AMDAPPSDK}; 22 | - echo libamdocl64.so > ${OPENCL_VENDOR_PATH}/amdocl64.icd; 23 | - export LD_LIBRARY_PATH=${AMDAPPSDK}/lib/x86_64:${LD_LIBRARY_PATH}; 24 | - chmod +x ${AMDAPPSDK}/bin/x86_64/clinfo; 25 | - ${AMDAPPSDK}/bin/x86_64/clinfo; 26 | - export LIBRARY_PATH="$HOME/miniconda/lib:$LIBRARY_PATH" 27 | - export LD_RUN_PATH="$HOME/miniconda/lib:$LD_RUN_PATH" 28 | - export CPLUS_INCLUDE_PATH="$HOME/miniconda/include:$AMDAPPSDK/include/:$CPLUS_INCLUDE_PATH" 29 | 30 | install: 31 | - sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential 32 | - sudo apt-get install -y cmake 33 | - conda install --yes atlas numpy scipy scikit-learn pandas matplotlib 34 | - conda install --yes -c conda-forge boost=1.63.0 35 | - pip install pep8 36 | 37 | script: 38 | - cd $TRAVIS_BUILD_DIR 39 | - mkdir build && cd build && cmake .. 
&& make -j
40 |   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
41 |   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
42 |   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
43 |   - cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute .
44 |   - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON .. && make -j
45 |   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
46 |   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
47 |   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
48 |   - cd $TRAVIS_BUILD_DIR
49 |   - rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ ..
50 |   - sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h
51 |   - make -j$(nproc)
52 |   - sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h
53 |   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
54 |   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
55 |   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
56 |
57 | notifications:
58 |   email: false
59 |
60 | matrix:
61 |   include:
62 |     - compiler: gcc
63 |     - compiler: clang
64 |
-------------------------------------------------------------------------------- /include/LightGBM/utils/random.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_UTILS_RANDOM_H_
2 | #define LIGHTGBM_UTILS_RANDOM_H_
3 |
4 | #include <cstdint>
5 |
6 | #include <random>
7 | #include <vector>
8 |
9 | namespace LightGBM {
10 |
11 | /*!
12 | * \brief A wrapper for random generator
13 | */
14 | class Random {
15 | public:
16 |   /*!
17 |   * \brief Constructor, with random seed
18 |   */
19 |   Random() {
20 |     std::random_device rd;
21 |     auto generator = std::mt19937(rd());
22 |     std::uniform_int_distribution<int> distribution(0, x);
23 |     x = distribution(generator);
24 |   }
25 |   /*!
26 |   * \brief Constructor, with specific seed
27 |   */
28 |   Random(int seed) {
29 |     x = seed;
30 |   }
31 |   /*!
32 |   * \brief Generate random integer; the underlying generator yields values in [0, 32768)
33 |   * \param lower_bound lower bound
34 |   * \param upper_bound upper bound
35 |   * \return The random integer between [lower_bound, upper_bound)
36 |   */
37 |   inline int NextShort(int lower_bound, int upper_bound) {
38 |     return (RandInt16()) % (upper_bound - lower_bound) + lower_bound;
39 |   }
40 |
41 |   /*!
42 |   * \brief Generate random integer, int32 range
43 |   * \param lower_bound lower bound
44 |   * \param upper_bound upper bound
45 |   * \return The random integer between [lower_bound, upper_bound)
46 |   */
47 |   inline int NextInt(int lower_bound, int upper_bound) {
48 |     return (RandInt32()) % (upper_bound - lower_bound) + lower_bound;
49 |   }
50 |
51 |   /*!
52 |   * \brief Generate random float data
53 |   * \return The random float between [0.0, 1.0)
54 |   */
55 |   inline float NextFloat() {
56 |     // get random float in [0,1)
57 |     return static_cast<float>(RandInt16()) / (32768.0f);
58 |   }
59 |   /*!
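  * Usage sketch (illustrative only):
  *
  *   Random rng(42);
  *   std::vector<int> indices = rng.Sample(100, 10);  // 10 ordered draws from {0,...,99}
  *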
60 |   * \brief Sample K data from {0,1,...,N-1}
61 |   * \param N Size of the range to sample from
62 |   * \param K Number of elements to sample
63 |   * \return K Ordered sampled data from {0,1,...,N-1}
64 |   */
65 |   inline std::vector<int> Sample(int N, int K) {
66 |     std::vector<int> ret;
67 |     ret.reserve(K);
68 |     if (K > N || K < 0) {
69 |       return ret;
70 |     } else if (K == N) {
71 |       for (int i = 0; i < N; ++i) {
72 |         ret.push_back(i);
73 |       }
74 |     } else if (K > N / 2) {
75 |       for (int i = 0; i < N; ++i) {
76 |         double prob = (K - ret.size()) / static_cast<double>(N - i);
77 |         if (NextFloat() < prob) {
78 |           ret.push_back(i);
79 |         }
80 |       }
81 |     } else {
82 |       int min_step = 1;
83 |       int avg_step = N / K;
84 |       int max_step = 2 * avg_step - min_step;
85 |       int start = -1;
86 |       for (int i = 0; i < K; ++i) {
87 |         int step = NextShort(min_step, max_step + 1);
88 |         start += step;
89 |         if (start >= N) { break; }
90 |         ret.push_back(start);
91 |       }
92 |     }
93 |     return ret;
94 |   }
95 | private:
96 |   inline int RandInt16() {
97 |     x = (214013 * x + 2531011);
98 |     return (x >> 16) & 0x7FFF;
99 |   }
100 |
101 |   inline int RandInt32() {
102 |     x = (214013 * x + 2531011);
103 |     return x & 0x7FFFFFFF;
104 |   }
105 |
106 |   int x = 123456789;
107 | };
108 |
109 |
110 | } // namespace LightGBM
111 |
112 | #endif // LIGHTGBM_UTILS_RANDOM_H_
113 |
-------------------------------------------------------------------------------- /src/io/parser.hpp: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_IO_PARSER_HPP_
2 | #define LIGHTGBM_IO_PARSER_HPP_
3 |
4 | #include <LightGBM/utils/common.h>
5 | #include <LightGBM/utils/log.h>
6 |
7 | #include <LightGBM/dataset.h>
8 |
9 | #include <cmath>
10 | #include <utility>
11 | #include <vector>
12 |
13 | namespace LightGBM {
14 |
15 | class CSVParser: public Parser {
16 | public:
17 |   explicit CSVParser(int label_idx)
18 |     :label_idx_(label_idx) {
19 |   }
20 |   inline void ParseOneLine(const char* str,
21 |     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
22 |     int idx = 0;
23 |     double val = 0.0f;
24 |     int bias = 0;
25 |     *out_label = 0.0f;
26 |     while (*str != '\0') {
27 |       str = Common::Atof(str, &val);
28 |       if (idx == label_idx_) {
29 |         *out_label = val;
30 |         bias = -1;
31 |       }
32 |       else if (fabs(val) > 1e-10) {
33 |         out_features->emplace_back(idx + bias, val);
34 |       }
35 |       ++idx;
36 |       if (*str == ',') {
37 |         ++str;
38 |       } else if (*str != '\0') {
39 |         Log::Fatal("Input format error when parsing as CSV");
40 |       }
41 |     }
42 |   }
43 | private:
44 |   int label_idx_ = 0;
45 | };
46 |
47 | class TSVParser: public Parser {
48 | public:
49 |   explicit TSVParser(int label_idx)
50 |     :label_idx_(label_idx) {
51 |   }
52 |   inline void ParseOneLine(const char* str,
53 |     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
54 |     int idx = 0;
55 |     double val = 0.0f;
56 |     int bias = 0;
57 |     while (*str != '\0') {
58 |       str = Common::Atof(str, &val);
59 |       if (idx == label_idx_) {
60 |         *out_label = val;
61 |         bias = -1;
62 |       } else if (fabs(val) > 1e-10) {
63 |         out_features->emplace_back(idx + bias, val);
64 |       }
65 |       ++idx;
66 |       if (*str == '\t') {
67 |         ++str;
68 |       } else if (*str != '\0') {
69 |         Log::Fatal("Input format error when parsing as TSV");
70 |       }
71 |     }
72 |   }
73 | private:
74 |   int label_idx_ = 0;
75 | };
76 |
77 | class LibSVMParser: public Parser {
78 | public:
79 |   explicit LibSVMParser(int label_idx)
80 |     :label_idx_(label_idx) {
81 |     if (label_idx > 0) {
82 |       Log::Fatal("Label should be the first column in a LibSVM file");
83 |     }
84 |   }
85 |   inline void ParseOneLine(const char* str,
86 |     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
87 |     int idx = 0;
88 |     double val = 0.0f;
89 |     if
(label_idx_ == 0) { 90 | str = Common::Atof(str, &val); 91 | *out_label = val; 92 | str = Common::SkipSpaceAndTab(str); 93 | } 94 | while (*str != '\0') { 95 | str = Common::Atoi(str, &idx); 96 | str = Common::SkipSpaceAndTab(str); 97 | if (*str == ':') { 98 | ++str; 99 | str = Common::Atof(str, &val); 100 | out_features->emplace_back(idx, val); 101 | } else { 102 | Log::Fatal("Input format error when parsing as LibSVM"); 103 | } 104 | str = Common::SkipSpaceAndTab(str); 105 | } 106 | } 107 | private: 108 | int label_idx_ = 0; 109 | }; 110 | 111 | } // namespace LightGBM 112 | #endif // LightGBM_IO_PARSER_HPP_ 113 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_basic.R: -------------------------------------------------------------------------------- 1 | context("basic functions") 2 | 3 | data(agaricus.train, package='lightgbm') 4 | data(agaricus.test, package='lightgbm') 5 | train <- agaricus.train 6 | test <- agaricus.test 7 | 8 | windows_flag = grepl('Windows', Sys.info()[['sysname']]) 9 | 10 | test_that("train and predict binary classification", { 11 | nrounds = 10 12 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5, 13 | nrounds = nrounds, objective = "binary", metric="binary_error") 14 | expect_false(is.null(bst$record_evals)) 15 | record_results <- lgb.get.eval.result(bst, "train", "binary_error") 16 | expect_lt(min(record_results), 0.02) 17 | 18 | pred <- predict(bst, test$data) 19 | expect_equal(length(pred), 1611) 20 | 21 | pred1 <- predict(bst, train$data, num_iteration = 1) 22 | expect_equal(length(pred1), 6513) 23 | err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label) 24 | err_log <- record_results[1] 25 | expect_lt(abs(err_pred1 - err_log), 10e-6) 26 | }) 27 | 28 | 29 | test_that("train and predict softmax", { 30 | lb <- as.numeric(iris$Species) - 1 31 | 32 | bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb, 33 | num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20, 34 | objective = "multiclass", metric="multi_error", num_class=3) 35 | 36 | expect_false(is.null(bst$record_evals)) 37 | record_results <- lgb.get.eval.result(bst, "train", "multi_error") 38 | expect_lt(min(record_results), 0.03) 39 | 40 | pred <- predict(bst, as.matrix(iris[, -5])) 41 | expect_equal(length(pred), nrow(iris) * 3) 42 | }) 43 | 44 | 45 | test_that("use of multiple eval metrics works", { 46 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4, 47 | learning_rate=1, nrounds = 10, objective = "binary", 48 | metric = list("binary_error","auc","binary_logloss") ) 49 | expect_false(is.null(bst$record_evals)) 50 | }) 51 | 52 | 53 | test_that("training continuation works", { 54 | dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE) 55 | watchlist = list(train=dtrain) 56 | param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1) 57 | 58 | # for the reference, use 10 iterations at once: 59 | bst <- lgb.train(param, dtrain, nrounds = 10, watchlist) 60 | err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10) 61 | # first 5 iterations: 62 | bst1 <- lgb.train(param, dtrain, nrounds = 5, watchlist) 63 | # test continuing from a model in file 64 | lgb.save(bst1, "lightgbm.model") 65 | # continue for 5 more: 66 | bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = bst1) 67 | err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10) 68 | expect_lt(abs(err_bst - 
err_bst2), 0.01)
69 |
70 |   bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = "lightgbm.model")
71 |   err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10)
72 |   expect_lt(abs(err_bst - err_bst2), 0.01)
73 | })
74 |
75 |
76 | test_that("cv works", {
77 |   dtrain <- lgb.Dataset(train$data, label=train$label)
78 |   params <- list(objective="regression", metric="l2,l1")
79 |   bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
80 |   expect_false(is.null(bst$record_evals))
81 | })
82 |
-------------------------------------------------------------------------------- /include/LightGBM/dataset_loader.h: --------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_DATASET_LOADER_H_
2 | #define LIGHTGBM_DATASET_LOADER_H_
3 |
4 | #include <LightGBM/dataset.h>
5 |
6 | namespace LightGBM {
7 |
8 | class DatasetLoader {
9 | public:
10 |
11 |   LIGHTGBM_EXPORT DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
12 |
13 |   LIGHTGBM_EXPORT ~DatasetLoader();
14 |
15 |   LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
16 |
17 |   LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
18 |     return LoadFromFile(filename, 0, 1);
19 |   }
20 |
21 |   LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
22 |
23 |   LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values,
24 |     int** sample_indices, int num_col, const int* num_per_col,
25 |     size_t total_sample_size, data_size_t num_data);
26 |
27 |   /*! \brief Disable copy */
28 |   DatasetLoader& operator=(const DatasetLoader&) = delete;
29 |   /*! \brief Disable copy */
30 |   DatasetLoader(const DatasetLoader&) = delete;
31 |
32 | private:
33 |
34 |   Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
35 |
36 |   void SetHeader(const char* filename);
37 |
38 |   void CheckDataset(const Dataset* dataset);
39 |
40 |   std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
41 |
42 |   std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);
43 |
44 |   std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
45 |
46 |   void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);
47 |
48 |   /*! \brief Extract local features from memory */
49 |   void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset);
50 |
51 |   /*! \brief Extract local features from file */
52 |   void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
53 |
54 |   /*! \brief Check can load from binary file */
55 |   std::string CheckCanLoadFromBin(const char* filename);
56 |
57 |   const IOConfig& io_config_;
58 |   /*! \brief Random generator*/
59 |   Random random_;
60 |   /*! \brief prediction function for initial model */
61 |   const PredictFunction& predict_fun_;
62 |   /*! \brief number of classes */
63 |   int num_class_;
64 |   /*! \brief index of label column */
65 |   int label_idx_;
66 |   /*! \brief index of weight column */
67 |   int weight_idx_;
68 |   /*!
-------------------------------------------------------------------------------- /R-package/R/lightgbm.R: -------------------------------------------------------------------------------- 1 | #' Simple interface for training a LightGBM model. 2 | #' Its documentation is combined with lgb.train. 3 | #' 4 | #' @rdname lgb.train 5 | #' @export 6 | lightgbm <- function(data, 7 | label = NULL, 8 | weight = NULL, 9 | params = list(), 10 | nrounds = 10, 11 | verbose = 1, 12 | eval_freq = 1L, 13 | early_stopping_rounds = NULL, 14 | save_name = "lightgbm.model", 15 | init_model = NULL, 16 | callbacks = list(), 17 | ...) { 18 | 19 | # Set data to a temporary variable 20 | dtrain <- data 21 | 22 | # Check whether data is an lgb.Dataset; if not, create one 23 | if (!lgb.is.Dataset(dtrain)) { 24 | dtrain <- lgb.Dataset(data, label = label, weight = weight) 25 | } 26 | 27 | # Use the training data as the validation set 28 | valids <- list() 29 | if (verbose > 0) { 30 | valids$train = dtrain 31 | } 32 | 33 | # Train the model via lgb.train 34 | bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq, 35 | early_stopping_rounds = early_stopping_rounds, 36 | init_model = init_model, callbacks = callbacks, ...) 37 | 38 | # Save the model under the given file name 39 | bst$save_model(save_name) 40 | 41 | # Return booster 42 | return(bst) 43 | } 44 | 45 | #' Training part from Mushroom Data Set 46 | #' 47 | #' This data set is originally from the Mushroom data set, 48 | #' UCI Machine Learning Repository. 49 | #' 50 | #' This data set includes the following fields: 51 | #' 52 | #' \itemize{ 53 | #' \item \code{label} the label for each record 54 | #' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns. 55 | #' } 56 | #' 57 | #' @references 58 | #' https://archive.ics.uci.edu/ml/datasets/Mushroom 59 | #' 60 | #' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 61 | #' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 62 | #' School of Information and Computer Science. 63 | #' 64 | #' @docType data 65 | #' @keywords datasets 66 | #' @name agaricus.train 67 | #' @usage data(agaricus.train) 68 | #' @format A list containing a label vector, and a dgCMatrix object with 6513 69 | #' rows and 127 variables 70 | NULL 71 | 
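# Editor's sketch (hypothetical session, mirroring the calls exercised in
# tests/testthat/test_basic.R): training the wrapper above on the bundled
# agaricus data. Kept entirely in comments so the package source stays free
# of side effects.
#   library(lightgbm)
#   data(agaricus.train, package = "lightgbm")
#   bst <- lightgbm(data = agaricus.train$data, label = agaricus.train$label,
#                   num_leaves = 31, nrounds = 10, objective = "binary")
#   pred <- predict(bst, agaricus.train$data)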
72 | #' Test part from Mushroom Data Set 73 | #' 74 | #' This data set is originally from the Mushroom data set, 75 | #' UCI Machine Learning Repository. 76 | #' 77 | #' This data set includes the following fields: 78 | #' 79 | #' \itemize{ 80 | #' \item \code{label} the label for each record 81 | #' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns. 82 | #' } 83 | #' 84 | #' @references 85 | #' https://archive.ics.uci.edu/ml/datasets/Mushroom 86 | #' 87 | #' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 88 | #' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 89 | #' School of Information and Computer Science. 90 | #' 91 | #' @docType data 92 | #' @keywords datasets 93 | #' @name agaricus.test 94 | #' @usage data(agaricus.test) 95 | #' @format A list containing a label vector, and a dgCMatrix object with 1611 96 | #' rows and 126 variables 97 | NULL 98 | 99 | # Various imports 100 | #' @import methods 101 | #' @importFrom R6 R6Class 102 | #' @useDynLib lightgbm 103 | NULL 104 | -------------------------------------------------------------------------------- /src/treelearner/feature_parallel_tree_learner.cpp: -------------------------------------------------------------------------------- 1 | #include "parallel_tree_learner.h" 2 | 3 | #include <cstring> 4 | 5 | #include <vector> 6 | 7 | namespace LightGBM { 8 | 9 | 10 | template <typename TREELEARNER_T> 11 | FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const TreeConfig* tree_config) 12 | :TREELEARNER_T(tree_config) { 13 | } 14 | 15 | template <typename TREELEARNER_T> 16 | FeatureParallelTreeLearner<TREELEARNER_T>::~FeatureParallelTreeLearner() { 17 | 18 | } 19 | 20 | template <typename TREELEARNER_T> 21 | void FeatureParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, bool is_constant_hessian) { 22 | TREELEARNER_T::Init(train_data, is_constant_hessian); 23 | rank_ = Network::rank(); 24 | num_machines_ = Network::num_machines(); 25 | input_buffer_.resize(sizeof(SplitInfo) * 2); 26 | output_buffer_.resize(sizeof(SplitInfo) * 2); 27 | } 28 | 29 | 30 | template <typename TREELEARNER_T> 31 | void FeatureParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { 32 | TREELEARNER_T::BeforeTrain(); 33 | // get feature partition 34 | std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>()); 35 | std::vector<int> num_bins_distributed(num_machines_, 0); 36 | for (int i = 0; i < this->train_data_->num_total_features(); ++i) { 37 | int inner_feature_index = this->train_data_->InnerFeatureIndex(i); 38 | if (inner_feature_index == -1) { continue; } 39 | if (this->is_feature_used_[inner_feature_index]) { 40 | int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed)); 41 | feature_distribution[cur_min_machine].push_back(inner_feature_index); 42 | num_bins_distributed[cur_min_machine] += this->train_data_->FeatureNumBin(inner_feature_index); 43 | this->is_feature_used_[inner_feature_index] = false; 44 | } 45 | } 46 | // get local used features 47 | for (auto fid : feature_distribution[rank_]) { 48 | this->is_feature_used_[fid] = true; 49 | } 50 | } 51 | 52 | template <typename TREELEARNER_T> 53 | void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsForLeaves() { 54 | SplitInfo smaller_best, larger_best; 55 | // get best split at smaller leaf 56 | smaller_best = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; 57 | // find local best split for larger leaf 58 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 59 | larger_best = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; 60 | } 61 | // sync global best info 62 | std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo)); 63 | std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo)); 64 | 65 | Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo), 66 | output_buffer_.data(), &SplitInfo::MaxReducer); 67 | // copy back 68 | std::memcpy(&smaller_best, output_buffer_.data(), sizeof(SplitInfo)); 69 | std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo)); 70 | // update best split 71 | this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best; 72 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 73 | this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best; 74 | } 75 | } 76 | 77 | 
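// Editor's sketch (not referenced by the build): what the Allreduce above does
// for one pair of serialized SplitInfo entries. Each machine contributes its
// local best split, and SplitInfo::MaxReducer keeps, block by block, whichever
// entry has the larger gain (ties break toward the smaller feature index).
static void ExampleMergeBestSplit(const SplitInfo& local_best, SplitInfo* global_best) {
  SplitInfo::MaxReducer(reinterpret_cast<const char*>(&local_best),
                        reinterpret_cast<char*>(global_best),
                        static_cast<int>(sizeof(SplitInfo)));
}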
// instantiate template classes, otherwise linker cannot find the code 78 | template class FeatureParallelTreeLearner<GPUTreeLearner>; 79 | template class FeatureParallelTreeLearner<SerialTreeLearner>; 80 | } // namespace LightGBM 81 | -------------------------------------------------------------------------------- /R-package/src/R_object_helper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * A simple wrapper for accessing data in an R object. 3 | * Due to license issues (GPLv2), we cannot include R's header file, so we use this simple wrapper instead. 4 | * However, if R changes the way it defines objects, this file will need to be updated as well. 5 | */ 6 | #ifndef R_OBJECT_HELPER_H_ 7 | #define R_OBJECT_HELPER_H_ 8 | 9 | #include <cstdint> 10 | 11 | #define TYPE_BITS 5 12 | struct sxpinfo_struct { 13 | unsigned int type : TYPE_BITS; 14 | unsigned int obj : 1; 15 | unsigned int named : 2; 16 | unsigned int gp : 16; 17 | unsigned int mark : 1; 18 | unsigned int debug : 1; 19 | unsigned int trace : 1; 20 | unsigned int spare : 1; 21 | unsigned int gcgen : 1; 22 | unsigned int gccls : 3; 23 | }; 24 | 25 | struct primsxp_struct { 26 | int offset; 27 | }; 28 | 29 | struct symsxp_struct { 30 | struct SEXPREC *pname; 31 | struct SEXPREC *value; 32 | struct SEXPREC *internal; 33 | }; 34 | 35 | struct listsxp_struct { 36 | struct SEXPREC *carval; 37 | struct SEXPREC *cdrval; 38 | struct SEXPREC *tagval; 39 | }; 40 | 41 | struct envsxp_struct { 42 | struct SEXPREC *frame; 43 | struct SEXPREC *enclos; 44 | struct SEXPREC *hashtab; 45 | }; 46 | 47 | struct closxp_struct { 48 | struct SEXPREC *formals; 49 | struct SEXPREC *body; 50 | struct SEXPREC *env; 51 | }; 52 | 53 | struct promsxp_struct { 54 | struct SEXPREC *value; 55 | struct SEXPREC *expr; 56 | struct SEXPREC *env; 57 | }; 58 | 59 | typedef struct SEXPREC { 60 | struct sxpinfo_struct sxpinfo; 61 | struct SEXPREC* attrib; 62 | struct SEXPREC* gengc_next_node, *gengc_prev_node; 63 | union { 64 | struct primsxp_struct primsxp; 65 | struct symsxp_struct symsxp; 66 | struct listsxp_struct listsxp; 67 | struct envsxp_struct envsxp; 68 | struct closxp_struct closxp; 69 | struct promsxp_struct promsxp; 70 | } u; 71 | } SEXPREC, *SEXP; 72 | 73 | struct vecsxp_struct { 74 | int length; 75 | int truelength; 76 | }; 77 | 78 | typedef struct VECTOR_SEXPREC { 79 | struct sxpinfo_struct sxpinfo; 80 | struct SEXPREC* attrib; 81 | struct SEXPREC* gengc_next_node, *gengc_prev_node; 82 | struct vecsxp_struct vecsxp; 83 | } VECTOR_SEXPREC, *VECSEXP; 84 | 85 | typedef union { VECTOR_SEXPREC s; double align; } SEXPREC_ALIGN; 86 | 87 | #define DATAPTR(x) (((SEXPREC_ALIGN *) (x)) + 1) 88 | 89 | #define R_CHAR_PTR(x) ((char *) DATAPTR(x)) 90 | 91 | #define R_INT_PTR(x) ((int *) DATAPTR(x)) 92 | 93 | #define R_REAL_PTR(x) ((double *) DATAPTR(x)) 94 | 95 | #define R_AS_INT(x) (*((int *) DATAPTR(x))) 96 | 97 | #define R_IS_NULL(x) ((*(SEXP)(x)).sxpinfo.type == 0) 98 | 99 | 100 | // 64bit pointer 101 | #if INTPTR_MAX == INT64_MAX 102 | 103 | #define R_ADDR(x) ((int64_t *) DATAPTR(x)) 104 | 105 | inline void R_SET_PTR(SEXP x, void* ptr) { 106 | if (ptr == nullptr) { 107 | R_ADDR(x)[0] = (int64_t)(NULL); 108 | } else { 109 | R_ADDR(x)[0] = (int64_t)(ptr); 110 | } 111 | } 112 | 113 | inline void* R_GET_PTR(SEXP x) { 114 | if (R_IS_NULL(x)) { 115 | return nullptr; 116 | } else { 117 | auto ret = (void *)(R_ADDR(x)[0]); 118 | if (ret == NULL) { 119 | ret = nullptr; 120 | } 121 | return ret; 122 | } 123 | } 124 | 125 | #else 126 | 127 | #define R_ADDR(x) ((int32_t *) DATAPTR(x)) 128 | 
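// Editor's note (a sketch, valid on both pointer widths): these helpers stash
// a raw C++ handle inside the payload of an R vector allocated on the R side:
//   R_SET_PTR(handle_slot, booster_ptr);   // store, e.g., a Booster*
//   void* p = R_GET_PTR(handle_slot);      // read it back (nullptr for R NULL)
// The R vector must be wide enough to hold a pointer on the target platform.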
129 | inline void R_SET_PTR(SEXP x, void* ptr) { 130 | if (ptr == nullptr) { 131 | R_ADDR(x)[0] = (int32_t)(NULL); 132 | } else { 133 | R_ADDR(x)[0] = (int32_t)(ptr); 134 | } 135 | } 136 | 137 | inline void* R_GET_PTR(SEXP x) { 138 | if (R_IS_NULL(x)) { 139 | return nullptr; 140 | } else { 141 | auto ret = (void *)(R_ADDR(x)[0]); 142 | if (ret == NULL) { 143 | ret = nullptr; 144 | } 145 | return ret; 146 | } 147 | } 148 | 149 | #endif 150 | 151 | #endif // R_OBJECT_HELPER_H_ 152 | -------------------------------------------------------------------------------- /examples/parallel_learning/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = binary 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = binary_logloss,auc 22 | 23 | # frequency of metric output 24 | metric_freq = 1 25 | 26 | # true to also output metrics on the training data, alias: training_metric, train_metric 27 | is_training_metric = true 28 | 29 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 30 | max_bin = 255 31 | 32 | # training data 33 | # if a weight file exists, it should be named "binary.train.weight" 34 | # alias: train_data, train 35 | data = binary.train 36 | 37 | # validation data; multiple validation sets are supported, separated by ',' 38 | # if a weight file exists, it should be named "binary.test.weight" 39 | # alias: valid, test, test_data, 40 | valid_data = binary.test 41 | 42 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 43 | num_trees = 100 44 | 45 | # shrinkage rate, alias: shrinkage_rate 46 | learning_rate = 0.1 47 | 48 | # number of leaves for one tree, alias: num_leaf 49 | num_leaves = 63 50 | 51 | # type of tree learner; the following types are supported: 52 | # serial , single machine version 53 | # feature , use feature parallel to train 54 | # data , use data parallel to train 55 | # voting , use voting based parallel to train 56 | # alias: tree 57 | tree_learner = feature 58 | 59 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 
60 | # num_threads = 8 61 | 62 | # feature sub-sampling: randomly select 80% of features to train on each iteration 63 | # alias: sub_feature 64 | feature_fraction = 0.8 65 | 66 | # bagging (data sub-sampling): perform bagging every 5 iterations 67 | bagging_freq = 5 68 | 69 | # bagging fraction: randomly select 80% of the data for each bagging 70 | # alias: sub_row 71 | bagging_fraction = 0.8 72 | 73 | # minimal number of data in one leaf; use this to deal with over-fitting 74 | # alias : min_data_per_leaf, min_data 75 | min_data_in_leaf = 50 76 | 77 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 78 | min_sum_hessian_in_leaf = 5.0 79 | 80 | # saves memory and runs faster for sparse features, alias: is_sparse 81 | is_enable_sparse = true 82 | 83 | # when the data is bigger than memory, set this to true; otherwise false gives faster loading speed 84 | # alias: two_round_loading, two_round 85 | use_two_round_loading = false 86 | 87 | # true to save the data to a binary file; the application will auto-load from the binary file next time 88 | # alias: is_save_binary, save_binary 89 | is_save_binary_file = false 90 | 91 | # output model file 92 | output_model = LightGBM_model.txt 93 | 94 | # continue training from a trained gbdt model 95 | # input_model= trained_model.txt 96 | 97 | # output prediction file for the predict task 98 | # output_result= prediction.txt 99 | 100 | # continue training from an initial score file 101 | # input_init_score= init_score.txt 102 | 103 | 104 | # number of machines in parallel training, alias: num_machine 105 | num_machines = 2 106 | 107 | # local listening port in parallel training, alias: local_port 108 | local_listen_port = 12400 109 | 110 | # machines list file for parallel training, alias: mlist 111 | machine_list_file = mlist.txt 112 | -------------------------------------------------------------------------------- /examples/regression/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = regression 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = l2 22 | 23 | # frequency of metric output 24 | metric_freq = 1 25 | 26 | # true to also output metrics on the training data, alias: training_metric, train_metric 27 | is_training_metric = true 28 | 29 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 
30 | max_bin = 255 31 | 32 | # training data 33 | # if a weight file exists, it should be named "regression.train.weight" 34 | # alias: train_data, train 35 | data = regression.train 36 | 37 | # validation data; multiple validation sets are supported, separated by ',' 38 | # if a weight file exists, it should be named "regression.test.weight" 39 | # alias: valid, test, test_data, 40 | valid_data = regression.test 41 | 42 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 43 | num_trees = 100 44 | 45 | # shrinkage rate, alias: shrinkage_rate 46 | learning_rate = 0.05 47 | 48 | # number of leaves for one tree, alias: num_leaf 49 | num_leaves = 31 50 | 51 | # type of tree learner; the following types are supported: 52 | # serial , single machine version 53 | # feature , use feature parallel to train 54 | # data , use data parallel to train 55 | # voting , use voting based parallel to train 56 | # alias: tree 57 | tree_learner = serial 58 | 59 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 60 | # num_threads = 8 61 | 62 | # feature sub-sampling: randomly select 90% of features to train on each iteration 63 | # alias: sub_feature 64 | feature_fraction = 0.9 65 | 66 | # bagging (data sub-sampling): perform bagging every 5 iterations 67 | bagging_freq = 5 68 | 69 | # bagging fraction: randomly select 80% of the data for each bagging 70 | # alias: sub_row 71 | bagging_fraction = 0.8 72 | 73 | # minimal number of data in one leaf; use this to deal with over-fitting 74 | # alias : min_data_per_leaf, min_data 75 | min_data_in_leaf = 100 76 | 77 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 78 | min_sum_hessian_in_leaf = 5.0 79 | 80 | # saves memory and runs faster for sparse features, alias: is_sparse 81 | is_enable_sparse = true 82 | 83 | # when the data is bigger than memory, set this to true; 
otherwise false gives faster loading speed 84 | # alias: two_round_loading, two_round 85 | use_two_round_loading = false 86 | 87 | # true to save the data to a binary file; the application will auto-load from the binary file next time 88 | # alias: is_save_binary, save_binary 89 | is_save_binary_file = false 90 | 91 | # output model file 92 | output_model = LightGBM_model.txt 93 | 94 | # continue training from a trained gbdt model 95 | # input_model= trained_model.txt 96 | 97 | # output prediction file for the predict task 98 | # output_result= prediction.txt 99 | 100 | # continue training from an initial score file 101 | # input_init_score= init_score.txt 102 | 103 | 104 | # number of machines in parallel training, alias: num_machine 105 | num_machines = 1 106 | 107 | # local listening port in parallel training, alias: local_port 108 | local_listen_port = 12400 109 | 110 | # machines list file for parallel training, alias: mlist 111 | machine_list_file = mlist.txt 112 | -------------------------------------------------------------------------------- /examples/binary_classification/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = binary 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = binary_logloss,auc 22 | 23 | # frequency of metric output 24 | metric_freq = 1 25 | 26 | # true to also output metrics on the training data, alias: training_metric, train_metric 27 | is_training_metric = true 28 | 29 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 30 | max_bin = 255 31 | 32 | # training data 33 | # if a weight file exists, it should be named "binary.train.weight" 34 | # alias: train_data, train 35 | data = binary.train 36 | 37 | # validation data; multiple validation sets are supported, separated by ',' 38 | # if a weight file exists, it should be named "binary.test.weight" 39 | # alias: valid, test, test_data, 40 | valid_data = binary.test 41 | 42 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 43 | num_trees = 100 44 | 45 | # shrinkage rate, alias: shrinkage_rate 46 | learning_rate = 0.1 47 | 48 | # number of leaves for one tree, alias: num_leaf 49 | num_leaves = 63 50 | 51 | # type of tree learner; the following types are supported: 52 | # serial , single machine version 53 | # feature , use feature parallel to train 54 | # data , use data parallel to train 55 | # voting , use voting based parallel to train 56 | # alias: tree 57 | tree_learner = serial 58 | 59 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 
60 | # num_threads = 8 61 | 62 | # feature sub-sampling: randomly select 80% of features to train on each iteration 63 | # alias: sub_feature 64 | feature_fraction = 0.8 65 | 66 | # bagging (data sub-sampling): perform bagging every 5 iterations 67 | bagging_freq = 5 68 | 69 | # bagging fraction: randomly select 80% of the data for each bagging 70 | # alias: sub_row 71 | bagging_fraction = 0.8 72 | 73 | # minimal number of data in one leaf; use this to deal with over-fitting 74 | # alias : min_data_per_leaf, min_data 75 | min_data_in_leaf = 50 76 | 77 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 78 | min_sum_hessian_in_leaf = 5.0 79 | 80 | # saves memory and runs faster for sparse features, alias: is_sparse 81 | is_enable_sparse = true 82 | 83 | # when the data is bigger than memory, set this to true; otherwise false gives faster loading speed 84 | # alias: two_round_loading, two_round 85 | use_two_round_loading = false 86 | 87 | # true to save the data to a binary file; the application will auto-load from the binary file next time 88 | # alias: is_save_binary, save_binary 89 | is_save_binary_file = false 90 | 91 | # output model file 92 | output_model = LightGBM_model.txt 93 | 94 | # continue training from a trained gbdt model 95 | # input_model= trained_model.txt 96 | 97 | # output prediction file for the predict task 98 | # output_result= prediction.txt 99 | 100 | # continue training from an initial score file 101 | # input_init_score= init_score.txt 102 | 103 | 104 | # number of machines in parallel training, alias: num_machine 105 | num_machines = 1 106 | 107 | # local listening port in parallel training, alias: local_port 108 | local_listen_port = 12400 109 | 110 | # machines list file for parallel training, alias: mlist 111 | machine_list_file = mlist.txt 112 | 
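These train/predict configs are consumed by the LightGBM command-line program. A typical session looks like the following sketch (the binary path is an assumption from the build output; any key in a config file can be overridden directly on the command line):

```
./lightgbm config=train.conf
./lightgbm config=train.conf num_trees=200 learning_rate=0.05
./lightgbm config=predict.conf
```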
-------------------------------------------------------------------------------- /src/treelearner/split_info.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ 2 | #define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_ 3 | 4 | #include <LightGBM/meta.h> 5 | 6 | #include <cmath> 7 | #include <cstdint> 8 | #include <cstring> 9 | 10 | #include <limits> 11 | 12 | namespace LightGBM { 13 | 14 | /*! 15 | * \brief Used to store information about a candidate split point 16 | */ 17 | struct SplitInfo { 18 | public: 19 | /*! \brief Feature index */ 20 | int feature; 21 | /*! \brief Split threshold */ 22 | unsigned int threshold; 23 | /*! \brief Left output after split */ 24 | double left_output; 25 | /*! \brief Right output after split */ 26 | double right_output; 27 | /*! \brief Split gain */ 28 | double gain; 29 | /*! \brief Left number of data after split */ 30 | data_size_t left_count; 31 | /*! \brief Right number of data after split */ 32 | data_size_t right_count; 33 | /*! \brief Left sum gradient after split */ 34 | double left_sum_gradient; 35 | /*! \brief Left sum hessian after split */ 36 | double left_sum_hessian; 37 | /*! \brief Right sum gradient after split */ 38 | double right_sum_gradient; 39 | /*! \brief Right sum hessian after split */ 40 | double right_sum_hessian; 41 | 42 | SplitInfo() { 43 | // initialize with -1 and -inf gain 44 | feature = -1; 45 | gain = kMinScore; 46 | } 47 | 48 | inline void Reset() { 49 | // initialize with -1 and -inf gain 50 | feature = -1; 51 | gain = kMinScore; 52 | } 53 | 54 | inline bool operator > (const SplitInfo &si) const; 55 | 56 | inline bool operator == (const SplitInfo &si) const; 57 | 58 | inline static void MaxReducer(const char* src, char* dst, int len) { 59 | const int type_size = sizeof(SplitInfo); 60 | int used_size = 0; 61 | const SplitInfo* p1; 62 | SplitInfo* p2; 63 | while (used_size < len) { 64 | p1 = reinterpret_cast<const SplitInfo*>(src); 65 | p2 = reinterpret_cast<SplitInfo*>(dst); 66 | if (*p1 > *p2) { 67 | // copy 68 | std::memcpy(dst, src, type_size); 69 | } 70 | src += type_size; 71 | dst += type_size; 72 | used_size += type_size; 73 | } 74 | } 75 | }; 76 | 77 | 78 | 79 | inline bool SplitInfo::operator > (const SplitInfo& si) const { 80 | double local_gain = this->gain; 81 | double other_gain = si.gain; 82 | // replace nan with -inf (note: "gain == NAN" is always false, so std::isnan is required) 83 | if (std::isnan(local_gain)) { 84 | local_gain = kMinScore; 85 | } 86 | // replace nan with -inf 87 | if (std::isnan(other_gain)) { 88 | other_gain = kMinScore; 89 | } 90 | int local_feature = this->feature; 91 | int other_feature = si.feature; 92 | // replace -1 with max int 93 | if (local_feature == -1) { 94 | local_feature = INT32_MAX; 95 | } 96 | // replace -1 with max int 97 | if (other_feature == -1) { 98 | other_feature = INT32_MAX; 99 | } 100 | if (local_gain != other_gain) { 101 | return local_gain > other_gain; 102 | } else { 103 | // if same gain, use smaller feature 104 | return local_feature < other_feature; 105 | } 106 | } 107 | 108 | inline bool SplitInfo::operator == (const SplitInfo& si) const { 109 | double local_gain = this->gain; 110 | double other_gain = si.gain; 111 | // replace nan with -inf 112 | if (std::isnan(local_gain)) { 113 | local_gain = kMinScore; 114 | } 115 | // replace nan with -inf 116 | if (std::isnan(other_gain)) { 117 | other_gain = kMinScore; 118 | } 119 | int local_feature = this->feature; 120 | int other_feature = si.feature; 121 | // replace -1 with max int 122 | if (local_feature == -1) { 123 | local_feature = INT32_MAX; 124 | } 125 | // replace -1 with max int 126 | if (other_feature == -1) { 127 | other_feature = INT32_MAX; 128 | } 129 | if (local_gain != other_gain) { 130 | return local_gain == other_gain; 131 | } else { 132 | // if same gain, use smaller feature 133 | return local_feature == other_feature; 134 | } 135 | } 136 | 137 | } // namespace LightGBM 138 | #endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_ 139 | 
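The `std::isnan` fix above matters because ordered comparisons with NaN, including `==`, are always false; a minimal standalone demonstration:

```cpp
#include <cassert>
#include <cmath>

int main() {
  double gain = std::nan("");
  assert(!(gain == gain));   // NaN compares unequal to everything, itself included,
                             // so the old test `gain == NAN` could never fire
  assert(std::isnan(gain));  // the correct way to detect a NaN gain
  return 0;
}
```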
-------------------------------------------------------------------------------- /examples/lambdarank/train.conf: -------------------------------------------------------------------------------- 1 | # task type, supports train and predict 2 | task = train 3 | 4 | # boosting type, supports gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, the following applications are supported 8 | # regression , regression task 9 | # binary , binary classification task 10 | # lambdarank , lambdarank task 11 | # alias: application, app 12 | objective = lambdarank 13 | 14 | # eval metrics; multiple metrics are supported, delimited by ','; the following metrics are supported 15 | # l1 16 | # l2 , default metric for regression 17 | # ndcg , default metric for lambdarank 18 | # auc 19 | # binary_logloss , default metric for binary 20 | # binary_error 21 | metric = ndcg 22 | 23 | # evaluation positions for the ndcg metric, alias: ndcg_at 24 | ndcg_eval_at = 1,3,5 25 | 26 | # frequency of metric output 27 | metric_freq = 1 28 | 29 | # true to also output metrics on the training data, alias: training_metric, train_metric 30 | is_training_metric = true 31 | 32 | # number of bins for feature bucketing; 255 is a recommended setting: it saves memory and keeps good accuracy. 33 | max_bin = 255 34 | 35 | # training data 36 | # if a weight file exists, it should be named "rank.train.weight" 37 | # if a query file exists, it should be named "rank.train.query" 38 | # alias: train_data, train 39 | data = rank.train 40 | 41 | # validation data; multiple validation sets are supported, separated by ',' 42 | # if a weight file exists, it should be named "rank.test.weight" 43 | # if a query file exists, it should be named "rank.test.query" 44 | # alias: valid, test, test_data, 45 | valid_data = rank.test 46 | 47 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds 48 | num_trees = 100 49 | 50 | # shrinkage rate, alias: shrinkage_rate 51 | learning_rate = 0.1 52 | 53 | # number of leaves for one tree, alias: num_leaf 54 | num_leaves = 31 55 | 56 | # type of tree learner; the following types are supported: 57 | # serial , single machine version 58 | # feature , use feature parallel to train 59 | # data , use data parallel to train 60 | # voting , use voting based parallel to train 61 | # alias: tree 62 | tree_learner = serial 63 | 64 | # number of threads for multi-threading. One thread uses one CPU; the default is the number of CPUs. 65 | # num_threads = 8 66 | 67 | # feature sub-sampling; 1.0 means no sub-sampling 68 | # alias: sub_feature 69 | feature_fraction = 1.0 70 | 71 | # bagging (data sub-sampling): perform bagging every iteration 72 | bagging_freq = 1 73 | 74 | # bagging fraction: randomly select 90% of the data for each bagging 75 | # alias: sub_row 76 | bagging_fraction = 0.9 77 | 78 | # minimal number of data in one leaf; use this to deal with over-fitting 79 | # alias : min_data_per_leaf, min_data 80 | min_data_in_leaf = 50 81 | 82 | # minimal sum of hessians in one leaf; use this to deal with over-fitting 83 | min_sum_hessian_in_leaf = 5.0 84 | 85 | # saves memory and runs faster for sparse features, alias: is_sparse 86 | is_enable_sparse = true 87 | 88 | # when the data is bigger than memory, set this to true; 
otherwise false gives faster loading speed 89 | # alias: two_round_loading, two_round 90 | use_two_round_loading = false 91 | 92 | # true to save the data to a binary file; the application will auto-load from the binary file next time 93 | # alias: is_save_binary, save_binary 94 | is_save_binary_file = false 95 | 96 | # output model file 97 | output_model = LightGBM_model.txt 98 | 99 | # continue training from a trained gbdt model 100 | # input_model= trained_model.txt 101 | 102 | # output prediction file for the predict task 103 | # output_result= prediction.txt 104 | 105 | # continue training from an initial score file 106 | # input_init_score= init_score.txt 107 | 108 | 109 | # number of machines in parallel training, alias: num_machine 110 | num_machines = 1 111 | 112 | # local listening port in parallel training, alias: local_port 113 | local_listen_port = 12400 114 | 115 | # machines list file for parallel training, alias: mlist 116 | machine_list_file = mlist.txt 117 | -------------------------------------------------------------------------------- /include/LightGBM/metric.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_METRIC_H_ 2 | #define LIGHTGBM_METRIC_H_ 3 | 4 | #include <LightGBM/meta.h> 5 | #include <LightGBM/config.h> 6 | #include <LightGBM/dataset.h> 7 | #include <LightGBM/objective_function.h> 8 | 9 | #include <vector> 10 | 11 | namespace LightGBM { 12 | 13 | /*! 14 | * \brief The interface of metric. 15 | * Metric is used to calculate the metric result 16 | */ 17 | class Metric { 18 | public: 19 | /*! \brief virtual destructor */ 20 | virtual ~Metric() {} 21 | 22 | /*! 23 | * \brief Initialize 24 | * \param metadata Label data 25 | * \param num_data Number of data 26 | */ 27 | 28 | virtual void Init(const Metadata& metadata, data_size_t num_data) = 0; 29 | 30 | virtual const std::vector<std::string>& GetName() const = 0; 31 | 32 | virtual double factor_to_bigger_better() const = 0; 33 | /*! 34 | * \brief Calculate and print the metric result 35 | * \param score Current prediction score 36 | */ 37 | virtual std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const = 0; 38 | 39 | Metric() = default; 40 | /*! \brief Disable copy */ 41 | Metric& operator=(const Metric&) = delete; 42 | /*! \brief Disable copy */ 43 | Metric(const Metric&) = delete; 44 | 45 | /*! 46 | * \brief Create object of metrics 47 | * \param type Specific type of metric 48 | * \param config Config for metric 49 | */ 50 | LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const MetricConfig& config); 51 | 52 | }; 53 | 54 | /*! 55 | * \brief Static class, used to calculate DCG score 56 | */ 57 | class DCGCalculator { 58 | public: 59 | /*! 60 | * \brief Initialization logic 61 | * \param label_gain Gain for labels, default is 2^i - 1 62 | */ 63 | static void Init(std::vector<double> label_gain); 64 | 65 | /*! 66 | * \brief Calculate the DCG score at position k 67 | * \param k The position to evaluate 68 | * \param label Pointer of label 69 | * \param score Pointer of score 70 | * \param num_data Number of data 71 | * \return The DCG score 72 | */ 73 | static double CalDCGAtK(data_size_t k, const float* label, 74 | const double* score, data_size_t num_data); 75 | 76 | /*!
77 | * \brief Calculate the DCG score at multiple positions 78 | * \param ks The positions to evaluate 79 | * \param label Pointer of label 80 | * \param score Pointer of score 81 | * \param num_data Number of data 82 | * \param out Output result 83 | */ 84 | static void CalDCG(const std::vector<data_size_t>& ks, 85 | const float* label, const double* score, 86 | data_size_t num_data, std::vector<double>* out); 87 | 88 | /*! 89 | * \brief Calculate the Max DCG score at position k 90 | * \param k The position to evaluate at 91 | * \param label Pointer of label 92 | * \param num_data Number of data 93 | * \return The max DCG score 94 | */ 95 | static double CalMaxDCGAtK(data_size_t k, 96 | const float* label, data_size_t num_data); 97 | 98 | /*! 99 | * \brief Calculate the Max DCG score at multiple positions 100 | * \param ks The positions to evaluate at 101 | * \param label Pointer of label 102 | * \param num_data Number of data 103 | * \param out Output result 104 | */ 105 | static void CalMaxDCG(const std::vector<data_size_t>& ks, 106 | const float* label, data_size_t num_data, std::vector<double>* out); 107 | 108 | /*! 109 | * \brief Get discount score of position k 110 | * \param k The position 111 | * \return The discount of this position 112 | */ 113 | inline static double GetDiscount(data_size_t k) { return discount_[k]; } 114 | 115 | private: 116 | /*! \brief store gains for different labels */ 117 | static std::vector<double> label_gain_; 118 | /*! \brief store discount scores for different positions */ 119 | static std::vector<double> discount_; 120 | /*! \brief max position for eval */ 121 | static const data_size_t kMaxPosition; 122 | }; 123 | 124 | 125 | } // namespace LightGBM 126 | 127 | 128 | #endif // LightGBM_METRIC_H_ 129 | -------------------------------------------------------------------------------- /src/io/parser.cpp: -------------------------------------------------------------------------------- 1 | #include "parser.hpp" 2 | 3 | #include <LightGBM/utils/common.h> 4 | #include <fstream> 5 | #include <memory> 6 | #include <string> 7 | 8 | namespace LightGBM { 9 | 10 | void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) { 11 | *comma_cnt = 0; 12 | *tab_cnt = 0; 13 | *colon_cnt = 0; 14 | for (int i = 0; str[i] != '\0'; ++i) { 15 | if (str[i] == ',') { 16 | ++(*comma_cnt); 17 | } else if (str[i] == '\t') { 18 | ++(*tab_cnt); 19 | } else if (str[i] == ':') { 20 | ++(*colon_cnt); 21 | } 22 | } 23 | } 24 | 25 | int GetLabelIdxForLibsvm(std::string& str, int num_features, int label_idx) { 26 | if (num_features <= 0) { 27 | return label_idx; 28 | } 29 | str = Common::Trim(str); 30 | auto pos_space = str.find_first_of(" \f\n\r\t\v"); 31 | auto pos_colon = str.find_first_of(":"); 32 | if (pos_space == std::string::npos || pos_space < pos_colon) { 33 | return label_idx; 34 | } else { 35 | return -1; 36 | } 37 | } 38 | 39 | int GetLabelIdxForTSV(std::string& str, int num_features, int label_idx) { 40 | if (num_features <= 0) { 41 | return label_idx; 42 | } 43 | str = Common::Trim(str); 44 | auto tokens = Common::Split(str.c_str(), '\t'); 45 | if (static_cast<int>(tokens.size()) == num_features) { 46 | return -1; 47 | } else { 48 | return label_idx; 49 | } 50 | } 51 | 52 | int GetLabelIdxForCSV(std::string& str, int num_features, int label_idx) { 53 | if (num_features <= 0) { 54 | return label_idx; 55 | } 56 | str = Common::Trim(str); 57 | auto tokens = Common::Split(str.c_str(), ','); 58 | if (static_cast<int>(tokens.size()) == num_features) { 59 | return -1; 60 | } else { 61 | return label_idx; 62 | } 63 | } 64 | 65 | enum DataType { 66 | INVALID, 67 | CSV, 68 | TSV, 69 | 
LIBSVM 70 | }; 71 | 72 | Parser* Parser::CreateParser(const char* filename, bool has_header, int num_features, int label_idx) { 73 | std::ifstream tmp_file; 74 | tmp_file.open(filename); 75 | if (!tmp_file.is_open()) { 76 | Log::Fatal("Data file %s doesn't exist", filename); 77 | } 78 | std::string line1, line2; 79 | if (has_header) { 80 | if (!tmp_file.eof()) { 81 | std::getline(tmp_file, line1); 82 | } 83 | } 84 | if (!tmp_file.eof()) { 85 | std::getline(tmp_file, line1); 86 | } else { 87 | Log::Fatal("Data file %s should have at least one line", filename); 88 | } 89 | if (!tmp_file.eof()) { 90 | std::getline(tmp_file, line2); 91 | } else { 92 | Log::Warning("Data file %s only has one line", filename); 93 | } 94 | tmp_file.close(); 95 | int comma_cnt = 0, comma_cnt2 = 0; 96 | int tab_cnt = 0, tab_cnt2 = 0; 97 | int colon_cnt = 0, colon_cnt2 = 0; 98 | // Gather statistics from the first two lines 99 | GetStatistic(line1.c_str(), &comma_cnt, &tab_cnt, &colon_cnt); 100 | GetStatistic(line2.c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2); 101 | 102 | 103 | 104 | DataType type = DataType::INVALID; 105 | if (line2.size() == 0) { 106 | // the file has only one line 107 | if (colon_cnt > 0) { 108 | type = DataType::LIBSVM; 109 | } else if (tab_cnt > 0) { 110 | type = DataType::TSV; 111 | } else if (comma_cnt > 0) { 112 | type = DataType::CSV; 113 | } 114 | } else { 115 | if (colon_cnt > 0 || colon_cnt2 > 0) { 116 | type = DataType::LIBSVM; 117 | } else if (tab_cnt == tab_cnt2 && tab_cnt > 0) { 118 | type = DataType::TSV; 119 | } else if (comma_cnt == comma_cnt2 && comma_cnt > 0) { 120 | type = DataType::CSV; 121 | } 122 | } 123 | if (type == DataType::INVALID) { 124 | Log::Fatal("Unknown format of training data"); 125 | } 126 | std::unique_ptr<Parser> ret; 127 | if (type == DataType::LIBSVM) { 128 | label_idx = GetLabelIdxForLibsvm(line1, num_features, label_idx); 129 | ret.reset(new LibSVMParser(label_idx)); 130 | } 131 | else if (type == DataType::TSV) { 132 | label_idx = GetLabelIdxForTSV(line1, num_features, label_idx); 133 | ret.reset(new TSVParser(label_idx)); 134 | } 135 | else if (type == DataType::CSV) { 136 | label_idx = GetLabelIdxForCSV(line1, num_features, label_idx); 137 | ret.reset(new CSVParser(label_idx)); 138 | } 139 | 140 | if (label_idx < 0) { 141 | Log::Info("Data file %s doesn't contain a label column", filename); 142 | } 143 | return ret.release(); 144 | } 145 | 146 | } // namespace LightGBM 147 | 
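A hedged, standalone sketch of the delimiter-counting heuristic above; it re-declares `GetStatistic` (which is defined at namespace scope in this file) and assumes the program is linked against this translation unit:

```cpp
#include <cstdio>

namespace LightGBM {
void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt);
}

int main() {
  int commas = 0, tabs = 0, colons = 0;
  // A LibSVM-style line, "<label> <index>:<value> ...", contains colons.
  LightGBM::GetStatistic("1 5:0.3 10:1.2", &commas, &tabs, &colons);
  // CreateParser sees colon_cnt > 0 on the sampled lines and picks LibSVMParser.
  std::printf("commas=%d tabs=%d colons=%d\n", commas, tabs, colons);
  return 0;
}
```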
-------------------------------------------------------------------------------- /src/boosting/score_updater.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ 2 | #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ 3 | 4 | 5 | #include <LightGBM/meta.h> 6 | #include <LightGBM/dataset.h> 7 | #include <LightGBM/tree.h> 8 | #include <LightGBM/tree_learner.h> 9 | #include <LightGBM/utils/log.h> 10 | 11 | #include <vector> 12 | 13 | namespace LightGBM { 14 | /*! 15 | * \brief Used to store and update scores for the data 16 | */ 17 | class ScoreUpdater { 18 | public: 19 | /*! 20 | * \brief Constructor, takes a const pointer to the dataset 21 | * \param data This class will bind to this data set 22 | */ 23 | ScoreUpdater(const Dataset* data, int num_tree_per_iteration) : data_(data) { 24 | num_data_ = data->num_data(); 25 | int64_t total_size = static_cast<int64_t>(num_data_) * num_tree_per_iteration; 26 | score_.resize(total_size); 27 | // default start score is zero 28 | #pragma omp parallel for schedule(static) 29 | for (int64_t i = 0; i < total_size; ++i) { 30 | score_[i] = 0.0f; 31 | } 32 | has_init_score_ = false; 33 | const double* init_score = data->metadata().init_score(); 34 | // if an initial score exists, start from it 35 | if (init_score != nullptr) { 36 | if ((data->metadata().num_init_score() % num_data_) != 0 37 | || (data->metadata().num_init_score() / num_data_) != num_tree_per_iteration) { 38 | Log::Fatal("Number of classes in the initial score does not match the data"); 39 | } 40 | has_init_score_ = true; 41 | #pragma omp parallel for schedule(static) 42 | for (int64_t i = 0; i < total_size; ++i) { 43 | score_[i] = init_score[i]; 44 | } 45 | } 46 | } 47 | /*! \brief Destructor */ 48 | ~ScoreUpdater() { 49 | 50 | } 51 | 52 | inline bool has_init_score() const { return has_init_score_; } 53 | 54 | inline void AddScore(double val, int cur_tree_id) { 55 | int64_t offset = cur_tree_id * num_data_; 56 | #pragma omp parallel for schedule(static) 57 | for (int64_t i = 0; i < num_data_; ++i) { 58 | score_[offset + i] += val; 59 | } 60 | } 61 | /*! 62 | * \brief Use the tree model to get predictions, then add them to the scores of all data 63 | * Note: this function is generally used on validation data too. 64 | * \param tree Trained tree model 65 | * \param cur_tree_id Current tree for multiclass training 66 | */ 67 | inline void AddScore(const Tree* tree, int cur_tree_id) { 68 | tree->AddPredictionToScore(data_, num_data_, score_.data() + cur_tree_id * num_data_); 69 | } 70 | /*! 71 | * \brief Add prediction scores; used only for training data. 72 | * The training data is already partitioned into tree leaves after training, 73 | * so predictions can be obtained quickly. 74 | * \param tree_learner 75 | * \param cur_tree_id Current tree for multiclass training 76 | */ 77 | inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { 78 | tree_learner->AddPredictionToScore(tree, score_.data() + cur_tree_id * num_data_); 79 | } 80 | /*! 81 | * \brief Use the tree model to get predictions, then add them to the scores of part of the data 82 | * Used for prediction of training out-of-bag data 83 | * \param tree Trained tree model 84 | * \param data_indices Indices of data that will be processed 85 | * \param data_cnt Number of data that will be processed 86 | * \param cur_tree_id Current tree for multiclass training 87 | */ 88 | inline void AddScore(const Tree* tree, const data_size_t* data_indices, 89 | data_size_t data_cnt, int cur_tree_id) { 90 | tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + cur_tree_id * num_data_); 91 | } 92 | /*! \brief Pointer of score */ 93 | inline const double* score() const { return score_.data(); } 94 | inline data_size_t num_data() const { return num_data_; } 95 | 96 | /*! \brief Disable copy */ 97 | ScoreUpdater& operator=(const ScoreUpdater&) = delete; 98 | /*! \brief Disable copy */ 99 | ScoreUpdater(const ScoreUpdater&) = delete; 100 | private: 101 | /*! \brief Number of total data */ 102 | data_size_t num_data_; 103 | /*! \brief Pointer of data set */ 104 | const Dataset* data_; 105 | /*!
\brief Scores for the data set */ 106 | std::vector<double> score_; 107 | bool has_init_score_; 108 | }; 109 | 110 | } // namespace LightGBM 111 | #endif // LightGBM_BOOSTING_SCORE_UPDATER_HPP_ 112 | 
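A standalone sketch of the buffer layout used above: `score_` holds one contiguous block of `num_data` scores per tree-per-iteration (for example, one block per class in multiclass training), so the score of data point `i` for block `k` lives at index `k * num_data + i`:

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int num_data = 4, num_tree_per_iteration = 3;  // e.g. 3 classes
  std::vector<double> score(static_cast<size_t>(num_data) * num_tree_per_iteration, 0.0);
  const int i = 2, k = 1;            // data index, tree/class index
  score[k * num_data + i] += 0.5;    // the per-element effect of AddScore(0.5, k)
  std::printf("score(block=%d, data=%d) = %f\n", k, i, score[k * num_data + i]);
  return 0;
}
```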
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | if(APPLE) 4 | SET(CMAKE_CXX_COMPILER "g++-6") 5 | SET(CMAKE_C_COMPILER "gcc-6") 6 | endif() 7 | 8 | PROJECT(lightgbm) 9 | 10 | OPTION(USE_MPI "MPI based parallel learning" OFF) 11 | OPTION(USE_OPENMP "Enable OpenMP" ON) 12 | OPTION(USE_GPU "Enable GPU-accelerated training (EXPERIMENTAL)" OFF) 13 | 14 | if(APPLE) 15 | OPTION(APPLE_OUTPUT_DYLIB "Output dylib shared library" OFF) 16 | endif() 17 | 18 | if(USE_MPI) 19 | find_package(MPI REQUIRED) 20 | ADD_DEFINITIONS(-DUSE_MPI) 21 | MESSAGE(${MPI_LIBRARIES}) 22 | MESSAGE(${MPI_CXX_LIBRARIES}) 23 | else() 24 | ADD_DEFINITIONS(-DUSE_SOCKET) 25 | endif(USE_MPI) 26 | 27 | if(USE_OPENMP) 28 | find_package(OpenMP REQUIRED) 29 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 30 | else() 31 | # Ignore unknown #pragma warning 32 | if( (CMAKE_CXX_COMPILER_ID MATCHES "[cC][lL][aA][nN][gG]") 33 | OR (CMAKE_CXX_COMPILER_ID MATCHES "[gG][nN][uU]")) 34 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") 35 | endif() 36 | endif(USE_OPENMP) 37 | 38 | if(USE_GPU) 39 | find_package(OpenCL REQUIRED) 40 | include_directories(${OpenCL_INCLUDE_DIRS}) 41 | MESSAGE(STATUS "OpenCL include directory:" ${OpenCL_INCLUDE_DIRS}) 42 | find_package(Boost 1.56.0 COMPONENTS filesystem system REQUIRED) 43 | include_directories(${Boost_INCLUDE_DIRS}) 44 | ADD_DEFINITIONS(-DUSE_GPU) 45 | endif(USE_GPU) 46 | 47 | if(UNIX OR MINGW OR CYGWIN) 48 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes") 49 | endif() 50 | 51 | if(MSVC) 52 | if(MSVC_VERSION LESS 1800) 53 | message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a newer MSVC.") 54 | endif() 55 | 56 | SET(variables 57 | CMAKE_C_FLAGS_DEBUG 58 | CMAKE_C_FLAGS_MINSIZEREL 59 | CMAKE_C_FLAGS_RELEASE 60 | CMAKE_C_FLAGS_RELWITHDEBINFO 61 | CMAKE_CXX_FLAGS_DEBUG 62 | CMAKE_CXX_FLAGS_MINSIZEREL 63 | CMAKE_CXX_FLAGS_RELEASE 64 | CMAKE_CXX_FLAGS_RELWITHDEBINFO 65 | ) 66 | foreach(variable ${variables}) 67 | if(${variable} MATCHES "/MD") 68 | string(REGEX REPLACE "/MD" "/MT" ${variable} "${${variable}}") 69 | endif() 70 | endforeach() 71 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /O2 /Ob2 /Oi /Ot /Oy /GL") 72 | else() 73 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") 74 | endif() 75 | 76 | 77 | SET(LightGBM_HEADER_DIR ${PROJECT_SOURCE_DIR}/include) 78 | SET(BOOST_COMPUTE_HEADER_DIR ${PROJECT_SOURCE_DIR}/compute/include) 79 | 80 | SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) 81 | SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) 82 | 83 | include_directories (${LightGBM_HEADER_DIR}) 84 | include_directories (${BOOST_COMPUTE_HEADER_DIR}) 85 | 86 | if(APPLE) 87 | if (APPLE_OUTPUT_DYLIB) 88 | SET(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") 89 | else() 90 | SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so") 91 | endif() 92 | endif(APPLE) 93 | 94 | if(USE_MPI) 95 | include_directories(${MPI_CXX_INCLUDE_PATH}) 96 | endif(USE_MPI) 97 | 98 | file(GLOB SOURCES 99 | src/application/*.cpp 100 | src/boosting/*.cpp 101 | src/io/*.cpp 102 | src/metric/*.cpp 103 | src/objective/*.cpp 104 | src/network/*.cpp 105 | src/treelearner/*.cpp 106 | ) 107 | 108 | add_executable(lightgbm src/main.cpp ${SOURCES}) 109 | add_library(_lightgbm SHARED src/c_api.cpp ${SOURCES}) 110 | 111 | if(MSVC) 112 | set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm") 113 | endif(MSVC) 114 | 115 | if(USE_MPI) 116 | TARGET_LINK_LIBRARIES(lightgbm ${MPI_CXX_LIBRARIES}) 117 | TARGET_LINK_LIBRARIES(_lightgbm ${MPI_CXX_LIBRARIES}) 118 | endif(USE_MPI) 119 | 120 | if(USE_GPU) 121 | TARGET_LINK_LIBRARIES(lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) 122 | TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) 123 | endif(USE_GPU) 124 | 125 | if(WIN32 AND (MINGW OR CYGWIN)) 126 | TARGET_LINK_LIBRARIES(lightgbm Ws2_32) 127 | TARGET_LINK_LIBRARIES(_lightgbm Ws2_32) 128 | TARGET_LINK_LIBRARIES(lightgbm IPHLPAPI) 129 | TARGET_LINK_LIBRARIES(_lightgbm IPHLPAPI) 130 | endif() 131 | 132 | install(TARGETS lightgbm _lightgbm 133 | RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin 134 | LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib 135 | ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) 136 | 137 | install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include) 138 | 
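A typical out-of-source build against this file (a sketch; the generator, compiler, and option values vary by platform, and the GPU path additionally needs OpenCL and Boost as declared above):

```
mkdir build && cd build
cmake -DUSE_GPU=1 ..   # or -DUSE_MPI=ON for MPI-based parallel learning
make -j4
```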
-------------------------------------------------------------------------------- /src/treelearner/leaf_splits.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ 2 | #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ 3 | 4 | #include <LightGBM/meta.h> 5 | #include "data_partition.hpp" 6 | 7 | #include <vector> 8 | 9 | namespace LightGBM { 10 | 11 | /*! 12 | * \brief used to find split candidates for a leaf 13 | */ 14 | class LeafSplits { 15 | public: 16 | LeafSplits(data_size_t num_data) 17 | :num_data_in_leaf_(num_data), num_data_(num_data), 18 | data_indices_(nullptr) { 19 | } 20 | void ResetNumData(data_size_t num_data) { 21 | num_data_ = num_data; 22 | num_data_in_leaf_ = num_data; 23 | } 24 | ~LeafSplits() { 25 | } 26 | 27 | /*! 28 | 29 | * \brief Init splits on the current leaf using partial data. 30 | * \param leaf Index of current leaf 31 | * \param data_partition current data partition 32 | * \param sum_gradients 33 | * \param sum_hessians 34 | */ 35 | void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { 36 | leaf_index_ = leaf; 37 | data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); 38 | sum_gradients_ = sum_gradients; 39 | sum_hessians_ = sum_hessians; 40 | } 41 | 42 | /*! 43 | * \brief Init splits on the current leaf; traverses all data to sum up the gradients and hessians 44 | * \param gradients 45 | * \param hessians 46 | */ 47 | void Init(const score_t* gradients, const score_t* hessians) { 48 | num_data_in_leaf_ = num_data_; 49 | leaf_index_ = 0; 50 | data_indices_ = nullptr; 51 | double tmp_sum_gradients = 0.0f; 52 | double tmp_sum_hessians = 0.0f; 53 | #pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) 54 | for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { 55 | tmp_sum_gradients += gradients[i]; 56 | tmp_sum_hessians += hessians[i]; 57 | } 58 | sum_gradients_ = tmp_sum_gradients; 59 | sum_hessians_ = tmp_sum_hessians; 60 | } 61 | 62 | /*! 63 | * \brief Init splits on the current leaf using partial data. 64 | * \param leaf Index of current leaf 65 | * \param data_partition current data partition 66 | * \param gradients 67 | * \param hessians 68 | */ 69 | void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { 70 | leaf_index_ = leaf; 71 | data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); 72 | double tmp_sum_gradients = 0.0f; 73 | double tmp_sum_hessians = 0.0f; 74 | #pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians) 75 | for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { 76 | data_size_t idx = data_indices_[i]; 77 | tmp_sum_gradients += gradients[idx]; 78 | tmp_sum_hessians += hessians[idx]; 79 | } 80 | sum_gradients_ = tmp_sum_gradients; 81 | sum_hessians_ = tmp_sum_hessians; 82 | } 83 | 84 | 85 | /*! 86 | * \brief Init splits on the current leaf, only updating sum_gradients and sum_hessians 87 | * \param sum_gradients 88 | * \param sum_hessians 89 | */ 90 | void Init(double sum_gradients, double sum_hessians) { 91 | leaf_index_ = 0; 92 | sum_gradients_ = sum_gradients; 93 | sum_hessians_ = sum_hessians; 94 | } 95 | 96 | /*! 97 | * \brief Init splits on the current leaf 98 | */ 99 | void Init() { 100 | leaf_index_ = -1; 101 | data_indices_ = nullptr; 102 | num_data_in_leaf_ = 0; 103 | } 104 | 105 | 106 | /*! \brief Get current leaf index */ 107 | int LeafIndex() const { return leaf_index_; } 108 | 109 | /*! \brief Get number of data in current leaf */ 110 | data_size_t num_data_in_leaf() const { return num_data_in_leaf_; } 111 | 112 | /*! \brief Get sum of gradients of current leaf */ 113 | double sum_gradients() const { return sum_gradients_; } 114 | 115 | /*! \brief Get sum of hessians of current leaf */ 116 | double sum_hessians() const { return sum_hessians_; } 117 | 118 | /*! \brief Get indices of data of current leaf */ 119 | const data_size_t* data_indices() const { return data_indices_; } 120 | 121 | 122 | private: 123 | /*! \brief current leaf index */ 124 | int leaf_index_; 125 | /*! \brief number of data on current leaf */ 126 | data_size_t num_data_in_leaf_; 127 | /*! \brief number of all training data */ 128 | data_size_t num_data_; 129 | /*! \brief sum of gradients of current leaf */ 130 | double sum_gradients_; 131 | /*!
\brief sum of hessians of current leaf */ 132 | double sum_hessians_; 133 | /*! \brief indices of data of current leaf */ 134 | const data_size_t* data_indices_; 135 | }; 136 | 137 | } // namespace LightGBM 138 | #endif // LightGBM_TREELEARNER_LEAF_SPLITS_HPP_ 139 | -------------------------------------------------------------------------------- /src/metric/dcg_calculator.cpp: -------------------------------------------------------------------------------- 1 | #include <LightGBM/metric.h> 2 | 3 | #include <LightGBM/utils/log.h> 4 | 5 | #include <algorithm> 6 | 7 | #include <cmath> 8 | #include <vector> 9 | 10 | namespace LightGBM { 11 | 12 | /*! \brief Declaration for some static members */ 13 | std::vector<double> DCGCalculator::label_gain_; 14 | std::vector<double> DCGCalculator::discount_; 15 | const data_size_t DCGCalculator::kMaxPosition = 10000; 16 | 17 | void DCGCalculator::Init(std::vector<double> input_label_gain) { 18 | label_gain_.resize(input_label_gain.size()); 19 | for (size_t i = 0; i < input_label_gain.size(); ++i) { 20 | label_gain_[i] = static_cast<double>(input_label_gain[i]); 21 | } 22 | discount_.resize(kMaxPosition); 23 | for (data_size_t i = 0; i < kMaxPosition; ++i) { 24 | discount_[i] = 1.0f / std::log2(2.0f + i); 25 | } 26 | } 27 | 28 | double DCGCalculator::CalMaxDCGAtK(data_size_t k, const float* label, data_size_t num_data) { 29 | double ret = 0.0f; 30 | // counts for all labels 31 | std::vector<data_size_t> label_cnt(label_gain_.size(), 0); 32 | for (data_size_t i = 0; i < num_data; ++i) { 33 | ++label_cnt[static_cast<int>(label[i])]; 34 | } 35 | int top_label = static_cast<int>(label_gain_.size()) - 1; 36 | 37 | if (k > num_data) { k = num_data; } 38 | // start from top label, and accumulate DCG 39 | for (data_size_t j = 0; j < k; ++j) { 40 | while (top_label > 0 && label_cnt[top_label] <= 0) { 41 | top_label -= 1; 42 | } 43 | if (top_label < 0) { 44 | break; 45 | } 46 | ret += discount_[j] * label_gain_[top_label]; 47 | label_cnt[top_label] -= 1; 48 | } 49 | return ret; 50 | } 51 | 52 | void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks, 53 | const float* label, 54 | data_size_t num_data, 55 | std::vector<double>* out) { 56 | std::vector<data_size_t> label_cnt(label_gain_.size(), 0); 57 | // counts for all labels 58 | for (data_size_t i = 0; i < num_data; ++i) { 59 | if (static_cast<size_t>(label[i]) >= label_cnt.size()) { Log::Fatal("Label %f exceeds the number of configured label gains", label[i]); } 60 | ++label_cnt[static_cast<int>(label[i])]; 61 | } 62 | double cur_result = 0.0f; 63 | data_size_t cur_left = 0; 64 | int top_label = static_cast<int>(label_gain_.size()) - 1; 65 | // calculate k Max DCG by one pass 66 | for (size_t i = 0; i < ks.size(); ++i) { 67 | data_size_t cur_k = ks[i]; 68 | if (cur_k > num_data) { cur_k = num_data; } 69 | for (data_size_t j = cur_left; j < cur_k; ++j) { 70 | while (top_label > 0 && label_cnt[top_label] <= 0) { 71 | top_label -= 1; 72 | } 73 | if (top_label < 0) { 74 | break; 75 | } 76 | cur_result += discount_[j] * label_gain_[top_label]; 77 | label_cnt[top_label] -= 1; 78 | } 79 | (*out)[i] = cur_result; 80 | cur_left = cur_k; 81 | } 82 | } 83 | 84 | 85 | double DCGCalculator::CalDCGAtK(data_size_t k, const float* label, 86 | const double* score, data_size_t num_data) { 87 | // get sorted indices by score 88 | std::vector<data_size_t> sorted_idx; 89 | for (data_size_t i = 0; i < num_data; ++i) { 90 | sorted_idx.emplace_back(i); 91 | } 92 | std::sort(sorted_idx.begin(), sorted_idx.end(), 93 | [score](data_size_t a, data_size_t b) {return score[a] > score[b]; }); 94 | 95 | if (k > num_data) { k = num_data; } 96 | double dcg = 0.0f; 97 | // calculate dcg 98 | for (data_size_t i = 0; i < k; ++i) { 99 | data_size_t idx = sorted_idx[i]; 
100 | dcg += label_gain_[static_cast<int>(label[idx])] * discount_[i]; 101 | } 102 | return dcg; 103 | } 104 | 105 | void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const float* label, 106 | const double* score, data_size_t num_data, std::vector<double>* out) { 107 | // get sorted indices by score 108 | std::vector<data_size_t> sorted_idx; 109 | for (data_size_t i = 0; i < num_data; ++i) { 110 | sorted_idx.emplace_back(i); 111 | } 112 | std::sort(sorted_idx.begin(), sorted_idx.end(), 113 | [score](data_size_t a, data_size_t b) {return score[a] > score[b]; }); 114 | 115 | double cur_result = 0.0f; 116 | data_size_t cur_left = 0; 117 | // calculate multi dcg by one pass 118 | for (size_t i = 0; i < ks.size(); ++i) { 119 | data_size_t cur_k = ks[i]; 120 | if (cur_k > num_data) { cur_k = num_data; } 121 | for (data_size_t j = cur_left; j < cur_k; ++j) { 122 | data_size_t idx = sorted_idx[j]; 123 | cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j]; 124 | } 125 | (*out)[i] = cur_result; 126 | cur_left = cur_k; 127 | } 128 | } 129 | 130 | } // namespace LightGBM 131 | --------------------------------------------------------------------------------
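The two primitives above combine into NDCG; a hedged usage sketch (it assumes DCGCalculator::Init has already been called with the label gains, and treating a query with zero max DCG as perfectly ranked is one common convention, not necessarily the one LightGBM's NDCG metric uses):

```cpp
#include <LightGBM/metric.h>

// NDCG@k for one query: DCG of the predicted order divided by the best
// achievable DCG for these labels (data_size_t is an int typedef in meta.h).
double NdcgAtK(int k, const float* label, const double* score, int num_data) {
  double max_dcg = LightGBM::DCGCalculator::CalMaxDCGAtK(k, label, num_data);
  if (max_dcg <= 0.0) {
    return 1.0;  // assumption: a query with no positive labels counts as perfect
  }
  double dcg = LightGBM::DCGCalculator::CalDCGAtK(k, label, score, num_data);
  return dcg / max_dcg;
}
```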