├── VERSION.txt
├── R-package
├── src
│ ├── Makevars
│ └── Makevars.win
├── .Rbuildignore
├── data
│ ├── bank.rda
│ ├── agaricus.test.rda
│ └── agaricus.train.rda
├── tests
│ ├── testthat.R
│ └── testthat
│ │ ├── test_custom_objective.R
│ │ ├── test_parameters.R
│ │ ├── test_dataset.R
│ │ └── test_basic.R
├── man
│ ├── lgb.Dataset.construct.Rd
│ ├── lgb.Dataset.save.Rd
│ ├── bank.Rd
│ ├── lgb.Dataset.set.categorical.Rd
│ ├── slice.Rd
│ ├── lgb.Dataset.set.reference.Rd
│ ├── agaricus.test.Rd
│ ├── agaricus.train.Rd
│ ├── dim.Rd
│ ├── lgb.Dataset.create.valid.Rd
│ ├── lgb.dump.Rd
│ ├── dimnames.lgb.Dataset.Rd
│ ├── lgb.save.Rd
│ ├── readRDS.lgb.Booster.Rd
│ ├── getinfo.Rd
│ ├── setinfo.Rd
│ ├── lgb.Dataset.Rd
│ ├── lgb.get.eval.result.Rd
│ ├── lgb.importance.Rd
│ ├── lgb.load.Rd
│ ├── lgb.plot.importance.Rd
│ ├── lgb.interprete.Rd
│ ├── lgb.unloader.Rd
│ ├── lgb.plot.interpretation.Rd
│ ├── lgb.prepare.Rd
│ ├── saveRDS.lgb.Booster.Rd
│ ├── lgb.model.dt.tree.Rd
│ ├── lgb.prepare2.Rd
│ ├── predict.lgb.Booster.Rd
│ ├── lgb.prepare_rules.Rd
│ └── lgb.prepare_rules2.Rd
├── demo
│ ├── 00Index
│ ├── boost_from_prediction.R
│ ├── efficient_many_training.R
│ ├── cross_validation.R
│ ├── early_stopping.R
│ ├── multiclass.R
│ ├── multiclass_custom_objective.R
│ └── categorical_features_prepare.R
├── build_package.R
├── LICENSE
├── NAMESPACE
├── R
│ ├── readRDS.lgb.Booster.R
│ ├── lgb.importance.R
│ ├── lgb.unloader.R
│ ├── saveRDS.lgb.Booster.R
│ └── lgb.plot.importance.R
└── DESCRIPTION
├── examples
├── .gitignore
├── parallel_learning
│ ├── mlist.txt
│ ├── predict.conf
│ └── README.md
├── lambdarank
│ ├── predict.conf
│ ├── rank.test.query
│ ├── README.md
│ └── rank.train.query
├── regression
│ ├── predict.conf
│ └── README.md
├── binary_classification
│ ├── predict.conf
│ ├── forced_splits.json
│ └── README.md
├── multiclass_classification
│ ├── predict.conf
│ ├── README.md
│ └── train.conf
└── python-guide
│ ├── simple_example.py
│ ├── plot_example.py
│ ├── sklearn_example.py
│ └── README.md
├── .gitmodules
├── docs
├── _static
│ ├── images
│ │ ├── gcc-bars.png
│ │ ├── gcc-chart.png
│ │ ├── gcc-table.png
│ │ ├── leaf-wise.png
│ │ ├── level-wise.png
│ │ ├── gcc-meetup-1.png
│ │ ├── gcc-meetup-2.png
│ │ ├── gcc-comparison-1.png
│ │ ├── gcc-comparison-2.png
│ │ ├── screenshot-system.png
│ │ ├── screenshot-use-gpu.png
│ │ ├── screenshot-debug-run.png
│ │ ├── screenshot-r-mingw-used.png
│ │ ├── screenshot-boost-compiled.png
│ │ ├── gpu-performance-comparison.png
│ │ ├── screenshot-create-directory.png
│ │ ├── screenshot-downloading-cmake.png
│ │ ├── screenshot-files-to-remove.png
│ │ ├── screenshot-git-for-windows.png
│ │ ├── screenshot-configured-lightgbm.png
│ │ ├── screenshot-mingw-installation.png
│ │ ├── screenshot-segmentation-fault.png
│ │ ├── screenshot-environment-variables.png
│ │ ├── screenshot-mingw-makefiles-to-use.png
│ │ ├── screenshot-advanced-system-settings.png
│ │ ├── screenshot-lightgbm-in-cli-with-gpu.png
│ │ ├── screenshot-added-manual-entry-in-cmake.png
│ │ ├── screenshot-configured-and-generated-cmake.png
│ │ └── screenshot-lightgbm-with-gpu-support-compiled.png
│ └── js
│ │ └── script.js
├── .linkcheckerrc
├── Makefile
├── make.bat
├── README.rst
├── Python-API.rst
├── index.rst
├── gcc-Tips.rst
├── Advanced-Topics.rst
├── Parameters-Tuning.rst
└── Quick-Start.rst
├── tests
├── cpp_test
│ ├── predict.conf
│ ├── train.conf
│ └── test.py
└── python_package_test
│ └── test_basic.py
├── pmml
└── README.md
├── python-package
├── MANIFEST.in
└── lightgbm
│ ├── __init__.py
│ └── libpath.py
├── docker
├── dockerfile-cli
├── dockerfile-python
├── gpu
│ └── README.md
└── README.md
├── .github
└── ISSUE_TEMPLATE.md
├── include
└── LightGBM
│ ├── export.h
│ ├── prediction_early_stop.h
│ ├── utils
│ ├── threading.h
│ ├── openmp_wrapper.h
│ ├── file_io.h
│ ├── pipeline_reader.h
│ ├── log.h
│ └── random.h
│ ├── meta.h
│ ├── application.h
│ ├── objective_function.h
│ ├── tree_learner.h
│ └── dataset_loader.h
├── src
├── main.cpp
├── network
│ └── linkers_mpi.cpp
├── treelearner
│ ├── tree_learner.cpp
│ └── feature_parallel_tree_learner.cpp
├── boosting
│ ├── boosting.cpp
│ ├── prediction_early_stop.cpp
│ └── gbdt_prediction.cpp
└── metric
│ └── metric.cpp
├── LICENSE
├── windows
└── LightGBM.sln
├── .travis
└── setup.sh
├── .travis.yml
├── .nuget
└── create_nuget.py
├── swig
└── lightgbmlib.i
└── .appveyor.yml

/VERSION.txt:
--------------------------------------------------------------------------------
1 | 2.1.2
2 |
--------------------------------------------------------------------------------
/R-package/src/Makevars:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/R-package/src/Makevars.win:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt
2 |
--------------------------------------------------------------------------------
/R-package/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^build_package.R$
2 |
--------------------------------------------------------------------------------
/examples/parallel_learning/mlist.txt:
--------------------------------------------------------------------------------
1 | 192.168.1.101 12400
2 | 192.168.1.102 12400
3 |
--------------------------------------------------------------------------------
/R-package/data/bank.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/R-package/data/bank.rda
--------------------------------------------------------------------------------
/R-package/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(lightgbm)
3 |
4 | test_check("lightgbm")
5 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "include/boost/compute"]
2 | path = compute
3 | url = https://github.com/boostorg/compute
4 |
--------------------------------------------------------------------------------
/R-package/data/agaricus.test.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/R-package/data/agaricus.test.rda
--------------------------------------------------------------------------------
/docs/_static/images/gcc-bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-bars.png
--------------------------------------------------------------------------------
/R-package/data/agaricus.train.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/R-package/data/agaricus.train.rda
--------------------------------------------------------------------------------
/docs/_static/images/gcc-chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-chart.png
--------------------------------------------------------------------------------
/docs/_static/images/gcc-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-table.png
--------------------------------------------------------------------------------
/docs/_static/images/leaf-wise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/leaf-wise.png
--------------------------------------------------------------------------------
/docs/_static/images/level-wise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/level-wise.png
--------------------------------------------------------------------------------
/examples/lambdarank/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = rank.test
5 |
6 | input_model= LightGBM_model.txt
7 |
--------------------------------------------------------------------------------
/tests/cpp_test/predict.conf:
--------------------------------------------------------------------------------
1 | data=../data/categorical.data
2 |
3 | input_model=LightGBM_model.txt
4 |
5 | task=predict
6 |
--------------------------------------------------------------------------------
/docs/_static/images/gcc-meetup-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-meetup-1.png
--------------------------------------------------------------------------------
/docs/_static/images/gcc-meetup-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-meetup-2.png
--------------------------------------------------------------------------------
/examples/regression/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = regression.test
5 |
6 | input_model= LightGBM_model.txt
7 |
--------------------------------------------------------------------------------
/docs/_static/images/gcc-comparison-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-comparison-1.png
--------------------------------------------------------------------------------
/docs/_static/images/gcc-comparison-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-comparison-2.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-system.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-system.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-use-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-use-gpu.png
--------------------------------------------------------------------------------
/examples/binary_classification/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = binary.test
5 |
6 | input_model= LightGBM_model.txt
7 |
--------------------------------------------------------------------------------
/examples/parallel_learning/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = binary.test
5 |
6 | input_model= LightGBM_model.txt
7 |
8 |
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-debug-run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-debug-run.png
--------------------------------------------------------------------------------
/examples/multiclass_classification/predict.conf:
--------------------------------------------------------------------------------
1 | task = predict
2 |
3 | data = multiclass.test
4 |
5 | input_model= LightGBM_model.txt
6 |
--------------------------------------------------------------------------------
/tests/cpp_test/train.conf:
--------------------------------------------------------------------------------
1 | data=../data/categorical.data
2 |
3 | app=binary
4 |
5 | num_trees=10
6 |
7 | categorical_column=0,1,4,5,6
8 |
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-r-mingw-used.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-r-mingw-used.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-boost-compiled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-boost-compiled.png
--------------------------------------------------------------------------------
/docs/_static/images/gpu-performance-comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gpu-performance-comparison.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-create-directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-create-directory.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-downloading-cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-downloading-cmake.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-files-to-remove.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-files-to-remove.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-git-for-windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-git-for-windows.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-configured-lightgbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-configured-lightgbm.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-mingw-installation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-mingw-installation.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-segmentation-fault.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-segmentation-fault.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-environment-variables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-environment-variables.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-mingw-makefiles-to-use.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-mingw-makefiles-to-use.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-advanced-system-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-advanced-system-settings.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-lightgbm-in-cli-with-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-lightgbm-in-cli-with-gpu.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-added-manual-entry-in-cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-added-manual-entry-in-cmake.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-configured-and-generated-cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-configured-and-generated-cmake.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-lightgbm-with-gpu-support-compiled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-lightgbm-with-gpu-support-compiled.png
--------------------------------------------------------------------------------
/tests/cpp_test/test.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import glob
3 | import numpy as np
4 |
5 | preds = [np.loadtxt(name) for name in glob.glob('*.pred')]
6 | np.testing.assert_array_almost_equal(preds[0], preds[1], decimal=5)
7 |
--------------------------------------------------------------------------------
/pmml/README.md:
--------------------------------------------------------------------------------
1 | PMML Generator
2 | ==============
3 |
4 | The old Python conversion script has been removed because it cannot support the new format of categorical features.
5 |
6 | Please refer to https://github.com/jpmml/jpmml-lightgbm.
7 |
--------------------------------------------------------------------------------
/examples/binary_classification/forced_splits.json:
--------------------------------------------------------------------------------
1 | {
2 |   "feature": 25,
3 |   "threshold": 1.30,
4 |   "left": {
5 |     "feature": 26,
6 |     "threshold": 0.85
7 |   },
8 |   "right": {
9 |     "feature": 26,
10 |     "threshold": 0.85
11 |   }
12 | }
13 |
--------------------------------------------------------------------------------
/docs/.linkcheckerrc:
--------------------------------------------------------------------------------
1 | [checking]
2 | recursionlevel=1
3 | anchors=1
4 | sslverify=0
5 |
6 | [filtering]
7 | ignore=public.tableau.com
8 | ignorewarnings=http-robots-denied,https-certificate-error
9 |
10 | [output]
11 | # Set to 1 if you want to see the full output, not only warnings and errors
12 | verbose=0
13 |
14 | [AnchorCheck]
15 |
--------------------------------------------------------------------------------
/docs/_static/js/script.js:
--------------------------------------------------------------------------------
1 | $(function() {
2 |     $('a[href^="./"][href*=".rst"]').attr('href', (i, val) => { return val.replace('.rst', '.html'); });  /* Replace '.rst' with '.html' in all internal links like './[Something].rst[#anchor]' */
3 |     $('.wy-nav-content').each(function () { this.style.setProperty('max-width', 'none', 'important'); });
4 | });
5 |
--------------------------------------------------------------------------------
/examples/lambdarank/rank.test.query:
--------------------------------------------------------------------------------
1 | 12
2 | 19
3 | 18
4 | 10
5 | 15
6 | 15
7 | 22
8 | 23
9 | 18
10 | 16
11 | 16
12 | 11
13 | 6
14 | 13
15 | 17
16 | 21
17 | 20
18 | 16
19 | 13
20 | 16
21 | 21
22 | 15
23 | 10
24 | 19
25 | 10
26 | 13
27 | 18
28 | 17
29 | 23
30 | 24
31 | 16
32 | 13
33 | 17
34 | 24
35 | 17
36 | 10
37 | 17
38 | 15
39 | 18
40 | 16
41 | 9
42 | 9
43 | 21
44 | 14
45 | 13
46 | 13
47 | 13
48 | 10
49 | 10
50 | 6
51 |
--------------------------------------------------------------------------------
/python-package/MANIFEST.in:
--------------------------------------------------------------------------------
1 | prune build
2 | include LICENSE
3 | include *.rst *.txt
4 | recursive-include lightgbm *.py *.txt *.so
5 | recursive-include compile *.txt *.so
6 | recursive-include compile/Release *.dll
7 | recursive-include compile/compute *
8 | recursive-include compile/include *
9 | recursive-include compile/src *
10 | recursive-include compile/windows LightGBM.sln LightGBM.vcxproj
11 | recursive-include compile/windows/x64/DLL *.dll
12 | global-exclude *.py[co]
13 |
--------------------------------------------------------------------------------
/docker/dockerfile-cli:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && \
4 |     apt-get install -y cmake build-essential gcc g++ git && \
5 |     rm -rf /var/lib/apt/lists/*
6 |
7 | RUN git clone --recursive --branch stable https://github.com/Microsoft/LightGBM && \
8 |     mkdir LightGBM/build && \
9 |     cd LightGBM/build && \
10 |     cmake .. && \
11 |     make -j4 && \
12 |     make install && \
13 |     cd ../.. && \
14 |     rm -rf LightGBM
15 |
16 | ENTRYPOINT ["lightgbm"]
17 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please search for your question in previous issues, on [stackoverflow](https://stackoverflow.com/questions/tagged/lightgbm), or with other search engines before you open a new one.
2 |
3 | For bugs and unexpected issues, please provide the following information, so that we can reproduce the problem on our system.
4 |
5 | ## Environment info
6 | Operating System:
7 | CPU:
8 | C++/Python/R version:
9 |
10 | ## Error Message:
11 |
12 | ## Reproducible examples
13 |
14 | ## Steps to reproduce
15 |
16 | 1.
17 | 2.
18 | 3.
19 |
--------------------------------------------------------------------------------
/examples/lambdarank/README.md:
--------------------------------------------------------------------------------
1 | LambdaRank Example
2 | ==================
3 |
4 | Here is an example of using LightGBM to run a LambdaRank task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/examples/regression/README.md:
--------------------------------------------------------------------------------
1 | Regression Example
2 | ==================
3 |
4 | Here is an example of using LightGBM to run a regression task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/include/LightGBM/export.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_EXPORT_H_
2 | #define LIGHTGBM_EXPORT_H_
3 |
4 | /** Macros for exporting symbols in MSVC/GCC/CLANG **/
5 |
6 | #ifdef __cplusplus
7 | #define LIGHTGBM_EXTERN_C extern "C"
8 | #else
9 | #define LIGHTGBM_EXTERN_C
10 | #endif
11 |
12 |
13 | #ifdef _MSC_VER
14 | #define LIGHTGBM_EXPORT __declspec(dllexport)
15 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
16 | #else
17 | #define LIGHTGBM_EXPORT
18 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
19 | #endif
20 |
21 | #endif /** LIGHTGBM_EXPORT_H_ **/
22 |
--------------------------------------------------------------------------------
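A minimal usage sketch for the export macros above (not a file from this repository; the function name is hypothetical and not part of LightGBM's actual C API):

```cpp
// Sketch: declaring an exported, C-linkage entry point with the macros above.
// LGBM_HypotheticalAnswer is a made-up name used only for illustration.
#include <LightGBM/export.h>

LIGHTGBM_C_EXPORT int LGBM_HypotheticalAnswer();

int LGBM_HypotheticalAnswer() {
  // On MSVC the symbol is marked __declspec(dllexport); elsewhere it is
  // plain extern "C", so other languages can resolve it by name.
  return 42;
}
```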
/examples/binary_classification/README.md:
--------------------------------------------------------------------------------
1 | Binary Classification Example
2 | =============================
3 |
4 | Here is an example of using LightGBM to run a binary classification task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/examples/multiclass_classification/README.md:
--------------------------------------------------------------------------------
1 | Multiclass Classification Example
2 | =================================
3 |
4 | Here is an example of using LightGBM to run a multiclass classification task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <LightGBM/application.h>
3 |
4 | int main(int argc, char** argv) {
5 |   try {
6 |     LightGBM::Application app(argc, argv);
7 |     app.Run();
8 |   }
9 |   catch (const std::exception& ex) {
10 |     std::cerr << "Met Exceptions:" << std::endl;
11 |     std::cerr << ex.what() << std::endl;
12 |     exit(-1);
13 |   }
14 |   catch (const std::string& ex) {
15 |     std::cerr << "Met Exceptions:" << std::endl;
16 |     std::cerr << ex << std::endl;
17 |     exit(-1);
18 |   }
19 |   catch (...) {
20 |     std::cerr << "Unknown Exceptions" << std::endl;
21 |     exit(-1);
22 |   }
23 | }
24 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.construct.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.construct}
4 | \alias{lgb.Dataset.construct}
5 | \title{Construct Dataset explicitly}
6 | \usage{
7 | lgb.Dataset.construct(dataset)
8 | }
9 | \arguments{
10 | \item{dataset}{Object of class \code{lgb.Dataset}}
11 | }
12 | \description{
13 | Construct Dataset explicitly
14 | }
15 | \examples{
16 | \dontrun{
17 | library(lightgbm)
18 | data(agaricus.train, package = "lightgbm")
19 | train <- agaricus.train
20 | dtrain <- lgb.Dataset(train$data, label = train$label)
21 | lgb.Dataset.construct(dtrain)
22 | }
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/R-package/demo/00Index:
--------------------------------------------------------------------------------
1 | basic_walkthrough	Basic feature walkthrough
2 | boost_from_prediction	Boosting from existing prediction
3 | categorical_feature_prepare	Categorical Feature Preparation
4 | categorical_feature_rules	Categorical Feature Preparation with Rules
5 | cross_validation	Cross Validation
6 | early_stopping	Early Stop in training
7 | efficient_many_training	Efficiency for Many Model Trainings
8 | multiclass	Multiclass training/prediction
9 | leaf_stability	Leaf (in)Stability example
10 | weight_param	Weight-Parameter adjustment relationship
11 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    = -W
6 | SPHINXBUILD   = sphinx-build
7 | SPHINXPROJ    = LightGBM
8 | SOURCEDIR     = .
9 | BUILDDIR      = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.save}
4 | \alias{lgb.Dataset.save}
5 | \title{Save \code{lgb.Dataset} to a binary file}
6 | \usage{
7 | lgb.Dataset.save(dataset, fname)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 |
12 | \item{fname}{filename of the output file}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | Save \code{lgb.Dataset} to a binary file
19 | }
20 | \examples{
21 |
22 | \dontrun{
23 | library(lightgbm)
24 | data(agaricus.train, package = "lightgbm")
25 | train <- agaricus.train
26 | dtrain <- lgb.Dataset(train$data, label = train$label)
27 | lgb.Dataset.save(dtrain, "data.bin")
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/docker/dockerfile-python:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && \
4 |     apt-get install -y cmake build-essential gcc g++ git wget && \
5 |
6 | # python-package
7 | # miniconda
8 |     wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
9 |     /bin/bash Miniconda3-latest-Linux-x86_64.sh -f -b -p /opt/conda && \
10 |     export PATH="/opt/conda/bin:$PATH" && \
11 | # lightgbm
12 |     conda install -y numpy scipy scikit-learn pandas && \
13 |     git clone --recursive https://github.com/Microsoft/LightGBM && \
14 |     cd LightGBM/python-package && python setup.py install && \
15 |
16 | # clean
17 |     apt-get autoremove -y && apt-get clean && \
18 |     conda clean -i -l -t -y && \
19 |     rm -rf /usr/local/src/*
20 |
21 | ENV PATH /opt/conda/bin:$PATH
22 |
--------------------------------------------------------------------------------
/R-package/man/bank.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{bank}
5 | \alias{bank}
6 | \title{Bank Marketing Data Set}
7 | \format{A data.table with 4521 rows and 17 variables}
8 | \usage{
9 | data(bank)
10 | }
11 | \description{
12 | This data set is originally from the Bank Marketing data set,
13 | UCI Machine Learning Repository.
14 | }
15 | \details{
16 | It contains only the following: bank.csv with 10% of the examples and 17 inputs,
17 | randomly selected from 3 (older version of this dataset with fewer inputs).
18 | }
19 | \references{
20 | http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
21 |
22 | S. Moro, P. Cortez and P. Rita. (2014)
23 | A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems
24 | }
25 | \keyword{datasets}
26 |
--------------------------------------------------------------------------------
/examples/parallel_learning/README.md:
--------------------------------------------------------------------------------
1 | Parallel Learning Example
2 | =========================
3 |
4 | Here is an example of using LightGBM to perform parallel learning across 2 machines.
5 |
6 | 1. Edit [mlist.txt](./mlist.txt): write the IPs of the 2 machines that you want to run the application on.
7 |
8 | ```
9 | machine1_ip 12400
10 | machine2_ip 12400
11 | ```
12 |
13 | 2. Copy this folder and the executable file to the 2 machines that you want to run the application on.
14 |
15 | 3. Run this command in this folder on both machines:
16 |
17 | ```"./lightgbm" config=train.conf```
18 |
19 | This parallel learning example is based on sockets. LightGBM also supports parallel learning based on MPI.
20 |
21 | For more details about the usage of parallel learning, please refer to [this guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst).
22 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.set.categorical.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.set.categorical}
4 | \alias{lgb.Dataset.set.categorical}
5 | \title{Set categorical feature of \code{lgb.Dataset}}
6 | \usage{
7 | lgb.Dataset.set.categorical(dataset, categorical_feature)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 |
12 | \item{categorical_feature}{categorical features}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | Set categorical feature of \code{lgb.Dataset}
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | lgb.Dataset.save(dtrain, "lgb.Dataset.data")
27 | dtrain <- lgb.Dataset("lgb.Dataset.data")
28 | lgb.Dataset.set.categorical(dtrain, 1:2)
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=LightGBM
13 | set SPHINXOPTS=-W
14 |
15 | if "%1" == "" goto help
16 |
17 | %SPHINXBUILD% >NUL 2>NUL
18 | if errorlevel 9009 (
19 | 	echo.
20 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
21 | 	echo.installed, then set the SPHINXBUILD environment variable to point
22 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
23 | 	echo.may add the Sphinx directory to PATH.
24 | 	echo.
25 | 	echo.If you don't have Sphinx installed, grab it from
26 | 	echo.http://sphinx-doc.org/
27 | 	exit /b 1
28 | )
29 |
30 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
31 | goto end
32 |
33 | :help
34 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
35 |
36 | :end
37 | popd
38 |
--------------------------------------------------------------------------------
/docs/README.rst:
--------------------------------------------------------------------------------
1 | Documentation
2 | =============
3 |
4 | Documentation for LightGBM is generated using `Sphinx `__.
5 |
6 | List of parameters and their descriptions in `Parameters.rst <./Parameters.rst>`__
7 | is generated automatically from comments in `config file `__
8 | by `this script `__.
9 |
10 | After each commit on ``master``, documentation is updated and published to `Read the Docs `__.
11 |
12 | Build
13 | -----
14 |
15 | You can build the documentation locally. Just run the following in the ``docs`` folder
16 |
17 | for Python 3.x:
18 |
19 | .. code:: sh
20 |
21 |     pip install sphinx "sphinx_rtd_theme>=0.3"
22 |     make html
23 |
24 |
25 | for Python 2.x:
26 |
27 | .. code:: sh
28 |
29 |     pip install mock sphinx "sphinx_rtd_theme>=0.3"
30 |     make html
31 |
--------------------------------------------------------------------------------
/R-package/man/slice.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{slice}
4 | \alias{slice}
5 | \alias{slice.lgb.Dataset}
6 | \title{Slice a dataset}
7 | \usage{
8 | slice(dataset, ...)
9 |
10 | \method{slice}{lgb.Dataset}(dataset, idxset, ...)
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class "lgb.Dataset"}
14 |
15 | \item{...}{other parameters (currently not used)}
16 |
17 | \item{idxset}{an integer vector of indices of the rows needed}
18 | }
19 | \value{
20 | constructed sub-dataset
21 | }
22 | \description{
23 | Get a new \code{lgb.Dataset} containing the specified rows of the
24 | original lgb.Dataset object
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 |
33 | dsub <- lightgbm::slice(dtrain, 1:42)
34 | labels <- lightgbm::getinfo(dsub, "label")
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.set.reference.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.set.reference}
4 | \alias{lgb.Dataset.set.reference}
5 | \title{Set reference of \code{lgb.Dataset}}
6 | \usage{
7 | lgb.Dataset.set.reference(dataset, reference)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 |
12 | \item{reference}{object of class \code{lgb.Dataset}}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | If you want to use validation data, you should set its reference to the training data
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset(test$data, label = test$label)
29 | lgb.Dataset.set.reference(dtest, dtrain)
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/network/linkers_mpi.cpp:
--------------------------------------------------------------------------------
1 | #ifdef USE_MPI
2 | #include "linkers.h"
3 |
4 | namespace LightGBM {
5 |
6 | Linkers::Linkers(Config) {
7 |   is_init_ = false;
8 |   int argc = 0;
9 |   char**argv = nullptr;
10 |   int flag = 0;
11 |   MPI_SAFE_CALL(MPI_Initialized(&flag));  // test if MPI has been initialized
12 |   if (!flag) {  // if MPI is not started, start it
13 |     MPI_SAFE_CALL(MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &flag));
14 |   }
15 |   MPI_SAFE_CALL(MPI_Comm_size(MPI_COMM_WORLD, &num_machines_));
16 |   MPI_SAFE_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank_));
17 |   // wait for all clients to start up
18 |   MPI_SAFE_CALL(MPI_Barrier(MPI_COMM_WORLD));
19 |   bruck_map_ = BruckMap::Construct(rank_, num_machines_);
20 |   recursive_halving_map_ = RecursiveHalvingMap::Construct(rank_, num_machines_);
21 |   is_init_ = true;
22 | }
23 |
24 | Linkers::~Linkers() {
25 |   if (is_init_) {
26 |     MPI_SAFE_CALL(MPI_Finalize());
27 |   }
28 | }
29 |
30 |
31 | }  // namespace LightGBM
32 | #endif  // USE_MPI
33 |
--------------------------------------------------------------------------------
/R-package/build_package.R:
--------------------------------------------------------------------------------
1 | unlink("./src/include", recursive = TRUE)
2 | unlink("./src/src", recursive = TRUE)
3 | unlink("./src/compute", recursive = TRUE)
4 | unlink("./src/build", recursive = TRUE)
5 | unlink("./src/Release", recursive = TRUE)
6 | if (!file.copy("./../include", "./src/", overwrite = TRUE, recursive = TRUE)) {
7 |   stop("Cannot find folder LightGBM/include")
8 | }
9 | if (!file.copy("./../src", "./src/", overwrite = TRUE, recursive = TRUE)) {
10 |   stop("Cannot find folder LightGBM/src")
11 | }
12 | if (!file.copy("./../compute", "./src/", overwrite = TRUE, recursive = TRUE)) {
13 |   print("Cannot find folder LightGBM/compute, will disable GPU build")
14 | }
15 | if (!file.copy("./../CMakeLists.txt", "./src/", overwrite = TRUE, recursive = TRUE)) {
16 |   stop("Cannot find file LightGBM/CMakeLists.txt")
17 | }
18 | if (!file.exists("./src/_IS_FULL_PACKAGE")) {
19 |   file.create("./src/_IS_FULL_PACKAGE")
20 | }
21 | system("R CMD build --no-build-vignettes .")
22 | file.remove("./src/_IS_FULL_PACKAGE")
23 |
--------------------------------------------------------------------------------
/R-package/man/agaricus.test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{agaricus.test}
5 | \alias{agaricus.test}
6 | \title{Test part from Mushroom Data Set}
7 | \format{A list containing a label vector, and a dgCMatrix object with 1611
8 | rows and 126 variables}
9 | \usage{
10 | data(agaricus.test)
11 | }
12 | \description{
13 | This data set is originally from the Mushroom data set,
14 | UCI Machine Learning Repository.
15 | }
16 | \details{
17 | This data set includes the following fields:
18 |
19 | \itemize{
20 |     \item \code{label} the label for each record
21 |     \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
22 | }
23 | }
24 | \references{
25 | https://archive.ics.uci.edu/ml/datasets/Mushroom
26 |
27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
29 | School of Information and Computer Science.
30 | }
31 | \keyword{datasets}
32 |
--------------------------------------------------------------------------------
/R-package/man/agaricus.train.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{agaricus.train}
5 | \alias{agaricus.train}
6 | \title{Training part from Mushroom Data Set}
7 | \format{A list containing a label vector, and a dgCMatrix object with 6513
8 | rows and 127 variables}
9 | \usage{
10 | data(agaricus.train)
11 | }
12 | \description{
13 | This data set is originally from the Mushroom data set,
14 | UCI Machine Learning Repository.
15 | }
16 | \details{
17 | This data set includes the following fields:
18 |
19 | \itemize{
20 |     \item \code{label} the label for each record
21 |     \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
22 | }
23 | }
24 | \references{
25 | https://archive.ics.uci.edu/ml/datasets/Mushroom
26 |
27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
29 | School of Information and Computer Science.
30 | }
31 | \keyword{datasets}
32 |
--------------------------------------------------------------------------------
/R-package/man/dim.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{dim.lgb.Dataset}
4 | \alias{dim.lgb.Dataset}
5 | \title{Dimensions of an lgb.Dataset}
6 | \usage{
7 | \method{dim}{lgb.Dataset}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{Object of class \code{lgb.Dataset}}
11 |
12 | \item{...}{other parameters}
13 | }
14 | \value{
15 | a vector of numbers of rows and of columns
16 | }
17 | \description{
18 | Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
19 | }
20 | \details{
21 | Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
22 | be directly used with an \code{lgb.Dataset} object.
23 | }
24 | \examples{
25 | \dontrun{
26 | library(lightgbm)
27 | data(agaricus.train, package = "lightgbm")
28 | train <- agaricus.train
29 | dtrain <- lgb.Dataset(train$data, label = train$label)
30 |
31 | stopifnot(nrow(dtrain) == nrow(train$data))
32 | stopifnot(ncol(dtrain) == ncol(train$data))
33 | stopifnot(all(dim(dtrain) == dim(train$data)))
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.create.valid.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.create.valid}
4 | \alias{lgb.Dataset.create.valid}
5 | \title{Construct validation data}
6 | \usage{
7 | lgb.Dataset.create.valid(dataset, data, info = list(), ...)
8 | }
9 | \arguments{
10 | \item{dataset}{\code{lgb.Dataset} object, training data}
11 |
12 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
13 |
14 | \item{info}{a list of information of the lgb.Dataset object}
15 |
16 | \item{...}{other information to pass to \code{info}.}
17 | }
18 | \value{
19 | constructed dataset
20 | }
21 | \description{
22 | Construct validation data according to training data
23 | }
24 | \examples{
25 | \dontrun{
26 | library(lightgbm)
27 | data(agaricus.train, package = "lightgbm")
28 | train <- agaricus.train
29 | dtrain <- lgb.Dataset(train$data, label = train$label)
30 | data(agaricus.test, package = "lightgbm")
31 | test <- agaricus.test
32 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) Microsoft Corporation
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/R-package/tests/testthat/test_custom_objective.R:
--------------------------------------------------------------------------------
1 | context('Test models with custom objective')
2 |
3 | data(agaricus.train, package='lightgbm')
4 | data(agaricus.test, package='lightgbm')
5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
7 | watchlist <- list(eval = dtest, train = dtrain)
8 |
9 | logregobj <- function(preds, dtrain) {
10 |   labels <- getinfo(dtrain, "label")
11 |   preds <- 1 / (1 + exp(-preds))
12 |   grad <- preds - labels
13 |   hess <- preds * (1 - preds)
14 |   return(list(grad = grad, hess = hess))
15 | }
16 |
17 | evalerror <- function(preds, dtrain) {
18 |   labels <- getinfo(dtrain, "label")
19 |   err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
20 |   return(list(name = "error", value = err, higher_better=FALSE))
21 | }
22 |
23 | param <- list(num_leaves=8, learning_rate=1,
24 |               objective=logregobj, metric="auc")
25 | num_round <- 10
26 |
27 | test_that("custom objective works", {
28 |   bst <- lgb.train(param, dtrain, num_round, watchlist, eval = evalerror)
29 |   expect_false(is.null(bst$record_evals))
30 | })
31 |
--------------------------------------------------------------------------------
/R-package/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) Microsoft Corporation
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/python-package/lightgbm/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """LightGBM, Light Gradient Boosting Machine.
3 |
4 | Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
5 | """
6 | from __future__ import absolute_import
7 |
8 | from .basic import Booster, Dataset
9 | from .callback import (early_stopping, print_evaluation, record_evaluation,
10 |                        reset_parameter)
11 | from .engine import cv, train
12 | import os
13 |
14 | try:
15 |     from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
16 | except ImportError:
17 |     pass
18 | try:
19 |     from .plotting import plot_importance, plot_metric, plot_tree, create_tree_digraph
20 | except ImportError:
21 |     pass
22 |
23 |
24 | dir_path = os.path.dirname(os.path.realpath(__file__))
25 |
26 | if os.path.isfile(os.path.join(dir_path, 'VERSION.txt')):
27 |     __version__ = open(os.path.join(dir_path, 'VERSION.txt')).read().strip()
28 |
29 | __all__ = ['Dataset', 'Booster',
30 |            'train', 'cv',
31 |            'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
32 |            'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
33 |            'plot_importance', 'plot_metric', 'plot_tree', 'create_tree_digraph']
34 |
--------------------------------------------------------------------------------
/include/LightGBM/prediction_early_stop.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_PREDICTION_EARLY_STOP_H_
2 | #define LIGHTGBM_PREDICTION_EARLY_STOP_H_
3 |
4 | #include <functional>
5 | #include <string>
6 |
7 | #include <LightGBM/export.h>
8 |
9 | namespace LightGBM {
10 |
11 | struct PredictionEarlyStopInstance {
12 |   /// Callback function type for early stopping.
13 |   /// Takes current prediction and number of elements in prediction
14 |   /// @returns true if prediction should stop according to criterion
15 |   using FunctionType = std::function<bool(const double*, int)>;
16 |
17 |   FunctionType callback_function;  // callback function itself
18 |   int round_period;  // call callback_function every `round_period` iterations
19 | };
20 |
21 | struct PredictionEarlyStopConfig {
22 |   int round_period;
23 |   double margin_threshold;
24 | };
25 |
26 | /// Create an early stopping algorithm of type `type`, with given round_period and margin threshold
27 | LIGHTGBM_EXPORT PredictionEarlyStopInstance CreatePredictionEarlyStopInstance(const std::string& type,
28 |                                                                               const PredictionEarlyStopConfig& config);
29 |
30 | }  // namespace LightGBM
31 |
32 | #endif  // LIGHTGBM_PREDICTION_EARLY_STOP_H_
33 |
--------------------------------------------------------------------------------
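A hedged sketch of how a prediction loop might drive the early-stop instance created from this header; the "binary" type string and the config values are assumptions about src/boosting/prediction_early_stop.cpp, not facts stated in the header itself:

```cpp
#include <LightGBM/prediction_early_stop.h>

#include <vector>

// Sketch: configure an early-stop instance and poll it during prediction.
// "binary" is an assumed type string; the thresholds are illustrative.
bool ShouldStop(const std::vector<double>& raw_scores) {
  LightGBM::PredictionEarlyStopConfig config;
  config.round_period = 10;       // evaluate the criterion every 10 iterations
  config.margin_threshold = 1.5;  // stop once the prediction margin exceeds this
  LightGBM::PredictionEarlyStopInstance early_stop =
      LightGBM::CreatePredictionEarlyStopInstance("binary", config);
  // A prediction loop would call this every early_stop.round_period iterations:
  return early_stop.callback_function(raw_scores.data(),
                                      static_cast<int>(raw_scores.size()));
}
```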
/R-package/man/lgb.dump.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.dump}
4 | \alias{lgb.dump}
5 | \title{Dump LightGBM model to JSON}
6 | \usage{
7 | lgb.dump(booster, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{booster}{Object of class \code{lgb.Booster}}
11 |
12 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
13 | }
14 | \value{
15 | JSON format of the model
16 | }
17 | \description{
18 | Dump LightGBM model to JSON
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
29 | params <- list(objective = "regression", metric = "l2")
30 | valids <- list(test = dtest)
31 | model <- lgb.train(params,
32 |                    dtrain,
33 |                    100,
34 |                    valids,
35 |                    min_data = 1,
36 |                    learning_rate = 1,
37 |                    early_stopping_rounds = 10)
38 | json_model <- lgb.dump(model)
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/R-package/man/dimnames.lgb.Dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{dimnames.lgb.Dataset}
4 | \alias{dimnames.lgb.Dataset}
5 | \alias{dimnames<-.lgb.Dataset}
6 | \title{Handling of column names of \code{lgb.Dataset}}
7 | \usage{
8 | \method{dimnames}{lgb.Dataset}(x)
9 |
10 | \method{dimnames}{lgb.Dataset}(x) <- value
11 | }
12 | \arguments{
13 | \item{x}{object of class \code{lgb.Dataset}}
14 |
15 | \item{value}{a list of two elements: the first one is ignored
16 | and the second one is column names}
17 | }
18 | \description{
19 | Only column names are supported for \code{lgb.Dataset}, thus setting of
20 | row names would have no effect and returned row names would be NULL.
21 | }
22 | \details{
23 | Generic \code{dimnames} methods are used by \code{colnames}.
24 | Since row names are irrelevant, it is recommended to use \code{colnames} directly.
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | lgb.Dataset.construct(dtrain)
33 | dimnames(dtrain)
34 | colnames(dtrain)
35 | colnames(dtrain) <- make.names(1:ncol(train$data))
36 | print(dtrain, verbose = TRUE)
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/include/LightGBM/utils/threading.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_UTILS_THREADING_H_
2 | #define LIGHTGBM_UTILS_THREADING_H_
3 |
4 | #include <LightGBM/utils/openmp_wrapper.h>
5 |
6 | #include <functional>
7 | #include <vector>
8 |
9 | namespace LightGBM {
10 |
11 | class Threading {
12 | public:
13 |
14 |   template <typename INDEX_T>
15 |   static inline void For(INDEX_T start, INDEX_T end, const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
16 |     int num_threads = 1;
17 | #pragma omp parallel
18 | #pragma omp master
19 |     {
20 |       num_threads = omp_get_num_threads();
21 |     }
22 |     INDEX_T num_inner = (end - start + num_threads - 1) / num_threads;
23 |     if (num_inner <= 0) { num_inner = 1; }
24 |     OMP_INIT_EX();
25 | #pragma omp parallel for schedule(static,1)
26 |     for (int i = 0; i < num_threads; ++i) {
27 |       OMP_LOOP_EX_BEGIN();
28 |       INDEX_T inner_start = start + num_inner * i;
29 |       INDEX_T inner_end = inner_start + num_inner;
30 |       if (inner_end > end) { inner_end = end; }
31 |       if (inner_start < end) {
32 |         inner_fun(i, inner_start, inner_end);
33 |       }
34 |       OMP_LOOP_EX_END();
35 |     }
36 |     OMP_THROW_EX();
37 |   }
38 | };
39 |
40 | }  // namespace LightGBM
41 |
42 | #endif  // LightGBM_UTILS_THREADING_H_
43 |
--------------------------------------------------------------------------------
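A hedged sketch of `Threading::For` from the header above, splitting an index range across OpenMP threads. The sum-reduction task is illustrative; `omp_get_max_threads()` is assumed to be available through the included openmp_wrapper.h (which provides a stub when OpenMP is off):

```cpp
#include <LightGBM/utils/threading.h>

#include <vector>

// Sketch: sum a vector by giving each thread a contiguous [start, end) slice.
// The lambda receives the thread index and its half-open range.
double ParallelSum(const std::vector<double>& values) {
  std::vector<double> partial(omp_get_max_threads(), 0.0);  // one slot per thread
  LightGBM::Threading::For<size_t>(
      0, values.size(),
      [&values, &partial](int thread_idx, size_t start, size_t end) {
        for (size_t i = start; i < end; ++i) {
          partial[thread_idx] += values[i];
        }
      });
  double total = 0.0;
  for (double p : partial) { total += p; }  // combine per-thread partial sums
  return total;
}
```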
/R-package/man/lgb.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.save}
4 | \alias{lgb.save}
5 | \title{Save LightGBM model}
6 | \usage{
7 | lgb.save(booster, filename, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{booster}{Object of class \code{lgb.Booster}}
11 | 
12 | \item{filename}{saved filename}
13 | 
14 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
15 | }
16 | \value{
17 | lgb.Booster
18 | }
19 | \description{
20 | Save LightGBM model
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 | dtrain,
35 | 100,
36 | valids,
37 | min_data = 1,
38 | learning_rate = 1,
39 | early_stopping_rounds = 10)
40 | lgb.save(model, "model.txt")
41 | }
42 | 
43 | }
44 | 
--------------------------------------------------------------------------------
/docs/Python-API.rst:
--------------------------------------------------------------------------------
1 | Python API
2 | ==========
3 | 
4 | Data Structure API
5 | ------------------
6 | 
7 | .. autoclass:: lightgbm.Dataset
8 | :members:
9 | :show-inheritance:
10 | 
11 | .. autoclass:: lightgbm.Booster
12 | :members:
13 | :show-inheritance:
14 | 
15 | 
16 | Training API
17 | ------------
18 | 
19 | .. autofunction:: lightgbm.train
20 | 
21 | .. autofunction:: lightgbm.cv
22 | 
23 | 
24 | Scikit-learn API
25 | ----------------
26 | 
27 | .. autoclass:: lightgbm.LGBMModel
28 | :members:
29 | :show-inheritance:
30 | 
31 | .. autoclass:: lightgbm.LGBMClassifier
32 | :members:
33 | :show-inheritance:
34 | 
35 | .. autoclass:: lightgbm.LGBMRegressor
36 | :members:
37 | :show-inheritance:
38 | 
39 | .. autoclass:: lightgbm.LGBMRanker
40 | :members:
41 | :show-inheritance:
42 | 
43 | 
44 | Callbacks
45 | ---------
46 | 
47 | .. autofunction:: lightgbm.early_stopping
48 | 
49 | .. autofunction:: lightgbm.print_evaluation
50 | 
51 | .. autofunction:: lightgbm.record_evaluation
52 | 
53 | .. autofunction:: lightgbm.reset_parameter
54 | 
55 | 
56 | Plotting
57 | --------
58 | 
59 | .. autofunction:: lightgbm.plot_importance
60 | 
61 | .. autofunction:: lightgbm.plot_metric
62 | 
63 | .. autofunction:: lightgbm.plot_tree
64 | 
65 | .. autofunction:: lightgbm.create_tree_digraph
66 | 
--------------------------------------------------------------------------------
/docker/gpu/README.md:
--------------------------------------------------------------------------------
1 | # Dockerfile for LightGBM GPU Version with Python
2 | 
3 | A Dockerfile for LightGBM utilizing nvidia-docker. The image is based on nvidia/cuda:8.0. LightGBM can be used in GPU and CPU modes and via Python (2.7 & 3.5).
4 | 
5 | ## Contents
6 | 
7 | - LightGBM (CPU + GPU)
8 | - Python 2.7 (Conda) + scikit-learn, notebooks, pandas, matplotlib
9 | - Python 3.5 (Conda) + scikit-learn, notebooks, pandas, matplotlib
10 | 
11 | Running the container starts a Jupyter notebook at localhost:8888.
12 | 
13 | Jupyter password: keras
14 | 
15 | ## Requirements
16 | 
17 | Requires Docker and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) on the host machine.
18 | 
19 | ## Quickstart
20 | 
21 | ### Build Docker Image
22 | 
23 | ```sh
24 | mkdir lightgbm-docker
25 | cd lightgbm-docker
26 | wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/gpu/dockerfile.gpu
27 | docker build -f dockerfile.gpu -t lightgbm-gpu . 
28 | ```
29 | 
30 | ### Run Image
31 | 
32 | ```sh
33 | nvidia-docker run --rm -d --name lightgbm-gpu -p 8888:8888 -v /home:/home lightgbm-gpu
34 | ```
35 | 
36 | ### Attach with Command Line Access (if required)
37 | 
38 | ```sh
39 | docker exec -it lightgbm-gpu bash
40 | ```
41 | 
42 | ### Jupyter Notebook
43 | 
44 | ```sh
45 | localhost:8888
46 | ```
47 | 
--------------------------------------------------------------------------------
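Before relying on the image built in the README above, it can help to confirm that containers can see the GPU at all; if this smoke test fails, LightGBM's GPU mode will fail too. This check is illustrative only (the CUDA image tag is an assumption; any CUDA image that ships `nvidia-smi` works):

```sh
nvidia-docker run --rm nvidia/cuda:8.0-cudnn5-devel nvidia-smi
```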
/R-package/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | S3method("dimnames<-",lgb.Dataset)
4 | S3method(dim,lgb.Dataset)
5 | S3method(dimnames,lgb.Dataset)
6 | S3method(getinfo,lgb.Dataset)
7 | S3method(predict,lgb.Booster)
8 | S3method(setinfo,lgb.Dataset)
9 | S3method(slice,lgb.Dataset)
10 | export(getinfo)
11 | export(lgb.Dataset)
12 | export(lgb.Dataset.construct)
13 | export(lgb.Dataset.create.valid)
14 | export(lgb.Dataset.save)
15 | export(lgb.Dataset.set.categorical)
16 | export(lgb.Dataset.set.reference)
17 | export(lgb.cv)
18 | export(lgb.dump)
19 | export(lgb.get.eval.result)
20 | export(lgb.importance)
21 | export(lgb.interprete)
22 | export(lgb.load)
23 | export(lgb.model.dt.tree)
24 | export(lgb.plot.importance)
25 | export(lgb.plot.interpretation)
26 | export(lgb.prepare)
27 | export(lgb.prepare2)
28 | export(lgb.prepare_rules)
29 | export(lgb.prepare_rules2)
30 | export(lgb.save)
31 | export(lgb.train)
32 | export(lgb.unloader)
33 | export(lightgbm)
34 | export(readRDS.lgb.Booster)
35 | export(saveRDS.lgb.Booster)
36 | export(setinfo)
37 | export(slice)
38 | import(methods)
39 | importFrom(R6,R6Class)
40 | importFrom(data.table,":=")
41 | importFrom(data.table,set)
42 | importFrom(graphics,barplot)
43 | importFrom(graphics,par)
44 | importFrom(magrittr,"%>%")
45 | importFrom(magrittr,"%T>%")
46 | useDynLib(lib_lightgbm)
47 | 
--------------------------------------------------------------------------------
/R-package/man/readRDS.lgb.Booster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readRDS.lgb.Booster.R
3 | \name{readRDS.lgb.Booster}
4 | \alias{readRDS.lgb.Booster}
5 | \title{readRDS for lgb.Booster models}
6 | \usage{
7 | readRDS.lgb.Booster(file = "", refhook = NULL)
8 | }
9 | \arguments{
10 | \item{file}{a connection or the name of the file where the R object is saved to or read from.}
11 | 
12 | \item{refhook}{a hook function for handling reference objects.}
13 | }
14 | \value{
15 | lgb.Booster.
16 | }
17 | \description{
18 | Attempts to load a model using RDS.
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
29 | params <- list(objective = "regression", metric = "l2")
30 | valids <- list(test = dtest)
31 | model <- lgb.train(params,
32 | dtrain,
33 | 100,
34 | valids,
35 | min_data = 1,
36 | learning_rate = 1,
37 | early_stopping_rounds = 10)
38 | saveRDS.lgb.Booster(model, "model.rds")
39 | new_model <- readRDS.lgb.Booster("model.rds")
40 | }
41 | 
42 | }
43 | 
--------------------------------------------------------------------------------
/R-package/man/getinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{getinfo}
4 | \alias{getinfo}
5 | \alias{getinfo.lgb.Dataset}
6 | \title{Get information of an lgb.Dataset object}
7 | \usage{
8 | getinfo(dataset, ...)
9 | 
10 | \method{getinfo}{lgb.Dataset}(dataset, name, ...)
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class \code{lgb.Dataset}}
14 | 
15 | \item{...}{other parameters}
16 | 
17 | \item{name}{the name of the information field to get (see details)}
18 | }
19 | \value{
20 | info data
21 | }
22 | \description{
23 | Get information of an lgb.Dataset object
24 | }
25 | \details{
26 | The \code{name} field can be one of the following:
27 | 
28 | \itemize{
29 | \item \code{label}: the label that LightGBM learns from ;
30 | \item \code{weight}: per-observation weights used to rescale the loss ;
31 | \item \code{group}: group sizes, for ranking tasks ;
32 | \item \code{init_score}: the initial score, i.e. the base prediction LightGBM will boost from ;
33 | }
34 | }
35 | \examples{
36 | \dontrun{
37 | library(lightgbm)
38 | data(agaricus.train, package = "lightgbm")
39 | train <- agaricus.train
40 | dtrain <- lgb.Dataset(train$data, label = train$label)
41 | lgb.Dataset.construct(dtrain)
42 | 
43 | labels <- lightgbm::getinfo(dtrain, "label")
44 | lightgbm::setinfo(dtrain, "label", 1 - labels)
45 | 
46 | labels2 <- lightgbm::getinfo(dtrain, "label")
47 | stopifnot(all(labels2 == 1 - labels))
48 | }
49 | 
50 | }
51 | 
--------------------------------------------------------------------------------
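The `group` field mentioned above is what the ranking objectives read: one integer per query, in row order, giving how many consecutive rows belong to that query. A short illustration with hypothetical inputs (`rank_features` and `rank_labels` are assumed to exist and to contain 30 rows):

```r
# Three queries of 10, 6 and 14 rows; the sizes must sum to nrow(rank_features)
dtrain_rank <- lgb.Dataset(rank_features, label = rank_labels)
setinfo(dtrain_rank, "group", c(10L, 6L, 14L))
```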
/R-package/man/setinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{setinfo}
4 | \alias{setinfo}
5 | \alias{setinfo.lgb.Dataset}
6 | \title{Set information of an lgb.Dataset object}
7 | \usage{
8 | setinfo(dataset, ...)
9 | 
10 | \method{setinfo}{lgb.Dataset}(dataset, name, info, ...)
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class "lgb.Dataset"}
14 | 
15 | \item{...}{other parameters}
16 | 
17 | \item{name}{the name of the field to set}
18 | 
19 | \item{info}{the information to set in the field}
20 | }
21 | \value{
22 | passed object
23 | }
24 | \description{
25 | Set information of an lgb.Dataset object
26 | }
27 | \details{
28 | The \code{name} field can be one of the following:
29 | 
30 | \itemize{
31 | \item \code{label}: the label that LightGBM learns from ;
32 | \item \code{weight}: per-observation weights used to rescale the loss ;
33 | \item \code{init_score}: the initial score, i.e. the base prediction LightGBM will boost from ;
34 | \item \code{group}: group sizes, for ranking tasks.
35 | }
36 | }
37 | \examples{
38 | \dontrun{
39 | library(lightgbm)
40 | data(agaricus.train, package = "lightgbm")
41 | train <- agaricus.train
42 | dtrain <- lgb.Dataset(train$data, label = train$label)
43 | lgb.Dataset.construct(dtrain)
44 | 
45 | labels <- lightgbm::getinfo(dtrain, "label")
46 | lightgbm::setinfo(dtrain, "label", 1 - labels)
47 | 
48 | labels2 <- lightgbm::getinfo(dtrain, "label")
49 | stopifnot(all.equal(labels2, 1 - labels))
50 | }
51 | 
52 | }
53 | 
--------------------------------------------------------------------------------
/R-package/demo/boost_from_prediction.R:
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 | require(methods)
3 | 
4 | # Load in the agaricus dataset
5 | data(agaricus.train, package = "lightgbm")
6 | data(agaricus.test, package = "lightgbm")
7 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
8 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
9 | 
10 | valids <- list(eval = dtest, train = dtrain)
11 | #--------------------Advanced features ---------------------------
12 | # advanced: start from an initial base prediction
13 | print("Start running example to start from an initial prediction")
14 | 
15 | # Train lightgbm for 1 round
16 | param <- list(num_leaves = 4,
17 | learning_rate = 1,
18 | nthread = 2,
19 | objective = "binary")
20 | bst <- lgb.train(param, dtrain, 1, valids = valids)
21 | 
22 | # Note: we need the margin value, not the transformed prediction, when setting init_score
23 | ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
24 | ptest <- predict(bst, agaricus.test$data, rawscore = TRUE)
25 | 
26 | # set the init_score property of dtrain and dtest
27 | # base margin is the base prediction we will boost from
28 | setinfo(dtrain, "init_score", ptrain)
29 | setinfo(dtest, "init_score", ptest)
30 | 
31 | print("This is the result of boosting from an initial prediction")
32 | bst <- lgb.train(params = param,
33 | data = dtrain,
34 | nrounds = 5,
35 | valids = valids)
36 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset}
4 | \alias{lgb.Dataset}
5 | \title{Construct lgb.Dataset object}
6 | \usage{
7 | lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL,
8 | categorical_feature = NULL, free_raw_data = TRUE, info = list(), ...)
9 | }
10 | \arguments{
11 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
12 | 
13 | \item{params}{a list of parameters}
14 | 
15 | \item{reference}{reference dataset}
16 | 
17 | \item{colnames}{names of columns}
18 | 
19 | \item{categorical_feature}{categorical features}
20 | 
21 | \item{free_raw_data}{TRUE to free the raw data after construction}
22 | 
23 | \item{info}{a list of information of the lgb.Dataset object}
24 | 
25 | \item{...}{other information to pass to \code{info} or parameters to pass to \code{params}}
26 | }
27 | \value{
28 | constructed dataset
29 | }
30 | \description{
31 | Construct lgb.Dataset object from dense matrix, sparse matrix
32 | or local file (that was created previously by saving an \code{lgb.Dataset}). 
33 | }
34 | \examples{
35 | \dontrun{
36 | library(lightgbm)
37 | data(agaricus.train, package = "lightgbm")
38 | train <- agaricus.train
39 | dtrain <- lgb.Dataset(train$data, label = train$label)
40 | lgb.Dataset.save(dtrain, "lgb.Dataset.data")
41 | dtrain <- lgb.Dataset("lgb.Dataset.data")
42 | lgb.Dataset.construct(dtrain)
43 | }
44 | 
45 | }
46 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.get.eval.result.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.get.eval.result}
4 | \alias{lgb.get.eval.result}
5 | \title{Get record evaluation result from booster}
6 | \usage{
7 | lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
8 | is_err = FALSE)
9 | }
10 | \arguments{
11 | \item{booster}{Object of class \code{lgb.Booster}}
12 | 
13 | \item{data_name}{name of dataset}
14 | 
15 | \item{eval_name}{name of evaluation}
16 | 
17 | \item{iters}{iterations; NULL will return all}
18 | 
19 | \item{is_err}{TRUE will return evaluation error instead}
20 | }
21 | \value{
22 | vector of evaluation result
23 | }
24 | \description{
25 | Get record evaluation result from booster
26 | }
27 | \examples{
28 | \dontrun{
29 | library(lightgbm)
30 | data(agaricus.train, package = "lightgbm")
31 | train <- agaricus.train
32 | dtrain <- lgb.Dataset(train$data, label = train$label)
33 | data(agaricus.test, package = "lightgbm")
34 | test <- agaricus.test
35 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
36 | params <- list(objective = "regression", metric = "l2")
37 | valids <- list(test = dtest)
38 | model <- lgb.train(params,
39 | dtrain,
40 | 100,
41 | valids,
42 | min_data = 1,
43 | learning_rate = 1,
44 | early_stopping_rounds = 10)
45 | lgb.get.eval.result(model, "test", "l2")
46 | }
47 | 
48 | }
49 | 
--------------------------------------------------------------------------------
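Following on from the example above, the returned vector makes it easy to see where early stopping landed; a short sketch (the plot styling is incidental):

```r
errors <- lgb.get.eval.result(model, "test", "l2")
plot(seq_along(errors), errors, type = "l", xlab = "iteration", ylab = "l2 (test)")
abline(v = model$best_iter, lty = 2)  # iteration chosen by early stopping
```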
/R-package/man/lgb.importance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.importance.R
3 | \name{lgb.importance}
4 | \alias{lgb.importance}
5 | \title{Compute feature importance in a model}
6 | \usage{
7 | lgb.importance(model, percentage = TRUE)
8 | }
9 | \arguments{
10 | \item{model}{object of class \code{lgb.Booster}.}
11 | 
12 | \item{percentage}{whether to show importance in relative percentage.}
13 | }
14 | \value{
15 | For a tree model, a \code{data.table} with the following columns:
16 | \itemize{
17 | \item \code{Feature} Feature names in the model.
18 | \item \code{Gain} The total gain of this feature's splits.
19 | \item \code{Cover} The number of observations related to this feature.
20 | \item \code{Frequency} The number of times the feature is used in a split across all trees.
21 | }
22 | }
23 | \description{
24 | Creates a \code{data.table} of feature importances in a model.
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | 
33 | params = list(objective = "binary",
34 | learning_rate = 0.01, num_leaves = 63, max_depth = -1,
35 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
36 | model <- lgb.train(params, dtrain, 20)
37 | 
38 | tree_imp1 <- lgb.importance(model, percentage = TRUE)
39 | tree_imp2 <- lgb.importance(model, percentage = FALSE)
40 | }
41 | 
42 | }
43 | 
--------------------------------------------------------------------------------
/src/treelearner/tree_learner.cpp:
--------------------------------------------------------------------------------
1 | #include <LightGBM/tree_learner.h>
2 | 
3 | #include "serial_tree_learner.h"
4 | #include "gpu_tree_learner.h"
5 | #include "parallel_tree_learner.h"
6 | 
7 | namespace LightGBM {
8 | 
9 | TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, const std::string& device_type, const Config* config) {
10 | if (device_type == std::string("cpu")) {
11 | if (learner_type == std::string("serial")) {
12 | return new SerialTreeLearner(config);
13 | } else if (learner_type == std::string("feature")) {
14 | return new FeatureParallelTreeLearner<SerialTreeLearner>(config);
15 | } else if (learner_type == std::string("data")) {
16 | return new DataParallelTreeLearner<SerialTreeLearner>(config);
17 | } else if (learner_type == std::string("voting")) {
18 | return new VotingParallelTreeLearner<SerialTreeLearner>(config);
19 | }
20 | }
21 | else if (device_type == std::string("gpu")) {
22 | if (learner_type == std::string("serial")) {
23 | return new GPUTreeLearner(config);
24 | } else if (learner_type == std::string("feature")) {
25 | return new FeatureParallelTreeLearner<GPUTreeLearner>(config);
26 | } else if (learner_type == std::string("data")) {
27 | return new DataParallelTreeLearner<GPUTreeLearner>(config);
28 | } else if (learner_type == std::string("voting")) {
29 | return new VotingParallelTreeLearner<GPUTreeLearner>(config);
30 | }
31 | }
32 | return nullptr;
33 | }
34 | 
35 | } // namespace LightGBM
36 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.load.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.load}
4 | \alias{lgb.load}
5 | \title{Load LightGBM model}
6 | \usage{
7 | lgb.load(filename = NULL, model_str = NULL)
8 | }
9 | \arguments{
10 | \item{filename}{path of model file}
11 | 
12 | \item{model_str}{a string containing the model}
13 | }
14 | \value{
15 | lgb.Booster
16 | }
17 | \description{
18 | Load a LightGBM model from a saved model file or string.
19 | \code{lgb.load} takes in either a file path or a model string.
20 | If both are provided, it will default to loading from file.
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 | dtrain,
35 | 100,
36 | valids,
37 | min_data = 1,
38 | learning_rate = 1,
39 | early_stopping_rounds = 10)
40 | lgb.save(model, "model.txt")
41 | load_booster <- lgb.load(filename = "model.txt")
42 | model_string <- model$save_model_to_string(NULL) # saves best iteration
43 | load_booster_from_str <- lgb.load(model_str = model_string)
44 | }
45 | 
46 | }
47 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. LightGBM documentation master file, created by
2 | sphinx-quickstart on Thu May 4 14:30:58 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 | 
6 | Welcome to LightGBM's documentation!
7 | ====================================
8 | 
9 | **LightGBM** is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
10 | 
11 | - Faster training speed and higher efficiency
12 | - Lower memory usage
13 | - Better accuracy
14 | - Parallel and GPU learning supported
15 | - Capable of handling large-scale data
16 | 
17 | For more details, please refer to `Features <./Features.rst>`__.
18 | 
19 | .. toctree::
20 | :maxdepth: 1
21 | :caption: Contents:
22 | 
23 | Installation Guide <Installation-Guide>
24 | Quick Start <Quick-Start>
25 | Python Quick Start <Python-Intro>
26 | Features <Features>
27 | Experiments <Experiments>
28 | Parameters <Parameters>
29 | Parameters Tuning <Parameters-Tuning>
30 | Python API <Python-API>
31 | Parallel Learning Guide <Parallel-Learning-Guide>
32 | GPU Tutorial <GPU-Tutorial>
33 | Advanced Topics <Advanced-Topics>
34 | FAQ <FAQ>
35 | Development Guide <Development-Guide>
36 | 
37 | .. toctree::
38 | :hidden:
39 | 
40 | GPU-Performance
41 | GPU-Targets
42 | GPU-Windows
43 | gcc-Tips
44 | README
45 | 
46 | Indices and Tables
47 | ==================
48 | 
49 | * :ref:`genindex`
50 | 
--------------------------------------------------------------------------------
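The demo that follows trains 1000 boosters in a loop and keeps memory flat with `reset_data = TRUE` plus explicit `gc()` calls. As a rough companion sketch, the trend R itself sees can be watched like this (base R only; note that the issue quoted in the demo measures the OS-level number, which is the one that matters):

```r
mem_used_mb <- function() sum(gc(verbose = FALSE)[, 2])  # total Mb over Ncells/Vcells
before <- mem_used_mb()
# ... train one model as in the demo below ...
after <- mem_used_mb()
cat("delta (Mb):", after - before, "\n")
```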
/R-package/demo/efficient_many_training.R:
--------------------------------------------------------------------------------
1 | # Efficient training means training without giving up too much RAM
2 | # When training many models (like 100+), RAM is eaten very quickly
3 | # It is therefore essential to have a strategy for dealing with this issue
4 | 
5 | # More results can be found here: https://github.com/Microsoft/LightGBM/issues/879#issuecomment-326656580
6 | # Quote: "@Laurae2 Thanks for nice easily reproducible example (unlike mine).
7 | # With reset=FALSE you get after 500 iterations (not 1000): OS reports 27GB usage, while R gc() reports 1.5GB.
8 | # Just doing reset=TRUE will already improve things: OS reports 4.6GB.
9 | # Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip."
10 | 
11 | # Load library
12 | library(lightgbm)
13 | 
14 | # Generate synthetic data of size 1M x 100
15 | set.seed(11111)
16 | x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100)
17 | y_data <- rnorm(n = 1000000, mean = 0, sd = 5)
18 | 
19 | # Create lgb.Dataset for training
20 | data <- lgb.Dataset(x_data, label = y_data)
21 | data$construct()
22 | 
23 | # Loop through a training of 1000 models, watching RAM in your task manager
24 | # It must remain roughly constant (or increase only very slightly)
25 | gbm <- list()
26 | 
27 | for (i in 1:1000) {
28 | print(i)
29 | gbm[[i]] <- lgb.train(params = list(objective = "regression"),
30 | data = data,
31 | 1,
32 | reset_data = TRUE)
33 | gc(verbose = FALSE)
34 | }
35 | 
--------------------------------------------------------------------------------
/windows/LightGBM.sln:
--------------------------------------------------------------------------------
1 | 
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25420.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug_mpi|x64 = Debug_mpi|x64
11 | Debug|x64 = Debug|x64
12 | DLL|x64 = DLL|x64
13 | Release_mpi|x64 = Release_mpi|x64
14 | Release|x64 = Release|x64
15 | EndGlobalSection
16 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
17 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64
18 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64
19 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64
20 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64
21 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.ActiveCfg = DLL|x64
22 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.Build.0 = DLL|x64
23 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64
24 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64
25 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64
26 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64
27 | EndGlobalSection
28 | GlobalSection(SolutionProperties) = preSolution
29 | HideSolutionNode = FALSE
30 | EndGlobalSection
31 | EndGlobal
32 | 
--------------------------------------------------------------------------------
/.travis/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [[ $TRAVIS_OS_NAME == "osx" ]]; then
4 | sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3" # fix "fatal error: _stdio.h: No such file or directory"
5 | rm '/usr/local/include/c++'
6 | # brew cask uninstall oclint # reserve variant to deal with conflict link
7 | if [[ $TASK == "mpi" ]]; then
8 | brew install open-mpi
9 | else
10 | brew install gcc
11 | fi
12 | # brew link --overwrite gcc # previous variant to deal with conflict link
13 | wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda${PYTHON_VERSION:0:1}-latest-MacOSX-x86_64.sh
14 | else
15 | if [[ $TASK == "mpi" ]]; then
16 | sudo apt-get install -y libopenmpi-dev openmpi-bin
17 | fi
18 | if [[ $TASK == "gpu" ]]; then
19 | sudo apt-get install -y ocl-icd-opencl-dev
20 | fi
21 | wget -O conda.sh 
https://repo.continuum.io/miniconda/Miniconda${PYTHON_VERSION:0:1}-latest-Linux-x86_64.sh 22 | fi 23 | 24 | sh conda.sh -b -p $HOME/miniconda 25 | conda config --set always_yes yes --set changeps1 no 26 | conda update -q conda 27 | 28 | if [[ $TASK == "gpu" ]] && [[ $TRAVIS_OS_NAME == "linux" ]]; then 29 | wget https://github.com/Microsoft/LightGBM/releases/download/v2.0.12/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 30 | tar -xjf AMD-APP-SDK*.tar.bz2 31 | mkdir -p $OPENCL_VENDOR_PATH 32 | sh AMD-APP-SDK*.sh --tar -xf -C $AMDAPPSDK 33 | mv $AMDAPPSDK/lib/x86_64/sdk/* $AMDAPPSDK/lib/x86_64/ 34 | echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd 35 | fi 36 | -------------------------------------------------------------------------------- /examples/python-guide/simple_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | from sklearn.metrics import mean_squared_error 6 | 7 | 8 | # load or create your dataset 9 | print('Load data...') 10 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 11 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 12 | 13 | y_train = df_train[0].values 14 | y_test = df_test[0].values 15 | X_train = df_train.drop(0, axis=1).values 16 | X_test = df_test.drop(0, axis=1).values 17 | 18 | # create dataset for lightgbm 19 | lgb_train = lgb.Dataset(X_train, y_train) 20 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 21 | 22 | # specify your configurations as a dict 23 | params = { 24 | 'task': 'train', 25 | 'boosting_type': 'gbdt', 26 | 'objective': 'regression', 27 | 'metric': {'l2', 'auc'}, 28 | 'num_leaves': 31, 29 | 'learning_rate': 0.05, 30 | 'feature_fraction': 0.9, 31 | 'bagging_fraction': 0.8, 32 | 'bagging_freq': 5, 33 | 'verbose': 0 34 | } 35 | 36 | print('Start training...') 37 | # train 38 | gbm = lgb.train(params, 39 | lgb_train, 40 | num_boost_round=20, 41 | valid_sets=lgb_eval, 42 | early_stopping_rounds=5) 43 | 44 | print('Save model...') 45 | # save model to file 46 | gbm.save_model('model.txt') 47 | 48 | print('Start predicting...') 49 | # predict 50 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 51 | # eval 52 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 53 | -------------------------------------------------------------------------------- /python-package/lightgbm/libpath.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Find the path to lightgbm dynamic library files.""" 3 | import os 4 | 5 | from platform import system 6 | 7 | 8 | def find_lib_path(): 9 | """Find the path to LightGBM library files. 
10 | Returns 11 | ------- 12 | lib_path: list(string) 13 | List of all found library path to LightGBM 14 | """ 15 | if os.environ.get('LIGHTGBM_BUILD_DOC', False): 16 | # we don't need lib_lightgbm while building docs 17 | return [] 18 | 19 | curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 20 | dll_path = [curr_path, os.path.join(curr_path, '../../'), 21 | os.path.join(curr_path, 'compile'), 22 | os.path.join(curr_path, '../compile'), 23 | os.path.join(curr_path, '../../lib/')] 24 | if system() in ('Windows', 'Microsoft'): 25 | dll_path.append(os.path.join(curr_path, '../compile/Release/')) 26 | dll_path.append(os.path.join(curr_path, '../compile/windows/x64/DLL/')) 27 | dll_path.append(os.path.join(curr_path, '../../Release/')) 28 | dll_path.append(os.path.join(curr_path, '../../windows/x64/DLL/')) 29 | dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path] 30 | else: 31 | dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path] 32 | lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] 33 | if not lib_path: 34 | dll_path = [os.path.realpath(p) for p in dll_path] 35 | raise Exception('Cannot find lightgbm library in following paths: ' + '\n'.join(dll_path)) 36 | return lib_path 37 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_parameters.R: -------------------------------------------------------------------------------- 1 | data(agaricus.train, package='lightgbm') 2 | data(agaricus.test, package='lightgbm') 3 | train <- agaricus.train 4 | test <- agaricus.test 5 | 6 | test_that("Feature penalties work properly", { 7 | # Fit a series of models with varying penalty on most important variable 8 | var_name <- "odor=none" 9 | var_index <- which(train$data@Dimnames[[2]] == var_name) 10 | 11 | bst <- lapply(seq(1, 0, by = -0.1), function(x) { 12 | feature_penalties <- rep(1, ncol(train$data)) 13 | feature_penalties[var_index] <- x 14 | lightgbm( 15 | data = train$data, 16 | label = train$label, 17 | num_leaves = 5, 18 | learning_rate = 0.05, 19 | nrounds = 20, 20 | objective = "binary", 21 | feature_penalty = paste0(feature_penalties, collapse = ","), 22 | metric="binary_error", 23 | verbose = -1 24 | ) 25 | }) 26 | 27 | var_gain <- lapply(bst, function(x) lgb.importance(x)[Feature == var_name, Gain]) 28 | var_cover <- lapply(bst, function(x) lgb.importance(x)[Feature == var_name, Cover]) 29 | var_freq <- lapply(bst, function(x) lgb.importance(x)[Feature == var_name, Frequency]) 30 | 31 | # Ensure that feature gain, cover, and frequency decreases with stronger penalties 32 | expect_true(all(diff(unlist(var_gain)) <= 0)) 33 | expect_true(all(diff(unlist(var_cover)) <= 0)) 34 | expect_true(all(diff(unlist(var_freq)) <= 0)) 35 | 36 | expect_lt(min(diff(unlist(var_gain))), 0) 37 | expect_lt(min(diff(unlist(var_cover))), 0) 38 | expect_lt(min(diff(unlist(var_freq))), 0) 39 | 40 | # Ensure that feature is not used when feature_penalty = 0 41 | expect_length(var_gain[[length(var_gain)]], 0) 42 | }) -------------------------------------------------------------------------------- /examples/multiclass_classification/train.conf: -------------------------------------------------------------------------------- 1 | # task type, support train and predict 2 | task = train 3 | 4 | # boosting type, support gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, support following application 8 | # regression , regression task 9 | # binary , 
binary classification task
10 | # lambdarank , lambdarank task
11 | # multiclass
12 | # alias: application, app
13 | objective = multiclass
14 | 
15 | # eval metrics, support multi metric, delimited by ',' , support following metrics
16 | # l1
17 | # l2 , default metric for regression
18 | # ndcg , default metric for lambdarank
19 | # auc
20 | # binary_logloss , default metric for binary
21 | # binary_error
22 | # multi_logloss
23 | # multi_error
24 | metric = multi_logloss
25 | 
26 | # number of classes, for multiclass classification
27 | num_class = 5
28 | 
29 | # frequency for metric output
30 | metric_freq = 1
31 | 
32 | # true if you need to output metric for training data, alias: training_metric, train_metric
33 | is_training_metric = true
34 | 
35 | # number of bins for feature bucket, 255 is a recommended setting; it can save memory and also has good accuracy
36 | max_bin = 255
37 | 
38 | # training data
39 | # if a weight file exists, it should be named "multiclass.train.weight"
40 | # alias: train_data, train
41 | data = multiclass.train
42 | 
43 | # valid data
44 | valid_data = multiclass.test
45 | 
46 | # rounds for early stopping
47 | early_stopping = 10
48 | 
49 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
50 | num_trees = 100
51 | 
52 | # shrinkage rate, alias: shrinkage_rate
53 | learning_rate = 0.05
54 | 
55 | # number of leaves for one tree, alias: num_leaf
56 | num_leaves = 31
57 | 
--------------------------------------------------------------------------------
/examples/lambdarank/rank.train.query:
--------------------------------------------------------------------------------
1 | 1
2 | 13
3 | 5
4 | 8
5 | 19
6 | 12
7 | 18
8 | 5
9 | 14
10 | 13
11 | 8
12 | 9
13 | 16
14 | 11
15 | 21
16 | 14
17 | 21
18 | 9
19 | 14
20 | 11
21 | 20
22 | 18
23 | 13
24 | 20
25 | 22
26 | 22
27 | 13
28 | 17
29 | 10
30 | 13
31 | 12
32 | 13
33 | 13
34 | 23
35 | 18
36 | 13
37 | 20
38 | 12
39 | 22
40 | 14
41 | 13
42 | 23
43 | 13
44 | 14
45 | 14
46 | 5
47 | 13
48 | 15
49 | 14
50 | 14
51 | 16
52 | 16
53 | 15
54 | 21
55 | 22
56 | 10
57 | 22
58 | 18
59 | 25
60 | 16
61 | 12
62 | 12
63 | 15
64 | 15
65 | 25
66 | 13
67 | 9
68 | 12
69 | 8
70 | 16
71 | 25
72 | 19
73 | 24
74 | 12
75 | 16
76 | 10
77 | 16
78 | 9
79 | 17
80 | 15
81 | 7
82 | 9
83 | 15
84 | 14
85 | 16
86 | 17
87 | 8
88 | 17
89 | 12
90 | 18
91 | 23
92 | 10
93 | 12
94 | 12
95 | 4
96 | 14
97 | 12
98 | 15
99 | 27
100 | 16
101 | 20
102 | 13
103 | 19
104 | 13
105 | 17
106 | 17
107 | 16
108 | 12
109 | 15
110 | 14
111 | 14
112 | 19
113 | 12
114 | 23
115 | 18
116 | 16
117 | 9
118 | 23
119 | 11
120 | 15
121 | 8
122 | 10
123 | 10
124 | 16
125 | 11
126 | 15
127 | 22
128 | 16
129 | 17
130 | 23
131 | 16
132 | 22
133 | 17
134 | 14
135 | 12
136 | 14
137 | 20
138 | 15
139 | 17
140 | 15
141 | 15
142 | 22
143 | 9
144 | 21
145 | 9
146 | 17
147 | 16
148 | 15
149 | 13
150 | 13
151 | 15
152 | 14
153 | 18
154 | 21
155 | 14
156 | 17
157 | 15
158 | 14
159 | 16
160 | 12
161 | 17
162 | 19
163 | 16
164 | 11
165 | 18
166 | 11
167 | 13
168 | 14
169 | 9
170 | 16
171 | 15
172 | 16
173 | 25
174 | 9
175 | 13
176 | 22
177 | 16
178 | 18
179 | 20
180 | 14
181 | 11
182 | 9
183 | 16
184 | 19
185 | 19
186 | 11
187 | 11
188 | 13
189 | 14
190 | 14
191 | 13
192 | 16
193 | 6
194 | 21
195 | 16
196 | 12
197 | 16
198 | 11
199 | 24
200 | 12
201 | 10
202 | 
--------------------------------------------------------------------------------
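Each line of the query file above is the size of one query group: the first row of rank.train forms query 1, the next 13 rows form query 2, and so on. A quick consistency check, illustrative and meant to be run next to the data files:

```python
with open('rank.train.query') as f:
    group_sizes = [int(line) for line in f if line.strip()]
with open('rank.train') as f:
    n_rows = sum(1 for _ in f)
assert sum(group_sizes) == n_rows  # the group sizes must cover every training row
```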
/R-package/R/readRDS.lgb.Booster.R:
--------------------------------------------------------------------------------
1 | #' readRDS for lgb.Booster models
2 | #'
3 | #' Attempts to load a model using RDS.
4 | #'
5 | #' @param file a connection or the name of the file where the R object is saved to or read from.
6 | #' @param refhook a hook function for handling reference objects.
7 | #'
8 | #' @return lgb.Booster.
9 | #'
10 | #' @examples
11 | #' \dontrun{
12 | #' library(lightgbm)
13 | #' data(agaricus.train, package = "lightgbm")
14 | #' train <- agaricus.train
15 | #' dtrain <- lgb.Dataset(train$data, label = train$label)
16 | #' data(agaricus.test, package = "lightgbm")
17 | #' test <- agaricus.test
18 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
19 | #' params <- list(objective = "regression", metric = "l2")
20 | #' valids <- list(test = dtest)
21 | #' model <- lgb.train(params,
22 | #' dtrain,
23 | #' 100,
24 | #' valids,
25 | #' min_data = 1,
26 | #' learning_rate = 1,
27 | #' early_stopping_rounds = 10)
28 | #' saveRDS.lgb.Booster(model, "model.rds")
29 | #' new_model <- readRDS.lgb.Booster("model.rds")
30 | #' }
31 | #'
32 | #' @export
33 | readRDS.lgb.Booster <- function(file = "", refhook = NULL) {
34 | 
35 | # Read RDS file
36 | object <- readRDS(file = file, refhook = refhook)
37 | 
38 | # Check if object has the model stored
39 | if (!is.na(object$raw)) {
40 | 
41 | # Create temporary model for the model loading
42 | object2 <- lgb.load(model_str = object$raw)
43 | 
44 | # Restore best iteration and recorded evaluations
45 | object2$best_iter <- object$best_iter
46 | object2$record_evals <- object$record_evals
47 | 
48 | # Return newly loaded object
49 | return(object2)
50 | 
51 | } else {
52 | 
53 | # Return RDS loaded object
54 | return(object)
55 | 
56 | }
57 | 
58 | }
59 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.plot.importance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.plot.importance.R
3 | \name{lgb.plot.importance}
4 | \alias{lgb.plot.importance}
5 | \title{Plot feature importance as a bar graph}
6 | \usage{
7 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain",
8 | left_margin = 10, cex = NULL)
9 | }
10 | \arguments{
11 | \item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.}
12 | 
13 | \item{top_n}{maximal number of top features to include in the plot.}
14 | 
15 | \item{measure}{the name of the importance measure to plot, can be "Gain", "Cover" or "Frequency".}
16 | 
17 | \item{left_margin}{(base R barplot) allows adjusting the left margin size to fit feature names.}
18 | 
19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.}
20 | }
21 | \value{
22 | The \code{lgb.plot.importance} function creates a \code{barplot}
23 | and silently returns a processed data.table with \code{top_n} features sorted by defined importance.
24 | }
25 | \description{
26 | Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.
27 | }
28 | \details{
29 | The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature.
30 | Features are shown ranked in a decreasing importance order. 
31 | }
32 | \examples{
33 | \dontrun{
34 | data(agaricus.train, package = "lightgbm")
35 | train <- agaricus.train
36 | dtrain <- lgb.Dataset(train$data, label = train$label)
37 | 
38 | params = list(objective = "binary",
39 | learning_rate = 0.01, num_leaves = 63, max_depth = -1,
40 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
41 | model <- lgb.train(params, dtrain, 20)
42 | 
43 | tree_imp <- lgb.importance(model, percentage = TRUE)
44 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain")
45 | }
46 | }
47 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.interprete.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.interprete.R
3 | \name{lgb.interprete}
4 | \alias{lgb.interprete}
5 | \title{Compute feature contribution of prediction}
6 | \usage{
7 | lgb.interprete(model, data, idxset, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{model}{object of class \code{lgb.Booster}.}
11 | 
12 | \item{data}{a matrix object or a dgCMatrix object.}
13 | 
14 | \item{idxset}{an integer vector of indices of rows needed.}
15 | 
16 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use best iteration.}
17 | }
18 | \value{
19 | For regression, binary classification and lambdarank models, a \code{list} of \code{data.table} with the following columns:
20 | \itemize{
21 | \item \code{Feature} Feature names in the model.
22 | \item \code{Contribution} The total contribution of this feature's splits.
23 | }
24 | For multiclass classification, a \code{list} of \code{data.table} with the Feature column and Contribution columns to each class.
25 | }
26 | \description{
27 | Computes feature contribution components of raw-score prediction. 
28 | }
29 | \examples{
30 | \dontrun{
31 | library(lightgbm)
32 | Sigmoid <- function(x) 1 / (1 + exp(-x))
33 | Logit <- function(x) log(x / (1 - x))
34 | data(agaricus.train, package = "lightgbm")
35 | train <- agaricus.train
36 | dtrain <- lgb.Dataset(train$data, label = train$label)
37 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)))
38 | data(agaricus.test, package = "lightgbm")
39 | test <- agaricus.test
40 | 
41 | params = list(objective = "binary",
42 | learning_rate = 0.01, num_leaves = 63, max_depth = -1,
43 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
44 | model <- lgb.train(params, dtrain, 20)
45 | 
46 | tree_interpretation <- lgb.interprete(model, test$data, 1:5)
47 | }
48 | 
49 | }
50 | 
--------------------------------------------------------------------------------
/R-package/demo/cross_validation.R:
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 | # load in the agaricus dataset
3 | data(agaricus.train, package = "lightgbm")
4 | data(agaricus.test, package = "lightgbm")
5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
7 | 
8 | nrounds <- 2
9 | param <- list(num_leaves = 4,
10 | learning_rate = 1,
11 | objective = "binary")
12 | 
13 | print("Running cross validation")
14 | # Do cross validation, this will print result out as
15 | # [iteration] metric_name:mean_value+std_value
16 | # std_value is standard deviation of the metric
17 | lgb.cv(param,
18 | dtrain,
19 | nrounds,
20 | nfold = 5,
21 | eval = "binary_error")
22 | 
23 | print("Running cross validation, disable standard deviation display")
24 | # do cross validation, this will print result out as
25 | # [iteration] metric_name:mean_value+std_value
26 | # std_value is standard deviation of the metric
27 | lgb.cv(param,
28 | dtrain,
29 | nrounds,
30 | nfold = 5,
31 | eval = "binary_error",
32 | showsd = FALSE)
33 | 
34 | # You can also do cross validation with a customized loss function
35 | print("Running cross validation, with customized loss function")
36 | 
37 | logregobj <- function(preds, dtrain) {
38 | labels <- getinfo(dtrain, "label")
39 | preds <- 1 / (1 + exp(-preds))
40 | grad <- preds - labels
41 | hess <- preds * (1 - preds)
42 | return(list(grad = grad, hess = hess))
43 | }
44 | evalerror <- function(preds, dtrain) {
45 | labels <- getinfo(dtrain, "label")
46 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
47 | return(list(name = "error", value = err, higher_better = FALSE))
48 | }
49 | 
50 | # train with customized objective
51 | lgb.cv(params = param,
52 | data = dtrain,
53 | nrounds = nrounds,
54 | obj = logregobj,
55 | eval = evalerror,
56 | nfold = 5)
57 | 
--------------------------------------------------------------------------------
/R-package/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: lightgbm
2 | Type: Package
3 | Title: Light Gradient Boosting Machine
4 | Version: 2.1.2
5 | Date: 2018-06-22
6 | Authors@R: c(
7 | person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")),
8 | person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")),
9 | person("Yachen", "Yan", role = c("ctb")),
10 | person("James", "Lamb", role = c("ctb"))
11 | )
12 | Description: Tree-based algorithms can be improved by introducing boosting frameworks. 
LightGBM is one such framework, and this package offers an R interface to work with it.
13 | It is designed to be distributed and efficient with the following advantages:
14 | 1. Faster training speed and higher efficiency.
15 | 2. Lower memory usage.
16 | 3. Better accuracy.
17 | 4. Parallel learning supported.
18 | 5. Capable of handling large-scale data.
19 | In recognition of these advantages, LightGBM has been widely used in many winning solutions of machine learning competitions.
20 | Comparison experiments on public datasets suggest that LightGBM can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. In addition, parallel experiments suggest that in certain circumstances, LightGBM can achieve a linear speed-up in training time by using multiple machines.
21 | License: MIT + file LICENSE
22 | URL: https://github.com/Microsoft/LightGBM
23 | BugReports: https://github.com/Microsoft/LightGBM/issues
24 | VignetteBuilder: knitr
25 | Suggests:
26 | knitr,
27 | rmarkdown,
28 | ggplot2 (>= 1.0.1),
29 | DiagrammeR (>= 0.8.1),
30 | Ckmeans.1d.dp (>= 3.3.1),
31 | vcd (>= 1.3),
32 | testthat,
33 | igraph (>= 1.0.1),
34 | stringi (>= 0.5.2)
35 | Depends:
36 | R (>= 3.0),
37 | R6 (>= 2.0)
38 | Imports:
39 | graphics,
40 | methods,
41 | Matrix (>= 1.1-0),
42 | data.table (>= 1.9.6),
43 | magrittr (>= 1.5),
44 | jsonlite (>= 1.0)
45 | RoxygenNote: 6.0.1
46 | 
--------------------------------------------------------------------------------
/examples/python-guide/plot_example.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # pylint: disable = invalid-name, C0111
3 | import lightgbm as lgb
4 | import pandas as pd
5 | 
6 | if lgb.compat.MATPLOTLIB_INSTALLED:
7 | import matplotlib.pyplot as plt
8 | else:
9 | raise ImportError('You need to install matplotlib for plot_example.py.')
10 | 
11 | # load or create your dataset
12 | print('Load data...')
13 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
14 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
15 | 
16 | y_train = df_train[0].values
17 | y_test = df_test[0].values
18 | X_train = df_train.drop(0, axis=1).values
19 | X_test = df_test.drop(0, axis=1).values
20 | 
21 | # create dataset for lightgbm
22 | lgb_train = lgb.Dataset(X_train, y_train)
23 | lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
24 | 
25 | # specify your configurations as a dict
26 | params = {
27 | 'num_leaves': 5,
28 | 'metric': ('l1', 'l2'),
29 | 'verbose': 0
30 | }
31 | 
32 | evals_result = {}  # to record eval results for plotting
33 | 
34 | print('Start training...')
35 | # train
36 | gbm = lgb.train(params,
37 | lgb_train,
38 | num_boost_round=100,
39 | valid_sets=[lgb_train, lgb_test],
40 | feature_name=['f' + str(i + 1) for i in range(28)],
41 | categorical_feature=[21],
42 | evals_result=evals_result,
43 | verbose_eval=10)
44 | 
45 | print('Plot metrics recorded during training...')
46 | ax = lgb.plot_metric(evals_result, metric='l1')
47 | plt.show()
48 | 
49 | print('Plot feature importances...')
50 | ax = lgb.plot_importance(gbm, max_num_features=10)
51 | plt.show()
52 | 
53 | print('Plot 84th tree...')  # one tree uses a categorical feature to split
54 | ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
55 | plt.show()
56 | 
57 | print('Plot 84th tree with graphviz...')
58 | graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
59 | graph.render(view=True)
60 | 
--------------------------------------------------------------------------------
/include/LightGBM/meta.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_META_H_
2 | #define LIGHTGBM_META_H_
3 | 
4 | #include <cstdint>
5 | 
6 | #include <limits>
7 | #include <functional>
8 | #include <memory>
9 | #include <vector>
10 | 
11 | namespace LightGBM {
12 | 
13 | /*! \brief Type of data size, it is better to use signed type*/
14 | typedef int32_t data_size_t;
15 | 
16 | // Enable the following macro to use double for score_t
17 | // #define SCORE_T_USE_DOUBLE
18 | 
19 | // Enable the following macro to use double for label_t
20 | // #define LABEL_T_USE_DOUBLE
21 | 
22 | /*! \brief Type of score, and gradients */
23 | #ifdef SCORE_T_USE_DOUBLE
24 | typedef double score_t;
25 | #else
26 | typedef float score_t;
27 | #endif
28 | 
29 | /*! \brief Type of metadata, include weight and label */
30 | #ifdef LABEL_T_USE_DOUBLE
31 | typedef double label_t;
32 | #else
33 | typedef float label_t;
34 | #endif
35 | 
36 | const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
37 | 
38 | const score_t kEpsilon = 1e-15f;
39 | 
40 | const double kZeroThreshold = 1e-35f;
41 | 
42 | 
43 | typedef int32_t comm_size_t;
44 | 
45 | using PredictFunction =
46 | std::function<void(int row_idx, const std::vector<std::pair<int, double>>&, double* output)>;
47 | 
48 | typedef void(*ReduceFunction)(const char* input, char* output, int type_size, comm_size_t array_size);
49 | 
50 | 
51 | typedef void(*ReduceScatterFunction)(char* input, comm_size_t input_size, int type_size,
52 | const comm_size_t* block_start, const comm_size_t* block_len, int num_block, char* output, comm_size_t output_size,
53 | const ReduceFunction& reducer);
54 | 
55 | typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm_size_t* block_start,
56 | const comm_size_t* block_len, int num_block, char* output, comm_size_t output_size);
57 | 
58 | 
59 | #define NO_SPECIFIC (-1)
60 | 
61 | #if (_MSC_VER <= 1800)
62 | #define __func__ __FUNCTION__
63 | #endif
64 | 
65 | } // namespace LightGBM
66 | 
67 | #endif // LIGHTGBM_META_H_
68 | 
--------------------------------------------------------------------------------
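The macros described in meta.h are compile-time switches; enabling one is a build-flag decision. A static check that documents the effect (a sketch; the compile command and file name are illustrative):

```cpp
// Build e.g.: g++ -std=c++11 -DSCORE_T_USE_DOUBLE -Iinclude -c check_score_t.cpp
#include <LightGBM/meta.h>

#include <type_traits>

#ifdef SCORE_T_USE_DOUBLE
static_assert(std::is_same<LightGBM::score_t, double>::value, "score_t is double");
#else
static_assert(std::is_same<LightGBM::score_t, float>::value, "score_t is float");
#endif
```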
/R-package/man/lgb.unloader.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.unloader.R
3 | \name{lgb.unloader}
4 | \alias{lgb.unloader}
5 | \title{LightGBM unloading error fix}
6 | \usage{
7 | lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv)
8 | }
9 | \arguments{
10 | \item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE}, which means \code{LightGBM} is automatically reloaded once unloading is performed.}
11 | 
12 | \item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} objects from the global environment. Defaults to \code{FALSE}, which means they are not removed.}
13 | 
14 | \item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv}, which is the global environment.}
15 | }
16 | \value{
17 | NULL invisibly.
18 | }
19 | \description{
20 | Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful, for instance, if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 | dtrain,
35 | 100,
36 | valids,
37 | min_data = 1,
38 | learning_rate = 1,
39 | early_stopping_rounds = 10)
40 | lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
41 | rm(model, dtrain, dtest) # Not needed if wipe = TRUE
42 | gc() # Not needed if wipe = TRUE
43 | 
44 | library(lightgbm)
45 | # Do whatever you want again with LightGBM without object clashing
46 | }
47 | 
48 | }
49 | 
--------------------------------------------------------------------------------
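The next header exists because exceptions must not escape an OpenMP parallel region; its OMP_* macros capture the first exception raised inside a loop and rethrow it on the calling thread. A minimal usage sketch, modeled on how threading.h drives the same macros:

```cpp
#include <LightGBM/utils/openmp_wrapper.h>

#include <stdexcept>
#include <vector>

void CheckAllNonNegative(const std::vector<int>& v) {
  OMP_INIT_EX();                       // declares the shared exception holder
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < static_cast<int>(v.size()); ++i) {
    OMP_LOOP_EX_BEGIN();               // opens the per-iteration try block
    if (v[i] < 0) { throw std::runtime_error("negative value"); }
    OMP_LOOP_EX_END();                 // catches and stores the first exception
  }
  OMP_THROW_EX();                      // rethrows outside the parallel region
}
```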
/include/LightGBM/utils/openmp_wrapper.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_OPENMP_WRAPPER_H_
2 | #define LIGHTGBM_OPENMP_WRAPPER_H_
3 | #ifdef _OPENMP
4 | 
5 | #include <omp.h>
6 | #include <exception>
7 | #include <stdexcept>
8 | #include <mutex>
9 | #include <vector>
10 | #include <memory>
11 | #include "log.h"
12 | 
13 | class ThreadExceptionHelper {
14 | public:
15 | ThreadExceptionHelper() {
16 | ex_ptr_ = nullptr;
17 | }
18 | 
19 | ~ThreadExceptionHelper() {
20 | ReThrow();
21 | }
22 | void ReThrow() {
23 | if (ex_ptr_ != nullptr) {
24 | std::rethrow_exception(ex_ptr_);
25 | }
26 | }
27 | void CaptureException() {
28 | // only catch first exception.
29 | if (ex_ptr_ != nullptr) { return; }
30 | std::unique_lock<std::mutex> guard(lock_);
31 | if (ex_ptr_ != nullptr) { return; }
32 | ex_ptr_ = std::current_exception();
33 | }
34 | private:
35 | std::exception_ptr ex_ptr_;
36 | std::mutex lock_;
37 | };
38 | 
39 | #define OMP_INIT_EX() ThreadExceptionHelper omp_except_helper
40 | #define OMP_LOOP_EX_BEGIN() try {
41 | 
42 | #define OMP_LOOP_EX_END() } \
43 | catch(std::exception& ex) { Log::Warning(ex.what()); omp_except_helper.CaptureException(); } \
44 | catch(...) { omp_except_helper.CaptureException(); }
45 | #define OMP_THROW_EX() omp_except_helper.ReThrow()
46 | 
47 | #else
48 | 
49 | #ifdef _MSC_VER
50 | #pragma warning( disable : 4068 ) // disable unknown pragma warning
51 | #endif
52 | 
53 | #ifdef __cplusplus
54 | extern "C" {
55 | #endif
56 | /** Fall here if no OpenMP support, so just
57 | simulate a single thread running.
58 | All #pragma omp directives should be ignored by the compiler **/
59 | inline void omp_set_num_threads(int) {}
60 | inline void omp_set_nested(int) {}
61 | inline int omp_get_num_threads() {return 1;}
62 | inline int omp_get_thread_num() {return 0;}
63 | #ifdef __cplusplus
64 | }; // extern "C"
65 | #endif
66 | 
67 | #define OMP_INIT_EX()
68 | #define OMP_LOOP_EX_BEGIN()
69 | #define OMP_LOOP_EX_END()
70 | #define OMP_THROW_EX()
71 | 
72 | #endif
73 | 
74 | 
75 | 
76 | #endif /* LIGHTGBM_OPENMP_WRAPPER_H_ */
77 | 
--------------------------------------------------------------------------------
/R-package/demo/early_stopping.R:
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 | require(methods)
3 | 
4 | # Load in the agaricus dataset
5 | data(agaricus.train, package = "lightgbm")
6 | data(agaricus.test, package = "lightgbm")
7 | 
8 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
9 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
10 | 
11 | # Note: for a customized objective function, we leave objective as default
12 | # Note: what we get in prediction is the margin value
13 | # You must know what you are doing
14 | param <- list(num_leaves = 4,
15 | learning_rate = 1)
16 | valids <- list(eval = dtest)
17 | num_round <- 20
18 | 
19 | # User-defined objective function: given predictions, return gradient and second-order gradient
20 | # This is log-likelihood loss
21 | logregobj <- function(preds, dtrain) {
22 | labels <- getinfo(dtrain, "label")
23 | preds <- 1 / (1 + exp(-preds))
24 | grad <- preds - labels
25 | hess <- preds * (1 - preds)
26 | return(list(grad = grad, hess = hess))
27 | }
28 | 
29 | # User-defined evaluation function: returns metric_name, result, higher_better
30 | # NOTE: when you use a customized loss function, the default prediction value is the margin
31 | # This may make the built-in evaluation metric not function properly
32 | # For example, we are doing logistic loss, so the prediction is the score before the logistic transformation
33 | # The built-in evaluation error assumes the input is after the logistic transformation
34 | # Keep this in mind when you use the customization; you may need to write a customized evaluation function
35 | evalerror <- function(preds, dtrain) {
36 | labels <- getinfo(dtrain, "label")
37 | err <- as.numeric(sum(labels != (preds > 0.5))) / length(labels)
38 | return(list(name = "error", value = err, higher_better = FALSE))
39 | }
40 | print("Start training with early stopping setting")
41 | 
42 | bst <- lgb.train(param,
43 | dtrain,
44 | num_round,
45 | valids,
46 | objective = logregobj,
47 | eval = evalerror,
48 | early_stopping_round = 3)
49 | 
--------------------------------------------------------------------------------
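As the comments in the demo above stress, a booster trained with a custom objective predicts raw margins. Converting them to probabilities is therefore a manual step; continuing from the demo's objects:

```r
preds_margin <- predict(bst, agaricus.test$data)  # margins under the custom objective
preds_prob <- 1 / (1 + exp(-preds_margin))        # apply the sigmoid yourself
```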
19 | * \brief Append buffer to file 20 | * \param data Buffer to write from 21 | * \param bytes Number of bytes to write from buffer 22 | * \return Number of bytes written 23 | */ 24 | virtual size_t Write(const void* data, size_t bytes) const = 0; 25 | /*! 26 | * \brief Create appropriate writer for filename 27 | * \param filename Filename of the data 28 | * \return File writer instance 29 | */ 30 | static std::unique_ptr<VirtualFileWriter> Make(const std::string& filename); 31 | /*! 32 | * \brief Check filename existence 33 | * \param filename Filename of the data 34 | * \return True when the file exists 35 | */ 36 | static bool Exists(const std::string& filename); 37 | }; 38 | 39 | /** 40 | * \brief An interface for reading files into buffers 41 | */ 42 | struct VirtualFileReader { 43 | /*! 44 | * \brief Destructor 45 | * (instances are created via the Make() factory below) 46 | */ 47 | virtual ~VirtualFileReader() {}; 48 | /*! 49 | * \brief Initialize the reader 50 | * \return True when the file is available for read 51 | */ 52 | virtual bool Init() = 0; 53 | /*! 54 | * \brief Read data into buffer 55 | * \param buffer Buffer to read data into 56 | * \param bytes Number of bytes to read 57 | * \return Number of bytes read 58 | */ 59 | virtual size_t Read(void* buffer, size_t bytes) const = 0; 60 | /*! 61 | * \brief Create appropriate reader for filename 62 | * \param filename Filename of the data 63 | * \return File reader instance 64 | */ 65 | static std::unique_ptr<VirtualFileReader> Make(const std::string& filename); 66 | }; 67 | 68 | } // namespace LightGBM 69 | 70 | #endif // LightGBM_UTILS_FILE_IO_H_ 71 | -------------------------------------------------------------------------------- /R-package/man/lgb.plot.interpretation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.plot.interpretation.R 3 | \name{lgb.plot.interpretation} 4 | \alias{lgb.plot.interpretation} 5 | \title{Plot feature contribution as a bar graph} 6 | \usage{ 7 | lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1, 8 | left_margin = 10, cex = NULL) 9 | } 10 | \arguments{ 11 | \item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.} 12 | 13 | \item{top_n}{maximal number of top features to include in the plot.} 14 | 15 | \item{cols}{the number of columns in the layout; used only for multiclass classification feature contributions.} 16 | 17 | \item{left_margin}{(base R barplot) allows adjusting the left margin size to fit feature names.} 18 | 19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.} 20 | } 21 | \value{ 22 | The \code{lgb.plot.interpretation} function creates a \code{barplot}. 23 | } 24 | \description{ 25 | Plot previously calculated feature contribution as a bar graph. 26 | } 27 | \details{ 28 | The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. 29 | Features are shown ranked in a decreasing contribution order.
30 | } 31 | \examples{ 32 | \dontrun{ 33 | library(lightgbm) 34 | Sigmoid <- function(x) {1 / (1 + exp(-x))} 35 | Logit <- function(x) {log(x / (1 - x))} 36 | data(agaricus.train, package = "lightgbm") 37 | train <- agaricus.train 38 | dtrain <- lgb.Dataset(train$data, label = train$label) 39 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) 40 | data(agaricus.test, package = "lightgbm") 41 | test <- agaricus.test 42 | 43 | params = list(objective = "binary", 44 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 45 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 46 | model <- lgb.train(params, dtrain, 20) 47 | model <- lgb.train(params, dtrain, 20) 48 | 49 | tree_interpretation <- lgb.interprete(model, test$data, 1:5) 50 | lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /include/LightGBM/utils/pipeline_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ 2 | #define LIGHTGBM_UTILS_PIPELINE_READER_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "file_io.h" 13 | 14 | namespace LightGBM{ 15 | 16 | /*! 17 | * \brief A pipeline file reader, use 2 threads, one read block from file, the other process the block 18 | */ 19 | class PipelineReader { 20 | public: 21 | /*! 22 | * \brief Read data from a file, use pipeline methods 23 | * \param filename Filename of data 24 | * \process_fun Process function 25 | */ 26 | static size_t Read(const char* filename, int skip_bytes, const std::function& process_fun) { 27 | auto reader = VirtualFileReader::Make(filename); 28 | if (!reader->Init()) { 29 | return 0; 30 | } 31 | size_t cnt = 0; 32 | const size_t buffer_size = 16 * 1024 * 1024 ; 33 | // buffer used for the process_fun 34 | auto buffer_process = std::vector(buffer_size); 35 | // buffer used for the file reading 36 | auto buffer_read = std::vector(buffer_size); 37 | size_t read_cnt = 0; 38 | if (skip_bytes > 0) { 39 | // skip first k bytes 40 | read_cnt = reader->Read(buffer_process.data(), skip_bytes); 41 | } 42 | // read first block 43 | read_cnt = reader->Read(buffer_process.data(), buffer_size); 44 | 45 | size_t last_read_cnt = 0; 46 | while (read_cnt > 0) { 47 | // start read thread 48 | std::thread read_worker = std::thread( 49 | [&reader, &buffer_read, buffer_size, &last_read_cnt] { 50 | last_read_cnt = reader->Read(buffer_read.data(), buffer_size); 51 | } 52 | ); 53 | // start process 54 | cnt += process_fun(buffer_process.data(), read_cnt); 55 | // wait for read thread 56 | read_worker.join(); 57 | // exchange the buffer 58 | std::swap(buffer_process, buffer_read); 59 | read_cnt = last_read_cnt; 60 | } 61 | return cnt; 62 | } 63 | 64 | }; 65 | 66 | } // namespace LightGBM 67 | 68 | #endif // LightGBM_UTILS_PIPELINE_READER_H_ 69 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare.R 3 | \name{lgb.prepare} 4 | \alias{lgb.prepare} 5 | \title{Data preparator for LightGBM datasets (numeric)} 6 | \usage{ 7 | lgb.prepare(data) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | } 12 | \value{ 13 | The cleaned dataset. 
It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 14 | } 15 | \description{ 16 | Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (not integer). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | library(lightgbm) 21 | data(iris) 22 | 23 | str(iris) 24 | # 'data.frame': 150 obs. of 5 variables: 25 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 26 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 27 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 28 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 29 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 30 | 31 | str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric 32 | # 'data.frame': 150 obs. of 5 variables: 33 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 34 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 35 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 36 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 37 | # $ Species : num 1 1 1 1 1 1 1 1 1 1 ... 38 | 39 | # When the lightgbm package is installed and you do not want to load it, 40 | # you can still use the function! 41 | lgb.unloader() 42 | str(lightgbm::lgb.prepare(data = iris)) 43 | # 'data.frame': 150 obs. of 5 variables: 44 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 45 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 46 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 47 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 48 | # $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
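# A minimal sketch (not part of the original example): the cleaned
# data.frame must still go through as.matrix() before lgb.Dataset can
# consume it; the label below is derived from the converted Species
# column purely for illustration
iris_clean <- lightgbm::lgb.prepare(data = iris)
dtrain <- lightgbm::lgb.Dataset(as.matrix(iris_clean[, 1:4]),
                                label = iris_clean$Species - 1)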
49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: trusty 4 | 5 | git: 6 | submodules: true 7 | 8 | compiler: 9 | - gcc 10 | 11 | os: 12 | - linux 13 | - osx 14 | 15 | env: 16 | global: 17 | - PYTHON_VERSION=3.6 18 | matrix: 19 | - TASK=regular 20 | - TASK=mpi PYTHON_VERSION=2.7 21 | - TASK=pylint 22 | - TASK=check-docs 23 | - TASK=if-else 24 | - TASK=sdist PYTHON_VERSION=3.4 25 | - TASK=bdist PYTHON_VERSION=3.5 26 | - TASK=gpu METHOD=source 27 | - TASK=gpu METHOD=pip 28 | 29 | matrix: 30 | exclude: 31 | - os: osx 32 | env: TASK=gpu METHOD=source 33 | - os: osx 34 | env: TASK=gpu METHOD=pip 35 | - os: osx 36 | env: TASK=if-else 37 | - os: osx 38 | env: TASK=pylint 39 | - os: osx 40 | env: TASK=check-docs 41 | 42 | before_install: 43 | - test -n $CC && unset CC 44 | - test -n $CXX && unset CXX 45 | - export PATH="$HOME/miniconda/bin:$PATH" 46 | - export LGB_VER=$(head -n 1 VERSION.txt) 47 | - export AMDAPPSDK=$HOME/AMDAPPSDK 48 | - export LD_LIBRARY_PATH="$AMDAPPSDK/lib/x86_64:$LD_LIBRARY_PATH" 49 | - export OPENCL_VENDOR_PATH=$AMDAPPSDK/etc/OpenCL/vendors 50 | 51 | install: 52 | - bash .travis/setup.sh 53 | 54 | script: 55 | - bash .travis/test.sh 56 | 57 | notifications: 58 | email: false 59 | 60 | deploy: 61 | provider: releases 62 | api-key: 63 | secure: "idU9Fb/yUz7VsVOEb0vGR8qqxcvXr4eh1tMzkKiMWLRx5XNeq7RCUzfKAPMIizFkML9zdMh/5vPtZ1Zs++3oWPpbZE2/o4CURoE+BvwDUyEDrKTdNSGoWgWZq0QLjfahj/PR8ObWlU+XCHqRQzKXlwbynwwUGRpOJrlEY0To5Kt9gTV5W8MxSlW7xFU2TTmMa499IZut38OuenJ3Nm9mTe6MCHFW4Y5uGp/gwNuBYfqzwUXDi6h/cJiJJD5drwtNnSneFZ2PZplrKxJxSJdSQ2aHttU+Wr8xogi9hLI/H6OA4UYCF69HrWOLSggplkZt6qUzaG7UfYyid4m6YbeKMUQRNBuGXhYVGr5qkyAzqXiOesGAef550346pWEZNGPLfNnKAwqPgkp8Q8tV9i0srjzyttqFAlLqhA76yST3kuX+QS0VGepSUTV+kkfxCaHZagxtX9Xve5RNybu4B44UmHWIGJnS6ijYpxWKwvWnMmBCIezFbZYyqsiXYC+9d5RfBgNFQ4PlRfmY0vnJlwUhx1AnyL9jsxnthwl9CNczo4mgBqnCSXxlhXNHz6ToMQuhgdhnqm5+qqJzI5/eUugxh8CW18qZTZBkrnL4DxEMm+bQ2QT8O07ZHrEDPKPXxQw7tBsphWvECetJ4DxXfNaf59GrY+eD6TFZuxurB5Vvo6s=" 64 | file_glob: true 65 | file: python-package/dist/*.whl 66 | skip_cleanup: true 67 | draft: true 68 | tag_name: $TRAVIS_TAG 69 | on: 70 | condition: "$TASK = bdist" 71 | tags: true 72 | all_branches: true 73 | -------------------------------------------------------------------------------- /R-package/man/saveRDS.lgb.Booster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/saveRDS.lgb.Booster.R 3 | \name{saveRDS.lgb.Booster} 4 | \alias{saveRDS.lgb.Booster} 5 | \title{saveRDS for lgb.Booster models} 6 | \usage{ 7 | saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL, 8 | compress = TRUE, refhook = NULL, raw = TRUE) 9 | } 10 | \arguments{ 11 | \item{object}{R object to serialize.} 12 | 13 | \item{file}{a connection or the name of the file where the R object is saved to or read from.} 14 | 15 | \item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.} 16 | 17 | \item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). 
Versions prior to 2 are not supported, so this will only be relevant when there are later versions.} 18 | 19 | \item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.} 20 | 21 | \item{refhook}{a hook function for handling reference objects.} 22 | 23 | \item{raw}{whether to save the model in a raw variable or not, recommended to leave it as \code{TRUE}.} 24 | } 25 | \value{ 26 | NULL invisibly. 27 | } 28 | \description{ 29 | Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 30 | } 31 | \examples{ 32 | \dontrun{ 33 | library(lightgbm) 34 | data(agaricus.train, package = "lightgbm") 35 | train <- agaricus.train 36 | dtrain <- lgb.Dataset(train$data, label = train$label) 37 | data(agaricus.test, package = "lightgbm") 38 | test <- agaricus.test 39 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 40 | params <- list(objective = "regression", metric = "l2") 41 | valids <- list(test = dtest) 42 | model <- lgb.train(params, 43 | dtrain, 44 | 100, 45 | valids, 46 | min_data = 1, 47 | learning_rate = 1, 48 | early_stopping_rounds = 10) 49 | saveRDS.lgb.Booster(model, "model.rds") 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /.nuget/create_nuget.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from distutils.file_util import copy_file 5 | 6 | 7 | if __name__ == "__main__": 8 | source = sys.argv[1] 9 | current_dir = os.path.abspath(os.path.dirname(__file__)) 10 | linux_folder_path = os.path.join(current_dir, "runtimes", "linux-x64", "native") 11 | if not os.path.exists(linux_folder_path): 12 | os.makedirs(linux_folder_path) 13 | osx_folder_path = os.path.join(current_dir, "runtimes", "osx-x64", "native") 14 | if not os.path.exists(osx_folder_path): 15 | os.makedirs(osx_folder_path) 16 | windows_folder_path = os.path.join(current_dir, "runtimes", "win-x64", "native") 17 | if not os.path.exists(windows_folder_path): 18 | os.makedirs(windows_folder_path) 19 | copy_file(os.path.join(source, "lib_lightgbm.so"), os.path.join(linux_folder_path, "lib_lightgbm.so")) 20 | copy_file(os.path.join(source, "lib_lightgbm.dylib"), os.path.join(osx_folder_path, "lib_lightgbm.dylib")) 21 | copy_file(os.path.join(source, "lib_lightgbm.dll"), os.path.join(windows_folder_path, "lib_lightgbm.dll")) 22 | version = open(os.path.join(current_dir, os.path.pardir, 'VERSION.txt')).read().strip() 23 | nuget_str = ''' 24 | 25 | 26 | LightGBM 27 | %s 28 | Guolin Ke 29 | Guolin Ke 30 | https://github.com/Microsoft/LightGBM/blob/master/LICENSE 31 | https://github.com/Microsoft/LightGBM 32 | false 33 | A fast, distributed, high performance gradient boosting framework 34 | Copyright 2018 @ Microsoft 35 | machine-learning data-mining distributed native boosting gbdt 36 | 37 | 38 | 39 | 40 | 41 | 42 | ''' % version 43 | with open(os.path.join(current_dir, "LightGBM.nuspec"), "w") as nuget_file: 44 | nuget_file.write(nuget_str) 45 | -------------------------------------------------------------------------------- /R-package/man/lgb.model.dt.tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.model.dt.tree.R
3 | \name{lgb.model.dt.tree} 4 | \alias{lgb.model.dt.tree} 5 | \title{Parse a LightGBM model json dump} 6 | \usage{ 7 | lgb.model.dt.tree(model, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}} 11 | 12 | \item{num_iteration}{number of iterations you want to predict with. NULL or 13 | <= 0 means use the best iteration} 14 | } 15 | \value{ 16 | A \code{data.table} with detailed information about the nodes and leaves of the model's trees. 17 | 18 | The columns of the \code{data.table} are: 19 | 20 | \itemize{ 21 | \item \code{tree_index}: ID of a tree in a model (integer) 22 | \item \code{split_index}: ID of a node in a tree (integer) 23 | \item \code{split_feature}: for a node, it's a feature name (character); 24 | for a leaf, it simply labels it as \code{"NA"} 25 | \item \code{node_parent}: ID of the parent node for the current node (integer) 26 | \item \code{leaf_index}: ID of a leaf in a tree (integer) 27 | \item \code{leaf_parent}: ID of the parent node for the current leaf (integer) 28 | \item \code{split_gain}: Split gain of a node 29 | \item \code{threshold}: Splitting threshold value of a node 30 | \item \code{decision_type}: Decision type of a node 31 | \item \code{default_left}: Determines how to handle NA values, TRUE -> Left, FALSE -> Right 32 | \item \code{internal_value}: Node value 33 | \item \code{internal_count}: The number of observations collected by a node 34 | \item \code{leaf_value}: Leaf value 35 | \item \code{leaf_count}: The number of observations collected by a leaf 36 | } 37 | } 38 | \description{ 39 | Parse a LightGBM model json dump into a \code{data.table} structure. 40 | } 41 | \examples{ 42 | \dontrun{ 43 | library(lightgbm) 44 | 45 | data(agaricus.train, package = "lightgbm") 46 | train <- agaricus.train 47 | dtrain <- lgb.Dataset(train$data, label = train$label) 48 | 49 | params = list(objective = "binary", 50 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 51 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 52 | model <- lgb.train(params, dtrain, 20) 53 | 54 | tree_dt <- lgb.model.dt.tree(model) 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Using LightGBM via Docker 2 | 3 | This directory contains `Dockerfile`s to make it easy to build and run LightGBM via [Docker](http://www.docker.com/). 4 | 5 | ## Installing Docker 6 | 7 | Follow the general installation instructions 8 | [on the Docker site](https://docs.docker.com/installation/): 9 | 10 | * [macOS](https://docs.docker.com/installation/mac/): [docker toolbox](https://www.docker.com/toolbox) 11 | * [Ubuntu](https://docs.docker.com/installation/ubuntulinux/) 12 | 13 | ## Using CLI Version of LightGBM via Docker 14 | 15 | Build a Docker image with the LightGBM CLI: 16 | 17 | ``` 18 | mkdir lightgbm-docker 19 | cd lightgbm-docker 20 | wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-cli 21 | docker build -t lightgbm-cli -f dockerfile-cli . 22 | ``` 23 | 24 | where `lightgbm-cli` is the desired Docker image name.
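Once the build completes, you can verify that the image exists before running anything (a quick sanity check using the standard Docker CLI):

```
docker images lightgbm-cli
```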
25 | 26 | Run the CLI from the container: 27 | 28 | ``` 29 | docker run --rm -it \ 30 | --volume $HOME/lgbm.conf:/lgbm.conf \ 31 | --volume $HOME/model.txt:/model.txt \ 32 | --volume $HOME/tmp:/out \ 33 | lightgbm-cli \ 34 | config=lgbm.conf 35 | ``` 36 | 37 | In the above example, three volumes are [mounted](https://docs.docker.com/engine/reference/commandline/run/#mount-volume--v-read-only) 38 | from the host machine to the Docker container: 39 | 40 | * `lgbm.conf` - task config, for example 41 | 42 | ``` 43 | app=multiclass 44 | num_class=3 45 | task=convert_model 46 | input_model=model.txt 47 | convert_model=/out/predict.cpp 48 | convert_model_language=cpp 49 | ``` 50 | 51 | * `model.txt` - an input file for the task; it could be training data or, in this case, a pre-trained model. 52 | * `out` - a directory to store the output of the task; notice that `convert_model` in the task config is using it. 53 | 54 | `config=lgbm.conf` is a command-line argument passed to the `lightgbm` executable; more arguments can 55 | be passed if required. 56 | 57 | ## Running the Python-package Container 58 | 59 | Build the container, for Python users: 60 | 61 | ``` 62 | mkdir lightgbm-docker 63 | cd lightgbm-docker 64 | wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-python 65 | docker build -t lightgbm -f dockerfile-python . 66 | ``` 67 | 68 | After the build finishes, run the container: 69 | 70 | ``` 71 | docker run --rm -it lightgbm 72 | ``` 73 | -------------------------------------------------------------------------------- /src/boosting/boosting.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "gbdt.h" 3 | #include "dart.hpp" 4 | #include "goss.hpp" 5 | #include "rf.hpp" 6 | 7 | namespace LightGBM { 8 | 9 | std::string GetBoostingTypeFromModelFile(const char* filename) { 10 | TextReader model_reader(filename, true); 11 | std::string type = model_reader.first_line(); 12 | return type; 13 | } 14 | 15 | bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) { 16 | auto start_time = std::chrono::steady_clock::now(); 17 | if (boosting != nullptr) { 18 | TextReader model_reader(filename, true); 19 | size_t buffer_len = 0; 20 | auto buffer = model_reader.ReadContent(&buffer_len); 21 | if (!boosting->LoadModelFromString(buffer.data(), buffer_len)) { 22 | return false; 23 | } 24 | } 25 | std::chrono::duration<double, std::milli> delta = (std::chrono::steady_clock::now() - start_time); 26 | Log::Debug("Time for loading model: %f seconds", 1e-3 * delta.count()); 27 | return true; 28 | } 29 | 30 | Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) { 31 | if (filename == nullptr || filename[0] == '\0') { 32 | if (type == std::string("gbdt")) { 33 | return new GBDT(); 34 | } else if (type == std::string("dart")) { 35 | return new DART(); 36 | } else if (type == std::string("goss")) { 37 | return new GOSS(); 38 | } else if (type == std::string("rf")) { 39 | return new RF(); 40 | } else { 41 | return nullptr; 42 | } 43 | } else { 44 | std::unique_ptr<Boosting> ret; 45 | if (GetBoostingTypeFromModelFile(filename) == std::string("tree")) { 46 | if (type == std::string("gbdt")) { 47 | ret.reset(new GBDT()); 48 | } else if (type == std::string("dart")) { 49 | ret.reset(new DART()); 50 | } else if (type == std::string("goss")) { 51 | ret.reset(new GOSS()); 52 | } else if (type == std::string("rf")) { 53 | ret.reset(new RF()); 54 | } else { 55 | Log::Fatal("Unknown boosting type %s", type.c_str()); 56 | } 57 |
LoadFileToBoosting(ret.get(), filename); 58 | } else { 59 | Log::Fatal("Unknown model format or submodel type in model file %s", filename); 60 | } 61 | return ret.release(); 62 | } 63 | } 64 | 65 | } // namespace LightGBM 66 | -------------------------------------------------------------------------------- /docs/gcc-Tips.rst: -------------------------------------------------------------------------------- 1 | Recommendations When Using gcc 2 | ============================== 3 | 4 | It is recommended to use ``-O3 -mtune=native`` to achieve maximum speed during LightGBM training. 5 | 6 | Using Intel Ivy Bridge CPU on 1M x 1K Bosch dataset, the performance increases as follow: 7 | 8 | +-------------------------------------+---------------------+ 9 | | Compilation Flag | Performance Index | 10 | +=====================================+=====================+ 11 | | ``-O2 -mtune=core2`` | 100.00% | 12 | +-------------------------------------+---------------------+ 13 | | ``-O2 -mtune=native`` | 100.90% | 14 | +-------------------------------------+---------------------+ 15 | | ``-O3 -mtune=native`` | 102.78% | 16 | +-------------------------------------+---------------------+ 17 | | ``-O3 -ffast-math -mtune=native`` | 100.64% | 18 | +-------------------------------------+---------------------+ 19 | 20 | You can find more details on the experimentation below: 21 | 22 | - `Laurae++/Benchmarks `__ 23 | 24 | - `Laurae2/gbt\_benchmarks `__ 25 | 26 | - `Laurae's Benchmark Master Data (Interactive) `__ 27 | 28 | - `Kaggle Paris Meetup #12 Slides `__ 29 | 30 | Some explanatory pictures: 31 | 32 | .. image:: ./_static/images/gcc-table.png 33 | :align: center 34 | :target: ./_static/images/gcc-table.png 35 | 36 | .. image:: ./_static/images/gcc-bars.png 37 | :align: center 38 | :target: ./_static/images/gcc-bars.png 39 | 40 | .. image:: ./_static/images/gcc-chart.png 41 | :align: center 42 | :target: ./_static/images/gcc-chart.png 43 | 44 | .. image:: ./_static/images/gcc-comparison-1.png 45 | :align: center 46 | :target: ./_static/images/gcc-comparison-1.png 47 | 48 | .. image:: ./_static/images/gcc-comparison-2.png 49 | :align: center 50 | :target: ./_static/images/gcc-comparison-2.png 51 | 52 | .. image:: ./_static/images/gcc-meetup-1.png 53 | :align: center 54 | :target: ./_static/images/gcc-meetup-1.png 55 | 56 | .. 
image:: ./_static/images/gcc-meetup-2.png 57 | :align: center 58 | :target: ./_static/images/gcc-meetup-2.png 59 | -------------------------------------------------------------------------------- /examples/python-guide/sklearn_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import numpy as np 4 | import pandas as pd 5 | import lightgbm as lgb 6 | 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.model_selection import GridSearchCV 9 | 10 | # load or create your dataset 11 | print('Load data...') 12 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 13 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 14 | 15 | y_train = df_train[0].values 16 | y_test = df_test[0].values 17 | X_train = df_train.drop(0, axis=1).values 18 | X_test = df_test.drop(0, axis=1).values 19 | 20 | print('Start training...') 21 | # train 22 | gbm = lgb.LGBMRegressor(objective='regression', 23 | num_leaves=31, 24 | learning_rate=0.05, 25 | n_estimators=20) 26 | gbm.fit(X_train, y_train, 27 | eval_set=[(X_test, y_test)], 28 | eval_metric='l1', 29 | early_stopping_rounds=5) 30 | 31 | print('Start predicting...') 32 | # predict 33 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) 34 | # eval 35 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 36 | 37 | # feature importances 38 | print('Feature importances:', list(gbm.feature_importances_)) 39 | 40 | 41 | # self-defined eval metric 42 | # f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool 43 | # Root Mean Squared Logarithmic Error (RMSLE) 44 | def rmsle(y_true, y_pred): 45 | return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False 46 | 47 | 48 | print('Start training with custom eval function...') 49 | # train 50 | gbm.fit(X_train, y_train, 51 | eval_set=[(X_test, y_test)], 52 | eval_metric=rmsle, 53 | early_stopping_rounds=5) 54 | 55 | print('Start predicting...') 56 | # predict 57 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) 58 | # eval 59 | print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1]) 60 | 61 | # other scikit-learn modules 62 | estimator = lgb.LGBMRegressor(num_leaves=31) 63 | 64 | param_grid = { 65 | 'learning_rate': [0.01, 0.1, 1], 66 | 'n_estimators': [20, 40] 67 | } 68 | 69 | gbm = GridSearchCV(estimator, param_grid) 70 | 71 | gbm.fit(X_train, y_train) 72 | 73 | print('Best parameters found by grid search are:', gbm.best_params_) 74 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare2.R 3 | \name{lgb.prepare2} 4 | \alias{lgb.prepare2} 5 | \title{Data preparator for LightGBM datasets (integer)} 6 | \usage{ 7 | lgb.prepare2(data) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | } 12 | \value{ 13 | The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 14 | } 15 | \description{ 16 | Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). 
Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that there are programs which do not support integer-only input. Consider this a memory-halving technique which can be dangerous, especially for LightGBM. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | library(lightgbm) 21 | data(iris) 22 | 23 | str(iris) 24 | # 'data.frame': 150 obs. of 5 variables: 25 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 26 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 27 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 28 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 29 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 30 | 31 | str(lgb.prepare2(data = iris)) # Convert all factors/chars to integer 32 | # 'data.frame': 150 obs. of 5 variables: 33 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 34 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 35 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 36 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 37 | # $ Species : int 1 1 1 1 1 1 1 1 1 1 ... 38 | 39 | # When the lightgbm package is installed and you do not want to load it, 40 | # you can still use the function! 41 | lgb.unloader() 42 | str(lightgbm::lgb.prepare2(data = iris)) 43 | # 'data.frame': 150 obs. of 5 variables: 44 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 45 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 46 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 47 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 48 | # $ Species : int 1 1 1 1 1 1 1 1 1 1 ... 49 | 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /R-package/R/lgb.importance.R: -------------------------------------------------------------------------------- 1 | #' Compute feature importance in a model 2 | #' 3 | #' Creates a \code{data.table} of feature importances in a model. 4 | #' 5 | #' @param model object of class \code{lgb.Booster}. 6 | #' @param percentage whether to show importance in relative percentage. 7 | #' 8 | #' @return 9 | #' 10 | #' For a tree model, a \code{data.table} with the following columns: 11 | #' \itemize{ 12 | #' \item \code{Feature} Feature names in the model. 13 | #' \item \code{Gain} The total gain of this feature's splits. 14 | #' \item \code{Cover} The number of observations related to this feature. 15 | #' \item \code{Frequency} The number of times a feature is used to split in trees.
16 | #' } 17 | #' 18 | #' @examples 19 | #' \dontrun{ 20 | #' library(lightgbm) 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' model <- lgb.train(params, dtrain, 20) 30 | #' 31 | #' tree_imp1 <- lgb.importance(model, percentage = TRUE) 32 | #' tree_imp2 <- lgb.importance(model, percentage = FALSE) 33 | #' } 34 | #' 35 | #' @importFrom magrittr %>% %T>% 36 | #' @importFrom data.table := 37 | #' @export 38 | lgb.importance <- function(model, percentage = TRUE) { 39 | 40 | # Check if model is a lightgbm model 41 | if (!inherits(model, "lgb.Booster")) { 42 | stop("'model' has to be an object of class lgb.Booster") 43 | } 44 | 45 | # Setup importance 46 | tree_dt <- lgb.model.dt.tree(model) 47 | 48 | # Extract elements 49 | tree_imp <- tree_dt %>% 50 | magrittr::extract(., 51 | i = ! is.na(split_index), 52 | j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N), 53 | by = "split_feature") %T>% 54 | data.table::setnames(., old = "split_feature", new = "Feature") %>% 55 | magrittr::extract(., i = order(Gain, decreasing = TRUE)) 56 | 57 | # Check if relative values are requested 58 | if (percentage) { 59 | tree_imp[, ":="(Gain = Gain / sum(Gain), 60 | Cover = Cover / sum(Cover), 61 | Frequency = Frequency / sum(Frequency))] 62 | } 63 | 64 | # Return importance table 65 | return(tree_imp) 66 | 67 | } 68 | -------------------------------------------------------------------------------- /include/LightGBM/application.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_APPLICATION_H_ 2 | #define LIGHTGBM_APPLICATION_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace LightGBM { 11 | 12 | class DatasetLoader; 13 | class Dataset; 14 | class Boosting; 15 | class ObjectiveFunction; 16 | class Metric; 17 | 18 | /*! 19 | * \brief The main entrance of LightGBM. this application has two tasks: 20 | * Train and Predict. 21 | * Train task will train a new model 22 | * Predict task will predict the scores of test data using existing model, 23 | * and save the score to disk. 24 | */ 25 | class Application { 26 | public: 27 | Application(int argc, char** argv); 28 | 29 | /*! \brief Destructor */ 30 | ~Application(); 31 | 32 | /*! \brief To call this funciton to run application*/ 33 | inline void Run(); 34 | 35 | private: 36 | 37 | /*! \brief Load parameters from command line and config file*/ 38 | void LoadParameters(int argc, char** argv); 39 | 40 | /*! \brief Load data, including training data and validation data*/ 41 | void LoadData(); 42 | 43 | /*! \brief Initialization before training*/ 44 | void InitTrain(); 45 | 46 | /*! \brief Main Training logic */ 47 | void Train(); 48 | 49 | /*! \brief Initializations before prediction */ 50 | void InitPredict(); 51 | 52 | /*! \brief Main predicting logic */ 53 | void Predict(); 54 | 55 | /*! \brief Main Convert model logic */ 56 | void ConvertModel(); 57 | 58 | /*! \brief All configs */ 59 | Config config_; 60 | /*! \brief Training data */ 61 | std::unique_ptr train_data_; 62 | /*! \brief Validation data */ 63 | std::vector> valid_datas_; 64 | /*! \brief Metric for training data */ 65 | std::vector> train_metric_; 66 | /*! 
\brief Metrics for validation data */ 67 | std::vector>> valid_metrics_; 68 | /*! \brief Boosting object */ 69 | std::unique_ptr boosting_; 70 | /*! \brief Training objective function */ 71 | std::unique_ptr objective_fun_; 72 | }; 73 | 74 | 75 | inline void Application::Run() { 76 | if (config_.task == TaskType::kPredict || config_.task == TaskType::KRefitTree) { 77 | InitPredict(); 78 | Predict(); 79 | } else if (config_.task == TaskType::kConvertModel) { 80 | ConvertModel(); 81 | } else { 82 | InitTrain(); 83 | Train(); 84 | } 85 | } 86 | 87 | } // namespace LightGBM 88 | 89 | #endif // LightGBM_APPLICATION_H_ 90 | -------------------------------------------------------------------------------- /R-package/R/lgb.unloader.R: -------------------------------------------------------------------------------- 1 | #' LightGBM unloading error fix 2 | #' 3 | #' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object. 4 | #' 5 | #' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed. 6 | #' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them. 7 | #' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment. 8 | #' 9 | #' @return NULL invisibly. 10 | #' 11 | #' @examples 12 | #' \dontrun{ 13 | #' library(lightgbm) 14 | #' data(agaricus.train, package = "lightgbm") 15 | #' train <- agaricus.train 16 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 17 | #' data(agaricus.test, package = "lightgbm") 18 | #' test <- agaricus.test 19 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 20 | #' params <- list(objective = "regression", metric = "l2") 21 | #' valids <- list(test = dtest) 22 | #' model <- lgb.train(params, 23 | #' dtrain, 24 | #' 100, 25 | #' valids, 26 | #' min_data = 1, 27 | #' learning_rate = 1, 28 | #' early_stopping_rounds = 10) 29 | #' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) 30 | #' rm(model, dtrain, dtest) # Not needed if wipe = TRUE 31 | #' gc() # Not needed if wipe = TRUE 32 | #' 33 | #' library(lightgbm) 34 | #' # Do whatever you want again with LightGBM without object clashing 35 | #' } 36 | #' 37 | #' @export 38 | lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) { 39 | 40 | # Unload package 41 | try(detach("package:lightgbm", unload = TRUE), silent = TRUE) 42 | 43 | # Should we wipe variables? (lgb.Booster, lgb.Dataset) 44 | if (wipe) { 45 | boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir)) 46 | datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir)) 47 | rm(list = c(boosters, datasets), envir = envir) 48 | gc(verbose = FALSE) 49 | } 50 | 51 | # Load package back? 
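# If requested, attach lightgbm again so downstream code can keep calling
# its functions without a manual library() call (this assumes the package
# is still installed on the library path)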
52 | if (restore) { 53 | library(lightgbm) 54 | } 55 | 56 | invisible() 57 | 58 | } 59 | -------------------------------------------------------------------------------- /R-package/demo/multiclass.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | 3 | # We load the default iris dataset shipped with R 4 | data(iris) 5 | 6 | # We must convert factors to numeric 7 | # They must be starting from number 0 to use multiclass 8 | # For instance: 0, 1, 2, 3, 4, 5... 9 | iris$Species <- as.numeric(as.factor(iris$Species)) - 1 10 | 11 | # We cut the data set into 80% train and 20% validation 12 | # The 10 last samples of each class are for validation 13 | 14 | train <- as.matrix(iris[c(1:40, 51:90, 101:140), ]) 15 | test <- as.matrix(iris[c(41:50, 91:100, 141:150), ]) 16 | dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5]) 17 | dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5]) 18 | valids <- list(test = dtest) 19 | 20 | # Method 1 of training 21 | params <- list(objective = "multiclass", metric = "multi_error", num_class = 3) 22 | model <- lgb.train(params, 23 | dtrain, 24 | 100, 25 | valids, 26 | min_data = 1, 27 | learning_rate = 1, 28 | early_stopping_rounds = 10) 29 | 30 | # We can predict on test data, outputs a 90-length vector 31 | # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3... 32 | my_preds <- predict(model, test[, 1:4]) 33 | 34 | # Method 2 of training, identical 35 | model <- lgb.train(list(), 36 | dtrain, 37 | 100, 38 | valids, 39 | min_data = 1, 40 | learning_rate = 1, 41 | early_stopping_rounds = 10, 42 | objective = "multiclass", 43 | metric = "multi_error", 44 | num_class = 3) 45 | 46 | # We can predict on test data, identical 47 | my_preds <- predict(model, test[, 1:4]) 48 | 49 | # A (30x3) matrix with the predictions, use parameter reshape 50 | # class1 class2 class3 51 | # obs1 obs1 obs1 52 | # obs2 obs2 obs2 53 | # .... .... .... 54 | my_preds <- predict(model, test[, 1:4], reshape = TRUE) 55 | 56 | # We can also get the predicted scores before the Sigmoid/Softmax application 57 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE) 58 | 59 | # Raw score predictions as matrix instead of vector 60 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE) 61 | 62 | # We can also get the leaf index 63 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE) 64 | 65 | # Predict leaf index as matrix instead of vector 66 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE) 67 | -------------------------------------------------------------------------------- /R-package/man/predict.lgb.Booster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Booster.R 3 | \name{predict.lgb.Booster} 4 | \alias{predict.lgb.Booster} 5 | \title{Predict method for LightGBM model} 6 | \usage{ 7 | \method{predict}{lgb.Booster}(object, data, num_iteration = NULL, 8 | rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE, 9 | ...) 
10 | } 11 | \arguments{ 12 | \item{object}{Object of class \code{lgb.Booster}} 13 | 14 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} 15 | 16 | \item{num_iteration}{number of iterations you want to predict with; NULL or <= 0 means use the best iteration} 17 | 18 | \item{rawscore}{whether the prediction should be returned in the form of the original untransformed 19 | sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for 20 | logistic regression would result in predictions for log-odds instead of probabilities.} 21 | 22 | \item{predleaf}{whether to predict leaf indices instead.} 23 | 24 | \item{header}{only used for prediction from a text file. TRUE if the text file has a header} 25 | 26 | \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several 27 | prediction outputs per case.} 28 | 29 | \item{...}{Additional named arguments passed to the \code{predict()} method of 30 | the \code{lgb.Booster} object passed to \code{object}.} 31 | } 32 | \value{ 33 | For regression or binary classification, it returns a vector of length \code{nrow(data)}. 34 | For multiclass classification, either a \code{num_class * nrow(data)} vector or 35 | a \code{(nrow(data), num_class)} dimension matrix is returned, depending on 36 | the \code{reshape} value. 37 | 38 | When \code{predleaf = TRUE}, the output is a matrix object with the 39 | number of columns corresponding to the number of trees. 40 | } 41 | \description{ 42 | Predicted values based on class \code{lgb.Booster} 43 | } 44 | \examples{ 45 | \dontrun{ 46 | library(lightgbm) 47 | data(agaricus.train, package = "lightgbm") 48 | train <- agaricus.train 49 | dtrain <- lgb.Dataset(train$data, label = train$label) 50 | data(agaricus.test, package = "lightgbm") 51 | test <- agaricus.test 52 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 53 | params <- list(objective = "regression", metric = "l2") 54 | valids <- list(test = dtest) 55 | model <- lgb.train(params, 56 | dtrain, 57 | 100, 58 | valids, 59 | min_data = 1, 60 | learning_rate = 1, 61 | early_stopping_rounds = 10) 62 | preds <- predict(model, test$data) 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/boosting/prediction_early_stop.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace LightGBM; 12 | 13 | PredictionEarlyStopInstance CreateNone(const PredictionEarlyStopConfig&) { 14 | return PredictionEarlyStopInstance{ 15 | [](const double*, int) { 16 | return false; 17 | }, 18 | std::numeric_limits<int>::max() // make sure the lambda is almost never called 19 | }; 20 | } 21 | 22 | PredictionEarlyStopInstance CreateMulticlass(const PredictionEarlyStopConfig& config) { 23 | // margin_threshold will be captured by value 24 | const double margin_threshold = config.margin_threshold; 25 | 26 | return PredictionEarlyStopInstance{ 27 | [margin_threshold](const double* pred, int sz) { 28 | if (sz < 2) { 29 | Log::Fatal("Multiclass early stopping needs predictions to be of length two or larger"); 30 | } 31 | 32 | // copy and sort 33 | std::vector<double> votes(static_cast<size_t>(sz)); 34 | for (int i = 0; i < sz; ++i) { 35 | votes[i] = pred[i]; 36 | } 37 | std::partial_sort(votes.begin(), votes.begin() + 2, votes.end(), std::greater<double>()); 38 | 39 | const auto margin = votes[0] -
votes[1]; 40 | 41 | if (margin > margin_threshold) { 42 | return true; 43 | } 44 | 45 | return false; 46 | }, 47 | config.round_period 48 | }; 49 | } 50 | 51 | PredictionEarlyStopInstance CreateBinary(const PredictionEarlyStopConfig& config) { 52 | // margin_threshold will be captured by value 53 | const double margin_threshold = config.margin_threshold; 54 | 55 | return PredictionEarlyStopInstance{ 56 | [margin_threshold](const double* pred, int sz) { 57 | if (sz != 1) { 58 | Log::Fatal("Binary early stopping needs predictions to be of length one"); 59 | } 60 | const auto margin = 2.0 * fabs(pred[0]); 61 | 62 | if (margin > margin_threshold) { 63 | return true; 64 | } 65 | 66 | return false; 67 | }, 68 | config.round_period 69 | }; 70 | } 71 | 72 | } 73 | 74 | namespace LightGBM { 75 | 76 | PredictionEarlyStopInstance CreatePredictionEarlyStopInstance(const std::string& type, 77 | const PredictionEarlyStopConfig& config) { 78 | if (type == "none") { 79 | return CreateNone(config); 80 | } else if (type == "multiclass") { 81 | return CreateMulticlass(config); 82 | } else if (type == "binary") { 83 | return CreateBinary(config); 84 | } else { 85 | throw std::runtime_error("Unknown early stopping type: " + type); 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /docs/Advanced-Topics.rst: -------------------------------------------------------------------------------- 1 | Advanced Topics 2 | =============== 3 | 4 | Missing Value Handle 5 | -------------------- 6 | 7 | - LightGBM enables the missing value handle by default. Disable it by setting ``use_missing=false``. 8 | 9 | - LightGBM uses NA (NaN) to represent missing values by default. Change it to use zero by setting ``zero_as_missing=true``. 10 | 11 | - When ``zero_as_missing=false`` (default), the unshown values in sparse matrices (and LightSVM) are treated as zeros. 12 | 13 | - When ``zero_as_missing=true``, NA and zeros (including unshown values in sparse matrices (and LightSVM)) are treated as missing. 14 | 15 | Categorical Feature Support 16 | --------------------------- 17 | 18 | - LightGBM offers good accuracy with integer-encoded categorical features. LightGBM applies 19 | `Fisher (1958) `_ 20 | to find the optimal split over categories as 21 | `described here <./Features.rst#optimal-split-for-categorical-features>`_. This often performs better than one-hot encoding. 22 | 23 | - Use ``categorical_feature`` to specify the categorical features. 24 | Refer to the parameter ``categorical_feature`` in `Parameters <./Parameters.rst#categorical_feature>`__. 25 | 26 | - Categorical features must be encoded as non-negative integers (``int``) less than ``Int32.MaxValue`` (2147483647). 27 | It is best to use a contiguous range of integers. 28 | 29 | - Use ``min_data_per_group``, ``cat_smooth`` to deal with over-fitting (when ``#data`` is small or ``#category`` is large). 30 | 31 | - For a categorical feature with high cardinality (``#category`` is large), it often works best to 32 | treat the feature as numeric, either by simply ignoring the categorical interpretation of the integers or 33 | by embedding the categories in a low-dimensional numeric space. 34 | 35 | LambdaRank 36 | ---------- 37 | 38 | - The label should be of type ``int``, such that larger numbers correspond to higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect). 39 | 40 | - Use ``label_gain`` to set the gain(weight) of ``int`` label. 41 | 42 | - Use ``max_position`` to set the NDCG optimization position. 
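For example, a minimal sketch with the R-package (the ``train_matrix`` and ``relevance_labels`` objects below are hypothetical placeholders), showing how the query group sizes are attached to the Dataset::

    library(lightgbm)
    dtrain <- lgb.Dataset(train_matrix, label = relevance_labels)
    setinfo(dtrain, "group", c(10, 6, 24))  # number of rows in each query group
    params <- list(objective = "lambdarank", metric = "ndcg", max_position = 3)
    model <- lgb.train(params, dtrain, 10)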
43 | 44 | Parameters Tuning 45 | ----------------- 46 | 47 | - Refer to `Parameters Tuning <./Parameters-Tuning.rst>`__. 48 | 49 | Parallel Learning 50 | ----------------- 51 | 52 | - Refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__. 53 | 54 | GPU Support 55 | ----------- 56 | 57 | - Refer to `GPU Tutorial <./GPU-Tutorial.rst>`__ and `GPU Targets <./GPU-Targets.rst>`__. 58 | 59 | Recommendations for gcc Users (MinGW, \*nix) 60 | -------------------------------------------- 61 | 62 | - Refer to `gcc Tips <./gcc-Tips.rst>`__. 63 | -------------------------------------------------------------------------------- /examples/python-guide/README.md: -------------------------------------------------------------------------------- 1 | Python-package Examples 2 | ======================= 3 | 4 | Here is an example for LightGBM to use Python-package. 5 | 6 | You should install LightGBM [Python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) first. 7 | 8 | You also need scikit-learn, pandas, matplotlib (only for plot example), and scipy (only for logistic regression example) to run the examples, but they are not required for the package itself. You can install them with pip: 9 | 10 | ``` 11 | pip install scikit-learn pandas matplotlib scipy -U 12 | ``` 13 | 14 | Now you can run examples in this folder, for example: 15 | 16 | ``` 17 | python simple_example.py 18 | ``` 19 | 20 | Examples include: 21 | 22 | - [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py) 23 | - Construct Dataset 24 | - Basic train and predict 25 | - Eval during training 26 | - Early stopping 27 | - Save model to file 28 | - [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py) 29 | - Create data for learning with sklearn interface 30 | - Basic train and predict with sklearn interface 31 | - Feature importances with sklearn interface 32 | - Self-defined eval metric with sklearn interface 33 | - Find best parameters for the model with sklearn's GridSearchCV 34 | - [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py) 35 | - Set feature names 36 | - Directly use categorical features without one-hot encoding 37 | - Dump model to json format 38 | - Get feature importances 39 | - Get feature names 40 | - Load model to predict 41 | - Dump and load model with pickle 42 | - Load model file to continue training 43 | - Change learning rates during training 44 | - Change any parameters during training 45 | - Self-defined objective function 46 | - Self-defined eval metric 47 | - Callback function 48 | - [logistic_regression.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/logistic_regression.py) 49 | - Use objective `xentropy` or `binary` 50 | - Use `xentropy` with binary labels or probability labels 51 | - Use `binary` only with binary labels 52 | - Compare speed of `xentropy` versus `binary` 53 | - [plot_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/plot_example.py) 54 | - Construct Dataset 55 | - Train and record eval results for further plotting 56 | - Plot metrics recorded during training 57 | - Plot feature importances 58 | - Plot one specified tree 59 | - Plot one specified tree with Graphviz 60 | -------------------------------------------------------------------------------- /include/LightGBM/objective_function.h: 
-------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_ 2 | #define LIGHTGBM_OBJECTIVE_FUNCTION_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace LightGBM { 10 | /*! 11 | * \brief The interface of Objective Function. 12 | */ 13 | class ObjectiveFunction { 14 | public: 15 | /*! \brief virtual destructor */ 16 | virtual ~ObjectiveFunction() {} 17 | 18 | /*! 19 | * \brief Initialize 20 | * \param metadata Label data 21 | * \param num_data Number of data 22 | */ 23 | virtual void Init(const Metadata& metadata, data_size_t num_data) = 0; 24 | 25 | /*! 26 | * \brief calculating first order derivative of loss function 27 | * \param score prediction score in this round 28 | * \gradients Output gradients 29 | * \hessians Output hessians 30 | */ 31 | virtual void GetGradients(const double* score, 32 | score_t* gradients, score_t* hessians) const = 0; 33 | 34 | virtual const char* GetName() const = 0; 35 | 36 | virtual bool IsConstantHessian() const { return false; } 37 | 38 | virtual bool IsRenewTreeOutput() const { return false; } 39 | 40 | virtual double RenewTreeOutput(double ori_output, const double*, 41 | const data_size_t*, 42 | const data_size_t*, 43 | data_size_t) const { return ori_output; } 44 | 45 | virtual double BoostFromScore() const { return 0.0f; } 46 | 47 | virtual bool SkipEmptyClass() const { return false; } 48 | 49 | virtual int NumModelPerIteration() const { return 1; } 50 | 51 | virtual int NumPredictOneRow() const { return 1; } 52 | 53 | /*! \brief The prediction should be accurate or not. True will disable early stopping for prediction. */ 54 | virtual bool NeedAccuratePrediction() const { return true; } 55 | 56 | virtual void ConvertOutput(const double* input, double* output) const { 57 | output[0] = input[0]; 58 | } 59 | 60 | virtual std::string ToString() const = 0; 61 | 62 | ObjectiveFunction() = default; 63 | /*! \brief Disable copy */ 64 | ObjectiveFunction& operator=(const ObjectiveFunction&) = delete; 65 | /*! \brief Disable copy */ 66 | ObjectiveFunction(const ObjectiveFunction&) = delete; 67 | 68 | /*! 69 | * \brief Create object of objective function 70 | * \param type Specific type of objective function 71 | * \param config Config for objective function 72 | */ 73 | LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type, 74 | const Config& config); 75 | 76 | /*! 
77 | * \brief Load objective function from string object 78 | */ 79 | LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str); 80 | }; 81 | 82 | } // namespace LightGBM 83 | 84 | #endif // LightGBM_OBJECTIVE_FUNCTION_H_ 85 | -------------------------------------------------------------------------------- /swig/lightgbmlib.i: -------------------------------------------------------------------------------- 1 | /* lightgbmlib.i */ 2 | %module lightgbmlib 3 | %ignore LGBM_BoosterSaveModelToString; 4 | %{ 5 | /* Includes the header in the wrapper code */ 6 | #include "../include/LightGBM/export.h" 7 | #include "../include/LightGBM/utils/log.h" 8 | #include "../include/LightGBM/c_api.h" 9 | %} 10 | 11 | /* header files */ 12 | %include "../include/LightGBM/export.h" 13 | %include "../include/LightGBM/c_api.h" 14 | %include "cpointer.i" 15 | %include "carrays.i" 16 | 17 | %inline %{ 18 | char * LGBM_BoosterSaveModelToStringSWIG(BoosterHandle handle, 19 | int num_iteration, 20 | int64_t buffer_len, 21 | int64_t* out_len) { 22 | char* dst = new char[buffer_len]; 23 | int result = LGBM_BoosterSaveModelToString(handle, num_iteration, buffer_len, out_len, dst); 24 | if (result != 0) { 25 | return nullptr; 26 | } 27 | return dst; 28 | } 29 | %} 30 | 31 | %pointer_functions(int, intp) 32 | %pointer_functions(long, longp) 33 | %pointer_functions(double, doublep) 34 | %pointer_functions(float, floatp) 35 | %pointer_functions(int64_t, int64_tp) 36 | %pointer_functions(int32_t, int32_tp) 37 | 38 | %pointer_cast(int64_t *, long *, int64_t_to_long_ptr) 39 | %pointer_cast(int64_t *, double *, int64_t_to_double_ptr) 40 | %pointer_cast(int32_t *, int *, int32_t_to_int_ptr) 41 | %pointer_cast(long *, int64_t *, long_to_int64_t_ptr) 42 | %pointer_cast(double *, int64_t *, double_to_int64_t_ptr) 43 | %pointer_cast(double *, void *, double_to_voidp_ptr) 44 | %pointer_cast(int *, int32_t *, int_to_int32_t_ptr) 45 | %pointer_cast(float *, void *, float_to_voidp_ptr) 46 | 47 | %array_functions(double, doubleArray) 48 | %array_functions(float, floatArray) 49 | %array_functions(int, intArray) 50 | %array_functions(long, longArray) 51 | 52 | /* Custom pointer manipulation template */ 53 | %define %pointer_manipulation(TYPE,NAME) 54 | %{ 55 | static TYPE *new_##NAME() { %} 56 | %{ TYPE* NAME = new TYPE; return NAME; %} 57 | %{} 58 | 59 | static void delete_##NAME(TYPE *self) { %} 60 | %{ if (self) delete self; %} 61 | %{} 62 | %} 63 | 64 | TYPE *new_##NAME(); 65 | void delete_##NAME(TYPE *self); 66 | 67 | %enddef 68 | 69 | %define %pointer_dereference(TYPE,NAME) 70 | %{ 71 | static TYPE NAME ##_value(TYPE *self) { 72 | TYPE NAME = *self; 73 | return NAME; 74 | } 75 | %} 76 | 77 | TYPE NAME##_value(TYPE *self); 78 | 79 | %enddef 80 | 81 | %define %pointer_handle(TYPE,NAME) 82 | %{ 83 | static TYPE* NAME ##_handle() { %} 84 | %{ TYPE* NAME = new TYPE; *NAME = (TYPE)operator new(sizeof(int*)); return NAME; %} 85 | %{} 86 | %} 87 | 88 | TYPE *NAME##_handle(); 89 | 90 | %enddef 91 | 92 | %pointer_manipulation(void*, voidpp) 93 | 94 | /* Allow dereferencing of void** to void* */ 95 | %pointer_dereference(void*, voidpp) 96 | 97 | /* Allow retrieving handle to void** */ 98 | %pointer_handle(void*, voidpp) 99 | 100 | -------------------------------------------------------------------------------- /R-package/R/saveRDS.lgb.Booster.R: -------------------------------------------------------------------------------- 1 | #' saveRDS for lgb.Booster models 2 | #' 3 | #' Attemps to save a model using RDS. 
Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 4 | #' 5 | #' @param object R object to serialize. 6 | #' @param file a connection or the name of the file where the R object is saved to or read from. 7 | #' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save. 8 | #' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions. 9 | #' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection. 10 | #' @param refhook a hook function for handling reference objects. 11 | #' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. 12 | #' 13 | #' @return NULL invisibly. 14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' library(lightgbm) 18 | #' data(agaricus.train, package = "lightgbm") 19 | #' train <- agaricus.train 20 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 21 | #' data(agaricus.test, package = "lightgbm") 22 | #' test <- agaricus.test 23 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 24 | #' params <- list(objective = "regression", metric = "l2") 25 | #' valids <- list(test = dtest) 26 | #' model <- lgb.train(params, 27 | #' dtrain, 28 | #' 100, 29 | #' valids, 30 | #' min_data = 1, 31 | #' learning_rate = 1, 32 | #' early_stopping_rounds = 10) 33 | #' saveRDS.lgb.Booster(model, "model.rds") 34 | #' } 35 | #' 36 | #' @export 37 | saveRDS.lgb.Booster <- function(object, 38 | file = "", 39 | ascii = FALSE, 40 | version = NULL, 41 | compress = TRUE, 42 | refhook = NULL, 43 | raw = TRUE) { 44 | 45 | # Check if object has a raw value (and if the user wants to store the raw) 46 | if (is.na(object$raw) && raw) { 47 | 48 | # Save model 49 | object$save() 50 | 51 | # Save RDS 52 | saveRDS(object, 53 | file = file, 54 | ascii = ascii, 55 | version = version, 56 | compress = compress, 57 | refhook = refhook) 58 | 59 | # Free model from memory 60 | object$raw <- NA 61 | 62 | } else { 63 | 64 | # Save as usual 65 | saveRDS(object, 66 | file = file, 67 | ascii = ascii, 68 | version = version, 69 | compress = compress, 70 | refhook = refhook) 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /include/LightGBM/utils/log.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_LOG_H_ 2 | #define LIGHTGBM_UTILS_LOG_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace LightGBM { 13 | 14 | #if defined(_MSC_VER) 15 | #define THREAD_LOCAL __declspec(thread) 16 | #else 17 | #define THREAD_LOCAL thread_local 18 | #endif 19 | 20 | #ifndef CHECK 21 | #define CHECK(condition) \ 22 | if (!(condition)) Log::Fatal("Check failed: " #condition \ 23 | " at %s, line %d .\n", __FILE__, __LINE__); 24 | #endif 25 | 26 | #ifndef CHECK_NOTNULL 27 | #define CHECK_NOTNULL(pointer) \ 28 | if ((pointer) == nullptr) LightGBM::Log::Fatal(#pointer " Can't be NULL at %s, line %d .\n", __FILE__, __LINE__); 29 | #endif 30 | 31 | 32 | enum class LogLevel: int { 33 | Fatal = -1, 34 | Warning = 0, 35 | Info = 
1, 36 | Debug = 2, 37 | }; 38 | 39 | 40 | /*! 41 | * \brief A static Log class 42 | */ 43 | class Log { 44 | public: 45 | /*! 46 | * \brief Resets the minimal log level. It is INFO by default. 47 | * \param level The new minimal log level. 48 | */ 49 | static void ResetLogLevel(LogLevel level) { 50 | GetLevel() = level; 51 | } 52 | 53 | static void Debug(const char *format, ...) { 54 | va_list val; 55 | va_start(val, format); 56 | Write(LogLevel::Debug, "Debug", format, val); 57 | va_end(val); 58 | } 59 | static void Info(const char *format, ...) { 60 | va_list val; 61 | va_start(val, format); 62 | Write(LogLevel::Info, "Info", format, val); 63 | va_end(val); 64 | } 65 | static void Warning(const char *format, ...) { 66 | va_list val; 67 | va_start(val, format); 68 | Write(LogLevel::Warning, "Warning", format, val); 69 | va_end(val); 70 | } 71 | static void Fatal(const char *format, ...) { 72 | va_list val; 73 | char str_buf[1024]; 74 | va_start(val, format); 75 | #ifdef _MSC_VER 76 | vsprintf_s(str_buf, format, val); 77 | #else 78 | vsprintf(str_buf, format, val); 79 | #endif 80 | va_end(val); 81 | fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf); 82 | fflush(stderr); 83 | throw std::runtime_error(std::string(str_buf)); 84 | } 85 | 86 | private: 87 | 88 | static void Write(LogLevel level, const char* level_str, const char *format, va_list val) { 89 | if (level <= GetLevel()) { // omit the message with low level 90 | // write to STDOUT 91 | printf("[LightGBM] [%s] ", level_str); 92 | vprintf(format, val); 93 | printf("\n"); 94 | fflush(stdout); 95 | } 96 | } 97 | 98 | // a trick to use static variable in header file. 99 | // May be not good, but avoid to use an additional cpp file 100 | static LogLevel& GetLevel() { static THREAD_LOCAL LogLevel level = LogLevel::Info; return level; } 101 | 102 | }; 103 | 104 | } // namespace LightGBM 105 | #endif // LightGBM_UTILS_LOG_H_ 106 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_dataset.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | require(Matrix) 3 | 4 | context("testing lgb.Dataset functionality") 5 | 6 | data(agaricus.test, package='lightgbm') 7 | test_data <- agaricus.test$data[1:100,] 8 | test_label <- agaricus.test$label[1:100] 9 | 10 | test_that("lgb.Dataset: basic construction, saving, loading", { 11 | # from sparse matrix 12 | dtest1 <- lgb.Dataset(test_data, label=test_label) 13 | # from dense matrix 14 | dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label) 15 | expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) 16 | 17 | # save to a local file 18 | tmp_file <- tempfile('lgb.Dataset_') 19 | lgb.Dataset.save(dtest1, tmp_file) 20 | # read from a local file 21 | dtest3 <- lgb.Dataset(tmp_file) 22 | lgb.Dataset.construct(dtest3) 23 | unlink(tmp_file) 24 | expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) 25 | }) 26 | 27 | test_that("lgb.Dataset: getinfo & setinfo", { 28 | dtest <- lgb.Dataset(test_data) 29 | dtest$construct() 30 | 31 | setinfo(dtest, 'label', test_label) 32 | labels <- getinfo(dtest, 'label') 33 | expect_equal(test_label, getinfo(dtest, 'label')) 34 | 35 | expect_true(length(getinfo(dtest, 'weight')) == 0) 36 | expect_true(length(getinfo(dtest, 'init_score')) == 0) 37 | 38 | # any other label should error 39 | expect_error(setinfo(dtest, 'asdf', test_label)) 40 | }) 41 | 42 | test_that("lgb.Dataset: slice, dim", { 43 | dtest <- lgb.Dataset(test_data, 
label=test_label) 44 | lgb.Dataset.construct(dtest) 45 | expect_equal(dim(dtest), dim(test_data)) 46 | dsub1 <- slice(dtest, 1:42) 47 | lgb.Dataset.construct(dsub1) 48 | expect_equal(nrow(dsub1), 42) 49 | expect_equal(ncol(dsub1), ncol(test_data)) 50 | }) 51 | 52 | test_that("lgb.Dataset: colnames", { 53 | dtest <- lgb.Dataset(test_data, label=test_label) 54 | expect_equal(colnames(dtest), colnames(test_data)) 55 | lgb.Dataset.construct(dtest) 56 | expect_equal(colnames(dtest), colnames(test_data)) 57 | expect_error( colnames(dtest) <- 'asdf') 58 | new_names <- make.names(1:ncol(test_data)) 59 | expect_silent(colnames(dtest) <- new_names) 60 | expect_equal(colnames(dtest), new_names) 61 | }) 62 | 63 | test_that("lgb.Dataset: nrow is correct for a very sparse matrix", { 64 | nr <- 1000 65 | x <- rsparsematrix(nr, 100, density=0.0005) 66 | # we want it very sparse, so that last rows are empty 67 | expect_lt(max(x@i), nr) 68 | dtest <- lgb.Dataset(x) 69 | expect_equal(dim(dtest), dim(x)) 70 | }) 71 | 72 | test_that("lgb.Dataset: Dataset should be able to construct from matrix and return non-null handle", { 73 | rawData <- matrix(runif(1000),ncol=10) 74 | handle <- NA_real_ 75 | ref_handle <- NULL 76 | handle <- lightgbm:::lgb.call("LGBM_DatasetCreateFromMat_R" 77 | , ret = handle 78 | , rawData 79 | , nrow(rawData) 80 | , ncol(rawData) 81 | , lightgbm:::lgb.params2str(params=list()) 82 | , ref_handle) 83 | expect_false(is.na(handle)) 84 | }) 85 | -------------------------------------------------------------------------------- /R-package/R/lgb.plot.importance.R: -------------------------------------------------------------------------------- 1 | #' Plot feature importance as a bar graph 2 | #' 3 | #' Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph. 4 | #' 5 | #' @param tree_imp a \code{data.table} returned by \code{\link{lgb.importance}}. 6 | #' @param top_n maximal number of top features to include into the plot. 7 | #' @param measure the name of importance measure to plot, can be "Gain", "Cover" or "Frequency". 8 | #' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names. 9 | #' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}. 10 | #' 11 | #' @details 12 | #' The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. 13 | #' Features are shown ranked in a decreasing importance order. 14 | #' 15 | #' @return 16 | #' The \code{lgb.plot.importance} function creates a \code{barplot} 17 | #' and silently returns a processed data.table with \code{top_n} features sorted by defined importance. 
18 | #' 19 | #' @examples 20 | #' \dontrun{ 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' 30 | 31 | #' tree_imp <- lgb.importance(model, percentage = TRUE) 32 | #' lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain") 33 | #' } 34 | #' @importFrom graphics barplot par 35 | #' @export 36 | lgb.plot.importance <- function(tree_imp, 37 | top_n = 10, 38 | measure = "Gain", 39 | left_margin = 10, 40 | cex = NULL) { 41 | 42 | # Check for measurement (column names) correctness 43 | measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE) 44 | 45 | # Get top N importance (defaults to 10) 46 | top_n <- min(top_n, nrow(tree_imp)) 47 | 48 | # Parse importance 49 | tree_imp <- tree_imp[order(abs(get(measure)), decreasing = TRUE),][seq_len(top_n),] 50 | 51 | # Attempt to setup a correct cex 52 | if (is.null(cex)) { 53 | cex <- 2.5 / log2(1 + top_n) 54 | } 55 | 56 | # Refresh plot 57 | op <- graphics::par(no.readonly = TRUE) 58 | on.exit(graphics::par(op)) 59 | 60 | # Do some magic plotting 61 | graphics::par(mar = op$mar %>% magrittr::inset(., 2, left_margin)) 62 | 63 | # Do plot 64 | tree_imp[.N:1, 65 | graphics::barplot( 66 | height = get(measure), 67 | names.arg = Feature, 68 | horiz = TRUE, 69 | border = NA, 70 | main = "Feature Importance", 71 | xlab = measure, 72 | cex.names = cex, 73 | las = 1 74 | )] 75 | 76 | # Return invisibly 77 | invisible(tree_imp) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 2.1.2.{build} 2 | 3 | configuration: # a trick to construct a build matrix 4 | - 3.5 5 | - 3.6 6 | 7 | environment: 8 | matrix: 9 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 10 | COMPILER: MSVC 11 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 12 | COMPILER: MINGW 13 | 14 | clone_depth: 50 15 | 16 | install: 17 | - git submodule update --init --recursive # get `compute` folder 18 | - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) 19 | - set PATH=C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% 20 | - set PYTHON_VERSION=%CONFIGURATION% 21 | - ps: >- 22 | switch ($env:PYTHON_VERSION) { 23 | "2.7" {$env:MINICONDA = """C:\Miniconda-x64"""} 24 | "3.4" {$env:MINICONDA = """C:\Miniconda34-x64"""} 25 | "3.5" {$env:MINICONDA = """C:\Miniconda35-x64"""} 26 | "3.6" {$env:MINICONDA = """C:\Miniconda36-x64"""} 27 | default {$env:MINICONDA = """C:\Miniconda36-x64"""} 28 | } 29 | - set PATH=%MINICONDA%;%MINICONDA%\Scripts;%PATH% 30 | - ps: $env:LGB_VER = (Get-Content VERSION.txt).trim() 31 | - conda config --set always_yes yes --set changeps1 no 32 | - conda update -q conda 33 | - conda create -q -n test-env python=%PYTHON_VERSION% numpy nose scipy scikit-learn pandas matplotlib python-graphviz pytest 34 | - activate test-env 35 | 36 | build_script: 37 | - mkdir %APPVEYOR_BUILD_FOLDER%\build && cd %APPVEYOR_BUILD_FOLDER%\build 38 | - cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. && cmake --build . 
--target ALL_BUILD --config Release 39 | 40 | test_script: 41 | - pytest %APPVEYOR_BUILD_FOLDER%\tests\c_api_test\test_.py 42 | - cd %APPVEYOR_BUILD_FOLDER%\python-package && python setup.py sdist --formats gztar 43 | - IF "%COMPILER%"=="MINGW" ( 44 | pip install %APPVEYOR_BUILD_FOLDER%\python-package\dist\lightgbm-%LGB_VER%.tar.gz --install-option=--mingw -v) 45 | ELSE ( 46 | pip install %APPVEYOR_BUILD_FOLDER%\python-package\dist\lightgbm-%LGB_VER%.tar.gz -v) 47 | - pytest %APPVEYOR_BUILD_FOLDER%\tests\python_package_test 48 | - cd %APPVEYOR_BUILD_FOLDER%\examples\python-guide 49 | - ps: >- 50 | @("import matplotlib", "matplotlib.use('Agg')") + (Get-Content "plot_example.py") | Set-Content "plot_example.py" # prevent interactive window mode 51 | - ps: >- 52 | foreach ($file in @(Get-ChildItem *.py)) { 53 | python $file 54 | if ($LastExitCode -ne 0) { $host.SetShouldExit($LastExitCode) } 55 | } # run all examples 56 | - IF "%COMPILER%"=="MINGW" appveyor exit # skip all further steps 57 | - cd %APPVEYOR_BUILD_FOLDER%\python-package && python setup.py bdist_wheel --plat-name=win-amd64 --universal 58 | 59 | artifacts: 60 | - path: Release/lib_lightgbm.dll 61 | name: Library 62 | - path: Release/lightgbm.exe 63 | name: Exe 64 | - path: python-package/dist/* 65 | name: Pip 66 | 67 | deploy: 68 | release: $(APPVEYOR_REPO_TAG_NAME) 69 | provider: GitHub 70 | auth_token: 71 | secure: KR44XwtxY0cLlVpQwY726BvC6gzT0cYTf0ahJ4cSvvS0UVoSJxkR900ICfVXHRoT 72 | artifact: Library,Exe,Pip 73 | force_update: true 74 | draft: true 75 | on: 76 | appveyor_repo_tag: true 77 | -------------------------------------------------------------------------------- /src/metric/metric.cpp: -------------------------------------------------------------------------------- 1 | #include <LightGBM/metric.h> 2 | #include "regression_metric.hpp" 3 | #include "binary_metric.hpp" 4 | #include "rank_metric.hpp" 5 | #include "map_metric.hpp" 6 | #include "multiclass_metric.hpp" 7 | #include "xentropy_metric.hpp" 8 | 9 | namespace LightGBM { 10 | 11 | Metric* Metric::CreateMetric(const std::string& type, const Config& config) { 12 | if (type == std::string("regression") || type == std::string("regression_l2") || type == std::string("l2") || type == std::string("mean_squared_error") || type == std::string("mse")) { 13 | return new L2Metric(config); 14 | } else if (type == std::string("l2_root") || type == std::string("root_mean_squared_error") || type == std::string("rmse")) { 15 | return new RMSEMetric(config); 16 | } else if (type == std::string("regression_l1") || type == std::string("l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) { 17 | return new L1Metric(config); 18 | } else if (type == std::string("quantile")) { 19 | return new QuantileMetric(config); 20 | } else if (type == std::string("huber")) { 21 | return new HuberLossMetric(config); 22 | } else if (type == std::string("fair")) { 23 | return new FairLossMetric(config); 24 | } else if (type == std::string("poisson")) { 25 | return new PoissonMetric(config); 26 | } else if (type == std::string("binary_logloss") || type == std::string("binary")) { 27 | return new BinaryLoglossMetric(config); 28 | } else if (type == std::string("binary_error")) { 29 | return new BinaryErrorMetric(config); 30 | } else if (type == std::string("auc")) { 31 | return new AUCMetric(config); 32 | } else if (type == std::string("ndcg")) { 33 | return new NDCGMetric(config); 34 | } else if (type == std::string("map") || type == std::string("mean_average_precision")) { 35 | return new 
MapMetric(config); 36 | } else if (type == std::string("multi_logloss") || type == std::string("multiclass") || type == std::string("softmax") || type == std::string("multiclassova") || type == std::string("multiclass_ova") || type == std::string("ova") || type == std::string("ovr")) { 37 | return new MultiSoftmaxLoglossMetric(config); 38 | } else if (type == std::string("multi_error")) { 39 | return new MultiErrorMetric(config); 40 | } else if (type == std::string("xentropy") || type == std::string("cross_entropy")) { 41 | return new CrossEntropyMetric(config); 42 | } else if (type == std::string("xentlambda") || type == std::string("cross_entropy_lambda")) { 43 | return new CrossEntropyLambdaMetric(config); 44 | } else if (type == std::string("kldiv") || type == std::string("kullback_leibler")) { 45 | return new KullbackLeiblerDivergence(config); 46 | } else if (type == std::string("mean_absolute_percentage_error") || type == std::string("mape")) { 47 | return new MAPEMetric(config); 48 | } else if (type == std::string("gamma")) { 49 | return new GammaMetric(config); 50 | } else if (type == std::string("gamma_deviance")) { 51 | return new GammaDevianceMetric(config); 52 | } else if (type == std::string("tweedie")) { 53 | return new TweedieMetric(config); 54 | } 55 | return nullptr; 56 | } 57 | 58 | } // namespace LightGBM 59 | -------------------------------------------------------------------------------- /include/LightGBM/tree_learner.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_TREE_LEARNER_H_ 2 | #define LIGHTGBM_TREE_LEARNER_H_ 3 | 4 | 5 | #include <LightGBM/meta.h> 6 | #include <LightGBM/config.h> 7 | #include <string> 8 | 9 | #include <LightGBM/json11.hpp> 10 | 11 | using namespace json11; 12 | 13 | namespace LightGBM { 14 | 15 | /*! \brief forward declaration */ 16 | class Tree; 17 | class Dataset; 18 | class ObjectiveFunction; 19 | 20 | /*! 21 | * \brief Interface for tree learner 22 | */ 23 | class TreeLearner { 24 | public: 25 | /*! \brief virtual destructor */ 26 | virtual ~TreeLearner() {} 27 | 28 | /*! 29 | * \brief Initialize tree learner with training dataset 30 | * \param train_data The used training data 31 | * \param is_constant_hessian True if all hessians share the same value 32 | */ 33 | virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; 34 | 35 | virtual void ResetTrainingData(const Dataset* train_data) = 0; 36 | 37 | /*! 38 | * \brief Reset tree configs 39 | * \param config config of tree 40 | */ 41 | virtual void ResetConfig(const Config* config) = 0; 42 | 43 | /*! 44 | * \brief training tree model on dataset 45 | * \param gradients The first order gradients 46 | * \param hessians The second order gradients 47 | * \param is_constant_hessian True if all hessians share the same value 48 | * \return A trained tree 49 | */ 50 | virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, 51 | Json& forced_split_json) = 0; 52 | 53 | /*! 54 | * \brief use an existing tree to fit the new gradients and hessians. 55 | */ 56 | virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0; 57 | 58 | virtual Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred, 59 | const score_t* gradients, const score_t* hessians) = 0; 60 | 61 | /*! 
62 | * \brief Set bagging data 63 | * \param used_indices Used data indices 64 | * \param num_data Number of used data 65 | */ 66 | virtual void SetBaggingData(const data_size_t* used_indices, 67 | data_size_t num_data) = 0; 68 | 69 | /*! 70 | * \brief Use the last trained tree to predict score, then add it to out_score 71 | * \param out_score output score 72 | */ 73 | virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0; 74 | 75 | virtual void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, const double* prediction, 76 | data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0; 77 | 78 | TreeLearner() = default; 79 | /*! \brief Disable copy */ 80 | TreeLearner& operator=(const TreeLearner&) = delete; 81 | /*! \brief Disable copy */ 82 | TreeLearner(const TreeLearner&) = delete; 83 | 84 | /*! 85 | * \brief Create object of tree learner 86 | * \param learner_type Type of tree learner 87 | * \param device_type Type of device 88 | * \param config config of tree 89 | */ 90 | static TreeLearner* CreateTreeLearner(const std::string& learner_type, 91 | const std::string& device_type, 92 | const Config* config); 93 | }; 94 | 95 | } // namespace LightGBM 96 | 97 | #endif // LIGHTGBM_TREE_LEARNER_H_ 98 | -------------------------------------------------------------------------------- /docs/Parameters-Tuning.rst: -------------------------------------------------------------------------------- 1 | Parameters Tuning 2 | ================= 3 | 4 | This page contains parameters tuning guides for different scenarios. 5 | 6 | **List of other helpful links** 7 | 8 | - `Parameters <./Parameters.rst>`__ 9 | - `Python API <./Python-API.rst>`__ 10 | 11 | Tune Parameters for the Leaf-wise (Best-first) Tree 12 | --------------------------------------------------- 13 | 14 | LightGBM uses the `leaf-wise <./Features.rst#leaf-wise-best-first-tree-growth>`__ tree growth algorithm, while many other popular tools use depth-wise tree growth. 15 | Compared with depth-wise growth, the leaf-wise algorithm can converge much faster. 16 | However, leaf-wise growth may over-fit if not used with the appropriate parameters. 17 | 18 | To get good results using a leaf-wise tree, these are some important parameters: 19 | 20 | 1. ``num_leaves``. This is the main parameter to control the complexity of the tree model. 21 | Theoretically, we can set ``num_leaves = 2^(max_depth)`` to obtain the same number of leaves as depth-wise tree. 22 | However, this simple conversion is not good in practice. 23 | The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting. 24 | Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``. 25 | For example, when the ``max_depth=7`` the depth-wise tree can get good accuracy, 26 | but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise. 27 | 28 | 2. ``min_data_in_leaf``. This is a very important parameter to prevent over-fitting in a leaf-wise tree. 29 | Its optimal value depends on the number of training samples and ``num_leaves``. 30 | Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. 31 | In practice, setting it to hundreds or thousands is enough for a large dataset. 32 | 33 | 3. ``max_depth``. You can also use ``max_depth`` to limit the tree depth explicitly. A short sketch combining these three parameters follows below. 
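As an illustration (the specific values are hypothetical placeholders, not tuned recommendations), these three parameters might be combined like this with the R package::

    library(lightgbm)
    data(agaricus.train, package = "lightgbm")
    dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)

    # keep num_leaves well below 2^max_depth (here 2^7 = 128) to reduce over-fitting
    params <- list(objective = "binary",
                   max_depth = 7,
                   num_leaves = 70,
                   min_data_in_leaf = 100)
    model <- lgb.train(params, dtrain, 50)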
34 | 35 | For Faster Speed 36 | ---------------- 37 | 38 | - Use bagging by setting ``bagging_fraction`` and ``bagging_freq`` 39 | 40 | - Use feature sub-sampling by setting ``feature_fraction`` 41 | 42 | - Use small ``max_bin`` 43 | 44 | - Use ``save_binary`` to speed up data loading in future learning 45 | 46 | - Use parallel learning, refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ 47 | 48 | 49 | For Better Accuracy 50 | ------------------- 51 | 52 | - Use large ``max_bin`` (may be slower) 53 | 54 | - Use small ``learning_rate`` with large ``num_iterations`` 55 | 56 | - Use large ``num_leaves`` (may cause over-fitting) 57 | 58 | - Use bigger training data 59 | 60 | - Try ``dart`` 61 | 62 | Deal with Over-fitting 63 | ---------------------- 64 | 65 | - Use small ``max_bin`` 66 | 67 | - Use small ``num_leaves`` 68 | 69 | - Use ``min_data_in_leaf`` and ``min_sum_hessian_in_leaf`` 70 | 71 | - Use bagging by setting ``bagging_fraction`` and ``bagging_freq`` 72 | 73 | - Use feature sub-sampling by setting ``feature_fraction`` 74 | 75 | - Use bigger training data 76 | 77 | - Try ``lambda_l1``, ``lambda_l2`` and ``min_gain_to_split`` for regularization 78 | 79 | - Try ``max_depth`` to avoid growing a deep tree 80 | -------------------------------------------------------------------------------- /R-package/demo/multiclass_custom_objective.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | 3 | # We load the default iris dataset shipped with R 4 | data(iris) 5 | 6 | # We must convert factors to numeric 7 | # They must be starting from number 0 to use multiclass 8 | # For instance: 0, 1, 2, 3, 4, 5... 9 | iris$Species <- as.numeric(as.factor(iris$Species)) - 1 10 | 11 | # We cut the data set into 80% train and 20% validation 12 | # The last 10 samples of each class are for validation 13 | 14 | train <- as.matrix(iris[c(1:40, 51:90, 101:140), ]) 15 | test <- as.matrix(iris[c(41:50, 91:100, 141:150), ]) 16 | dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5]) 17 | dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5]) 18 | valids <- list(test = dtest) 19 | 20 | # Method 1 of training with built-in multiclass objective 21 | model_builtin <- lgb.train(list(), 22 | dtrain, 23 | 100, 24 | valids, 25 | min_data = 1, 26 | learning_rate = 1, 27 | early_stopping_rounds = 10, 28 | objective = "multiclass", 29 | metric = "multi_logloss", 30 | num_class = 3) 31 | 32 | preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE) 33 | 34 | # Method 2 of training with custom objective function 35 | 36 | # User defined objective function, given prediction, return gradient and second order gradient 37 | custom_multiclass_obj = function(preds, dtrain) { 38 | labels = getinfo(dtrain, "label") 39 | 40 | # preds is a matrix with rows corresponding to samples and columns corresponding to choices 41 | preds = matrix(preds, nrow = length(labels)) 42 | 43 | # to prevent overflow, normalize preds by row 44 | preds = preds - apply(preds, 1, max) 45 | prob = exp(preds) / rowSums(exp(preds)) 46 | 47 | # compute gradient 48 | grad = prob 49 | grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1 50 | 51 | # compute hessian (approximation) 52 | hess = 2 * prob * (1 - prob) 53 | 54 | return(list(grad = grad, hess = hess)) 55 | } 56 | 57 | # define custom metric 58 | custom_multiclass_metric = function(preds, dtrain) { 59 | labels = getinfo(dtrain, "label") 60 | preds = matrix(preds, 
nrow = length(labels)) 61 | preds = preds - apply(preds, 1, max) 62 | prob = exp(preds) / rowSums(exp(preds)) 63 | 64 | return(list(name = "error", 65 | value = -mean(log(prob[cbind(1:length(labels), labels + 1)])), 66 | higher_better = FALSE)) 67 | 68 | } 69 | 70 | model_custom <- lgb.train(list(), 71 | dtrain, 72 | 100, 73 | valids, 74 | min_data = 1, 75 | learning_rate = 1, 76 | early_stopping_rounds = 10, 77 | objective = custom_multiclass_obj, 78 | eval = custom_multiclass_metric, 79 | num_class = 3) 80 | 81 | preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE) 82 | 83 | # compare predictions 84 | identical(preds_builtin, preds_custom) 85 | 86 | -------------------------------------------------------------------------------- /include/LightGBM/utils/random.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_RANDOM_H_ 2 | #define LIGHTGBM_UTILS_RANDOM_H_ 3 | 4 | #include <cstdint> 5 | 6 | #include <random> 7 | #include <set> 8 | #include <vector> 9 | 10 | namespace LightGBM { 11 | 12 | /*! 13 | * \brief A wrapper for random generator 14 | */ 15 | class Random { 16 | public: 17 | /*! 18 | * \brief Constructor, with random seed 19 | */ 20 | Random() { 21 | std::random_device rd; 22 | auto generator = std::mt19937(rd()); 23 | std::uniform_int_distribution<int> distribution(0, x); 24 | x = distribution(generator); 25 | } 26 | /*! 27 | * \brief Constructor, with specific seed 28 | */ 29 | Random(int seed) { 30 | x = seed; 31 | } 32 | /*! 33 | * \brief Generate random integer in int16 range, i.e. [0, 32768) 34 | * \param lower_bound lower bound 35 | * \param upper_bound upper bound 36 | * \return The random integer between [lower_bound, upper_bound) 37 | */ 38 | inline int NextShort(int lower_bound, int upper_bound) { 39 | return (RandInt16()) % (upper_bound - lower_bound) + lower_bound; 40 | } 41 | 42 | /*! 43 | * \brief Generate random integer, int32 range 44 | * \param lower_bound lower bound 45 | * \param upper_bound upper bound 46 | * \return The random integer between [lower_bound, upper_bound) 47 | */ 48 | inline int NextInt(int lower_bound, int upper_bound) { 49 | return (RandInt32()) % (upper_bound - lower_bound) + lower_bound; 50 | } 51 | 52 | /*! 53 | * \brief Generate random float data 54 | * \return The random float between [0.0, 1.0) 55 | */ 56 | inline float NextFloat() { 57 | // get random float in [0,1) 58 | return static_cast<float>(RandInt16()) / (32768.0f); 59 | } 60 | /*! 
61 | * \brief Sample K data from {0,1,...,N-1} 62 | * \param N Size of the population to sample from 63 | * \param K Number of elements to sample 64 | * \return K Ordered sampled data from {0,1,...,N-1} 65 | */ 66 | inline std::vector<int> Sample(int N, int K) { 67 | std::vector<int> ret; 68 | ret.reserve(K); 69 | if (K > N || K <= 0) { 70 | return ret; 71 | } else if (K == N) { 72 | for (int i = 0; i < N; ++i) { 73 | ret.push_back(i); 74 | } 75 | } else if (K > 1 && K > (N / std::log2(K))) { 76 | for (int i = 0; i < N; ++i) { 77 | double prob = (K - ret.size()) / static_cast<double>(N - i); 78 | if (NextFloat() < prob) { 79 | ret.push_back(i); 80 | } 81 | } 82 | } else { 83 | std::set<int> sample_set; 84 | while (static_cast<int>(sample_set.size()) < K) { 85 | int next = RandInt32() % N; 86 | if (sample_set.count(next) == 0) { 87 | sample_set.insert(next); 88 | } 89 | } 90 | for (auto iter = sample_set.begin(); iter != sample_set.end(); ++iter) { 91 | ret.push_back(*iter); 92 | } 93 | } 94 | return ret; 95 | } 96 | private: 97 | inline int RandInt16() { 98 | x = (214013 * x + 2531011); 99 | return static_cast<int>((x >> 16) & 0x7FFF); 100 | } 101 | 102 | inline int RandInt32() { 103 | x = (214013 * x + 2531011); 104 | return static_cast<int>(x & 0x7FFFFFFF); 105 | } 106 | 107 | unsigned int x = 123456789; 108 | }; 109 | 110 | 111 | } // namespace LightGBM 112 | 113 | #endif // LIGHTGBM_UTILS_RANDOM_H_ 114 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_basic.R: -------------------------------------------------------------------------------- 1 | context("basic functions") 2 | 3 | data(agaricus.train, package='lightgbm') 4 | data(agaricus.test, package='lightgbm') 5 | train <- agaricus.train 6 | test <- agaricus.test 7 | 8 | windows_flag = grepl('Windows', Sys.info()[['sysname']]) 9 | 10 | test_that("train and predict binary classification", { 11 | nrounds = 10 12 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5, 13 | nrounds = nrounds, objective = "binary", metric="binary_error") 14 | expect_false(is.null(bst$record_evals)) 15 | record_results <- lgb.get.eval.result(bst, "train", "binary_error") 16 | expect_lt(min(record_results), 0.02) 17 | 18 | pred <- predict(bst, test$data) 19 | expect_equal(length(pred), 1611) 20 | 21 | pred1 <- predict(bst, train$data, num_iteration = 1) 22 | expect_equal(length(pred1), 6513) 23 | err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label) 24 | err_log <- record_results[1] 25 | expect_lt(abs(err_pred1 - err_log), 10e-6) 26 | }) 27 | 28 | 29 | test_that("train and predict softmax", { 30 | lb <- as.numeric(iris$Species) - 1 31 | 32 | bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb, 33 | num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20, 34 | objective = "multiclass", metric="multi_error", num_class=3) 35 | 36 | expect_false(is.null(bst$record_evals)) 37 | record_results <- lgb.get.eval.result(bst, "train", "multi_error") 38 | expect_lt(min(record_results), 0.03) 39 | 40 | pred <- predict(bst, as.matrix(iris[, -5])) 41 | expect_equal(length(pred), nrow(iris) * 3) 42 | }) 43 | 44 | 45 | test_that("use of multiple eval metrics works", { 46 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4, 47 | learning_rate=1, nrounds = 10, objective = "binary", 48 | metric = list("binary_error","auc","binary_logloss") ) 49 | expect_false(is.null(bst$record_evals)) 50 | }) 51 | 52 | 53 | test_that("training continuation works", { 54 | dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE) 55 | 
watchlist = list(train=dtrain) 56 | param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1) 57 | 58 | # for the reference, use 10 iterations at once: 59 | bst <- lgb.train(param, dtrain, nrounds = 10, watchlist) 60 | err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10) 61 | # first 5 iterations: 62 | bst1 <- lgb.train(param, dtrain, nrounds = 5, watchlist) 63 | # test continuing from a model in file 64 | lgb.save(bst1, "lightgbm.model") 65 | # continue for 5 more: 66 | bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = bst1) 67 | err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10) 68 | expect_lt(abs(err_bst - err_bst2), 0.01) 69 | 70 | bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = "lightgbm.model") 71 | err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10) 72 | expect_lt(abs(err_bst - err_bst2), 0.01) 73 | }) 74 | 75 | 76 | test_that("cv works", { 77 | dtrain <- lgb.Dataset(train$data, label=train$label) 78 | params <- list(objective="regression", metric="l2,l1") 79 | bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10) 80 | expect_false(is.null(bst$record_evals)) 81 | }) 82 | -------------------------------------------------------------------------------- /docs/Quick-Start.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | =========== 3 | 4 | This is a quick start guide for LightGBM CLI version. 5 | 6 | Follow the `Installation Guide <./Installation-Guide.rst>`__ to install LightGBM first. 7 | 8 | **List of other helpful links** 9 | 10 | - `Parameters <./Parameters.rst>`__ 11 | 12 | - `Parameters Tuning <./Parameters-Tuning.rst>`__ 13 | 14 | - `Python-package Quick Start <./Python-Intro.rst>`__ 15 | 16 | - `Python API <./Python-API.rst>`__ 17 | 18 | Training Data Format 19 | -------------------- 20 | 21 | LightGBM supports input data files with `CSV`_, `TSV`_ and `LibSVM`_ formats. 22 | 23 | Files can be with or without headers. 24 | 25 | The label column can be specified either by index or by name. 26 | 27 | Some columns can be ignored. 28 | 29 | Categorical Feature Support 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | LightGBM can use categorical features directly (without one-hot encoding). 33 | The experiment on `Expo data`_ shows about 8x speed-up compared with one-hot encoding. 34 | 35 | For the setting details, please refer to `Parameters <./Parameters.rst#categorical_feature>`__. 36 | 37 | Weight and Query/Group Data 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | LightGBM also supports weighted training; it needs an additional `weight data <./Parameters.rst#weight-data>`__ file. 41 | And it needs an additional `query data <./Parameters.rst#query-data>`_ file for ranking tasks. 42 | 43 | Also, weight and query data can be specified as columns in the training data, in the same manner as the label. 44 | 45 | Parameters Quick Look 46 | --------------------- 47 | 48 | The parameters format is ``key1=value1 key2=value2 ...``. 49 | 50 | Parameters can be set both in the config file and on the command line. 51 | If one parameter appears in both the command line and the config file, LightGBM will use the parameter from the command line. 
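As a minimal illustration (the file name and the values are placeholders, not recommendations), a config file in this format might look like::

    task = train
    objective = binary
    data = binary.train
    num_trees = 100
    learning_rate = 0.1

Any of these keys can equally be passed on the command line, where they take priority over the config file as described above.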
52 | 53 | The most important parameters which new users should take a look at are located in the `Core Parameters <./Parameters.rst#core-parameters>`__ 54 | and the top of `Learning Control Parameters <./Parameters.rst#learning-control-parameters>`__ 55 | sections of the full detailed list of `LightGBM's parameters <./Parameters.rst>`__. 56 | 57 | Run LightGBM 58 | ------------ 59 | 60 | For Windows: 61 | 62 | :: 63 | 64 | lightgbm.exe config=your_config_file other_args ... 65 | 66 | For Unix: 67 | 68 | :: 69 | 70 | ./lightgbm config=your_config_file other_args ... 71 | 72 | Parameters can be set both in config file and command line, and the parameters in command line have higher priority than in config file. 73 | For example, the following command line will keep ``num_trees=10`` and ignore the same parameter in the config file. 74 | 75 | :: 76 | 77 | ./lightgbm config=train.conf num_trees=10 78 | 79 | Examples 80 | -------- 81 | 82 | - `Binary Classification <https://github.com/Microsoft/LightGBM/tree/master/examples/binary_classification>`__ 83 | 84 | - `Regression <https://github.com/Microsoft/LightGBM/tree/master/examples/regression>`__ 85 | 86 | - `Lambdarank <https://github.com/Microsoft/LightGBM/tree/master/examples/lambdarank>`__ 87 | 88 | - `Parallel Learning <https://github.com/Microsoft/LightGBM/tree/master/examples/parallel_learning>`__ 89 | 90 | .. _CSV: https://en.wikipedia.org/wiki/Comma-separated_values 91 | 92 | .. _TSV: https://en.wikipedia.org/wiki/Tab-separated_values 93 | 94 | .. _LibSVM: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ 95 | 96 | .. _Expo data: http://stat-computing.org/dataexpo/2009/ 97 | -------------------------------------------------------------------------------- /src/boosting/gbdt_prediction.cpp: -------------------------------------------------------------------------------- 1 | #include "gbdt.h" 2 | 3 | #include <cstring> 4 | #include <unordered_map> 5 | #include <LightGBM/prediction_early_stop.h> 6 | 7 | namespace LightGBM { 8 | 9 | void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { 10 | int early_stop_round_counter = 0; 11 | // set zero 12 | std::memset(output, 0, sizeof(double) * num_tree_per_iteration_); 13 | for (int i = 0; i < num_iteration_for_pred_; ++i) { 14 | // predict all the trees for one iteration 15 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 16 | output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features); 17 | } 18 | // check early stopping 19 | ++early_stop_round_counter; 20 | if (early_stop->round_period == early_stop_round_counter) { 21 | if (early_stop->callback_function(output, num_tree_per_iteration_)) { 22 | return; 23 | } 24 | early_stop_round_counter = 0; 25 | } 26 | } 27 | } 28 | 29 | void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const { 30 | int early_stop_round_counter = 0; 31 | // set zero 32 | std::memset(output, 0, sizeof(double) * num_tree_per_iteration_); 33 | for (int i = 0; i < num_iteration_for_pred_; ++i) { 34 | // predict all the trees for one iteration 35 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 36 | output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features); 37 | } 38 | // check early stopping 39 | ++early_stop_round_counter; 40 | if (early_stop->round_period == early_stop_round_counter) { 41 | if (early_stop->callback_function(output, num_tree_per_iteration_)) { 42 | return; 43 | } 44 | early_stop_round_counter = 0; 45 | } 46 | } 47 | } 48 | 49 | void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { 50 | PredictRaw(features, output, early_stop); 51 | if (average_output_) { 52 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 53 | output[k] /= num_iteration_for_pred_; 54 | } 55 | } else if (objective_function_ != 
nullptr) { 56 | objective_function_->ConvertOutput(output, output); 57 | } 58 | } 59 | 60 | void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const { 61 | PredictRawByMap(features, output, early_stop); 62 | if (average_output_) { 63 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 64 | output[k] /= num_iteration_for_pred_; 65 | } 66 | } else if (objective_function_ != nullptr) { 67 | objective_function_->ConvertOutput(output, output); 68 | } 69 | } 70 | 71 | void GBDT::PredictLeafIndex(const double* features, double* output) const { 72 | int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_; 73 | for (int i = 0; i < total_tree; ++i) { 74 | output[i] = models_[i]->PredictLeafIndex(features); 75 | } 76 | } 77 | 78 | void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const { 79 | int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_; 80 | for (int i = 0; i < total_tree; ++i) { 81 | output[i] = models_[i]->PredictLeafIndexByMap(features); 82 | } 83 | } 84 | 85 | } // namespace LightGBM 86 | -------------------------------------------------------------------------------- /src/treelearner/feature_parallel_tree_learner.cpp: -------------------------------------------------------------------------------- 1 | #include "parallel_tree_learner.h" 2 | 3 | #include <cstring> 4 | 5 | #include <vector> 6 | 7 | namespace LightGBM { 8 | 9 | 10 | template <typename TREELEARNER_T> 11 | FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const Config* config) 12 | :TREELEARNER_T(config) { 13 | } 14 | 15 | template <typename TREELEARNER_T> 16 | FeatureParallelTreeLearner<TREELEARNER_T>::~FeatureParallelTreeLearner() { 17 | 18 | } 19 | 20 | template <typename TREELEARNER_T> 21 | void FeatureParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, bool is_constant_hessian) { 22 | TREELEARNER_T::Init(train_data, is_constant_hessian); 23 | rank_ = Network::rank(); 24 | num_machines_ = Network::num_machines(); 25 | input_buffer_.resize((sizeof(SplitInfo) + sizeof(uint32_t) * this->config_->max_cat_threshold) * 2); 26 | output_buffer_.resize((sizeof(SplitInfo) + sizeof(uint32_t) * this->config_->max_cat_threshold) * 2); 27 | } 28 | 29 | 30 | template <typename TREELEARNER_T> 31 | void FeatureParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { 32 | TREELEARNER_T::BeforeTrain(); 33 | // get feature partition 34 | std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>()); 35 | std::vector<int> num_bins_distributed(num_machines_, 0); 36 | for (int i = 0; i < this->train_data_->num_total_features(); ++i) { 37 | int inner_feature_index = this->train_data_->InnerFeatureIndex(i); 38 | if (inner_feature_index == -1) { continue; } 39 | if (this->is_feature_used_[inner_feature_index]) { 40 | int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed)); 41 | feature_distribution[cur_min_machine].push_back(inner_feature_index); 42 | num_bins_distributed[cur_min_machine] += this->train_data_->FeatureNumBin(inner_feature_index); 43 | this->is_feature_used_[inner_feature_index] = false; 44 | } 45 | } 46 | // get local used features 47 | for (auto fid : feature_distribution[rank_]) { 48 | this->is_feature_used_[fid] = true; 49 | } 50 | } 51 | 52 | template <typename TREELEARNER_T> 53 | void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) { 54 | TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract); 55 | SplitInfo smaller_best_split, larger_best_split; 56 | // get best split at smaller leaf 57 | smaller_best_split = 
this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; 58 | // find local best split for larger leaf 59 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 60 | larger_best_split = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; 61 | } 62 | // sync global best info 63 | SyncUpGlobalBestSplit(input_buffer_.data(), input_buffer_.data(), &smaller_best_split, &larger_best_split, this->config_->max_cat_threshold); 64 | // update best split 65 | this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best_split; 66 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 67 | this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best_split; 68 | } 69 | } 70 | 71 | // instantiate template classes, otherwise linker cannot find the code 72 | template class FeatureParallelTreeLearner<GPUTreeLearner>; 73 | template class FeatureParallelTreeLearner<SerialTreeLearner>; 74 | } // namespace LightGBM 75 | -------------------------------------------------------------------------------- /tests/python_package_test/test_basic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import os 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgb 8 | import numpy as np 9 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file 10 | from sklearn.model_selection import train_test_split 11 | 12 | 13 | class TestBasic(unittest.TestCase): 14 | 15 | def test(self): 16 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 17 | train_data = lgb.Dataset(X_train, label=y_train) 18 | valid_data = train_data.create_valid(X_test, label=y_test) 19 | 20 | params = { 21 | "objective": "binary", 22 | "metric": "auc", 23 | "min_data": 10, 24 | "num_leaves": 15, 25 | "verbose": -1, 26 | "num_threads": 1, 27 | "max_bin": 255 28 | } 29 | bst = lgb.Booster(params, train_data) 30 | bst.add_valid(valid_data, "valid_1") 31 | 32 | for i in range(30): 33 | bst.update() 34 | if i % 10 == 0: 35 | print(bst.eval_train(), bst.eval_valid()) 36 | bst.save_model("model.txt") 37 | pred_from_matr = bst.predict(X_test) 38 | with tempfile.NamedTemporaryFile() as f: 39 | tname = f.name 40 | with open(tname, "w+b") as f: 41 | dump_svmlight_file(X_test, y_test, f) 42 | pred_from_file = bst.predict(tname) 43 | os.remove(tname) 44 | self.assertEqual(len(pred_from_matr), len(pred_from_file)) 45 | for preds in zip(pred_from_matr, pred_from_file): 46 | self.assertAlmostEqual(*preds, places=15) 47 | 48 | # check saved model persistence 49 | bst = lgb.Booster(params, model_file="model.txt") 50 | pred_from_model_file = bst.predict(X_test) 51 | self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) 52 | for preds in zip(pred_from_matr, pred_from_model_file): 53 | # we need to check the consistency of model file here, so test for exact equal 54 | self.assertEqual(*preds) 55 | 56 | # check early stopping is working. 
Make it stop very early, so the scores should be very close to zero 57 | pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} 58 | pred_early_stopping = bst.predict(X_test, **pred_parameter) 59 | self.assertEqual(len(pred_from_matr), len(pred_early_stopping)) 60 | for preds in zip(pred_early_stopping, pred_from_matr): 61 | # scores likely to be different, but prediction should still be the same 62 | self.assertEqual(preds[0] > 0, preds[1] > 0) 63 | 64 | def test_chunked_dataset(self): 65 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 66 | 67 | chunk_size = X_train.shape[0] // 10 + 1 68 | X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)] 69 | X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)] 70 | 71 | train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100}) 72 | valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100}) 73 | 74 | train_data.construct() 75 | valid_data.construct() 76 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare_rules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare_rules.R 3 | \name{lgb.prepare_rules} 4 | \alias{lgb.prepare_rules} 5 | \title{Data preparator for LightGBM datasets with rules (numeric)} 6 | \usage{ 7 | lgb.prepare_rules(data, rules = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | 12 | \item{rules}{A set of rules from the data preparator, if already used.} 13 | } 14 | \value{ 15 | A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 16 | } 17 | \description{ 18 | Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(iris) 24 | 25 | str(iris) 26 | # 'data.frame': 150 obs. of 5 variables: 27 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 28 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 29 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 30 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 31 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 32 | 33 | new_iris <- lgb.prepare_rules(data = iris) # Autoconverter 34 | str(new_iris$data) 35 | # 'data.frame': 150 obs. of 5 variables: 36 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 37 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 38 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 39 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 40 | # $ Species : num 1 1 1 1 1 1 1 1 1 1 ... 
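
# The cleaned data must be converted to a matrix before creating a lgb.Dataset.
# (A sketch only; treating Species as a 0-based multiclass label is purely illustrative.)
mat <- as.matrix(new_iris$data)
dtrain <- lgb.Dataset(data = mat[, 1:4], label = mat[, 5] - 1)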
41 | 42 | data(iris) # Erase iris dataset 43 | iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA) 44 | # Warning message: 45 | # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, : 46 | # invalid factor level, NA generated 47 | 48 | # Use conversion using known rules 49 | # Unknown factors become 0, excellent for sparse datasets 50 | newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules) 51 | 52 | # Unknown factor is now zero, perfect for sparse datasets 53 | newer_iris$data[1, ] # Species became 0 as it is an unknown factor 54 | # Sepal.Length Sepal.Width Petal.Length Petal.Width Species 55 | # 1 5.1 3.5 1.4 0.2 0 56 | 57 | newer_iris$data[1, 5] <- 1 # Put back real initial value 58 | 59 | # Is the newly created dataset equal? YES! 60 | all.equal(new_iris$data, newer_iris$data) 61 | # [1] TRUE 62 | 63 | # Can we test our own rules? 64 | data(iris) # Erase iris dataset 65 | 66 | # We remapped values differently 67 | personal_rules <- list(Species = c("setosa" = 3, 68 | "versicolor" = 2, 69 | "virginica" = 1)) 70 | newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules) 71 | str(newest_iris$data) # SUCCESS! 72 | # 'data.frame': 150 obs. of 5 variables: 73 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 74 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 75 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 76 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 77 | # $ Species : num 3 3 3 3 3 3 3 3 3 3 ... 78 | 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /include/LightGBM/dataset_loader.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_DATASET_LOADER_H_ 2 | #define LIGHTGBM_DATASET_LOADER_H_ 3 | 4 | #include <LightGBM/dataset.h> 5 | 6 | namespace LightGBM { 7 | 8 | class DatasetLoader { 9 | public: 10 | 11 | LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename); 12 | 13 | LIGHTGBM_EXPORT ~DatasetLoader(); 14 | 15 | LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file, int rank, int num_machines); 16 | 17 | LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file) { 18 | return LoadFromFile(filename, initscore_file, 0, 1); 19 | } 20 | 21 | LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const char* initscore_file, const Dataset* train_data); 22 | 23 | LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values, 24 | int** sample_indices, int num_col, const int* num_per_col, 25 | size_t total_sample_size, data_size_t num_data); 26 | 27 | /*! \brief Disable copy */ 28 | DatasetLoader& operator=(const DatasetLoader&) = delete; 29 | /*! 
\brief Disable copy */ 30 | DatasetLoader(const DatasetLoader&) = delete; 31 | 32 | private: 33 | 34 | Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); 35 | 36 | void SetHeader(const char* filename); 37 | 38 | void CheckDataset(const Dataset* dataset); 39 | 40 | std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); 41 | 42 | std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data); 43 | 44 | std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); 45 | 46 | void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset); 47 | 48 | /*! \brief Extract local features from memory */ 49 | void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset); 50 | 51 | /*! \brief Extract local features from file */ 52 | void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset); 53 | 54 | /*! \brief Check can load from binary file */ 55 | std::string CheckCanLoadFromBin(const char* filename); 56 | 57 | const Config& config_; 58 | /*! \brief Random generator*/ 59 | Random random_; 60 | /*! \brief prediction function for initial model */ 61 | const PredictFunction& predict_fun_; 62 | /*! \brief number of classes */ 63 | int num_class_; 64 | /*! \brief index of label column */ 65 | int label_idx_; 66 | /*! \brief index of weight column */ 67 | int weight_idx_; 68 | /*! \brief index of group column */ 69 | int group_idx_; 70 | /*! \brief Indices of ignored features */ 71 | std::unordered_set<int> ignore_features_; 72 | /*! \brief store feature names */ 73 | std::vector<std::string> feature_names_; 74 | /*! \brief Indices of categorical features */ 75 | std::unordered_set<int> categorical_features_; 76 | }; 77 | 78 | } // namespace LightGBM 79 | 80 | #endif // LIGHTGBM_DATASET_LOADER_H_ 81 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare_rules2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare_rules2.R 3 | \name{lgb.prepare_rules2} 4 | \alias{lgb.prepare_rules2} 5 | \title{Data preparator for LightGBM datasets with rules (integer)} 6 | \usage{ 7 | lgb.prepare_rules2(data, rules = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | 12 | \item{rules}{A set of rules from the data preparator, if already used.} 13 | } 14 | \value{ 15 | A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 16 | } 17 | \description{ 18 | Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that some programs do not support integer-only input. Consider this a half-memory technique which is dangerous, especially for LightGBM. 
Consider this as a half memory technique which is dangerous, especially for LightGBM. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(iris) 24 | 25 | str(iris) 26 | # 'data.frame': 150 obs. of 5 variables: 27 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 28 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 29 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 30 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 31 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 32 | 33 | new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter 34 | str(new_iris$data) 35 | # 'data.frame': 150 obs. of 5 variables: 36 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 37 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 38 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 39 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 40 | # $ Species : int 1 1 1 1 1 1 1 1 1 1 ... 41 | 42 | data(iris) # Erase iris dataset 43 | iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA) 44 | # Warning message: 45 | # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, : 46 | # invalid factor level, NA generated 47 | 48 | # Use conversion using known rules 49 | # Unknown factors become 0, excellent for sparse datasets 50 | newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules) 51 | 52 | # Unknown factor is now zero, perfect for sparse datasets 53 | newer_iris$data[1, ] # Species became 0 as it is an unknown factor 54 | # Sepal.Length Sepal.Width Petal.Length Petal.Width Species 55 | # 1 5.1 3.5 1.4 0.2 0 56 | 57 | newer_iris$data[1, 5] <- 1 # Put back real initial value 58 | 59 | # Is the newly created dataset equal? YES! 60 | all.equal(new_iris$data, newer_iris$data) 61 | # [1] TRUE 62 | 63 | # Can we test our own rules? 64 | data(iris) # Erase iris dataset 65 | 66 | # We remapped values differently 67 | personal_rules <- list(Species = c("setosa" = 3L, 68 | "versicolor" = 2L, 69 | "virginica" = 1L)) 70 | newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules) 71 | str(newest_iris$data) # SUCCESS! 72 | # 'data.frame': 150 obs. of 5 variables: 73 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 74 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 75 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 76 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 77 | # $ Species : int 3 3 3 3 3 3 3 3 3 3 ... 78 | 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /R-package/demo/categorical_features_prepare.R: -------------------------------------------------------------------------------- 1 | # Here we are going to try training a model with categorical features 2 | 3 | # Load libraries 4 | library(data.table) 5 | library(lightgbm) 6 | 7 | # Load data and look at the structure 8 | # 9 | # Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables: 10 | # $ age : int 30 33 35 30 59 35 36 39 41 43 ... 11 | # $ job : chr "unemployed" "services" "management" "management" ... 12 | # $ marital : chr "married" "married" "single" "married" ... 13 | # $ education: chr "primary" "secondary" "tertiary" "tertiary" ... 14 | # $ default : chr "no" "no" "no" "no" ... 15 | # $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ... 16 | # $ housing : chr "no" "yes" "yes" "yes" ... 
17 | # $ loan : chr "no" "yes" "no" "yes" ... 18 | # $ contact : chr "cellular" "cellular" "cellular" "unknown" ... 19 | # $ day : int 19 11 16 3 5 23 14 6 14 17 ... 20 | # $ month : chr "oct" "may" "apr" "jun" ... 21 | # $ duration : int 79 220 185 199 226 141 341 151 57 313 ... 22 | # $ campaign : int 1 1 1 4 1 2 1 2 2 1 ... 23 | # $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ... 24 | # $ previous : int 0 4 1 0 0 3 2 0 0 2 ... 25 | # $ poutcome : chr "unknown" "failure" "failure" "unknown" ... 26 | # $ y : chr "no" "no" "no" "no" ... 27 | data(bank, package = "lightgbm") 28 | str(bank) 29 | 30 | # We must now transform the data to fit in LightGBM 31 | # For this task, we use lgb.prepare 32 | # The function transforms the data into a fittable format 33 | # 34 | # Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables: 35 | # $ age : int 30 33 35 30 59 35 36 39 41 43 ... 36 | # $ job : chr "unemployed" "services" "management" "management" ... 37 | # $ marital : chr "married" "married" "single" "married" ... 38 | # $ education: chr "primary" "secondary" "tertiary" "tertiary" ... 39 | # $ default : chr "no" "no" "no" "no" ... 40 | # $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ... 41 | # $ housing : chr "no" "yes" "yes" "yes" ... 42 | # $ loan : chr "no" "yes" "no" "yes" ... 43 | # $ contact : chr "cellular" "cellular" "cellular" "unknown" ... 44 | # $ day : int 19 11 16 3 5 23 14 6 14 17 ... 45 | # $ month : chr "oct" "may" "apr" "jun" ... 46 | # $ duration : int 79 220 185 199 226 141 341 151 57 313 ... 47 | # $ campaign : int 1 1 1 4 1 2 1 2 2 1 ... 48 | # $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ... 49 | # $ previous : int 0 4 1 0 0 3 2 0 0 2 ... 50 | # $ poutcome : chr "unknown" "failure" "failure" "unknown" ... 51 | # $ y : chr "no" "no" "no" "no" ... 52 | bank <- lgb.prepare(data = bank) 53 | str(bank) 54 | 55 | # Subtract 1 from the label because it must be between 0 and 1 56 | bank$y <- bank$y - 1 57 | 58 | # Data input to LightGBM must be a matrix, without the label 59 | my_data <- as.matrix(bank[, 1:16, with = FALSE]) 60 | 61 | # Creating the LightGBM dataset with categorical features 62 | # The categorical features must be indexed like in R (1-indexed, not 0-indexed) 63 | lgb_data <- lgb.Dataset(data = my_data, 64 | label = bank$y, 65 | categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)) 66 | 67 | # We can now train a model 68 | model <- lgb.train(list(objective = "binary", 69 | metric = "l2", 70 | min_data = 1, 71 | learning_rate = 0.1, 72 | 73 | min_hessian = 1, 74 | max_depth = 2), 75 | lgb_data, 76 | 100, 77 | valids = list(train = lgb_data)) 78 | 79 | # Try to find split_feature: 2 80 | # If you find it, it means it used a categorical feature in the first tree 81 | lgb.dump(model, num_iteration = 1) 82 | --------------------------------------------------------------------------------