├── VERSION.txt
├── R-package
├── src
│ ├── Makevars
│ └── Makevars.win
├── .Rbuildignore
├── data
│ ├── bank.rda
│ ├── agaricus.test.rda
│ └── agaricus.train.rda
├── tests
│ ├── testthat.R
│ └── testthat
│ │ ├── test_custom_objective.R
│ │ ├── test_parameters.R
│ │ ├── test_dataset.R
│ │ └── test_basic.R
├── man
│ ├── lgb.Dataset.construct.Rd
│ ├── lgb.Dataset.save.Rd
│ ├── bank.Rd
│ ├── lgb.Dataset.set.categorical.Rd
│ ├── slice.Rd
│ ├── lgb.Dataset.set.reference.Rd
│ ├── agaricus.test.Rd
│ ├── agaricus.train.Rd
│ ├── dim.Rd
│ ├── lgb.Dataset.create.valid.Rd
│ ├── lgb.dump.Rd
│ ├── dimnames.lgb.Dataset.Rd
│ ├── lgb.save.Rd
│ ├── readRDS.lgb.Booster.Rd
│ ├── getinfo.Rd
│ ├── setinfo.Rd
│ ├── lgb.Dataset.Rd
│ ├── lgb.get.eval.result.Rd
│ ├── lgb.importance.Rd
│ ├── lgb.load.Rd
│ ├── lgb.plot.importance.Rd
│ ├── lgb.interprete.Rd
│ ├── lgb.unloader.Rd
│ ├── lgb.plot.interpretation.Rd
│ ├── lgb.prepare.Rd
│ ├── saveRDS.lgb.Booster.Rd
│ ├── lgb.model.dt.tree.Rd
│ ├── lgb.prepare2.Rd
│ ├── predict.lgb.Booster.Rd
│ ├── lgb.prepare_rules.Rd
│ └── lgb.prepare_rules2.Rd
├── demo
│ ├── 00Index
│ ├── boost_from_prediction.R
│ ├── efficient_many_training.R
│ ├── cross_validation.R
│ ├── early_stopping.R
│ ├── multiclass.R
│ ├── multiclass_custom_objective.R
│ └── categorical_features_prepare.R
├── build_package.R
├── LICENSE
├── NAMESPACE
├── R
│ ├── readRDS.lgb.Booster.R
│ ├── lgb.importance.R
│ ├── lgb.unloader.R
│ ├── saveRDS.lgb.Booster.R
│ └── lgb.plot.importance.R
└── DESCRIPTION
├── examples
├── .gitignore
├── parallel_learning
│ ├── mlist.txt
│ ├── predict.conf
│ └── README.md
├── lambdarank
│ ├── predict.conf
│ ├── rank.test.query
│ ├── README.md
│ └── rank.train.query
├── regression
│ ├── predict.conf
│ └── README.md
├── binary_classification
│ ├── predict.conf
│ ├── forced_splits.json
│ └── README.md
├── multiclass_classification
│ ├── predict.conf
│ ├── README.md
│ └── train.conf
└── python-guide
│ ├── simple_example.py
│ ├── plot_example.py
│ ├── sklearn_example.py
│ └── README.md
├── .gitmodules
├── docs
├── _static
│ ├── images
│ │ ├── gcc-bars.png
│ │ ├── gcc-chart.png
│ │ ├── gcc-table.png
│ │ ├── leaf-wise.png
│ │ ├── level-wise.png
│ │ ├── gcc-meetup-1.png
│ │ ├── gcc-meetup-2.png
│ │ ├── gcc-comparison-1.png
│ │ ├── gcc-comparison-2.png
│ │ ├── screenshot-system.png
│ │ ├── screenshot-use-gpu.png
│ │ ├── screenshot-debug-run.png
│ │ ├── screenshot-r-mingw-used.png
│ │ ├── screenshot-boost-compiled.png
│ │ ├── gpu-performance-comparison.png
│ │ ├── screenshot-create-directory.png
│ │ ├── screenshot-downloading-cmake.png
│ │ ├── screenshot-files-to-remove.png
│ │ ├── screenshot-git-for-windows.png
│ │ ├── screenshot-configured-lightgbm.png
│ │ ├── screenshot-mingw-installation.png
│ │ ├── screenshot-segmentation-fault.png
│ │ ├── screenshot-environment-variables.png
│ │ ├── screenshot-mingw-makefiles-to-use.png
│ │ ├── screenshot-advanced-system-settings.png
│ │ ├── screenshot-lightgbm-in-cli-with-gpu.png
│ │ ├── screenshot-added-manual-entry-in-cmake.png
│ │ ├── screenshot-configured-and-generated-cmake.png
│ │ └── screenshot-lightgbm-with-gpu-support-compiled.png
│ └── js
│ │ └── script.js
├── .linkcheckerrc
├── Makefile
├── make.bat
├── README.rst
├── Python-API.rst
├── index.rst
├── gcc-Tips.rst
├── Advanced-Topics.rst
├── Parameters-Tuning.rst
└── Quick-Start.rst
├── tests
├── cpp_test
│ ├── predict.conf
│ ├── train.conf
│ └── test.py
└── python_package_test
│ └── test_basic.py
├── pmml
└── README.md
├── python-package
├── MANIFEST.in
└── lightgbm
│ ├── __init__.py
│ └── libpath.py
├── docker
├── dockerfile-cli
├── dockerfile-python
├── gpu
│ └── README.md
└── README.md
├── .github
└── ISSUE_TEMPLATE.md
├── include
└── LightGBM
│ ├── export.h
│ ├── prediction_early_stop.h
│ ├── utils
│ ├── threading.h
│ ├── openmp_wrapper.h
│ ├── file_io.h
│ ├── pipeline_reader.h
│ ├── log.h
│ └── random.h
│ ├── meta.h
│ ├── application.h
│ ├── objective_function.h
│ ├── tree_learner.h
│ └── dataset_loader.h
├── src
├── main.cpp
├── network
│ └── linkers_mpi.cpp
├── treelearner
│ ├── tree_learner.cpp
│ └── feature_parallel_tree_learner.cpp
├── boosting
│ ├── boosting.cpp
│ ├── prediction_early_stop.cpp
│ └── gbdt_prediction.cpp
└── metric
│ └── metric.cpp
├── LICENSE
├── windows
└── LightGBM.sln
├── .travis
└── setup.sh
├── .travis.yml
├── .nuget
└── create_nuget.py
├── swig
└── lightgbmlib.i
└── .appveyor.yml

/VERSION.txt:
--------------------------------------------------------------------------------
1 | 2.1.2
2 |
--------------------------------------------------------------------------------
/R-package/src/Makevars:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/R-package/src/Makevars.win:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt
2 |
--------------------------------------------------------------------------------
/R-package/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^build_package.R$
2 |
--------------------------------------------------------------------------------
/examples/parallel_learning/mlist.txt:
--------------------------------------------------------------------------------
1 | 192.168.1.101 12400
2 | 192.168.1.102 12400
3 |
--------------------------------------------------------------------------------
/R-package/data/bank.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/R-package/data/bank.rda
--------------------------------------------------------------------------------
/R-package/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(lightgbm)
3 |
4 | test_check("lightgbm")
5 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "include/boost/compute"]
2 | path = compute
3 | url = https://github.com/boostorg/compute
4 |
--------------------------------------------------------------------------------
/R-package/data/agaricus.test.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/R-package/data/agaricus.test.rda
--------------------------------------------------------------------------------
/docs/_static/images/gcc-bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-bars.png
--------------------------------------------------------------------------------
/R-package/data/agaricus.train.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/R-package/data/agaricus.train.rda
--------------------------------------------------------------------------------
/docs/_static/images/gcc-chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-chart.png
--------------------------------------------------------------------------------
/docs/_static/images/gcc-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-table.png
--------------------------------------------------------------------------------
/docs/_static/images/leaf-wise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/leaf-wise.png
--------------------------------------------------------------------------------
/docs/_static/images/level-wise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/level-wise.png
--------------------------------------------------------------------------------
/examples/lambdarank/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = rank.test
5 |
6 | input_model= LightGBM_model.txt
7 |
--------------------------------------------------------------------------------
/tests/cpp_test/predict.conf:
--------------------------------------------------------------------------------
1 | data=../data/categorical.data
2 |
3 | input_model=LightGBM_model.txt
4 |
5 | task=predict
6 |
--------------------------------------------------------------------------------
/docs/_static/images/gcc-meetup-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-meetup-1.png
--------------------------------------------------------------------------------
/docs/_static/images/gcc-meetup-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-meetup-2.png
--------------------------------------------------------------------------------
/examples/regression/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = regression.test
5 |
6 | input_model= LightGBM_model.txt
7 |
--------------------------------------------------------------------------------
/docs/_static/images/gcc-comparison-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-comparison-1.png
--------------------------------------------------------------------------------
/docs/_static/images/gcc-comparison-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gcc-comparison-2.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-system.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-system.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-use-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-use-gpu.png
--------------------------------------------------------------------------------
/examples/binary_classification/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = binary.test
5 |
6 | input_model= LightGBM_model.txt
7 |
--------------------------------------------------------------------------------
/examples/parallel_learning/predict.conf:
--------------------------------------------------------------------------------
1 |
2 | task = predict
3 |
4 | data = binary.test
5 |
6 | input_model= LightGBM_model.txt
7 |
8 |
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-debug-run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-debug-run.png
--------------------------------------------------------------------------------
/examples/multiclass_classification/predict.conf:
--------------------------------------------------------------------------------
1 | task = predict
2 |
3 | data = multiclass.test
4 |
5 | input_model= LightGBM_model.txt
6 |
--------------------------------------------------------------------------------
/tests/cpp_test/train.conf:
--------------------------------------------------------------------------------
1 | data=../data/categorical.data
2 |
3 | app=binary
4 |
5 | num_trees=10
6 |
7 | categorical_column=0,1,4,5,6
8 |
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-r-mingw-used.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-r-mingw-used.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-boost-compiled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-boost-compiled.png
--------------------------------------------------------------------------------
/docs/_static/images/gpu-performance-comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/gpu-performance-comparison.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-create-directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-create-directory.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-downloading-cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-downloading-cmake.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-files-to-remove.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-files-to-remove.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-git-for-windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-git-for-windows.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-configured-lightgbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-configured-lightgbm.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-mingw-installation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-mingw-installation.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-segmentation-fault.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-segmentation-fault.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-environment-variables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-environment-variables.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-mingw-makefiles-to-use.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-mingw-makefiles-to-use.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-advanced-system-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-advanced-system-settings.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-lightgbm-in-cli-with-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-lightgbm-in-cli-with-gpu.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-added-manual-entry-in-cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-added-manual-entry-in-cmake.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-configured-and-generated-cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-configured-and-generated-cmake.png
--------------------------------------------------------------------------------
/docs/_static/images/screenshot-lightgbm-with-gpu-support-compiled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/LightGBM/master/docs/_static/images/screenshot-lightgbm-with-gpu-support-compiled.png
--------------------------------------------------------------------------------
/tests/cpp_test/test.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import glob
3 | import numpy as np
4 |
5 | preds = [np.loadtxt(name) for name in glob.glob('*.pred')]
6 | np.testing.assert_array_almost_equal(preds[0], preds[1], decimal=5)
7 |
--------------------------------------------------------------------------------
/pmml/README.md:
--------------------------------------------------------------------------------
1 | PMML Generator
2 | ==============
3 |
4 | The old Python conversion script has been removed because it cannot support the new format of categorical features.
5 |
6 | Please refer to https://github.com/jpmml/jpmml-lightgbm.
7 |
--------------------------------------------------------------------------------
/examples/binary_classification/forced_splits.json:
--------------------------------------------------------------------------------
1 | {
2 |   "feature": 25,
3 |   "threshold": 1.30,
4 |   "left": {
5 |     "feature": 26,
6 |     "threshold": 0.85
7 |   },
8 |   "right": {
9 |     "feature": 26,
10 |     "threshold": 0.85
11 |   }
12 | }
13 |
--------------------------------------------------------------------------------
/docs/.linkcheckerrc:
--------------------------------------------------------------------------------
1 | [checking]
2 | recursionlevel=1
3 | anchors=1
4 | sslverify=0
5 |
6 | [filtering]
7 | ignore=public.tableau.com
8 | ignorewarnings=http-robots-denied,https-certificate-error
9 |
10 | [output]
11 | # Set to 1 if you want to see the full output, not only warnings and errors
12 | verbose=0
13 |
14 | [AnchorCheck]
15 |
--------------------------------------------------------------------------------
/docs/_static/js/script.js:
--------------------------------------------------------------------------------
1 | $(function() {
2 |     $('a[href^="./"][href*=".rst"]').attr('href', (i, val) => { return val.replace('.rst', '.html'); });  /* Replace '.rst' with '.html' in all internal links like './[Something].rst[#anchor]' */
3 |     $('.wy-nav-content').each(function () { this.style.setProperty('max-width', 'none', 'important'); });
4 | });
5 |
--------------------------------------------------------------------------------
/examples/lambdarank/rank.test.query:
--------------------------------------------------------------------------------
1 | 12
2 | 19
3 | 18
4 | 10
5 | 15
6 | 15
7 | 22
8 | 23
9 | 18
10 | 16
11 | 16
12 | 11
13 | 6
14 | 13
15 | 17
16 | 21
17 | 20
18 | 16
19 | 13
20 | 16
21 | 21
22 | 15
23 | 10
24 | 19
25 | 10
26 | 13
27 | 18
28 | 17
29 | 23
30 | 24
31 | 16
32 | 13
33 | 17
34 | 24
35 | 17
36 | 10
37 | 17
38 | 15
39 | 18
40 | 16
41 | 9
42 | 9
43 | 21
44 | 14
45 | 13
46 | 13
47 | 13
48 | 10
49 | 10
50 | 6
51 |
--------------------------------------------------------------------------------
/python-package/MANIFEST.in:
--------------------------------------------------------------------------------
1 | prune build
2 | include LICENSE
3 | include *.rst *.txt
4 | recursive-include lightgbm *.py *.txt *.so
5 | recursive-include compile *.txt *.so
6 | recursive-include compile/Release *.dll
7 | recursive-include compile/compute *
8 | recursive-include compile/include *
9 | recursive-include compile/src *
10 | recursive-include compile/windows LightGBM.sln LightGBM.vcxproj
11 | recursive-include compile/windows/x64/DLL *.dll
12 | global-exclude *.py[co]
13 |
--------------------------------------------------------------------------------
/docker/dockerfile-cli:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && \
4 |     apt-get install -y cmake build-essential gcc g++ git && \
5 |     rm -rf /var/lib/apt/lists/*
6 |
7 | RUN git clone --recursive --branch stable https://github.com/Microsoft/LightGBM && \
8 |     mkdir LightGBM/build && \
9 |     cd LightGBM/build && \
10 |     cmake .. && \
11 |     make -j4 && \
12 |     make install && \
13 |     cd ../.. && \
14 |     rm -rf LightGBM
15 |
16 | ENTRYPOINT ["lightgbm"]
17 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please search for your question in previous issues, on [stackoverflow](https://stackoverflow.com/questions/tagged/lightgbm), or with other search engines before you open a new one.
2 |
3 | For bugs and unexpected issues, please provide the following information, so that we can reproduce the problem on our system.
4 |
5 | ## Environment info
6 | Operating System:
7 | CPU:
8 | C++/Python/R version:
9 |
10 | ## Error Message:
11 |
12 | ## Reproducible examples
13 |
14 | ## Steps to reproduce
15 |
16 | 1.
17 | 2.
18 | 3.
19 |
--------------------------------------------------------------------------------
/examples/lambdarank/README.md:
--------------------------------------------------------------------------------
1 | LambdaRank Example
2 | ==================
3 |
4 | Here is an example of using LightGBM to run a LambdaRank task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/examples/regression/README.md:
--------------------------------------------------------------------------------
1 | Regression Example
2 | ==================
3 |
4 | Here is an example of using LightGBM to run a regression task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/include/LightGBM/export.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_EXPORT_H_
2 | #define LIGHTGBM_EXPORT_H_
3 |
4 | /** Macros for exporting symbols in MSVC/GCC/CLANG **/
5 |
6 | #ifdef __cplusplus
7 | #define LIGHTGBM_EXTERN_C extern "C"
8 | #else
9 | #define LIGHTGBM_EXTERN_C
10 | #endif
11 |
12 |
13 | #ifdef _MSC_VER
14 | #define LIGHTGBM_EXPORT __declspec(dllexport)
15 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
16 | #else
17 | #define LIGHTGBM_EXPORT
18 | #define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
19 | #endif
20 |
21 | #endif /** LIGHTGBM_EXPORT_H_ **/
22 |
--------------------------------------------------------------------------------
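A minimal usage sketch for the export macros above (not a file from this repository; the function name is hypothetical and not part of LightGBM's actual C API):

```cpp
// Sketch: declaring an exported, C-linkage entry point with the macros above.
// LGBM_HypotheticalAnswer is a made-up name used only for illustration.
#include <LightGBM/export.h>

LIGHTGBM_C_EXPORT int LGBM_HypotheticalAnswer();

int LGBM_HypotheticalAnswer() {
  // On MSVC the symbol is marked __declspec(dllexport); elsewhere it is
  // plain extern "C", so other languages can resolve it by name.
  return 42;
}
```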
/examples/binary_classification/README.md:
--------------------------------------------------------------------------------
1 | Binary Classification Example
2 | =============================
3 |
4 | Here is an example of using LightGBM to run a binary classification task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/examples/multiclass_classification/README.md:
--------------------------------------------------------------------------------
1 | Multiclass Classification Example
2 | =================================
3 |
4 | Here is an example of using LightGBM to run a multiclass classification task.
5 |
6 | ***You should copy the executable file to this folder first.***
7 |
8 | Training
9 | --------
10 |
11 | Run the following command in this folder:
12 |
13 | ```
14 | "./lightgbm" config=train.conf
15 | ```
16 |
17 | Prediction
18 | ----------
19 |
20 | You should finish training first.
21 |
22 | Run the following command in this folder:
23 |
24 | ```
25 | "./lightgbm" config=predict.conf
26 | ```
27 |
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <LightGBM/application.h>
3 |
4 | int main(int argc, char** argv) {
5 |   try {
6 |     LightGBM::Application app(argc, argv);
7 |     app.Run();
8 |   }
9 |   catch (const std::exception& ex) {
10 |     std::cerr << "Met Exceptions:" << std::endl;
11 |     std::cerr << ex.what() << std::endl;
12 |     exit(-1);
13 |   }
14 |   catch (const std::string& ex) {
15 |     std::cerr << "Met Exceptions:" << std::endl;
16 |     std::cerr << ex << std::endl;
17 |     exit(-1);
18 |   }
19 |   catch (...) {
20 |     std::cerr << "Unknown Exceptions" << std::endl;
21 |     exit(-1);
22 |   }
23 | }
24 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.construct.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.construct}
4 | \alias{lgb.Dataset.construct}
5 | \title{Construct Dataset explicitly}
6 | \usage{
7 | lgb.Dataset.construct(dataset)
8 | }
9 | \arguments{
10 | \item{dataset}{Object of class \code{lgb.Dataset}}
11 | }
12 | \description{
13 | Construct Dataset explicitly
14 | }
15 | \examples{
16 | \dontrun{
17 | library(lightgbm)
18 | data(agaricus.train, package = "lightgbm")
19 | train <- agaricus.train
20 | dtrain <- lgb.Dataset(train$data, label = train$label)
21 | lgb.Dataset.construct(dtrain)
22 | }
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/R-package/demo/00Index:
--------------------------------------------------------------------------------
1 | basic_walkthrough	Basic feature walkthrough
2 | boost_from_prediction	Boosting from existing prediction
3 | categorical_feature_prepare	Categorical Feature Preparation
4 | categorical_feature_rules	Categorical Feature Preparation with Rules
5 | cross_validation	Cross Validation
6 | early_stopping	Early Stop in training
7 | efficient_many_training	Efficiency for Many Model Trainings
8 | multiclass	Multiclass training/prediction
9 | leaf_stability	Leaf (in)Stability example
10 | weight_param	Weight-Parameter adjustment relationship
11 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    = -W
6 | SPHINXBUILD   = sphinx-build
7 | SPHINXPROJ    = LightGBM
8 | SOURCEDIR     = .
9 | BUILDDIR      = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.save}
4 | \alias{lgb.Dataset.save}
5 | \title{Save \code{lgb.Dataset} to a binary file}
6 | \usage{
7 | lgb.Dataset.save(dataset, fname)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 |
12 | \item{fname}{filename of the output file}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | Save \code{lgb.Dataset} to a binary file
19 | }
20 | \examples{
21 |
22 | \dontrun{
23 | library(lightgbm)
24 | data(agaricus.train, package = "lightgbm")
25 | train <- agaricus.train
26 | dtrain <- lgb.Dataset(train$data, label = train$label)
27 | lgb.Dataset.save(dtrain, "data.bin")
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/docker/dockerfile-python:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && \
4 |     apt-get install -y cmake build-essential gcc g++ git wget && \
5 |
6 | # python-package
7 | # miniconda
8 |     wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
9 |     /bin/bash Miniconda3-latest-Linux-x86_64.sh -f -b -p /opt/conda && \
10 |     export PATH="/opt/conda/bin:$PATH" && \
11 | # lightgbm
12 |     conda install -y numpy scipy scikit-learn pandas && \
13 |     git clone --recursive https://github.com/Microsoft/LightGBM && \
14 |     cd LightGBM/python-package && python setup.py install && \
15 |
16 | # clean
17 |     apt-get autoremove -y && apt-get clean && \
18 |     conda clean -i -l -t -y && \
19 |     rm -rf /usr/local/src/*
20 |
21 | ENV PATH /opt/conda/bin:$PATH
22 |
--------------------------------------------------------------------------------
/R-package/man/bank.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{bank}
5 | \alias{bank}
6 | \title{Bank Marketing Data Set}
7 | \format{A data.table with 4521 rows and 17 variables}
8 | \usage{
9 | data(bank)
10 | }
11 | \description{
12 | This data set is originally from the Bank Marketing data set,
13 | UCI Machine Learning Repository.
14 | }
15 | \details{
16 | It contains only the following: bank.csv with 10% of the examples and 17 inputs,
17 | randomly selected from 3 (older version of this dataset with fewer inputs).
18 | }
19 | \references{
20 | http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
21 |
22 | S. Moro, P. Cortez and P. Rita. (2014)
23 | A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems
24 | }
25 | \keyword{datasets}
26 |
--------------------------------------------------------------------------------
/examples/parallel_learning/README.md:
--------------------------------------------------------------------------------
1 | Parallel Learning Example
2 | =========================
3 |
4 | Here is an example of using LightGBM to perform parallel learning across 2 machines.
5 |
6 | 1. Edit [mlist.txt](./mlist.txt): write the IPs of the 2 machines that you want to run the application on.
7 |
8 | ```
9 | machine1_ip 12400
10 | machine2_ip 12400
11 | ```
12 |
13 | 2. Copy this folder and the executable file to the 2 machines that you want to run the application on.
14 |
15 | 3. Run this command in this folder on both machines:
16 |
17 | ```"./lightgbm" config=train.conf```
18 |
19 | This parallel learning example is based on sockets. LightGBM also supports parallel learning based on MPI.
20 |
21 | For more details about the usage of parallel learning, please refer to [this guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst).
22 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.set.categorical.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.set.categorical}
4 | \alias{lgb.Dataset.set.categorical}
5 | \title{Set categorical feature of \code{lgb.Dataset}}
6 | \usage{
7 | lgb.Dataset.set.categorical(dataset, categorical_feature)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 |
12 | \item{categorical_feature}{categorical features}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | Set categorical feature of \code{lgb.Dataset}
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | lgb.Dataset.save(dtrain, "lgb.Dataset.data")
27 | dtrain <- lgb.Dataset("lgb.Dataset.data")
28 | lgb.Dataset.set.categorical(dtrain, 1:2)
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=LightGBM
13 | set SPHINXOPTS=-W
14 |
15 | if "%1" == "" goto help
16 |
17 | %SPHINXBUILD% >NUL 2>NUL
18 | if errorlevel 9009 (
19 | 	echo.
20 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
21 | 	echo.installed, then set the SPHINXBUILD environment variable to point
22 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
23 | 	echo.may add the Sphinx directory to PATH.
24 | 	echo.
25 | 	echo.If you don't have Sphinx installed, grab it from
26 | 	echo.http://sphinx-doc.org/
27 | 	exit /b 1
28 | )
29 |
30 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
31 | goto end
32 |
33 | :help
34 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
35 |
36 | :end
37 | popd
38 |
--------------------------------------------------------------------------------
/docs/README.rst:
--------------------------------------------------------------------------------
1 | Documentation
2 | =============
3 |
4 | Documentation for LightGBM is generated using `Sphinx `__.
5 |
6 | List of parameters and their descriptions in `Parameters.rst <./Parameters.rst>`__
7 | is generated automatically from comments in `config file `__
8 | by `this script `__.
9 |
10 | After each commit on ``master``, documentation is updated and published to `Read the Docs `__.
11 |
12 | Build
13 | -----
14 |
15 | You can build the documentation locally. Just run the following in the ``docs`` folder
16 |
17 | for Python 3.x:
18 |
19 | .. code:: sh
20 |
21 |     pip install sphinx "sphinx_rtd_theme>=0.3"
22 |     make html
23 |
24 |
25 | for Python 2.x:
26 |
27 | .. code:: sh
28 |
29 |     pip install mock sphinx "sphinx_rtd_theme>=0.3"
30 |     make html
31 |
--------------------------------------------------------------------------------
/R-package/man/slice.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{slice}
4 | \alias{slice}
5 | \alias{slice.lgb.Dataset}
6 | \title{Slice a dataset}
7 | \usage{
8 | slice(dataset, ...)
9 |
10 | \method{slice}{lgb.Dataset}(dataset, idxset, ...)
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class "lgb.Dataset"}
14 |
15 | \item{...}{other parameters (currently not used)}
16 |
17 | \item{idxset}{an integer vector of indices of the rows needed}
18 | }
19 | \value{
20 | constructed sub-dataset
21 | }
22 | \description{
23 | Get a new \code{lgb.Dataset} containing the specified rows of the
24 | original lgb.Dataset object
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 |
33 | dsub <- lightgbm::slice(dtrain, 1:42)
34 | labels <- lightgbm::getinfo(dsub, "label")
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.set.reference.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.set.reference}
4 | \alias{lgb.Dataset.set.reference}
5 | \title{Set reference of \code{lgb.Dataset}}
6 | \usage{
7 | lgb.Dataset.set.reference(dataset, reference)
8 | }
9 | \arguments{
10 | \item{dataset}{object of class \code{lgb.Dataset}}
11 |
12 | \item{reference}{object of class \code{lgb.Dataset}}
13 | }
14 | \value{
15 | passed dataset
16 | }
17 | \description{
18 | If you want to use validation data, you should set its reference to the training data
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset(test$data, label = test$label)
29 | lgb.Dataset.set.reference(dtest, dtrain)
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/network/linkers_mpi.cpp:
--------------------------------------------------------------------------------
1 | #ifdef USE_MPI
2 | #include "linkers.h"
3 |
4 | namespace LightGBM {
5 |
6 | Linkers::Linkers(Config) {
7 |   is_init_ = false;
8 |   int argc = 0;
9 |   char**argv = nullptr;
10 |   int flag = 0;
11 |   MPI_SAFE_CALL(MPI_Initialized(&flag));  // test if MPI has been initialized
12 |   if (!flag) {  // if MPI is not started, start it
13 |     MPI_SAFE_CALL(MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &flag));
14 |   }
15 |   MPI_SAFE_CALL(MPI_Comm_size(MPI_COMM_WORLD, &num_machines_));
16 |   MPI_SAFE_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank_));
17 |   // wait for all clients to start up
18 |   MPI_SAFE_CALL(MPI_Barrier(MPI_COMM_WORLD));
19 |   bruck_map_ = BruckMap::Construct(rank_, num_machines_);
20 |   recursive_halving_map_ = RecursiveHalvingMap::Construct(rank_, num_machines_);
21 |   is_init_ = true;
22 | }
23 |
24 | Linkers::~Linkers() {
25 |   if (is_init_) {
26 |     MPI_SAFE_CALL(MPI_Finalize());
27 |   }
28 | }
29 |
30 |
31 | }  // namespace LightGBM
32 | #endif  // USE_MPI
33 |
--------------------------------------------------------------------------------
/R-package/build_package.R:
--------------------------------------------------------------------------------
1 | unlink("./src/include", recursive = TRUE)
2 | unlink("./src/src", recursive = TRUE)
3 | unlink("./src/compute", recursive = TRUE)
4 | unlink("./src/build", recursive = TRUE)
5 | unlink("./src/Release", recursive = TRUE)
6 | if (!file.copy("./../include", "./src/", overwrite = TRUE, recursive = TRUE)) {
7 |   stop("Cannot find folder LightGBM/include")
8 | }
9 | if (!file.copy("./../src", "./src/", overwrite = TRUE, recursive = TRUE)) {
10 |   stop("Cannot find folder LightGBM/src")
11 | }
12 | if (!file.copy("./../compute", "./src/", overwrite = TRUE, recursive = TRUE)) {
13 |   print("Cannot find folder LightGBM/compute, will disable GPU build")
14 | }
15 | if (!file.copy("./../CMakeLists.txt", "./src/", overwrite = TRUE, recursive = TRUE)) {
16 |   stop("Cannot find file LightGBM/CMakeLists.txt")
17 | }
18 | if (!file.exists("./src/_IS_FULL_PACKAGE")) {
19 |   file.create("./src/_IS_FULL_PACKAGE")
20 | }
21 | system("R CMD build --no-build-vignettes .")
22 | file.remove("./src/_IS_FULL_PACKAGE")
23 |
--------------------------------------------------------------------------------
/R-package/man/agaricus.test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{agaricus.test}
5 | \alias{agaricus.test}
6 | \title{Test part from Mushroom Data Set}
7 | \format{A list containing a label vector, and a dgCMatrix object with 1611
8 | rows and 126 variables}
9 | \usage{
10 | data(agaricus.test)
11 | }
12 | \description{
13 | This data set is originally from the Mushroom data set,
14 | UCI Machine Learning Repository.
15 | }
16 | \details{
17 | This data set includes the following fields:
18 |
19 | \itemize{
20 |     \item \code{label} the label for each record
21 |     \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
22 | }
23 | }
24 | \references{
25 | https://archive.ics.uci.edu/ml/datasets/Mushroom
26 |
27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
29 | School of Information and Computer Science.
30 | }
31 | \keyword{datasets}
32 |
--------------------------------------------------------------------------------
/R-package/man/agaricus.train.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lightgbm.R
3 | \docType{data}
4 | \name{agaricus.train}
5 | \alias{agaricus.train}
6 | \title{Training part from Mushroom Data Set}
7 | \format{A list containing a label vector, and a dgCMatrix object with 6513
8 | rows and 127 variables}
9 | \usage{
10 | data(agaricus.train)
11 | }
12 | \description{
13 | This data set is originally from the Mushroom data set,
14 | UCI Machine Learning Repository.
15 | }
16 | \details{
17 | This data set includes the following fields:
18 |
19 | \itemize{
20 |     \item \code{label} the label for each record
21 |     \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
22 | }
23 | }
24 | \references{
25 | https://archive.ics.uci.edu/ml/datasets/Mushroom
26 |
27 | Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
28 | [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
29 | School of Information and Computer Science.
30 | }
31 | \keyword{datasets}
32 |
--------------------------------------------------------------------------------
/R-package/man/dim.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{dim.lgb.Dataset}
4 | \alias{dim.lgb.Dataset}
5 | \title{Dimensions of an lgb.Dataset}
6 | \usage{
7 | \method{dim}{lgb.Dataset}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{Object of class \code{lgb.Dataset}}
11 |
12 | \item{...}{other parameters}
13 | }
14 | \value{
15 | a vector of numbers of rows and of columns
16 | }
17 | \description{
18 | Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
19 | }
20 | \details{
21 | Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
22 | be directly used with an \code{lgb.Dataset} object.
23 | }
24 | \examples{
25 | \dontrun{
26 | library(lightgbm)
27 | data(agaricus.train, package = "lightgbm")
28 | train <- agaricus.train
29 | dtrain <- lgb.Dataset(train$data, label = train$label)
30 |
31 | stopifnot(nrow(dtrain) == nrow(train$data))
32 | stopifnot(ncol(dtrain) == ncol(train$data))
33 | stopifnot(all(dim(dtrain) == dim(train$data)))
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.create.valid.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset.create.valid}
4 | \alias{lgb.Dataset.create.valid}
5 | \title{Construct validation data}
6 | \usage{
7 | lgb.Dataset.create.valid(dataset, data, info = list(), ...)
8 | }
9 | \arguments{
10 | \item{dataset}{\code{lgb.Dataset} object, training data}
11 |
12 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
13 |
14 | \item{info}{a list of information of the lgb.Dataset object}
15 |
16 | \item{...}{other information to pass to \code{info}.}
17 | }
18 | \value{
19 | constructed dataset
20 | }
21 | \description{
22 | Construct validation data according to training data
23 | }
24 | \examples{
25 | \dontrun{
26 | library(lightgbm)
27 | data(agaricus.train, package = "lightgbm")
28 | train <- agaricus.train
29 | dtrain <- lgb.Dataset(train$data, label = train$label)
30 | data(agaricus.test, package = "lightgbm")
31 | test <- agaricus.test
32 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) Microsoft Corporation
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/R-package/tests/testthat/test_custom_objective.R:
--------------------------------------------------------------------------------
1 | context('Test models with custom objective')
2 |
3 | data(agaricus.train, package='lightgbm')
4 | data(agaricus.test, package='lightgbm')
5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
7 | watchlist <- list(eval = dtest, train = dtrain)
8 |
9 | logregobj <- function(preds, dtrain) {
10 |   labels <- getinfo(dtrain, "label")
11 |   preds <- 1 / (1 + exp(-preds))
12 |   grad <- preds - labels
13 |   hess <- preds * (1 - preds)
14 |   return(list(grad = grad, hess = hess))
15 | }
16 |
17 | evalerror <- function(preds, dtrain) {
18 |   labels <- getinfo(dtrain, "label")
19 |   err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
20 |   return(list(name = "error", value = err, higher_better=FALSE))
21 | }
22 |
23 | param <- list(num_leaves=8, learning_rate=1,
24 |               objective=logregobj, metric="auc")
25 | num_round <- 10
26 |
27 | test_that("custom objective works", {
28 |   bst <- lgb.train(param, dtrain, num_round, watchlist, eval = evalerror)
29 |   expect_false(is.null(bst$record_evals))
30 | })
31 |
--------------------------------------------------------------------------------
/R-package/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) Microsoft Corporation
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/python-package/lightgbm/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """LightGBM, Light Gradient Boosting Machine.
3 |
4 | Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
5 | """
6 | from __future__ import absolute_import
7 |
8 | from .basic import Booster, Dataset
9 | from .callback import (early_stopping, print_evaluation, record_evaluation,
10 |                        reset_parameter)
11 | from .engine import cv, train
12 | import os
13 |
14 | try:
15 |     from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
16 | except ImportError:
17 |     pass
18 | try:
19 |     from .plotting import plot_importance, plot_metric, plot_tree, create_tree_digraph
20 | except ImportError:
21 |     pass
22 |
23 |
24 | dir_path = os.path.dirname(os.path.realpath(__file__))
25 |
26 | if os.path.isfile(os.path.join(dir_path, 'VERSION.txt')):
27 |     __version__ = open(os.path.join(dir_path, 'VERSION.txt')).read().strip()
28 |
29 | __all__ = ['Dataset', 'Booster',
30 |            'train', 'cv',
31 |            'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
32 |            'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
33 |            'plot_importance', 'plot_metric', 'plot_tree', 'create_tree_digraph']
34 |
--------------------------------------------------------------------------------
/include/LightGBM/prediction_early_stop.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_PREDICTION_EARLY_STOP_H_
2 | #define LIGHTGBM_PREDICTION_EARLY_STOP_H_
3 |
4 | #include <functional>
5 | #include <string>
6 |
7 | #include <LightGBM/export.h>
8 |
9 | namespace LightGBM {
10 |
11 | struct PredictionEarlyStopInstance {
12 |   /// Callback function type for early stopping.
13 |   /// Takes current prediction and number of elements in prediction
14 |   /// @returns true if prediction should stop according to criterion
15 |   using FunctionType = std::function<bool(const double*, int)>;
16 |
17 |   FunctionType callback_function;  // callback function itself
18 |   int round_period;  // call callback_function every `round_period` iterations
19 | };
20 |
21 | struct PredictionEarlyStopConfig {
22 |   int round_period;
23 |   double margin_threshold;
24 | };
25 |
26 | /// Create an early stopping algorithm of type `type`, with given round_period and margin threshold
27 | LIGHTGBM_EXPORT PredictionEarlyStopInstance CreatePredictionEarlyStopInstance(const std::string& type,
28 |                                                                               const PredictionEarlyStopConfig& config);
29 |
30 | }  // namespace LightGBM
31 |
32 | #endif  // LIGHTGBM_PREDICTION_EARLY_STOP_H_
33 |
--------------------------------------------------------------------------------
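A hedged sketch of how a prediction loop might drive the early-stop instance created from this header; the "binary" type string and the config values are assumptions about src/boosting/prediction_early_stop.cpp, not facts stated in the header itself:

```cpp
#include <LightGBM/prediction_early_stop.h>

#include <vector>

// Sketch: configure an early-stop instance and poll it during prediction.
// "binary" is an assumed type string; the thresholds are illustrative.
bool ShouldStop(const std::vector<double>& raw_scores) {
  LightGBM::PredictionEarlyStopConfig config;
  config.round_period = 10;       // evaluate the criterion every 10 iterations
  config.margin_threshold = 1.5;  // stop once the prediction margin exceeds this
  LightGBM::PredictionEarlyStopInstance early_stop =
      LightGBM::CreatePredictionEarlyStopInstance("binary", config);
  // A prediction loop would call this every early_stop.round_period iterations:
  return early_stop.callback_function(raw_scores.data(),
                                      static_cast<int>(raw_scores.size()));
}
```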
/R-package/man/lgb.dump.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.dump}
4 | \alias{lgb.dump}
5 | \title{Dump LightGBM model to JSON}
6 | \usage{
7 | lgb.dump(booster, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{booster}{Object of class \code{lgb.Booster}}
11 |
12 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
13 | }
14 | \value{
15 | JSON format of the model
16 | }
17 | \description{
18 | Dump LightGBM model to JSON
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
29 | params <- list(objective = "regression", metric = "l2")
30 | valids <- list(test = dtest)
31 | model <- lgb.train(params,
32 |                    dtrain,
33 |                    100,
34 |                    valids,
35 |                    min_data = 1,
36 |                    learning_rate = 1,
37 |                    early_stopping_rounds = 10)
38 | json_model <- lgb.dump(model)
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/R-package/man/dimnames.lgb.Dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{dimnames.lgb.Dataset}
4 | \alias{dimnames.lgb.Dataset}
5 | \alias{dimnames<-.lgb.Dataset}
6 | \title{Handling of column names of \code{lgb.Dataset}}
7 | \usage{
8 | \method{dimnames}{lgb.Dataset}(x)
9 |
10 | \method{dimnames}{lgb.Dataset}(x) <- value
11 | }
12 | \arguments{
13 | \item{x}{object of class \code{lgb.Dataset}}
14 |
15 | \item{value}{a list of two elements: the first one is ignored
16 | and the second one is column names}
17 | }
18 | \description{
19 | Only column names are supported for \code{lgb.Dataset}, thus setting of
20 | row names would have no effect and returned row names would be NULL.
21 | }
22 | \details{
23 | Generic \code{dimnames} methods are used by \code{colnames}.
24 | Since row names are irrelevant, it is recommended to use \code{colnames} directly.
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | lgb.Dataset.construct(dtrain)
33 | dimnames(dtrain)
34 | colnames(dtrain)
35 | colnames(dtrain) <- make.names(1:ncol(train$data))
36 | print(dtrain, verbose = TRUE)
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/include/LightGBM/utils/threading.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_UTILS_THREADING_H_
2 | #define LIGHTGBM_UTILS_THREADING_H_
3 |
4 | #include <LightGBM/utils/openmp_wrapper.h>
5 |
6 | #include <functional>
7 | #include <vector>
8 |
9 | namespace LightGBM {
10 |
11 | class Threading {
12 | public:
13 |
14 |   template <typename INDEX_T>
15 |   static inline void For(INDEX_T start, INDEX_T end, const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
16 |     int num_threads = 1;
17 | #pragma omp parallel
18 | #pragma omp master
19 |     {
20 |       num_threads = omp_get_num_threads();
21 |     }
22 |     INDEX_T num_inner = (end - start + num_threads - 1) / num_threads;
23 |     if (num_inner <= 0) { num_inner = 1; }
24 |     OMP_INIT_EX();
25 | #pragma omp parallel for schedule(static,1)
26 |     for (int i = 0; i < num_threads; ++i) {
27 |       OMP_LOOP_EX_BEGIN();
28 |       INDEX_T inner_start = start + num_inner * i;
29 |       INDEX_T inner_end = inner_start + num_inner;
30 |       if (inner_end > end) { inner_end = end; }
31 |       if (inner_start < end) {
32 |         inner_fun(i, inner_start, inner_end);
33 |       }
34 |       OMP_LOOP_EX_END();
35 |     }
36 |     OMP_THROW_EX();
37 |   }
38 | };
39 |
40 | }  // namespace LightGBM
41 |
42 | #endif  // LightGBM_UTILS_THREADING_H_
43 |
--------------------------------------------------------------------------------
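A hedged sketch of `Threading::For` from the header above, splitting an index range across OpenMP threads. The sum-reduction task is illustrative; `omp_get_max_threads()` is assumed to be available through the included openmp_wrapper.h (which provides a stub when OpenMP is off):

```cpp
#include <LightGBM/utils/threading.h>

#include <vector>

// Sketch: sum a vector by giving each thread a contiguous [start, end) slice.
// The lambda receives the thread index and its half-open range.
double ParallelSum(const std::vector<double>& values) {
  std::vector<double> partial(omp_get_max_threads(), 0.0);  // one slot per thread
  LightGBM::Threading::For<size_t>(
      0, values.size(),
      [&values, &partial](int thread_idx, size_t start, size_t end) {
        for (size_t i = start; i < end; ++i) {
          partial[thread_idx] += values[i];
        }
      });
  double total = 0.0;
  for (double p : partial) { total += p; }  // combine per-thread partial sums
  return total;
}
```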
/R-package/man/lgb.save.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.save}
4 | \alias{lgb.save}
5 | \title{Save LightGBM model}
6 | \usage{
7 | lgb.save(booster, filename, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{booster}{Object of class \code{lgb.Booster}}
11 | 
12 | \item{filename}{saved filename}
13 | 
14 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
15 | }
16 | \value{
17 | lgb.Booster
18 | }
19 | \description{
20 | Save LightGBM model
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 | dtrain,
35 | 100,
36 | valids,
37 | min_data = 1,
38 | learning_rate = 1,
39 | early_stopping_rounds = 10)
40 | lgb.save(model, "model.txt")
41 | }
42 | 
43 | }
44 | 
--------------------------------------------------------------------------------
/docs/Python-API.rst:
--------------------------------------------------------------------------------
1 | Python API
2 | ==========
3 | 
4 | Data Structure API
5 | ------------------
6 | 
7 | .. autoclass:: lightgbm.Dataset
8 | :members:
9 | :show-inheritance:
10 | 
11 | .. autoclass:: lightgbm.Booster
12 | :members:
13 | :show-inheritance:
14 | 
15 | 
16 | Training API
17 | ------------
18 | 
19 | .. autofunction:: lightgbm.train
20 | 
21 | .. autofunction:: lightgbm.cv
22 | 
23 | 
24 | Scikit-learn API
25 | ----------------
26 | 
27 | .. autoclass:: lightgbm.LGBMModel
28 | :members:
29 | :show-inheritance:
30 | 
31 | .. autoclass:: lightgbm.LGBMClassifier
32 | :members:
33 | :show-inheritance:
34 | 
35 | .. autoclass:: lightgbm.LGBMRegressor
36 | :members:
37 | :show-inheritance:
38 | 
39 | .. autoclass:: lightgbm.LGBMRanker
40 | :members:
41 | :show-inheritance:
42 | 
43 | 
44 | Callbacks
45 | ---------
46 | 
47 | .. autofunction:: lightgbm.early_stopping
48 | 
49 | .. autofunction:: lightgbm.print_evaluation
50 | 
51 | .. autofunction:: lightgbm.record_evaluation
52 | 
53 | .. autofunction:: lightgbm.reset_parameter
54 | 
55 | 
56 | Plotting
57 | --------
58 | 
59 | .. autofunction:: lightgbm.plot_importance
60 | 
61 | .. autofunction:: lightgbm.plot_metric
62 | 
63 | .. autofunction:: lightgbm.plot_tree
64 | 
65 | .. autofunction:: lightgbm.create_tree_digraph
66 | 
--------------------------------------------------------------------------------
/docker/gpu/README.md:
--------------------------------------------------------------------------------
1 | # Dockerfile for LightGBM GPU Version with Python
2 | 
3 | A Dockerfile for LightGBM utilizing nvidia-docker. The image is based on nvidia/cuda:8.0. LightGBM can be used in GPU and CPU modes and via Python (2.7 & 3.5).
4 | 
5 | ## Contents
6 | 
7 | - LightGBM (CPU + GPU)
8 | - Python 2.7 (Conda) + scikit-learn, notebooks, pandas, matplotlib
9 | - Python 3.5 (Conda) + scikit-learn, notebooks, pandas, matplotlib
10 | 
11 | Running the container starts a Jupyter notebook at localhost:8888.
12 | 
13 | Jupyter password: keras
14 | 
15 | ## Requirements
16 | 
17 | Requires Docker and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) on the host machine.
18 | 
19 | ## Quickstart
20 | 
21 | ### Build Docker Image
22 | 
23 | ```sh
24 | mkdir lightgbm-docker
25 | cd lightgbm-docker
26 | wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/gpu/dockerfile.gpu
27 | docker build -f dockerfile.gpu -t lightgbm-gpu . 
28 | ```
29 | 
30 | ### Run Image
31 | 
32 | ```sh
33 | nvidia-docker run --rm -d --name lightgbm-gpu -p 8888:8888 -v /home:/home lightgbm-gpu
34 | ```
35 | 
36 | ### Attach with Command Line Access (if required)
37 | 
38 | ```sh
39 | docker exec -it lightgbm-gpu bash
40 | ```
41 | 
42 | ### Jupyter Notebook
43 | 
44 | ```sh
45 | localhost:8888
46 | ```
47 | 
--------------------------------------------------------------------------------
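Before relying on the image built in the README above, it can help to confirm that containers can see the GPU at all; if this smoke test fails, LightGBM's GPU mode will fail too. This check is illustrative only (the CUDA image tag is an assumption; any CUDA image that ships `nvidia-smi` works):

```sh
nvidia-docker run --rm nvidia/cuda:8.0-cudnn5-devel nvidia-smi
```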
/R-package/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | S3method("dimnames<-",lgb.Dataset)
4 | S3method(dim,lgb.Dataset)
5 | S3method(dimnames,lgb.Dataset)
6 | S3method(getinfo,lgb.Dataset)
7 | S3method(predict,lgb.Booster)
8 | S3method(setinfo,lgb.Dataset)
9 | S3method(slice,lgb.Dataset)
10 | export(getinfo)
11 | export(lgb.Dataset)
12 | export(lgb.Dataset.construct)
13 | export(lgb.Dataset.create.valid)
14 | export(lgb.Dataset.save)
15 | export(lgb.Dataset.set.categorical)
16 | export(lgb.Dataset.set.reference)
17 | export(lgb.cv)
18 | export(lgb.dump)
19 | export(lgb.get.eval.result)
20 | export(lgb.importance)
21 | export(lgb.interprete)
22 | export(lgb.load)
23 | export(lgb.model.dt.tree)
24 | export(lgb.plot.importance)
25 | export(lgb.plot.interpretation)
26 | export(lgb.prepare)
27 | export(lgb.prepare2)
28 | export(lgb.prepare_rules)
29 | export(lgb.prepare_rules2)
30 | export(lgb.save)
31 | export(lgb.train)
32 | export(lgb.unloader)
33 | export(lightgbm)
34 | export(readRDS.lgb.Booster)
35 | export(saveRDS.lgb.Booster)
36 | export(setinfo)
37 | export(slice)
38 | import(methods)
39 | importFrom(R6,R6Class)
40 | importFrom(data.table,":=")
41 | importFrom(data.table,set)
42 | importFrom(graphics,barplot)
43 | importFrom(graphics,par)
44 | importFrom(magrittr,"%>%")
45 | importFrom(magrittr,"%T>%")
46 | useDynLib(lib_lightgbm)
47 | 
--------------------------------------------------------------------------------
/R-package/man/readRDS.lgb.Booster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/readRDS.lgb.Booster.R
3 | \name{readRDS.lgb.Booster}
4 | \alias{readRDS.lgb.Booster}
5 | \title{readRDS for lgb.Booster models}
6 | \usage{
7 | readRDS.lgb.Booster(file = "", refhook = NULL)
8 | }
9 | \arguments{
10 | \item{file}{a connection or the name of the file where the R object is saved to or read from.}
11 | 
12 | \item{refhook}{a hook function for handling reference objects.}
13 | }
14 | \value{
15 | lgb.Booster.
16 | }
17 | \description{
18 | Attempts to load a model using RDS.
19 | }
20 | \examples{
21 | \dontrun{
22 | library(lightgbm)
23 | data(agaricus.train, package = "lightgbm")
24 | train <- agaricus.train
25 | dtrain <- lgb.Dataset(train$data, label = train$label)
26 | data(agaricus.test, package = "lightgbm")
27 | test <- agaricus.test
28 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
29 | params <- list(objective = "regression", metric = "l2")
30 | valids <- list(test = dtest)
31 | model <- lgb.train(params,
32 | dtrain,
33 | 100,
34 | valids,
35 | min_data = 1,
36 | learning_rate = 1,
37 | early_stopping_rounds = 10)
38 | saveRDS.lgb.Booster(model, "model.rds")
39 | new_model <- readRDS.lgb.Booster("model.rds")
40 | }
41 | 
42 | }
43 | 
--------------------------------------------------------------------------------
/R-package/man/getinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{getinfo}
4 | \alias{getinfo}
5 | \alias{getinfo.lgb.Dataset}
6 | \title{Get information of an lgb.Dataset object}
7 | \usage{
8 | getinfo(dataset, ...)
9 | 
10 | \method{getinfo}{lgb.Dataset}(dataset, name, ...)
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class \code{lgb.Dataset}}
14 | 
15 | \item{...}{other parameters}
16 | 
17 | \item{name}{the name of the information field to get (see details)}
18 | }
19 | \value{
20 | info data
21 | }
22 | \description{
23 | Get information of an lgb.Dataset object
24 | }
25 | \details{
26 | The \code{name} field can be one of the following:
27 | 
28 | \itemize{
29 | \item \code{label}: the label that LightGBM learns from ;
30 | \item \code{weight}: per-observation weights used to rescale the loss ;
31 | \item \code{group}: group sizes, for ranking tasks ;
32 | \item \code{init_score}: the initial score, i.e. the base prediction LightGBM will boost from ;
33 | }
34 | }
35 | \examples{
36 | \dontrun{
37 | library(lightgbm)
38 | data(agaricus.train, package = "lightgbm")
39 | train <- agaricus.train
40 | dtrain <- lgb.Dataset(train$data, label = train$label)
41 | lgb.Dataset.construct(dtrain)
42 | 
43 | labels <- lightgbm::getinfo(dtrain, "label")
44 | lightgbm::setinfo(dtrain, "label", 1 - labels)
45 | 
46 | labels2 <- lightgbm::getinfo(dtrain, "label")
47 | stopifnot(all(labels2 == 1 - labels))
48 | }
49 | 
50 | }
51 | 
--------------------------------------------------------------------------------
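The `group` field mentioned above is what the ranking objectives read: one integer per query, in row order, giving how many consecutive rows belong to that query. A short illustration with hypothetical inputs (`rank_features` and `rank_labels` are assumed to exist and to contain 30 rows):

```r
# Three queries of 10, 6 and 14 rows; the sizes must sum to nrow(rank_features)
dtrain_rank <- lgb.Dataset(rank_features, label = rank_labels)
setinfo(dtrain_rank, "group", c(10L, 6L, 14L))
```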
/R-package/man/setinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{setinfo}
4 | \alias{setinfo}
5 | \alias{setinfo.lgb.Dataset}
6 | \title{Set information of an lgb.Dataset object}
7 | \usage{
8 | setinfo(dataset, ...)
9 | 
10 | \method{setinfo}{lgb.Dataset}(dataset, name, info, ...)
11 | }
12 | \arguments{
13 | \item{dataset}{Object of class "lgb.Dataset"}
14 | 
15 | \item{...}{other parameters}
16 | 
17 | \item{name}{the name of the field to set}
18 | 
19 | \item{info}{the information to set in the field}
20 | }
21 | \value{
22 | passed object
23 | }
24 | \description{
25 | Set information of an lgb.Dataset object
26 | }
27 | \details{
28 | The \code{name} field can be one of the following:
29 | 
30 | \itemize{
31 | \item \code{label}: the label that LightGBM learns from ;
32 | \item \code{weight}: per-observation weights used to rescale the loss ;
33 | \item \code{init_score}: the initial score, i.e. the base prediction LightGBM will boost from ;
34 | \item \code{group}: group sizes, for ranking tasks.
35 | }
36 | }
37 | \examples{
38 | \dontrun{
39 | library(lightgbm)
40 | data(agaricus.train, package = "lightgbm")
41 | train <- agaricus.train
42 | dtrain <- lgb.Dataset(train$data, label = train$label)
43 | lgb.Dataset.construct(dtrain)
44 | 
45 | labels <- lightgbm::getinfo(dtrain, "label")
46 | lightgbm::setinfo(dtrain, "label", 1 - labels)
47 | 
48 | labels2 <- lightgbm::getinfo(dtrain, "label")
49 | stopifnot(all.equal(labels2, 1 - labels))
50 | }
51 | 
52 | }
53 | 
--------------------------------------------------------------------------------
/R-package/demo/boost_from_prediction.R:
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 | require(methods)
3 | 
4 | # Load in the agaricus dataset
5 | data(agaricus.train, package = "lightgbm")
6 | data(agaricus.test, package = "lightgbm")
7 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
8 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
9 | 
10 | valids <- list(eval = dtest, train = dtrain)
11 | #--------------------Advanced features ---------------------------
12 | # advanced: start from an initial base prediction
13 | print("Start running example to start from an initial prediction")
14 | 
15 | # Train lightgbm for 1 round
16 | param <- list(num_leaves = 4,
17 | learning_rate = 1,
18 | nthread = 2,
19 | objective = "binary")
20 | bst <- lgb.train(param, dtrain, 1, valids = valids)
21 | 
22 | # Note: we need the margin value, not the transformed prediction, when setting init_score
23 | ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
24 | ptest <- predict(bst, agaricus.test$data, rawscore = TRUE)
25 | 
26 | # set the init_score property of dtrain and dtest
27 | # base margin is the base prediction we will boost from
28 | setinfo(dtrain, "init_score", ptrain)
29 | setinfo(dtest, "init_score", ptest)
30 | 
31 | print("This is the result of boosting from an initial prediction")
32 | bst <- lgb.train(params = param,
33 | data = dtrain,
34 | nrounds = 5,
35 | valids = valids)
36 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.Dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Dataset.R
3 | \name{lgb.Dataset}
4 | \alias{lgb.Dataset}
5 | \title{Construct lgb.Dataset object}
6 | \usage{
7 | lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL,
8 | categorical_feature = NULL, free_raw_data = TRUE, info = list(), ...)
9 | }
10 | \arguments{
11 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
12 | 
13 | \item{params}{a list of parameters}
14 | 
15 | \item{reference}{reference dataset}
16 | 
17 | \item{colnames}{names of columns}
18 | 
19 | \item{categorical_feature}{categorical features}
20 | 
21 | \item{free_raw_data}{TRUE to free the raw data after construction}
22 | 
23 | \item{info}{a list of information of the lgb.Dataset object}
24 | 
25 | \item{...}{other information to pass to \code{info} or parameters to pass to \code{params}}
26 | }
27 | \value{
28 | constructed dataset
29 | }
30 | \description{
31 | Construct lgb.Dataset object from dense matrix, sparse matrix
32 | or local file (that was created previously by saving an \code{lgb.Dataset}). 
33 | }
34 | \examples{
35 | \dontrun{
36 | library(lightgbm)
37 | data(agaricus.train, package = "lightgbm")
38 | train <- agaricus.train
39 | dtrain <- lgb.Dataset(train$data, label = train$label)
40 | lgb.Dataset.save(dtrain, "lgb.Dataset.data")
41 | dtrain <- lgb.Dataset("lgb.Dataset.data")
42 | lgb.Dataset.construct(dtrain)
43 | }
44 | 
45 | }
46 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.get.eval.result.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.get.eval.result}
4 | \alias{lgb.get.eval.result}
5 | \title{Get record evaluation result from booster}
6 | \usage{
7 | lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
8 | is_err = FALSE)
9 | }
10 | \arguments{
11 | \item{booster}{Object of class \code{lgb.Booster}}
12 | 
13 | \item{data_name}{name of dataset}
14 | 
15 | \item{eval_name}{name of evaluation}
16 | 
17 | \item{iters}{iterations; NULL will return all}
18 | 
19 | \item{is_err}{TRUE will return evaluation error instead}
20 | }
21 | \value{
22 | vector of evaluation result
23 | }
24 | \description{
25 | Get record evaluation result from booster
26 | }
27 | \examples{
28 | \dontrun{
29 | library(lightgbm)
30 | data(agaricus.train, package = "lightgbm")
31 | train <- agaricus.train
32 | dtrain <- lgb.Dataset(train$data, label = train$label)
33 | data(agaricus.test, package = "lightgbm")
34 | test <- agaricus.test
35 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
36 | params <- list(objective = "regression", metric = "l2")
37 | valids <- list(test = dtest)
38 | model <- lgb.train(params,
39 | dtrain,
40 | 100,
41 | valids,
42 | min_data = 1,
43 | learning_rate = 1,
44 | early_stopping_rounds = 10)
45 | lgb.get.eval.result(model, "test", "l2")
46 | }
47 | 
48 | }
49 | 
--------------------------------------------------------------------------------
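Following on from the example above, the returned vector makes it easy to see where early stopping landed; a short sketch (the plot styling is incidental):

```r
errors <- lgb.get.eval.result(model, "test", "l2")
plot(seq_along(errors), errors, type = "l", xlab = "iteration", ylab = "l2 (test)")
abline(v = model$best_iter, lty = 2)  # iteration chosen by early stopping
```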
/R-package/man/lgb.importance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.importance.R
3 | \name{lgb.importance}
4 | \alias{lgb.importance}
5 | \title{Compute feature importance in a model}
6 | \usage{
7 | lgb.importance(model, percentage = TRUE)
8 | }
9 | \arguments{
10 | \item{model}{object of class \code{lgb.Booster}.}
11 | 
12 | \item{percentage}{whether to show importance in relative percentage.}
13 | }
14 | \value{
15 | For a tree model, a \code{data.table} with the following columns:
16 | \itemize{
17 | \item \code{Feature} Feature names in the model.
18 | \item \code{Gain} The total gain of this feature's splits.
19 | \item \code{Cover} The number of observations related to this feature.
20 | \item \code{Frequency} The number of times the feature is used in a split across all trees.
21 | }
22 | }
23 | \description{
24 | Creates a \code{data.table} of feature importances in a model.
25 | }
26 | \examples{
27 | \dontrun{
28 | library(lightgbm)
29 | data(agaricus.train, package = "lightgbm")
30 | train <- agaricus.train
31 | dtrain <- lgb.Dataset(train$data, label = train$label)
32 | 
33 | params = list(objective = "binary",
34 | learning_rate = 0.01, num_leaves = 63, max_depth = -1,
35 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
36 | model <- lgb.train(params, dtrain, 20)
37 | 
38 | tree_imp1 <- lgb.importance(model, percentage = TRUE)
39 | tree_imp2 <- lgb.importance(model, percentage = FALSE)
40 | }
41 | 
42 | }
43 | 
--------------------------------------------------------------------------------
/src/treelearner/tree_learner.cpp:
--------------------------------------------------------------------------------
1 | #include <LightGBM/tree_learner.h>
2 | 
3 | #include "serial_tree_learner.h"
4 | #include "gpu_tree_learner.h"
5 | #include "parallel_tree_learner.h"
6 | 
7 | namespace LightGBM {
8 | 
9 | TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, const std::string& device_type, const Config* config) {
10 | if (device_type == std::string("cpu")) {
11 | if (learner_type == std::string("serial")) {
12 | return new SerialTreeLearner(config);
13 | } else if (learner_type == std::string("feature")) {
14 | return new FeatureParallelTreeLearner<SerialTreeLearner>(config);
15 | } else if (learner_type == std::string("data")) {
16 | return new DataParallelTreeLearner<SerialTreeLearner>(config);
17 | } else if (learner_type == std::string("voting")) {
18 | return new VotingParallelTreeLearner<SerialTreeLearner>(config);
19 | }
20 | }
21 | else if (device_type == std::string("gpu")) {
22 | if (learner_type == std::string("serial")) {
23 | return new GPUTreeLearner(config);
24 | } else if (learner_type == std::string("feature")) {
25 | return new FeatureParallelTreeLearner<GPUTreeLearner>(config);
26 | } else if (learner_type == std::string("data")) {
27 | return new DataParallelTreeLearner<GPUTreeLearner>(config);
28 | } else if (learner_type == std::string("voting")) {
29 | return new VotingParallelTreeLearner<GPUTreeLearner>(config);
30 | }
31 | }
32 | return nullptr;
33 | }
34 | 
35 | } // namespace LightGBM
36 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.load.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.Booster.R
3 | \name{lgb.load}
4 | \alias{lgb.load}
5 | \title{Load LightGBM model}
6 | \usage{
7 | lgb.load(filename = NULL, model_str = NULL)
8 | }
9 | \arguments{
10 | \item{filename}{path of model file}
11 | 
12 | \item{model_str}{a string containing the model}
13 | }
14 | \value{
15 | lgb.Booster
16 | }
17 | \description{
18 | Load a LightGBM model from a saved model file or string.
19 | \code{lgb.load} takes in either a file path or a model string.
20 | If both are provided, it will default to loading from file.
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 | dtrain,
35 | 100,
36 | valids,
37 | min_data = 1,
38 | learning_rate = 1,
39 | early_stopping_rounds = 10)
40 | lgb.save(model, "model.txt")
41 | load_booster <- lgb.load(filename = "model.txt")
42 | model_string <- model$save_model_to_string(NULL) # saves best iteration
43 | load_booster_from_str <- lgb.load(model_str = model_string)
44 | }
45 | 
46 | }
47 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. LightGBM documentation master file, created by
2 | sphinx-quickstart on Thu May 4 14:30:58 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 | 
6 | Welcome to LightGBM's documentation!
7 | ====================================
8 | 
9 | **LightGBM** is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
10 | 
11 | - Faster training speed and higher efficiency
12 | - Lower memory usage
13 | - Better accuracy
14 | - Parallel and GPU learning supported
15 | - Capable of handling large-scale data
16 | 
17 | For more details, please refer to `Features <./Features.rst>`__.
18 | 
19 | .. toctree::
20 | :maxdepth: 1
21 | :caption: Contents:
22 | 
23 | Installation Guide <Installation-Guide>
24 | Quick Start <Quick-Start>
25 | Python Quick Start <Python-Intro>
26 | Features <Features>
27 | Experiments <Experiments>
28 | Parameters <Parameters>
29 | Parameters Tuning <Parameters-Tuning>
30 | Python API <Python-API>
31 | Parallel Learning Guide <Parallel-Learning-Guide>
32 | GPU Tutorial <GPU-Tutorial>
33 | Advanced Topics <Advanced-Topics>
34 | FAQ <FAQ>
35 | Development Guide <Development-Guide>
36 | 
37 | .. toctree::
38 | :hidden:
39 | 
40 | GPU-Performance
41 | GPU-Targets
42 | GPU-Windows
43 | gcc-Tips
44 | README
45 | 
46 | Indices and Tables
47 | ==================
48 | 
49 | * :ref:`genindex`
50 | 
--------------------------------------------------------------------------------
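The demo that follows trains 1000 boosters in a loop and keeps memory flat with `reset_data = TRUE` plus explicit `gc()` calls. As a rough companion sketch, the trend R itself sees can be watched like this (base R only; note that the issue quoted in the demo measures the OS-level number, which is the one that matters):

```r
mem_used_mb <- function() sum(gc(verbose = FALSE)[, 2])  # total Mb over Ncells/Vcells
before <- mem_used_mb()
# ... train one model as in the demo below ...
after <- mem_used_mb()
cat("delta (Mb):", after - before, "\n")
```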
/R-package/demo/efficient_many_training.R:
--------------------------------------------------------------------------------
1 | # Efficient training means training without giving up too much RAM
2 | # When training many models (like 100+), RAM is eaten very quickly
3 | # It is therefore essential to have a strategy for dealing with this issue
4 | 
5 | # More results can be found here: https://github.com/Microsoft/LightGBM/issues/879#issuecomment-326656580
6 | # Quote: "@Laurae2 Thanks for nice easily reproducible example (unlike mine).
7 | # With reset=FALSE you get after 500 iterations (not 1000): OS reports 27GB usage, while R gc() reports 1.5GB.
8 | # Just doing reset=TRUE will already improve things: OS reports 4.6GB.
9 | # Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip."
10 | 
11 | # Load library
12 | library(lightgbm)
13 | 
14 | # Generate synthetic data of size 1M x 100
15 | set.seed(11111)
16 | x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100)
17 | y_data <- rnorm(n = 1000000, mean = 0, sd = 5)
18 | 
19 | # Create lgb.Dataset for training
20 | data <- lgb.Dataset(x_data, label = y_data)
21 | data$construct()
22 | 
23 | # Loop through a training of 1000 models, watching RAM in your task manager
24 | # It must remain roughly constant (or increase only very slightly)
25 | gbm <- list()
26 | 
27 | for (i in 1:1000) {
28 | print(i)
29 | gbm[[i]] <- lgb.train(params = list(objective = "regression"),
30 | data = data,
31 | 1,
32 | reset_data = TRUE)
33 | gc(verbose = FALSE)
34 | }
35 | 
--------------------------------------------------------------------------------
/windows/LightGBM.sln:
--------------------------------------------------------------------------------
1 | 
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25420.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug_mpi|x64 = Debug_mpi|x64
11 | Debug|x64 = Debug|x64
12 | DLL|x64 = DLL|x64
13 | Release_mpi|x64 = Release_mpi|x64
14 | Release|x64 = Release|x64
15 | EndGlobalSection
16 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
17 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64
18 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64
19 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64
20 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64
21 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.ActiveCfg = DLL|x64
22 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.DLL|x64.Build.0 = DLL|x64
23 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64
24 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64
25 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64
26 | {F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64
27 | EndGlobalSection
28 | GlobalSection(SolutionProperties) = preSolution
29 | HideSolutionNode = FALSE
30 | EndGlobalSection
31 | EndGlobal
32 | 
--------------------------------------------------------------------------------
/.travis/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [[ $TRAVIS_OS_NAME == "osx" ]]; then
4 | sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3" # fix "fatal error: _stdio.h: No such file or directory"
5 | rm '/usr/local/include/c++'
6 | # brew cask uninstall oclint # reserve variant to deal with conflict link
7 | if [[ $TASK == "mpi" ]]; then
8 | brew install open-mpi
9 | else
10 | brew install gcc
11 | fi
12 | # brew link --overwrite gcc # previous variant to deal with conflict link
13 | wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda${PYTHON_VERSION:0:1}-latest-MacOSX-x86_64.sh
14 | else
15 | if [[ $TASK == "mpi" ]]; then
16 | sudo apt-get install -y libopenmpi-dev openmpi-bin
17 | fi
18 | if [[ $TASK == "gpu" ]]; then
19 | sudo apt-get install -y ocl-icd-opencl-dev
20 | fi
21 | wget -O conda.sh 
https://repo.continuum.io/miniconda/Miniconda${PYTHON_VERSION:0:1}-latest-Linux-x86_64.sh 22 | fi 23 | 24 | sh conda.sh -b -p $HOME/miniconda 25 | conda config --set always_yes yes --set changeps1 no 26 | conda update -q conda 27 | 28 | if [[ $TASK == "gpu" ]] && [[ $TRAVIS_OS_NAME == "linux" ]]; then 29 | wget https://github.com/Microsoft/LightGBM/releases/download/v2.0.12/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 30 | tar -xjf AMD-APP-SDK*.tar.bz2 31 | mkdir -p $OPENCL_VENDOR_PATH 32 | sh AMD-APP-SDK*.sh --tar -xf -C $AMDAPPSDK 33 | mv $AMDAPPSDK/lib/x86_64/sdk/* $AMDAPPSDK/lib/x86_64/ 34 | echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd 35 | fi 36 | -------------------------------------------------------------------------------- /examples/python-guide/simple_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | from sklearn.metrics import mean_squared_error 6 | 7 | 8 | # load or create your dataset 9 | print('Load data...') 10 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 11 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 12 | 13 | y_train = df_train[0].values 14 | y_test = df_test[0].values 15 | X_train = df_train.drop(0, axis=1).values 16 | X_test = df_test.drop(0, axis=1).values 17 | 18 | # create dataset for lightgbm 19 | lgb_train = lgb.Dataset(X_train, y_train) 20 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 21 | 22 | # specify your configurations as a dict 23 | params = { 24 | 'task': 'train', 25 | 'boosting_type': 'gbdt', 26 | 'objective': 'regression', 27 | 'metric': {'l2', 'auc'}, 28 | 'num_leaves': 31, 29 | 'learning_rate': 0.05, 30 | 'feature_fraction': 0.9, 31 | 'bagging_fraction': 0.8, 32 | 'bagging_freq': 5, 33 | 'verbose': 0 34 | } 35 | 36 | print('Start training...') 37 | # train 38 | gbm = lgb.train(params, 39 | lgb_train, 40 | num_boost_round=20, 41 | valid_sets=lgb_eval, 42 | early_stopping_rounds=5) 43 | 44 | print('Save model...') 45 | # save model to file 46 | gbm.save_model('model.txt') 47 | 48 | print('Start predicting...') 49 | # predict 50 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 51 | # eval 52 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 53 | -------------------------------------------------------------------------------- /python-package/lightgbm/libpath.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Find the path to lightgbm dynamic library files.""" 3 | import os 4 | 5 | from platform import system 6 | 7 | 8 | def find_lib_path(): 9 | """Find the path to LightGBM library files. 
10 | Returns 11 | ------- 12 | lib_path: list(string) 13 | List of all found library path to LightGBM 14 | """ 15 | if os.environ.get('LIGHTGBM_BUILD_DOC', False): 16 | # we don't need lib_lightgbm while building docs 17 | return [] 18 | 19 | curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 20 | dll_path = [curr_path, os.path.join(curr_path, '../../'), 21 | os.path.join(curr_path, 'compile'), 22 | os.path.join(curr_path, '../compile'), 23 | os.path.join(curr_path, '../../lib/')] 24 | if system() in ('Windows', 'Microsoft'): 25 | dll_path.append(os.path.join(curr_path, '../compile/Release/')) 26 | dll_path.append(os.path.join(curr_path, '../compile/windows/x64/DLL/')) 27 | dll_path.append(os.path.join(curr_path, '../../Release/')) 28 | dll_path.append(os.path.join(curr_path, '../../windows/x64/DLL/')) 29 | dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path] 30 | else: 31 | dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path] 32 | lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] 33 | if not lib_path: 34 | dll_path = [os.path.realpath(p) for p in dll_path] 35 | raise Exception('Cannot find lightgbm library in following paths: ' + '\n'.join(dll_path)) 36 | return lib_path 37 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_parameters.R: -------------------------------------------------------------------------------- 1 | data(agaricus.train, package='lightgbm') 2 | data(agaricus.test, package='lightgbm') 3 | train <- agaricus.train 4 | test <- agaricus.test 5 | 6 | test_that("Feature penalties work properly", { 7 | # Fit a series of models with varying penalty on most important variable 8 | var_name <- "odor=none" 9 | var_index <- which(train$data@Dimnames[[2]] == var_name) 10 | 11 | bst <- lapply(seq(1, 0, by = -0.1), function(x) { 12 | feature_penalties <- rep(1, ncol(train$data)) 13 | feature_penalties[var_index] <- x 14 | lightgbm( 15 | data = train$data, 16 | label = train$label, 17 | num_leaves = 5, 18 | learning_rate = 0.05, 19 | nrounds = 20, 20 | objective = "binary", 21 | feature_penalty = paste0(feature_penalties, collapse = ","), 22 | metric="binary_error", 23 | verbose = -1 24 | ) 25 | }) 26 | 27 | var_gain <- lapply(bst, function(x) lgb.importance(x)[Feature == var_name, Gain]) 28 | var_cover <- lapply(bst, function(x) lgb.importance(x)[Feature == var_name, Cover]) 29 | var_freq <- lapply(bst, function(x) lgb.importance(x)[Feature == var_name, Frequency]) 30 | 31 | # Ensure that feature gain, cover, and frequency decreases with stronger penalties 32 | expect_true(all(diff(unlist(var_gain)) <= 0)) 33 | expect_true(all(diff(unlist(var_cover)) <= 0)) 34 | expect_true(all(diff(unlist(var_freq)) <= 0)) 35 | 36 | expect_lt(min(diff(unlist(var_gain))), 0) 37 | expect_lt(min(diff(unlist(var_cover))), 0) 38 | expect_lt(min(diff(unlist(var_freq))), 0) 39 | 40 | # Ensure that feature is not used when feature_penalty = 0 41 | expect_length(var_gain[[length(var_gain)]], 0) 42 | }) -------------------------------------------------------------------------------- /examples/multiclass_classification/train.conf: -------------------------------------------------------------------------------- 1 | # task type, support train and predict 2 | task = train 3 | 4 | # boosting type, support gbdt for now, alias: boosting, boost 5 | boosting_type = gbdt 6 | 7 | # application type, support following application 8 | # regression , regression task 9 | # binary , 
binary classification task
10 | # lambdarank , lambdarank task
11 | # multiclass
12 | # alias: application, app
13 | objective = multiclass
14 | 
15 | # eval metrics, support multi metric, delimited by ',' , support following metrics
16 | # l1
17 | # l2 , default metric for regression
18 | # ndcg , default metric for lambdarank
19 | # auc
20 | # binary_logloss , default metric for binary
21 | # binary_error
22 | # multi_logloss
23 | # multi_error
24 | metric = multi_logloss
25 | 
26 | # number of classes, for multiclass classification
27 | num_class = 5
28 | 
29 | # frequency for metric output
30 | metric_freq = 1
31 | 
32 | # true if you need to output metric for training data, alias: training_metric, train_metric
33 | is_training_metric = true
34 | 
35 | # number of bins for feature bucket, 255 is a recommended setting; it can save memory and also has good accuracy
36 | max_bin = 255
37 | 
38 | # training data
39 | # if a weight file exists, it should be named "multiclass.train.weight"
40 | # alias: train_data, train
41 | data = multiclass.train
42 | 
43 | # valid data
44 | valid_data = multiclass.test
45 | 
46 | # rounds for early stopping
47 | early_stopping = 10
48 | 
49 | # number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
50 | num_trees = 100
51 | 
52 | # shrinkage rate, alias: shrinkage_rate
53 | learning_rate = 0.05
54 | 
55 | # number of leaves for one tree, alias: num_leaf
56 | num_leaves = 31
57 | 
--------------------------------------------------------------------------------
/examples/lambdarank/rank.train.query:
--------------------------------------------------------------------------------
1 | 1
2 | 13
3 | 5
4 | 8
5 | 19
6 | 12
7 | 18
8 | 5
9 | 14
10 | 13
11 | 8
12 | 9
13 | 16
14 | 11
15 | 21
16 | 14
17 | 21
18 | 9
19 | 14
20 | 11
21 | 20
22 | 18
23 | 13
24 | 20
25 | 22
26 | 22
27 | 13
28 | 17
29 | 10
30 | 13
31 | 12
32 | 13
33 | 13
34 | 23
35 | 18
36 | 13
37 | 20
38 | 12
39 | 22
40 | 14
41 | 13
42 | 23
43 | 13
44 | 14
45 | 14
46 | 5
47 | 13
48 | 15
49 | 14
50 | 14
51 | 16
52 | 16
53 | 15
54 | 21
55 | 22
56 | 10
57 | 22
58 | 18
59 | 25
60 | 16
61 | 12
62 | 12
63 | 15
64 | 15
65 | 25
66 | 13
67 | 9
68 | 12
69 | 8
70 | 16
71 | 25
72 | 19
73 | 24
74 | 12
75 | 16
76 | 10
77 | 16
78 | 9
79 | 17
80 | 15
81 | 7
82 | 9
83 | 15
84 | 14
85 | 16
86 | 17
87 | 8
88 | 17
89 | 12
90 | 18
91 | 23
92 | 10
93 | 12
94 | 12
95 | 4
96 | 14
97 | 12
98 | 15
99 | 27
100 | 16
101 | 20
102 | 13
103 | 19
104 | 13
105 | 17
106 | 17
107 | 16
108 | 12
109 | 15
110 | 14
111 | 14
112 | 19
113 | 12
114 | 23
115 | 18
116 | 16
117 | 9
118 | 23
119 | 11
120 | 15
121 | 8
122 | 10
123 | 10
124 | 16
125 | 11
126 | 15
127 | 22
128 | 16
129 | 17
130 | 23
131 | 16
132 | 22
133 | 17
134 | 14
135 | 12
136 | 14
137 | 20
138 | 15
139 | 17
140 | 15
141 | 15
142 | 22
143 | 9
144 | 21
145 | 9
146 | 17
147 | 16
148 | 15
149 | 13
150 | 13
151 | 15
152 | 14
153 | 18
154 | 21
155 | 14
156 | 17
157 | 15
158 | 14
159 | 16
160 | 12
161 | 17
162 | 19
163 | 16
164 | 11
165 | 18
166 | 11
167 | 13
168 | 14
169 | 9
170 | 16
171 | 15
172 | 16
173 | 25
174 | 9
175 | 13
176 | 22
177 | 16
178 | 18
179 | 20
180 | 14
181 | 11
182 | 9
183 | 16
184 | 19
185 | 19
186 | 11
187 | 11
188 | 13
189 | 14
190 | 14
191 | 13
192 | 16
193 | 6
194 | 21
195 | 16
196 | 12
197 | 16
198 | 11
199 | 24
200 | 12
201 | 10
202 | 
--------------------------------------------------------------------------------
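Each line of the query file above is the size of one query group: the first row of rank.train forms query 1, the next 13 rows form query 2, and so on. A quick consistency check, illustrative and meant to be run next to the data files:

```python
with open('rank.train.query') as f:
    group_sizes = [int(line) for line in f if line.strip()]
with open('rank.train') as f:
    n_rows = sum(1 for _ in f)
assert sum(group_sizes) == n_rows  # the group sizes must cover every training row
```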
/R-package/R/readRDS.lgb.Booster.R:
--------------------------------------------------------------------------------
1 | #' readRDS for lgb.Booster models
2 | #'
3 | #' Attempts to load a model using RDS.
4 | #'
5 | #' @param file a connection or the name of the file where the R object is saved to or read from.
6 | #' @param refhook a hook function for handling reference objects.
7 | #'
8 | #' @return lgb.Booster.
9 | #'
10 | #' @examples
11 | #' \dontrun{
12 | #' library(lightgbm)
13 | #' data(agaricus.train, package = "lightgbm")
14 | #' train <- agaricus.train
15 | #' dtrain <- lgb.Dataset(train$data, label = train$label)
16 | #' data(agaricus.test, package = "lightgbm")
17 | #' test <- agaricus.test
18 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
19 | #' params <- list(objective = "regression", metric = "l2")
20 | #' valids <- list(test = dtest)
21 | #' model <- lgb.train(params,
22 | #' dtrain,
23 | #' 100,
24 | #' valids,
25 | #' min_data = 1,
26 | #' learning_rate = 1,
27 | #' early_stopping_rounds = 10)
28 | #' saveRDS.lgb.Booster(model, "model.rds")
29 | #' new_model <- readRDS.lgb.Booster("model.rds")
30 | #' }
31 | #'
32 | #' @export
33 | readRDS.lgb.Booster <- function(file = "", refhook = NULL) {
34 | 
35 | # Read RDS file
36 | object <- readRDS(file = file, refhook = refhook)
37 | 
38 | # Check if object has the model stored
39 | if (!is.na(object$raw)) {
40 | 
41 | # Create temporary model for the model loading
42 | object2 <- lgb.load(model_str = object$raw)
43 | 
44 | # Restore best iteration and recorded evaluations
45 | object2$best_iter <- object$best_iter
46 | object2$record_evals <- object$record_evals
47 | 
48 | # Return newly loaded object
49 | return(object2)
50 | 
51 | } else {
52 | 
53 | # Return RDS loaded object
54 | return(object)
55 | 
56 | }
57 | 
58 | }
59 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.plot.importance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.plot.importance.R
3 | \name{lgb.plot.importance}
4 | \alias{lgb.plot.importance}
5 | \title{Plot feature importance as a bar graph}
6 | \usage{
7 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain",
8 | left_margin = 10, cex = NULL)
9 | }
10 | \arguments{
11 | \item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.}
12 | 
13 | \item{top_n}{maximal number of top features to include in the plot.}
14 | 
15 | \item{measure}{the name of the importance measure to plot, can be "Gain", "Cover" or "Frequency".}
16 | 
17 | \item{left_margin}{(base R barplot) allows adjusting the left margin size to fit feature names.}
18 | 
19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.}
20 | }
21 | \value{
22 | The \code{lgb.plot.importance} function creates a \code{barplot}
23 | and silently returns a processed data.table with \code{top_n} features sorted by defined importance.
24 | }
25 | \description{
26 | Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.
27 | }
28 | \details{
29 | The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature.
30 | Features are shown ranked in a decreasing importance order. 
31 | }
32 | \examples{
33 | \dontrun{
34 | data(agaricus.train, package = "lightgbm")
35 | train <- agaricus.train
36 | dtrain <- lgb.Dataset(train$data, label = train$label)
37 | 
38 | params = list(objective = "binary",
39 | learning_rate = 0.01, num_leaves = 63, max_depth = -1,
40 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
41 | model <- lgb.train(params, dtrain, 20)
42 | 
43 | tree_imp <- lgb.importance(model, percentage = TRUE)
44 | lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain")
45 | }
46 | }
47 | 
--------------------------------------------------------------------------------
/R-package/man/lgb.interprete.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.interprete.R
3 | \name{lgb.interprete}
4 | \alias{lgb.interprete}
5 | \title{Compute feature contribution of prediction}
6 | \usage{
7 | lgb.interprete(model, data, idxset, num_iteration = NULL)
8 | }
9 | \arguments{
10 | \item{model}{object of class \code{lgb.Booster}.}
11 | 
12 | \item{data}{a matrix object or a dgCMatrix object.}
13 | 
14 | \item{idxset}{an integer vector of indices of rows needed.}
15 | 
16 | \item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use best iteration.}
17 | }
18 | \value{
19 | For regression, binary classification and lambdarank models, a \code{list} of \code{data.table} with the following columns:
20 | \itemize{
21 | \item \code{Feature} Feature names in the model.
22 | \item \code{Contribution} The total contribution of this feature's splits.
23 | }
24 | For multiclass classification, a \code{list} of \code{data.table} with the Feature column and Contribution columns to each class.
25 | }
26 | \description{
27 | Computes feature contribution components of raw-score prediction. 
28 | }
29 | \examples{
30 | \dontrun{
31 | library(lightgbm)
32 | Sigmoid <- function(x) 1 / (1 + exp(-x))
33 | Logit <- function(x) log(x / (1 - x))
34 | data(agaricus.train, package = "lightgbm")
35 | train <- agaricus.train
36 | dtrain <- lgb.Dataset(train$data, label = train$label)
37 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)))
38 | data(agaricus.test, package = "lightgbm")
39 | test <- agaricus.test
40 | 
41 | params = list(objective = "binary",
42 | learning_rate = 0.01, num_leaves = 63, max_depth = -1,
43 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
44 | model <- lgb.train(params, dtrain, 20)
45 | 
46 | tree_interpretation <- lgb.interprete(model, test$data, 1:5)
47 | }
48 | 
49 | }
50 | 
--------------------------------------------------------------------------------
/R-package/demo/cross_validation.R:
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 | # load in the agaricus dataset
3 | data(agaricus.train, package = "lightgbm")
4 | data(agaricus.test, package = "lightgbm")
5 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
6 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
7 | 
8 | nrounds <- 2
9 | param <- list(num_leaves = 4,
10 | learning_rate = 1,
11 | objective = "binary")
12 | 
13 | print("Running cross validation")
14 | # Do cross validation, this will print result out as
15 | # [iteration] metric_name:mean_value+std_value
16 | # std_value is standard deviation of the metric
17 | lgb.cv(param,
18 | dtrain,
19 | nrounds,
20 | nfold = 5,
21 | eval = "binary_error")
22 | 
23 | print("Running cross validation, disable standard deviation display")
24 | # do cross validation, this will print result out as
25 | # [iteration] metric_name:mean_value+std_value
26 | # std_value is standard deviation of the metric
27 | lgb.cv(param,
28 | dtrain,
29 | nrounds,
30 | nfold = 5,
31 | eval = "binary_error",
32 | showsd = FALSE)
33 | 
34 | # You can also do cross validation with a customized loss function
35 | print("Running cross validation, with customized loss function")
36 | 
37 | logregobj <- function(preds, dtrain) {
38 | labels <- getinfo(dtrain, "label")
39 | preds <- 1 / (1 + exp(-preds))
40 | grad <- preds - labels
41 | hess <- preds * (1 - preds)
42 | return(list(grad = grad, hess = hess))
43 | }
44 | evalerror <- function(preds, dtrain) {
45 | labels <- getinfo(dtrain, "label")
46 | err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
47 | return(list(name = "error", value = err, higher_better = FALSE))
48 | }
49 | 
50 | # train with customized objective
51 | lgb.cv(params = param,
52 | data = dtrain,
53 | nrounds = nrounds,
54 | obj = logregobj,
55 | eval = evalerror,
56 | nfold = 5)
57 | 
--------------------------------------------------------------------------------
/R-package/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: lightgbm
2 | Type: Package
3 | Title: Light Gradient Boosting Machine
4 | Version: 2.1.2
5 | Date: 2018-06-22
6 | Authors@R: c(
7 | person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")),
8 | person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")),
9 | person("Yachen", "Yan", role = c("ctb")),
10 | person("James", "Lamb", role = c("ctb"))
11 | )
12 | Description: Tree-based algorithms can be improved by introducing boosting frameworks. 
LightGBM is one such framework, and this package offers an R interface to work with it.
13 | It is designed to be distributed and efficient with the following advantages:
14 | 1. Faster training speed and higher efficiency.
15 | 2. Lower memory usage.
16 | 3. Better accuracy.
17 | 4. Parallel learning supported.
18 | 5. Capable of handling large-scale data.
19 | In recognition of these advantages, LightGBM has been widely used in many winning solutions of machine learning competitions.
20 | Comparison experiments on public datasets suggest that LightGBM can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. In addition, parallel experiments suggest that in certain circumstances, LightGBM can achieve a linear speed-up in training time by using multiple machines.
21 | License: MIT + file LICENSE
22 | URL: https://github.com/Microsoft/LightGBM
23 | BugReports: https://github.com/Microsoft/LightGBM/issues
24 | VignetteBuilder: knitr
25 | Suggests:
26 | knitr,
27 | rmarkdown,
28 | ggplot2 (>= 1.0.1),
29 | DiagrammeR (>= 0.8.1),
30 | Ckmeans.1d.dp (>= 3.3.1),
31 | vcd (>= 1.3),
32 | testthat,
33 | igraph (>= 1.0.1),
34 | stringi (>= 0.5.2)
35 | Depends:
36 | R (>= 3.0),
37 | R6 (>= 2.0)
38 | Imports:
39 | graphics,
40 | methods,
41 | Matrix (>= 1.1-0),
42 | data.table (>= 1.9.6),
43 | magrittr (>= 1.5),
44 | jsonlite (>= 1.0)
45 | RoxygenNote: 6.0.1
46 | 
--------------------------------------------------------------------------------
/examples/python-guide/plot_example.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # pylint: disable = invalid-name, C0111
3 | import lightgbm as lgb
4 | import pandas as pd
5 | 
6 | if lgb.compat.MATPLOTLIB_INSTALLED:
7 | import matplotlib.pyplot as plt
8 | else:
9 | raise ImportError('You need to install matplotlib for plot_example.py.')
10 | 
11 | # load or create your dataset
12 | print('Load data...')
13 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
14 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
15 | 
16 | y_train = df_train[0].values
17 | y_test = df_test[0].values
18 | X_train = df_train.drop(0, axis=1).values
19 | X_test = df_test.drop(0, axis=1).values
20 | 
21 | # create dataset for lightgbm
22 | lgb_train = lgb.Dataset(X_train, y_train)
23 | lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
24 | 
25 | # specify your configurations as a dict
26 | params = {
27 | 'num_leaves': 5,
28 | 'metric': ('l1', 'l2'),
29 | 'verbose': 0
30 | }
31 | 
32 | evals_result = {}  # to record eval results for plotting
33 | 
34 | print('Start training...')
35 | # train
36 | gbm = lgb.train(params,
37 | lgb_train,
38 | num_boost_round=100,
39 | valid_sets=[lgb_train, lgb_test],
40 | feature_name=['f' + str(i + 1) for i in range(28)],
41 | categorical_feature=[21],
42 | evals_result=evals_result,
43 | verbose_eval=10)
44 | 
45 | print('Plot metrics recorded during training...')
46 | ax = lgb.plot_metric(evals_result, metric='l1')
47 | plt.show()
48 | 
49 | print('Plot feature importances...')
50 | ax = lgb.plot_importance(gbm, max_num_features=10)
51 | plt.show()
52 | 
53 | print('Plot 84th tree...')  # one tree uses a categorical feature to split
54 | ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
55 | plt.show()
56 | 
57 | print('Plot 84th tree with graphviz...')
58 | graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
59 | graph.render(view=True)
60 | 
--------------------------------------------------------------------------------
/include/LightGBM/meta.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_META_H_
2 | #define LIGHTGBM_META_H_
3 | 
4 | #include <cstdint>
5 | 
6 | #include <limits>
7 | #include <functional>
8 | #include <memory>
9 | #include <vector>
10 | 
11 | namespace LightGBM {
12 | 
13 | /*! \brief Type of data size, it is better to use signed type*/
14 | typedef int32_t data_size_t;
15 | 
16 | // Enable the following macro to use double for score_t
17 | // #define SCORE_T_USE_DOUBLE
18 | 
19 | // Enable the following macro to use double for label_t
20 | // #define LABEL_T_USE_DOUBLE
21 | 
22 | /*! \brief Type of score, and gradients */
23 | #ifdef SCORE_T_USE_DOUBLE
24 | typedef double score_t;
25 | #else
26 | typedef float score_t;
27 | #endif
28 | 
29 | /*! \brief Type of metadata, include weight and label */
30 | #ifdef LABEL_T_USE_DOUBLE
31 | typedef double label_t;
32 | #else
33 | typedef float label_t;
34 | #endif
35 | 
36 | const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
37 | 
38 | const score_t kEpsilon = 1e-15f;
39 | 
40 | const double kZeroThreshold = 1e-35f;
41 | 
42 | 
43 | typedef int32_t comm_size_t;
44 | 
45 | using PredictFunction =
46 | std::function<void(int row_idx, const std::vector<std::pair<int, double>>&, double* output)>;
47 | 
48 | typedef void(*ReduceFunction)(const char* input, char* output, int type_size, comm_size_t array_size);
49 | 
50 | 
51 | typedef void(*ReduceScatterFunction)(char* input, comm_size_t input_size, int type_size,
52 | const comm_size_t* block_start, const comm_size_t* block_len, int num_block, char* output, comm_size_t output_size,
53 | const ReduceFunction& reducer);
54 | 
55 | typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm_size_t* block_start,
56 | const comm_size_t* block_len, int num_block, char* output, comm_size_t output_size);
57 | 
58 | 
59 | #define NO_SPECIFIC (-1)
60 | 
61 | #if (_MSC_VER <= 1800)
62 | #define __func__ __FUNCTION__
63 | #endif
64 | 
65 | } // namespace LightGBM
66 | 
67 | #endif // LIGHTGBM_META_H_
68 | 
--------------------------------------------------------------------------------
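The macros described in meta.h are compile-time switches; enabling one is a build-flag decision. A static check that documents the effect (a sketch; the compile command and file name are illustrative):

```cpp
// Build e.g.: g++ -std=c++11 -DSCORE_T_USE_DOUBLE -Iinclude -c check_score_t.cpp
#include <LightGBM/meta.h>

#include <type_traits>

#ifdef SCORE_T_USE_DOUBLE
static_assert(std::is_same<LightGBM::score_t, double>::value, "score_t is double");
#else
static_assert(std::is_same<LightGBM::score_t, float>::value, "score_t is float");
#endif
```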
/R-package/man/lgb.unloader.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lgb.unloader.R
3 | \name{lgb.unloader}
4 | \alias{lgb.unloader}
5 | \title{LightGBM unloading error fix}
6 | \usage{
7 | lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv)
8 | }
9 | \arguments{
10 | \item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE}, which means \code{LightGBM} is automatically reloaded once unloading is performed.}
11 | 
12 | \item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} objects from the global environment. Defaults to \code{FALSE}, which means they are not removed.}
13 | 
14 | \item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv}, which is the global environment.}
15 | }
16 | \value{
17 | NULL invisibly.
18 | }
19 | \description{
20 | Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful, for instance, if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
21 | }
22 | \examples{
23 | \dontrun{
24 | library(lightgbm)
25 | data(agaricus.train, package = "lightgbm")
26 | train <- agaricus.train
27 | dtrain <- lgb.Dataset(train$data, label = train$label)
28 | data(agaricus.test, package = "lightgbm")
29 | test <- agaricus.test
30 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
31 | params <- list(objective = "regression", metric = "l2")
32 | valids <- list(test = dtest)
33 | model <- lgb.train(params,
34 | dtrain,
35 | 100,
36 | valids,
37 | min_data = 1,
38 | learning_rate = 1,
39 | early_stopping_rounds = 10)
40 | lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
41 | rm(model, dtrain, dtest) # Not needed if wipe = TRUE
42 | gc() # Not needed if wipe = TRUE
43 | 
44 | library(lightgbm)
45 | # Do whatever you want again with LightGBM without object clashing
46 | }
47 | 
48 | }
49 | 
--------------------------------------------------------------------------------
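The next header exists because exceptions must not escape an OpenMP parallel region; its OMP_* macros capture the first exception raised inside a loop and rethrow it on the calling thread. A minimal usage sketch, modeled on how threading.h drives the same macros:

```cpp
#include <LightGBM/utils/openmp_wrapper.h>

#include <stdexcept>
#include <vector>

void CheckAllNonNegative(const std::vector<int>& v) {
  OMP_INIT_EX();                       // declares the shared exception holder
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < static_cast<int>(v.size()); ++i) {
    OMP_LOOP_EX_BEGIN();               // opens the per-iteration try block
    if (v[i] < 0) { throw std::runtime_error("negative value"); }
    OMP_LOOP_EX_END();                 // catches and stores the first exception
  }
  OMP_THROW_EX();                      // rethrows outside the parallel region
}
```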
/include/LightGBM/utils/openmp_wrapper.h:
--------------------------------------------------------------------------------
1 | #ifndef LIGHTGBM_OPENMP_WRAPPER_H_
2 | #define LIGHTGBM_OPENMP_WRAPPER_H_
3 | #ifdef _OPENMP
4 | 
5 | #include <omp.h>
6 | #include <exception>
7 | #include <stdexcept>
8 | #include <mutex>
9 | #include <vector>
10 | #include <memory>
11 | #include "log.h"
12 | 
13 | class ThreadExceptionHelper {
14 | public:
15 | ThreadExceptionHelper() {
16 | ex_ptr_ = nullptr;
17 | }
18 | 
19 | ~ThreadExceptionHelper() {
20 | ReThrow();
21 | }
22 | void ReThrow() {
23 | if (ex_ptr_ != nullptr) {
24 | std::rethrow_exception(ex_ptr_);
25 | }
26 | }
27 | void CaptureException() {
28 | // only catch first exception.
29 | if (ex_ptr_ != nullptr) { return; }
30 | std::unique_lock<std::mutex> guard(lock_);
31 | if (ex_ptr_ != nullptr) { return; }
32 | ex_ptr_ = std::current_exception();
33 | }
34 | private:
35 | std::exception_ptr ex_ptr_;
36 | std::mutex lock_;
37 | };
38 | 
39 | #define OMP_INIT_EX() ThreadExceptionHelper omp_except_helper
40 | #define OMP_LOOP_EX_BEGIN() try {
41 | 
42 | #define OMP_LOOP_EX_END() } \
43 | catch(std::exception& ex) { Log::Warning(ex.what()); omp_except_helper.CaptureException(); } \
44 | catch(...) { omp_except_helper.CaptureException(); }
45 | #define OMP_THROW_EX() omp_except_helper.ReThrow()
46 | 
47 | #else
48 | 
49 | #ifdef _MSC_VER
50 | #pragma warning( disable : 4068 ) // disable unknown pragma warning
51 | #endif
52 | 
53 | #ifdef __cplusplus
54 | extern "C" {
55 | #endif
56 | /** Fall here if no OpenMP support, so just
57 | simulate a single thread running.
58 | All #pragma omp directives should be ignored by the compiler **/
59 | inline void omp_set_num_threads(int) {}
60 | inline void omp_set_nested(int) {}
61 | inline int omp_get_num_threads() {return 1;}
62 | inline int omp_get_thread_num() {return 0;}
63 | #ifdef __cplusplus
64 | }; // extern "C"
65 | #endif
66 | 
67 | #define OMP_INIT_EX()
68 | #define OMP_LOOP_EX_BEGIN()
69 | #define OMP_LOOP_EX_END()
70 | #define OMP_THROW_EX()
71 | 
72 | #endif
73 | 
74 | 
75 | 
76 | #endif /* LIGHTGBM_OPENMP_WRAPPER_H_ */
77 | 
--------------------------------------------------------------------------------
/R-package/demo/early_stopping.R:
--------------------------------------------------------------------------------
1 | require(lightgbm)
2 | require(methods)
3 | 
4 | # Load in the agaricus dataset
5 | data(agaricus.train, package = "lightgbm")
6 | data(agaricus.test, package = "lightgbm")
7 | 
8 | dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
9 | dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
10 | 
11 | # Note: for a customized objective function, we leave objective as default
12 | # Note: what we get in prediction is the margin value
13 | # You must know what you are doing
14 | param <- list(num_leaves = 4,
15 | learning_rate = 1)
16 | valids <- list(eval = dtest)
17 | num_round <- 20
18 | 
19 | # User-defined objective function: given predictions, return gradient and second-order gradient
20 | # This is log-likelihood loss
21 | logregobj <- function(preds, dtrain) {
22 | labels <- getinfo(dtrain, "label")
23 | preds <- 1 / (1 + exp(-preds))
24 | grad <- preds - labels
25 | hess <- preds * (1 - preds)
26 | return(list(grad = grad, hess = hess))
27 | }
28 | 
29 | # User-defined evaluation function: returns metric_name, result, higher_better
30 | # NOTE: when you use a customized loss function, the default prediction value is the margin
31 | # This may make the built-in evaluation metric not function properly
32 | # For example, we are doing logistic loss, so the prediction is the score before the logistic transformation
33 | # The built-in evaluation error assumes the input is after the logistic transformation
34 | # Keep this in mind when you use the customization; you may need to write a customized evaluation function
35 | evalerror <- function(preds, dtrain) {
36 | labels <- getinfo(dtrain, "label")
37 | err <- as.numeric(sum(labels != (preds > 0.5))) / length(labels)
38 | return(list(name = "error", value = err, higher_better = FALSE))
39 | }
40 | print("Start training with early stopping setting")
41 | 
42 | bst <- lgb.train(param,
43 | dtrain,
44 | num_round,
45 | valids,
46 | objective = logregobj,
47 | eval = evalerror,
48 | early_stopping_round = 3)
49 | 
--------------------------------------------------------------------------------
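As the comments in the demo above stress, a booster trained with a custom objective predicts raw margins. Converting them to probabilities is therefore a manual step; continuing from the demo's objects:

```r
preds_margin <- predict(bst, agaricus.test$data)  # margins under the custom objective
preds_prob <- 1 / (1 + exp(-preds_margin))        # apply the sigmoid yourself
```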
19 | * \brief Append buffer to file 20 | * \param data Buffer to write from 21 | * \param bytes Number of bytes to write from buffer 22 | * \return Number of bytes written 23 | */ 24 | virtual size_t Write(const void* data, size_t bytes) const = 0; 25 | /*! 26 | * \brief Create appropriate writer for filename 27 | * \param filename Filename of the data 28 | * \return File writer instance 29 | */ 30 | static std::unique_ptr<VirtualFileWriter> Make(const std::string& filename); 31 | /*! 32 | * \brief Check filename existence 33 | * \param filename Filename of the data 34 | * \return True when the file exists 35 | */ 36 | static bool Exists(const std::string& filename); 37 | }; 38 | 39 | /** 40 | * \brief An interface for reading files into buffers 41 | */ 42 | struct VirtualFileReader { 43 | /*! 44 | * \brief Destructor 45 | * (instances are created via the Make() factory below) 46 | */ 47 | virtual ~VirtualFileReader() {}; 48 | /*! 49 | * \brief Initialize the reader 50 | * \return True when the file is available for read 51 | */ 52 | virtual bool Init() = 0; 53 | /*! 54 | * \brief Read data into buffer 55 | * \param buffer Buffer to read data into 56 | * \param bytes Number of bytes to read 57 | * \return Number of bytes read 58 | */ 59 | virtual size_t Read(void* buffer, size_t bytes) const = 0; 60 | /*! 61 | * \brief Create appropriate reader for filename 62 | * \param filename Filename of the data 63 | * \return File reader instance 64 | */ 65 | static std::unique_ptr<VirtualFileReader> Make(const std::string& filename); 66 | }; 67 | 68 | } // namespace LightGBM 69 | 70 | #endif // LightGBM_UTILS_FILE_IO_H_ 71 | -------------------------------------------------------------------------------- /R-package/man/lgb.plot.interpretation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.plot.interpretation.R 3 | \name{lgb.plot.interpretation} 4 | \alias{lgb.plot.interpretation} 5 | \title{Plot feature contribution as a bar graph} 6 | \usage{ 7 | lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1, 8 | left_margin = 10, cex = NULL) 9 | } 10 | \arguments{ 11 | \item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.} 12 | 13 | \item{top_n}{maximal number of top features to include in the plot.} 14 | 15 | \item{cols}{the number of columns in the layout; used only for multiclass classification feature contributions.} 16 | 17 | \item{left_margin}{(base R barplot) allows adjusting the left margin size to fit feature names.} 18 | 19 | \item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.} 20 | } 21 | \value{ 22 | The \code{lgb.plot.interpretation} function creates a \code{barplot}. 23 | } 24 | \description{ 25 | Plot previously calculated feature contribution as a bar graph. 26 | } 27 | \details{ 28 | The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. 29 | Features are shown ranked in a decreasing contribution order.
30 | } 31 | \examples{ 32 | \dontrun{ 33 | library(lightgbm) 34 | Sigmoid <- function(x) {1 / (1 + exp(-x))} 35 | Logit <- function(x) {log(x / (1 - x))} 36 | data(agaricus.train, package = "lightgbm") 37 | train <- agaricus.train 38 | dtrain <- lgb.Dataset(train$data, label = train$label) 39 | setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) 40 | data(agaricus.test, package = "lightgbm") 41 | test <- agaricus.test 42 | 43 | params = list(objective = "binary", 44 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 45 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 46 | model <- lgb.train(params, dtrain, 20) 47 | model <- lgb.train(params, dtrain, 20) 48 | 49 | tree_interpretation <- lgb.interprete(model, test$data, 1:5) 50 | lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /include/LightGBM/utils/pipeline_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_ 2 | #define LIGHTGBM_UTILS_PIPELINE_READER_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "file_io.h" 13 | 14 | namespace LightGBM{ 15 | 16 | /*! 17 | * \brief A pipeline file reader, use 2 threads, one read block from file, the other process the block 18 | */ 19 | class PipelineReader { 20 | public: 21 | /*! 22 | * \brief Read data from a file, use pipeline methods 23 | * \param filename Filename of data 24 | * \process_fun Process function 25 | */ 26 | static size_t Read(const char* filename, int skip_bytes, const std::function& process_fun) { 27 | auto reader = VirtualFileReader::Make(filename); 28 | if (!reader->Init()) { 29 | return 0; 30 | } 31 | size_t cnt = 0; 32 | const size_t buffer_size = 16 * 1024 * 1024 ; 33 | // buffer used for the process_fun 34 | auto buffer_process = std::vector(buffer_size); 35 | // buffer used for the file reading 36 | auto buffer_read = std::vector(buffer_size); 37 | size_t read_cnt = 0; 38 | if (skip_bytes > 0) { 39 | // skip first k bytes 40 | read_cnt = reader->Read(buffer_process.data(), skip_bytes); 41 | } 42 | // read first block 43 | read_cnt = reader->Read(buffer_process.data(), buffer_size); 44 | 45 | size_t last_read_cnt = 0; 46 | while (read_cnt > 0) { 47 | // start read thread 48 | std::thread read_worker = std::thread( 49 | [&reader, &buffer_read, buffer_size, &last_read_cnt] { 50 | last_read_cnt = reader->Read(buffer_read.data(), buffer_size); 51 | } 52 | ); 53 | // start process 54 | cnt += process_fun(buffer_process.data(), read_cnt); 55 | // wait for read thread 56 | read_worker.join(); 57 | // exchange the buffer 58 | std::swap(buffer_process, buffer_read); 59 | read_cnt = last_read_cnt; 60 | } 61 | return cnt; 62 | } 63 | 64 | }; 65 | 66 | } // namespace LightGBM 67 | 68 | #endif // LightGBM_UTILS_PIPELINE_READER_H_ 69 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare.R 3 | \name{lgb.prepare} 4 | \alias{lgb.prepare} 5 | \title{Data preparator for LightGBM datasets (numeric)} 6 | \usage{ 7 | lgb.prepare(data) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | } 12 | \value{ 13 | The cleaned dataset. 
It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 14 | } 15 | \description{ 16 | Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (not integer). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | library(lightgbm) 21 | data(iris) 22 | 23 | str(iris) 24 | # 'data.frame': 150 obs. of 5 variables: 25 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 26 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 27 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 28 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 29 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 30 | 31 | str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric 32 | # 'data.frame': 150 obs. of 5 variables: 33 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 34 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 35 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 36 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 37 | # $ Species : num 1 1 1 1 1 1 1 1 1 1 ... 38 | 39 | # When the lightgbm package is installed and you do not want to load it, 40 | # you can still use the function! 41 | lgb.unloader() 42 | str(lightgbm::lgb.prepare(data = iris)) 43 | # 'data.frame': 150 obs. of 5 variables: 44 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 45 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 46 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 47 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 48 | # $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
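# A minimal sketch (not part of the original example): the cleaned
# data.frame must still go through as.matrix() before lgb.Dataset can
# consume it; the label below is derived from the converted Species
# column purely for illustration
iris_clean <- lightgbm::lgb.prepare(data = iris)
dtrain <- lightgbm::lgb.Dataset(as.matrix(iris_clean[, 1:4]),
                                label = iris_clean$Species - 1)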
49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: trusty 4 | 5 | git: 6 | submodules: true 7 | 8 | compiler: 9 | - gcc 10 | 11 | os: 12 | - linux 13 | - osx 14 | 15 | env: 16 | global: 17 | - PYTHON_VERSION=3.6 18 | matrix: 19 | - TASK=regular 20 | - TASK=mpi PYTHON_VERSION=2.7 21 | - TASK=pylint 22 | - TASK=check-docs 23 | - TASK=if-else 24 | - TASK=sdist PYTHON_VERSION=3.4 25 | - TASK=bdist PYTHON_VERSION=3.5 26 | - TASK=gpu METHOD=source 27 | - TASK=gpu METHOD=pip 28 | 29 | matrix: 30 | exclude: 31 | - os: osx 32 | env: TASK=gpu METHOD=source 33 | - os: osx 34 | env: TASK=gpu METHOD=pip 35 | - os: osx 36 | env: TASK=if-else 37 | - os: osx 38 | env: TASK=pylint 39 | - os: osx 40 | env: TASK=check-docs 41 | 42 | before_install: 43 | - test -n $CC && unset CC 44 | - test -n $CXX && unset CXX 45 | - export PATH="$HOME/miniconda/bin:$PATH" 46 | - export LGB_VER=$(head -n 1 VERSION.txt) 47 | - export AMDAPPSDK=$HOME/AMDAPPSDK 48 | - export LD_LIBRARY_PATH="$AMDAPPSDK/lib/x86_64:$LD_LIBRARY_PATH" 49 | - export OPENCL_VENDOR_PATH=$AMDAPPSDK/etc/OpenCL/vendors 50 | 51 | install: 52 | - bash .travis/setup.sh 53 | 54 | script: 55 | - bash .travis/test.sh 56 | 57 | notifications: 58 | email: false 59 | 60 | deploy: 61 | provider: releases 62 | api-key: 63 | secure: "idU9Fb/yUz7VsVOEb0vGR8qqxcvXr4eh1tMzkKiMWLRx5XNeq7RCUzfKAPMIizFkML9zdMh/5vPtZ1Zs++3oWPpbZE2/o4CURoE+BvwDUyEDrKTdNSGoWgWZq0QLjfahj/PR8ObWlU+XCHqRQzKXlwbynwwUGRpOJrlEY0To5Kt9gTV5W8MxSlW7xFU2TTmMa499IZut38OuenJ3Nm9mTe6MCHFW4Y5uGp/gwNuBYfqzwUXDi6h/cJiJJD5drwtNnSneFZ2PZplrKxJxSJdSQ2aHttU+Wr8xogi9hLI/H6OA4UYCF69HrWOLSggplkZt6qUzaG7UfYyid4m6YbeKMUQRNBuGXhYVGr5qkyAzqXiOesGAef550346pWEZNGPLfNnKAwqPgkp8Q8tV9i0srjzyttqFAlLqhA76yST3kuX+QS0VGepSUTV+kkfxCaHZagxtX9Xve5RNybu4B44UmHWIGJnS6ijYpxWKwvWnMmBCIezFbZYyqsiXYC+9d5RfBgNFQ4PlRfmY0vnJlwUhx1AnyL9jsxnthwl9CNczo4mgBqnCSXxlhXNHz6ToMQuhgdhnqm5+qqJzI5/eUugxh8CW18qZTZBkrnL4DxEMm+bQ2QT8O07ZHrEDPKPXxQw7tBsphWvECetJ4DxXfNaf59GrY+eD6TFZuxurB5Vvo6s=" 64 | file_glob: true 65 | file: python-package/dist/*.whl 66 | skip_cleanup: true 67 | draft: true 68 | tag_name: $TRAVIS_TAG 69 | on: 70 | condition: "$TASK = bdist" 71 | tags: true 72 | all_branches: true 73 | -------------------------------------------------------------------------------- /R-package/man/saveRDS.lgb.Booster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/saveRDS.lgb.Booster.R 3 | \name{saveRDS.lgb.Booster} 4 | \alias{saveRDS.lgb.Booster} 5 | \title{saveRDS for lgb.Booster models} 6 | \usage{ 7 | saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL, 8 | compress = TRUE, refhook = NULL, raw = TRUE) 9 | } 10 | \arguments{ 11 | \item{object}{R object to serialize.} 12 | 13 | \item{file}{a connection or the name of the file where the R object is saved to or read from.} 14 | 15 | \item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.} 16 | 17 | \item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). 
Versions prior to 2 are not supported, so this will only be relevant when there are later versions.} 18 | 19 | \item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.} 20 | 21 | \item{refhook}{a hook function for handling reference objects.} 22 | 23 | \item{raw}{whether to save the model in a raw variable or not, recommended to leave it as \code{TRUE}.} 24 | } 25 | \value{ 26 | NULL invisibly. 27 | } 28 | \description{ 29 | Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 30 | } 31 | \examples{ 32 | \dontrun{ 33 | library(lightgbm) 34 | data(agaricus.train, package = "lightgbm") 35 | train <- agaricus.train 36 | dtrain <- lgb.Dataset(train$data, label = train$label) 37 | data(agaricus.test, package = "lightgbm") 38 | test <- agaricus.test 39 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 40 | params <- list(objective = "regression", metric = "l2") 41 | valids <- list(test = dtest) 42 | model <- lgb.train(params, 43 | dtrain, 44 | 100, 45 | valids, 46 | min_data = 1, 47 | learning_rate = 1, 48 | early_stopping_rounds = 10) 49 | saveRDS.lgb.Booster(model, "model.rds") 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /.nuget/create_nuget.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from distutils.file_util import copy_file 5 | 6 | 7 | if __name__ == "__main__": 8 | source = sys.argv[1] 9 | current_dir = os.path.abspath(os.path.dirname(__file__)) 10 | linux_folder_path = os.path.join(current_dir, "runtimes", "linux-x64", "native") 11 | if not os.path.exists(linux_folder_path): 12 | os.makedirs(linux_folder_path) 13 | osx_folder_path = os.path.join(current_dir, "runtimes", "osx-x64", "native") 14 | if not os.path.exists(osx_folder_path): 15 | os.makedirs(osx_folder_path) 16 | windows_folder_path = os.path.join(current_dir, "runtimes", "win-x64", "native") 17 | if not os.path.exists(windows_folder_path): 18 | os.makedirs(windows_folder_path) 19 | copy_file(os.path.join(source, "lib_lightgbm.so"), os.path.join(linux_folder_path, "lib_lightgbm.so")) 20 | copy_file(os.path.join(source, "lib_lightgbm.dylib"), os.path.join(osx_folder_path, "lib_lightgbm.dylib")) 21 | copy_file(os.path.join(source, "lib_lightgbm.dll"), os.path.join(windows_folder_path, "lib_lightgbm.dll")) 22 | version = open(os.path.join(current_dir, os.path.pardir, 'VERSION.txt')).read().strip() 23 | nuget_str = ''' 24 | 25 | 26 | LightGBM 27 | %s 28 | Guolin Ke 29 | Guolin Ke 30 | https://github.com/Microsoft/LightGBM/blob/master/LICENSE 31 | https://github.com/Microsoft/LightGBM 32 | false 33 | A fast, distributed, high performance gradient boosting framework 34 | Copyright 2018 @ Microsoft 35 | machine-learning data-mining distributed native boosting gbdt 36 | 37 | 38 | 39 | 40 | 41 | 42 | ''' % version 43 | with open(os.path.join(current_dir, "LightGBM.nuspec"), "w") as nuget_file: 44 | nuget_file.write(nuget_str) 45 | -------------------------------------------------------------------------------- /R-package/man/lgb.model.dt.tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.model.dt.tree.R
3 | \name{lgb.model.dt.tree} 4 | \alias{lgb.model.dt.tree} 5 | \title{Parse a LightGBM model json dump} 6 | \usage{ 7 | lgb.model.dt.tree(model, num_iteration = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{object of class \code{lgb.Booster}} 11 | 12 | \item{num_iteration}{number of iterations you want to predict with. NULL or 13 | <= 0 means use the best iteration} 14 | } 15 | \value{ 16 | A \code{data.table} with detailed information about the nodes and leaves of the model's trees. 17 | 18 | The columns of the \code{data.table} are: 19 | 20 | \itemize{ 21 | \item \code{tree_index}: ID of a tree in a model (integer) 22 | \item \code{split_index}: ID of a node in a tree (integer) 23 | \item \code{split_feature}: for a node, it's a feature name (character); 24 | for a leaf, it simply labels it as \code{"NA"} 25 | \item \code{node_parent}: ID of the parent node for the current node (integer) 26 | \item \code{leaf_index}: ID of a leaf in a tree (integer) 27 | \item \code{leaf_parent}: ID of the parent node for the current leaf (integer) 28 | \item \code{split_gain}: Split gain of a node 29 | \item \code{threshold}: Splitting threshold value of a node 30 | \item \code{decision_type}: Decision type of a node 31 | \item \code{default_left}: Determines how to handle NA values, TRUE -> Left, FALSE -> Right 32 | \item \code{internal_value}: Node value 33 | \item \code{internal_count}: The number of observations collected by a node 34 | \item \code{leaf_value}: Leaf value 35 | \item \code{leaf_count}: The number of observations collected by a leaf 36 | } 37 | } 38 | \description{ 39 | Parse a LightGBM model json dump into a \code{data.table} structure. 40 | } 41 | \examples{ 42 | \dontrun{ 43 | library(lightgbm) 44 | 45 | data(agaricus.train, package = "lightgbm") 46 | train <- agaricus.train 47 | dtrain <- lgb.Dataset(train$data, label = train$label) 48 | 49 | params = list(objective = "binary", 50 | learning_rate = 0.01, num_leaves = 63, max_depth = -1, 51 | min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 52 | model <- lgb.train(params, dtrain, 20) 53 | 54 | tree_dt <- lgb.model.dt.tree(model) 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Using LightGBM via Docker 2 | 3 | This directory contains `Dockerfile`s to make it easy to build and run LightGBM via [Docker](http://www.docker.com/). 4 | 5 | ## Installing Docker 6 | 7 | Follow the general installation instructions 8 | [on the Docker site](https://docs.docker.com/installation/): 9 | 10 | * [macOS](https://docs.docker.com/installation/mac/): [docker toolbox](https://www.docker.com/toolbox) 11 | * [Ubuntu](https://docs.docker.com/installation/ubuntulinux/) 12 | 13 | ## Using CLI Version of LightGBM via Docker 14 | 15 | Build a Docker image with the LightGBM CLI: 16 | 17 | ``` 18 | mkdir lightgbm-docker 19 | cd lightgbm-docker 20 | wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-cli 21 | docker build -t lightgbm-cli -f dockerfile-cli . 22 | ``` 23 | 24 | where `lightgbm-cli` is the desired Docker image name.
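Once the build completes, you can verify that the image exists before running anything (a quick sanity check using the standard Docker CLI):

```
docker images lightgbm-cli
```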
25 | 26 | Run the CLI from the container: 27 | 28 | ``` 29 | docker run --rm -it \ 30 | --volume $HOME/lgbm.conf:/lgbm.conf \ 31 | --volume $HOME/model.txt:/model.txt \ 32 | --volume $HOME/tmp:/out \ 33 | lightgbm-cli \ 34 | config=lgbm.conf 35 | ``` 36 | 37 | In the above example, three volumes are [mounted](https://docs.docker.com/engine/reference/commandline/run/#mount-volume--v-read-only) 38 | from the host machine to the Docker container: 39 | 40 | * `lgbm.conf` - task config, for example 41 | 42 | ``` 43 | app=multiclass 44 | num_class=3 45 | task=convert_model 46 | input_model=model.txt 47 | convert_model=/out/predict.cpp 48 | convert_model_language=cpp 49 | ``` 50 | 51 | * `model.txt` - an input file for the task; it could be training data or, in this case, a pre-trained model. 52 | * `out` - a directory to store the output of the task; notice that `convert_model` in the task config is using it. 53 | 54 | `config=lgbm.conf` is a command-line argument passed to the `lightgbm` executable; more arguments can 55 | be passed if required. 56 | 57 | ## Running the Python-package Container 58 | 59 | Build the container, for Python users: 60 | 61 | ``` 62 | mkdir lightgbm-docker 63 | cd lightgbm-docker 64 | wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-python 65 | docker build -t lightgbm -f dockerfile-python . 66 | ``` 67 | 68 | After the build finishes, run the container: 69 | 70 | ``` 71 | docker run --rm -it lightgbm 72 | ``` 73 | -------------------------------------------------------------------------------- /src/boosting/boosting.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "gbdt.h" 3 | #include "dart.hpp" 4 | #include "goss.hpp" 5 | #include "rf.hpp" 6 | 7 | namespace LightGBM { 8 | 9 | std::string GetBoostingTypeFromModelFile(const char* filename) { 10 | TextReader model_reader(filename, true); 11 | std::string type = model_reader.first_line(); 12 | return type; 13 | } 14 | 15 | bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) { 16 | auto start_time = std::chrono::steady_clock::now(); 17 | if (boosting != nullptr) { 18 | TextReader model_reader(filename, true); 19 | size_t buffer_len = 0; 20 | auto buffer = model_reader.ReadContent(&buffer_len); 21 | if (!boosting->LoadModelFromString(buffer.data(), buffer_len)) { 22 | return false; 23 | } 24 | } 25 | std::chrono::duration<double, std::milli> delta = (std::chrono::steady_clock::now() - start_time); 26 | Log::Debug("Time for loading model: %f seconds", 1e-3 * delta.count()); 27 | return true; 28 | } 29 | 30 | Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) { 31 | if (filename == nullptr || filename[0] == '\0') { 32 | if (type == std::string("gbdt")) { 33 | return new GBDT(); 34 | } else if (type == std::string("dart")) { 35 | return new DART(); 36 | } else if (type == std::string("goss")) { 37 | return new GOSS(); 38 | } else if (type == std::string("rf")) { 39 | return new RF(); 40 | } else { 41 | return nullptr; 42 | } 43 | } else { 44 | std::unique_ptr<Boosting> ret; 45 | if (GetBoostingTypeFromModelFile(filename) == std::string("tree")) { 46 | if (type == std::string("gbdt")) { 47 | ret.reset(new GBDT()); 48 | } else if (type == std::string("dart")) { 49 | ret.reset(new DART()); 50 | } else if (type == std::string("goss")) { 51 | ret.reset(new GOSS()); 52 | } else if (type == std::string("rf")) { 53 | ret.reset(new RF()); 54 | } else { 55 | Log::Fatal("Unknown boosting type %s", type.c_str()); 56 | } 57 |
LoadFileToBoosting(ret.get(), filename); 58 | } else { 59 | Log::Fatal("Unknown model format or submodel type in model file %s", filename); 60 | } 61 | return ret.release(); 62 | } 63 | } 64 | 65 | } // namespace LightGBM 66 | -------------------------------------------------------------------------------- /docs/gcc-Tips.rst: -------------------------------------------------------------------------------- 1 | Recommendations When Using gcc 2 | ============================== 3 | 4 | It is recommended to use ``-O3 -mtune=native`` to achieve maximum speed during LightGBM training. 5 | 6 | Using Intel Ivy Bridge CPU on 1M x 1K Bosch dataset, the performance increases as follow: 7 | 8 | +-------------------------------------+---------------------+ 9 | | Compilation Flag | Performance Index | 10 | +=====================================+=====================+ 11 | | ``-O2 -mtune=core2`` | 100.00% | 12 | +-------------------------------------+---------------------+ 13 | | ``-O2 -mtune=native`` | 100.90% | 14 | +-------------------------------------+---------------------+ 15 | | ``-O3 -mtune=native`` | 102.78% | 16 | +-------------------------------------+---------------------+ 17 | | ``-O3 -ffast-math -mtune=native`` | 100.64% | 18 | +-------------------------------------+---------------------+ 19 | 20 | You can find more details on the experimentation below: 21 | 22 | - `Laurae++/Benchmarks `__ 23 | 24 | - `Laurae2/gbt\_benchmarks `__ 25 | 26 | - `Laurae's Benchmark Master Data (Interactive) `__ 27 | 28 | - `Kaggle Paris Meetup #12 Slides `__ 29 | 30 | Some explanatory pictures: 31 | 32 | .. image:: ./_static/images/gcc-table.png 33 | :align: center 34 | :target: ./_static/images/gcc-table.png 35 | 36 | .. image:: ./_static/images/gcc-bars.png 37 | :align: center 38 | :target: ./_static/images/gcc-bars.png 39 | 40 | .. image:: ./_static/images/gcc-chart.png 41 | :align: center 42 | :target: ./_static/images/gcc-chart.png 43 | 44 | .. image:: ./_static/images/gcc-comparison-1.png 45 | :align: center 46 | :target: ./_static/images/gcc-comparison-1.png 47 | 48 | .. image:: ./_static/images/gcc-comparison-2.png 49 | :align: center 50 | :target: ./_static/images/gcc-comparison-2.png 51 | 52 | .. image:: ./_static/images/gcc-meetup-1.png 53 | :align: center 54 | :target: ./_static/images/gcc-meetup-1.png 55 | 56 | .. 
image:: ./_static/images/gcc-meetup-2.png 57 | :align: center 58 | :target: ./_static/images/gcc-meetup-2.png 59 | -------------------------------------------------------------------------------- /examples/python-guide/sklearn_example.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import numpy as np 4 | import pandas as pd 5 | import lightgbm as lgb 6 | 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.model_selection import GridSearchCV 9 | 10 | # load or create your dataset 11 | print('Load data...') 12 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 13 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 14 | 15 | y_train = df_train[0].values 16 | y_test = df_test[0].values 17 | X_train = df_train.drop(0, axis=1).values 18 | X_test = df_test.drop(0, axis=1).values 19 | 20 | print('Start training...') 21 | # train 22 | gbm = lgb.LGBMRegressor(objective='regression', 23 | num_leaves=31, 24 | learning_rate=0.05, 25 | n_estimators=20) 26 | gbm.fit(X_train, y_train, 27 | eval_set=[(X_test, y_test)], 28 | eval_metric='l1', 29 | early_stopping_rounds=5) 30 | 31 | print('Start predicting...') 32 | # predict 33 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) 34 | # eval 35 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 36 | 37 | # feature importances 38 | print('Feature importances:', list(gbm.feature_importances_)) 39 | 40 | 41 | # self-defined eval metric 42 | # f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool 43 | # Root Mean Squared Logarithmic Error (RMSLE) 44 | def rmsle(y_true, y_pred): 45 | return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False 46 | 47 | 48 | print('Start training with custom eval function...') 49 | # train 50 | gbm.fit(X_train, y_train, 51 | eval_set=[(X_test, y_test)], 52 | eval_metric=rmsle, 53 | early_stopping_rounds=5) 54 | 55 | print('Start predicting...') 56 | # predict 57 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) 58 | # eval 59 | print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1]) 60 | 61 | # other scikit-learn modules 62 | estimator = lgb.LGBMRegressor(num_leaves=31) 63 | 64 | param_grid = { 65 | 'learning_rate': [0.01, 0.1, 1], 66 | 'n_estimators': [20, 40] 67 | } 68 | 69 | gbm = GridSearchCV(estimator, param_grid) 70 | 71 | gbm.fit(X_train, y_train) 72 | 73 | print('Best parameters found by grid search are:', gbm.best_params_) 74 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare2.R 3 | \name{lgb.prepare2} 4 | \alias{lgb.prepare2} 5 | \title{Data preparator for LightGBM datasets (integer)} 6 | \usage{ 7 | lgb.prepare2(data) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | } 12 | \value{ 13 | The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 14 | } 15 | \description{ 16 | Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). 
Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that there are programs which do not support integer-only input. Consider this a memory-halving technique which can be dangerous, especially for LightGBM. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | library(lightgbm) 21 | data(iris) 22 | 23 | str(iris) 24 | # 'data.frame': 150 obs. of 5 variables: 25 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 26 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 27 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 28 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 29 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 30 | 31 | str(lgb.prepare2(data = iris)) # Convert all factors/chars to integer 32 | # 'data.frame': 150 obs. of 5 variables: 33 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 34 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 35 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 36 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 37 | # $ Species : int 1 1 1 1 1 1 1 1 1 1 ... 38 | 39 | # When the lightgbm package is installed and you do not want to load it, 40 | # you can still use the function! 41 | lgb.unloader() 42 | str(lightgbm::lgb.prepare2(data = iris)) 43 | # 'data.frame': 150 obs. of 5 variables: 44 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 45 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 46 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 47 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 48 | # $ Species : int 1 1 1 1 1 1 1 1 1 1 ... 49 | 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /R-package/R/lgb.importance.R: -------------------------------------------------------------------------------- 1 | #' Compute feature importance in a model 2 | #' 3 | #' Creates a \code{data.table} of feature importances in a model. 4 | #' 5 | #' @param model object of class \code{lgb.Booster}. 6 | #' @param percentage whether to show importance in relative percentage. 7 | #' 8 | #' @return 9 | #' 10 | #' For a tree model, a \code{data.table} with the following columns: 11 | #' \itemize{ 12 | #' \item \code{Feature} Feature names in the model. 13 | #' \item \code{Gain} The total gain of this feature's splits. 14 | #' \item \code{Cover} The number of observations related to this feature. 15 | #' \item \code{Frequency} The number of times a feature is used to split in trees.
16 | #' } 17 | #' 18 | #' @examples 19 | #' \dontrun{ 20 | #' library(lightgbm) 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' model <- lgb.train(params, dtrain, 20) 30 | #' 31 | #' tree_imp1 <- lgb.importance(model, percentage = TRUE) 32 | #' tree_imp2 <- lgb.importance(model, percentage = FALSE) 33 | #' } 34 | #' 35 | #' @importFrom magrittr %>% %T>% 36 | #' @importFrom data.table := 37 | #' @export 38 | lgb.importance <- function(model, percentage = TRUE) { 39 | 40 | # Check if model is a lightgbm model 41 | if (!inherits(model, "lgb.Booster")) { 42 | stop("'model' has to be an object of class lgb.Booster") 43 | } 44 | 45 | # Setup importance 46 | tree_dt <- lgb.model.dt.tree(model) 47 | 48 | # Extract elements 49 | tree_imp <- tree_dt %>% 50 | magrittr::extract(., 51 | i = ! is.na(split_index), 52 | j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N), 53 | by = "split_feature") %T>% 54 | data.table::setnames(., old = "split_feature", new = "Feature") %>% 55 | magrittr::extract(., i = order(Gain, decreasing = TRUE)) 56 | 57 | # Check if relative values are requested 58 | if (percentage) { 59 | tree_imp[, ":="(Gain = Gain / sum(Gain), 60 | Cover = Cover / sum(Cover), 61 | Frequency = Frequency / sum(Frequency))] 62 | } 63 | 64 | # Return importance table 65 | return(tree_imp) 66 | 67 | } 68 | -------------------------------------------------------------------------------- /include/LightGBM/application.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_APPLICATION_H_ 2 | #define LIGHTGBM_APPLICATION_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace LightGBM { 11 | 12 | class DatasetLoader; 13 | class Dataset; 14 | class Boosting; 15 | class ObjectiveFunction; 16 | class Metric; 17 | 18 | /*! 19 | * \brief The main entrance of LightGBM. this application has two tasks: 20 | * Train and Predict. 21 | * Train task will train a new model 22 | * Predict task will predict the scores of test data using existing model, 23 | * and save the score to disk. 24 | */ 25 | class Application { 26 | public: 27 | Application(int argc, char** argv); 28 | 29 | /*! \brief Destructor */ 30 | ~Application(); 31 | 32 | /*! \brief To call this funciton to run application*/ 33 | inline void Run(); 34 | 35 | private: 36 | 37 | /*! \brief Load parameters from command line and config file*/ 38 | void LoadParameters(int argc, char** argv); 39 | 40 | /*! \brief Load data, including training data and validation data*/ 41 | void LoadData(); 42 | 43 | /*! \brief Initialization before training*/ 44 | void InitTrain(); 45 | 46 | /*! \brief Main Training logic */ 47 | void Train(); 48 | 49 | /*! \brief Initializations before prediction */ 50 | void InitPredict(); 51 | 52 | /*! \brief Main predicting logic */ 53 | void Predict(); 54 | 55 | /*! \brief Main Convert model logic */ 56 | void ConvertModel(); 57 | 58 | /*! \brief All configs */ 59 | Config config_; 60 | /*! \brief Training data */ 61 | std::unique_ptr train_data_; 62 | /*! \brief Validation data */ 63 | std::vector> valid_datas_; 64 | /*! \brief Metric for training data */ 65 | std::vector> train_metric_; 66 | /*! 
\brief Metrics for validation data */ 67 | std::vector>> valid_metrics_; 68 | /*! \brief Boosting object */ 69 | std::unique_ptr boosting_; 70 | /*! \brief Training objective function */ 71 | std::unique_ptr objective_fun_; 72 | }; 73 | 74 | 75 | inline void Application::Run() { 76 | if (config_.task == TaskType::kPredict || config_.task == TaskType::KRefitTree) { 77 | InitPredict(); 78 | Predict(); 79 | } else if (config_.task == TaskType::kConvertModel) { 80 | ConvertModel(); 81 | } else { 82 | InitTrain(); 83 | Train(); 84 | } 85 | } 86 | 87 | } // namespace LightGBM 88 | 89 | #endif // LightGBM_APPLICATION_H_ 90 | -------------------------------------------------------------------------------- /R-package/R/lgb.unloader.R: -------------------------------------------------------------------------------- 1 | #' LightGBM unloading error fix 2 | #' 3 | #' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object. 4 | #' 5 | #' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed. 6 | #' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them. 7 | #' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment. 8 | #' 9 | #' @return NULL invisibly. 10 | #' 11 | #' @examples 12 | #' \dontrun{ 13 | #' library(lightgbm) 14 | #' data(agaricus.train, package = "lightgbm") 15 | #' train <- agaricus.train 16 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 17 | #' data(agaricus.test, package = "lightgbm") 18 | #' test <- agaricus.test 19 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 20 | #' params <- list(objective = "regression", metric = "l2") 21 | #' valids <- list(test = dtest) 22 | #' model <- lgb.train(params, 23 | #' dtrain, 24 | #' 100, 25 | #' valids, 26 | #' min_data = 1, 27 | #' learning_rate = 1, 28 | #' early_stopping_rounds = 10) 29 | #' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) 30 | #' rm(model, dtrain, dtest) # Not needed if wipe = TRUE 31 | #' gc() # Not needed if wipe = TRUE 32 | #' 33 | #' library(lightgbm) 34 | #' # Do whatever you want again with LightGBM without object clashing 35 | #' } 36 | #' 37 | #' @export 38 | lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) { 39 | 40 | # Unload package 41 | try(detach("package:lightgbm", unload = TRUE), silent = TRUE) 42 | 43 | # Should we wipe variables? (lgb.Booster, lgb.Dataset) 44 | if (wipe) { 45 | boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir)) 46 | datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir)) 47 | rm(list = c(boosters, datasets), envir = envir) 48 | gc(verbose = FALSE) 49 | } 50 | 51 | # Load package back? 
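# If requested, attach lightgbm again so downstream code can keep calling
# its functions without a manual library() call (this assumes the package
# is still installed on the library path)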
52 | if (restore) { 53 | library(lightgbm) 54 | } 55 | 56 | invisible() 57 | 58 | } 59 | -------------------------------------------------------------------------------- /R-package/demo/multiclass.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | 3 | # We load the default iris dataset shipped with R 4 | data(iris) 5 | 6 | # We must convert factors to numeric 7 | # They must be starting from number 0 to use multiclass 8 | # For instance: 0, 1, 2, 3, 4, 5... 9 | iris$Species <- as.numeric(as.factor(iris$Species)) - 1 10 | 11 | # We cut the data set into 80% train and 20% validation 12 | # The 10 last samples of each class are for validation 13 | 14 | train <- as.matrix(iris[c(1:40, 51:90, 101:140), ]) 15 | test <- as.matrix(iris[c(41:50, 91:100, 141:150), ]) 16 | dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5]) 17 | dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5]) 18 | valids <- list(test = dtest) 19 | 20 | # Method 1 of training 21 | params <- list(objective = "multiclass", metric = "multi_error", num_class = 3) 22 | model <- lgb.train(params, 23 | dtrain, 24 | 100, 25 | valids, 26 | min_data = 1, 27 | learning_rate = 1, 28 | early_stopping_rounds = 10) 29 | 30 | # We can predict on test data, outputs a 90-length vector 31 | # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3... 32 | my_preds <- predict(model, test[, 1:4]) 33 | 34 | # Method 2 of training, identical 35 | model <- lgb.train(list(), 36 | dtrain, 37 | 100, 38 | valids, 39 | min_data = 1, 40 | learning_rate = 1, 41 | early_stopping_rounds = 10, 42 | objective = "multiclass", 43 | metric = "multi_error", 44 | num_class = 3) 45 | 46 | # We can predict on test data, identical 47 | my_preds <- predict(model, test[, 1:4]) 48 | 49 | # A (30x3) matrix with the predictions, use parameter reshape 50 | # class1 class2 class3 51 | # obs1 obs1 obs1 52 | # obs2 obs2 obs2 53 | # .... .... .... 54 | my_preds <- predict(model, test[, 1:4], reshape = TRUE) 55 | 56 | # We can also get the predicted scores before the Sigmoid/Softmax application 57 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE) 58 | 59 | # Raw score predictions as matrix instead of vector 60 | my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE) 61 | 62 | # We can also get the leaf index 63 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE) 64 | 65 | # Predict leaf index as matrix instead of vector 66 | my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE) 67 | -------------------------------------------------------------------------------- /R-package/man/predict.lgb.Booster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.Booster.R 3 | \name{predict.lgb.Booster} 4 | \alias{predict.lgb.Booster} 5 | \title{Predict method for LightGBM model} 6 | \usage{ 7 | \method{predict}{lgb.Booster}(object, data, num_iteration = NULL, 8 | rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE, 9 | ...) 
10 | } 11 | \arguments{ 12 | \item{object}{Object of class \code{lgb.Booster}} 13 | 14 | \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} 15 | 16 | \item{num_iteration}{number of iterations you want to predict with; NULL or <= 0 means use the best iteration} 17 | 18 | \item{rawscore}{whether the prediction should be returned in the form of the original untransformed 19 | sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for 20 | logistic regression would result in predictions for log-odds instead of probabilities.} 21 | 22 | \item{predleaf}{whether to predict leaf indices instead.} 23 | 24 | \item{header}{only used for prediction from a text file. TRUE if the text file has a header} 25 | 26 | \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several 27 | prediction outputs per case.} 28 | 29 | \item{...}{Additional named arguments passed to the \code{predict()} method of 30 | the \code{lgb.Booster} object passed to \code{object}.} 31 | } 32 | \value{ 33 | For regression or binary classification, it returns a vector of length \code{nrow(data)}. 34 | For multiclass classification, either a \code{num_class * nrow(data)} vector or 35 | a \code{(nrow(data), num_class)} dimension matrix is returned, depending on 36 | the \code{reshape} value. 37 | 38 | When \code{predleaf = TRUE}, the output is a matrix object with the 39 | number of columns corresponding to the number of trees. 40 | } 41 | \description{ 42 | Predicted values based on class \code{lgb.Booster} 43 | } 44 | \examples{ 45 | \dontrun{ 46 | library(lightgbm) 47 | data(agaricus.train, package = "lightgbm") 48 | train <- agaricus.train 49 | dtrain <- lgb.Dataset(train$data, label = train$label) 50 | data(agaricus.test, package = "lightgbm") 51 | test <- agaricus.test 52 | dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 53 | params <- list(objective = "regression", metric = "l2") 54 | valids <- list(test = dtest) 55 | model <- lgb.train(params, 56 | dtrain, 57 | 100, 58 | valids, 59 | min_data = 1, 60 | learning_rate = 1, 61 | early_stopping_rounds = 10) 62 | preds <- predict(model, test$data) 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/boosting/prediction_early_stop.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace LightGBM; 12 | 13 | PredictionEarlyStopInstance CreateNone(const PredictionEarlyStopConfig&) { 14 | return PredictionEarlyStopInstance{ 15 | [](const double*, int) { 16 | return false; 17 | }, 18 | std::numeric_limits<int>::max() // make sure the lambda is almost never called 19 | }; 20 | } 21 | 22 | PredictionEarlyStopInstance CreateMulticlass(const PredictionEarlyStopConfig& config) { 23 | // margin_threshold will be captured by value 24 | const double margin_threshold = config.margin_threshold; 25 | 26 | return PredictionEarlyStopInstance{ 27 | [margin_threshold](const double* pred, int sz) { 28 | if (sz < 2) { 29 | Log::Fatal("Multiclass early stopping needs predictions to be of length two or larger"); 30 | } 31 | 32 | // copy and sort 33 | std::vector<double> votes(static_cast<size_t>(sz)); 34 | for (int i = 0; i < sz; ++i) { 35 | votes[i] = pred[i]; 36 | } 37 | std::partial_sort(votes.begin(), votes.begin() + 2, votes.end(), std::greater<double>()); 38 | 39 | const auto margin = votes[0] -
votes[1]; 40 | 41 | if (margin > margin_threshold) { 42 | return true; 43 | } 44 | 45 | return false; 46 | }, 47 | config.round_period 48 | }; 49 | } 50 | 51 | PredictionEarlyStopInstance CreateBinary(const PredictionEarlyStopConfig& config) { 52 | // margin_threshold will be captured by value 53 | const double margin_threshold = config.margin_threshold; 54 | 55 | return PredictionEarlyStopInstance{ 56 | [margin_threshold](const double* pred, int sz) { 57 | if (sz != 1) { 58 | Log::Fatal("Binary early stopping needs predictions to be of length one"); 59 | } 60 | const auto margin = 2.0 * fabs(pred[0]); 61 | 62 | if (margin > margin_threshold) { 63 | return true; 64 | } 65 | 66 | return false; 67 | }, 68 | config.round_period 69 | }; 70 | } 71 | 72 | } 73 | 74 | namespace LightGBM { 75 | 76 | PredictionEarlyStopInstance CreatePredictionEarlyStopInstance(const std::string& type, 77 | const PredictionEarlyStopConfig& config) { 78 | if (type == "none") { 79 | return CreateNone(config); 80 | } else if (type == "multiclass") { 81 | return CreateMulticlass(config); 82 | } else if (type == "binary") { 83 | return CreateBinary(config); 84 | } else { 85 | throw std::runtime_error("Unknown early stopping type: " + type); 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /docs/Advanced-Topics.rst: -------------------------------------------------------------------------------- 1 | Advanced Topics 2 | =============== 3 | 4 | Missing Value Handle 5 | -------------------- 6 | 7 | - LightGBM enables the missing value handle by default. Disable it by setting ``use_missing=false``. 8 | 9 | - LightGBM uses NA (NaN) to represent missing values by default. Change it to use zero by setting ``zero_as_missing=true``. 10 | 11 | - When ``zero_as_missing=false`` (default), the unshown values in sparse matrices (and LightSVM) are treated as zeros. 12 | 13 | - When ``zero_as_missing=true``, NA and zeros (including unshown values in sparse matrices (and LightSVM)) are treated as missing. 14 | 15 | Categorical Feature Support 16 | --------------------------- 17 | 18 | - LightGBM offers good accuracy with integer-encoded categorical features. LightGBM applies 19 | `Fisher (1958) `_ 20 | to find the optimal split over categories as 21 | `described here <./Features.rst#optimal-split-for-categorical-features>`_. This often performs better than one-hot encoding. 22 | 23 | - Use ``categorical_feature`` to specify the categorical features. 24 | Refer to the parameter ``categorical_feature`` in `Parameters <./Parameters.rst#categorical_feature>`__. 25 | 26 | - Categorical features must be encoded as non-negative integers (``int``) less than ``Int32.MaxValue`` (2147483647). 27 | It is best to use a contiguous range of integers. 28 | 29 | - Use ``min_data_per_group``, ``cat_smooth`` to deal with over-fitting (when ``#data`` is small or ``#category`` is large). 30 | 31 | - For a categorical feature with high cardinality (``#category`` is large), it often works best to 32 | treat the feature as numeric, either by simply ignoring the categorical interpretation of the integers or 33 | by embedding the categories in a low-dimensional numeric space. 34 | 35 | LambdaRank 36 | ---------- 37 | 38 | - The label should be of type ``int``, such that larger numbers correspond to higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect). 39 | 40 | - Use ``label_gain`` to set the gain(weight) of ``int`` label. 41 | 42 | - Use ``max_position`` to set the NDCG optimization position. 
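For example, a minimal sketch with the R-package (the ``train_matrix`` and ``relevance_labels`` objects below are hypothetical placeholders), showing how the query group sizes are attached to the Dataset::

    library(lightgbm)
    dtrain <- lgb.Dataset(train_matrix, label = relevance_labels)
    setinfo(dtrain, "group", c(10, 6, 24))  # number of rows in each query group
    params <- list(objective = "lambdarank", metric = "ndcg", max_position = 3)
    model <- lgb.train(params, dtrain, 10)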
43 | 44 | Parameters Tuning 45 | ----------------- 46 | 47 | - Refer to `Parameters Tuning <./Parameters-Tuning.rst>`__. 48 | 49 | Parallel Learning 50 | ----------------- 51 | 52 | - Refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__. 53 | 54 | GPU Support 55 | ----------- 56 | 57 | - Refer to `GPU Tutorial <./GPU-Tutorial.rst>`__ and `GPU Targets <./GPU-Targets.rst>`__. 58 | 59 | Recommendations for gcc Users (MinGW, \*nix) 60 | -------------------------------------------- 61 | 62 | - Refer to `gcc Tips <./gcc-Tips.rst>`__. 63 | -------------------------------------------------------------------------------- /examples/python-guide/README.md: -------------------------------------------------------------------------------- 1 | Python-package Examples 2 | ======================= 3 | 4 | Here is an example for LightGBM to use Python-package. 5 | 6 | You should install LightGBM [Python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) first. 7 | 8 | You also need scikit-learn, pandas, matplotlib (only for plot example), and scipy (only for logistic regression example) to run the examples, but they are not required for the package itself. You can install them with pip: 9 | 10 | ``` 11 | pip install scikit-learn pandas matplotlib scipy -U 12 | ``` 13 | 14 | Now you can run examples in this folder, for example: 15 | 16 | ``` 17 | python simple_example.py 18 | ``` 19 | 20 | Examples include: 21 | 22 | - [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py) 23 | - Construct Dataset 24 | - Basic train and predict 25 | - Eval during training 26 | - Early stopping 27 | - Save model to file 28 | - [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py) 29 | - Create data for learning with sklearn interface 30 | - Basic train and predict with sklearn interface 31 | - Feature importances with sklearn interface 32 | - Self-defined eval metric with sklearn interface 33 | - Find best parameters for the model with sklearn's GridSearchCV 34 | - [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py) 35 | - Set feature names 36 | - Directly use categorical features without one-hot encoding 37 | - Dump model to json format 38 | - Get feature importances 39 | - Get feature names 40 | - Load model to predict 41 | - Dump and load model with pickle 42 | - Load model file to continue training 43 | - Change learning rates during training 44 | - Change any parameters during training 45 | - Self-defined objective function 46 | - Self-defined eval metric 47 | - Callback function 48 | - [logistic_regression.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/logistic_regression.py) 49 | - Use objective `xentropy` or `binary` 50 | - Use `xentropy` with binary labels or probability labels 51 | - Use `binary` only with binary labels 52 | - Compare speed of `xentropy` versus `binary` 53 | - [plot_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/plot_example.py) 54 | - Construct Dataset 55 | - Train and record eval results for further plotting 56 | - Plot metrics recorded during training 57 | - Plot feature importances 58 | - Plot one specified tree 59 | - Plot one specified tree with Graphviz 60 | -------------------------------------------------------------------------------- /include/LightGBM/objective_function.h: 
-------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_ 2 | #define LIGHTGBM_OBJECTIVE_FUNCTION_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace LightGBM { 10 | /*! 11 | * \brief The interface of Objective Function. 12 | */ 13 | class ObjectiveFunction { 14 | public: 15 | /*! \brief virtual destructor */ 16 | virtual ~ObjectiveFunction() {} 17 | 18 | /*! 19 | * \brief Initialize 20 | * \param metadata Label data 21 | * \param num_data Number of data 22 | */ 23 | virtual void Init(const Metadata& metadata, data_size_t num_data) = 0; 24 | 25 | /*! 26 | * \brief calculating first order derivative of loss function 27 | * \param score prediction score in this round 28 | * \gradients Output gradients 29 | * \hessians Output hessians 30 | */ 31 | virtual void GetGradients(const double* score, 32 | score_t* gradients, score_t* hessians) const = 0; 33 | 34 | virtual const char* GetName() const = 0; 35 | 36 | virtual bool IsConstantHessian() const { return false; } 37 | 38 | virtual bool IsRenewTreeOutput() const { return false; } 39 | 40 | virtual double RenewTreeOutput(double ori_output, const double*, 41 | const data_size_t*, 42 | const data_size_t*, 43 | data_size_t) const { return ori_output; } 44 | 45 | virtual double BoostFromScore() const { return 0.0f; } 46 | 47 | virtual bool SkipEmptyClass() const { return false; } 48 | 49 | virtual int NumModelPerIteration() const { return 1; } 50 | 51 | virtual int NumPredictOneRow() const { return 1; } 52 | 53 | /*! \brief The prediction should be accurate or not. True will disable early stopping for prediction. */ 54 | virtual bool NeedAccuratePrediction() const { return true; } 55 | 56 | virtual void ConvertOutput(const double* input, double* output) const { 57 | output[0] = input[0]; 58 | } 59 | 60 | virtual std::string ToString() const = 0; 61 | 62 | ObjectiveFunction() = default; 63 | /*! \brief Disable copy */ 64 | ObjectiveFunction& operator=(const ObjectiveFunction&) = delete; 65 | /*! \brief Disable copy */ 66 | ObjectiveFunction(const ObjectiveFunction&) = delete; 67 | 68 | /*! 69 | * \brief Create object of objective function 70 | * \param type Specific type of objective function 71 | * \param config Config for objective function 72 | */ 73 | LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type, 74 | const Config& config); 75 | 76 | /*! 
77 | * \brief Load objective function from string object 78 | */ 79 | LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str); 80 | }; 81 | 82 | } // namespace LightGBM 83 | 84 | #endif // LightGBM_OBJECTIVE_FUNCTION_H_ 85 | -------------------------------------------------------------------------------- /swig/lightgbmlib.i: -------------------------------------------------------------------------------- 1 | /* lightgbmlib.i */ 2 | %module lightgbmlib 3 | %ignore LGBM_BoosterSaveModelToString; 4 | %{ 5 | /* Includes the header in the wrapper code */ 6 | #include "../include/LightGBM/export.h" 7 | #include "../include/LightGBM/utils/log.h" 8 | #include "../include/LightGBM/c_api.h" 9 | %} 10 | 11 | /* header files */ 12 | %include "../include/LightGBM/export.h" 13 | %include "../include/LightGBM/c_api.h" 14 | %include "cpointer.i" 15 | %include "carrays.i" 16 | 17 | %inline %{ 18 | char * LGBM_BoosterSaveModelToStringSWIG(BoosterHandle handle, 19 | int num_iteration, 20 | int64_t buffer_len, 21 | int64_t* out_len) { 22 | char* dst = new char[buffer_len]; 23 | int result = LGBM_BoosterSaveModelToString(handle, num_iteration, buffer_len, out_len, dst); 24 | if (result != 0) { 25 | return nullptr; 26 | } 27 | return dst; 28 | } 29 | %} 30 | 31 | %pointer_functions(int, intp) 32 | %pointer_functions(long, longp) 33 | %pointer_functions(double, doublep) 34 | %pointer_functions(float, floatp) 35 | %pointer_functions(int64_t, int64_tp) 36 | %pointer_functions(int32_t, int32_tp) 37 | 38 | %pointer_cast(int64_t *, long *, int64_t_to_long_ptr) 39 | %pointer_cast(int64_t *, double *, int64_t_to_double_ptr) 40 | %pointer_cast(int32_t *, int *, int32_t_to_int_ptr) 41 | %pointer_cast(long *, int64_t *, long_to_int64_t_ptr) 42 | %pointer_cast(double *, int64_t *, double_to_int64_t_ptr) 43 | %pointer_cast(double *, void *, double_to_voidp_ptr) 44 | %pointer_cast(int *, int32_t *, int_to_int32_t_ptr) 45 | %pointer_cast(float *, void *, float_to_voidp_ptr) 46 | 47 | %array_functions(double, doubleArray) 48 | %array_functions(float, floatArray) 49 | %array_functions(int, intArray) 50 | %array_functions(long, longArray) 51 | 52 | /* Custom pointer manipulation template */ 53 | %define %pointer_manipulation(TYPE,NAME) 54 | %{ 55 | static TYPE *new_##NAME() { %} 56 | %{ TYPE* NAME = new TYPE; return NAME; %} 57 | %{} 58 | 59 | static void delete_##NAME(TYPE *self) { %} 60 | %{ if (self) delete self; %} 61 | %{} 62 | %} 63 | 64 | TYPE *new_##NAME(); 65 | void delete_##NAME(TYPE *self); 66 | 67 | %enddef 68 | 69 | %define %pointer_dereference(TYPE,NAME) 70 | %{ 71 | static TYPE NAME ##_value(TYPE *self) { 72 | TYPE NAME = *self; 73 | return NAME; 74 | } 75 | %} 76 | 77 | TYPE NAME##_value(TYPE *self); 78 | 79 | %enddef 80 | 81 | %define %pointer_handle(TYPE,NAME) 82 | %{ 83 | static TYPE* NAME ##_handle() { %} 84 | %{ TYPE* NAME = new TYPE; *NAME = (TYPE)operator new(sizeof(int*)); return NAME; %} 85 | %{} 86 | %} 87 | 88 | TYPE *NAME##_handle(); 89 | 90 | %enddef 91 | 92 | %pointer_manipulation(void*, voidpp) 93 | 94 | /* Allow dereferencing of void** to void* */ 95 | %pointer_dereference(void*, voidpp) 96 | 97 | /* Allow retrieving handle to void** */ 98 | %pointer_handle(void*, voidpp) 99 | 100 | -------------------------------------------------------------------------------- /R-package/R/saveRDS.lgb.Booster.R: -------------------------------------------------------------------------------- 1 | #' saveRDS for lgb.Booster models 2 | #' 3 | #' Attemps to save a model using RDS. 
Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 4 | #' 5 | #' @param object R object to serialize. 6 | #' @param file a connection or the name of the file where the R object is saved to or read from. 7 | #' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save. 8 | #' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions. 9 | #' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection. 10 | #' @param refhook a hook function for handling reference objects. 11 | #' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. 12 | #' 13 | #' @return NULL invisibly. 14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' library(lightgbm) 18 | #' data(agaricus.train, package = "lightgbm") 19 | #' train <- agaricus.train 20 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 21 | #' data(agaricus.test, package = "lightgbm") 22 | #' test <- agaricus.test 23 | #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) 24 | #' params <- list(objective = "regression", metric = "l2") 25 | #' valids <- list(test = dtest) 26 | #' model <- lgb.train(params, 27 | #' dtrain, 28 | #' 100, 29 | #' valids, 30 | #' min_data = 1, 31 | #' learning_rate = 1, 32 | #' early_stopping_rounds = 10) 33 | #' saveRDS.lgb.Booster(model, "model.rds") 34 | #' } 35 | #' 36 | #' @export 37 | saveRDS.lgb.Booster <- function(object, 38 | file = "", 39 | ascii = FALSE, 40 | version = NULL, 41 | compress = TRUE, 42 | refhook = NULL, 43 | raw = TRUE) { 44 | 45 | # Check if object has a raw value (and if the user wants to store the raw) 46 | if (is.na(object$raw) && raw) { 47 | 48 | # Save model 49 | object$save() 50 | 51 | # Save RDS 52 | saveRDS(object, 53 | file = file, 54 | ascii = ascii, 55 | version = version, 56 | compress = compress, 57 | refhook = refhook) 58 | 59 | # Free model from memory 60 | object$raw <- NA 61 | 62 | } else { 63 | 64 | # Save as usual 65 | saveRDS(object, 66 | file = file, 67 | ascii = ascii, 68 | version = version, 69 | compress = compress, 70 | refhook = refhook) 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /include/LightGBM/utils/log.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_LOG_H_ 2 | #define LIGHTGBM_UTILS_LOG_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace LightGBM { 13 | 14 | #if defined(_MSC_VER) 15 | #define THREAD_LOCAL __declspec(thread) 16 | #else 17 | #define THREAD_LOCAL thread_local 18 | #endif 19 | 20 | #ifndef CHECK 21 | #define CHECK(condition) \ 22 | if (!(condition)) Log::Fatal("Check failed: " #condition \ 23 | " at %s, line %d .\n", __FILE__, __LINE__); 24 | #endif 25 | 26 | #ifndef CHECK_NOTNULL 27 | #define CHECK_NOTNULL(pointer) \ 28 | if ((pointer) == nullptr) LightGBM::Log::Fatal(#pointer " Can't be NULL at %s, line %d .\n", __FILE__, __LINE__); 29 | #endif 30 | 31 | 32 | enum class LogLevel: int { 33 | Fatal = -1, 34 | Warning = 0, 35 | Info = 
1, 36 | Debug = 2, 37 | }; 38 | 39 | 40 | /*! 41 | * \brief A static Log class 42 | */ 43 | class Log { 44 | public: 45 | /*! 46 | * \brief Resets the minimal log level. It is INFO by default. 47 | * \param level The new minimal log level. 48 | */ 49 | static void ResetLogLevel(LogLevel level) { 50 | GetLevel() = level; 51 | } 52 | 53 | static void Debug(const char *format, ...) { 54 | va_list val; 55 | va_start(val, format); 56 | Write(LogLevel::Debug, "Debug", format, val); 57 | va_end(val); 58 | } 59 | static void Info(const char *format, ...) { 60 | va_list val; 61 | va_start(val, format); 62 | Write(LogLevel::Info, "Info", format, val); 63 | va_end(val); 64 | } 65 | static void Warning(const char *format, ...) { 66 | va_list val; 67 | va_start(val, format); 68 | Write(LogLevel::Warning, "Warning", format, val); 69 | va_end(val); 70 | } 71 | static void Fatal(const char *format, ...) { 72 | va_list val; 73 | char str_buf[1024]; 74 | va_start(val, format); 75 | #ifdef _MSC_VER 76 | vsprintf_s(str_buf, format, val); 77 | #else 78 | vsprintf(str_buf, format, val); 79 | #endif 80 | va_end(val); 81 | fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf); 82 | fflush(stderr); 83 | throw std::runtime_error(std::string(str_buf)); 84 | } 85 | 86 | private: 87 | 88 | static void Write(LogLevel level, const char* level_str, const char *format, va_list val) { 89 | if (level <= GetLevel()) { // omit the message with low level 90 | // write to STDOUT 91 | printf("[LightGBM] [%s] ", level_str); 92 | vprintf(format, val); 93 | printf("\n"); 94 | fflush(stdout); 95 | } 96 | } 97 | 98 | // a trick to use static variable in header file. 99 | // May be not good, but avoid to use an additional cpp file 100 | static LogLevel& GetLevel() { static THREAD_LOCAL LogLevel level = LogLevel::Info; return level; } 101 | 102 | }; 103 | 104 | } // namespace LightGBM 105 | #endif // LightGBM_UTILS_LOG_H_ 106 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_dataset.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | require(Matrix) 3 | 4 | context("testing lgb.Dataset functionality") 5 | 6 | data(agaricus.test, package='lightgbm') 7 | test_data <- agaricus.test$data[1:100,] 8 | test_label <- agaricus.test$label[1:100] 9 | 10 | test_that("lgb.Dataset: basic construction, saving, loading", { 11 | # from sparse matrix 12 | dtest1 <- lgb.Dataset(test_data, label=test_label) 13 | # from dense matrix 14 | dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label) 15 | expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) 16 | 17 | # save to a local file 18 | tmp_file <- tempfile('lgb.Dataset_') 19 | lgb.Dataset.save(dtest1, tmp_file) 20 | # read from a local file 21 | dtest3 <- lgb.Dataset(tmp_file) 22 | lgb.Dataset.construct(dtest3) 23 | unlink(tmp_file) 24 | expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) 25 | }) 26 | 27 | test_that("lgb.Dataset: getinfo & setinfo", { 28 | dtest <- lgb.Dataset(test_data) 29 | dtest$construct() 30 | 31 | setinfo(dtest, 'label', test_label) 32 | labels <- getinfo(dtest, 'label') 33 | expect_equal(test_label, getinfo(dtest, 'label')) 34 | 35 | expect_true(length(getinfo(dtest, 'weight')) == 0) 36 | expect_true(length(getinfo(dtest, 'init_score')) == 0) 37 | 38 | # any other label should error 39 | expect_error(setinfo(dtest, 'asdf', test_label)) 40 | }) 41 | 42 | test_that("lgb.Dataset: slice, dim", { 43 | dtest <- lgb.Dataset(test_data, 
label=test_label) 44 | lgb.Dataset.construct(dtest) 45 | expect_equal(dim(dtest), dim(test_data)) 46 | dsub1 <- slice(dtest, 1:42) 47 | lgb.Dataset.construct(dsub1) 48 | expect_equal(nrow(dsub1), 42) 49 | expect_equal(ncol(dsub1), ncol(test_data)) 50 | }) 51 | 52 | test_that("lgb.Dataset: colnames", { 53 | dtest <- lgb.Dataset(test_data, label=test_label) 54 | expect_equal(colnames(dtest), colnames(test_data)) 55 | lgb.Dataset.construct(dtest) 56 | expect_equal(colnames(dtest), colnames(test_data)) 57 | expect_error( colnames(dtest) <- 'asdf') 58 | new_names <- make.names(1:ncol(test_data)) 59 | expect_silent(colnames(dtest) <- new_names) 60 | expect_equal(colnames(dtest), new_names) 61 | }) 62 | 63 | test_that("lgb.Dataset: nrow is correct for a very sparse matrix", { 64 | nr <- 1000 65 | x <- rsparsematrix(nr, 100, density=0.0005) 66 | # we want it very sparse, so that last rows are empty 67 | expect_lt(max(x@i), nr) 68 | dtest <- lgb.Dataset(x) 69 | expect_equal(dim(dtest), dim(x)) 70 | }) 71 | 72 | test_that("lgb.Dataset: Dataset should be able to construct from matrix and return non-null handle", { 73 | rawData <- matrix(runif(1000),ncol=10) 74 | handle <- NA_real_ 75 | ref_handle <- NULL 76 | handle <- lightgbm:::lgb.call("LGBM_DatasetCreateFromMat_R" 77 | , ret = handle 78 | , rawData 79 | , nrow(rawData) 80 | , ncol(rawData) 81 | , lightgbm:::lgb.params2str(params=list()) 82 | , ref_handle) 83 | expect_false(is.na(handle)) 84 | }) 85 | -------------------------------------------------------------------------------- /R-package/R/lgb.plot.importance.R: -------------------------------------------------------------------------------- 1 | #' Plot feature importance as a bar graph 2 | #' 3 | #' Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph. 4 | #' 5 | #' @param tree_imp a \code{data.table} returned by \code{\link{lgb.importance}}. 6 | #' @param top_n maximal number of top features to include into the plot. 7 | #' @param measure the name of importance measure to plot, can be "Gain", "Cover" or "Frequency". 8 | #' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names. 9 | #' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}. 10 | #' 11 | #' @details 12 | #' The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. 13 | #' Features are shown ranked in a decreasing importance order. 14 | #' 15 | #' @return 16 | #' The \code{lgb.plot.importance} function creates a \code{barplot} 17 | #' and silently returns a processed data.table with \code{top_n} features sorted by defined importance. 
18 | #' 19 | #' @examples 20 | #' \dontrun{ 21 | #' data(agaricus.train, package = "lightgbm") 22 | #' train <- agaricus.train 23 | #' dtrain <- lgb.Dataset(train$data, label = train$label) 24 | #' 25 | #' params = list(objective = "binary", 26 | #' learning_rate = 0.01, num_leaves = 63, max_depth = -1, 27 | #' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) 28 | #' model <- lgb.train(params, dtrain, 20) 29 | #' 30 | 31 | #' tree_imp <- lgb.importance(model, percentage = TRUE) 32 | #' lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain") 33 | #' } 34 | #' @importFrom graphics barplot par 35 | #' @export 36 | lgb.plot.importance <- function(tree_imp, 37 | top_n = 10, 38 | measure = "Gain", 39 | left_margin = 10, 40 | cex = NULL) { 41 | 42 | # Check for measurement (column names) correctness 43 | measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE) 44 | 45 | # Get top N importance (defaults to 10) 46 | top_n <- min(top_n, nrow(tree_imp)) 47 | 48 | # Parse importance 49 | tree_imp <- tree_imp[order(abs(get(measure)), decreasing = TRUE),][seq_len(top_n),] 50 | 51 | # Attempt to setup a correct cex 52 | if (is.null(cex)) { 53 | cex <- 2.5 / log2(1 + top_n) 54 | } 55 | 56 | # Refresh plot 57 | op <- graphics::par(no.readonly = TRUE) 58 | on.exit(graphics::par(op)) 59 | 60 | # Do some magic plotting 61 | graphics::par(mar = op$mar %>% magrittr::inset(., 2, left_margin)) 62 | 63 | # Do plot 64 | tree_imp[.N:1, 65 | graphics::barplot( 66 | height = get(measure), 67 | names.arg = Feature, 68 | horiz = TRUE, 69 | border = NA, 70 | main = "Feature Importance", 71 | xlab = measure, 72 | cex.names = cex, 73 | las = 1 74 | )] 75 | 76 | # Return invisibly 77 | invisible(tree_imp) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 2.1.2.{build} 2 | 3 | configuration: # a trick to construct a build matrix 4 | - 3.5 5 | - 3.6 6 | 7 | environment: 8 | matrix: 9 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 10 | COMPILER: MSVC 11 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 12 | COMPILER: MINGW 13 | 14 | clone_depth: 50 15 | 16 | install: 17 | - git submodule update --init --recursive # get `compute` folder 18 | - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) 19 | - set PATH=C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% 20 | - set PYTHON_VERSION=%CONFIGURATION% 21 | - ps: >- 22 | switch ($env:PYTHON_VERSION) { 23 | "2.7" {$env:MINICONDA = """C:\Miniconda-x64"""} 24 | "3.4" {$env:MINICONDA = """C:\Miniconda34-x64"""} 25 | "3.5" {$env:MINICONDA = """C:\Miniconda35-x64"""} 26 | "3.6" {$env:MINICONDA = """C:\Miniconda36-x64"""} 27 | default {$env:MINICONDA = """C:\Miniconda36-x64"""} 28 | } 29 | - set PATH=%MINICONDA%;%MINICONDA%\Scripts;%PATH% 30 | - ps: $env:LGB_VER = (Get-Content VERSION.txt).trim() 31 | - conda config --set always_yes yes --set changeps1 no 32 | - conda update -q conda 33 | - conda create -q -n test-env python=%PYTHON_VERSION% numpy nose scipy scikit-learn pandas matplotlib python-graphviz pytest 34 | - activate test-env 35 | 36 | build_script: 37 | - mkdir %APPVEYOR_BUILD_FOLDER%\build && cd %APPVEYOR_BUILD_FOLDER%\build 38 | - cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. && cmake --build . 
--target ALL_BUILD --config Release 39 | 40 | test_script: 41 | - pytest %APPVEYOR_BUILD_FOLDER%\tests\c_api_test\test_.py 42 | - cd %APPVEYOR_BUILD_FOLDER%\python-package && python setup.py sdist --formats gztar 43 | - IF "%COMPILER%"=="MINGW" ( 44 | pip install %APPVEYOR_BUILD_FOLDER%\python-package\dist\lightgbm-%LGB_VER%.tar.gz --install-option=--mingw -v) 45 | ELSE ( 46 | pip install %APPVEYOR_BUILD_FOLDER%\python-package\dist\lightgbm-%LGB_VER%.tar.gz -v) 47 | - pytest %APPVEYOR_BUILD_FOLDER%\tests\python_package_test 48 | - cd %APPVEYOR_BUILD_FOLDER%\examples\python-guide 49 | - ps: >- 50 | @("import matplotlib", "matplotlib.use('Agg')") + (Get-Content "plot_example.py") | Set-Content "plot_example.py" # prevent interactive window mode 51 | - ps: >- 52 | foreach ($file in @(Get-ChildItem *.py)) { 53 | python $file 54 | if ($LastExitCode -ne 0) { $host.SetShouldExit($LastExitCode) } 55 | } # run all examples 56 | - IF "%COMPILER%"=="MINGW" appveyor exit # skip all further steps 57 | - cd %APPVEYOR_BUILD_FOLDER%\python-package && python setup.py bdist_wheel --plat-name=win-amd64 --universal 58 | 59 | artifacts: 60 | - path: Release/lib_lightgbm.dll 61 | name: Library 62 | - path: Release/lightgbm.exe 63 | name: Exe 64 | - path: python-package/dist/* 65 | name: Pip 66 | 67 | deploy: 68 | release: $(APPVEYOR_REPO_TAG_NAME) 69 | provider: GitHub 70 | auth_token: 71 | secure: KR44XwtxY0cLlVpQwY726BvC6gzT0cYTf0ahJ4cSvvS0UVoSJxkR900ICfVXHRoT 72 | artifact: Library,Exe,Pip 73 | force_update: true 74 | draft: true 75 | on: 76 | appveyor_repo_tag: true 77 | -------------------------------------------------------------------------------- /src/metric/metric.cpp: -------------------------------------------------------------------------------- 1 | #include <LightGBM/metric.h> 2 | #include "regression_metric.hpp" 3 | #include "binary_metric.hpp" 4 | #include "rank_metric.hpp" 5 | #include "map_metric.hpp" 6 | #include "multiclass_metric.hpp" 7 | #include "xentropy_metric.hpp" 8 | 9 | namespace LightGBM { 10 | 11 | Metric* Metric::CreateMetric(const std::string& type, const Config& config) { 12 | if (type == std::string("regression") || type == std::string("regression_l2") || type == std::string("l2") || type == std::string("mean_squared_error") || type == std::string("mse")) { 13 | return new L2Metric(config); 14 | } else if (type == std::string("l2_root") || type == std::string("root_mean_squared_error") || type == std::string("rmse")) { 15 | return new RMSEMetric(config); 16 | } else if (type == std::string("regression_l1") || type == std::string("l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) { 17 | return new L1Metric(config); 18 | } else if (type == std::string("quantile")) { 19 | return new QuantileMetric(config); 20 | } else if (type == std::string("huber")) { 21 | return new HuberLossMetric(config); 22 | } else if (type == std::string("fair")) { 23 | return new FairLossMetric(config); 24 | } else if (type == std::string("poisson")) { 25 | return new PoissonMetric(config); 26 | } else if (type == std::string("binary_logloss") || type == std::string("binary")) { 27 | return new BinaryLoglossMetric(config); 28 | } else if (type == std::string("binary_error")) { 29 | return new BinaryErrorMetric(config); 30 | } else if (type == std::string("auc")) { 31 | return new AUCMetric(config); 32 | } else if (type == std::string("ndcg")) { 33 | return new NDCGMetric(config); 34 | } else if (type == std::string("map") || type == std::string("mean_average_precision")) { 35 | return new 
MapMetric(config); 36 | } else if (type == std::string("multi_logloss") || type == std::string("multiclass") || type == std::string("softmax") || type == std::string("multiclassova") || type == std::string("multiclass_ova") || type == std::string("ova") || type == std::string("ovr")) { 37 | return new MultiSoftmaxLoglossMetric(config); 38 | } else if (type == std::string("multi_error")) { 39 | return new MultiErrorMetric(config); 40 | } else if (type == std::string("xentropy") || type == std::string("cross_entropy")) { 41 | return new CrossEntropyMetric(config); 42 | } else if (type == std::string("xentlambda") || type == std::string("cross_entropy_lambda")) { 43 | return new CrossEntropyLambdaMetric(config); 44 | } else if (type == std::string("kldiv") || type == std::string("kullback_leibler")) { 45 | return new KullbackLeiblerDivergence(config); 46 | } else if (type == std::string("mean_absolute_percentage_error") || type == std::string("mape")) { 47 | return new MAPEMetric(config); 48 | } else if (type == std::string("gamma")) { 49 | return new GammaMetric(config); 50 | } else if (type == std::string("gamma_deviance")) { 51 | return new GammaDevianceMetric(config); 52 | } else if (type == std::string("tweedie")) { 53 | return new TweedieMetric(config); 54 | } 55 | return nullptr; 56 | } 57 | 58 | } // namespace LightGBM 59 | -------------------------------------------------------------------------------- /include/LightGBM/tree_learner.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_TREE_LEARNER_H_ 2 | #define LIGHTGBM_TREE_LEARNER_H_ 3 | 4 | 5 | #include <LightGBM/meta.h> 6 | #include <LightGBM/config.h> 7 | #include <string> 8 | 9 | #include <LightGBM/json11.hpp> 10 | 11 | using namespace json11; 12 | 13 | namespace LightGBM { 14 | 15 | /*! \brief forward declaration */ 16 | class Tree; 17 | class Dataset; 18 | class ObjectiveFunction; 19 | 20 | /*! 21 | * \brief Interface for tree learner 22 | */ 23 | class TreeLearner { 24 | public: 25 | /*! \brief virtual destructor */ 26 | virtual ~TreeLearner() {} 27 | 28 | /*! 29 | * \brief Initialize tree learner with training dataset 30 | * \param train_data The used training data 31 | * \param is_constant_hessian True if all hessians share the same value 32 | */ 33 | virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; 34 | 35 | virtual void ResetTrainingData(const Dataset* train_data) = 0; 36 | 37 | /*! 38 | * \brief Reset tree configs 39 | * \param config config of tree 40 | */ 41 | virtual void ResetConfig(const Config* config) = 0; 42 | 43 | /*! 44 | * \brief training tree model on dataset 45 | * \param gradients The first order gradients 46 | * \param hessians The second order gradients 47 | * \param is_constant_hessian True if all hessians share the same value 48 | * \return A trained tree 49 | */ 50 | virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian, 51 | Json& forced_split_json) = 0; 52 | 53 | /*! 54 | * \brief use an existing tree to fit the new gradients and hessians. 55 | */ 56 | virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0; 57 | 58 | virtual Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred, 59 | const score_t* gradients, const score_t* hessians) = 0; 60 | 61 | /*! 
62 | * \brief Set bagging data 63 | * \param used_indices Used data indices 64 | * \param num_data Number of used data 65 | */ 66 | virtual void SetBaggingData(const data_size_t* used_indices, 67 | data_size_t num_data) = 0; 68 | 69 | /*! 70 | * \brief Use the last trained tree to predict score, then add it to out_score 71 | * \param out_score output score 72 | */ 73 | virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0; 74 | 75 | virtual void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, const double* prediction, 76 | data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0; 77 | 78 | TreeLearner() = default; 79 | /*! \brief Disable copy */ 80 | TreeLearner& operator=(const TreeLearner&) = delete; 81 | /*! \brief Disable copy */ 82 | TreeLearner(const TreeLearner&) = delete; 83 | 84 | /*! 85 | * \brief Create object of tree learner 86 | * \param learner_type Type of tree learner 87 | * \param device_type Type of device 88 | * \param config config of tree 89 | */ 90 | static TreeLearner* CreateTreeLearner(const std::string& learner_type, 91 | const std::string& device_type, 92 | const Config* config); 93 | }; 94 | 95 | } // namespace LightGBM 96 | 97 | #endif // LIGHTGBM_TREE_LEARNER_H_ 98 | -------------------------------------------------------------------------------- /docs/Parameters-Tuning.rst: -------------------------------------------------------------------------------- 1 | Parameters Tuning 2 | ================= 3 | 4 | This page contains parameters tuning guides for different scenarios. 5 | 6 | **List of other helpful links** 7 | 8 | - `Parameters <./Parameters.rst>`__ 9 | - `Python API <./Python-API.rst>`__ 10 | 11 | Tune Parameters for the Leaf-wise (Best-first) Tree 12 | --------------------------------------------------- 13 | 14 | LightGBM uses the `leaf-wise <./Features.rst#leaf-wise-best-first-tree-growth>`__ tree growth algorithm, while many other popular tools use depth-wise tree growth. 15 | Compared with depth-wise growth, the leaf-wise algorithm can converge much faster. 16 | However, leaf-wise growth may over-fit if not used with the appropriate parameters. 17 | 18 | To get good results using a leaf-wise tree, these are some important parameters: 19 | 20 | 1. ``num_leaves``. This is the main parameter to control the complexity of the tree model. 21 | Theoretically, we can set ``num_leaves = 2^(max_depth)`` to obtain the same number of leaves as depth-wise tree. 22 | However, this simple conversion is not good in practice. 23 | The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting. 24 | Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``. 25 | For example, when the ``max_depth=7`` the depth-wise tree can get good accuracy, 26 | but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise. 27 | 28 | 2. ``min_data_in_leaf``. This is a very important parameter to prevent over-fitting in a leaf-wise tree. 29 | Its optimal value depends on the number of training samples and ``num_leaves``. 30 | Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. 31 | In practice, setting it to hundreds or thousands is enough for a large dataset. 32 | 33 | 3. ``max_depth``. You can also use ``max_depth`` to limit the tree depth explicitly. A short sketch combining these three parameters follows below. 
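As an illustration (the specific values are hypothetical placeholders, not tuned recommendations), these three parameters might be combined like this with the R package::

    library(lightgbm)
    data(agaricus.train, package = "lightgbm")
    dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)

    # keep num_leaves well below 2^max_depth (here 2^7 = 128) to reduce over-fitting
    params <- list(objective = "binary",
                   max_depth = 7,
                   num_leaves = 70,
                   min_data_in_leaf = 100)
    model <- lgb.train(params, dtrain, 50)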
34 | 35 | For Faster Speed 36 | ---------------- 37 | 38 | - Use bagging by setting ``bagging_fraction`` and ``bagging_freq`` 39 | 40 | - Use feature sub-sampling by setting ``feature_fraction`` 41 | 42 | - Use small ``max_bin`` 43 | 44 | - Use ``save_binary`` to speed up data loading in future learning 45 | 46 | - Use parallel learning, refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ 47 | 48 | 49 | For Better Accuracy 50 | ------------------- 51 | 52 | - Use large ``max_bin`` (may be slower) 53 | 54 | - Use small ``learning_rate`` with large ``num_iterations`` 55 | 56 | - Use large ``num_leaves`` (may cause over-fitting) 57 | 58 | - Use bigger training data 59 | 60 | - Try ``dart`` 61 | 62 | Deal with Over-fitting 63 | ---------------------- 64 | 65 | - Use small ``max_bin`` 66 | 67 | - Use small ``num_leaves`` 68 | 69 | - Use ``min_data_in_leaf`` and ``min_sum_hessian_in_leaf`` 70 | 71 | - Use bagging by setting ``bagging_fraction`` and ``bagging_freq`` 72 | 73 | - Use feature sub-sampling by setting ``feature_fraction`` 74 | 75 | - Use bigger training data 76 | 77 | - Try ``lambda_l1``, ``lambda_l2`` and ``min_gain_to_split`` for regularization 78 | 79 | - Try ``max_depth`` to avoid growing a deep tree 80 | -------------------------------------------------------------------------------- /R-package/demo/multiclass_custom_objective.R: -------------------------------------------------------------------------------- 1 | require(lightgbm) 2 | 3 | # We load the default iris dataset shipped with R 4 | data(iris) 5 | 6 | # We must convert factors to numeric 7 | # They must be starting from number 0 to use multiclass 8 | # For instance: 0, 1, 2, 3, 4, 5... 9 | iris$Species <- as.numeric(as.factor(iris$Species)) - 1 10 | 11 | # We cut the data set into 80% train and 20% validation 12 | # The last 10 samples of each class are for validation 13 | 14 | train <- as.matrix(iris[c(1:40, 51:90, 101:140), ]) 15 | test <- as.matrix(iris[c(41:50, 91:100, 141:150), ]) 16 | dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5]) 17 | dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5]) 18 | valids <- list(test = dtest) 19 | 20 | # Method 1 of training with built-in multiclass objective 21 | model_builtin <- lgb.train(list(), 22 | dtrain, 23 | 100, 24 | valids, 25 | min_data = 1, 26 | learning_rate = 1, 27 | early_stopping_rounds = 10, 28 | objective = "multiclass", 29 | metric = "multi_logloss", 30 | num_class = 3) 31 | 32 | preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE) 33 | 34 | # Method 2 of training with custom objective function 35 | 36 | # User defined objective function, given prediction, return gradient and second order gradient 37 | custom_multiclass_obj = function(preds, dtrain) { 38 | labels = getinfo(dtrain, "label") 39 | 40 | # preds is a matrix with rows corresponding to samples and columns corresponding to choices 41 | preds = matrix(preds, nrow = length(labels)) 42 | 43 | # to prevent overflow, normalize preds by row 44 | preds = preds - apply(preds, 1, max) 45 | prob = exp(preds) / rowSums(exp(preds)) 46 | 47 | # compute gradient 48 | grad = prob 49 | grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1 50 | 51 | # compute hessian (approximation) 52 | hess = 2 * prob * (1 - prob) 53 | 54 | return(list(grad = grad, hess = hess)) 55 | } 56 | 57 | # define custom metric 58 | custom_multiclass_metric = function(preds, dtrain) { 59 | labels = getinfo(dtrain, "label") 60 | preds = matrix(preds, 
nrow = length(labels)) 61 | preds = preds - apply(preds, 1, max) 62 | prob = exp(preds) / rowSums(exp(preds)) 63 | 64 | return(list(name = "error", 65 | value = -mean(log(prob[cbind(1:length(labels), labels + 1)])), 66 | higher_better = FALSE)) 67 | 68 | } 69 | 70 | model_custom <- lgb.train(list(), 71 | dtrain, 72 | 100, 73 | valids, 74 | min_data = 1, 75 | learning_rate = 1, 76 | early_stopping_rounds = 10, 77 | objective = custom_multiclass_obj, 78 | eval = custom_multiclass_metric, 79 | num_class = 3) 80 | 81 | preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE) 82 | 83 | # compare predictions 84 | identical(preds_builtin, preds_custom) 85 | 86 | -------------------------------------------------------------------------------- /include/LightGBM/utils/random.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_UTILS_RANDOM_H_ 2 | #define LIGHTGBM_UTILS_RANDOM_H_ 3 | 4 | #include <cstdint> 5 | 6 | #include <random> 7 | #include <set> 8 | #include <vector> 9 | 10 | namespace LightGBM { 11 | 12 | /*! 13 | * \brief A wrapper for random generator 14 | */ 15 | class Random { 16 | public: 17 | /*! 18 | * \brief Constructor, with random seed 19 | */ 20 | Random() { 21 | std::random_device rd; 22 | auto generator = std::mt19937(rd()); 23 | std::uniform_int_distribution<int> distribution(0, x); 24 | x = distribution(generator); 25 | } 26 | /*! 27 | * \brief Constructor, with specific seed 28 | */ 29 | Random(int seed) { 30 | x = seed; 31 | } 32 | /*! 33 | * \brief Generate random integer in int16 range, i.e. [0, 32768) 34 | * \param lower_bound lower bound 35 | * \param upper_bound upper bound 36 | * \return The random integer between [lower_bound, upper_bound) 37 | */ 38 | inline int NextShort(int lower_bound, int upper_bound) { 39 | return (RandInt16()) % (upper_bound - lower_bound) + lower_bound; 40 | } 41 | 42 | /*! 43 | * \brief Generate random integer, int32 range 44 | * \param lower_bound lower bound 45 | * \param upper_bound upper bound 46 | * \return The random integer between [lower_bound, upper_bound) 47 | */ 48 | inline int NextInt(int lower_bound, int upper_bound) { 49 | return (RandInt32()) % (upper_bound - lower_bound) + lower_bound; 50 | } 51 | 52 | /*! 53 | * \brief Generate random float data 54 | * \return The random float between [0.0, 1.0) 55 | */ 56 | inline float NextFloat() { 57 | // get random float in [0,1) 58 | return static_cast<float>(RandInt16()) / (32768.0f); 59 | } 60 | /*! 
61 | * \brief Sample K data from {0,1,...,N-1} 62 | * \param N Size of the population to sample from 63 | * \param K Number of elements to sample 64 | * \return K Ordered sampled data from {0,1,...,N-1} 65 | */ 66 | inline std::vector<int> Sample(int N, int K) { 67 | std::vector<int> ret; 68 | ret.reserve(K); 69 | if (K > N || K <= 0) { 70 | return ret; 71 | } else if (K == N) { 72 | for (int i = 0; i < N; ++i) { 73 | ret.push_back(i); 74 | } 75 | } else if (K > 1 && K > (N / std::log2(K))) { 76 | for (int i = 0; i < N; ++i) { 77 | double prob = (K - ret.size()) / static_cast<double>(N - i); 78 | if (NextFloat() < prob) { 79 | ret.push_back(i); 80 | } 81 | } 82 | } else { 83 | std::set<int> sample_set; 84 | while (static_cast<int>(sample_set.size()) < K) { 85 | int next = RandInt32() % N; 86 | if (sample_set.count(next) == 0) { 87 | sample_set.insert(next); 88 | } 89 | } 90 | for (auto iter = sample_set.begin(); iter != sample_set.end(); ++iter) { 91 | ret.push_back(*iter); 92 | } 93 | } 94 | return ret; 95 | } 96 | private: 97 | inline int RandInt16() { 98 | x = (214013 * x + 2531011); 99 | return static_cast<int>((x >> 16) & 0x7FFF); 100 | } 101 | 102 | inline int RandInt32() { 103 | x = (214013 * x + 2531011); 104 | return static_cast<int>(x & 0x7FFFFFFF); 105 | } 106 | 107 | unsigned int x = 123456789; 108 | }; 109 | 110 | 111 | } // namespace LightGBM 112 | 113 | #endif // LIGHTGBM_UTILS_RANDOM_H_ 114 | -------------------------------------------------------------------------------- /R-package/tests/testthat/test_basic.R: -------------------------------------------------------------------------------- 1 | context("basic functions") 2 | 3 | data(agaricus.train, package='lightgbm') 4 | data(agaricus.test, package='lightgbm') 5 | train <- agaricus.train 6 | test <- agaricus.test 7 | 8 | windows_flag = grepl('Windows', Sys.info()[['sysname']]) 9 | 10 | test_that("train and predict binary classification", { 11 | nrounds = 10 12 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5, 13 | nrounds = nrounds, objective = "binary", metric="binary_error") 14 | expect_false(is.null(bst$record_evals)) 15 | record_results <- lgb.get.eval.result(bst, "train", "binary_error") 16 | expect_lt(min(record_results), 0.02) 17 | 18 | pred <- predict(bst, test$data) 19 | expect_equal(length(pred), 1611) 20 | 21 | pred1 <- predict(bst, train$data, num_iteration = 1) 22 | expect_equal(length(pred1), 6513) 23 | err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label) 24 | err_log <- record_results[1] 25 | expect_lt(abs(err_pred1 - err_log), 10e-6) 26 | }) 27 | 28 | 29 | test_that("train and predict softmax", { 30 | lb <- as.numeric(iris$Species) - 1 31 | 32 | bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb, 33 | num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20, 34 | objective = "multiclass", metric="multi_error", num_class=3) 35 | 36 | expect_false(is.null(bst$record_evals)) 37 | record_results <- lgb.get.eval.result(bst, "train", "multi_error") 38 | expect_lt(min(record_results), 0.03) 39 | 40 | pred <- predict(bst, as.matrix(iris[, -5])) 41 | expect_equal(length(pred), nrow(iris) * 3) 42 | }) 43 | 44 | 45 | test_that("use of multiple eval metrics works", { 46 | bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4, 47 | learning_rate=1, nrounds = 10, objective = "binary", 48 | metric = list("binary_error","auc","binary_logloss") ) 49 | expect_false(is.null(bst$record_evals)) 50 | }) 51 | 52 | 53 | test_that("training continuation works", { 54 | dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE) 55 | 
watchlist = list(train=dtrain) 56 | param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1) 57 | 58 | # for the reference, use 10 iterations at once: 59 | bst <- lgb.train(param, dtrain, nrounds = 10, watchlist) 60 | err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10) 61 | # first 5 iterations: 62 | bst1 <- lgb.train(param, dtrain, nrounds = 5, watchlist) 63 | # test continuing from a model in file 64 | lgb.save(bst1, "lightgbm.model") 65 | # continue for 5 more: 66 | bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = bst1) 67 | err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10) 68 | expect_lt(abs(err_bst - err_bst2), 0.01) 69 | 70 | bst2 <- lgb.train(param, dtrain, nrounds = 5, watchlist, init_model = "lightgbm.model") 71 | err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10) 72 | expect_lt(abs(err_bst - err_bst2), 0.01) 73 | }) 74 | 75 | 76 | test_that("cv works", { 77 | dtrain <- lgb.Dataset(train$data, label=train$label) 78 | params <- list(objective="regression", metric="l2,l1") 79 | bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10) 80 | expect_false(is.null(bst$record_evals)) 81 | }) 82 | -------------------------------------------------------------------------------- /docs/Quick-Start.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | =========== 3 | 4 | This is a quick start guide for LightGBM CLI version. 5 | 6 | Follow the `Installation Guide <./Installation-Guide.rst>`__ to install LightGBM first. 7 | 8 | **List of other helpful links** 9 | 10 | - `Parameters <./Parameters.rst>`__ 11 | 12 | - `Parameters Tuning <./Parameters-Tuning.rst>`__ 13 | 14 | - `Python-package Quick Start <./Python-Intro.rst>`__ 15 | 16 | - `Python API <./Python-API.rst>`__ 17 | 18 | Training Data Format 19 | -------------------- 20 | 21 | LightGBM supports input data files with `CSV`_, `TSV`_ and `LibSVM`_ formats. 22 | 23 | Files can be with or without headers. 24 | 25 | The label column can be specified either by index or by name. 26 | 27 | Some columns can be ignored. 28 | 29 | Categorical Feature Support 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | LightGBM can use categorical features directly (without one-hot encoding). 33 | The experiment on `Expo data`_ shows about 8x speed-up compared with one-hot encoding. 34 | 35 | For the setting details, please refer to `Parameters <./Parameters.rst#categorical_feature>`__. 36 | 37 | Weight and Query/Group Data 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | LightGBM also supports weighted training; it needs an additional `weight data <./Parameters.rst#weight-data>`__ file. 41 | And it needs an additional `query data <./Parameters.rst#query-data>`_ file for ranking tasks. 42 | 43 | Also, weight and query data can be specified as columns in the training data, in the same manner as the label. 44 | 45 | Parameters Quick Look 46 | --------------------- 47 | 48 | The parameters format is ``key1=value1 key2=value2 ...``. 49 | 50 | Parameters can be set both in the config file and on the command line. 51 | If one parameter appears in both the command line and the config file, LightGBM will use the parameter from the command line. 
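As a minimal illustration (the file name and the values are placeholders, not recommendations), a config file in this format might look like::

    task = train
    objective = binary
    data = binary.train
    num_trees = 100
    learning_rate = 0.1

Any of these keys can equally be passed on the command line, where they take priority over the config file as described above.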
52 | 53 | The most important parameters which new users should take a look at are located in the `Core Parameters <./Parameters.rst#core-parameters>`__ 54 | and the top of `Learning Control Parameters <./Parameters.rst#learning-control-parameters>`__ 55 | sections of the full detailed list of `LightGBM's parameters <./Parameters.rst>`__. 56 | 57 | Run LightGBM 58 | ------------ 59 | 60 | For Windows: 61 | 62 | :: 63 | 64 | lightgbm.exe config=your_config_file other_args ... 65 | 66 | For Unix: 67 | 68 | :: 69 | 70 | ./lightgbm config=your_config_file other_args ... 71 | 72 | Parameters can be set both in config file and command line, and the parameters in command line have higher priority than in config file. 73 | For example, the following command line will keep ``num_trees=10`` and ignore the same parameter in the config file. 74 | 75 | :: 76 | 77 | ./lightgbm config=train.conf num_trees=10 78 | 79 | Examples 80 | -------- 81 | 82 | - `Binary Classification <https://github.com/Microsoft/LightGBM/tree/master/examples/binary_classification>`__ 83 | 84 | - `Regression <https://github.com/Microsoft/LightGBM/tree/master/examples/regression>`__ 85 | 86 | - `Lambdarank <https://github.com/Microsoft/LightGBM/tree/master/examples/lambdarank>`__ 87 | 88 | - `Parallel Learning <https://github.com/Microsoft/LightGBM/tree/master/examples/parallel_learning>`__ 89 | 90 | .. _CSV: https://en.wikipedia.org/wiki/Comma-separated_values 91 | 92 | .. _TSV: https://en.wikipedia.org/wiki/Tab-separated_values 93 | 94 | .. _LibSVM: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ 95 | 96 | .. _Expo data: http://stat-computing.org/dataexpo/2009/ 97 | -------------------------------------------------------------------------------- /src/boosting/gbdt_prediction.cpp: -------------------------------------------------------------------------------- 1 | #include "gbdt.h" 2 | 3 | #include <cstring> 4 | #include <unordered_map> 5 | #include <LightGBM/prediction_early_stop.h> 6 | 7 | namespace LightGBM { 8 | 9 | void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { 10 | int early_stop_round_counter = 0; 11 | // set zero 12 | std::memset(output, 0, sizeof(double) * num_tree_per_iteration_); 13 | for (int i = 0; i < num_iteration_for_pred_; ++i) { 14 | // predict all the trees for one iteration 15 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 16 | output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features); 17 | } 18 | // check early stopping 19 | ++early_stop_round_counter; 20 | if (early_stop->round_period == early_stop_round_counter) { 21 | if (early_stop->callback_function(output, num_tree_per_iteration_)) { 22 | return; 23 | } 24 | early_stop_round_counter = 0; 25 | } 26 | } 27 | } 28 | 29 | void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const { 30 | int early_stop_round_counter = 0; 31 | // set zero 32 | std::memset(output, 0, sizeof(double) * num_tree_per_iteration_); 33 | for (int i = 0; i < num_iteration_for_pred_; ++i) { 34 | // predict all the trees for one iteration 35 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 36 | output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features); 37 | } 38 | // check early stopping 39 | ++early_stop_round_counter; 40 | if (early_stop->round_period == early_stop_round_counter) { 41 | if (early_stop->callback_function(output, num_tree_per_iteration_)) { 42 | return; 43 | } 44 | early_stop_round_counter = 0; 45 | } 46 | } 47 | } 48 | 49 | void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { 50 | PredictRaw(features, output, early_stop); 51 | if (average_output_) { 52 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 53 | output[k] /= num_iteration_for_pred_; 54 | } 55 | } else if (objective_function_ != 
nullptr) { 56 | objective_function_->ConvertOutput(output, output); 57 | } 58 | } 59 | 60 | void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const { 61 | PredictRawByMap(features, output, early_stop); 62 | if (average_output_) { 63 | for (int k = 0; k < num_tree_per_iteration_; ++k) { 64 | output[k] /= num_iteration_for_pred_; 65 | } 66 | } else if (objective_function_ != nullptr) { 67 | objective_function_->ConvertOutput(output, output); 68 | } 69 | } 70 | 71 | void GBDT::PredictLeafIndex(const double* features, double* output) const { 72 | int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_; 73 | for (int i = 0; i < total_tree; ++i) { 74 | output[i] = models_[i]->PredictLeafIndex(features); 75 | } 76 | } 77 | 78 | void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const { 79 | int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_; 80 | for (int i = 0; i < total_tree; ++i) { 81 | output[i] = models_[i]->PredictLeafIndexByMap(features); 82 | } 83 | } 84 | 85 | } // namespace LightGBM 86 | -------------------------------------------------------------------------------- /src/treelearner/feature_parallel_tree_learner.cpp: -------------------------------------------------------------------------------- 1 | #include "parallel_tree_learner.h" 2 | 3 | #include <cstring> 4 | 5 | #include <vector> 6 | 7 | namespace LightGBM { 8 | 9 | 10 | template <typename TREELEARNER_T> 11 | FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const Config* config) 12 | :TREELEARNER_T(config) { 13 | } 14 | 15 | template <typename TREELEARNER_T> 16 | FeatureParallelTreeLearner<TREELEARNER_T>::~FeatureParallelTreeLearner() { 17 | 18 | } 19 | 20 | template <typename TREELEARNER_T> 21 | void FeatureParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, bool is_constant_hessian) { 22 | TREELEARNER_T::Init(train_data, is_constant_hessian); 23 | rank_ = Network::rank(); 24 | num_machines_ = Network::num_machines(); 25 | input_buffer_.resize((sizeof(SplitInfo) + sizeof(uint32_t) * this->config_->max_cat_threshold) * 2); 26 | output_buffer_.resize((sizeof(SplitInfo) + sizeof(uint32_t) * this->config_->max_cat_threshold) * 2); 27 | } 28 | 29 | 30 | template <typename TREELEARNER_T> 31 | void FeatureParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { 32 | TREELEARNER_T::BeforeTrain(); 33 | // get feature partition 34 | std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>()); 35 | std::vector<int> num_bins_distributed(num_machines_, 0); 36 | for (int i = 0; i < this->train_data_->num_total_features(); ++i) { 37 | int inner_feature_index = this->train_data_->InnerFeatureIndex(i); 38 | if (inner_feature_index == -1) { continue; } 39 | if (this->is_feature_used_[inner_feature_index]) { 40 | int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed)); 41 | feature_distribution[cur_min_machine].push_back(inner_feature_index); 42 | num_bins_distributed[cur_min_machine] += this->train_data_->FeatureNumBin(inner_feature_index); 43 | this->is_feature_used_[inner_feature_index] = false; 44 | } 45 | } 46 | // get local used features 47 | for (auto fid : feature_distribution[rank_]) { 48 | this->is_feature_used_[fid] = true; 49 | } 50 | } 51 | 52 | template <typename TREELEARNER_T> 53 | void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) { 54 | TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract); 55 | SplitInfo smaller_best_split, larger_best_split; 56 | // get best split at smaller leaf 57 | smaller_best_split = 
this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; 58 | // find local best split for larger leaf 59 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 60 | larger_best_split = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; 61 | } 62 | // sync global best info 63 | SyncUpGlobalBestSplit(input_buffer_.data(), input_buffer_.data(), &smaller_best_split, &larger_best_split, this->config_->max_cat_threshold); 64 | // update best split 65 | this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best_split; 66 | if (this->larger_leaf_splits_->LeafIndex() >= 0) { 67 | this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best_split; 68 | } 69 | } 70 | 71 | // instantiate template classes, otherwise linker cannot find the code 72 | template class FeatureParallelTreeLearner<GPUTreeLearner>; 73 | template class FeatureParallelTreeLearner<SerialTreeLearner>; 74 | } // namespace LightGBM 75 | -------------------------------------------------------------------------------- /tests/python_package_test/test_basic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import os 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgb 8 | import numpy as np 9 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file 10 | from sklearn.model_selection import train_test_split 11 | 12 | 13 | class TestBasic(unittest.TestCase): 14 | 15 | def test(self): 16 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 17 | train_data = lgb.Dataset(X_train, label=y_train) 18 | valid_data = train_data.create_valid(X_test, label=y_test) 19 | 20 | params = { 21 | "objective": "binary", 22 | "metric": "auc", 23 | "min_data": 10, 24 | "num_leaves": 15, 25 | "verbose": -1, 26 | "num_threads": 1, 27 | "max_bin": 255 28 | } 29 | bst = lgb.Booster(params, train_data) 30 | bst.add_valid(valid_data, "valid_1") 31 | 32 | for i in range(30): 33 | bst.update() 34 | if i % 10 == 0: 35 | print(bst.eval_train(), bst.eval_valid()) 36 | bst.save_model("model.txt") 37 | pred_from_matr = bst.predict(X_test) 38 | with tempfile.NamedTemporaryFile() as f: 39 | tname = f.name 40 | with open(tname, "w+b") as f: 41 | dump_svmlight_file(X_test, y_test, f) 42 | pred_from_file = bst.predict(tname) 43 | os.remove(tname) 44 | self.assertEqual(len(pred_from_matr), len(pred_from_file)) 45 | for preds in zip(pred_from_matr, pred_from_file): 46 | self.assertAlmostEqual(*preds, places=15) 47 | 48 | # check saved model persistence 49 | bst = lgb.Booster(params, model_file="model.txt") 50 | pred_from_model_file = bst.predict(X_test) 51 | self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) 52 | for preds in zip(pred_from_matr, pred_from_model_file): 53 | # we need to check the consistency of model file here, so test for exact equal 54 | self.assertEqual(*preds) 55 | 56 | # check early stopping is working. 
Make it stop very early, so the scores should be very close to zero 57 | pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} 58 | pred_early_stopping = bst.predict(X_test, **pred_parameter) 59 | self.assertEqual(len(pred_from_matr), len(pred_early_stopping)) 60 | for preds in zip(pred_early_stopping, pred_from_matr): 61 | # scores likely to be different, but prediction should still be the same 62 | self.assertEqual(preds[0] > 0, preds[1] > 0) 63 | 64 | def test_chunked_dataset(self): 65 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 66 | 67 | chunk_size = X_train.shape[0] // 10 + 1 68 | X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)] 69 | X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)] 70 | 71 | train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100}) 72 | valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100}) 73 | 74 | train_data.construct() 75 | valid_data.construct() 76 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare_rules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare_rules.R 3 | \name{lgb.prepare_rules} 4 | \alias{lgb.prepare_rules} 5 | \title{Data preparator for LightGBM datasets with rules (numeric)} 6 | \usage{ 7 | lgb.prepare_rules(data, rules = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | 12 | \item{rules}{A set of rules from the data preparator, if already used.} 13 | } 14 | \value{ 15 | A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 16 | } 17 | \description{ 18 | Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(iris) 24 | 25 | str(iris) 26 | # 'data.frame': 150 obs. of 5 variables: 27 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 28 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 29 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 30 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 31 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 32 | 33 | new_iris <- lgb.prepare_rules(data = iris) # Autoconverter 34 | str(new_iris$data) 35 | # 'data.frame': 150 obs. of 5 variables: 36 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 37 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 38 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 39 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 40 | # $ Species : num 1 1 1 1 1 1 1 1 1 1 ... 
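
# The cleaned data must be converted to a matrix before creating a lgb.Dataset.
# (A sketch only; treating Species as a 0-based multiclass label is purely illustrative.)
mat <- as.matrix(new_iris$data)
dtrain <- lgb.Dataset(data = mat[, 1:4], label = mat[, 5] - 1)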
41 | 42 | data(iris) # Erase iris dataset 43 | iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA) 44 | # Warning message: 45 | # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, : 46 | # invalid factor level, NA generated 47 | 48 | # Use conversion using known rules 49 | # Unknown factors become 0, excellent for sparse datasets 50 | newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules) 51 | 52 | # Unknown factor is now zero, perfect for sparse datasets 53 | newer_iris$data[1, ] # Species became 0 as it is an unknown factor 54 | # Sepal.Length Sepal.Width Petal.Length Petal.Width Species 55 | # 1 5.1 3.5 1.4 0.2 0 56 | 57 | newer_iris$data[1, 5] <- 1 # Put back real initial value 58 | 59 | # Is the newly created dataset equal? YES! 60 | all.equal(new_iris$data, newer_iris$data) 61 | # [1] TRUE 62 | 63 | # Can we test our own rules? 64 | data(iris) # Erase iris dataset 65 | 66 | # We remapped values differently 67 | personal_rules <- list(Species = c("setosa" = 3, 68 | "versicolor" = 2, 69 | "virginica" = 1)) 70 | newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules) 71 | str(newest_iris$data) # SUCCESS! 72 | # 'data.frame': 150 obs. of 5 variables: 73 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 74 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 75 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 76 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 77 | # $ Species : num 3 3 3 3 3 3 3 3 3 3 ... 78 | 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /include/LightGBM/dataset_loader.h: -------------------------------------------------------------------------------- 1 | #ifndef LIGHTGBM_DATASET_LOADER_H_ 2 | #define LIGHTGBM_DATASET_LOADER_H_ 3 | 4 | #include <LightGBM/dataset.h> 5 | 6 | namespace LightGBM { 7 | 8 | class DatasetLoader { 9 | public: 10 | 11 | LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename); 12 | 13 | LIGHTGBM_EXPORT ~DatasetLoader(); 14 | 15 | LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file, int rank, int num_machines); 16 | 17 | LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, const char* initscore_file) { 18 | return LoadFromFile(filename, initscore_file, 0, 1); 19 | } 20 | 21 | LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const char* initscore_file, const Dataset* train_data); 22 | 23 | LIGHTGBM_EXPORT Dataset* CostructFromSampleData(double** sample_values, 24 | int** sample_indices, int num_col, const int* num_per_col, 25 | size_t total_sample_size, data_size_t num_data); 26 | 27 | /*! \brief Disable copy */ 28 | DatasetLoader& operator=(const DatasetLoader&) = delete; 29 | /*! 
\brief Disable copy */ 30 | DatasetLoader(const DatasetLoader&) = delete; 31 | 32 | private: 33 | 34 | Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); 35 | 36 | void SetHeader(const char* filename); 37 | 38 | void CheckDataset(const Dataset* dataset); 39 | 40 | std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); 41 | 42 | std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data); 43 | 44 | std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); 45 | 46 | void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset); 47 | 48 | /*! \brief Extract local features from memory */ 49 | void ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset); 50 | 51 | /*! \brief Extract local features from file */ 52 | void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset); 53 | 54 | /*! \brief Check can load from binary file */ 55 | std::string CheckCanLoadFromBin(const char* filename); 56 | 57 | const Config& config_; 58 | /*! \brief Random generator*/ 59 | Random random_; 60 | /*! \brief prediction function for initial model */ 61 | const PredictFunction& predict_fun_; 62 | /*! \brief number of classes */ 63 | int num_class_; 64 | /*! \brief index of label column */ 65 | int label_idx_; 66 | /*! \brief index of weight column */ 67 | int weight_idx_; 68 | /*! \brief index of group column */ 69 | int group_idx_; 70 | /*! \brief Indices of ignored features */ 71 | std::unordered_set<int> ignore_features_; 72 | /*! \brief store feature names */ 73 | std::vector<std::string> feature_names_; 74 | /*! \brief Indices of categorical features */ 75 | std::unordered_set<int> categorical_features_; 76 | }; 77 | 78 | } // namespace LightGBM 79 | 80 | #endif // LIGHTGBM_DATASET_LOADER_H_ 81 | -------------------------------------------------------------------------------- /R-package/man/lgb.prepare_rules2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lgb.prepare_rules2.R 3 | \name{lgb.prepare_rules2} 4 | \alias{lgb.prepare_rules2} 5 | \title{Data preparator for LightGBM datasets with rules (integer)} 6 | \usage{ 7 | lgb.prepare_rules2(data, rules = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data.frame or data.table to prepare.} 11 | 12 | \item{rules}{A set of rules from the data preparator, if already used.} 13 | } 14 | \value{ 15 | A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset. 16 | } 17 | \description{ 18 | Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that some programs do not support integer-only input. Consider this a half-memory technique which is dangerous, especially for LightGBM. 
Consider this as a half memory technique which is dangerous, especially for LightGBM. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(lightgbm) 23 | data(iris) 24 | 25 | str(iris) 26 | # 'data.frame': 150 obs. of 5 variables: 27 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 28 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 29 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 30 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 31 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... 32 | 33 | new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter 34 | str(new_iris$data) 35 | # 'data.frame': 150 obs. of 5 variables: 36 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 37 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 38 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 39 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 40 | # $ Species : int 1 1 1 1 1 1 1 1 1 1 ... 41 | 42 | data(iris) # Erase iris dataset 43 | iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA) 44 | # Warning message: 45 | # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, : 46 | # invalid factor level, NA generated 47 | 48 | # Use conversion using known rules 49 | # Unknown factors become 0, excellent for sparse datasets 50 | newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules) 51 | 52 | # Unknown factor is now zero, perfect for sparse datasets 53 | newer_iris$data[1, ] # Species became 0 as it is an unknown factor 54 | # Sepal.Length Sepal.Width Petal.Length Petal.Width Species 55 | # 1 5.1 3.5 1.4 0.2 0 56 | 57 | newer_iris$data[1, 5] <- 1 # Put back real initial value 58 | 59 | # Is the newly created dataset equal? YES! 60 | all.equal(new_iris$data, newer_iris$data) 61 | # [1] TRUE 62 | 63 | # Can we test our own rules? 64 | data(iris) # Erase iris dataset 65 | 66 | # We remapped values differently 67 | personal_rules <- list(Species = c("setosa" = 3L, 68 | "versicolor" = 2L, 69 | "virginica" = 1L)) 70 | newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules) 71 | str(newest_iris$data) # SUCCESS! 72 | # 'data.frame': 150 obs. of 5 variables: 73 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 74 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 75 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 76 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 77 | # $ Species : int 3 3 3 3 3 3 3 3 3 3 ... 78 | 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /R-package/demo/categorical_features_prepare.R: -------------------------------------------------------------------------------- 1 | # Here we are going to try training a model with categorical features 2 | 3 | # Load libraries 4 | library(data.table) 5 | library(lightgbm) 6 | 7 | # Load data and look at the structure 8 | # 9 | # Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables: 10 | # $ age : int 30 33 35 30 59 35 36 39 41 43 ... 11 | # $ job : chr "unemployed" "services" "management" "management" ... 12 | # $ marital : chr "married" "married" "single" "married" ... 13 | # $ education: chr "primary" "secondary" "tertiary" "tertiary" ... 14 | # $ default : chr "no" "no" "no" "no" ... 15 | # $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ... 16 | # $ housing : chr "no" "yes" "yes" "yes" ... 
17 | # $ loan : chr "no" "yes" "no" "yes" ... 18 | # $ contact : chr "cellular" "cellular" "cellular" "unknown" ... 19 | # $ day : int 19 11 16 3 5 23 14 6 14 17 ... 20 | # $ month : chr "oct" "may" "apr" "jun" ... 21 | # $ duration : int 79 220 185 199 226 141 341 151 57 313 ... 22 | # $ campaign : int 1 1 1 4 1 2 1 2 2 1 ... 23 | # $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ... 24 | # $ previous : int 0 4 1 0 0 3 2 0 0 2 ... 25 | # $ poutcome : chr "unknown" "failure" "failure" "unknown" ... 26 | # $ y : chr "no" "no" "no" "no" ... 27 | data(bank, package = "lightgbm") 28 | str(bank) 29 | 30 | # We must now transform the data to fit in LightGBM 31 | # For this task, we use lgb.prepare 32 | # The function transforms the data into a fittable format 33 | # 34 | # Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables: 35 | # $ age : int 30 33 35 30 59 35 36 39 41 43 ... 36 | # $ job : chr "unemployed" "services" "management" "management" ... 37 | # $ marital : chr "married" "married" "single" "married" ... 38 | # $ education: chr "primary" "secondary" "tertiary" "tertiary" ... 39 | # $ default : chr "no" "no" "no" "no" ... 40 | # $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ... 41 | # $ housing : chr "no" "yes" "yes" "yes" ... 42 | # $ loan : chr "no" "yes" "no" "yes" ... 43 | # $ contact : chr "cellular" "cellular" "cellular" "unknown" ... 44 | # $ day : int 19 11 16 3 5 23 14 6 14 17 ... 45 | # $ month : chr "oct" "may" "apr" "jun" ... 46 | # $ duration : int 79 220 185 199 226 141 341 151 57 313 ... 47 | # $ campaign : int 1 1 1 4 1 2 1 2 2 1 ... 48 | # $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ... 49 | # $ previous : int 0 4 1 0 0 3 2 0 0 2 ... 50 | # $ poutcome : chr "unknown" "failure" "failure" "unknown" ... 51 | # $ y : chr "no" "no" "no" "no" ... 52 | bank <- lgb.prepare(data = bank) 53 | str(bank) 54 | 55 | # Subtract 1 from the label because it must be between 0 and 1 56 | bank$y <- bank$y - 1 57 | 58 | # Data input to LightGBM must be a matrix, without the label 59 | my_data <- as.matrix(bank[, 1:16, with = FALSE]) 60 | 61 | # Creating the LightGBM dataset with categorical features 62 | # The categorical features must be indexed like in R (1-indexed, not 0-indexed) 63 | lgb_data <- lgb.Dataset(data = my_data, 64 | label = bank$y, 65 | categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)) 66 | 67 | # We can now train a model 68 | model <- lgb.train(list(objective = "binary", 69 | metric = "l2", 70 | min_data = 1, 71 | learning_rate = 0.1, 72 | 73 | min_hessian = 1, 74 | max_depth = 2), 75 | lgb_data, 76 | 100, 77 | valids = list(train = lgb_data)) 78 | 79 | # Try to find split_feature: 2 80 | # If you find it, it means it used a categorical feature in the first tree 81 | lgb.dump(model, num_iteration = 1) 82 | --------------------------------------------------------------------------------