├── tests
│   ├── __init__.py
│   ├── test_main.cpp
│   ├── test_math.cpp
│   ├── math_test.py
│   ├── cpp_test.py
│   └── python_package_test
│       ├── test_basic.py
│       └── test_sklearn.py
├── .pytest_cache
│   └── v
│       └── cache
│           ├── nodeids
│           └── lastfailed
├── vs
│   └── LiteMORT
│       ├── SA_salp.cpp
│       ├── stdafx.h
│       ├── dllmain.cpp
│       ├── stdafx.cpp
│       ├── targetver.h
│       ├── LiteMORT.cpp
│       ├── ReadMe.txt
│       ├── LiteMORT.sln
│       └── LiteMORT.vcxproj.filters
├── MANIFEST.in
├── src
│   ├── data_fold
│   │   ├── Cluster.hpp
│   │   ├── EDA.cpp
│   │   ├── EDA.hpp
│   │   ├── Loss.hpp
│   │   ├── Move.hpp
│   │   ├── Binfold.cpp
│   │   ├── Binfold.hpp
│   │   ├── DataFold.cpp
│   │   ├── DataFold.hpp
│   │   ├── FeatVector.cpp
│   │   ├── FeatVector.hpp
│   │   ├── Histogram.cpp
│   │   ├── Histogram.hpp
│   │   ├── Distribution.hpp
│   │   ├── FeatVec_EXP.hpp
│   │   ├── FeatVec_Quanti.hpp
│   │   ├── CMakeLists.txt
│   │   ├── Imputer.hpp
│   │   ├── Loss_binary.hpp
│   │   ├── FeatVec_2D.hpp
│   │   └── Representive.hpp
│   ├── LiteMORT.cpp
│   ├── learn
│   │   ├── LOSS.cpp
│   │   ├── DCRIMI_.hpp
│   │   ├── Pruning.cpp
│   │   ├── CMakeLists.txt
│   │   ├── Regression.hpp
│   │   ├── LMachine.hpp
│   │   ├── Pruning.hpp
│   │   └── DCRIMI_.cpp
│   ├── tree
│   │   ├── GBRT.cpp
│   │   ├── BiSplit.cpp
│   │   ├── BiSplit.hpp
│   │   ├── GST_fno.hpp
│   │   ├── old
│   │   │   ├── GiFace.cpp
│   │   │   ├── RF_ConfiRegress.h
│   │   │   ├── RF_ConfiRegress.cpp
│   │   │   ├── RF_ShapeRegress.cpp
│   │   │   └── RF_ShapeRegress.h
│   │   ├── ManifoldTree.cpp
│   │   ├── ManifoldTree.hpp
│   │   ├── BoostingForest.cpp
│   │   ├── BoostingForest.hpp
│   │   ├── CMakeLists.txt
│   │   └── GBRT.hpp
│   ├── util
│   │   ├── GST_def.h
│   │   ├── BLAS_t.cpp
│   │   ├── BLAS_t.hpp
│   │   ├── FeatData.cpp
│   │   ├── FeatData.hpp
│   │   ├── GRander.hpp
│   │   ├── Object.hpp
│   │   ├── samp_set.hpp
│   │   ├── CMakeLists.txt
│   │   ├── Parallel_t.hpp
│   │   ├── Statistics_t.hpp
│   │   ├── GRander.cpp
│   │   ├── pcg_oneil
│   │   │   ├── pcg_basic.h
│   │   │   ├── xoshiro256starstar.c
│   │   │   ├── xoshiro256plusplus.c
│   │   │   └── pcg_basic.c
│   │   ├── PY_obj.hpp
│   │   ├── Float16.hpp
│   │   └── FastExpLog.c
│   ├── EDA
│   │   ├── SA_salp.cpp
│   │   ├── SA_salp.hpp
│   │   ├── Feat_Selection.cpp
│   │   ├── Feat_Selection.hpp
│   │   └── CMakeLists.txt
│   ├── python
│   │   ├── pyMORT_DLL.cpp
│   │   └── pyMORT_DLL.h
│   ├── include
│   │   └── LiteBOM_config.h
│   └── __version__.py
├── python-package
│   ├── .pytest_cache
│   │   └── v
│   │       └── cache
│   │           └── stepwise
│   ├── LiteMORT
│   │   ├── VERSION.txt
│   │   ├── LiteMORT_regression.py
│   │   ├── __version__.py
│   │   ├── __init__.py
│   │   ├── LiteMORT_time.py
│   │   ├── libpath.py
│   │   ├── compat.py
│   │   ├── LiteMORT_problems.py
│   │   ├── LiteMORT_hyppo.py
│   │   ├── LiteMORT_ERA.py
│   │   └── LiteMORT_EDA.py
│   ├── MANIFEST.in
│   ├── lgbm_importances.png
│   ├── README.md
│   ├── LICENSE
│   ├── mort_local.py
│   ├── setup.py
│   ├── case_poct.py
│   ├── case_higgs.py
│   ├── case_future_sales.py
│   ├── case_earthquake.py
│   ├── lgb_kim.py
│   ├── case_ieee_fraud.py
│   └── pycharm_test.py
├── doc
│   └── 基于二阶泛函优化的梯度提升算法.pptx
├── .idea
│   ├── libraries
│   │   └── R_User_Library.xml
│   ├── vcs.xml
│   ├── other.xml
│   ├── encodings.xml
│   ├── modules.xml
│   ├── misc.xml
│   └── LiteMORT.iml
├── case_future_sales.py
├── LICENSE
├── CMakeLists.txt
├── README.md
└── .gitignore
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.pytest_cache/v/cache/nodeids:
--------------------------------------------------------------------------------
1 | []
--------------------------------------------------------------------------------
/vs/LiteMORT/SA_salp.cpp:
--------------------------------------------------------------------------------
1 | class
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE
2 |
--------------------------------------------------------------------------------
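[editor's note] A brief aside on the MANIFEST.in entry above: in a setuptools project, MANIFEST.in names the extra files to bundle into a source distribution, so `include README.md LICENSE` ships both files alongside the code. A minimal sketch of checking that effect, assuming a conventional setup.py sitting next to MANIFEST.in (as in python-package/ further below):

import glob
import subprocess
import tarfile

# Build a source distribution; setuptools reads MANIFEST.in during this step.
subprocess.check_call(["python", "setup.py", "sdist"])

# Open the newest archive and confirm the extra files made it in.
archive = sorted(glob.glob("dist/*.tar.gz"))[-1]
with tarfile.open(archive) as tf:
    extras = [n for n in tf.getnames() if n.endswith(("README.md", "LICENSE"))]
print(extras)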
/src/data_fold/Cluster.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | -------------------------------------------------------------------------------- /python-package/.pytest_cache/v/cache/stepwise: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /python-package/LiteMORT/VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.1.0 2 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_regression.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /python-package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /tests/test_main.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include <catch.hpp> -------------------------------------------------------------------------------- /src/LiteMORT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/LiteMORT.cpp -------------------------------------------------------------------------------- /src/learn/LOSS.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/learn/LOSS.cpp -------------------------------------------------------------------------------- /src/tree/GBRT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/GBRT.cpp -------------------------------------------------------------------------------- /src/util/GST_def.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/GST_def.h -------------------------------------------------------------------------------- /src/EDA/SA_salp.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/SA_salp.cpp -------------------------------------------------------------------------------- /src/EDA/SA_salp.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/SA_salp.hpp -------------------------------------------------------------------------------- /src/data_fold/EDA.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/EDA.cpp -------------------------------------------------------------------------------- /src/data_fold/EDA.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/EDA.hpp -------------------------------------------------------------------------------- /src/learn/DCRIMI_.hpp:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/learn/DCRIMI_.hpp -------------------------------------------------------------------------------- /src/learn/Pruning.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/learn/Pruning.cpp -------------------------------------------------------------------------------- /src/tree/BiSplit.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BiSplit.cpp -------------------------------------------------------------------------------- /src/tree/BiSplit.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BiSplit.hpp -------------------------------------------------------------------------------- /src/tree/GST_fno.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/GST_fno.hpp -------------------------------------------------------------------------------- /src/util/BLAS_t.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/BLAS_t.cpp -------------------------------------------------------------------------------- /src/util/BLAS_t.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/BLAS_t.hpp -------------------------------------------------------------------------------- /src/util/FeatData.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/FeatData.cpp -------------------------------------------------------------------------------- /src/util/FeatData.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/FeatData.hpp -------------------------------------------------------------------------------- /src/util/GRander.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/GRander.hpp -------------------------------------------------------------------------------- /src/util/Object.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/Object.hpp -------------------------------------------------------------------------------- /src/util/samp_set.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/samp_set.hpp -------------------------------------------------------------------------------- /vs/LiteMORT/stdafx.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/stdafx.h -------------------------------------------------------------------------------- /src/data_fold/Loss.hpp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Loss.hpp -------------------------------------------------------------------------------- /src/data_fold/Move.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Move.hpp -------------------------------------------------------------------------------- /src/tree/old/GiFace.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/GiFace.cpp -------------------------------------------------------------------------------- /vs/LiteMORT/dllmain.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/dllmain.cpp -------------------------------------------------------------------------------- /vs/LiteMORT/stdafx.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/stdafx.cpp -------------------------------------------------------------------------------- /vs/LiteMORT/targetver.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/targetver.h -------------------------------------------------------------------------------- /doc/基于二阶泛函优化的梯度提升算法.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/doc/基于二阶泛函优化的梯度提升算法.pptx -------------------------------------------------------------------------------- /src/EDA/Feat_Selection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/Feat_Selection.cpp -------------------------------------------------------------------------------- /src/EDA/Feat_Selection.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/Feat_Selection.hpp -------------------------------------------------------------------------------- /src/data_fold/Binfold.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Binfold.cpp -------------------------------------------------------------------------------- /src/data_fold/Binfold.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Binfold.hpp -------------------------------------------------------------------------------- /src/data_fold/DataFold.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/DataFold.cpp -------------------------------------------------------------------------------- /src/data_fold/DataFold.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/DataFold.hpp -------------------------------------------------------------------------------- /src/python/pyMORT_DLL.cpp: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/python/pyMORT_DLL.cpp -------------------------------------------------------------------------------- /src/tree/ManifoldTree.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/ManifoldTree.cpp -------------------------------------------------------------------------------- /src/tree/ManifoldTree.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/ManifoldTree.hpp -------------------------------------------------------------------------------- /vs/LiteMORT/LiteMORT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/LiteMORT.cpp -------------------------------------------------------------------------------- /src/data_fold/FeatVector.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVector.cpp -------------------------------------------------------------------------------- /src/data_fold/FeatVector.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVector.hpp -------------------------------------------------------------------------------- /src/data_fold/Histogram.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Histogram.cpp -------------------------------------------------------------------------------- /src/data_fold/Histogram.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Histogram.hpp -------------------------------------------------------------------------------- /src/include/LiteBOM_config.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/include/LiteBOM_config.h -------------------------------------------------------------------------------- /src/tree/BoostingForest.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BoostingForest.cpp -------------------------------------------------------------------------------- /src/tree/BoostingForest.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BoostingForest.hpp -------------------------------------------------------------------------------- /src/data_fold/Distribution.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Distribution.hpp -------------------------------------------------------------------------------- /src/data_fold/FeatVec_EXP.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVec_EXP.hpp 
-------------------------------------------------------------------------------- /src/tree/old/RF_ConfiRegress.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/RF_ConfiRegress.h -------------------------------------------------------------------------------- /python-package/LiteMORT/__version__.py: -------------------------------------------------------------------------------- 1 | 2 | VERSION = (0, 1, 18) 3 | 4 | __version__ = '.'.join(map(str, VERSION)) 5 | -------------------------------------------------------------------------------- /src/data_fold/FeatVec_Quanti.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVec_Quanti.hpp -------------------------------------------------------------------------------- /src/tree/old/RF_ConfiRegress.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/RF_ConfiRegress.cpp -------------------------------------------------------------------------------- /src/tree/old/RF_ShapeRegress.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/RF_ShapeRegress.cpp -------------------------------------------------------------------------------- /python-package/lgbm_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/python-package/lgbm_importances.png -------------------------------------------------------------------------------- /.idea/libraries/R_User_Library.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /tests/test_math.cpp: -------------------------------------------------------------------------------- 1 | #include <catch.hpp> 2 | 3 | #include "math.hpp" 4 | 5 | TEST_CASE("Addition and subtraction") 6 | { 7 | REQUIRE(add(1, 1) == 2); 8 | REQUIRE(subtract(1, 1) == 0); 9 | } -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(util_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(util_ 9 | ./GRander.cpp 10 | ) -------------------------------------------------------------------------------- /python-package/README.md: -------------------------------------------------------------------------------- 1 | A fast gradient
boosting framework on manifolds (built from regression trees, classification trees, neural nets, ...). 2 | 3 | ##### 1) Faster than LightGBM with the same accuracy. 4 | 5 | ##### 2) sklearn-like API. 6 | 7 | ##### 3) Supports LightGBM's parameters. -------------------------------------------------------------------------------- /src/EDA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(eda_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(eda_ 9 | ./Feat_Selection.cpp 10 | ./SA_salp.cpp 11 | ) -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/learn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(learn_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(learn_ 9 | ./LOSS.cpp 10 | ./DCRIMI_.cpp 11 | ./Pruning.cpp 12 | ) -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /src/tree/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(tree_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(tree_ 9 | ./ManifoldTree.cpp 10 | ./GBRT.cpp 11 | ./BoostingForest.cpp 12 | ./BiSplit.cpp 13 | ) -------------------------------------------------------------------------------- /src/data_fold/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(data_fold_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(data_fold_ 9 | ./Histogram.cpp 10 | ./EDA.cpp 11 | ./DataFold.cpp 12 | ./FeatVector.cpp 13 | ) -------------------------------------------------------------------------------- /tests/math_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import python_cpp_example 3 | 4 | class MainTest(unittest.TestCase): 5 | def test_add(self): 6 | self.assertEqual(python_cpp_example.add(1, 1), 2) 7 | 8 | def test_subtract(self): 9 | self.assertEqual(python_cpp_example.subtract(1, 1), 0) 10 | 11 | if __name__ == '__main__': 12 | unittest.main() -------------------------------------------------------------------------------- /src/__version__.py: -------------------------------------------------------------------------------- 1 | # 8b d8 Yb dP 88""Yb db dP""b8 88 dP db dP""b8 888888 2 | # 88b d88 YbdP 88__dP dPYb dP `" 88odP dPYb dP `" 88__ 3 | # 88YbdP88 8P 88""" dP__Yb Yb 88"Yb dP__Yb Yb "88 88"" 4 | # 88 YY 88 dP 88 dP""""Yb YboodP 88 Yb dP""""Yb YboodP 888888 5 | 6 | VERSION = (5, 2, 0) 7 | 8 | __version__ = '.'.join(map(str, VERSION))
9 | -------------------------------------------------------------------------------- /.pytest_cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | { 2 | "tests/python_package_test/test_basic.py::TestBasic::test": true, 3 | "tests/python_package_test/test_sklearn.py": true, 4 | "tests/python_package_test/test_sklearn.py::TestSklearn": true, 5 | "tests/python_package_test/test_sklearn.py::TestSklearn::test_binary": true, 6 | "tests/python_package_test/test_sklearn.py::TestSklearn::test_pandas_categorical": true 7 | } -------------------------------------------------------------------------------- /case_future_sales.py: -------------------------------------------------------------------------------- 1 | #https://www.kaggle.com/hukuda222/nfl-simple-model-using-lightgbm 2 | 3 | import os 4 | import pandas as pd 5 | from kaggle.competitions import nflrush 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn import preprocessing 9 | import matplotlib.pyplot as plt 10 | import random 11 | from sklearn.model_selection import KFold 12 | import lightgbm as lgb 13 | import gc 14 | import pickle 15 | import tqdm -------------------------------------------------------------------------------- /tests/cpp_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import os 4 | 5 | 6 | class MainTest(unittest.TestCase): 7 | def test_cpp(self): 8 | print("\n\nTesting C++ code...") 9 | subprocess.check_call(os.path.join(os.path.dirname( 10 | os.path.relpath(__file__)), 'bin', 'python_cpp_example_test')) 11 | print("\nResuming Python tests...\n") 12 | 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /src/data_fold/Imputer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "../util/GST_def.h" 7 | #include "../include/LiteBOM_config.h" 8 | 9 | using namespace std; 10 | 11 | namespace Grusoft { 12 | template <typename Tx> 13 | void Imputer_Fill_(LiteBOM_Config&config, size_t nSamp_, Tx *vec,double fill,int flag=0x0 ) { 14 | for (size_t i = 0; i -------------------------------------------------------------------------------- /src/util/Parallel_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <omp.h> 3 | 4 | #define OMP_FOR_func(lambda_func) \ 5 | for (int thread = 0; thread < num_threads; thread++) { \ 6 | size_t start = thread*step, end = MIN2(start + step, dim), i; \ 7 | for (i = start; i < end; i++) { {lambda_func;} } \ 8 | } 9 | 10 | namespace Grusoft{ 11 | inline int OMP_FOR_STATIC_1(const size_t nSamp, size_t& step,int min_size=64, int flag = 0x0) { 12 | int num_threads = 1; 13 | step = nSamp; 14 | if (nSamp > min_size) { 15 | #pragma omp parallel 16 | #pragma omp master 17 | { num_threads = omp_get_num_threads(); } 18 | step = (nSamp + num_threads - 1) / num_threads; 19 | } 20 | return num_threads; 21 | } 22 | } -------------------------------------------------------------------------------- /src/util/Statistics_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <chrono> 3 | #include 4 | 5 | #ifdef WIN32 6 | /*#include 7 | #define GST_NOW( ) (clock( )) 8 | #define GST_TIC(tick) clock_t tick=clock( ); 9 | #define GST_TOC(tick) ((clock()-(tick))*1.0f/CLOCKS_PER_SEC)*/ 10 | typedef std::chrono::high_resolution_clock Clock; 11 | #define GST_NOW( ) (Clock::now( )) 12 | #define GST_TIC(tick) auto tick = Clock::now( ); 13 | #define GST_TOC(tick) ( (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now( )-(tick)).count( ))/1000000.0)
14 | 15 | #else 16 | typedef std::chrono::high_resolution_clock Clock; 17 | #define GST_NOW( ) (Clock::now( )) 18 | #define GST_TIC(tick) auto tick = Clock::now( ); 19 | #define GST_TOC(tick) ( (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now( )-(tick)).count( ))/1000000.0) 20 | #endif -------------------------------------------------------------------------------- /src/data_fold/FeatVec_2D.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "./DataFold.hpp" 4 | 5 | namespace Grusoft { 6 | class HistoGRAM_2D : public HistoGRAM { 7 | protected: 8 | HistoGRAM *histoX = nullptr, *histoY = nullptr; 9 | public: 10 | HistoGRAM_2D(FeatVector*hFeat_, size_t nMost, int flag = 0x0) : HistoGRAM(hFeat_,nMost, flag) { 11 | } 12 | virtual ~HistoGRAM_2D() { 13 | ; 14 | } 15 | 16 | virtual void GreedySplit_X(const FeatsOnFold *hData_, const SAMP_SET& samp_set, int flag = 0x0); 17 | }; 18 | 19 | class FeatVec_2D : public FeatVec_T { 20 | protected: 21 | FeatVec_T *featX = nullptr, *featY = nullptr; 22 | public: 23 | FeatVec_2D(FeatsOnFold *hData_, int id_, const FeatVec_T *fX, const FeatVec_T *fY, size_t nMostDup, int flag = 0x0); 24 | virtual ~FeatVec_2D() { 25 | } 26 | 27 | 28 | 29 | }; 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /python-package/LiteMORT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """LiteMORT, Light Gradient Boosting Machine. 3 | 4 | __author__ = 'Yingshi Chen' 5 | """ 6 | from __future__ import absolute_import 7 | import os 8 | 9 | #from .LiteMORT_problems import Mort_Problems 10 | from .__version__ import __version__ 11 | from .LiteMORT import LiteMORT,LiteMORT_profile 12 | from .LiteMORT_preprocess import Mort_Preprocess,Mort_PickSamples 13 | from .LiteMORT_hyppo import MORT_feat_select_ 14 | ''' 15 | try: 16 | except ImportError: 17 | pass 18 | ''' 19 | 20 | ''' 21 | try: 22 | from .plotting import plot_importance, plot_metric, plot_tree, create_tree_digraph 23 | except ImportError: 24 | pass 25 | ''' 26 | 27 | dir_path = os.path.dirname(os.path.realpath(__file__)) 28 | #print(f"__init_ dir_path={dir_path}") 29 | 30 | __all__ = ['LiteMORT','LiteMORT_profile','Mort_Preprocess','Mort_PickSamples','MORT_feat_select_'] 31 | 32 | 33 | -------------------------------------------------------------------------------- /.idea/LiteMORT.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_time.py: -------------------------------------------------------------------------------- 1 | 2 | import sklearn.base as skbase 3 | import numpy as np 4 | 5 | #https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113784#latest-656376 6 | class DatetimeConvertCyclical(skbase.BaseEstimator, skbase.TransformerMixin): 7 | def __init__(self): 8 | self.time_periods = {'second': 24 * 60 * 60, 9 | 'minute': 24 * 60, 10 | 'hour': 24, 11 | 'day': 30, 12 | 'dayofweek': 7, 13 | 'month': 12} 14 | 15 | def fit(self, X, y=None): 16 | return self 17 | 18 | def transform(self, X): 19 | for period, value in self.time_periods.items(): 20 | X[period] = getattr(X['timestamp'].dt, period) 21 | 22 | X['sin_' + period] = np.sin(2 * np.pi * X[period] / value)
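# [editor's note] The sin/cos pair here is the standard cyclical encoding: a periodic value t with
# period P is mapped onto the unit circle as (sin(2*pi*t/P), cos(2*pi*t/P)), so the two ends of a
# cycle stay adjacent. For example, hour=23 with P=24 encodes to roughly (-0.26, 0.97), right next
# to hour=0 at (0.0, 1.0), whereas the raw linear encoding would place them 23 units apart.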
23 | X['cos_' + period] = np.cos(2 * np.pi * X[period] / value) 24 | 25 | X.drop(str(period), axis=1, inplace=True) 26 | 27 | return X -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /python-package/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /src/util/GRander.cpp: -------------------------------------------------------------------------------- 1 | #include "samp_set.hpp" 2 | #include "GRander.hpp" 3 | using namespace Grusoft; 4 | 5 | extern "C" uint64_t xoroshiro_next(void); 6 | 7 | uint64_t GRander::RandRersResrResdra() { // Combined period = 2^116.23 8 | int alg = 2; 9 | switch (alg) { 10 | case 0: 11 | break; //return pcg32_random_r(&rng_neil); //32-bit unsigned int - period: 2^64 12 | case 1: 13 | return xoroshiro_next(); 14 | default: 15 | xx = rotl(xx, 8) - rotl(xx, 29); //RERS, period = 4758085248529 (prime) 16 | yy = rotl(yy, 21) - yy; yy = rotl(yy, 20); //RESR, period = 3841428396121 (prime) 17 | zz = rotl(zz, 42) - zz; zz = zz + rotl(zz, 14); //RESDRA, period = 5345004409 (prime) 18 | return xx ^ yy ^ zz; 19 | } 20 | return 0; 21 | } 22 | 23 | /* 24 | DIST_RangeN::DIST_RangeN(int seed, double a0, double a1) : 25 | GRander(seed), rMin(a0), rMax(a1) { 26 | std::normal_distribution<> d1((rMax+rMin)/2,(rMax-rMin)/6); 27 | d=d1; 28 | } 29 | 30 | double DIST_RangeN::gen(){ 31 | double a; 32 | do{ 33 | a = d(g); 34 | } while (arMax); 35 | return (a); 36 | }*/ -------------------------------------------------------------------------------- /python-package/mort_local.py: -------------------------------------------------------------------------------- 1 | import litemort 2 | from litemort import * 3 | print(litemort.__version__) 4 | 5 | early_stop = 20 6 | verbose_eval = 5 7 | metric = 'l2' 8 | #num_rounds=1000, lr=0.05, bf=0.3 9 | num_rounds = 1000; lr = 0.05; bf = 0.3 10 | params = {'num_leaves': 31, 'n_estimators': num_rounds, 11 | 'objective': 'regression', 12 | 'max_bin': 256, 13 | # 'max_depth': -1, 14 | 'learning_rate': lr, 15 | "boosting": "gbdt", 16 | "bagging_freq": 5, 17 | "bagging_fraction": bf, 18 | "feature_fraction": 0.9, # STRANGE GBDT why("bagging_freq": 5 "feature_fraction": 0.9)!!! 
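# [editor's note] A brief gloss on the LightGBM-style knobs above, for readers tuning this dict:
# "bagging_freq": 5 together with "bagging_fraction": 0.3 makes the booster re-sample 30% of the
# training rows every 5 iterations, while "feature_fraction": 0.9 hands each tree a random 90% of
# the columns. The two mechanisms are independent, which is why they are usually tuned together.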
19 | "metric": metric, "verbose_eval": verbose_eval, 'n_jobs': 8, "elitism": 0,"debug":'1', 20 | "early_stopping_rounds": early_stop, "adaptive": 'weight1', 'verbose': 0, 'min_data_in_leaf': 20, 21 | # "verbosity": -1, 22 | # 'reg_alpha': 0.1, 23 | # 'reg_lambda': 0.3 24 | } 25 | mort=LiteMORT(params) -------------------------------------------------------------------------------- /vs/LiteMORT/ReadMe.txt: -------------------------------------------------------------------------------- 1 | ======================================================================== 2 | 动态链接库:LiteMORT 项目概述 3 | ======================================================================== 4 | 5 | 应用程序向导已为您创建了此 LiteMORT DLL。 6 | 7 | 本文件概要介绍组成 LiteMORT 应用程序的每个文件的内容。 8 | 9 | 10 | LiteMORT.vcxproj 11 | 这是使用应用程序向导生成的 VC++ 项目的主项目文件,其中包含生成该文件的 Visual C++ 的版本信息,以及有关使用应用程序向导选择的平台、配置和项目功能的信息。 12 | 13 | LiteMORT.vcxproj.filters 14 | 这是使用“应用程序向导”生成的 VC++ 项目筛选器文件。它包含有关项目文件与筛选器之间的关联信息。在 IDE 中,通过这种关联,在特定节点下以分组形式显示具有相似扩展名的文件。例如,“.cpp”文件与“源文件”筛选器关联。 15 | 16 | LiteMORT.cpp 17 | 这是主 DLL 源文件。 18 | 19 | 此 DLL 在创建时不导出任何符号。因此,生成时不会产生 .lib 文件。如果希望此项目成为其他某个项目的项目依赖项,则需要添加代码以从 DLL 导出某些符号,以便产生一个导出库,或者,也可以在项目“属性页”对话框中的“链接器”文件夹中,将“常规”属性页上的“忽略输入库”属性设置为“是”。 20 | 21 | ///////////////////////////////////////////////////////////////////////////// 22 | 其他标准文件: 23 | 24 | StdAfx.h, StdAfx.cpp 25 | 这些文件用于生成名为 LiteMORT.pch 的预编译头 (PCH) 文件和名为 StdAfx.obj 的预编译类型文件。 26 | 27 | ///////////////////////////////////////////////////////////////////////////// 28 | 其他注释: 29 | 30 | 应用程序向导使用“TODO:”注释来指示应添加或自定义的源代码部分。 31 | 32 | ///////////////////////////////////////////////////////////////////////////// 33 | -------------------------------------------------------------------------------- /src/data_fold/Representive.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | using namespace std; 11 | #include "./FeatVector.hpp" 12 | 13 | #ifdef WIN32 14 | #include 15 | #include 16 | #else 17 | #include 18 | //#define assert(cond) 19 | #endif 20 | 21 | 22 | namespace Grusoft { 23 | class MT_BiSplit; 24 | 25 | class FeatsOnFold; 26 | class Distribution; 27 | 28 | struct FeatPresent { 29 | FeatVector *hFeat = nullptr; 30 | float T_min = 5; 31 | FeatPresent(FeatVector *hF, float T_,int flag=0x0) : hFeat(hF),T_min(T_) { 32 | 33 | } 34 | 35 | }; 36 | 37 | class Representive { 38 | vector arrPFeat; 39 | public: 40 | Representive() { 41 | 42 | } 43 | virtual ~Representive() { 44 | for (auto pf : arrPFeat) 45 | delete pf; 46 | arrPFeat.clear(); 47 | } 48 | void Append(FeatVector *hF, float T_, int flag = 0x0) { 49 | arrPFeat.push_back(new FeatPresent(hF, T_)); 50 | } 51 | bool isValid(const MT_BiSplit *hNode,int flag=0x0); 52 | void dump(int flag=0x0); 53 | }; 54 | 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/learn/Regression.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace Grusoft { 4 | class Regression { 5 | double _slope, _yInt; 6 | public: 7 | Regression(string alg,int flag=0x0) { 8 | 9 | } 10 | 11 | /* 12 | https://web.archive.org/web/20150715022401/http://faculty.cs.niu.edu/~hutchins/csci230/best-fit.htm 13 | Y = Slope * X + YInt 14 | */ 15 | template 16 | bool Fit(size_t nSamp, tpSAMP_ID *samps,Tx *arrX, Ty *arrY, int flag = 0x0) { 17 | assert (nSamp >= 2) ; 18 | double sumX = 0, sumY = 0, sumXY = 0, 
sumX2 = 0; 19 | Tx x,y; 20 | tpSAMP_ID samp; 21 | for (int i = 0; i 41 | Tx At(Tx x, int flag = 0x0) { 42 | double y = _slope*x+ _yInt; 43 | return (Tx)(y); 44 | } 45 | }; 46 | 47 | 48 | } -------------------------------------------------------------------------------- /vs/LiteMORT/LiteMORT.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LiteMORT", "LiteMORT.vcxproj", "{668D61FD-5B48-4AFF-A9C8-3680CA9A0147}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x64.ActiveCfg = Debug|x64 17 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x64.Build.0 = Debug|x64 18 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x86.ActiveCfg = Debug|Win32 19 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x86.Build.0 = Debug|Win32 20 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x64.ActiveCfg = Release|x64 21 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x64.Build.0 = Release|x64 22 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x86.ActiveCfg = Release|Win32 23 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /python-package/LiteMORT/libpath.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Find the path to LiteMORT dynamic library files.""" 3 | import os 4 | 5 | from platform import system 6 | 7 | 8 | def find_lib_path(): 9 | """Find the path to LiteMORT library files. 
10 | Returns 11 | ------- 12 | lib_path: list(string) 13 | List of all found library path to LiteMORT 14 | """ 15 | curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 16 | dll_path = [curr_path, os.path.join(curr_path, '../../'), 17 | os.path.join(curr_path, 'compile'), 18 | os.path.join(curr_path, '../compile'), 19 | os.path.join(curr_path, '../../lib/')] 20 | if system() in ('Windows', 'Microsoft'): 21 | dll_path.append(os.path.join(curr_path, '../compile/Release/')) 22 | dll_path.append(os.path.join(curr_path, '../compile/windows/x64/DLL/')) 23 | dll_path.append(os.path.join(curr_path, '../../Release/')) 24 | dll_path.append(os.path.join(curr_path, '../../windows/x64/DLL/')) 25 | dll_path = [os.path.join(p, 'LiteMORT.dll') for p in dll_path] 26 | else: 27 | dll_path = [os.path.join(p, 'libLiteMORT.so') for p in dll_path] 28 | lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] 29 | if not lib_path: 30 | dll_path = [os.path.realpath(p) for p in dll_path] 31 | raise Exception('Cannot find LiteMORT library in following paths: ' + '\n'.join(dll_path)) 32 | 33 | return lib_path 34 | -------------------------------------------------------------------------------- /src/learn/LMachine.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | namespace Grusoft{ 10 | class LMachine { 11 | public: 12 | typedef std::mt19937* hRANDER; //(pseudo) random generator 13 | typedef enum{ 14 | CLASIFY,REGRESSION 15 | }MODEL; 16 | enum{ //constant 17 | SAMPL_OOB=100,SAMPL_InB, 18 | RAND_REINIT=9991, 19 | }; 20 | struct SKDU{ //Learn Schdule 21 | //each cascade contain nSteps.each step contain 1 or n trees 22 | int cascad,step,nStep,noT,nTree,noLeaf; 23 | bool isLastStep( ) { return step==nStep-1; } 24 | bool isLastTree( ) { return noT==nTree-1; } 25 | LMachine* hMachine; 26 | float rBase,rMax,rMin,gamma,lr; 27 | SKDU( ):cascad(0),step(0),nStep(0),nTree(0),noT(-1),noLeaf(-1){} 28 | }; 29 | 30 | 31 | struct CASE{ 32 | float label,predict; //for classification and 1-var regression 33 | int nBag; 34 | CASE( ):nBag(0),label(0.0),predict(0.0) { ; } 35 | virtual ~CASE( ) {;} 36 | }; 37 | typedef vector CASEs; 38 | CASEs SamplSet; 39 | 40 | protected: 41 | bool isDumpLeaf; 42 | hRANDER hRander; 43 | MODEL model; 44 | int nThread; 45 | SKDU skdu; 46 | void *user_data; 47 | 48 | double impurity,sBalance,eOOB,eInB; 49 | int nFeat,nClass,nPickWeak; 50 | vectorFeatNames; 51 | //vector arrDat; 52 | public: 53 | string name; 54 | 55 | hRANDER InitRander( unsigned seed ); 56 | virtual void Clear( ); 57 | 58 | }; 59 | } 60 | -------------------------------------------------------------------------------- /src/tree/GBRT.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "BoostingForest.hpp" 8 | #include "../data_fold/Representive.hpp" 9 | using namespace std; 10 | 11 | namespace Grusoft{ 12 | class FeatsOnFold; 13 | /* 14 | residual boosting mayber better 15 | */ 16 | class GBRT : public BoostingForest { 17 | string sPre; 18 | //random_state 19 | protected: 20 | double shrinkage=0.1; 21 | double nzWeak; 22 | bool isCalcErr; 23 | int nBlitThread; 24 | virtual bool GetFeatDistri(WeakLearner *hWeak, float *distri = nullptr, int flag = 0x0); 25 | //virtual bool LeafModel(WeakLearner *hWeak, int flag = 0x0); 26 | //virtual void 
UpdateFeat(int flag); 27 | //virtual void BlitSamps(WeakLearner *hWeak, SAMPs &fnL, SAMPs &fnR, int flag = 0x0); 28 | //virtual hBLIT GetBlit(WeakLearner *hWeak, int flag = 0x0); 29 | virtual void GetYDistri(WeakLearner *hWeak, float *distri = nullptr, int flag = 0x0); 30 | //virtual void Confi_Impuri(WeakLearner *hWeak, int flag); 31 | virtual void AfterTrain(FeatsOnFold *hData, int cas, int nMulti, int flag = 0x0); 32 | public: 33 | 34 | tpDOWN mOff, mSum; 35 | 36 | typedef enum { 37 | SINGLE_TREE, MULTI_TREE 38 | }REGULAR; 39 | REGULAR regular = SINGLE_TREE; 40 | arrPFNO Tests; 41 | //double eta, lenda; 42 | 43 | typedef enum { 44 | BT_ALL, BT_MAX_ERR, BT_MIN_ERR, BT_RANDOM_3 45 | }BOOT; 46 | BOOT boot; 47 | int rounds, dup, nOOB, no; 48 | 49 | GBRT(FeatsOnFold *hTrain, FeatsOnFold *hEval, double sOOB, MODEL mo_, int nTree, int flag = 0x0); 50 | virtual ~GBRT() { 51 | } 52 | const LiteBOM_Config& Config() const { 53 | return hTrainData->config; 54 | } 55 | virtual void BeforeTrain(FeatsOnFold *hData, int flag = 0x0); 56 | virtual int Train(string sTitle, int cas, int flag = 0x0); 57 | virtual int Prune(int flag = 0x0); 58 | virtual int IterTrain(int round,int flag); 59 | virtual double Predict(FeatsOnFold *hData,bool updateStopping=false,bool checkLossy=false,bool resumeLast=false, int flag=0x0); 60 | virtual int Test(string sTitle, BoostingForest::CASEs& TestSet, int nCls, int flag); 61 | virtual bool isPassNode(FeatsOnFold *hData_, hMTNode hNode, int flag = 0x0); 62 | 63 | 64 | }; 65 | } 66 | -------------------------------------------------------------------------------- /src/python/pyMORT_DLL.h: -------------------------------------------------------------------------------- 1 | #if (defined _WINDOWS) || (defined WIN32) 2 | #ifdef PYMORT_DLL_EXPORTS 3 | #define PYMORT_DLL_API __declspec(dllexport) 4 | #else 5 | #define PYMORT_DLL_API __declspec(dllimport) 6 | #endif 7 | #else 8 | #define PYMORT_DLL_API 9 | #endif 10 | 11 | #include "../util/PY_obj.hpp" 12 | 13 | #define __API_BEGIN__() try { 14 | 15 | #define __API_END__() } \ 16 | catch(std::exception& ex) { return (ex); } \ 17 | catch(std::string& ex) { return (ex); } \ 18 | catch(...) 
{ return ("unknown exception"); } \ 19 | return 0; 20 | 21 | 22 | struct PY_ITEM { 23 | char *Keys; 24 | float Values; 25 | char *text; 26 | void *arr; 27 | }; 28 | 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif 33 | 34 | PYMORT_DLL_API void* LiteMORT_init(PY_ITEM* params, int nParam, PY_DATASET_LIST *merge_list,int64_t flag); 35 | PYMORT_DLL_API void LiteMORT_clear(void*); 36 | 37 | PYMORT_DLL_API void LiteMORT_set_mergesets(void *, PY_DATASET_LIST *train, int64_t flag); 38 | 39 | //PYMORT_DLL_API void LiteMORT_set_feat(PY_ITEM* params, int nParam, int flag); 40 | PYMORT_DLL_API void LiteMORT_fit(void *,float *h_data, tpY *h_target, size_t nSamp, size_t ldS, float *eval_data, tpY *eval_target, size_t nEval, size_t flag); 41 | PYMORT_DLL_API void LiteMORT_predict(void *,float *X, tpY *y, size_t nFeat_0, size_t nSamp, size_t flag); 42 | PYMORT_DLL_API void LiteMORT_Imputer_f(float *X, tpY *y, size_t nFeat_0, size_t nSamp, size_t flag); 43 | PYMORT_DLL_API void LiteMORT_Imputer_d(double *X, tpY *y, size_t nFeat_0, size_t nSamp, size_t flag); 44 | //PYMORT_DLL_API void LiteMORT_EDA(void *, const float *X, const tpY *y, const size_t nFeat_0, const size_t nn, const size_t nValid, 45 | // PY_ITEM* params, int nParam, const size_t flag); 46 | 47 | //PYMORT_DLL_API void LiteMORT_fit_1(void *, PY_COLUMN *train, PY_COLUMN *target, size_t nSamp, size_t nFeat_0, PY_COLUMN *eval, PY_COLUMN *eval_target, size_t nEval, size_t flag); 48 | PYMORT_DLL_API void LiteMORT_fit_1(void *, PY_DATASET_LIST *train, PY_DATASET_LIST *eval, size_t flag); 49 | //PYMORT_DLL_API void LiteMORT_predict_1(void *, PY_COLUMN *X, PY_COLUMN *y, size_t nFeat_0,size_t nSamp, size_t flag); 50 | PYMORT_DLL_API void LiteMORT_predict_1(void *, PY_DATASET_LIST*predict, size_t flag); 51 | PYMORT_DLL_API void cpp_test(void *, PY_DATASET*dat); 52 | 53 | 54 | #ifdef __cplusplus 55 | } 56 | #endif 57 | -------------------------------------------------------------------------------- /src/learn/Pruning.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../util/samp_set.hpp" 8 | 9 | 10 | namespace Grusoft{ 11 | typedef double tpMetricU; 12 | class ManifoldTree; 13 | class FeatsOnFold; 14 | class EnsemblePruning{ 15 | double *orth=nullptr; 16 | double *gamma = nullptr; 17 | int num_orth=0,ldOrth=0; 18 | protected: 19 | static bool isDebug,isRand; 20 | FeatsOnFold *hFold = nullptr; 21 | BoostingForest *hBoost = nullptr; 22 | 23 | //|wx|=1 |wy|=1 24 | tpMetricU *mA = nullptr, *ax_=nullptr; 25 | tpMetricU *mB = nullptr, *wy=nullptr; //live section of (A,x) 26 | int ldA_; //row_major 27 | int nLive = 0, nLive_0 = 0; 28 | int *wasSmall=nullptr, *isLive = nullptr, *wasLive=nullptr,*y2x=nullptr; 29 | std::vector sorted_indices; 30 | int nSparsified() { 31 | int nPick, i; 32 | for (nPick = 0, i = 0; i < nWeak; i++) { 33 | if (wx[i] > 0) 34 | nPick++; 35 | } 36 | return nPick; 37 | } 38 | //is_live=abs_x < 1.0-delta; mB = mA[:,is_live]; wy = wx[is_live] 39 | int SubOnLive(double delta,bool update_orth,double *v_0,double *v_sub,int flag); 40 | 41 | void ToCSV(const string& sPath, int flag); 42 | void LoadCSV(const string& sPath, int flag); 43 | double UpateGamma(int *isLive, int nY,int flag = 0x0); 44 | bool partial_infty_color(int nX,bool balance, int flag = 0x0); 45 | void sorted_ax(int flag=0x0); 46 | void make_orthogonal(tpMetricU *b, int ldB, int &nRun, int nMost, int nLive_0, int *isSmall, int flag=0x0); 47 | void 
basic_local_search(double *,bool balanced = false, int flag = 0x0); 48 | void local_improvements(double *, bool balanced = false, int flag = 0x0); 49 | void greedy(double*,bool balanced = false, int flag = 0x0); 50 | void round_coloring(bool balanced = false, int flag=0x0); 51 | virtual void Prepare(int flag = 0x0); 52 | public: 53 | size_t nSamp = 0, nWeak = 0, nMostWeak = 0; 54 | int nPruneOperation = 0; 55 | tpMetricU *init_score = nullptr; 56 | std::vectorforest; 57 | double *plus_minus = nullptr; 58 | //combination coefficient 59 | tpMetricU *cc_0 = nullptr, *cc_1 = nullptr,cc_0_sum=0, *wx = nullptr; 60 | 61 | EnsemblePruning(BoostingForest *hBoost,FeatsOnFold *hFold, int nWeak_,int flag=0x0); 62 | virtual ~EnsemblePruning(); 63 | virtual bool isValid() { return true; } 64 | 65 | virtual void Reset4Pick(int flag); 66 | virtual bool Pick(int nWeak_,int isToCSV, int flag); 67 | virtual bool Compare( int flag); 68 | 69 | virtual void OnStep(ManifoldTree *hTree, tpDOWN*down, int flag = 0x0); 70 | }; 71 | 72 | }; 73 | 74 | 75 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | SET(PROJECT_NAME LiteMORT) 3 | PROJECT(${PROJECT_NAME} LANGUAGES CXX) 4 | message("Hello, It's ${PROJECT_NAME} by CYS!") 5 | cmake_minimum_required(VERSION 3.2 FATAL_ERROR) 6 | set (CMAKE_CXX_STANDARD 11) 7 | 8 | OPTION(USE_OPENMP "Enable OpenMP" ON) 9 | 10 | if(USE_OPENMP) 11 | find_package(OpenMP REQUIRED) 12 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 13 | else() 14 | # Ignore unknown #pragma warning 15 | if( (CMAKE_CXX_COMPILER_ID MATCHES "[cC][lL][aA][nN][gG]") 16 | OR (CMAKE_CXX_COMPILER_ID MATCHES "[gG][nN][uU]")) 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") 18 | endif() 19 | endif(USE_OPENMP) 20 | 21 | if(MSVC) 22 | if(MSVC_VERSION LESS 1900) 23 | message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. 
Please use a newer MSVC.") 24 | endif() 25 | 26 | SET(variables 27 | CMAKE_C_FLAGS_DEBUG 28 | CMAKE_C_FLAGS_MINSIZEREL 29 | CMAKE_C_FLAGS_RELEASE 30 | CMAKE_C_FLAGS_RELWITHDEBINFO 31 | CMAKE_CXX_FLAGS_DEBUG 32 | CMAKE_CXX_FLAGS_MINSIZEREL 33 | CMAKE_CXX_FLAGS_RELEASE 34 | CMAKE_CXX_FLAGS_RELWITHDEBINFO 35 | ) 36 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /O2 /Ob2 /Oi /Ot /Oy /GL") 37 | else() 38 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") 39 | endif() 40 | 41 | 42 | set (cLIB "../../lib/") 43 | #set (PYTHON_INC "../../lib/") 44 | SET(some_COMPILE_FLAGS "-static -std=c++11 -pthread -O3 -I${cLIB}" ) 45 | 46 | SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${some_COMPILE_FLAGS}") 47 | 48 | SET(SOURCE_DIR "src") 49 | # Tell cmake that headers are in alse in source_dir 50 | include_directories(${SOURCE_DIR}) 51 | SET(SOURCE_FILES ${SOURCE_DIR}/LiteMORT.cpp ${SOURCE_DIR}/python/pyMORT_DLL.cpp) 52 | add_subdirectory(${SOURCE_DIR}/data_fold) 53 | add_subdirectory(${SOURCE_DIR}/tree) 54 | add_subdirectory(${SOURCE_DIR}/util) 55 | add_subdirectory(${SOURCE_DIR}/learn) 56 | add_subdirectory(${SOURCE_DIR}/EDA) 57 | 58 | #add_executable("${PROJECT_NAME}" ${SOURCE_FILES}) 59 | ADD_LIBRARY("${PROJECT_NAME}" SHARED ${SOURCE_FILES}) 60 | target_link_libraries(${PROJECT_NAME} data_fold_ tree_ util_ learn_ eda_) 61 | # SET(TEST_DIR "tests") 62 | # SET(TESTS ${SOURCES} 63 | # "${TEST_DIR}/test_main.cpp" 64 | # "${TEST_DIR}/test_math.cpp") 65 | 66 | # Generate a test executable 67 | # include_directories(lib/catch/include) 68 | # add_executable("${PROJECT_NAME}_test" ${SOURCE_FILES}) 69 | 70 | 71 | # Generate python module 72 | # add_subdirectory(lib/pybind11) 73 | # pybind11_add_module(python_cpp_example ${SOURCES} "${SOURCE_DIR}/bindings.cpp") 74 | -------------------------------------------------------------------------------- /tests/python_package_test/test_basic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import os 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgb 8 | #from litemort import LiteMORT 9 | import numpy as np 10 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file 11 | from sklearn.model_selection import train_test_split 12 | 13 | 14 | class TestBasic(unittest.TestCase): 15 | 16 | def test(self): 17 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 18 | train_data = lgb.Dataset(X_train, label=y_train) 19 | valid_data = train_data.create_valid(X_test, label=y_test) 20 | 21 | params = { 22 | "objective": "binary", 23 | "metric": "auc", 24 | "min_data": 10, 25 | "num_leaves": 15, 26 | "verbose": -1, 27 | "num_threads": 1, 28 | "max_bin": 255 29 | } 30 | bst = lgb.Booster(params, train_data) 31 | bst.add_valid(valid_data, "valid_1") 32 | 33 | for i in range(30): 34 | bst.update() 35 | if i % 10 == 0: 36 | print(bst.eval_train(), bst.eval_valid()) 37 | #bst.save_model("model.txt") 38 | pred_from_matr = bst.predict(X_test) 39 | with tempfile.NamedTemporaryFile() as f: 40 | tname = f.name 41 | with open(tname, "w+b") as f: 42 | dump_svmlight_file(X_test, y_test, f) 43 | pred_from_file = bst.predict(tname) 44 | os.remove(tname) 45 | self.assertEqual(len(pred_from_matr), len(pred_from_file)) 46 | for preds in zip(pred_from_matr, pred_from_file): 47 | self.assertAlmostEqual(*preds, places=15) 48 | 49 | # check saved model persistence 50 | bst = lgb.Booster(params, model_file="model.txt") 51 | pred_from_model_file = 
bst.predict(X_test) 52 | self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) 53 | for preds in zip(pred_from_matr, pred_from_model_file): 54 | # we need to check the consistency of model file here, so test for exact equal 55 | self.assertEqual(*preds) 56 | 57 | # check early stopping is working. Make it stop very early, so the scores should be very close to zero 58 | pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} 59 | pred_early_stopping = bst.predict(X_test, **pred_parameter) 60 | self.assertEqual(len(pred_from_matr), len(pred_early_stopping)) 61 | for preds in zip(pred_early_stopping, pred_from_matr): 62 | # scores likely to be different, but prediction should still be the same 63 | self.assertEqual(preds[0] > 0, preds[1] > 0) 64 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/pcg_basic.h: -------------------------------------------------------------------------------- 1 | /* 2 | * PCG Random Number Generation for C. 3 | * 4 | * Copyright 2014 Melissa O'Neill 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * For additional information about the PCG random number generation scheme, 19 | * including its license and other licensing options, visit 20 | * 21 | * http://www.pcg-random.org 22 | */ 23 | 24 | /* 25 | * This code is derived from the full C implementation, which is in turn 26 | * derived from the canonical C++ PCG implementation. The C++ version 27 | * has many additional features and is preferable if you can use C++ in 28 | * your project. 29 | */ 30 | 31 | #ifndef PCG_BASIC_H_INCLUDED 32 | #define PCG_BASIC_H_INCLUDED 1 33 | 34 | #include <stdint.h> 35 | 36 | #if __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | struct pcg_state_setseq_64 { // Internals are *Private*. 41 | uint64_t state; // RNG state. All values are possible. 42 | uint64_t inc; // Controls which RNG sequence (stream) is 43 | // selected. Must *always* be odd. 44 | }; 45 | typedef struct pcg_state_setseq_64 pcg32_random_t; 46 | 47 | // If you *must* statically initialize it, here's one. 48 | 49 | #define PCG32_INITIALIZER { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL } 50 | 51 | // pcg32_srandom(initstate, initseq) 52 | // pcg32_srandom_r(rng, initstate, initseq): 53 | // Seed the rng. Specified in two parts, state initializer and a 54 | // sequence selection constant (a.k.a.
stream id) 55 | 56 | void pcg32_srandom(uint64_t initstate, uint64_t initseq); 57 | void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, 58 | uint64_t initseq); 59 | 60 | // pcg32_random() 61 | // pcg32_random_r(rng) 62 | // Generate a uniformly distributed 32-bit random number 63 | 64 | uint32_t pcg32_random(void); 65 | uint32_t pcg32_random_r(pcg32_random_t* rng); 66 | 67 | // pcg32_boundedrand(bound): 68 | // pcg32_boundedrand_r(rng, bound): 69 | // Generate a uniformly distributed number, r, where 0 <= r < bound 70 | 71 | uint32_t pcg32_boundedrand(uint32_t bound); 72 | uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound); 73 | 74 | #if __cplusplus 75 | } 76 | #endif 77 | 78 | #endif // PCG_BASIC_H_INCLUDED 79 | -------------------------------------------------------------------------------- /src/tree/old/RF_ShapeRegress.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include ".\GruST\learn\DecisionTree.hpp" 4 | #include ".\GruST\image\BMPfold.hpp" 5 | 6 | namespace Grusoft{ 7 | 8 | class RF_ShapeRegress : public RandomForest{ 9 | string sPre; 10 | char sLine[1000]; 11 | 12 | protected: 13 | bool isToCPP; 14 | FILE *fpC,*fpD,*fpT; 15 | 16 | float *cand_dis; 17 | arrPFNO featLines; 18 | void RandCandidate( int nCand,ShapeBMPfold &mean,ShapeBMPfold::PTFs &cands,int flag=0x0 ); 19 | void RandCandidate_2( int nCand,ShapeBMPfold &mean,ShapeBMPfold::PTFs &cands,int flag=0x0 ); 20 | virtual hBLIT GetBlit( WeakLearner *hWeak,int flag=0x0 ); 21 | virtual void BlitSamps( WeakLearner *hWeak,SAMPs &fnL,SAMPs &fnR,int flag=0x0 ); 22 | //virtual void ToCPP(WeakLearner *hWeak,int flag=0x0); 23 | virtual void Confi_Impuri( WeakLearner *hWeak,int flag ); 24 | virtual bool GetFeatDistri( WeakLearner *hWeak,float *distri=nullptr,int flag=0x0 ); 25 | //bool Confi_Regress( WeakLearner *hWeak,int flag ); 26 | virtual bool LeafModel( WeakLearner *hWeak,int flag=0x0 ); 27 | virtual double ErrorAt( arrPFNO& samps ); 28 | virtual void BootSample( DecisionTree *hTree,arrPFNO &boot,arrPFNO &oob,FeatData *hDat,int flag=0x0 ); 29 | void FeatLineBmp( string sPath,int flag=0x0 ); 30 | virtual int nPickAtSplit( WeakLearner *hWeak ){ 31 | return nPickWeak; 32 | } 33 | virtual void DumpTree( int nox,DecisionTree *hTree,int flag=0x0 ); 34 | virtual void UpdateFeat( int flag=0x0 ); 35 | virtual void OnMultiTree( int cas,int nMulti,int flag=0x0 ); 36 | virtual void AfterTrain( int cas,int nMulti,int flag=0x0 ); 37 | 38 | ShapeBMPfold& spMean; 39 | ShapeBMPfold::PTFs cands,*arrPTF; 40 | double nzWeak; 41 | bool isCalcErr; 42 | int nBlitThread; 43 | public: 44 | typedef enum{ 45 | SINGLE_TREE,MULTI_TREE 46 | }REGULAR; 47 | REGULAR regular; 48 | 49 | typedef enum{ 50 | BT_ALL,BT_MAX_ERR,BT_MIN_ERR,BT_RANDOM_3 51 | }BOOT; 52 | BOOT boot; 53 | ShapeBMPfold::PT_INDEX index; 54 | int nTree,dup,nOOB,no; 55 | // SHAPE_PtSet sp; 56 | arrPFNO Tests; 57 | double eta,lenda; 58 | //Eigen::MatrixXd mOff,mSum; 59 | ShapeBMPfold::VECT mOff,mSum; 60 | vector Trains; 61 | RF_ShapeRegress( ):RandomForest( ),nOOB(0),regular(SINGLE_TREE),boot(BT_MIN_ERR),spMean(ShapeBMPfold::VIRTU), 62 | isToCPP(false),fpC(NULL),fpD(NULL),fpT(NULL),arrPTF(nullptr){ ; } 63 | RF_ShapeRegress( vector&Trains,ShapeBMPfold &spMean,int nCand,int nStep,int nEach,int nOB,int cas,int flag=0x0); 64 | ~RF_ShapeRegress(); 65 | 66 | virtual int Train( string sTitle,int cas,int flag ); 67 | virtual void AfterTrain( FeatData *hData,int flag=0x0 ); 68 | void TraceBmp( string sPath,int type,int 
flag=0x0 ); 69 | 70 | bool InitCPP( char *pathC,char *pathD,char *pathT,int type,int flag=0x0 ); 71 | virtual void ToCPP( DecisionTree *hTree,int cas,int step,int tree,int flag=0x0 ); 72 | virtual void ToCPP( int cas,int flag=0x0 ); 73 | void CoreInCPP( int cas,int flag ); 74 | friend class RF_ConfiRegress; 75 | }; 76 | 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/xoshiro256starstar.c: -------------------------------------------------------------------------------- 1 | /* Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org) 2 | 3 | To the extent possible under law, the author has dedicated all copyright 4 | and related and neighboring rights to this software to the public domain 5 | worldwide. This software is distributed without any warranty. 6 | 7 | See <http://creativecommons.org/publicdomain/zero/1.0/>. */ 8 | 9 | #include <stdint.h> 10 | 11 | /* This is xoshiro256** 1.0, one of our all-purpose, rock-solid 12 | generators. It has excellent (sub-ns) speed, a state (256 bits) that is 13 | large enough for any parallel application, and it passes all tests we 14 | are aware of. 15 | 16 | For generating just floating-point numbers, xoshiro256+ is even faster. 17 | 18 | The state must be seeded so that it is not everywhere zero. If you have 19 | a 64-bit seed, we suggest to seed a splitmix64 generator and use its 20 | output to fill s. */ 21 | 22 | static inline uint64_t rotl(const uint64_t x, int k) { 23 | return (x << k) | (x >> (64 - k)); 24 | } 25 | 26 | 27 | static uint64_t s[4]; 28 | 29 | uint64_t next(void) { 30 | const uint64_t result = rotl(s[1] * 5, 7) * 9; 31 | 32 | const uint64_t t = s[1] << 17; 33 | 34 | s[2] ^= s[0]; 35 | s[3] ^= s[1]; 36 | s[1] ^= s[2]; 37 | s[0] ^= s[3]; 38 | 39 | s[2] ^= t; 40 | 41 | s[3] = rotl(s[3], 45); 42 | 43 | return result; 44 | } 45 | 46 | 47 | /* This is the jump function for the generator. It is equivalent 48 | to 2^128 calls to next(); it can be used to generate 2^128 49 | non-overlapping subsequences for parallel computations. */ 50 | 51 | void jump(void) { 52 | static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; 53 | 54 | uint64_t s0 = 0; 55 | uint64_t s1 = 0; 56 | uint64_t s2 = 0; 57 | uint64_t s3 = 0; 58 | for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) 59 | for(int b = 0; b < 64; b++) { 60 | if (JUMP[i] & UINT64_C(1) << b) { 61 | s0 ^= s[0]; 62 | s1 ^= s[1]; 63 | s2 ^= s[2]; 64 | s3 ^= s[3]; 65 | } 66 | next(); 67 | } 68 | 69 | s[0] = s0; 70 | s[1] = s1; 71 | s[2] = s2; 72 | s[3] = s3; 73 | } 74 | 75 | 76 | 77 | /* This is the long-jump function for the generator. It is equivalent to 78 | 2^192 calls to next(); it can be used to generate 2^64 starting points, 79 | from each of which jump() will generate 2^64 non-overlapping 80 | subsequences for parallel distributed computations.
*/ 81 | 82 | void long_jump(void) { 83 | static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; 84 | 85 | uint64_t s0 = 0; 86 | uint64_t s1 = 0; 87 | uint64_t s2 = 0; 88 | uint64_t s3 = 0; 89 | for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) 90 | for(int b = 0; b < 64; b++) { 91 | if (LONG_JUMP[i] & UINT64_C(1) << b) { 92 | s0 ^= s[0]; 93 | s1 ^= s[1]; 94 | s2 ^= s[2]; 95 | s3 ^= s[3]; 96 | } 97 | next(); 98 | } 99 | 100 | s[0] = s0; 101 | s[1] = s1; 102 | s[2] = s2; 103 | s[3] = s3; 104 | } 105 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/xoshiro256plusplus.c: -------------------------------------------------------------------------------- 1 | /* Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org) 2 | 3 | To the extent possible under law, the author has dedicated all copyright 4 | and related and neighboring rights to this software to the public domain 5 | worldwide. This software is distributed without any warranty. 6 | 7 | See <http://creativecommons.org/publicdomain/zero/1.0/>. */ 8 | 9 | #include <stdint.h> 10 | 11 | //We suggest to use a SplitMix64 to initialize the state of our generators starting from a 64-bit seed, as research has shown that initialization must be performed with a generator radically different in nature from the one initialized to avoid correlation on similar seeds. 12 | uint64_t SplitMix64_next() { 13 | static uint64_t x; /* The state can be seeded with any value. */ 14 | uint64_t z = (x += 0x9e3779b97f4a7c15); 15 | z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; 16 | z = (z ^ (z >> 27)) * 0x94d049bb133111eb; 17 | return z ^ (z >> 31); 18 | } 19 | 20 | /* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators. 21 | It has excellent (sub-ns) speed, a state (256 bits) that is large 22 | enough for any parallel application, and it passes all tests we are 23 | aware of. 24 | 25 | For generating just floating-point numbers, xoshiro256+ is even faster. 26 | 27 | The state must be seeded so that it is not everywhere zero. If you have 28 | a 64-bit seed, we suggest to seed a splitmix64 generator and use its 29 | output to fill s. */ 30 | 31 | static inline uint64_t xoroshiro_rotl(const uint64_t x, int k) { 32 | return (x << k) | (x >> (64 - k)); 33 | } 34 | 35 | 36 | static uint64_t s[4] = { 1,3,5,7 }; 37 | 38 | uint64_t xoroshiro_next(void) { 39 | const uint64_t result = xoroshiro_rotl(s[0] + s[3], 23) + s[0]; 40 | 41 | const uint64_t t = s[1] << 17; 42 | 43 | s[2] ^= s[0]; 44 | s[3] ^= s[1]; 45 | s[1] ^= s[2]; 46 | s[0] ^= s[3]; 47 | 48 | s[2] ^= t; 49 | 50 | s[3] = xoroshiro_rotl(s[3], 45); 51 | 52 | return result; 53 | } 54 | 55 | 56 | /* This is the jump function for the generator. It is equivalent 57 | to 2^128 calls to next(); it can be used to generate 2^128 58 | non-overlapping subsequences for parallel computations. */ 59 | 60 | void xoroshiro_jump(void) { 61 | static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; 62 | 63 | uint64_t s0 = 0; 64 | uint64_t s1 = 0; 65 | uint64_t s2 = 0; 66 | uint64_t s3 = 0; 67 | for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) 68 | for(int b = 0; b < 64; b++) { 69 | if (JUMP[i] & UINT64_C(1) << b) { 70 | s0 ^= s[0]; 71 | s1 ^= s[1]; 72 | s2 ^= s[2]; 73 | s3 ^= s[3]; 74 | } 75 | xoroshiro_next(); 76 | } 77 | 78 | s[0] = s0; 79 | s[1] = s1; 80 | s[2] = s2; 81 | s[3] = s3; 82 | } 83 | 84 | 85 | 86 | /* This is the long-jump function for the generator.
It is equivalent to 87 | 2^192 calls to next(); it can be used to generate 2^64 starting points, 88 | from each of which jump() will generate 2^64 non-overlapping 89 | subsequences for parallel distributed computations. */ 90 | 91 | void xoroshiro_long_jump(void) { 92 | static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; 93 | 94 | uint64_t s0 = 0; 95 | uint64_t s1 = 0; 96 | uint64_t s2 = 0; 97 | uint64_t s3 = 0; 98 | for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) 99 | for(int b = 0; b < 64; b++) { 100 | if (LONG_JUMP[i] & UINT64_C(1) << b) { 101 | s0 ^= s[0]; 102 | s1 ^= s[1]; 103 | s2 ^= s[2]; 104 | s3 ^= s[3]; 105 | } 106 | xoroshiro_next(); 107 | } 108 | 109 | s[0] = s0; 110 | s[1] = s1; 111 | s[2] = s2; 112 | s[3] = s3; 113 | } 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Gradient boosting is one of the most interesting and overlooked algorithms in machine learning. There are huge gaps between the simple theoretical formula and practical implementations, especially the histogram technique. The histogram-based feature representation not only greatly improves speed, it also improves accuracy. In some sense, the histogram is a sparse embedding technique that maps noisy features to a more compact and more robust space, and there is more to be gained along this direction. Based on a deep understanding of this feature embedding technique, we present LiteMORT, which uses much less memory than other GBDT libraries and reaches higher accuracy on some datasets. LiteMORT shows that the GBDT algorithm has much more potential than most people would expect. 2 | 3 | ## Some key features of LiteMORT 4 | 5 | #### 1. Faster than LightGBM with higher accuracy 6 | 7 | For example, in the recent Kaggle competition [IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection/overview) (a binary classification problem): 8 | 9 | 1) **LiteMORT is much faster than LightGBM**. LiteMORT needs only a quarter of the time of LightGBM. 10 | 11 | 2) **LiteMORT has higher AUC than LightGBM**. 12 | 13 | ![auc_8_fold](https://github.com/closest-git/ieee_fraud/raw/master/auc_8_fold.jpg) 14 | 15 | ![time_8_fold](https://github.com/closest-git/ieee_fraud/raw/master/time_8_fold.jpg) 16 | 17 | For a detailed comparison on this competition, please see https://github.com/closest-git/ieee_fraud. 18 | 19 | #### 2. Uses much less memory than other GBDT libs 20 | 21 | 1) **Shares memory with the data source** (pandas DataFrame, numpy ndarray, list, vector…) 22 | 23 | LiteMORT does not allocate extra memory for features stored in contiguous memory. During the gradient boosting process, nearly all data access goes through the original pointer plus some offsets. 24 | 25 | 2) **Implicit merging for the “merge overflow problem”** 26 | 27 | In real applications, we usually don't keep all the data in one big table; there are many smaller ones instead. But for data analysis or machine learning we still have to access all of the data, so the small datasets must be merged into huge ones, which are too large to be processed by many classical machine learning algorithms. We call this phenomenon the **“merge overflow problem”**. LiteMORT uses a smart implicit merging technique to deal with this problem: just send all the small datasets to LiteMORT and it will generate histograms for each merged feature. In the subsequent training process all operations work on these histograms, so the huge merged dataset never has to be materialized, unlike the classical method or other GBDT libs (LightGBM, XGBoost, ...). A sketch of the problem is shown below. 28 |
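To make the memory argument concrete, here is a small self-contained sketch (the tables, sizes, and `max_bin` value are hypothetical, chosen only for illustration; this is not LiteMORT's internal code). The classical approach materializes the merged table, whose size grows with the big table's row count times the number of merged columns, while a histogram view only needs the bin edges plus one small bin index per sample:

```python
# Illustrating the "merge overflow problem" vs. the histogram view.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_trans, n_users = 1_000_000, 1_000
trans = pd.DataFrame({"user_id": rng.integers(0, n_users, n_trans),   # big table
                      "amount": rng.lognormal(3.0, 1.0, n_trans)})
users = pd.DataFrame({"user_id": np.arange(n_users),                  # small table
                      "age": rng.integers(18, 80, n_users)})

# Classical approach: materialize the merge, one full copy of every
# merged column per row of the big table.
merged = trans.merge(users, on="user_id", how="left")
print("materialized merge: %.1f MiB" % (merged.memory_usage(deep=True).sum() / 2**20))

# Histogram view: the merged feature "age" is fully described by its
# quantile bin edges (here max_bin=255) plus one uint8 bin index per
# sample of the big table.
edges = np.unique(np.quantile(users["age"], np.linspace(0.0, 1.0, 255)))
age = users["age"].to_numpy()[trans["user_id"].to_numpy()]
bin_idx = np.digitize(age, edges).astype(np.uint8)
print("binned feature:     %.3f MiB" % (bin_idx.nbytes / 2**20))
```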
29 | #### 3. sklearn-like API interface. 30 | 31 | ```python 32 | from litemort import * 33 | model = LiteMORT(params).fit(train_x, train_y, eval_set=[(eval_x, eval_y)]) 34 | pred_val = model.predict(eval_x) 35 | pred_raw = model.predict_raw(eval_x) 36 | ``` 37 | 38 | #### 4. Just one line to switch from LightGBM to LiteMORT. 39 | 40 | LiteMORT supports the parameters of LightGBM. 41 | 42 | As shown below, just one more line switches from LightGBM to LiteMORT. 43 | 44 | ```python 45 | if model_type == 'mort': 46 | model = LiteMORT(params).fit_1(X_train, y_train, eval_set=[(X_valid, y_valid)]) 47 | if model_type == 'lgb': 48 | model = lgb.LGBMRegressor(**params, n_jobs=-1) 49 | model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)]) 50 | pred_test = model.predict(X_test) 51 | ``` 52 | 53 | 54 | 55 | ## Citation 56 | 57 | Please use the following bibtex entry: 58 | 59 | ``` 60 | [1] Chen, Yingshi. "LiteMORT: A memory efficient gradient boosting tree system on adaptive compact distributions." arXiv preprint arXiv:2001.09419 (2020). 61 | ``` 62 | 63 | ## Author 64 | 65 | LiteMORT was written by Yingshi Chen (gsp.cys@gmail.com) -------------------------------------------------------------------------------- /src/util/PY_obj.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <memory> //for shared_ptr 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "Object.hpp" 13 | #include "Float16.hpp" 14 | 15 | struct PY_COLUMN { 16 | char *name; 17 | void *data; 18 | char *dtype; 19 | char *type_x; 20 | double v_min; 21 | double v_max; 22 | float representive; 23 | 24 | bool isCategory() const { 25 | return type_x!=NULL && strcmp(type_x,"*")==0; 26 | } 27 | bool isDiscrete() const { 28 | return type_x != NULL && strcmp(type_x, "#") == 0; 29 | } 30 | bool isInt8() { 31 | std::string type = dtype; 32 | return type == "char" || type == "int8" || type == "uint8"; 33 | } 34 | bool isInt32() { 35 | std::string type = dtype; 36 | return type == "int" || type == "int32" || type == "uint32"; 37 | } 38 | bool isInt16() { 39 | std::string type = dtype; 40 | return type == "int16" || type == "uint16"; 41 | } 42 | bool isInt64() { 43 | std::string type = dtype; 44 | return type == "int64" || type == "uint64"; 45 | } 46 | bool isFloat() { 47 | std::string type = dtype; 48 | return type == "float32"; 49 | } 50 | bool isFloat16() { 51 | std::string type = dtype; 52 | return type == "float16"; 53 | } 54 | bool isDouble() { 55 | std::string type = dtype; 56 | return type == "float64"; 57 | } 58 | 59 | template<typename Tx> 60 | void CopyTo_(size_t nSamp, Tx* dst, int flag = 0x0) { 61 | if (isInt8()) { 62 | //assert(typeof(Tx) == typeof(int8_t)); 63 | //G_MEMCOPY_(nSamp, dst, (int8_t*)data, flag); 64 | int8_t *i8_ = (int8_t*)data; 65 | for (size_t i = 0; i < nSamp; i++) { 66 | dst[i] = i8_[i]; 67 | } 68 | } 69 | else if (isDouble()){ 70 | double *dbl = (double*)data; 71 | for (size_t i = 0; i < nSamp; i++) { 72 | dst[i] = dbl[i]; 73 | } 74 | } else if (isFloat()) { 75 | float *flt = (float*)data; 76 | for (size_t i = 0; i < nSamp; i++) { 77 | dst[i] = flt[i]; 78 | } 79 | } else if (isInt64()) { 80 | int64_t
*i64 = (int64_t*)data; 81 | for (size_t i = 0; i < nSamp; i++) { 82 | dst[i] = i64[i]; 83 | } 84 | } 85 | else if (isInt32()) { 86 | int32_t *i32 = (int32_t*)data; 87 | for (size_t i = 0; i < nSamp; i++) { 88 | dst[i] = i32[i]; 89 | } 90 | } 91 | else if (isFloat16()) { //https://stackoverflow.com/questions/22210684/16-bit-floats-and-gl-half-float 92 | int16_t *flt16= (int16_t*)data; 93 | float fRet; 94 | for (size_t i = 0; i < nSamp; i++, flt16++) { 95 | dst[i] = Float16::GLM_toFloat32(*flt16); 96 | /*int fltInt32 = (((*flt16) & 0x8000) << 16); 97 | fltInt32 |= (((*flt16) & 0x7fff) << 13) + 0x38000000; 98 | memcpy(&fRet, &fltInt32, sizeof(float)); 99 | dst[i] = fRet;*/ 100 | } 101 | } 102 | else { 103 | throw "PY_COLUMN::CopyTo_ is mismatch!!!"; 104 | } 105 | } 106 | }; 107 | 108 | struct PY_DATASET { 109 | char *name=nullptr; 110 | size_t nSamp; 111 | int ldFeat; 112 | int ldY; 113 | PY_COLUMN *columnX = nullptr; //PY_COLUMN 114 | PY_COLUMN *columnY = nullptr; //PY_COLUMN 115 | PY_COLUMN *merge_left = nullptr; 116 | //int merge_rigt = -1; 117 | int x; 118 | 119 | bool isValid() { 120 | return false; 121 | } 122 | 123 | PY_COLUMN* GetColumn(int id) { 124 | assert(id >= 0 && id < ldFeat); 125 | return columnX + id; 126 | } 127 | }; 128 | 129 | struct PY_DATASET_LIST { 130 | char *name=nullptr; 131 | int nSet=0; 132 | PY_DATASET *list = nullptr; //PY_COLUMN 133 | int x=0; 134 | 135 | static PY_DATASET* GetSet(PY_DATASET_LIST *data_list,int no=0x0) { 136 | assert(data_list!=nullptr && data_list->nSet > no); 137 | PY_DATASET *set = data_list->list+no; 138 | assert(set->ldFeat > 0); 139 | assert(set->ldY > 0); 140 | assert(set->nSamp > 0); 141 | return set; 142 | } 143 | }; 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /python-package/LiteMORT/compat.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = C0103 3 | """Compatibility""" 4 | from __future__ import absolute_import 5 | 6 | import inspect 7 | import sys 8 | 9 | import numpy as np 10 | 11 | is_py3 = (sys.version_info[0] == 3) 12 | 13 | """compatibility between python2 and python3""" 14 | if is_py3: 15 | zip_ = zip 16 | string_type = str 17 | numeric_types = (int, float, bool) 18 | integer_types = (int, ) 19 | range_ = range 20 | 21 | def argc_(func): 22 | """return number of arguments of a function""" 23 | return len(inspect.signature(func).parameters) 24 | 25 | def decode_string(bytestring): 26 | return bytestring.decode('utf-8') 27 | else: 28 | from itertools import izip as zip_ 29 | string_type = basestring 30 | numeric_types = (int, long, float, bool) 31 | integer_types = (int, long) 32 | range_ = xrange 33 | 34 | def argc_(func): 35 | """return number of arguments of a function""" 36 | return len(inspect.getargspec(func).args) 37 | 38 | def decode_string(bytestring): 39 | return bytestring 40 | 41 | """json""" 42 | try: 43 | import simplejson as json 44 | except (ImportError, SyntaxError): 45 | # simplejson does not support Python 3.2, it throws a SyntaxError 46 | # because of u'...' Unicode literals. 
47 | import json 48 | 49 | 50 | def json_default_with_numpy(obj): 51 | if isinstance(obj, (np.integer, np.floating, np.bool_)): 52 | return obj.item() 53 | elif isinstance(obj, np.ndarray): 54 | return obj.tolist() 55 | else: 56 | return obj 57 | 58 | 59 | """pandas""" 60 | try: 61 | from pandas import Series, DataFrame 62 | PANDAS_INSTALLED = True 63 | except ImportError: 64 | PANDAS_INSTALLED = False 65 | 66 | class Series(object): 67 | pass 68 | 69 | class DataFrame(object): 70 | pass 71 | 72 | """matplotlib""" 73 | try: 74 | import matplotlib 75 | MATPLOTLIB_INSTALLED = True 76 | except ImportError: 77 | MATPLOTLIB_INSTALLED = False 78 | 79 | """graphviz""" 80 | try: 81 | import graphviz 82 | GRAPHVIZ_INSTALLED = True 83 | except ImportError: 84 | GRAPHVIZ_INSTALLED = False 85 | 86 | """sklearn""" 87 | try: 88 | from sklearn.base import BaseEstimator 89 | from sklearn.base import RegressorMixin, ClassifierMixin 90 | from sklearn.preprocessing import LabelEncoder 91 | from sklearn.utils.class_weight import compute_sample_weight 92 | from sklearn.utils.multiclass import check_classification_targets 93 | from sklearn.utils.validation import check_X_y, check_array, check_consistent_length 94 | try: 95 | from sklearn.model_selection import StratifiedKFold, GroupKFold 96 | from sklearn.exceptions import NotFittedError 97 | except ImportError: 98 | from sklearn.cross_validation import StratifiedKFold, GroupKFold 99 | from sklearn.utils.validation import NotFittedError 100 | SKLEARN_INSTALLED = True 101 | _MortModelBase = BaseEstimator 102 | _MortRegressorBase = RegressorMixin 103 | _MortClassifierBase = ClassifierMixin 104 | _MortLabelEncoder = LabelEncoder 105 | MortNotFittedError = NotFittedError 106 | _MortStratifiedKFold = StratifiedKFold 107 | _MortGroupKFold = GroupKFold 108 | _MortCheckXY = check_X_y 109 | _MortCheckArray = check_array 110 | _MortCheckConsistentLength = check_consistent_length 111 | _MortCheckClassificationTargets = check_classification_targets 112 | _MortComputeSampleWeight = compute_sample_weight 113 | except ImportError: 114 | SKLEARN_INSTALLED = False 115 | _MortModelBase = object 116 | _MortClassifierBase = object 117 | _MortRegressorBase = object 118 | _MortLabelEncoder = None 119 | MortNotFittedError = ValueError 120 | _MortStratifiedKFold = None 121 | _MortGroupKFold = None 122 | _MortCheckXY = None 123 | _MortCheckArray = None 124 | _MortCheckConsistentLength = None 125 | _MortCheckClassificationTargets = None 126 | _MortComputeSampleWeight = None 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/pcg_basic.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PCG Random Number Generation for C. 3 | * 4 | * Copyright 2014 Melissa O'Neill 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | * 18 | * For additional information about the PCG random number generation scheme, 19 | * including its license and other licensing options, visit 20 | * 21 | * http://www.pcg-random.org 22 | */ 23 | 24 | /* 25 | * This code is derived from the full C implementation, which is in turn 26 | * derived from the canonical C++ PCG implementation. The C++ version 27 | * has many additional features and is preferable if you can use C++ in 28 | * your project. 29 | */ 30 | 31 | #include "pcg_basic.h" 32 | 33 | // state for global RNGs 34 | 35 | static pcg32_random_t pcg32_global = PCG32_INITIALIZER; 36 | 37 | // pcg32_srandom(initstate, initseq) 38 | // pcg32_srandom_r(rng, initstate, initseq): 39 | // Seed the rng. Specified in two parts, state initializer and a 40 | // sequence selection constant (a.k.a. stream id) 41 | 42 | void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, uint64_t initseq) 43 | { 44 | rng->state = 0U; 45 | rng->inc = (initseq << 1u) | 1u; 46 | pcg32_random_r(rng); 47 | rng->state += initstate; 48 | pcg32_random_r(rng); 49 | } 50 | 51 | void pcg32_srandom(uint64_t seed, uint64_t seq) 52 | { 53 | pcg32_srandom_r(&pcg32_global, seed, seq); 54 | } 55 | 56 | // pcg32_random() 57 | // pcg32_random_r(rng) 58 | // Generate a uniformly distributed 32-bit random number 59 | 60 | uint32_t pcg32_random_r(pcg32_random_t* rng) 61 | { 62 | uint64_t oldstate = rng->state; 63 | rng->state = oldstate * 6364136223846793005ULL + rng->inc; 64 | uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; 65 | uint32_t rot = oldstate >> 59u; 66 | return (xorshifted >> rot) | (xorshifted << ((-(int32_t)(rot)) & 31)); 67 | } 68 | 69 | uint32_t pcg32_random() 70 | { 71 | return pcg32_random_r(&pcg32_global); 72 | } 73 | 74 | 75 | // pcg32_boundedrand(bound): 76 | // pcg32_boundedrand_r(rng, bound): 77 | // Generate a uniformly distributed number, r, where 0 <= r < bound 78 | 79 | uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound) 80 | { 81 | // To avoid bias, we need to make the range of the RNG a multiple of 82 | // bound, which we do by dropping output less than a threshold. 83 | // A naive scheme to calculate the threshold would be to do 84 | // 85 | // uint32_t threshold = 0x100000000ull % bound; 86 | // 87 | // but 64-bit div/mod is slower than 32-bit div/mod (especially on 88 | // 32-bit platforms). In essence, we do 89 | // 90 | // uint32_t threshold = (0x100000000ull-bound) % bound; 91 | // 92 | // because this version will calculate the same modulus, but the LHS 93 | // value is less than 2^32. 94 | 95 | uint32_t threshold = -(int32_t)(bound) % bound; 96 | 97 | // Uniformity guarantees that this loop will terminate. In practice, it 98 | // should usually terminate quickly; on average (assuming all bounds are 99 | // equally likely), 82.25% of the time, we can expect it to require just 100 | // one iteration. In the worst case, someone passes a bound of 2^31 + 1 101 | // (i.e., 2147483649), which invalidates almost 50% of the range. In 102 | // practice, bounds are typically small and only a tiny amount of the range 103 | // is eliminated. 
104 | for (;;) { 105 | uint32_t r = pcg32_random_r(rng); 106 | if (r >= threshold) 107 | return r % bound; 108 | } 109 | } 110 | 111 | 112 | uint32_t pcg32_boundedrand(uint32_t bound) 113 | { 114 | return pcg32_boundedrand_r(&pcg32_global, bound); 115 | } 116 | 117 | -------------------------------------------------------------------------------- /python-package/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = 'litemort' 16 | DESCRIPTION = 'Fastest gradient boosting library with higher accuracy' 17 | URL = 'https://github.com/closest-git/LiteMORT' 18 | EMAIL = 'gsp.cys@gmail.com' 19 | AUTHOR = 'Yingshi Chen' 20 | REQUIRES_PYTHON = '>=3.0.0' 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = [ 25 | 'numpy', 'scipy', 'scikit-learn', 26 | ] 27 | 28 | # What packages are optional? 29 | EXTRAS = { 30 | # 'fancy feature': ['django'], 31 | } 32 | 33 | # The rest you shouldn't have to touch too much :) 34 | # ------------------------------------------------ 35 | # Except, perhaps the License and Trove Classifiers! 36 | # If you do change the License, remember to change the Trove Classifier for that! 37 | 38 | here = os.path.abspath(os.path.dirname(__file__)) 39 | 40 | # Import the README and use it as the long-description. 41 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 42 | try: 43 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 44 | long_description = '\n' + f.read() 45 | except FileNotFoundError: 46 | long_description = DESCRIPTION 47 | 48 | # Load the package's __version__.py module as a dictionary. 49 | about = {} 50 | if not VERSION: 51 | project_slug = NAME.lower().replace("-", "_").replace(" ", "_") 52 | with open(os.path.join(here, project_slug, '__version__.py')) as f: 53 | exec(f.read(), about) 54 | else: 55 | about['__version__'] = VERSION 56 | 57 | 58 | class UploadCommand(Command): 59 | """Support setup.py upload.""" 60 | 61 | description = 'Build and publish the package.' 
62 | user_options = [] 63 | 64 | @staticmethod 65 | def status(s): 66 | """Prints things in bold.""" 67 | print('\033[1m{0}\033[0m'.format(s)) 68 | 69 | def initialize_options(self): 70 | pass 71 | 72 | def finalize_options(self): 73 | pass 74 | 75 | def run(self): 76 | try: 77 | self.status('Removing previous builds…') 78 | rmtree(os.path.join(here, 'dist')) 79 | except OSError: 80 | pass 81 | 82 | self.status('Building Source and Wheel (universal) distribution…') 83 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 84 | 85 | self.status('Uploading the package to PyPI via Twine…') 86 | os.system('twine upload dist/*') 87 | 88 | self.status('Pushing git tags…') 89 | os.system('git tag v{0}'.format(about['__version__'])) 90 | os.system('git push --tags') 91 | 92 | sys.exit() 93 | 94 | 95 | # Where the magic happens: 96 | setup( 97 | name=NAME, 98 | version=about['__version__'], 99 | description=DESCRIPTION, 100 | long_description=long_description, 101 | long_description_content_type='text/markdown', 102 | author=AUTHOR, 103 | author_email=EMAIL, 104 | python_requires=REQUIRES_PYTHON, 105 | url=URL, 106 | packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), 107 | data_files=[('./litemort',["./litemort/libLiteMORT.so","./litemort/LiteMORT.dll"])], 108 | 109 | # If your package is a single module, use this instead of 'packages': 110 | # py_modules=['mypackage'], 111 | 112 | # entry_points={ 113 | # 'console_scripts': ['mycli=mymodule:cli'], 114 | # }, 115 | install_requires=REQUIRED, 116 | extras_require=EXTRAS, 117 | include_package_data=True, 118 | license='MIT', 119 | classifiers=[ 120 | # Trove classifiers 121 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 122 | 'License :: OSI Approved :: MIT License', 123 | 'Programming Language :: Python', 124 | 'Programming Language :: Python :: 3', 125 | 'Programming Language :: Python :: 3.6', 126 | 'Programming Language :: Python :: Implementation :: CPython', 127 | 'Programming Language :: Python :: Implementation :: PyPy', 128 | 'Operating System :: Microsoft :: Windows', 129 | 'Operating System :: Unix', 130 | ], 131 | # $ setup.py publish support. 
132 | cmdclass={ 133 | 'upload': UploadCommand, 134 | }, 135 | ) 136 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_problems.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.preprocessing import LabelEncoder 5 | from sklearn.linear_model import Lasso 6 | from sklearn.linear_model import Ridge,ElasticNet 7 | from .compat import (_MortModelBase,_MortClassifierBase,_MortRegressorBase) 8 | 9 | 10 | class Mort_Problems(_MortModelBase): 11 | def __init__(self, **kwargs): 12 | pass 13 | 14 | def get_params(self, deep=True): 15 | params = super(_MortModelBase, self).get_params(deep=deep) 16 | params.update(self._other_params) 17 | return params 18 | 19 | # minor change to support `**kwargs` 20 | def set_params(self, **params): 21 | for key, value in params.items(): 22 | setattr(self, key, value) 23 | if hasattr(self, '_' + key): 24 | setattr(self, '_' + key, value) 25 | self._other_params[key] = value 26 | return self 27 | 28 | # Note that Y_t is not the same as y_train 29 | def OnY(self, y_train, np_type): 30 | # print(type(y_train)) 31 | if type(y_train) is pd.Series: 32 | np_target = y_train.values.astype(np_type) 33 | elif isinstance(y_train, pd.DataFrame): 34 | np_target = y_train.values.astype(np_type) 35 | else: 36 | np_target = y_train.astype(np_type) 37 | return np_target 38 | 39 | def BeforeFit(self,train_set,eval_set): 40 | return False,None,None 41 | 42 | def AfterPredict(self, X_, Y_): 43 | return Y_ 44 | 45 | 46 | def OnResult(self,result_,pred_leaf=False, pred_contrib=False,raw_score=False): 47 | return result_ 48 | 49 | class Mort_BinaryClass(Mort_Problems): 50 | ''' 51 | or LogisticRegression 52 | ''' 53 | def __init__(self,params, **kwargs): 54 | super(Mort_BinaryClass, self).__init__() 55 | self._labelOfY=None 56 | 57 | def OnY(self, y_train, np_type): 58 | if self._labelOfY is None: 59 | self._labelOfY=LabelEncoder() 60 | self._labelOfY.fit(y_train) 61 | transformed_labels = self._labelOfY.transform(y_train) 62 | self._classes = self._labelOfY.classes_ 63 | self._n_classes = len(self._classes) 64 | if self._n_classes != 2: 65 | raise ValueError("Y has {} classes. Not a binary-classification problem!!!".format(self._n_classes) ) 66 | else: 67 | transformed_labels = self._labelOfY.transform(y_train) 68 | return super(Mort_BinaryClass, self).OnY(transformed_labels, np_type) 69 | 70 | """LiteMORT Binary classifier. https://en.wikipedia.org/wiki/Binary_classification""" 71 | def OnResult(self,result_,pred_leaf=False, pred_contrib=False,raw_score=False): 72 | # the predicted probabilities of the 2 classes 73 | result_ = np.vstack((1.
- result_, result_)).transpose() 74 | if raw_score or pred_leaf or pred_contrib: 75 | return result_ 76 | else: 77 | class_index = np.argmax(result_, axis=1) 78 | if self._labelOfY is not None: 79 | return self._labelOfY.inverse_transform(class_index) 80 | else: 81 | return class_index 82 | pass 83 | 84 | class Mort_MultiClass(Mort_Problems, _MortClassifierBase): 85 | pass 86 | 87 | class Mort_Regressor(Mort_Problems, _MortRegressorBase): 88 | """LiteMORT regressor.""" 89 | def __init__(self,params, **kwargs): 90 | super(Mort_Regressor, self).__init__() 91 | self.alpha = 1 92 | self.gressor = None; self.alg="None" 93 | if 'cascade' in params and params['cascade']=="lasso": 94 | self.gressor = Lasso(alpha=self.alpha, normalize=True) 95 | self.alg = params['cascade'] 96 | #self.gressor = Ridge(alpha=0.05, normalize=True) 97 | #self.gressor = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False) 98 | self.mse = 0 99 | 100 | 101 | def BeforeFit(self,train_set,eval_set): 102 | if self.gressor is None: 103 | return False,None,None 104 | 105 | print(f"====== Mort_Regressor::BeforeFit@{self.gressor} alpha={self.alpha}") 106 | x_train, y_train = train_set 107 | self.gressor.fit(x_train, y_train) 108 | pred = self.gressor.predict(x_train) 109 | self.mse = np.mean((pred - y_train)**2) 110 | y_train = y_train - pred 111 | 112 | y_eval = None 113 | if (eval_set is not None and len(eval_set) > 0): 114 | X_eval, y_eval = eval_set[0] 115 | y_pred = self.gressor.predict(X_eval) 116 | y_eval = y_eval-y_pred 117 | return True,y_train,[y_eval] 118 | 119 | def AfterPredict(self,X_,Y_): 120 | if self.gressor is not None: 121 | y_pred = self.gressor.predict(X_) 122 | Y_= Y_+y_pred 123 | return Y_ 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /src/learn/DCRIMI_.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | //#include 5 | #include 6 | #define _USE_MATH_DEFINES 7 | #include 8 | #include 9 | #include 10 | #include "DCRIMI_.hpp" 11 | #include "../util/GST_def.h" 12 | 13 | using namespace Grusoft; 14 | using namespace std; 15 | 16 | //double DCRIMI_::tX = 0; 17 | double DCRIMI_2::tX = 0; 18 | 19 | /* 20 | Copyright 2008-present, Grusoft. 
21 | v0.1 cys 22 | 6/13/2015 23 | */ 24 | //int DCRIMI_::nSPAN = 1000; 25 | DCRIMI_::DCRIMI_(void *hB, int span, int flag) : D_span(span), hBase(hB), dump(1), isSaveFalse(false), isBidui(false) { 26 | D_span = span; hBase = hB; dump = 1; 27 | assert(span >= 100 && hBase != nullptr); 28 | D_inter = new float[D_span + 1]; D_intra = new float[D_span + 1]; 29 | for (int i = 0; i <= D_span; i++) { 30 | D_inter[i] = 0.0; D_intra[i] = 0.0; 31 | } 32 | } 33 | DCRIMI_::DCRIMI_(const DCRIMI_& dcri) :isSaveFalse(dcri.isSaveFalse) { 34 | memset(this, 0x0, sizeof(DCRIMI_)); 35 | 36 | rFAR = dcri.rFAR, rFRR = dcri.rFRR, rEER = dcri.rEER, 37 | D_ = dcri.D_, sep = dcri.sep, eer_sep = dcri.eer_sep; 38 | mean_a = dcri.mean_a, mean_r = dcri.mean_r; 39 | devia_a = dcri.devia_a, devia_r = dcri.devia_r; 40 | dump = dcri.dump; 41 | } 42 | 43 | void DCRIMI_::Init(int flag) { 44 | for (int i = 0; i <= D_span; i++) { 45 | D_inter[i] = 0.0; D_intra[i] = 0.0; 46 | } 47 | rFAR = 0.0, rFRR = 0.0, rEER = 0.0, D_ = 0.0, sep = 0.0, eer_sep = 0.0; 48 | rTop_1 = 0.0; rTop_5 = 0.0; 49 | } 50 | 51 | 52 | void DCRIMI_::Insert_1(float dis, bool isIntra, int flag) { 53 | assert(dis>-0.001 && dis<1.001); 54 | int pos = dis*D_span; 55 | pos = max(pos, 0); pos = min(pos, D_span); 56 | if (isIntra) { 57 | D_intra[pos]++; 58 | } 59 | else { 60 | D_inter[pos]++; 61 | } 62 | 63 | } 64 | void DCRIMI_::Analyze(const string &sTitle, int flag) { 65 | assert(D_inter != nullptr && D_intra != nullptr); 66 | int i, grid = 0; 67 | double s, D_s, f_ar, f_rr, w_a, w_r, f_ar_g = 1.0e-7; 68 | 69 | for (i = 0; i<8; i++) { 70 | f_ar_8[i] = -1.0; f_rr_8[i] = -1.0; hd_8[i] = -1.0; 71 | } 72 | D_s = 1.0 / D_span; //only for hamming distance 73 | 74 | mean_a = 0.0; mean_r = 0; 75 | nz_a = 0.0; nz_r = 0.0; 76 | max_a = max_r = 0.0; min_a = min_r = 1.0; 77 | for (i = 0; i <= D_span; i++) { 78 | mean_a += i*D_intra[i]; nz_a += D_intra[i]; 79 | if (D_intra[i]>0) { 80 | max_a = MAX2(max_a, i*D_s); min_a = MIN2(min_a, i*D_s); 81 | } 82 | mean_r += i*D_inter[i]; nz_r += D_inter[i]; 83 | if (D_inter[i]>0) { 84 | max_r = MAX2(max_r, i*D_s); min_r = MIN2(min_r, i*D_s); 85 | } 86 | } 87 | // nz_a = nz_a; nz_r = nz_r; 88 | mean_a = nz_a == 0 ? 0.0 : mean_a / nz_a*D_s; 89 | mean_r = nz_r == 0 ? 0.0 : mean_r / nz_r*D_s; 90 | // mean_a = mean_a; mean_r = mean_r; 91 | 92 | devia_a = 0.0; devia_r = 0; 93 | w_a = 0.0; w_r = 0.0; 94 | for (i = 0; i <= D_span; i++) { 95 | w_a += D_intra[i]; w_r += D_inter[i]; 96 | f_ar = w_r*1.0 / nz_r; 97 | f_rr = (nz_a - w_a)*1.0 / nz_a; 98 | while (f_ar >= f_ar_g && f_ar_8[grid] == -1.0) { 99 | f_ar_8[grid] = f_ar; 100 | f_rr_8[grid] = f_rr; hd_8[grid] = i*1.0 / D_span; 101 | f_ar_g *= 10; grid++; 102 | if (f_ar < f_ar_g) 103 | break; 104 | } 105 | if (f_ar >= f_rr && rEER == 0.0) { 106 | rEER = (f_ar + f_rr) / 2.0; 107 | eer_sep = i*1.0 / D_span;//sep; 108 | } 109 | if (f_ar>1.0e-3 && sep == 0.0) { 110 | //if( f_ar>1.0e-2 && sep==0.0 ) { 111 | sep = i*1.0 / D_span; 112 | rFAR = f_ar; 113 | rFRR = f_rr; 114 | } 115 | if (D_intra[i] != 0) { 116 | s = (i*D_s - mean_a); 117 | devia_a += s*s*D_intra[i]; 118 | } 119 | s = (i*D_s - mean_r); 120 | devia_r += s*s*D_inter[i]; 121 | } 122 | devia_a = nz_a == 0 ? 0.0 : sqrt(devia_a / nz_a); 123 | devia_r = nz_r == 0 ? 0.0 : sqrt(devia_r / nz_r); 124 | // devia_a = devian_a; devia_r = devian_r; 125 | 126 | s = sqrt((devia_a*devia_a) + (devia_r*devia_r)) / 2.0; 127 | D_ = s == 0 ? 
0.0 : fabs(mean_a - mean_r) / s; 128 | double accu = (1.0 - rFRR)*100.0; 129 | if (rFAR>2 * 1.0e-2) { accu /= (rFAR / 1.0e-2); } //rFAR=1% 130 | if (dump != 0) { 131 | printf("\n@@@\"%s\" nz=(%g,%g) intra=(%.3g,%.3g,%.3g,%.3g),inter=(%.3g,%.3g,%.3g,%.3g)" 132 | "\n@@@\taccu=%.3g%%(T=%g,frr=%.3g far=%.2g%%) EER=%.3g(%.3g) _DCRIMI_\n" 133 | , sTitle.c_str(), nz_a, nz_r, mean_a, devia_a, max_a, min_a, mean_r, devia_r, max_r, min_r, 134 | accu, sep, rFRR, rFAR * 100, rEER, eer_sep); 135 | for (i = 0; i<8; i++) { 136 | printf("(%.1e,%.3g)", f_ar_8[i], f_rr_8[i]); 137 | } 138 | printf("\n"); 139 | } 140 | 141 | } 142 | 143 | void DCRIMI_::GetRoc(float *roc, int flag) { 144 | int i; 145 | double s, D_s, f_ar, f_rr, w_a = 0.0, w_r = 0.0, f_ar_g = 1.0e-7; 146 | for (i = 0; i < D_span; i++) { 147 | w_a += D_intra[i]; w_r += D_inter[i]; 148 | roc[2 * i] = w_r*1.0 / nz_r; 149 | roc[2 * i + 1] = (nz_a - w_a)*1.0 / nz_a; 150 | } 151 | } 152 | 153 | double DCRIMI_::T_intra(int flag) { 154 | if (nz_a == 0) 155 | return 1.0; 156 | double t = mean_a + 7 * devia_a; 157 | t = min(t, 1.0); 158 | assert(t>0 && t <= 1.0); 159 | return t; 160 | } -------------------------------------------------------------------------------- /python-package/case_poct.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold 3 | from sklearn.metrics import mean_absolute_error,mean_squared_error 4 | from sklearn.metrics import roc_auc_score, roc_curve,auc 5 | import time 6 | import numpy as np 7 | from litemort import * 8 | import sys 9 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 10 | #isMORT = True 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | import gc 14 | import seaborn as sns 15 | import pickle 16 | ''' 17 | histo->RandomCompress() seems to be able to improve accuracy by searching more of the space 18 | 19 | ''' 20 | 21 | def ROC_plot(features,X_,y_, pred_,title): 22 | fpr_, tpr_, thresholds = roc_curve(y_, pred_) 23 | optimal_idx = np.argmax(tpr_ - fpr_) 24 | #https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python 25 | optimal_threshold = thresholds[optimal_idx] 26 | auc_ = auc(fpr_, tpr_) 27 | title = "{} auc=".format(title) 28 | print("{} auc={} OT={:.4g}".format(title, auc_,optimal_threshold)) 29 | plt.plot(fpr_, tpr_, label="{}:{:.4g}".format(title, auc_)) 30 | plt.xlabel('False positive rate') 31 | plt.ylabel('True positive rate') 32 | plt.title('SAMPLES={} Features={} OT={:.4g}'.format(X_.shape[0],len(features),optimal_threshold)) 33 | plt.legend(loc='best') 34 | plt.savefig("./_auc_[{}].jpg".format(features)) 35 | plt.show() 36 | return auc_,optimal_threshold 37 | 38 | def runLgb(X, y, test=None, num_rounds=10000, max_depth=-1, eta=0.01, subsample=0.8, 39 | colsample=0.8, min_child_weight=1, early_stopping_rounds=500, seeds_val=2017): 40 | plot_feature_importance = True 41 | features = list(X.columns) 42 | print("X={} y={}".format(X.shape,y.shape)) 43 | params = {'task': 'train', 44 | 'max_bin': 256, 45 | 'salp_bins':32, 46 | #'elitism': 2, # not applicable to this case 47 | 'min_data_in_leaf': 32, 48 | 'boosting_type': 'gbdt', 49 | 'objective': 'binary', 50 | 'learning_rate': eta, 51 | # 'metric': {'multi_logloss'}, 52 | 'metric': 'auc', 53 | 'early_stop':early_stopping_rounds, 54 | 'max_depth': max_depth, 55 | # 'min_child_weight':min_child_weight, 56 | 'bagging_fraction': subsample, 57 | 'feature_fraction': colsample, 58 | 'bagging_seed': seeds_val, 59 | 'num_iterations': num_rounds, 60 |
'num_leaves': 32, 61 | 'lambda_l1': 1.0, 62 | 'verbose': 0, 63 | 'nthread': -1} 64 | n_fold = 5 65 | folds = KFold(n_splits=n_fold, shuffle=True, random_state=11) 66 | y_pred=np.zeros(y.shape[0]) 67 | feature_importance = None 68 | if not isMORT: 69 | feature_importance = pd.DataFrame() 70 | for fold_n, (train_index, valid_index) in enumerate(folds.split(X)): 71 | t0 = time.time() 72 | 73 | if type(X) == np.ndarray: 74 | X_train, X_valid = X[train_index], X[valid_index] 75 | y_train, y_valid = y[train_index], y[valid_index] 76 | else: 77 | X_train, X_valid = X.iloc[train_index], X.iloc[valid_index] 78 | y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] 79 | 80 | if isMORT: 81 | model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)]) 82 | pred_val = model.predict(X_valid) 83 | pred_raw = model.predict_raw(X_valid) 84 | y_pred[valid_index] = pred_raw 85 | fold_score = roc_auc_score(y_valid, pred_raw) 86 | else: 87 | lgtrain = lgb.Dataset(X_train, y_train) 88 | lgval = lgb.Dataset(X_valid, y_valid) 89 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=lgval, 90 | early_stopping_rounds=early_stopping_rounds, verbose_eval=100) 91 | plt.figure(figsize=(12, 6)) 92 | lgb.plot_importance(model, max_num_features=30) 93 | plt.title("Feature importances") 94 | plt.show() 95 | 96 | fold_importance = pd.DataFrame() 97 | fold_importance["feature"] = X.columns 98 | fold_importance["importance"] = model.feature_importance() 99 | fold_importance["fold"] = fold_n + 1 100 | feature_importance = pd.concat([feature_importance, fold_importance], axis=0) 101 | model.save_model(f'model_lgb_poct_{fold_n}_.txt') 102 | pred_val = model.predict(X_valid) 103 | y_pred[valid_index] = pred_val 104 | fold_score = roc_auc_score(y_valid, pred_val) 105 | 106 | print("fold n°{} time={:.3g} score={:.4g}".format(fold_n, time.time() - t0, fold_score)) 107 | if test is not None: 108 | pred_test = model.predict(test, num_iteration=model.best_iteration) 109 | 110 | else: 111 | pred_test = None 112 | #break 113 | auc = roc_auc_score(y, y_pred) 114 | if feature_importance is not None: 115 | feature_importance["importance"] /= n_fold 116 | if plot_feature_importance: 117 | cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values( 118 | by="importance", ascending=False)[:].index 119 | best_features = feature_importance.loc[feature_importance.feature.isin(cols)] 120 | plt.figure(figsize=(5, 3)); 121 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) 122 | plt.xlabel("importance of each feature") 123 | plt.title('AUC={:.3f} ({}-folds)'.format(auc,n_fold)) 124 | plt.savefig("./_importance_[{}].jpg".format(features)) 125 | plt.show() 126 | 127 | ROC_plot(features,X, y, y_pred, "") 128 | 129 | print("CV score: {:<8.5f}".format(auc)) 130 | return auc 131 | 132 | pkl_path=f"E:/POCTx/poct_InHospital.pkl" 133 | with open(pkl_path, "rb") as fp: # Pickling 134 | X = pickle.load(fp) 135 | y = pickle.load(fp) 136 | score = runLgb(X, y) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons.
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Xx]64/ 19 | [Xx]86/ 20 | [Bb]uild/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | docs/_build 25 | tests/bin 26 | lib 27 | data 28 | _000 29 | python-package/dist 30 | .pytest_cache/v/cache 31 | 32 | # Visual Studio 2015 cache/options directory 33 | .vs/ 34 | # Uncomment if you have tasks that create the project's static files in wwwroot 35 | #wwwroot/ 36 | 37 | # MSTest test Results 38 | [Tt]est[Rr]esult*/ 39 | [Bb]uild[Ll]og.* 40 | 41 | # NUNIT 42 | *.VisualState.xml 43 | TestResult.xml 44 | 45 | # Build Results of an ATL Project 46 | [Dd]ebugPS/ 47 | [Rr]eleasePS/ 48 | dlldata.c 49 | 50 | # DNX 51 | project.lock.json 52 | artifacts/ 53 | 54 | # Python 55 | *.egg-info 56 | __pycache__ 57 | .eggs 58 | 59 | # VS Code 60 | .vscode 61 | 62 | # Prerequisites 63 | *.d 64 | 65 | # Compiled Object files 66 | *.slo 67 | *.lo 68 | *.o 69 | *.obj 70 | 71 | # Precompiled Headers 72 | *.gch 73 | 74 | *_i.c 75 | *_p.c 76 | *_i.h 77 | *.ilk 78 | *.meta 79 | *.obj 80 | *.pch 81 | *.pdb 82 | *.pgc 83 | *.pgd 84 | *.rsp 85 | *.sbr 86 | *.tlb 87 | *.tli 88 | *.tlh 89 | *.tmp 90 | *.tmp_proj 91 | *.log 92 | *.vspscc 93 | *.vssscc 94 | .builds 95 | *.pidb 96 | *.svclog 97 | *.scc 98 | *.rar 99 | *.ym 100 | *.model 101 | 102 | 103 | # Chutzpah Test files 104 | _Chutzpah* 105 | 106 | # Visual C++ cache files 107 | ipch/ 108 | *.aps 109 | *.ncb 110 | *.opendb 111 | *.opensdf 112 | *.sdf 113 | *.cachefile 114 | *.VC.db 115 | 116 | # Visual Studio profiler 117 | *.psess 118 | *.vsp 119 | *.vspx 120 | *.sap 121 | 122 | # TFS 2012 Local Workspace 123 | $tf/ 124 | 125 | # Guidance Automation Toolkit 126 | *.gpState 127 | 128 | # ReSharper is a .NET coding add-in 129 | _ReSharper*/ 130 | *.[Rr]e[Ss]harper 131 | *.DotSettings.user 132 | 133 | # JustCode is a .NET coding add-in 134 | .JustCode 135 | 136 | # TeamCity is a build add-in 137 | _TeamCity* 138 | 139 | # DotCover is a Code Coverage Tool 140 | *.dotCover 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | 174 | # TODO: Un-comment the next line if you do not want to checkin 175 | # your web deploy settings because they may include unencrypted 176 | # passwords 177 | #*.pubxml 178 | *.publishproj 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/packages/* 184 | # except build/, which is used as an MSBuild target. 
185 | !**/packages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/packages/repositories.config 188 | # NuGet v3's project.json files produces more ignoreable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directory 201 | AppPackages/ 202 | BundleArtifacts/ 203 | 204 | # Visual Studio cache files 205 | # files ending in .cache can be ignored 206 | *.[Cc]ache 207 | # but keep track of directories ending in .cache 208 | !*.[Cc]ache/ 209 | 210 | # Others 211 | ClientBin/ 212 | [Ss]tyle[Cc]op.* 213 | ~$* 214 | *~ 215 | *.dbmdl 216 | *.dbproj.schemaview 217 | *.pfx 218 | *.publishsettings 219 | node_modules/ 220 | orleans.codegen.cs 221 | 222 | # RIA/Silverlight projects 223 | Generated_Code/ 224 | 225 | # Backup & report files from converting an old project file 226 | # to a newer Visual Studio version. Backup files are not needed, 227 | # because we have git ;-) 228 | _UpgradeReport_Files/ 229 | Backup*/ 230 | UpgradeLog*.XML 231 | UpgradeLog*.htm 232 | 233 | # SQL Server files 234 | *.mdf 235 | *.ldf 236 | 237 | # Business Intelligence projects 238 | *.rdl.data 239 | *.bim.layout 240 | *.bim_*.settings 241 | 242 | # Microsoft Fakes 243 | FakesAssemblies/ 244 | 245 | # GhostDoc plugin setting file 246 | *.GhostDoc.xml 247 | 248 | # Node.js Tools for Visual Studio 249 | .ntvs_analysis.dat 250 | 251 | # Visual Studio 6 build log 252 | *.plg 253 | 254 | # Visual Studio 6 workspace options file 255 | *.opt 256 | 257 | # Visual Studio LightSwitch build output 258 | **/*.HTMLClient/GeneratedArtifacts 259 | **/*.DesktopClient/GeneratedArtifacts 260 | **/*.DesktopClient/ModelManifest.xml 261 | **/*.Server/GeneratedArtifacts 262 | **/*.Server/ModelManifest.xml 263 | _Pvt_Extensions 264 | 265 | # LightSwitch generated files 266 | GeneratedArtifacts/ 267 | ModelManifest.xml 268 | 269 | # Paket dependency manager 270 | .paket/paket.exe 271 | 272 | # FAKE - F# Make 273 | .fake/ 274 | *.lai 275 | *.la 276 | *.a 277 | *.lib 278 | *.zip 279 | *.info 280 | *.dll 281 | *.so 282 | *.dylib 283 | *.mA_bin 284 | *.dat 285 | *.avi 286 | *.ogv 287 | *.asv 288 | *.code 289 | /tests/python_package_test/.pytest_cache/v/cache 290 | /tests/python_package_test/categorical.model 291 | /python-package/geo_test.py 292 | *.csv 293 | /python-package/.pytest_cache/v/cache/lastfailed 294 | /python-package/.pytest_cache/v/cache/nodeids 295 | /python-package/case_qq2019.py 296 | *.txt 297 | *.jpg 298 | /src/learn/discpy.py 299 | /src/learn/sparsipy.py 300 | /doc/Gradient boosting on adpative distrubutions.docx 301 | /python-package/litemort/桌面.lnk 302 | /python-package/LiteMORT_hyppo.py 303 | /python-package/shap_test.py 304 | *.pickle 305 | *.gz 306 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_hyppo.py: -------------------------------------------------------------------------------- 1 | #hyperparameter optimization 2 | from bayes_opt import BayesianOptimization 3 | from sklearn.model_selection import KFold, train_test_split 4 | from sklearn.metrics import mean_squared_error 5 | from litemort import * 6 | import numpy as np 7 | 8 | def hyparam_search(func_core,pds,n_init=5, n_iter=12): 9 | optimizer = BayesianOptimization(func_core, pds, random_state=7) 10 | optimizer.maximize(init_points=n_init, n_iter=n_iter) 11 |
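# maximize() first evaluates n_init randomly sampled points, then runs n_iter
# steps of Bayesian optimization; the best result found so far is kept in
# optimizer.max as a dict of the form {'target': ..., 'params': ...}.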
print(optimizer.max) 12 | for i, res in enumerate(optimizer.res): 13 | print("Iteration {}: \n\t{}".format(i, res)) 14 | input(f"......BayesianOptimization is OK......") 15 | return optimizer.max 16 | 17 | def _feat_select_core_(**kwargs): 18 | print(kwargs) 19 | feats=[] 20 | nFeat = len(kwargs) 21 | no=0 22 | feat_factor = np.zeros(nFeat) 23 | for k,v in kwargs.items(): 24 | feats.append(k) 25 | feat_factor[no]=v; no=no+1 26 | train_tmp = train_data[feats] 27 | print('Training with {} features'.format(train_tmp.shape[1])) 28 | x_train, x_val, y_train, y_val = train_test_split(train_tmp, train_target, test_size = 0.2, random_state = 42) 29 | feat_factor=feat_factor.astype(np.float32) 30 | param_mort["feat_factor"]=feat_factor 31 | mort = LiteMORT(param_mort).fit(x_train, y_train,eval_set=[(x_val, y_val)]) 32 | eval_pred = mort.predict(x_val) 33 | score = np.sqrt(mean_squared_error(eval_pred, y_val)) 34 | return -score 35 | 36 | #10/20/2019 In practice the results were poor; BayesianOptimization is not well suited to very large numbers of parameters 37 | def MORT_feat_bayesian_search(train,target,feat_fix,feat_select,n_init=5, n_iter=12): 38 | global train_data,train_target 39 | train_data,train_target = train,target 40 | print(f"train={train_data.shape} target={train_target.shape}") 41 | feat_useful=[] 42 | pds = {} 43 | for feat in feat_fix: 44 | pds[feat]=(1, 1) 45 | for feat in feat_select: 46 | pds[feat]=(0, 1) 47 | optimizer = BayesianOptimization(_feat_select_core_, pds, random_state=42) 48 | optimizer.maximize(init_points=n_init, n_iter=n_iter) 49 | print(optimizer.max) 50 | for i, res in enumerate(optimizer.res): 51 | print("Iteration {}: \n\t{}".format(i, res)) 52 | input(f"......BayesianOptimization is OK......") 53 | return feat_useful 54 | 55 | def MORT_feat_select_(dataX,dataY,feat_fix,feat_select,select_params,nMostSelect=10): 56 | nFeat=len(feat_fix)+len(feat_select) 57 | feats = [] 58 | no = 0 59 | feat_factor = np.zeros(nFeat) 60 | for feat in feat_fix: 61 | feats.append(feat) 62 | feat_factor[no] = 1 63 | no = no + 1 64 | for feat in feat_select: 65 | feats.append(feat) 66 | feat_factor[no] = 0 67 | no = no + 1 68 | data_tmp = dataX[feats] 69 | 70 | #select_params['learning_rate'] = select_params['learning_rate']*2 71 | #select_params['early_stopping_rounds'] = 100 72 | #select_params['verbose'] = 0 73 | 74 | print(f"======MORT_feat_select_ nFix={len(feat_fix)} nSelect={len(feat_select)} ") 75 | feat_useful_ = [] 76 | if True: 77 | for loop in range(nMostSelect): 78 | select_params["feat_factor"] = feat_factor 79 | if('split_idxs' in select_params): 80 | assert(len(select_params['split_idxs'])>0) 81 | tr_idx, val_idx=select_params['split_idxs'][0] 82 | y_train = dataY[tr_idx] 83 | x_train =data_tmp.iloc[tr_idx, :] 84 | x_val, y_val = data_tmp.iloc[val_idx, :], dataY[val_idx] 85 | else: 86 | x_train, x_val, y_train, y_val = train_test_split(data_tmp, dataY, test_size=0.2, random_state=42) 87 | cat_features = select_params['category_features'] if 'category_features' in select_params else None 88 | mort = LiteMORT(select_params).fit(x_train, y_train, eval_set=[(x_val, y_val)], categorical_feature=cat_features) 89 | feat_factor_1 = mort.params.feat_factor 90 | rank = np.argsort(feat_factor_1)[::-1] 91 | nAdd=0 92 | for no in rank: 93 | if feat_factor[no]==1: 94 | continue 95 | if feat_factor_1[no] > 0: 96 | feat_useful_.append(feats[no]); nAdd=nAdd+1 97 | print(f"___MORT_feat_select___@{loop}:\t{feats[no]}={feat_factor_1[no]:.5g}" ) 98 | feat_factor[no]=1 99 | if nAdd==0: 100 | print(f"___MORT_feat_select___@{loop} break out") 101 |
print(f"___MORT_feat_select___@{loop} feat_useful_={feat_useful_}") 102 | input(f"......MORT_feat_select_ is OK......") 103 | 104 | else: #original forward feature selection 105 | x_train, x_val, y_train, y_val = train_test_split(dataX[feat_fix], dataY, test_size = 0.2, random_state = 42) 106 | mort = LiteMORT(param_mort).fit(x_train, y_train, eval_set=[(x_val, y_val)]) 107 | predictions = mort.predict(x_val) 108 | rmse_score = np.sqrt(mean_squared_error(y_val, predictions)) 109 | print("RMSE baseline val score: ", rmse_score) 110 | best_score = rmse_score 111 | train_columns = list(dataX.columns[13:]) 112 | for num, i in enumerate(train_columns): 113 | train_tmp = dataX[feat_fix + feat_useful_ + [i]] 114 | x_train, x_val, y_train, y_val = train_test_split(train_tmp, dataY, test_size=0.2, random_state=42) 115 | mort = LiteMORT(param_mort).fit(x_train, y_train, eval_set=[(x_val, y_val)]) 116 | predictions = mort.predict(x_val) 117 | rmse_score = np.sqrt(mean_squared_error(y_val, predictions)) 118 | percent = (best_score-rmse_score) / best_score*100.0; 119 | if rmse_score < best_score: 120 | print(f'------ \"{i}\" is usefull {percent:.3g}% [{best_score:.7g}=>{rmse_score:.7g}]------') 121 | best_score = rmse_score 122 | feat_useful_.append(i) 123 | else: 124 | pass #rint('Column {} is not usefull'.format(i)) 125 | print(feat_useful_) 126 | return feat_useful_ -------------------------------------------------------------------------------- /python-package/case_higgs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | https://archive.ics.uci.edu/ml/datasets/HIGGS 3 | This is a classification problem to distinguish between a signal process which produces Higgs bosons and a background process which does not. 4 | The data has been produced using Monte Carlo simulations. The first 21 features (columns 2-22) are kinematic properties measured by the particle detectors in the accelerator. The last seven features are functions of the first 21 features; these are high-level features derived by physicists to help discriminate between the two classes. There is an interest in using deep learning methods to obviate the need for physicists to manually develop such features. Benchmark results using Bayesian Decision Trees from a standard physics package and 5-layer neural networks are presented in the original paper. The last 500,000 examples are used as a test set. 5 | 6 | https://github.com/Laurae2/boosting_tree_benchmarks/tree/master/data 7 | https://github.com/guolinke/boosting_tree_benchmarks/tree/master/data 8 | https://blog.bigml.com/2017/09/28/case-study-finding-higgs-bosons-with-deepnets/ 9 | 10 | 5/19/2019 需要确定是regression 或 binary classification 11 | 8/23/2019 subsample subfeature 似乎都没用(2000000测试) 12 | lesome_rows=2000000 iter=2000 auc=0.83775(1,1) auc=0.83847(0.8,1);auc=0.83618(0.8,0.5) 13 | 14 | ''' 15 | import lightgbm as lgb 16 | import time 17 | import sys 18 | import os 19 | import gc 20 | import pandas as pd 21 | import numpy as np 22 | from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold 23 | import pickle 24 | from litemort import * 25 | #from LiteMORT_EDA import * 26 | 27 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 28 | #isMORT = True 29 | model_type = 'mort' if isMORT else 'lgb' 30 | #some_rows= 200000 31 | #some_rows= 2000000 32 | some_rows= 10500000 33 | nTotal = 11000000 34 | nLastForTest = 500000 #The last 500,000 examples are used as a test set. 
35 | 36 | #some_rows=None 37 | 38 | def read_higgs_data(path): 39 | pkl_path = 'F:/Datasets/HIGGS_/higgs_data_{}.pickle'.format(some_rows) 40 | if os.path.isfile(pkl_path): 41 | print("====== Load pickle @{} ......".format(pkl_path)) 42 | with open(pkl_path, "rb") as fp: 43 | [X, y, X_test,y_test] = pickle.load(fp) 44 | else: 45 | assert(some_rows<=nTotal-nLastForTest) 46 | print("====== Read last {} examples as training set ......".format(some_rows)) 47 | df = pd.read_csv(path, nrows=some_rows,header=None) 48 | y=pd.Series(df.iloc[:,0]) 49 | X=df.iloc[:,1:] 50 | print("====== Read last {} examples as testing set ......".format(nLastForTest)) 51 | df = pd.read_csv(path, skiprows = nTotal-nLastForTest,nrows=nLastForTest, header=None) 52 | y_test = pd.Series(df.iloc[:, 0]) 53 | X_test = df.iloc[:,1:] 54 | del df 55 | gc.collect() 56 | print("====== Save pickle @{} ......".format(pkl_path)) 57 | with open(pkl_path, "wb") as fp: # Pickling 58 | pickle.dump([X, y, X_test,y_test], fp) 59 | print("====== read_higgs_data X={}, y={}, X_test={} ...... OK".format(X.shape, y.shape, X_test.shape)) 60 | return X,y,X_test 61 | 62 | X,y,X_test = read_higgs_data("F:/Datasets/HIGGS_/HIGGS.csv") 63 | #X = Unique_Expand(X) 64 | #X_test = Unique_Expand(X_test) 65 | num_rounds = 10001 66 | params = { 67 | "objective": "binary", 68 | "metric": "auc", #"binary_logloss" 69 | "adaptive":'weight', 70 | 'max_bin': 256, 71 | 'num_leaves': 64, 72 | 'learning_rate': 0.1, 73 | 'tree_learner': 'serial', 74 | 'task': 'train', 75 | 'is_training_metric': 'false', 76 | 'min_data_in_leaf': 512, 77 | #'min_sum_hessian_in_leaf': 100, 78 | #'bagging_fraction': 1,#0.2, 79 | 'subsample': 1, 'bagging_freq': 1, 80 | 'feature_fraction': 1, 81 | #'ndcg_eval_at': [1, 3, 5, 10], 82 | #'sparse_threshold': 1.0, 83 | 'n_estimators':num_rounds, 84 | 'early_stopping_rounds': 500, 85 | 'verbose':667, 86 | #'device': 'cpu' 87 | #'device': 'gpu', 88 | #'gpu_platform_id': 0, 89 | #'gpu_device_id': 0 90 | } 91 | n_fold = 5 92 | folds = KFold(n_splits=n_fold, shuffle=True, random_state=11) 93 | for fold_n, (train_index, valid_index) in enumerate(folds.split(X)): 94 | t0 = time.time() 95 | 96 | if type(X) == np.ndarray: 97 | X_train, X_valid = X[train_index], X[valid_index] 98 | y_train, y_valid = y[train_index], y[valid_index] 99 | else: 100 | X_train, X_valid = X.iloc[train_index], X.iloc[valid_index] 101 | y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] 102 | if False: 103 | mean = y_train.mean(); 104 | d_train = pd.concat([y_train, X_train], ignore_index=True, axis=1) 105 | print("X_train={}, y_train={} d_train={}".format(X_train.shape, y_train.shape, d_train.shape)) 106 | np.savetxt("D:/LightGBM-master/examples/regression/geo_test.csv", d_train, delimiter='\t') 107 | 108 | if model_type == 'mort': 109 | model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)]) 110 | #y_pred_valid = model.predict(X_valid) 111 | #y_pred = model.predict(X_test) 112 | 113 | if model_type == 'lgb': 114 | model = lgb.LGBMRegressor(**params, n_jobs=-1) 115 | model.fit(X_train, y_train, 116 | eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',verbose=5) 117 | model.booster_.save_model('geo_test_.model') 118 | #y_pred_valid = model.predict(X_valid) 119 | #y_pred = model.predict(X_test, num_iteration=model.best_iteration_) 120 | break 121 | 122 | input("loss is {} time={:.3g} model={}...".format(0,time.time()-t0,model_type)) 123 | sys.exit(-1) 124 | 125 | t0 = time.time() 126 | gbm = lgb.train(params, train_set=dtrain, 
num_boost_round=10, 127 | valid_sets=None, valid_names=None, 128 | fobj=None, feval=None, init_model=None, 129 | feature_name='auto', categorical_feature='auto', 130 | early_stopping_rounds=None, evals_result=None, 131 | verbose_eval=True, 132 | keep_training_booster=False, callbacks=None) 133 | t1 = time.time() 134 | 135 | print('cpu version elapse time: {}'.format(t1 - t0)) 136 | -------------------------------------------------------------------------------- /src/util/Float16.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | //extern short FloatToFloat16(float value); 5 | //extern float Float16ToFloat(short value); 6 | 7 | class Float16 8 | { 9 | protected: 10 | short mValue; 11 | 12 | short FloatToFloat16(float value) { 13 | short fltInt16; 14 | int fltInt32; 15 | memcpy(&fltInt32, &value, sizeof(float)); 16 | fltInt16 = ((fltInt32 & 0x7fffffff) >> 13) - (0x38000000 >> 13); 17 | fltInt16 |= ((fltInt32 & 0x80000000) >> 16); 18 | 19 | return fltInt16; 20 | } 21 | 22 | float Float16ToFloat(short fltInt16) const { 23 | int fltInt32 = ((fltInt16 & 0x8000) << 16); 24 | fltInt32 |= ((fltInt16 & 0x7fff) << 13) + 0x38000000; 25 | 26 | float fRet; 27 | memcpy(&fRet, &fltInt32, sizeof(float)); 28 | return fRet; 29 | } 30 | 31 | public: 32 | Float16(); 33 | Float16(float value); 34 | Float16(const Float16& value); 35 | 36 | operator float(); 37 | operator float() const; 38 | 39 | friend Float16 operator + (const Float16& val1, const Float16& val2); 40 | friend Float16 operator - (const Float16& val1, const Float16& val2); 41 | friend Float16 operator * (const Float16& val1, const Float16& val2); 42 | friend Float16 operator / (const Float16& val1, const Float16& val2); 43 | 44 | Float16& operator =(const Float16& val); 45 | Float16& operator +=(const Float16& val); 46 | Float16& operator -=(const Float16& val); 47 | Float16& operator *=(const Float16& val); 48 | Float16& operator /=(const Float16& val); 49 | Float16& operator -(); 50 | 51 | //https://codedocs.xyz/HipsterSloth/PSMoveService/__detail_8hpp_source.html 52 | union uif32 { 53 | float f; 54 | unsigned int i; 55 | }; 56 | 57 | //https://github.com/g-truc/glm/blob/0.9.5/glm/detail/type_half.inl 58 | static float GLM_toFloat32(const short& value, int flag = 0x0) { 59 | int s = (value >> 15) & 0x00000001; 60 | int e = (value >> 10) & 0x0000001f; 61 | int m = value & 0x000003ff; 62 | uif32 result; 63 | if (e == 0) { 64 | if (m == 0) { 65 | result.i = (unsigned int)(s << 31); 66 | return result.f; 67 | } 68 | else { 69 | // 70 | // Denormalized number -- renormalize it 71 | // 72 | while (!(m & 0x00000400)) { 73 | m <<= 1; 74 | e -= 1; 75 | } 76 | e += 1; 77 | m &= ~0x00000400; 78 | } 79 | } 80 | else if (e == 31) { 81 | if (m == 0) { 82 | // 83 | // Positive or negative infinity 84 | // 85 | result.i = (unsigned int)((s << 31) | 0x7f800000); 86 | return result.f; 87 | } else { 88 | // 89 | // Nan -- preserve sign and significand bits 90 | // 91 | uif32 result; 92 | result.i = (unsigned int)((s << 31) | 0x7f800000 | (m << 13)); 93 | return result.f; 94 | } 95 | } 96 | 97 | e = e + (127 - 15); 98 | m = m << 13; 99 | uif32 Result; 100 | Result.i = (unsigned int)((s << 31) | (e << 23) | m); 101 | return Result.f; 102 | }; 103 | }; 104 | 105 | /*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/ 106 | 107 | inline Float16::Float16() 108 | { 109 | } 110 | 111 | 
/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::Float16(float value){
	mValue = FloatToFloat16(value);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::Float16(const Float16 &value){
	mValue = value.mValue;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::operator float()
{
	return Float16ToFloat(mValue);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::operator float() const
{
	return Float16ToFloat(mValue);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator =(const Float16& val)
{
	mValue = val.mValue;
	return *this;	// was missing: falling off the end of a non-void function is undefined behavior
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator +=(const Float16& val)
{
	*this = *this + val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator -=(const Float16& val)
{
	*this = *this - val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator *=(const Float16& val)
{
	*this = *this * val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator /=(const Float16& val)
{
	*this = *this / val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator -()
{
	// note: unusually for unary minus, this negates *this in place
	*this = Float16(-(float)*this);
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/
/*+----+                                   Friends                                      +----+*/
/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator + (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 + (float)val2);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator - (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 - (float)val2);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator * (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 * (float)val2);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator / (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 / (float)val2);
}

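/*
 * Editor's note (added): the FloatToFloat16/Float16ToFloat pair above is the
 * quick bit-shift round trip only; unlike GLM_toFloat32 it does not special-case
 * denormals, infinities, or NaN, so it is reliable only for normal-range values.
 */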
-------------------------------------------------------------------------------- /python-package/case_future_sales.py: --------------------------------------------------------------------------------
#https://www.kaggle.com/dhimananubhav/feature-engineering-xgboost

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
import time
import sys
import gc
import pickle
import random
from litemort import *

from bayes_opt import BayesianOptimization


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

isMORT = len(sys.argv) > 1 and sys.argv[1] == "mort"
isMORT = True  # hard-coded override of the CLI flag
alg = 'MORT' if isMORT else 'LGB'
#some_rows = 5000
some_rows = None
#data_root = '../input/'
data_root = "F:/Datasets/future_sales"

test = pd.read_csv(f'{data_root}/test.csv').set_index('ID')
data = pd.read_pickle(f'{data_root}/data.pkl')
if some_rows is not None:
    nMost = data.shape[0]
    random.seed(42)
    subset = random.sample(range(nMost), some_rows)
    data = data.iloc[subset, :].reset_index(drop=True)
    print('====== Some Samples ... data={}'.format(data.shape))

data = data[[
    'date_block_num',
    'shop_id',
    'item_id',
    'item_cnt_month',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    #'date_shop_type_avg_item_cnt_lag_1',
    #'date_shop_subtype_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    #'date_type_avg_item_cnt_lag_1',
    #'date_subtype_avg_item_cnt_lag_1',
    'delta_price_lag',
    'month',
    'days',
    'item_shop_last_sale',
    'item_last_sale',
    'item_shop_first_sale',
    'item_first_sale',
]]

X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)
print(f"X_train={X_train.shape} Y_train={Y_train.shape}")
print(f"X_valid={X_valid.shape} Y_valid={Y_valid.shape}")
print(f"X_test={X_test.shape} ")
del data
gc.collect()

params = {'num_leaves': 550, 'n_estimators': 1000, 'early_stopping_rounds': 20,
          'feature_fraction': 1, 'bagging_fraction': 1,
          'max_bin': 512,
          "adaptive": 'weight1',  # note: this setting had no effect, frustratingly
          #"learning_schedule":"adaptive",
          'max_depth':
10, 106 | 'min_child_weight': 300, #'min_data_in_leaf': 300, 107 | 'learning_rate': 0.1, 108 | 'objective': 'regression', 109 | 'boosting_type': 'gbdt', 110 | 'verbose': 1, 111 | 'metric': {'rmse'} 112 | } 113 | 114 | 115 | def hyparam_core(num_leaves, feature_fraction, bagging_fraction, max_depth, learning_rate, min_data_in_leaf,max_bin): 116 | param_1 = params 117 | param_1['verbose']=0 118 | param_1["num_leaves"] = int(round(num_leaves)) 119 | param_1['feature_fraction'] = max(min(feature_fraction, 1), 0) 120 | param_1['bagging_fraction'] = max(min(bagging_fraction, 1), 0) 121 | param_1['max_depth'] = int(round(max_depth)) 122 | param_1['learning_rate'] = learning_rate 123 | param_1['min_data_in_leaf'] = int(round(min_data_in_leaf)) 124 | param_1['max_bin'] = int(round(max_bin)) 125 | 126 | model = LiteMORT(param_1).fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)]) 127 | Y_pred = model.predict(X_valid).clip(0, 20) 128 | score = np.sqrt(mean_squared_error(Y_pred, Y_valid)) 129 | return -score 130 | 131 | if isMORT: 132 | if False: #BayesianOptimization 133 | pds = {'num_leaves': (547, 547), 134 | 'feature_fraction': (1, 1), 135 | 'bagging_fraction': (1, 1), 136 | 'max_depth': (10,10), 137 | 'learning_rate': (0.1, 0.1), 138 | 'min_data_in_leaf': (20, 20), 139 | 'max_bin': (128, 1024), 140 | } 141 | 142 | optimizer = BayesianOptimization(hyparam_core, pds, random_state=7) 143 | optimizer.maximize(init_points=5, n_iter=12) 144 | print(optimizer.max) 145 | for i, res in enumerate(optimizer.res): 146 | print("Iteration {}: \n\t{}".format(i, res)) 147 | input(f"......BayesianOptimization is OK......") 148 | 149 | model = LiteMORT(params).fit(X_train,Y_train,eval_set=[(X_valid, Y_valid)]) 150 | else: 151 | model = XGBRegressor( 152 | max_depth=8, 153 | n_estimators=1000, 154 | min_child_weight=300, 155 | colsample_bytree=0.8, 156 | subsample=0.8, 157 | eta=0.3, 158 | seed=42) 159 | 160 | model.fit( 161 | X_train, 162 | Y_train, 163 | eval_metric="rmse", 164 | eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 165 | verbose=True, 166 | early_stopping_rounds = 10) 167 | alg = 'xgboost' 168 | 169 | Y_pred = model.predict(X_valid).clip(0, 20) 170 | score = np.sqrt(mean_squared_error(Y_pred, Y_valid)) 171 | Y_test = model.predict(X_test).clip(0, 20) 172 | 173 | if not isMORT: 174 | plot_features(model, (10, 14)) 175 | 176 | path="" 177 | if some_rows is None: 178 | submission = pd.DataFrame({ 179 | "ID": test.index, 180 | "item_cnt_month": Y_test 181 | }) 182 | path = f'{data_root}/{alg}_[{score:.5g}].csv' 183 | submission.to_csv(path, index=False) 184 | 185 | # save predictions for an ensemble 186 | #pickle.dump(Y_pred, open(f'{data_root}/xgb_train.pickle', 'wb')) 187 | #pickle.dump(Y_test, open(f'{data_root}/xgb_test.pickle', 'wb')) 188 | input(f"......Save submit @{path}......") 189 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_ERA.py: -------------------------------------------------------------------------------- 1 | #Exploratory result analysis 2 | import math 3 | import seaborn as sns; sns.set(style="ticks", color_codes=True) 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | import numpy as np 7 | import random 8 | import gc 9 | import time 10 | from sklearn.metrics import roc_auc_score 11 | from scipy.stats import rankdata 12 | 13 | def auc_u_test(vec, len_A, len_B): 14 | rank_value = rankdata(vec) 15 | rank_sum = sum(rank_value[0:len_A]) 16 | u_value = rank_sum - (len_A*(len_A+1))/2 17 | auc = u_value / 
(len_A * len_B) 18 | if auc < 0.50: 19 | auc = 1.0 - auc 20 | return(auc) 21 | 22 | # from https://gist.github.com/mattsgithub/dedaa017adc1f30d9833175a5c783221 23 | def roc_auc_alternative(y_true, y_score): 24 | # Total number of observations 25 | N = y_true.shape[0] 26 | I = np.arange(1, N + 1) 27 | N_pos = np.sum(y_true) 28 | N_neg = N - N_pos 29 | I = y_score.argsort()[::-1][:N] 30 | y_pred = y_true[I] 31 | I = np.arange(1, N + 1) 32 | return 1. + ((N_pos + 1.) / (2 * N_neg)) - (1. / (N_pos * N_neg)) * I.dot(y_pred) 33 | 34 | def Robert_M_Johnson_test( ): 35 | np.random.seed(42) 36 | N = np.arange(start=20, stop=1000000, step=10000) 37 | 38 | t_sklearn = [] 39 | t_dot = [] 40 | for n in N: 41 | N_pos = np.random.randint(low=1, high=n + 1) 42 | y_true = np.concatenate((np.ones(N_pos), np.zeros(n - N_pos))) 43 | random.shuffle(y_true) 44 | y_true = np.array([0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.]) 45 | y_score = np.random.random(size=n) 46 | 47 | # Timeit 48 | t0 = time.time() 49 | y1 = roc_auc_score(y_true=y_true, y_score=y_score) 50 | t1 = time.time() 51 | t_sklearn.append(t1 - t0) 52 | 53 | # Timeit 54 | t0 = time.time() 55 | y2 = roc_auc_alternative(y_true=y_true, y_score=y_score) 56 | t1 = time.time() 57 | t_dot.append(t1 - t0) 58 | 59 | # Proves their equality 60 | # Raises error if not almost equal (up to 14 decimal places) 61 | np.testing.assert_almost_equal(y1, y2, decimal=14) 62 | 63 | class Feature_Importance(object): 64 | def __init__(self, columns): 65 | self.columns = columns 66 | self.df = pd.DataFrame() 67 | 68 | def OnFold(self,fold,f_importance): 69 | fold_importance_df = pd.DataFrame() 70 | fold_importance_df["Feature"] = self.columns 71 | fold_importance_df["importance"] = f_importance 72 | fold_importance_df["fold"] = fold 73 | self.df = pd.concat([self.df, fold_importance_df], axis=0) 74 | 75 | def SomePlot(self): 76 | cols = (self.df[["Feature", "importance"]].groupby("Feature").mean() 77 | .sort_values(by="importance", ascending=False)[:32].index) 78 | best_features = self.df.loc[self.df.Feature.isin(cols)] 79 | plt.figure(figsize=(14, 25)) 80 | sns.barplot(x="importance", y="Feature", 81 | data=best_features.sort_values(by="importance", ascending=False)) 82 | plt.title('LightGBM Features (avg over folds)') 83 | plt.tight_layout() 84 | plt.savefig('lgbm_importances.png') 85 | plt.show(block=True) 86 | 87 | def ERA_pair(features,train,test_0,predict_0, target_0): 88 | predict = pd.Series(predict_0) 89 | target = target_0.reset_index(drop=True) 90 | err = (predict-target).abs() 91 | a0,a1=err.min(),err.max() 92 | thrsh = a0+(a1-a0)/10 93 | idx_1 = err.index[err > thrsh] 94 | idx_2 = err.index[err <= thrsh] 95 | test = test_0.reset_index(drop=True) 96 | df1 = test[test.index.isin(idx_1)][features] 97 | df2 = test[test.index.isin(idx_2)][features] 98 | df1.fillna(0.0, inplace=True) 99 | df2.fillna(0.0, inplace=True) 100 | g = sns.pairplot(df1) 101 | g.fig.suptitle("df={} err=[{:.3g}-{:.3g}]".format(df1.shape,thrsh,a1)) 102 | g = sns.pairplot(df2) 103 | g.fig.suptitle("df={} err=[{:.3g}-{:.3g}]".format(df2.shape, a0,thrsh)) 104 | plt.show() 105 | plt.show(block=True) 106 | #plt.close() 107 | del df1,df2 108 | gc.collect() 109 | return 110 | 111 | # 'h_mean','log_mean'必须不是0 112 | def df_mix_(df,cols,alg='exp_mean'): 113 | mix_lg, mix_hm = 0, 0 114 | if alg=='log_mean': 115 | gc.collect() 116 | elif alg == 'exp_mean': 117 | for col in cols: 118 | mix_hm += np.exp(df[col]) 119 | mix_hm = np.log(mix_hm) 120 | df['mix'] = mix_hm 121 | 
        gc.collect()
    elif alg == 'h_mean':
        for col in cols:
            mix_hm += 1 / df[col]
        mix_hm = 1 / mix_hm
        df['mix'] = mix_hm
        gc.collect()
    else:
        df['mix'] = df[cols].max(axis=1)
    return df['mix']

def cys_mix_ID_TRAGET_(ID, TARGET, path, files, alg='h_mean'):
    #path='H:/Project/fraud_click/bagging/'
    #files = [path+'{{{[H]_7_0.05.txt}}}_cys_.csv',path+'{{{[H]_8_eta.txt}}}_cys_.csv',path+'{{{[H]_9_eta.txt}}}_cys_.csv']
    mix_lg, mix_hm = 0, 0
    #alg='log_mean'       # works very well, surprisingly
    #alg='h_mean'         # harmonic mean
    # alg='max_out'       #
    # alg='log_rank_mean' # worth trying
    out = '{}[{}]_BAG{}.csv'.format(path, alg, len(files))
    df = pd.DataFrame()
    cols = []
    if alg == 'log_mean':
        for idx, fp in enumerate(files):
            print('====== Load {}...'.format(fp))
            tmp = pd.read_csv(fp, nrows=10000)  # , nrows=10000
            mix_lg += np.log(tmp.TARGET)
            df[ID] = tmp[ID]
        mix_lg = np.exp(mix_lg / len(files))
        df[TARGET] = mix_lg
        del tmp
        gc.collect()
    elif alg == 'h_mean':
        for idx, fp in enumerate(files):
            print('====== Load {}...'.format(fp))
            tmp = pd.read_csv(fp)  # , nrows=10000
            mix_hm += 1 / (tmp.TARGET)
            df[ID] = tmp[ID]
        mix_hm = 1 / mix_hm
        df[TARGET] = mix_hm
        del tmp
        gc.collect()
    else:
        df = pd.DataFrame()
        cols = []
        for idx, fp in enumerate(files):
            print('====== Load {}...'.format(fp))
            tmp = pd.read_csv(fp)  # , nrows=10000
            title = 'att_{}'.format(idx)
            cols.append(title)
            df[title] = tmp[TARGET]
        df[ID] = tmp[ID]
        df[TARGET] = df[cols].max(axis=1)
        out = path + '{{{' + 'maxout' + '}}}_bag.csv'
    nN = df.isnull().sum().sum()
    print('======{} out={} shape={},NAN={} ...\n{}'.format(alg, out, df.shape, nN, df.head()))
    df[[ID, TARGET]].to_csv(out, index=False, float_format='%.8f')
    print( '======{} ... 
OK\n'.format(alg,out,df.shape,nN) ) 179 | 180 | if __name__=='__main__': 181 | Robert_M_Johnson_test() -------------------------------------------------------------------------------- /vs/LiteMORT/LiteMORT.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | {e17e50fe-dad5-49cb-9b89-e0f946d8426b} 18 | 19 | 20 | {163fefa2-bee8-49cb-8d14-10a13710ebf3} 21 | 22 | 23 | {4ccc43f1-1162-462e-b590-9dc7aa994b7a} 24 | 25 | 26 | {fd9e0871-c601-475d-b095-e87773b68644} 27 | 28 | 29 | {c2f85e65-ddf3-484e-82ce-966e3214f827} 30 | 31 | 32 | {52705ab1-350d-4f26-a4db-89374027eba6} 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 源文件 41 | 42 | 43 | 源文件\data_fold 44 | 45 | 46 | 源文件\data_fold 47 | 48 | 49 | 源文件\data_fold 50 | 51 | 52 | 源文件\learn 53 | 54 | 55 | 源文件\learn 56 | 57 | 58 | 源文件\tree 59 | 60 | 61 | 源文件\tree 62 | 63 | 64 | 源文件\tree 65 | 66 | 67 | 源文件\tree 68 | 69 | 70 | 源文件\python 71 | 72 | 73 | 源文件\EDA 74 | 75 | 76 | 源文件\EDA 77 | 78 | 79 | 源文件\util 80 | 81 | 82 | 源文件\util 83 | 84 | 85 | 源文件\learn 86 | 87 | 88 | 源文件\util 89 | 90 | 91 | 源文件\data_fold 92 | 93 | 94 | 95 | 96 | 源文件\data_fold 97 | 98 | 99 | 源文件\data_fold 100 | 101 | 102 | 源文件\data_fold 103 | 104 | 105 | 源文件\data_fold 106 | 107 | 108 | 源文件\data_fold 109 | 110 | 111 | 源文件\data_fold 112 | 113 | 114 | 源文件\data_fold 115 | 116 | 117 | 源文件\data_fold 118 | 119 | 120 | 源文件\data_fold 121 | 122 | 123 | 源文件\data_fold 124 | 125 | 126 | 源文件\learn 127 | 128 | 129 | 源文件\tree 130 | 131 | 132 | 源文件\tree 133 | 134 | 135 | 源文件\tree 136 | 137 | 138 | 源文件\tree 139 | 140 | 141 | 源文件\python 142 | 143 | 144 | 源文件\util 145 | 146 | 147 | 源文件\util 148 | 149 | 150 | 源文件\util 151 | 152 | 153 | 源文件\util 154 | 155 | 156 | 源文件\util 157 | 158 | 159 | 头文件 160 | 161 | 162 | 源文件\EDA 163 | 164 | 165 | 源文件\EDA 166 | 167 | 168 | 源文件\util 169 | 170 | 171 | 源文件\learn 172 | 173 | 174 | 源文件\util 175 | 176 | 177 | 源文件\data_fold 178 | 179 | 180 | 源文件\util 181 | 182 | 183 | 源文件\data_fold 184 | 185 | 186 | 源文件\data_fold 187 | 188 | 189 | -------------------------------------------------------------------------------- /python-package/case_earthquake.py: -------------------------------------------------------------------------------- 1 | # https://www.kaggle.com/tocha4/lanl-master-s-approach 2 | 3 | import numpy as np # linear algebra 4 | import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) 5 | import scipy as sc 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import gc 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | warnings.simplefilter(action='ignore', category=FutureWarning) 12 | from tqdm import tqdm_notebook 13 | import datetime 14 | import time 15 | import random 16 | from joblib import Parallel, delayed 17 | 18 | 19 | import lightgbm as lgb 20 | from tensorflow import keras 21 | from gplearn.genetic import SymbolicRegressor 22 | #from catboost import Pool, CatBoostRegressor 23 | from litemort import * 24 | from sklearn.pipeline import Pipeline 25 | from sklearn.preprocessing import StandardScaler 26 | from sklearn.metrics import mean_absolute_error,mean_squared_error 27 | from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV 28 | from sklearn.feature_selection import RFECV, SelectFromModel 29 | import os 30 | import sys 31 | import pickle 32 | from sklearn.linear_model import LinearRegression, Ridge 33 | from sklearn.tree import DecisionTreeRegressor 34 | from sklearn.svm import NuSVR, SVR 35 | from sklearn.kernel_ridge import KernelRidge 36 | from sklearn.ensemble import AdaBoostRegressor 37 | from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 38 | 39 | today = datetime.date.today().strftime('%m%d') 40 | 41 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 42 | #isMORT = True 43 | #some_rows=3000 44 | some_rows=None 45 | model_type='mort' if isMORT else 'lgb' 46 | nVerbose = 500 47 | pkl_path = 'G:/kaggle/Earthquake/data/anton_2_{}.pickle'.format(some_rows) 48 | pkl_path = 'G:/kaggle/Earthquake/data/anton_cys0_{}.pickle'.format(some_rows) 49 | eval_metric='l1' 50 | min_error = mean_squared_error if eval_metric=='l1' else mean_absolute_error 51 | params = { 52 | 'n_estimators':50000, #减少n_estimators 并不能控制overfit 53 | 'early_stopping_rounds': 200, 54 | 'num_leaves': 256, #128 55 | #'max_bin': 64, 56 | 'min_data_in_leaf': 32, #79 57 | 'objective': 'tweedie', #'regression', 58 | 'max_depth': -1, 59 | 'learning_rate': 0.01, 60 | #"boosting": "gbdt", 61 | "bagging_freq": 5, 62 | "bagging_fraction": 1,#0.8126672064208567, #0.8126672064208567, 63 | "bagging_seed": 11, 64 | "metric": 'mae', 65 | "verbosity": nVerbose, 66 | #'reg_alpha': 0.1302650970728192, 67 | #'reg_lambda': 0.3603427518866501, 68 | 'colsample_bytree': 0.05 69 | } 70 | print("params=\n{}\n".format(params)) 71 | submission = pd.read_csv('G:/kaggle/Earthquake/input/sample_submission.csv') 72 | 73 | def Load_MoreDatas(paths): 74 | train_s=[] 75 | y_s=[] 76 | for path,nFile in paths: 77 | for i in range(nFile): 78 | path_X,path_y="{}/train_X_features_{}.csv".format(path,i+1),"{}/train_y_{}.csv".format(path,i+1) 79 | X_ = pd.read_csv(path_X) 80 | y_ = pd.read_csv(path_y, index_col=False, header=None) 81 | train_s.append(X_) 82 | y_s.append(y_) 83 | print("X_[{}]@{}\ny_[{}]@{}".format(X_.shape,path_X,y_.shape,path_y)) 84 | if len(train_s)>0: 85 | train_X = pd.concat(train_s, axis=0) 86 | y = pd.concat(y_s, axis=0) 87 | train_X = train_X.reset_index(drop=True) 88 | y = y.reset_index(drop=True) 89 | print("Load_MoreDatas X_[{}] y_[{}]".format(train_X.shape, y.shape)) 90 | return train_X,y 91 | 92 | if os.path.isfile(pkl_path): 93 | print("\n======load pickle file from {} ...".format(pkl_path)) 94 | with open(pkl_path, "rb") as fp: # Pickling 95 | [train_X, test_X, train_y] = pickle.load(fp) 96 | if some_rows is not None: 97 | train_X = train_X[:some_rows] 98 | test_X = test_X[:some_rows] 99 | train_y = train_y[:some_rows] 
100 | print("\n======train_X={} test_X={} train_y={} \n".format(train_X.shape, test_X.shape, train_y.shape)) 101 | else: 102 | #train_X_2,y_2 = Load_MoreDatas([('G:/kaggle/Earthquake/data/cys/15000', 14), 103 | # ('G:/kaggle/Earthquake/data/cys/17000', 15)]) 104 | train_X_0 = pd.read_csv("G:/kaggle/Earthquake/data/train_X_features_865_0.csv") 105 | train_X_1 = pd.read_csv("G:/kaggle/Earthquake/data/train_X_features_865_1.csv") 106 | y_0 = pd.read_csv("G:/kaggle/Earthquake/data/train_y_0.csv", index_col=False, header=None) 107 | y_1 = pd.read_csv("G:/kaggle/Earthquake/data/train_y_1.csv", index_col=False, header=None) 108 | train_X = pd.concat([train_X_0, train_X_1], axis=0) 109 | y = pd.concat([y_0, y_1], axis=0) 110 | 111 | train_X = train_X.reset_index(drop=True) 112 | print(train_X.shape) 113 | print(train_X.head()) 114 | 115 | y = y.reset_index(drop=True) 116 | print(y[0].shape) 117 | train_y = pd.Series(y[0].values) 118 | test_X = pd.read_csv("G:/kaggle/Earthquake/data/test_X_features_10.csv") 119 | scaler = StandardScaler() 120 | train_columns = train_X.columns 121 | 122 | train_X[train_columns] = scaler.fit_transform(train_X[train_columns]) 123 | test_X[train_columns] = scaler.transform(test_X[train_columns]) 124 | with open(pkl_path, "wb") as fp: # Pickling 125 | pickle.dump([train_X, test_X, train_y], fp) 126 | print("Save pickle file at {} train_X={} test_X={} train_y={}".format(pkl_path,train_X.shape, test_X.shape, train_y.shape)) 127 | sys.exit(-2) 128 | 129 | train_columns = train_X.columns 130 | n_fold = 5 #n_fold=10 只是增加了过拟合,莫名其妙 131 | folds = KFold(n_splits=n_fold, shuffle=True, random_state=42) 132 | 133 | oof = np.zeros(len(train_X)) 134 | train_score = [] 135 | fold_idxs = [] 136 | # if PREDICTION: 137 | predictions = np.zeros(len(test_X)) 138 | 139 | feature_importance_df = pd.DataFrame() 140 | #run model 141 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X,train_y.values)): 142 | t0=time.time() 143 | strLog = "fold {}".format(fold_) 144 | print(strLog) 145 | fold_idxs.append(val_idx) 146 | fold_importance_df = pd.DataFrame() 147 | fold_importance_df["Feature"] = train_columns 148 | 149 | X_train, X_valid = train_X[train_columns].iloc[trn_idx], train_X[train_columns].iloc[val_idx] 150 | y_train, y_valid = train_y.iloc[trn_idx], train_y.iloc[val_idx] 151 | if model_type == 'mort': 152 | params['objective'] = 'regression' 153 | # model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)]) 154 | model = LiteMORT(params).fit_1(X_train, y_train, eval_set=[(X_valid, y_valid)]) 155 | if model_type == 'cat': 156 | model = CatBoostRegressor(n_estimators=25000, verbose=-1, objective="MAE", loss_function="MAE", boosting_type="Ordered", task_type="GPU") 157 | model.fit(X_tr, 158 | y_tr, 159 | eval_set=[(X_val, y_val)], 160 | # eval_metric='mae', 161 | verbose=2500, 162 | early_stopping_rounds=500) 163 | if model_type == 'lgb': 164 | model = lgb.LGBMRegressor(**params, n_jobs=-1)#n_estimators=50000, 165 | model.fit(X_train, y_train, 166 | eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae', 167 | verbose=nVerbose, early_stopping_rounds=200) # 168 | fold_importance_df["importance"] = model.feature_importances_[:len(train_columns)] 169 | fold_importance_df["fold"] = fold_ + 1 170 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 171 | 172 | oof[val_idx] = model.predict(X_valid) 173 | fold_score = mean_absolute_error(oof[val_idx], y_valid) 174 | print("{}\tscore={:.4g} 
time={:.4g}".format(strLog,fold_score,time.time()-t0)) 175 | 176 | #predictions 177 | predictions += model.predict(test_X[train_columns]) / folds.n_splits 178 | train_score.append(fold_score) 179 | 180 | cv_score = mean_absolute_error(train_y, oof) 181 | print(f"\n======After {n_fold} score = {cv_score:.3f}, CV_fold = {np.mean(train_score):.3f} | {np.std(train_score):.3f}", end=" ") 182 | 183 | 184 | 185 | 186 | submission["time_to_failure"] = predictions 187 | submission.to_csv(f'G:/kaggle/Earthquake/result/{model_type}_{today}_[{cv_score:.3f},{np.std(train_score):.3f}].csv', index=False) 188 | submission.head() -------------------------------------------------------------------------------- /tests/python_package_test/test_sklearn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import math 4 | import os 5 | import unittest 6 | 7 | from litemort import (LiteMORT,Mort_Preprocess) 8 | import lightgbm as lgb 9 | import numpy as np 10 | from sklearn.base import clone 11 | from sklearn import preprocessing 12 | from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, 13 | load_iris, load_svmlight_file) 14 | from sklearn.externals import joblib 15 | from sklearn.metrics import log_loss, mean_squared_error 16 | from sklearn.model_selection import GridSearchCV, train_test_split 17 | from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest, 18 | check_parameters_default_constructible) 19 | isMORT=True 20 | 21 | try: 22 | from sklearn.utils.estimator_checks import check_no_fit_attributes_set_in_init 23 | sklearn_at_least_019 = True 24 | except ImportError: 25 | sklearn_at_least_019 = False 26 | 27 | 28 | def multi_error(y_true, y_pred): 29 | return np.mean(y_true != y_pred) 30 | 31 | 32 | def multi_logloss(y_true, y_pred): 33 | return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) 34 | 35 | 36 | class TestSklearn(unittest.TestCase): 37 | 38 | def test_binary_breast(self): 39 | params = { 40 | "objective": "binary", "metric": "logloss",'early_stop': 5, 'num_boost_round': 50, 41 | "verbosity": 1, 42 | } 43 | X, y = load_breast_cancer(True) 44 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) 45 | if isMORT: 46 | mort = LiteMORT(params) 47 | mort.fit(X_train, y_train, eval_set=[(X_test, y_test)], params=params) 48 | result = mort.predict(X_test) 49 | ret = log_loss(y_test, mort.predict_proba(X_test)) 50 | else: 51 | gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) 52 | gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) 53 | result = gbm.predict(X_test) 54 | ret = log_loss(y_test, gbm.predict_proba(X_test)) 55 | self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1],places=5) 56 | self.assertLess(ret, 0.15) 57 | 58 | def ttest_binary_digits(self): 59 | from sklearn.datasets import load_digits 60 | from sklearn.model_selection import KFold 61 | rng = np.random.RandomState(1994) 62 | params = { 63 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50, 64 | "verbosity": 1, 65 | } 66 | digits = load_digits(2) 67 | y = digits['target'] 68 | X = digits['data'] 69 | kf = KFold(n_splits=2, shuffle=True, random_state=rng) 70 | for train_index, test_index in kf.split(X, y): 71 | #xgb_model = cls(random_state=42).fit(X[train_index], y[train_index]) 72 | #xgb_model.predict(X[test_index]) 73 | mort = 
LiteMORT(params).fit(X[train_index], y[train_index]) 74 | preds = mort.predict(X[test_index]) 75 | labels = y[test_index] 76 | err = sum(1 for i in range(len(preds)) 77 | if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) 78 | assert err < 0.1 79 | 80 | 81 | def ttest_regression(self): 82 | params = { 83 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 84 | } 85 | X, y = load_boston(True) 86 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) 87 | if isMORT: 88 | mort = LiteMORT(params) 89 | mort.fit(X_train, y_train, eval_set=[(X_test, y_test)], params=params) 90 | ret = mean_squared_error(y_test, mort.predict(X_test)) 91 | else: 92 | gbm = lgb.LGBMRegressor(n_estimators=50, silent=True) 93 | gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) 94 | ret = mean_squared_error(y_test, gbm.predict(X_test)) 95 | self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5) 96 | self.assertLess(ret, 16) 97 | 98 | def ttest_regression_boston_housing(self): 99 | rng = np.random.RandomState(1994) 100 | params = { 101 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 102 | } 103 | from sklearn.metrics import mean_squared_error 104 | from sklearn.datasets import load_boston 105 | from sklearn.model_selection import KFold 106 | params = { 107 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 108 | } 109 | boston = load_boston() 110 | y = boston['target'] 111 | X = boston['data'] 112 | kf = KFold(n_splits=2, shuffle=True, random_state=rng) 113 | for train_index, test_index in kf.split(X, y): 114 | #xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) 115 | mort = LiteMORT(params) 116 | mort.fit(X[train_index], y[train_index], params=params) 117 | preds = mort.predict(X[test_index]) 118 | labels = y[test_index] 119 | assert mean_squared_error(preds, labels) < 25 120 | 121 | 122 | #@unittest.skipIf(not litemort.combat.PANDAS_INSTALLED, 'pandas is not installed') 123 | def test_pandas_categorical(self): 124 | params = { #需要更详细的的测试 125 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50, 126 | "verbosity": 1, 127 | } 128 | import pandas as pd 129 | X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str 130 | "B": np.random.permutation([1, 2, 3] * 100), # int 131 | "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float 132 | "D": np.random.permutation([True, False] * 150)}) # bool 133 | y = np.random.permutation([0, 1] * 150) 134 | X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), 135 | "B": np.random.permutation([1, 3] * 30), 136 | "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), 137 | "D": np.random.permutation([True, False] * 30)}) 138 | if True: 139 | X, X_test = Mort_Preprocess.OrdinalEncode_(X,X_test) 140 | for col in ["A", "B", "C", "D"]: 141 | X[col] = X[col].astype('category') 142 | X_test[col] = X_test[col].astype('category') 143 | #trn_data = lgb.Dataset(X, label=y) 144 | 145 | if isMORT: 146 | mort0 = LiteMORT(params).fit(X, y) 147 | pred0 = list(mort0.predict(X_test)) 148 | mort1 = LiteMORT(params).fit(X, y, categorical_feature=[0]) 149 | pred1 = list(mort1.predict(X_test)) 150 | mort2 = LiteMORT(params).fit(X, y, categorical_feature=['A']) 151 | pred2 = list(mort2.predict(X_test)) 152 | mort3 = LiteMORT(params).fit(X, y, categorical_feature=['A', 'B', 
'C', 'D'])
            pred3 = list(mort3.predict(X_test))
        else:
            clf = lgb.sklearn.LGBMClassifier()
            gbm_ = clf.fit(X, y)
            gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
            pred0 = list(gbm0.predict(X_test))
            gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
            pred1 = list(gbm1.predict(X_test))
            gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
            pred2 = list(gbm2.predict(X_test))
            gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
            pred3 = list(gbm3.predict(X_test))
            gbm3.booster_.save_model('categorical.model')
            gbm4 = lgb.Booster(model_file='categorical.model')
            pred4 = list(gbm4.predict(X_test))
            pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
            np.testing.assert_almost_equal(pred_prob, pred4)
        input("...")
        #np.testing.assert_almost_equal(pred0, pred1)
        #np.testing.assert_almost_equal(pred0, pred2)
        #np.testing.assert_almost_equal(pred0, pred3)
-------------------------------------------------------------------------------- /python-package/lgb_kim.py: --------------------------------------------------------------------------------
# A good test case: X_train_0=(159999, 200) y_train=(159999,)......
# https://www.kaggle.com/chocozzz/santander-lightgbm-baseline-lb-0-899
# http://www.stat.ucdavis.edu/~chohsieh/teaching/STA141C_Spring2017/final_project_proposal.pdf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
import gc
import time
import sys
import datetime
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
from sklearn import metrics
from litemort import *

isMORT = len(sys.argv) > 1 and sys.argv[1] == "mort"
#isMORT = True
#some_rows=10000
some_rows = None
pick_samples = None
#pick_samples=1500
early_stop = 500  # 0.898 -> 0.899
max_bin = 256     # raised from 119 up toward 1280
lr = 0.02         # 0.01 improves slightly over 0.02: 0.89826 -> 0.89842
nFold, nLeaves = 5, 10
n_round = 35000
min_child = 32    # 11 (0.89830) -> 32 (0.89882)
#x_feat,x_sub=1,0.15
x_feat, x_sub = 0.15, 0.3
#x_feat,x_sub=1,1; nLeaves=2;n_round=5   # for quick testing only
#n_round,nLeaves=50,2

print('argv={}\nsome_rows={} pick_samples={}'.format(sys.argv, some_rows, pick_samples))
plt.style.use('seaborn')
sns.set(font_scale=1)
pd.set_option('display.max_columns', 500)

train_df = pd.read_csv("E:/kaggle/Santander/input/train.csv", nrows=some_rows)
test_df = pd.read_csv("E:/kaggle/Santander/input/test.csv", nrows=some_rows)

features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
target = train_df['target']
cat_cols = []
if True:
    #see_all_2(train_df, test_df, features, target, [0, 256, 1024])
    #plot_binary_dist(train_df, test_df, ['var_81','var_68','var_139'],bins=1024) #','var_139','var_12','var_53','var_110'
    #[train_df,test_df],features=feat_extend([train_df,test_df],features)
    print("")
else:
    #[train_df,test_df],features=feat_extend([train_df,test_df],features)
may_cols=['var_68','var_6','var_108','var_13','var_33','var_146','var_21','var_80','var_139','var_81'] 60 | may_cols=['var_68'] #var_68 is date? 61 | #may_cols = features 62 | #[train_df,test_df],features,cat_cols=df2category_hisogram([train_df,test_df],features,may_cols) 63 | #train_df,test_df,features,cat_cols = df2category_rf(train_df,target,test_df,features) 64 | 65 | from sklearn.metrics import roc_auc_score, roc_curve 66 | #Target Encoding 67 | TE_folds, TE_inner_folds=10,5 68 | if True: 69 | for var_name in cat_cols: 70 | #train_df, test_df, feat_T = TE_cross(5, 2, train_df, 'target', test_df, var_name) 71 | train_df, test_df, feat_T = TE_expm(train_df, 'target', test_df, var_name) 72 | features.append(feat_T) 73 | 74 | 75 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019) 76 | oof = np.zeros(len(train_df)) 77 | predictions = np.zeros(len(test_df)) 78 | feature_importance_df = pd.DataFrame() 79 | 80 | start = time.time() 81 | 82 | param = { 83 | 'num_leaves': nLeaves, 84 | 'num_round':n_round, 85 | #'num_leaves': 32, 86 | #'max_bin': 119, 87 | 'max_bin': max_bin, 88 | #"adaptive":'weight', 89 | 'min_data_in_leaf': min_child, 90 | 'learning_rate': lr, 91 | #'learning_rate': 0.5, 92 | 'min_sum_hessian_in_leaf': 0.00245, 93 | 'bagging_fraction': x_sub, 94 | 'bagging_freq': 5, 95 | 'feature_fraction': x_feat, 96 | 'lambda_l1': 4.972, 97 | 'lambda_l2': 2.276, 98 | 'min_gain_to_split': 0.65, 99 | 'max_depth': 14, 100 | 'save_binary': True, 101 | 'seed': 1337, 102 | 'feature_fraction_seed': 1337, 103 | 'bagging_seed': 1337, 104 | 'drop_seed': 1337, 105 | 'data_random_seed': 1337, 106 | 'objective': 'binary', 107 | 'boosting_type': 'gbdt', 108 | 'verbose': 1, 109 | 'metric': 'auc', 110 | #'metric': 'auc', 111 | 'is_unbalance': True, 112 | 'boost_from_average': False, 113 | } 114 | N_min = 100 # N_min 越大,regularization效果越强 smoothing term, minimum sample size, if sample size is less than N_min, add up to N_min 115 | # 116 | 117 | alg="LGBMRegressor" 118 | print("LightGBM training... 
train={} test={} \nparam={}".format(train_df.shape,test_df.shape,param)) 119 | features_0=features.copy() 120 | for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_df.values, target.values)): 121 | t0=time.time() 122 | print("fold n°{}".format(fold_)) 123 | num_round = 10000 124 | features = features_0.copy() 125 | X_train = train_df.iloc[trn_idx][features].astype(np.float32) 126 | Y_train = target.iloc[trn_idx].astype(np.double) 127 | X_test = train_df.iloc[val_idx][features].astype(np.float32) 128 | Y_test = target.iloc[val_idx].astype(np.double) 129 | if isMORT: 130 | #mort = LiteMORT(param).fit(X_train, Y_train, eval_set=[(X_test, Y_test)]) 131 | mort = LiteMORT(param).fit_1(X_train, Y_train, eval_set=[(X_test, Y_test)]) 132 | oof[val_idx] = mort.predict_raw(X_test) 133 | fold_score = roc_auc_score(Y_test, oof[val_idx]) 134 | #print("\nFold ", fold_, " score: ", fold_score) 135 | predictions += mort.predict_raw(test_df[features]) / 5 136 | else: 137 | if alg=="LGBMRegressor": 138 | dev=X_train 139 | val=X_test 140 | target_col='target_col'; dev[target_col]=Y_train 141 | clf = lgb.LGBMRegressor(num_boost_round=num_round, early_stopping_rounds=early_stop,**param) 142 | if True: 143 | print("features={} X_train={} Y_train={} X_test={} Y_test={} ".format( len(features), 144 | X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)) 145 | clf.fit(X_train[features], Y_train, eval_set=[(X_train[features], Y_train),(X_test[features], Y_test)],eval_metric="auc",categorical_feature=cat_cols, verbose=1000) 146 | feat_importance = clf.feature_importances_ 147 | best_iteration = clf.best_iteration_ 148 | if best_iteration is None: 149 | best_iteration = -1 150 | oof[val_idx] = clf.predict(val[features],num_iteration=best_iteration) 151 | else: 152 | gLR = GBDT_LR(clf) 153 | gLR.fit(X_train, Y_train, eval_set=[(X_test, Y_test)],eval_metric="auc", verbose=1000) 154 | feat_importance = gLR.feature_importance() 155 | best_iteration = -1 156 | clf=gLR 157 | oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], y_=target.iloc[val_idx], 158 | num_iteration=best_iteration) 159 | 160 | else: #lambda ranker 161 | gbr = lgb.LGBMRanker() 162 | gbr.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], 163 | eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False, 164 | callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)]) 165 | 166 | fold_importance_df = pd.DataFrame() 167 | fold_importance_df["feature"] = features 168 | fold_importance_df["importance"] = feat_importance 169 | fold_importance_df["fold"] = fold_ + 1 170 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 171 | 172 | predictions += clf.predict(test_df[features], num_iteration=best_iteration) / 5 173 | fold_score = roc_auc_score(Y_test, oof[val_idx]) 174 | print("fold n°{} time={} score={}".format(fold_,time.time()-t0,fold_score)) 175 | #break 176 | cv_score = roc_auc_score(target, oof) 177 | print("CV score: {:<8.5f}".format(cv_score)) 178 | 179 | if feature_importance_df.size>0: 180 | #if False: 181 | cols = (feature_importance_df[["feature", "importance"]] 182 | .groupby("feature") 183 | .mean() 184 | .sort_values(by="importance", ascending=False)[:32].index) 185 | best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)] 186 | 187 | plt.figure(figsize=(14,26)) 188 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False)) 189 | plt.title('LightGBM Features (averaged 
over folds)') 190 | plt.tight_layout() 191 | plt.savefig('lgbm_importances.png') 192 | 193 | 194 | input("Press Enter to continue...") -------------------------------------------------------------------------------- /python-package/case_ieee_fraud.py: -------------------------------------------------------------------------------- 1 | #https://www.kaggle.com/kyakovlev/ieee-simple-lgbm 2 | 3 | # General imports 4 | import numpy as np 5 | import pandas as pd 6 | import os, sys, gc, warnings, random, datetime 7 | import time 8 | import pickle 9 | from sklearn import metrics 10 | from sklearn.model_selection import train_test_split, KFold 11 | from sklearn.preprocessing import LabelEncoder 12 | from litemort import * 13 | from tqdm import tqdm 14 | import math 15 | warnings.filterwarnings('ignore') 16 | 17 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 18 | isMORT = True 19 | model='MORT' if isMORT else 'LGB' 20 | NFOLDS = 8 21 | #some_rows = 5000 22 | some_rows = None 23 | data_root = 'E:/Kaggle/ieee_fraud/input/' 24 | #data_root = '../input/' 25 | pkl_path = f'{data_root}/_kyakovlev_{some_rows}.pickle' 26 | 27 | def M_PickSamples(pick_samples,df_train,df_test): 28 | nMost = min(df_train.shape[0], df_test.shape[0]) 29 | random.seed(42) 30 | subset = random.sample(range(nMost), pick_samples) 31 | df_train = df_train.iloc[subset, :].reset_index(drop=True) 32 | df_test = df_test.iloc[subset, :].reset_index(drop=True) 33 | print('====== Mort_PickSamples ... df_train={} df_test={}'.format(df_train.shape, df_test.shape)) 34 | return df_train,df_test 35 | 36 | def seed_everything(seed=0): 37 | random.seed(seed) 38 | os.environ['PYTHONHASHSEED'] = str(seed) 39 | np.random.seed(seed) 40 | 41 | def reduce_mem_usage(df, verbose=True): 42 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 43 | start_mem = df.memory_usage().sum() / 1024**2 44 | for col in df.columns: 45 | col_type = df[col].dtypes 46 | if col_type in numerics: 47 | c_min = df[col].min() 48 | c_max = df[col].max() 49 | if str(col_type)[:3] == 'int': 50 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 51 | df[col] = df[col].astype(np.int8) 52 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 53 | df[col] = df[col].astype(np.int16) 54 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 55 | df[col] = df[col].astype(np.int32) 56 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 57 | df[col] = df[col].astype(np.int64) 58 | else: 59 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 60 | df[col] = df[col].astype(np.float16) 61 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 62 | df[col] = df[col].astype(np.float32) 63 | else: 64 | df[col] = df[col].astype(np.float64) 65 | end_mem = df.memory_usage().sum() / 1024**2 66 | if verbose: print('Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem)) 67 | return df 68 | 69 | import lightgbm as lgb 70 | 71 | 72 | def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2): 73 | print(f'train_df={tr_df.shape} test_df={tt_df.shape} \nlgb_params={lgb_params}') 74 | folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED) 75 | 76 | #X, y = tr_df[features_columns], tr_df[target] 77 | #P, P_y = tt_df[features_columns], tt_df[target] 78 | y, P_y = tr_df[target], tt_df[target] 79 | 80 | 81 | predictions = np.zeros(len(tt_df)) 82 | 83 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(tr_df[features_columns], y)): 84 | t0=time.time() 85 | print('Fold:', fold_) 86 | tr_x, tr_y = tr_df[features_columns].iloc[trn_idx, :], y[trn_idx] 87 | vl_x, vl_y = tr_df[features_columns].iloc[val_idx, :], y[val_idx] 88 | print(len(tr_x), len(vl_x)) 89 | 90 | if isMORT: 91 | model = LiteMORT(lgb_params).fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)]) 92 | best_iter = 1000 93 | # pred_val = model.predict(vl_x) 94 | pred_raw = model.predict_raw(vl_x) 95 | # y_pred[val_idx] = pred_raw 96 | fold_score = metrics.roc_auc_score(vl_y, pred_raw) 97 | pp_p = model.predict_raw(tt_df[features_columns]) 98 | else: 99 | tr_data = lgb.Dataset(tr_x, label=tr_y) 100 | if LOCAL_TEST: 101 | vl_data = lgb.Dataset(tt_df[features_columns], label=P_y) 102 | else: 103 | vl_data = lgb.Dataset(vl_x, label=vl_y) 104 | estimator = lgb.train( 105 | lgb_params, 106 | tr_data, 107 | valid_sets=[tr_data, vl_data], 108 | verbose_eval=200, 109 | ) 110 | pred_raw = estimator.predict(vl_x) 111 | fold_score = metrics.roc_auc_score(vl_y, pred_raw) 112 | pp_p = estimator.predict(tt_df[features_columns]) 113 | del tr_data, vl_data 114 | 115 | predictions += pp_p / NFOLDS 116 | 117 | if LOCAL_TEST: 118 | feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X.columns)), 119 | columns=['Value', 'Feature']) 120 | print(feature_imp) 121 | print(f'Fold:{fold_} score={fold_score} time={time.time() - t0:.4g} tr_x={tr_x.shape} val_x={vl_x.shape}') 122 | del tr_x, tr_y, vl_x, vl_y 123 | gc.collect() 124 | #break 125 | tt_df = tt_df[['TransactionID', target]] 126 | tt_df['prediction'] = predictions 127 | gc.collect() 128 | 129 | return tt_df,fold_score 130 | 131 | SEED = 42 132 | seed_everything(SEED) 133 | LOCAL_TEST = False 134 | TARGET = 'isFraud' 135 | START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d') 136 | lgb_params = { 137 | 'objective':'binary', 138 | 'boosting_type':'gbdt', 139 | 'metric':'auc', 140 | 'n_jobs':-1, 141 | 'learning_rate':0.01, 142 | "adaptive":'weight', 143 | "prune":0, 144 | 'num_leaves': 2**8, 145 | 'max_depth':-1, 146 | 'tree_learner':'serial', 147 | 'colsample_bytree': 0.7, 148 | 'subsample_freq':1, 149 | 'subsample':0.7, 150 | 'n_estimators':800, 151 | 'max_bin':255, 152 | 'verbose':666, 153 | 'seed': SEED, 154 | 'early_stopping_rounds':100, 155 | } 156 | 157 | if os.path.isfile(pkl_path): 158 | print("====== Load pickle @{} ......".format(pkl_path)) 159 | with open(pkl_path, "rb") as fp: 160 | [train_df, test_df, features_columns] = pickle.load(fp) 161 | else: 162 | print('Load Data......') 163 | train_df = pd.read_pickle(f'{data_root}/ieee-fe-with-some-eda/train_df.pkl') 164 | 165 | if LOCAL_TEST: 166 | test_df = train_df[train_df['DT_M']==train_df['DT_M'].max()].reset_index(drop=True) 167 | train_df = train_df[train_df['DT_M']<(train_df['DT_M'].max()-1)].reset_index(drop=True) 168 | else: 169 | test_df = 
pd.read_pickle(f'{data_root}/ieee-fe-with-some-eda/test_df.pkl') 170 | 171 | remove_features = pd.read_pickle(f'{data_root}/ieee-fe-with-some-eda/remove_features.pkl') 172 | remove_features = list(remove_features['features_to_remove'].values) 173 | print('Load Data OK\nShape control:', train_df.shape, test_df.shape) 174 | 175 | features_columns = [col for col in list(train_df) if col not in remove_features] 176 | 177 | ########################### Final Minification 178 | print('reduce_mem_usage......') 179 | train_df = reduce_mem_usage(train_df) 180 | test_df = reduce_mem_usage(test_df) 181 | print('reduce_mem_usage......OK!!!') 182 | if some_rows is not None: 183 | train_df,test_df = M_PickSamples(some_rows,train_df,test_df) 184 | with open(pkl_path, "wb") as fp: # Pickling 185 | pickle.dump([train_df, test_df, features_columns], fp) 186 | print("====== Dump pickle @{} ......OK".format(pkl_path)) 187 | 188 | 189 | if LOCAL_TEST: 190 | lgb_params['learning_rate'] = 0.01 191 | lgb_params['n_estimators'] = 20000 192 | lgb_params['early_stopping_rounds'] = 100 193 | test_predictions,fold_score = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params)   # the function returns a (DataFrame, score) tuple 194 | print(metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction'])) 195 | else: 196 | lgb_params['learning_rate'] = 0.005 197 | lgb_params['n_estimators'] = 5000 198 | lgb_params['early_stopping_rounds'] = 100 199 | test_predictions,fold_score = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params, NFOLDS=NFOLDS) 200 | test_predictions['isFraud'] = test_predictions['prediction'] 201 | if some_rows is None: 202 | # test_predictions[['TransactionID', 'isFraud']].to_csv(f'submit_{some_rows}_{0.5}.csv', index=False,compression='gzip') 203 | path = f'E:/Kaggle/ieee_fraud/result/[{model}]_{some_rows}_{fold_score:.5f}_F{NFOLDS}_.csv' 204 | test_predictions[['TransactionID', 'isFraud']].to_csv(path, index=False) # ,compression='gzip' 205 | print(f"test_predictions[['TransactionID', 'isFraud']] to_csv @{path}") 206 | input("Press Enter to exit...") -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_EDA.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.gridspec as gridspec 3 | import numpy as np 4 | import seaborn as sns; sns.set() 5 | import math 6 | import time 7 | from pandas.api.types import is_string_dtype 8 | from pandas.api.types import is_numeric_dtype 9 | import pandas as pd 10 | 11 | def Unique_Expand(df): 12 | unique_samples = [] 13 | unique_count = np.zeros_like(df) 14 | if True: 15 | ndf=df.values 16 | #for feature in tqdm(range(df.shape[1])): 17 | for feature in (range(ndf.shape[1])): 18 | _,index_, count_ = np.unique(ndf[:, feature], return_counts=True, return_index=True) 19 | unique_count[index_[count_ == 1], feature] += 1 20 | real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) ==2)[:, 0] 21 | synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0] 22 | df['unique']=0 23 | df.iloc[real_samples_indexes, df.columns.get_loc('unique')] = 1   # positional write; avoids chained assignment 24 | else: 25 | for f in df.columns: 26 | new_feat = [] 27 | v = dict(pd.value_counts(df[f])) 28 | for el in df[f].values: 29 | new_feat.append(v[el]) 30 | df["{}_counts".format(f)] = new_feat 31 | print("Unique_Expand::{}_counts...".format(f)) 32 | 33 | return df 34 | 35 | def all_element_values(user_data,col,tMost=60*10): 36 | t0=time.time() 37 | nz,nDump=0,100000 38 | nAllRow = 
user_data.shape[0] 39 | # df = user_data[col].str.split(',') 40 | if is_numeric_dtype(user_data[col]): 41 | elements=user_data[col].unique() 42 | else: 43 | elements=set() 44 | for row in user_data[col]: 45 | tokens = row.strip().split(',') 46 | elements = elements | set(tokens) 47 | nz = nz+1 48 | if nz>nDump and nz%nDump==0: 49 | print("\r{}({:.3g})\t time={:.3g}...".format(nz,nz*1.0/nAllRow,time.time()-t0),end="") 50 | if time.time()-t0>tMost: # bail out: this should never take more than ten minutes 51 | print("\n{}({:.3g})\t time={:.3g}...BREAK!!!\n".format(nz,nz*1.0/nAllRow,time.time()-t0),end="") 52 | break 53 | elements = list(elements) 54 | nz = min(1000,len(elements)) 55 | print("{} elements@\'{}\' type={} time={:.3g}\n elements={} :\n".format(len(elements),col, type(elements[0]), 56 | time.time()-t0,elements[0:nz])) 57 | return elements 58 | 59 | #only for binary classification 60 | def see_all_2(train, test,features,target,bins,dump_root="../see_all/"): 61 | nBinLevel = len(bins); assert nBinLevel>1 62 | values, counts = np.unique(train.target, return_counts=True) 63 | for f_name in features: 64 | n = 0 65 | print("see_all_2@{}...".format(f_name), end="") 66 | fig, ax = plt.subplots(nBinLevel, 2, figsize=(20, 10)) 67 | #a = train[f_name].loc[train.target == 0] 68 | fig.suptitle("\"{}\" V={} N={} 0={} 1={}".format(f_name,values, counts,"Blue","Red")) 69 | for bin in bins: 70 | bin = bin if bin >0 else None 71 | sns.distplot(train[f_name].loc[train.target == 0],ax=ax[n,0], color="Blue", bins=bin,norm_hist=True) 72 | sns.distplot(train.loc[train.target == 1, f_name],ax=ax[n,0], color="Red", bins=bin, norm_hist=True) 73 | sns.distplot(test.loc[:, f_name],ax=ax[n,1], color="Mediumseagreen", bins=bin, norm_hist=True) 74 | #ax[0].set_xlabel("") 75 | #ax[1].set_xlabel("") 76 | n=n+1 77 | #plt.show(block=True) 78 | plt.savefig("{}_[{}]_.jpg".format(dump_root, f_name)) 79 | plt.clf(); plt.cla(); plt.close() 80 | 81 | def plot_binary_dist(train,test,feature_names,bins=None): 82 | n_top = max(2,len(feature_names)) # keep at least 2 rows so plt.subplots returns a 2-D array of axes 
83 | fig, ax = plt.subplots(n_top, 2, figsize=(10, 5 * n_top)) #, figsize=(10, 5 * n_top) 84 | n=0 85 | for f_name in feature_names: 86 | a = train[f_name].loc[train.target == 0] 87 | sns.distplot(train[f_name].loc[train.target == 0], ax=ax[n, 0], color="Blue", bins=bins,norm_hist=True) 88 | sns.distplot(train.loc[train.target == 1, f_name], ax=ax[n, 0], color="Red", bins=bins, norm_hist=True) 89 | sns.distplot(test.loc[:, f_name], ax=ax[n, 1], color="Mediumseagreen", bins=bins, norm_hist=True) 90 | ax[n, 0].set_title("Train {}".format(f_name)) 91 | ax[n, 1].set_title("Test {}".format(f_name)) 92 | ax[n, 0].set_xlabel("") 93 | ax[n, 1].set_xlabel("") 94 | n=n+1 95 | plt.show(block=True) 96 | 97 | def ann(row,col_A,col_B,axis=None): 98 | ind = row[0] 99 | r = row[1] 100 | info = "{}:{:.2g}".format(r[col_A],r[col_B]) #ind 101 | info = "{:.2g}".format(r[col_B]) 102 | plt.gca().annotate(info, xy=(r[col_A], r[col_B]), xytext=(2,2) , textcoords ="offset points" ) 103 | 104 | def plot_join_distri(df,listDict): 105 | no,nFig = 0,len(listDict) 106 | listG=[] 107 | for dict in listDict: 108 | x,y,title=dict['x'], dict['y'], dict['title'] 109 | sns.set(style="darkgrid", color_codes=True) 110 | #marginal = dict(bins=15, rug=True) 111 | marginal = {'bins':150, 'rug':True} 112 | g = sns.jointplot(x, y, data=df, kind="reg",size=10,marginal_kws=marginal) #kind= 113 | if False: 114 | g = g.plot_joint(plt.scatter, color="m", edgecolor="white") 115 | _ = g.ax_marg_x.hist(x, color="b", alpha=.6) 116 | _ = g.ax_marg_y.hist(y, color="r", alpha=.6,orientation="horizontal") 117 | head = df.sort_values(by=[x], ascending=[False]).head(5) 118 | #tail = tips.sort_values(by=['resid'], ascending=[False]).tail(5) 119 | for row in head.iterrows(): 120 | ann(row,x,y) 121 | plt.title(title) 122 | path = "{}_{}.png".format(title, no) 123 | g.savefig(path); listG.append(path) 124 | #plt.close() # must close here, otherwise plt.show() repeats the figure 125 | no = no + 1 126 | 127 | if False: # subplots migration 128 | fig = plt.figure(figsize=(2, 2)) # plt.figure(figsize=(12, 8)) 129 | no=0 130 | for path in listG: 131 | no = no + 1 132 | img = plt.imread(path)   # plt.imread: matplotlib.image (mpimg) is never imported here 133 | fig.add_subplot(2, 2, no) 134 | plt.imshow(img) 135 | 136 | plt.show() 137 | print("listG={}".format(len(listG))) 138 | 139 | #https://stackoverflow.com/questions/43010462/annotate-outliers-on-seaborn-jointplot 140 | # many problems with this approach: the axes get lost 141 | def plot_join_distri_0(df,listDict): 142 | no,nFig = 0,len(listDict) 143 | nRow=int(math.sqrt(nFig)) 144 | nCol=(int)(math.ceil(nFig*1.0/nRow)) 145 | fig, axs = plt.subplots(nRow,nCol) 146 | for dict in listDict: 147 | x,y,title=dict['x'], dict['y'], dict['title'] 148 | row,col=(int)(no/nCol),(int)(no%nCol) 149 | axis = axs[row,col] 150 | g = sns.jointplot(x, y, data=df, kind="reg",size=7, ax=axis) 151 | head = df.sort_values(by=[x], ascending=[False]).head(5) 152 | #tail = tips.sort_values(by=['resid'], ascending=[False]).tail(5) 153 | for row in head.iterrows(): 154 | ann(row,x,y,axis) 155 | no=no+1 156 | plt.title(title) 157 | plt.close() # must close, otherwise plt.show() repeats the figure 158 | 159 | plt.show() 160 | print("listG={}".format(0)) 161 | # https://stackoverflow.com/questions/35042255/how-to-plot-multiple-seaborn-jointplot-in-subplot 162 | class SeabornFig2Grid(): 163 | 164 | def __init__(self, seaborngrid, fig, subplot_spec): 165 | self.fig = fig 166 | self.sg = seaborngrid 167 | self.subplot = subplot_spec 168 | if isinstance(self.sg, sns.axisgrid.FacetGrid) or \ 169 | isinstance(self.sg, 
sns.axisgrid.PairGrid): 170 | self._movegrid() 171 | elif isinstance(self.sg, sns.axisgrid.JointGrid): 172 | self._movejointgrid() 173 | self._finalize() 174 | 175 | def _movegrid(self): 176 | """ Move PairGrid or FacetGrid """ 177 | self._resize() 178 | n = self.sg.axes.shape[0] 179 | m = self.sg.axes.shape[1] 180 | self.subgrid = gridspec.GridSpecFromSubplotSpec(n,m, subplot_spec=self.subplot) 181 | for i in range(n): 182 | for j in range(m): 183 | self._moveaxes(self.sg.axes[i,j], self.subgrid[i,j]) 184 | 185 | def _movejointgrid(self): 186 | """ Move JointGrid """ 187 | h= self.sg.ax_joint.get_position().height 188 | h2= self.sg.ax_marg_x.get_position().height 189 | r = int(np.round(h/h2)) 190 | self._resize() 191 | self.subgrid = gridspec.GridSpecFromSubplotSpec(r+1,r+1, subplot_spec=self.subplot) 192 | 193 | self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1]) 194 | self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1]) 195 | self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1]) 196 | 197 | def _moveaxes(self, ax, gs): 198 | #https://stackoverflow.com/a/46906599/4124317 199 | ax.remove() 200 | ax.figure=self.fig 201 | self.fig.axes.append(ax) 202 | self.fig.add_axes(ax) 203 | ax._subplotspec = gs 204 | ax.set_position(gs.get_position(self.fig)) 205 | ax.set_subplotspec(gs) 206 | 207 | def _finalize(self): 208 | plt.close(self.sg.fig) 209 | self.fig.canvas.mpl_connect("resize_event", self._resize) 210 | self.fig.canvas.draw() 211 | 212 | def _resize(self, evt=None): 213 | self.sg.fig.set_size_inches(self.fig.get_size_inches()) 214 | 215 | if __name__ == "__main__": 216 | iris = sns.load_dataset("iris") 217 | tips = sns.load_dataset("tips") 218 | 219 | # An lmplot 220 | g0 = sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips,palette=dict(Yes="g", No="m")) 221 | # A PairGrid 222 | g1 = sns.PairGrid(iris, hue="species") 223 | g1.map(plt.scatter, s=5) 224 | # A FacetGrid 225 | g2 = sns.FacetGrid(tips, col="time", hue="smoker") 226 | g2.map(plt.scatter, "total_bill", "tip", edgecolor="w") 227 | # A JointGrid 228 | g3 = sns.jointplot("sepal_width", "petal_length", data=iris,kind="kde", space=0, color="g") 229 | fig = plt.figure(figsize=(13,8)) 230 | gs = gridspec.GridSpec(2, 2) 231 | mg0 = SeabornFig2Grid(g0, fig, gs[0])   # the class is defined in this module, so call it directly 232 | mg1 = SeabornFig2Grid(g1, fig, gs[1]) 233 | mg2 = SeabornFig2Grid(g2, fig, gs[3]) 234 | mg3 = SeabornFig2Grid(g3, fig, gs[2]) 235 | gs.tight_layout(fig) 236 | #gs.update(top=0.7) 237 | plt.show() -------------------------------------------------------------------------------- /python-package/pycharm_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | http://www.cnblogs.com/amazement/p/10341328.html 3 | Do not execute a module that uses relative imports directly with the interpreter (when run directly, the __name__ of the module for this .py file becomes '__main__') 4 | ''' 5 | import gc 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn import preprocessing 9 | import os 10 | from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,load_iris, load_svmlight_file) 11 | import time 12 | import pickle 13 | from sklearn.metrics import log_loss, mean_squared_error 14 | import matplotlib.pyplot as plt 15 | from sklearn.model_selection import GridSearchCV, train_test_split 16 | import shap 17 | import sys 18 | from litemort import * 19 | import lightgbm as lgb 20 | from sklearn import metrics 21 | 22 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 23 | #isMORT = True 24 | 25 | def auc2(m, train, test,y_train,y_test): 26 | return 
(metrics.roc_auc_score(y_train,m.predict(train)), 27 | metrics.roc_auc_score(y_test,m.predict(test))) 28 | 29 | # https://www.kdnuggets.com/2018/03/catboost-vs-light-gbm-vs-xgboost.html 30 | def test_fly_( ): 31 | import pandas as pd, numpy as np, time 32 | from sklearn.model_selection import train_test_split 33 | frac=0.1 34 | pkl_path = 'G:/kaggle/flight/flight_{}.pickle'.format(frac) 35 | if os.path.isfile(pkl_path): 36 | with open(pkl_path, "rb") as fp: # Pickling 37 | [data] = pickle.load(fp) 38 | else: 39 | data = pd.read_csv("G:/kaggle/flight/flights.csv") 40 | data = data.sample(frac=frac, random_state=10) 41 | data = data[["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", 42 | "ORIGIN_AIRPORT", "AIR_TIME", "DEPARTURE_TIME", "DISTANCE", "ARRIVAL_DELAY"]] 43 | data.dropna(inplace=True) 44 | data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1 45 | with open(pkl_path, "wb") as fp: # Pickling 46 | pickle.dump([data], fp) 47 | os._exit(-1)   # cache written; rerun the script to load from the pickle 48 | 49 | cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"] 50 | for item in cols: 51 | data[item] = data[item].astype("category").cat.codes + 1 52 | train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"], random_state=10, test_size=0.25) 53 | 54 | if False: 55 | lg = lgb.LGBMClassifier(silent=False) 56 | param_dist = {"max_depth": [25,50, 75], 57 | "learning_rate" : [0.01,0.05,0.1], 58 | "num_leaves": [300,900,1200], 59 | "n_estimators": [200] 60 | } 61 | grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=5) 62 | grid_search.fit(train,y_train) 63 | print(grid_search.best_estimator_) 64 | params = { "objective": "binary",'subsample': 1, 65 | "metric": "binary_logloss",#""binary_logloss", 66 | "max_depth": 50, "learning_rate": 0.1, "num_leaves": 900, "n_estimators": 300} 67 | cate_features_name = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "DESTINATION_AIRPORT","ORIGIN_AIRPORT"] 68 | t0=time.time() 69 | a1,a2=0,0 70 | if isMORT: 71 | model2 = LiteMORT(params).fit(train, y_train) 72 | if False: 73 | y_predict = model2.predict(test,raw_score=True)[:,1] 74 | a1 = metrics.roc_auc_score(y_test,model2.predict(test,raw_score=True)[:,1]) 75 | print("------ No categorical auc={}".format(a1)) 76 | #model2 = LiteMORT(params).fit(train, y_train, categorical_feature = cate_features_name) 77 | #a2 = metrics.roc_auc_score(y_test,model2.predict(test,raw_score=True)[:,1]) 78 | print("------ With Categorical auc={}".format(a2)) 79 | elif True: 80 | model2 = lgb.LGBMClassifier( **params ) 81 | model2.fit(train, y_train, eval_set=[(train, y_train)], verbose=True) 82 | result = model2.predict_proba(test) 83 | else: 84 | d_train = lgb.Dataset(train, label=y_train,free_raw_data=False) 85 | # Without Categorical Features 86 | model2 = lgb.train(params, d_train,valid_sets=[d_train]) 87 | model2.save_model('gbm_test_fly_.model') 88 | a1=auc2(model2, train, test,y_train,y_test) 89 | print("------ No categorical auc2={}".format(a1)) 90 | 91 | #With Categorical Features 92 | #model2 = lgb.train(params, d_train, categorical_feature = cate_features_name) 93 | #a2=auc2(model2, train, test,y_train,y_test) 94 | print("------ With categorical auc2={}".format(a2)) 95 | if 'd_train' in locals(): del d_train   # d_train exists only on the lgb.train path 96 | gc.collect() 97 | input("auc@test_fly_ is {} time={} model={}...".format(a1,time.time()-t0,model2)) 98 | os._exit(-98) 99 | 100 | def test_shap_adult_(): 101 | shap.initjs() 102 | X,y = shap.datasets.adult() 103 | X_display,y_display = 
shap.datasets.adult(display=True) 104 | # create a train/test split 105 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7) 106 | 107 | params = { 108 | "max_bin": 512, 109 | "learning_rate": 0.05, 110 | "boosting_type": "gbdt", 111 | "objective": "binary", 112 | "metric": "binary_logloss", 113 | "num_leaves": 10, 114 | "verbose": 1000, 115 | "min_data": 100, 116 | "boost_from_average": True, 117 | 'early_stop': 50, 'num_boost_round': 10000, 118 | } 119 | if isMORT: 120 | model = LiteMORT(params).fit(X_train, y_train,eval_set=[(X_test,y_test)]) 121 | result = model.predict(X_test) 122 | result = model.predict(X_test,raw_score=True) 123 | elif True: 124 | gbm = lgb.LGBMClassifier(n_estimators=10000, silent=True) 125 | gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False) 126 | result = gbm.predict(X_test) #predict_proba+_le.inverse_transform 127 | result = gbm.predict_proba(X_test) 128 | else: # careful: LGBMClassifier and lgb.train return results in different formats 129 | d_train = lgb.Dataset(X_train, label=y_train) 130 | d_test = lgb.Dataset(X_test, label=y_test) 131 | model = lgb.train(params, d_train, 10000, valid_sets=[d_test], early_stopping_rounds=50, verbose_eval=1000) 132 | if False:#https://slundberg.github.io/shap/notebooks/Census%20income%20classification%20with%20LightGBM.html 133 | explainer = shap.TreeExplainer(model) 134 | shap_values = explainer.shap_values(X) 135 | shap.force_plot(explainer.expected_value, shap_values[0, :], X_display.iloc[0, :]) 136 | shap.force_plot(explainer.expected_value, shap_values[:1000, :], X_display.iloc[:1000, :]) 137 | shap.summary_plot(shap_values, X) 138 | plt.show() 139 | result = model.predict(X_test) 140 | loss = log_loss(y_test, result) 141 | input("loss@test_shap_adult_ is {}...".format(loss))   # 'model' is undefined on the LGBMClassifier path, so it is not printed here 142 | os._exit(-99) 143 | 144 | def test_1(): 145 | from sklearn.metrics import mean_squared_error 146 | from sklearn.datasets import load_boston 147 | from sklearn.model_selection import KFold 148 | 149 | params = { 150 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 151 | } 152 | boston = load_boston() 153 | y = boston['target'] 154 | X = boston['data'] 155 | kf = KFold(n_splits=2, shuffle=True, random_state=rng) 156 | for train_index, test_index in kf.split(X, y): 157 | # xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) 158 | mort = LiteMORT(params) 159 | mort.fit(X[train_index], y[train_index], params=params) 160 | preds = mort.predict(X[test_index]) 161 | labels = y[test_index] 162 | assert mean_squared_error(preds, labels) < 25 163 | 164 | params = { 165 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50, 166 | "verbosity": 1, 'subsample': 1, 167 | } 168 | if __name__ == "__main__": 169 | test_fly_() 170 | #test_shap_adult_() 171 | nTree=100 #100 172 | 173 | rng = np.random.RandomState(1994) 174 | np.random.seed(42) 175 | params = { 176 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': nTree, 177 | "verbosity": 1, 178 | } 179 | X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str 180 | "B": np.random.permutation([1, 2, 3] * 100), # int 181 | "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float 182 | "D": np.random.permutation([True, False] * 150)}) # bool 183 | 184 | y = np.random.permutation([0, 1] * 150) 185 | X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), 186 | "B": np.random.permutation([1, 3] 
* 30), 187 | "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), 188 | "D": np.random.permutation([True, False] * 30)}) 189 | if True: 190 | #preprocess = Mort_Preprocess() 191 | X, X_test = Mort_Preprocess.OrdinalEncode_(X, X_test) 192 | ''' 193 | for col in ["A", "B", "C", "D"]: 194 | X[col] = X[col].astype('category') 195 | X_test[col] = X_test[col].astype('category') 196 | ''' 197 | 198 | if True: 199 | isLabel=True 200 | gbm0 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y) 201 | gbm0.booster_.save_model('gbm0.model') 202 | result = gbm0.predict(X_test,raw_score=isLabel)   # n_estimators belongs to the constructor, not to predict() 203 | pred0 = list(gbm0.predict(X_test,raw_score=isLabel)) 204 | gbm1 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y, categorical_feature=[0]) 205 | gbm1.booster_.save_model('gbm1.model') 206 | pred1 = list(gbm1.predict(X_test,raw_score=isLabel)) 207 | gbm2 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y, categorical_feature=['A']) 208 | pred2 = list(gbm2.predict(X_test,raw_score=isLabel)) 209 | gbm3 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y, categorical_feature=['A', 'B', 'C', 'D']) 210 | pred3 = list(gbm3.predict(X_test,raw_score=isLabel)) 211 | np.testing.assert_almost_equal(pred0, pred1) 212 | np.testing.assert_almost_equal(pred0, pred2) 213 | np.testing.assert_almost_equal(pred0, pred3) 214 | ''' 215 | gbm3.booster_.save_model('categorical.model') 216 | gbm4 = lgb.Booster(model_file='categorical.model') 217 | pred4 = list(gbm4.predict(X_test)) 218 | pred_prob = list(gbm0.predict_proba(X_test)[:, 1]) 219 | np.testing.assert_almost_equal(pred_prob, pred4) 220 | ''' 221 | 222 | if False: 223 | mort0 = LiteMORT(params).fit(X, y) 224 | pred0 = list(mort0.predict(X_test)) 225 | else: 226 | mort1 = LiteMORT(params).fit(X, y, categorical_feature=[0]) 227 | pred1 = list(mort1.predict(X_test)) 228 | 229 | mort2 = LiteMORT(params).fit(X, y, categorical_feature=['A']) 230 | pred2 = list(mort2.predict(X_test)) 231 | mort3 = LiteMORT(params).fit(X, y, categorical_feature=['A', 'B', 'C', 'D']) 232 | pred3 = list(mort3.predict(X_test)) 233 | #np.testing.assert_almost_equal(pred1, pred1) 234 | np.testing.assert_almost_equal(pred1, pred2) 235 | #np.testing.assert_almost_equal(pred1, pred3) 236 | input("...") 237 | # gc.collect() 238 | #ret = log_loss(y_test, mort.predict_proba(X_test)) -------------------------------------------------------------------------------- /src/util/FastExpLog.c: -------------------------------------------------------------------------------- 1 | //http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.1569&rep=rep1&type=pdf 2 | 3 | /* 4 | * See http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/ 5 | * 6 | * All of these rely on being on a little endian machine, such as an Intel box. 7 | * 8 | * These can be _quite_ inaccurate. ~20% in many cases, but being much faster (~7x) may 9 | * permit more loop iterations of tuning algorithms that only need approximate powers. 10 | * 11 | * This version of Ankerl's algorithm has been extended to provide optionally conservative (lower) bounds 12 | * and also to generate a full linear interpolation across the entire significand rather than 'stair-step' 13 | * at the expense of performing a 64 bit operation rather than a 32 bit one. This is cheap these days. 
14 | * 15 | * 'exp' is further improved by using a suggestion by Nic Schraudolph: 16 | * 17 | * "You can get a much better approximation (piecewise rational instead of linear) at 18 | * the cost of a single floating-point division by using better_exp(x) = exp(x/2)/exp(-x/2), 19 | * where exp() is my published approximation but you don't need the additive constant anymore, 20 | * you can use c=0. On machines with hardware division this is very attractive." -- Nic Schraudolph 21 | * 22 | * --Edward Kmett 23 | * 24 | * TODO: Incorporate the techniques from https://code.google.com/p/fastapprox/ to enable us 25 | * to calculate more interesting approximate functions. They might need to be generalized to work on 26 | * Double values where appropriate I suppose. 27 | * 28 | * Magic numbers: 29 | * float /int : round(1<<23/log(2)) = 12102203, 127<<23 = 1065353216 30 | * double/int : round(1<<20/log(2)) = 1512775, 1023<<20 = 1072693248 31 | * double/long long: round(1<<52/log(2)) = 6497320848556798, 1023<<52 = 4607182418800017408 32 | * 33 | * The fudge factors such that exp y <= exp_fast y: 34 | * >>> ceiling (2^23 * (1 - (log (log 2) + 1)/log 2)) 35 | * 722019 36 | * >>> ceiling (2^20 * (1 - (log (log 2) + 1)/log 2)) 37 | * 90253 38 | * >>> ceiling (2^52 * (1 - (log (log 2) + 1)/log 2)) 39 | * 387630818974388 40 | * 41 | * The fudge factor such that exp_fast y <= exp y is uniformly -1 42 | * 43 | * TODO: perform exponential doubling for pow based on better_exp_fast instead for better accuracy. 44 | */ 45 | 46 | /* Schraudolph's published algorithm extended into the least significant bits to avoid the stair step. 47 | double long long approximation: round 1<<52/log(2) 6497320848556798, 48 | mask = 0x3ff0000000000000LL = 4607182418800017408LL 49 | double approximation: round(1<<20/log(2)) = 1512775, 1023<<20 = 1072693248 50 | */ 51 | 52 | /* 4607182418800017408 - 387630818974388 = 4606794787981043020 53 | 54 | Exponent mask adapted to full 64 bit precision: 55 | >>> 1023 * 2^52 56 | 4607182418800017408 57 | 58 | The fudge factor for conservative lower bound adapted to full 64 bit precision: 59 | >>> round (2^52 * (1 - (log (log 2) + 1)/log 2)) 60 | 387630818974388 61 | 62 | As a lower bound this is suitable for use when generating Mass and Precision estimates. 
63 | */ 64 | double exp_fast_lb(double a) { 65 | union { double d; long long x; } u; 66 | u.x = (long long)(6497320848556798LL * a + 4606794787981043020); 67 | return u.d; 68 | } 69 | 70 | /* 4607182418800017408 + 1 */ 71 | double exp_fast_ub(double a) { 72 | union { double d; long long x; } u; 73 | u.x = (long long)(6497320848556798LL * a + 4607182418800017409); 74 | return u.d; 75 | } 76 | 77 | double exp_fast(double a) { 78 | union { double d; long long x; } u; 79 | u.x = (long long)(6497320848556798LL * a + 0x3fef127e83d16f12LL); 80 | return u.d; 81 | } 82 | 83 | double better_exp_fast(double a) { 84 | union { double d; long long x; } u, v; 85 | u.x = (long long)(3248660424278399LL * a + 0x3fdf127e83d16f12LL); 86 | v.x = (long long)(0x3fdf127e83d16f12LL - 3248660424278399LL * a); 87 | return u.d / v.d; 88 | } 89 | 90 | /* Schraudolph's published algorithm */ 91 | double exp_fast_schraudolph(double a) { 92 | union { double d; int x[2]; } u; 93 | u.x[1] = (int)(1512775 * a + 1072632447); 94 | u.x[0] = 0; 95 | return u.d; 96 | } 97 | 98 | /* 1065353216 + 1 */ 99 | float expf_fast_ub(float a) { 100 | union { float f; int x; } u; 101 | u.x = (int)(12102203 * a + 1065353217); 102 | return u.f; 103 | } 104 | 105 | /* Schraudolph's published algorithm with John's constants */ 106 | /* 1065353216 - 486411 = 1064866805 */ 107 | float expf_fast(float a) { 108 | union { float f; int x; } u; 109 | u.x = (int)(12102203 * a + 1064866805); 110 | return u.f; 111 | } 112 | 113 | // 1056478197 114 | double better_expf_fast(float a) { 115 | union { float f; int x; } u, v; 116 | u.x = (long long)(6051102 * a + 1056478197); 117 | v.x = (long long)(1056478197 - 6051102 * a); 118 | return u.f / v.f; 119 | } 120 | 121 | /* 1065353216 - 722019 */ 122 | float expf_fast_lb(float a) { 123 | union { float f; int x; } u; 124 | u.x = (int)(12102203 * a + 1064631197); 125 | return u.f; 126 | } 127 | 128 | /* Ankerl's inversion of Schraudolph's published algorithm, converted to explicit multiplication */ 129 | double log_fast_ankerl(double a) { 130 | union { double d; int x[2]; } u = { a }; 131 | return (u.x[1] - 1072632447) * 6.610368362777016e-7; /* 1 / 1512775.0; */ 132 | } 133 | 134 | double log_fast_ub(double a) { 135 | union { double d; long long x; } u = { a }; 136 | return (u.x - 4606794787981043020) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */ 137 | } 138 | 139 | /* Ankerl's inversion of Schraudolph's published algorithm with my constants */ 140 | double log_fast(double a) { 141 | union { double d; long long x; } u = { a }; 142 | return (u.x - 4606921278410026770) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */ 143 | } 144 | 145 | double log_fast_lb(double a) { 146 | union { double d; long long x; } u = { a }; 147 | return (u.x - 4607182418800017409) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */ 148 | } 149 | 150 | 151 | /* 1065353216 - 722019 */ 152 | float logf_fast_ub(float a) { 153 | union { float f; int x; } u = { a }; 154 | return (u.x - 1064631197) * 8.262958405176314e-8f; /* 1 / 12102203.0; */ 155 | } 156 | 157 | /* Ankerl's adaptation of Schraudolph's published algorithm with John's constants */ 158 | /* 1065353216 - 486411 = 1064866805 */ 159 | float logf_fast(float a) { 160 | union { float f; int x; } u = { a }; 161 | return (u.x - 1064866805) * 8.262958405176314e-8f; /* 1 / 12102203.0; */ 162 | } 163 | 164 | /* 1065353216 + 1 */ 165 | float logf_fast_lb(float a) { 166 | union { float f; int x; } u = { a }; 167 | return (u.x - 1065353217) * 8.262958405176314e-8f; /* 1 / 
12102203.0 */ 168 | } 169 | 170 | /* Ankerl's version of Schraudolph's approximation. */ 171 | double pow_fast_ankerl(double a, double b) { 172 | union { double d; int x[2]; } u = { a }; 173 | u.x[1] = (int)(b * (u.x[1] - 1072632447) + 1072632447); 174 | u.x[0] = 0; 175 | return u.d; 176 | } 177 | 178 | /* 179 | These constants are based loosely on the following comment off of Ankerl's blog: 180 | 181 | "I have used the same trick for float, not double, with some slight modification to the constants to suite IEEE754 float format. The first constant for float is 1<<23/log(2) and the second is 127<<23 (for double they are 1<<20/log(2) and 1023<<20)." -- John 182 | */ 183 | 184 | /* 1065353216 + 1 = 1065353217 ub */ 185 | /* 1065353216 - 486411 = 1064866805 min RMSE */ 186 | /* 1065353216 - 722019 = 1064631197 lb */ 187 | float powf_fast(float a, float b) { 188 | union { float d; int x; } u = { a }; 189 | u.x = (int)(b * (u.x - 1064866805) + 1064866805); 190 | return u.d; 191 | } 192 | 193 | float powf_fast_lb(float a, float b) { 194 | union { float d; int x; } u = { a }; 195 | u.x = (int)(b * (u.x - 1065353217) + 1064631197); 196 | return u.d; 197 | } 198 | 199 | float powf_fast_ub(float a, float b) { 200 | union { float d; int x; } u = { a }; 201 | u.x = (int)(b * (u.x - 1064631197) + 1065353217); 202 | return u.d; 203 | } 204 | 205 | /* 206 | Now that 64 bit arithmetic is cheap we can (try to) improve on Ankerl's algorithm. 207 | 208 | double long long approximation: round 1<<52/log(2) 6497320848556798, 209 | mask = 0x3ff0000000000000LL = 4607182418800017408LL 210 | 211 | >>> round (2**52 * log (3 / (8 * log 2) + 1/2) / log 2 - 1/2) 212 | 261140389990638 213 | >>> 0x3ff0000000000000 - round (2**52 * log (3 / (8 * log 2) + 1/2) / log 2 - 1/2) 214 | 4606921278410026770 215 | 216 | */ 217 | 218 | double pow_fast_ub(double a, double b) { 219 | union { double d; long long x; } u = { a }; 220 | u.x = (long long)(b * (u.x - 4606794787981043020LL) + 4607182418800017409LL); 221 | return u.d; 222 | } 223 | 224 | double pow_fast(double a, double b) { 225 | union { double d; long long x; } u = { a }; 226 | u.x = (long long)(b * (u.x - 4606921278410026770LL) + 4606921278410026770LL); 227 | return u.d; 228 | } 229 | 230 | double pow_fast_lb(double a, double b) { 231 | union { double d; long long x; } u = { a }; 232 | u.x = (long long)(b * (u.x - 4607182418800017409LL) + 4606794787981043020LL); 233 | return u.d; 234 | } 235 | 236 | /* should be much more precise with large b, still ~3.3x faster. */ 237 | double pow_fast_precise_ankerl(double a, double b) { 238 | int flipped = 0; 239 | if (b < 0) { 240 | flipped = 1; 241 | b = -b; 242 | } 243 | 244 | /* calculate approximation with fraction of the exponent */ 245 | int e = (int)b; 246 | union { double d; int x[2]; } u = { a }; 247 | u.x[1] = (int)((b - e) * (u.x[1] - 1072632447) + 1072632447); 248 | u.x[0] = 0; 249 | 250 | double r = 1.0; 251 | while (e) { 252 | if (e & 1) { 253 | r *= a; 254 | } 255 | a *= a; 256 | e >>= 1; 257 | } 258 | 259 | r *= u.d; 260 | return flipped ? 1.0 / r : r; 261 | } 262 | 263 | /* should be much more precise with large b, still ~3.3x faster. 
*/ 264 | double pow_fast_precise(double a, double b) { 265 | int flipped = 0; 266 | if (b < 0) { 267 | flipped = 1; 268 | b = -b; 269 | } 270 | 271 | /* calculate approximation with fraction of the exponent */ 272 | int e = (int)b; 273 | double d = exp_fast(b - e); 274 | 275 | double r = 1.0; 276 | while (e) { 277 | if (e & 1) r *= a; 278 | a *= a; 279 | e >>= 1; 280 | } 281 | 282 | r *= d; 283 | return flipped ? 1.0 / r : r; 284 | } 285 | 286 | double better_pow_fast_precise(double a, double b) { 287 | int flipped = 0; 288 | if (b < 0) { 289 | flipped = 1; 290 | b = -b; 291 | } 292 | 293 | /* calculate approximation with fraction of the exponent */ 294 | int e = (int)b; 295 | double d = better_exp_fast(b - e); 296 | 297 | double r = 1.0; 298 | while (e) { 299 | if (e & 1) r *= a; 300 | a *= a; 301 | e >>= 1; 302 | } 303 | 304 | r *= d; 305 | return flipped ? 1.0 / r : r; 306 | } 307 | 308 | 309 | /* should be much more precise with large b */ 310 | float powf_fast_precise(float a, float b) { 311 | int flipped = 0; 312 | if (b < 0) { 313 | flipped = 1; 314 | b = -b; 315 | } 316 | 317 | /* calculate approximation with fraction of the exponent */ 318 | int e = (int)b; 319 | union { float f; int x; } u = { a }; 320 | u.x = (int)((b - e) * (u.x - 1065353216) + 1065353216); 321 | 322 | float r = 1.0f; 323 | while (e) { 324 | if (e & 1) { 325 | r *= a; 326 | } 327 | a *= a; 328 | e >>= 1; 329 | } 330 | 331 | r *= u.f; 332 | return flipped ? 1.0f / r : r; 333 | } 334 | 335 | /* should be much more precise with large b */ 336 | float better_powf_fast_precise(float a, float b) { 337 | int flipped = 0; 338 | if (b < 0) { 339 | flipped = 1; 340 | b = -b; 341 | } 342 | 343 | /* calculate approximation with fraction of the exponent */ 344 | int e = (int)b; 345 | float f = better_expf_fast(b - e); 346 | 347 | float r = 1.0f; 348 | while (e) { 349 | if (e & 1) { 350 | r *= a; 351 | } 352 | a *= a; 353 | e >>= 1; 354 | } 355 | 356 | r *= f; 357 | return flipped ? 1.0f / r : r; 358 | } 359 | --------------------------------------------------------------------------------
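A note on the constants in src/util/FastExpLog.c: Schraudolph's trick exploits the IEEE-754 double layout. Computing i = round(2^52/ln 2 * x) + bias in integer arithmetic and reinterpreting i as a double approximates exp(x), because writing into the exponent field scales the value by powers of two; the different bias constants in the file select an upper bound, a lower bound, or minimum-RMSE behaviour, exactly as the header comment's magic-number table describes. The standalone driver below is a minimal sketch for experimentation only (the file name and main() are illustrative, not part of the repository); it copies exp_fast() verbatim from src/util/FastExpLog.c and prints its relative error against libm's exp():

/* demo_fast_exp.c -- illustrative sketch, not part of LiteMORT.
 * Build: cc demo_fast_exp.c -o demo_fast_exp -lm
 */
#include <stdio.h>
#include <math.h>

/* exp_fast() copied from src/util/FastExpLog.c:
 * 6497320848556798 = round(2^52 / ln 2); the additive constant is the
 * RMSE-centred bias discussed in the file's header comment. */
static double exp_fast(double a) {
    union { double d; long long x; } u;
    u.x = (long long)(6497320848556798LL * a + 0x3fef127e83d16f12LL);
    return u.d;
}

int main(void) {
    const double xs[] = { -5.0, -1.0, -0.1, 0.0, 0.1, 1.0, 5.0 };
    for (int i = 0; i < (int)(sizeof xs / sizeof xs[0]); ++i) {
        double approx = exp_fast(xs[i]);
        double exact  = exp(xs[i]);   /* libm reference */
        printf("x=%6.2f  exp_fast=%.6g  exp=%.6g  rel.err=%.3g\n",
               xs[i], approx, exact, fabs(approx - exact) / exact);
    }
    return 0;
}

As the header comment warns, these approximations assume a little-endian machine and can be off by several percent; they trade accuracy for speed and are not drop-in replacements for the libm functions.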