├── tests
│   ├── __init__.py
│   ├── test_main.cpp
│   ├── test_math.cpp
│   ├── math_test.py
│   ├── cpp_test.py
│   └── python_package_test
│       ├── test_basic.py
│       └── test_sklearn.py
├── .pytest_cache
│   └── v
│       └── cache
│           ├── nodeids
│           └── lastfailed
├── vs
│   └── LiteMORT
│       ├── SA_salp.cpp
│       ├── stdafx.h
│       ├── dllmain.cpp
│       ├── stdafx.cpp
│       ├── targetver.h
│       ├── LiteMORT.cpp
│       ├── ReadMe.txt
│       ├── LiteMORT.sln
│       └── LiteMORT.vcxproj.filters
├── MANIFEST.in
├── src
│   ├── data_fold
│   │   ├── Cluster.hpp
│   │   ├── EDA.cpp
│   │   ├── EDA.hpp
│   │   ├── Loss.hpp
│   │   ├── Move.hpp
│   │   ├── Binfold.cpp
│   │   ├── Binfold.hpp
│   │   ├── DataFold.cpp
│   │   ├── DataFold.hpp
│   │   ├── FeatVector.cpp
│   │   ├── FeatVector.hpp
│   │   ├── Histogram.cpp
│   │   ├── Histogram.hpp
│   │   ├── Distribution.hpp
│   │   ├── FeatVec_EXP.hpp
│   │   ├── FeatVec_Quanti.hpp
│   │   ├── CMakeLists.txt
│   │   ├── Imputer.hpp
│   │   ├── Loss_binary.hpp
│   │   ├── FeatVec_2D.hpp
│   │   └── Representive.hpp
│   ├── LiteMORT.cpp
│   ├── learn
│   │   ├── LOSS.cpp
│   │   ├── DCRIMI_.hpp
│   │   ├── Pruning.cpp
│   │   ├── CMakeLists.txt
│   │   ├── Regression.hpp
│   │   ├── LMachine.hpp
│   │   ├── Pruning.hpp
│   │   └── DCRIMI_.cpp
│   ├── tree
│   │   ├── GBRT.cpp
│   │   ├── BiSplit.cpp
│   │   ├── BiSplit.hpp
│   │   ├── GST_fno.hpp
│   │   ├── old
│   │   │   ├── GiFace.cpp
│   │   │   ├── RF_ConfiRegress.h
│   │   │   ├── RF_ConfiRegress.cpp
│   │   │   ├── RF_ShapeRegress.cpp
│   │   │   └── RF_ShapeRegress.h
│   │   ├── ManifoldTree.cpp
│   │   ├── ManifoldTree.hpp
│   │   ├── BoostingForest.cpp
│   │   ├── BoostingForest.hpp
│   │   ├── CMakeLists.txt
│   │   └── GBRT.hpp
│   ├── util
│   │   ├── GST_def.h
│   │   ├── BLAS_t.cpp
│   │   ├── BLAS_t.hpp
│   │   ├── FeatData.cpp
│   │   ├── FeatData.hpp
│   │   ├── GRander.hpp
│   │   ├── Object.hpp
│   │   ├── samp_set.hpp
│   │   ├── CMakeLists.txt
│   │   ├── Parallel_t.hpp
│   │   ├── Statistics_t.hpp
│   │   ├── GRander.cpp
│   │   ├── pcg_oneil
│   │   │   ├── pcg_basic.h
│   │   │   ├── xoshiro256starstar.c
│   │   │   ├── xoshiro256plusplus.c
│   │   │   └── pcg_basic.c
│   │   ├── PY_obj.hpp
│   │   ├── Float16.hpp
│   │   └── FastExpLog.c
│   ├── EDA
│   │   ├── SA_salp.cpp
│   │   ├── SA_salp.hpp
│   │   ├── Feat_Selection.cpp
│   │   ├── Feat_Selection.hpp
│   │   └── CMakeLists.txt
│   ├── python
│   │   ├── pyMORT_DLL.cpp
│   │   └── pyMORT_DLL.h
│   ├── include
│   │   └── LiteBOM_config.h
│   └── __version__.py
├── python-package
│   ├── .pytest_cache
│   │   └── v
│   │       └── cache
│   │           └── stepwise
│   ├── LiteMORT
│   │   ├── VERSION.txt
│   │   ├── LiteMORT_regression.py
│   │   ├── __version__.py
│   │   ├── __init__.py
│   │   ├── LiteMORT_time.py
│   │   ├── libpath.py
│   │   ├── compat.py
│   │   ├── LiteMORT_problems.py
│   │   ├── LiteMORT_hyppo.py
│   │   ├── LiteMORT_ERA.py
│   │   └── LiteMORT_EDA.py
│   ├── MANIFEST.in
│   ├── lgbm_importances.png
│   ├── README.md
│   ├── LICENSE
│   ├── mort_local.py
│   ├── setup.py
│   ├── case_poct.py
│   ├── case_higgs.py
│   ├── case_future_sales.py
│   ├── case_earthquake.py
│   ├── lgb_kim.py
│   ├── case_ieee_fraud.py
│   └── pycharm_test.py
├── doc
│   └── 基于二阶泛函优化的梯度提升算法.pptx
├── .idea
│   ├── libraries
│   │   └── R_User_Library.xml
│   ├── vcs.xml
│   ├── other.xml
│   ├── encodings.xml
│   ├── modules.xml
│   ├── misc.xml
│   └── LiteMORT.iml
├── case_future_sales.py
├── LICENSE
├── CMakeLists.txt
├── README.md
└── .gitignore
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.pytest_cache/v/cache/nodeids:
--------------------------------------------------------------------------------
1 | []
--------------------------------------------------------------------------------
/vs/LiteMORT/SA_salp.cpp:
--------------------------------------------------------------------------------
1 | class
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE
2 |
--------------------------------------------------------------------------------
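[editor's note] A brief aside on the MANIFEST.in entry above: in a setuptools project, MANIFEST.in names the extra files to bundle into a source distribution, so `include README.md LICENSE` ships both files alongside the code. A minimal sketch of checking that effect, assuming a conventional setup.py sitting next to MANIFEST.in (as in python-package/ further below):

import glob
import subprocess
import tarfile

# Build a source distribution; setuptools reads MANIFEST.in during this step.
subprocess.check_call(["python", "setup.py", "sdist"])

# Open the newest archive and confirm the extra files made it in.
archive = sorted(glob.glob("dist/*.tar.gz"))[-1]
with tarfile.open(archive) as tf:
    extras = [n for n in tf.getnames() if n.endswith(("README.md", "LICENSE"))]
print(extras)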
/src/data_fold/Cluster.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | -------------------------------------------------------------------------------- /python-package/.pytest_cache/v/cache/stepwise: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /python-package/LiteMORT/VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.1.0 2 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_regression.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /python-package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /tests/test_main.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include <catch.hpp> -------------------------------------------------------------------------------- /src/LiteMORT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/LiteMORT.cpp -------------------------------------------------------------------------------- /src/learn/LOSS.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/learn/LOSS.cpp -------------------------------------------------------------------------------- /src/tree/GBRT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/GBRT.cpp -------------------------------------------------------------------------------- /src/util/GST_def.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/GST_def.h -------------------------------------------------------------------------------- /src/EDA/SA_salp.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/SA_salp.cpp -------------------------------------------------------------------------------- /src/EDA/SA_salp.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/SA_salp.hpp -------------------------------------------------------------------------------- /src/data_fold/EDA.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/EDA.cpp -------------------------------------------------------------------------------- /src/data_fold/EDA.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/EDA.hpp -------------------------------------------------------------------------------- /src/learn/DCRIMI_.hpp:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/learn/DCRIMI_.hpp -------------------------------------------------------------------------------- /src/learn/Pruning.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/learn/Pruning.cpp -------------------------------------------------------------------------------- /src/tree/BiSplit.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BiSplit.cpp -------------------------------------------------------------------------------- /src/tree/BiSplit.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BiSplit.hpp -------------------------------------------------------------------------------- /src/tree/GST_fno.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/GST_fno.hpp -------------------------------------------------------------------------------- /src/util/BLAS_t.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/BLAS_t.cpp -------------------------------------------------------------------------------- /src/util/BLAS_t.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/BLAS_t.hpp -------------------------------------------------------------------------------- /src/util/FeatData.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/FeatData.cpp -------------------------------------------------------------------------------- /src/util/FeatData.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/FeatData.hpp -------------------------------------------------------------------------------- /src/util/GRander.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/GRander.hpp -------------------------------------------------------------------------------- /src/util/Object.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/Object.hpp -------------------------------------------------------------------------------- /src/util/samp_set.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/util/samp_set.hpp -------------------------------------------------------------------------------- /vs/LiteMORT/stdafx.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/stdafx.h -------------------------------------------------------------------------------- /src/data_fold/Loss.hpp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Loss.hpp -------------------------------------------------------------------------------- /src/data_fold/Move.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Move.hpp -------------------------------------------------------------------------------- /src/tree/old/GiFace.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/GiFace.cpp -------------------------------------------------------------------------------- /vs/LiteMORT/dllmain.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/dllmain.cpp -------------------------------------------------------------------------------- /vs/LiteMORT/stdafx.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/stdafx.cpp -------------------------------------------------------------------------------- /vs/LiteMORT/targetver.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/targetver.h -------------------------------------------------------------------------------- /doc/基于二阶泛函优化的梯度提升算法.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/doc/基于二阶泛函优化的梯度提升算法.pptx -------------------------------------------------------------------------------- /src/EDA/Feat_Selection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/Feat_Selection.cpp -------------------------------------------------------------------------------- /src/EDA/Feat_Selection.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/EDA/Feat_Selection.hpp -------------------------------------------------------------------------------- /src/data_fold/Binfold.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Binfold.cpp -------------------------------------------------------------------------------- /src/data_fold/Binfold.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Binfold.hpp -------------------------------------------------------------------------------- /src/data_fold/DataFold.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/DataFold.cpp -------------------------------------------------------------------------------- /src/data_fold/DataFold.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/DataFold.hpp -------------------------------------------------------------------------------- /src/python/pyMORT_DLL.cpp: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/python/pyMORT_DLL.cpp -------------------------------------------------------------------------------- /src/tree/ManifoldTree.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/ManifoldTree.cpp -------------------------------------------------------------------------------- /src/tree/ManifoldTree.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/ManifoldTree.hpp -------------------------------------------------------------------------------- /vs/LiteMORT/LiteMORT.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/vs/LiteMORT/LiteMORT.cpp -------------------------------------------------------------------------------- /src/data_fold/FeatVector.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVector.cpp -------------------------------------------------------------------------------- /src/data_fold/FeatVector.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVector.hpp -------------------------------------------------------------------------------- /src/data_fold/Histogram.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Histogram.cpp -------------------------------------------------------------------------------- /src/data_fold/Histogram.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Histogram.hpp -------------------------------------------------------------------------------- /src/include/LiteBOM_config.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/include/LiteBOM_config.h -------------------------------------------------------------------------------- /src/tree/BoostingForest.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BoostingForest.cpp -------------------------------------------------------------------------------- /src/tree/BoostingForest.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/BoostingForest.hpp -------------------------------------------------------------------------------- /src/data_fold/Distribution.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/Distribution.hpp -------------------------------------------------------------------------------- /src/data_fold/FeatVec_EXP.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVec_EXP.hpp 
-------------------------------------------------------------------------------- /src/tree/old/RF_ConfiRegress.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/RF_ConfiRegress.h -------------------------------------------------------------------------------- /python-package/LiteMORT/__version__.py: -------------------------------------------------------------------------------- 1 | 2 | VERSION = (0, 1, 18) 3 | 4 | __version__ = '.'.join(map(str, VERSION)) 5 | -------------------------------------------------------------------------------- /src/data_fold/FeatVec_Quanti.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/data_fold/FeatVec_Quanti.hpp -------------------------------------------------------------------------------- /src/tree/old/RF_ConfiRegress.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/RF_ConfiRegress.cpp -------------------------------------------------------------------------------- /src/tree/old/RF_ShapeRegress.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/src/tree/old/RF_ShapeRegress.cpp -------------------------------------------------------------------------------- /python-package/lgbm_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/closest-git/LiteMORT/HEAD/python-package/lgbm_importances.png -------------------------------------------------------------------------------- /.idea/libraries/R_User_Library.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /tests/test_math.cpp: -------------------------------------------------------------------------------- 1 | #include <catch.hpp> 2 | 3 | #include "math.hpp" 4 | 5 | TEST_CASE("Addition and subtraction") 6 | { 7 | REQUIRE(add(1, 1) == 2); 8 | REQUIRE(subtract(1, 1) == 0); 9 | } -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(util_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(util_ 9 | ./GRander.cpp 10 | ) -------------------------------------------------------------------------------- /python-package/README.md: -------------------------------------------------------------------------------- 1 | A fast gradient
boosting framework on manifolds (built from regression trees, classification trees, neural nets, ...). 2 | 3 | ##### 1) Faster than LightGBM with the same accuracy. 4 | 5 | ##### 2) sklearn-like API. 6 | 7 | ##### 3) Supports LightGBM's parameters. -------------------------------------------------------------------------------- /src/EDA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(eda_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(eda_ 9 | ./Feat_Selection.cpp 10 | ./SA_salp.cpp 11 | ) -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/learn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(learn_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(learn_ 9 | ./LOSS.cpp 10 | ./DCRIMI_.cpp 11 | ./Pruning.cpp 12 | ) -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /src/tree/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(tree_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(tree_ 9 | ./ManifoldTree.cpp 10 | ./GBRT.cpp 11 | ./BoostingForest.cpp 12 | ./BiSplit.cpp 13 | ) -------------------------------------------------------------------------------- /src/data_fold/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(data_fold_) 4 | 5 | # Create the main executable 6 | # add the header files also so that the IDE knows they exist 7 | # in the source tree 8 | add_library(data_fold_ 9 | ./Histogram.cpp 10 | ./EDA.cpp 11 | ./DataFold.cpp 12 | ./FeatVector.cpp 13 | ) -------------------------------------------------------------------------------- /tests/math_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import python_cpp_example 3 | 4 | class MainTest(unittest.TestCase): 5 | def test_add(self): 6 | self.assertEqual(python_cpp_example.add(1, 1), 2) 7 | 8 | def test_subtract(self): 9 | self.assertEqual(python_cpp_example.subtract(1, 1), 0) 10 | 11 | if __name__ == '__main__': 12 | unittest.main() -------------------------------------------------------------------------------- /src/__version__.py: -------------------------------------------------------------------------------- 1 | # 8b d8 Yb dP 88""Yb db dP""b8 88 dP db dP""b8 888888 2 | # 88b d88 YbdP 88__dP dPYb dP `" 88odP dPYb dP `" 88__ 3 | # 88YbdP88 8P 88""" dP__Yb Yb 88"Yb dP__Yb Yb "88 88"" 4 | # 88 YY 88 dP 88 dP""""Yb YboodP 88 Yb dP""""Yb YboodP 888888 5 | 6 | VERSION = (5, 2, 0) 7 | 8 | __version__ = '.'.join(map(str, VERSION))
9 | -------------------------------------------------------------------------------- /.pytest_cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | { 2 | "tests/python_package_test/test_basic.py::TestBasic::test": true, 3 | "tests/python_package_test/test_sklearn.py": true, 4 | "tests/python_package_test/test_sklearn.py::TestSklearn": true, 5 | "tests/python_package_test/test_sklearn.py::TestSklearn::test_binary": true, 6 | "tests/python_package_test/test_sklearn.py::TestSklearn::test_pandas_categorical": true 7 | } -------------------------------------------------------------------------------- /case_future_sales.py: -------------------------------------------------------------------------------- 1 | #https://www.kaggle.com/hukuda222/nfl-simple-model-using-lightgbm 2 | 3 | import os 4 | import pandas as pd 5 | from kaggle.competitions import nflrush 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn import preprocessing 9 | import matplotlib.pyplot as plt 10 | import random 11 | from sklearn.model_selection import KFold 12 | import lightgbm as lgb 13 | import gc 14 | import pickle 15 | import tqdm -------------------------------------------------------------------------------- /tests/cpp_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import os 4 | 5 | 6 | class MainTest(unittest.TestCase): 7 | def test_cpp(self): 8 | print("\n\nTesting C++ code...") 9 | subprocess.check_call(os.path.join(os.path.dirname( 10 | os.path.relpath(__file__)), 'bin', 'python_cpp_example_test')) 11 | print("\nResuming Python tests...\n") 12 | 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /src/data_fold/Imputer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "../util/GST_def.h" 7 | #include "../include/LiteBOM_config.h" 8 | 9 | using namespace std; 10 | 11 | namespace Grusoft { 12 | template <typename Tx> 13 | void Imputer_Fill_(LiteBOM_Config&config, size_t nSamp_, Tx *vec,double fill,int flag=0x0 ) { 14 | for (size_t i = 0; i -------------------------------------------------------------------------------- /src/util/Parallel_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <omp.h> 3 | 4 | #define OMP_FOR_func(lambda_func) \ 5 | for (int thread = 0; thread < num_threads; thread++) { \ 6 | size_t start = thread*step, end = MIN2(start + step, dim), i; \ 7 | for (i = start; i < end; i++) { {lambda_func;} } \ 8 | } 9 | 10 | namespace Grusoft{ 11 | inline int OMP_FOR_STATIC_1(const size_t nSamp, size_t& step,int min_size=64, int flag = 0x0) { 12 | int num_threads = 1; 13 | step = nSamp; 14 | if (nSamp > min_size) { 15 | #pragma omp parallel 16 | #pragma omp master 17 | { num_threads = omp_get_num_threads(); } 18 | step = (nSamp + num_threads - 1) / num_threads; 19 | } 20 | return num_threads; 21 | } 22 | } -------------------------------------------------------------------------------- /src/util/Statistics_t.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <chrono> 3 | #include 4 | 5 | #ifdef WIN32 6 | /*#include 7 | #define GST_NOW( ) (clock( )) 8 | #define GST_TIC(tick) clock_t tick=clock( ); 9 | #define GST_TOC(tick) ((clock()-(tick))*1.0f/CLOCKS_PER_SEC)*/ 10 | typedef std::chrono::high_resolution_clock Clock; 11 | #define GST_NOW( ) (Clock::now( )) 12 | #define GST_TIC(tick) auto tick = Clock::now( ); 13 | #define GST_TOC(tick) ( (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now( )-(tick)).count( ))/1000000.0)
14 | 15 | #else 16 | typedef std::chrono::high_resolution_clock Clock; 17 | #define GST_NOW( ) (Clock::now( )) 18 | #define GST_TIC(tick) auto tick = Clock::now( ); 19 | #define GST_TOC(tick) ( (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now( )-(tick)).count( ))/1000000.0) 20 | #endif -------------------------------------------------------------------------------- /src/data_fold/FeatVec_2D.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "./DataFold.hpp" 4 | 5 | namespace Grusoft { 6 | class HistoGRAM_2D : public HistoGRAM { 7 | protected: 8 | HistoGRAM *histoX = nullptr, *histoY = nullptr; 9 | public: 10 | HistoGRAM_2D(FeatVector*hFeat_, size_t nMost, int flag = 0x0) : HistoGRAM(hFeat_,nMost, flag) { 11 | } 12 | virtual ~HistoGRAM_2D() { 13 | ; 14 | } 15 | 16 | virtual void GreedySplit_X(const FeatsOnFold *hData_, const SAMP_SET& samp_set, int flag = 0x0); 17 | }; 18 | 19 | class FeatVec_2D : public FeatVec_T { 20 | protected: 21 | FeatVec_T *featX = nullptr, *featY = nullptr; 22 | public: 23 | FeatVec_2D(FeatsOnFold *hData_, int id_, const FeatVec_T *fX, const FeatVec_T *fY, size_t nMostDup, int flag = 0x0); 24 | virtual ~FeatVec_2D() { 25 | } 26 | 27 | 28 | 29 | }; 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /python-package/LiteMORT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """LiteMORT, Light Gradient Boosting Machine. 3 | 4 | __author__ = 'Yingshi Chen' 5 | """ 6 | from __future__ import absolute_import 7 | import os 8 | 9 | #from .LiteMORT_problems import Mort_Problems 10 | from .__version__ import __version__ 11 | from .LiteMORT import LiteMORT,LiteMORT_profile 12 | from .LiteMORT_preprocess import Mort_Preprocess,Mort_PickSamples 13 | from .LiteMORT_hyppo import MORT_feat_select_ 14 | ''' 15 | try: 16 | except ImportError: 17 | pass 18 | ''' 19 | 20 | ''' 21 | try: 22 | from .plotting import plot_importance, plot_metric, plot_tree, create_tree_digraph 23 | except ImportError: 24 | pass 25 | ''' 26 | 27 | dir_path = os.path.dirname(os.path.realpath(__file__)) 28 | #print(f"__init_ dir_path={dir_path}") 29 | 30 | __all__ = ['LiteMORT','LiteMORT_profile','Mort_Preprocess','Mort_PickSamples','MORT_feat_select_'] 31 | 32 | 33 | -------------------------------------------------------------------------------- /.idea/LiteMORT.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_time.py: -------------------------------------------------------------------------------- 1 | 2 | import sklearn.base as skbase 3 | import numpy as np 4 | 5 | #https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113784#latest-656376 6 | class DatetimeConvertCyclical(skbase.BaseEstimator, skbase.TransformerMixin): 7 | def __init__(self): 8 | self.time_periods = {'second': 24 * 60 * 60, 9 | 'minute': 24 * 60, 10 | 'hour': 24, 11 | 'day': 30, 12 | 'dayofweek': 7, 13 | 'month': 12} 14 | 15 | def fit(self, X, y=None): 16 | return self 17 | 18 | def transform(self, X): 19 | for period, value in self.time_periods.items(): 20 | X[period] = getattr(X['timestamp'].dt, period) 21 | 22 | X['sin_' + period] = np.sin(2 * np.pi * X[period] / value)
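# [editor's note] The sin/cos pair here is the standard cyclical encoding: a periodic value t with
# period P is mapped onto the unit circle as (sin(2*pi*t/P), cos(2*pi*t/P)), so the two ends of a
# cycle stay adjacent. For example, hour=23 with P=24 encodes to roughly (-0.26, 0.97), right next
# to hour=0 at (0.0, 1.0), whereas the raw linear encoding would place them 23 units apart.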
23 | X['cos_' + period] = np.cos(2 * np.pi * X[period] / value) 24 | 25 | X.drop(str(period), axis=1, inplace=True) 26 | 27 | return X -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /python-package/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /src/util/GRander.cpp: -------------------------------------------------------------------------------- 1 | #include "samp_set.hpp" 2 | #include "GRander.hpp" 3 | using namespace Grusoft; 4 | 5 | extern "C" uint64_t xoroshiro_next(void); 6 | 7 | uint64_t GRander::RandRersResrResdra() { // Combined period = 2^116.23 8 | int alg = 2; 9 | switch (alg) { 10 | case 0: 11 | break; //return pcg32_random_r(&rng_neil); //32-bit unsigned int - period: 2^64 12 | case 1: 13 | return xoroshiro_next(); 14 | default: 15 | xx = rotl(xx, 8) - rotl(xx, 29); //RERS, period = 4758085248529 (prime) 16 | yy = rotl(yy, 21) - yy; yy = rotl(yy, 20); //RESR, period = 3841428396121 (prime) 17 | zz = rotl(zz, 42) - zz; zz = zz + rotl(zz, 14); //RESDRA, period = 5345004409 (prime) 18 | return xx ^ yy ^ zz; 19 | } 20 | return 0; 21 | } 22 | 23 | /* 24 | DIST_RangeN::DIST_RangeN(int seed, double a0, double a1) : 25 | GRander(seed), rMin(a0), rMax(a1) { 26 | std::normal_distribution<> d1((rMax+rMin)/2,(rMax-rMin)/6); 27 | d=d1; 28 | } 29 | 30 | double DIST_RangeN::gen(){ 31 | double a; 32 | do{ 33 | a = d(g); 34 | } while (arMax); 35 | return (a); 36 | }*/ -------------------------------------------------------------------------------- /python-package/mort_local.py: -------------------------------------------------------------------------------- 1 | import litemort 2 | from litemort import * 3 | print(litemort.__version__) 4 | 5 | early_stop = 20 6 | verbose_eval = 5 7 | metric = 'l2' 8 | #num_rounds=1000, lr=0.05, bf=0.3 9 | num_rounds = 1000; lr = 0.05; bf = 0.3 10 | params = {'num_leaves': 31, 'n_estimators': num_rounds, 11 | 'objective': 'regression', 12 | 'max_bin': 256, 13 | # 'max_depth': -1, 14 | 'learning_rate': lr, 15 | "boosting": "gbdt", 16 | "bagging_freq": 5, 17 | "bagging_fraction": bf, 18 | "feature_fraction": 0.9, # STRANGE GBDT why("bagging_freq": 5 "feature_fraction": 0.9)!!! 
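# [editor's note] A brief gloss on the LightGBM-style knobs above, for readers tuning this dict:
# "bagging_freq": 5 together with "bagging_fraction": 0.3 makes the booster re-sample 30% of the
# training rows every 5 iterations, while "feature_fraction": 0.9 hands each tree a random 90% of
# the columns. The two mechanisms are independent, which is why they are usually tuned together.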
19 | "metric": metric, "verbose_eval": verbose_eval, 'n_jobs': 8, "elitism": 0,"debug":'1', 20 | "early_stopping_rounds": early_stop, "adaptive": 'weight1', 'verbose': 0, 'min_data_in_leaf': 20, 21 | # "verbosity": -1, 22 | # 'reg_alpha': 0.1, 23 | # 'reg_lambda': 0.3 24 | } 25 | mort=LiteMORT(params) -------------------------------------------------------------------------------- /vs/LiteMORT/ReadMe.txt: -------------------------------------------------------------------------------- 1 | ======================================================================== 2 | 动态链接库:LiteMORT 项目概述 3 | ======================================================================== 4 | 5 | 应用程序向导已为您创建了此 LiteMORT DLL。 6 | 7 | 本文件概要介绍组成 LiteMORT 应用程序的每个文件的内容。 8 | 9 | 10 | LiteMORT.vcxproj 11 | 这是使用应用程序向导生成的 VC++ 项目的主项目文件,其中包含生成该文件的 Visual C++ 的版本信息,以及有关使用应用程序向导选择的平台、配置和项目功能的信息。 12 | 13 | LiteMORT.vcxproj.filters 14 | 这是使用“应用程序向导”生成的 VC++ 项目筛选器文件。它包含有关项目文件与筛选器之间的关联信息。在 IDE 中,通过这种关联,在特定节点下以分组形式显示具有相似扩展名的文件。例如,“.cpp”文件与“源文件”筛选器关联。 15 | 16 | LiteMORT.cpp 17 | 这是主 DLL 源文件。 18 | 19 | 此 DLL 在创建时不导出任何符号。因此,生成时不会产生 .lib 文件。如果希望此项目成为其他某个项目的项目依赖项,则需要添加代码以从 DLL 导出某些符号,以便产生一个导出库,或者,也可以在项目“属性页”对话框中的“链接器”文件夹中,将“常规”属性页上的“忽略输入库”属性设置为“是”。 20 | 21 | ///////////////////////////////////////////////////////////////////////////// 22 | 其他标准文件: 23 | 24 | StdAfx.h, StdAfx.cpp 25 | 这些文件用于生成名为 LiteMORT.pch 的预编译头 (PCH) 文件和名为 StdAfx.obj 的预编译类型文件。 26 | 27 | ///////////////////////////////////////////////////////////////////////////// 28 | 其他注释: 29 | 30 | 应用程序向导使用“TODO:”注释来指示应添加或自定义的源代码部分。 31 | 32 | ///////////////////////////////////////////////////////////////////////////// 33 | -------------------------------------------------------------------------------- /src/data_fold/Representive.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | using namespace std; 11 | #include "./FeatVector.hpp" 12 | 13 | #ifdef WIN32 14 | #include 15 | #include 16 | #else 17 | #include 18 | //#define assert(cond) 19 | #endif 20 | 21 | 22 | namespace Grusoft { 23 | class MT_BiSplit; 24 | 25 | class FeatsOnFold; 26 | class Distribution; 27 | 28 | struct FeatPresent { 29 | FeatVector *hFeat = nullptr; 30 | float T_min = 5; 31 | FeatPresent(FeatVector *hF, float T_,int flag=0x0) : hFeat(hF),T_min(T_) { 32 | 33 | } 34 | 35 | }; 36 | 37 | class Representive { 38 | vector arrPFeat; 39 | public: 40 | Representive() { 41 | 42 | } 43 | virtual ~Representive() { 44 | for (auto pf : arrPFeat) 45 | delete pf; 46 | arrPFeat.clear(); 47 | } 48 | void Append(FeatVector *hF, float T_, int flag = 0x0) { 49 | arrPFeat.push_back(new FeatPresent(hF, T_)); 50 | } 51 | bool isValid(const MT_BiSplit *hNode,int flag=0x0); 52 | void dump(int flag=0x0); 53 | }; 54 | 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/learn/Regression.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace Grusoft { 4 | class Regression { 5 | double _slope, _yInt; 6 | public: 7 | Regression(string alg,int flag=0x0) { 8 | 9 | } 10 | 11 | /* 12 | https://web.archive.org/web/20150715022401/http://faculty.cs.niu.edu/~hutchins/csci230/best-fit.htm 13 | Y = Slope * X + YInt 14 | */ 15 | template 16 | bool Fit(size_t nSamp, tpSAMP_ID *samps,Tx *arrX, Ty *arrY, int flag = 0x0) { 17 | assert (nSamp >= 2) ; 18 | double sumX = 0, sumY = 0, sumXY = 0, 
sumX2 = 0; 19 | Tx x,y; 20 | tpSAMP_ID samp; 21 | for (int i = 0; i 41 | Tx At(Tx x, int flag = 0x0) { 42 | double y = _slope*x+ _yInt; 43 | return (Tx)(y); 44 | } 45 | }; 46 | 47 | 48 | } -------------------------------------------------------------------------------- /vs/LiteMORT/LiteMORT.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LiteMORT", "LiteMORT.vcxproj", "{668D61FD-5B48-4AFF-A9C8-3680CA9A0147}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x64.ActiveCfg = Debug|x64 17 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x64.Build.0 = Debug|x64 18 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x86.ActiveCfg = Debug|Win32 19 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Debug|x86.Build.0 = Debug|Win32 20 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x64.ActiveCfg = Release|x64 21 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x64.Build.0 = Release|x64 22 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x86.ActiveCfg = Release|Win32 23 | {668D61FD-5B48-4AFF-A9C8-3680CA9A0147}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /python-package/LiteMORT/libpath.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Find the path to LiteMORT dynamic library files.""" 3 | import os 4 | 5 | from platform import system 6 | 7 | 8 | def find_lib_path(): 9 | """Find the path to LiteMORT library files. 
10 | Returns 11 | ------- 12 | lib_path: list(string) 13 | List of all found library path to LiteMORT 14 | """ 15 | curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 16 | dll_path = [curr_path, os.path.join(curr_path, '../../'), 17 | os.path.join(curr_path, 'compile'), 18 | os.path.join(curr_path, '../compile'), 19 | os.path.join(curr_path, '../../lib/')] 20 | if system() in ('Windows', 'Microsoft'): 21 | dll_path.append(os.path.join(curr_path, '../compile/Release/')) 22 | dll_path.append(os.path.join(curr_path, '../compile/windows/x64/DLL/')) 23 | dll_path.append(os.path.join(curr_path, '../../Release/')) 24 | dll_path.append(os.path.join(curr_path, '../../windows/x64/DLL/')) 25 | dll_path = [os.path.join(p, 'LiteMORT.dll') for p in dll_path] 26 | else: 27 | dll_path = [os.path.join(p, 'libLiteMORT.so') for p in dll_path] 28 | lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] 29 | if not lib_path: 30 | dll_path = [os.path.realpath(p) for p in dll_path] 31 | raise Exception('Cannot find LiteMORT library in following paths: ' + '\n'.join(dll_path)) 32 | 33 | return lib_path 34 | -------------------------------------------------------------------------------- /src/learn/LMachine.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | namespace Grusoft{ 10 | class LMachine { 11 | public: 12 | typedef std::mt19937* hRANDER; //(pseudo) random generator 13 | typedef enum{ 14 | CLASIFY,REGRESSION 15 | }MODEL; 16 | enum{ //constant 17 | SAMPL_OOB=100,SAMPL_InB, 18 | RAND_REINIT=9991, 19 | }; 20 | struct SKDU{ //Learn Schdule 21 | //each cascade contain nSteps.each step contain 1 or n trees 22 | int cascad,step,nStep,noT,nTree,noLeaf; 23 | bool isLastStep( ) { return step==nStep-1; } 24 | bool isLastTree( ) { return noT==nTree-1; } 25 | LMachine* hMachine; 26 | float rBase,rMax,rMin,gamma,lr; 27 | SKDU( ):cascad(0),step(0),nStep(0),nTree(0),noT(-1),noLeaf(-1){} 28 | }; 29 | 30 | 31 | struct CASE{ 32 | float label,predict; //for classification and 1-var regression 33 | int nBag; 34 | CASE( ):nBag(0),label(0.0),predict(0.0) { ; } 35 | virtual ~CASE( ) {;} 36 | }; 37 | typedef vector CASEs; 38 | CASEs SamplSet; 39 | 40 | protected: 41 | bool isDumpLeaf; 42 | hRANDER hRander; 43 | MODEL model; 44 | int nThread; 45 | SKDU skdu; 46 | void *user_data; 47 | 48 | double impurity,sBalance,eOOB,eInB; 49 | int nFeat,nClass,nPickWeak; 50 | vectorFeatNames; 51 | //vector arrDat; 52 | public: 53 | string name; 54 | 55 | hRANDER InitRander( unsigned seed ); 56 | virtual void Clear( ); 57 | 58 | }; 59 | } 60 | -------------------------------------------------------------------------------- /src/tree/GBRT.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "BoostingForest.hpp" 8 | #include "../data_fold/Representive.hpp" 9 | using namespace std; 10 | 11 | namespace Grusoft{ 12 | class FeatsOnFold; 13 | /* 14 | residual boosting mayber better 15 | */ 16 | class GBRT : public BoostingForest { 17 | string sPre; 18 | //random_state 19 | protected: 20 | double shrinkage=0.1; 21 | double nzWeak; 22 | bool isCalcErr; 23 | int nBlitThread; 24 | virtual bool GetFeatDistri(WeakLearner *hWeak, float *distri = nullptr, int flag = 0x0); 25 | //virtual bool LeafModel(WeakLearner *hWeak, int flag = 0x0); 26 | //virtual void 
UpdateFeat(int flag); 27 | //virtual void BlitSamps(WeakLearner *hWeak, SAMPs &fnL, SAMPs &fnR, int flag = 0x0); 28 | //virtual hBLIT GetBlit(WeakLearner *hWeak, int flag = 0x0); 29 | virtual void GetYDistri(WeakLearner *hWeak, float *distri = nullptr, int flag = 0x0); 30 | //virtual void Confi_Impuri(WeakLearner *hWeak, int flag); 31 | virtual void AfterTrain(FeatsOnFold *hData, int cas, int nMulti, int flag = 0x0); 32 | public: 33 | 34 | tpDOWN mOff, mSum; 35 | 36 | typedef enum { 37 | SINGLE_TREE, MULTI_TREE 38 | }REGULAR; 39 | REGULAR regular = SINGLE_TREE; 40 | arrPFNO Tests; 41 | //double eta, lenda; 42 | 43 | typedef enum { 44 | BT_ALL, BT_MAX_ERR, BT_MIN_ERR, BT_RANDOM_3 45 | }BOOT; 46 | BOOT boot; 47 | int rounds, dup, nOOB, no; 48 | 49 | GBRT(FeatsOnFold *hTrain, FeatsOnFold *hEval, double sOOB, MODEL mo_, int nTree, int flag = 0x0); 50 | virtual ~GBRT() { 51 | } 52 | const LiteBOM_Config& Config() const { 53 | return hTrainData->config; 54 | } 55 | virtual void BeforeTrain(FeatsOnFold *hData, int flag = 0x0); 56 | virtual int Train(string sTitle, int cas, int flag = 0x0); 57 | virtual int Prune(int flag = 0x0); 58 | virtual int IterTrain(int round,int flag); 59 | virtual double Predict(FeatsOnFold *hData,bool updateStopping=false,bool checkLossy=false,bool resumeLast=false, int flag=0x0); 60 | virtual int Test(string sTitle, BoostingForest::CASEs& TestSet, int nCls, int flag); 61 | virtual bool isPassNode(FeatsOnFold *hData_, hMTNode hNode, int flag = 0x0); 62 | 63 | 64 | }; 65 | } 66 | -------------------------------------------------------------------------------- /src/python/pyMORT_DLL.h: -------------------------------------------------------------------------------- 1 | #if (defined _WINDOWS) || (defined WIN32) 2 | #ifdef PYMORT_DLL_EXPORTS 3 | #define PYMORT_DLL_API __declspec(dllexport) 4 | #else 5 | #define PYMORT_DLL_API __declspec(dllimport) 6 | #endif 7 | #else 8 | #define PYMORT_DLL_API 9 | #endif 10 | 11 | #include "../util/PY_obj.hpp" 12 | 13 | #define __API_BEGIN__() try { 14 | 15 | #define __API_END__() } \ 16 | catch(std::exception& ex) { return (ex); } \ 17 | catch(std::string& ex) { return (ex); } \ 18 | catch(...) 
{ return ("unknown exception"); } \ 19 | return 0; 20 | 21 | 22 | struct PY_ITEM { 23 | char *Keys; 24 | float Values; 25 | char *text; 26 | void *arr; 27 | }; 28 | 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif 33 | 34 | PYMORT_DLL_API void* LiteMORT_init(PY_ITEM* params, int nParam, PY_DATASET_LIST *merge_list,int64_t flag); 35 | PYMORT_DLL_API void LiteMORT_clear(void*); 36 | 37 | PYMORT_DLL_API void LiteMORT_set_mergesets(void *, PY_DATASET_LIST *train, int64_t flag); 38 | 39 | //PYMORT_DLL_API void LiteMORT_set_feat(PY_ITEM* params, int nParam, int flag); 40 | PYMORT_DLL_API void LiteMORT_fit(void *,float *h_data, tpY *h_target, size_t nSamp, size_t ldS, float *eval_data, tpY *eval_target, size_t nEval, size_t flag); 41 | PYMORT_DLL_API void LiteMORT_predict(void *,float *X, tpY *y, size_t nFeat_0, size_t nSamp, size_t flag); 42 | PYMORT_DLL_API void LiteMORT_Imputer_f(float *X, tpY *y, size_t nFeat_0, size_t nSamp, size_t flag); 43 | PYMORT_DLL_API void LiteMORT_Imputer_d(double *X, tpY *y, size_t nFeat_0, size_t nSamp, size_t flag); 44 | //PYMORT_DLL_API void LiteMORT_EDA(void *, const float *X, const tpY *y, const size_t nFeat_0, const size_t nn, const size_t nValid, 45 | // PY_ITEM* params, int nParam, const size_t flag); 46 | 47 | //PYMORT_DLL_API void LiteMORT_fit_1(void *, PY_COLUMN *train, PY_COLUMN *target, size_t nSamp, size_t nFeat_0, PY_COLUMN *eval, PY_COLUMN *eval_target, size_t nEval, size_t flag); 48 | PYMORT_DLL_API void LiteMORT_fit_1(void *, PY_DATASET_LIST *train, PY_DATASET_LIST *eval, size_t flag); 49 | //PYMORT_DLL_API void LiteMORT_predict_1(void *, PY_COLUMN *X, PY_COLUMN *y, size_t nFeat_0,size_t nSamp, size_t flag); 50 | PYMORT_DLL_API void LiteMORT_predict_1(void *, PY_DATASET_LIST*predict, size_t flag); 51 | PYMORT_DLL_API void cpp_test(void *, PY_DATASET*dat); 52 | 53 | 54 | #ifdef __cplusplus 55 | } 56 | #endif 57 | -------------------------------------------------------------------------------- /src/learn/Pruning.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../util/samp_set.hpp" 8 | 9 | 10 | namespace Grusoft{ 11 | typedef double tpMetricU; 12 | class ManifoldTree; 13 | class FeatsOnFold; 14 | class EnsemblePruning{ 15 | double *orth=nullptr; 16 | double *gamma = nullptr; 17 | int num_orth=0,ldOrth=0; 18 | protected: 19 | static bool isDebug,isRand; 20 | FeatsOnFold *hFold = nullptr; 21 | BoostingForest *hBoost = nullptr; 22 | 23 | //|wx|=1 |wy|=1 24 | tpMetricU *mA = nullptr, *ax_=nullptr; 25 | tpMetricU *mB = nullptr, *wy=nullptr; //live section of (A,x) 26 | int ldA_; //row_major 27 | int nLive = 0, nLive_0 = 0; 28 | int *wasSmall=nullptr, *isLive = nullptr, *wasLive=nullptr,*y2x=nullptr; 29 | std::vector sorted_indices; 30 | int nSparsified() { 31 | int nPick, i; 32 | for (nPick = 0, i = 0; i < nWeak; i++) { 33 | if (wx[i] > 0) 34 | nPick++; 35 | } 36 | return nPick; 37 | } 38 | //is_live=abs_x < 1.0-delta; mB = mA[:,is_live]; wy = wx[is_live] 39 | int SubOnLive(double delta,bool update_orth,double *v_0,double *v_sub,int flag); 40 | 41 | void ToCSV(const string& sPath, int flag); 42 | void LoadCSV(const string& sPath, int flag); 43 | double UpateGamma(int *isLive, int nY,int flag = 0x0); 44 | bool partial_infty_color(int nX,bool balance, int flag = 0x0); 45 | void sorted_ax(int flag=0x0); 46 | void make_orthogonal(tpMetricU *b, int ldB, int &nRun, int nMost, int nLive_0, int *isSmall, int flag=0x0); 47 | void 
basic_local_search(double *,bool balanced = false, int flag = 0x0); 48 | void local_improvements(double *, bool balanced = false, int flag = 0x0); 49 | void greedy(double*,bool balanced = false, int flag = 0x0); 50 | void round_coloring(bool balanced = false, int flag=0x0); 51 | virtual void Prepare(int flag = 0x0); 52 | public: 53 | size_t nSamp = 0, nWeak = 0, nMostWeak = 0; 54 | int nPruneOperation = 0; 55 | tpMetricU *init_score = nullptr; 56 | std::vectorforest; 57 | double *plus_minus = nullptr; 58 | //combination coefficient 59 | tpMetricU *cc_0 = nullptr, *cc_1 = nullptr,cc_0_sum=0, *wx = nullptr; 60 | 61 | EnsemblePruning(BoostingForest *hBoost,FeatsOnFold *hFold, int nWeak_,int flag=0x0); 62 | virtual ~EnsemblePruning(); 63 | virtual bool isValid() { return true; } 64 | 65 | virtual void Reset4Pick(int flag); 66 | virtual bool Pick(int nWeak_,int isToCSV, int flag); 67 | virtual bool Compare( int flag); 68 | 69 | virtual void OnStep(ManifoldTree *hTree, tpDOWN*down, int flag = 0x0); 70 | }; 71 | 72 | }; 73 | 74 | 75 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | SET(PROJECT_NAME LiteMORT) 3 | PROJECT(${PROJECT_NAME} LANGUAGES CXX) 4 | message("Hello, It's ${PROJECT_NAME} by CYS!") 5 | cmake_minimum_required(VERSION 3.2 FATAL_ERROR) 6 | set (CMAKE_CXX_STANDARD 11) 7 | 8 | OPTION(USE_OPENMP "Enable OpenMP" ON) 9 | 10 | if(USE_OPENMP) 11 | find_package(OpenMP REQUIRED) 12 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 13 | else() 14 | # Ignore unknown #pragma warning 15 | if( (CMAKE_CXX_COMPILER_ID MATCHES "[cC][lL][aA][nN][gG]") 16 | OR (CMAKE_CXX_COMPILER_ID MATCHES "[gG][nN][uU]")) 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") 18 | endif() 19 | endif(USE_OPENMP) 20 | 21 | if(MSVC) 22 | if(MSVC_VERSION LESS 1900) 23 | message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. 
Please use a newer MSVC.") 24 | endif() 25 | 26 | SET(variables 27 | CMAKE_C_FLAGS_DEBUG 28 | CMAKE_C_FLAGS_MINSIZEREL 29 | CMAKE_C_FLAGS_RELEASE 30 | CMAKE_C_FLAGS_RELWITHDEBINFO 31 | CMAKE_CXX_FLAGS_DEBUG 32 | CMAKE_CXX_FLAGS_MINSIZEREL 33 | CMAKE_CXX_FLAGS_RELEASE 34 | CMAKE_CXX_FLAGS_RELWITHDEBINFO 35 | ) 36 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /O2 /Ob2 /Oi /Ot /Oy /GL") 37 | else() 38 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") 39 | endif() 40 | 41 | 42 | set (cLIB "../../lib/") 43 | #set (PYTHON_INC "../../lib/") 44 | SET(some_COMPILE_FLAGS "-static -std=c++11 -pthread -O3 -I${cLIB}" ) 45 | 46 | SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${some_COMPILE_FLAGS}") 47 | 48 | SET(SOURCE_DIR "src") 49 | # Tell cmake that headers are in alse in source_dir 50 | include_directories(${SOURCE_DIR}) 51 | SET(SOURCE_FILES ${SOURCE_DIR}/LiteMORT.cpp ${SOURCE_DIR}/python/pyMORT_DLL.cpp) 52 | add_subdirectory(${SOURCE_DIR}/data_fold) 53 | add_subdirectory(${SOURCE_DIR}/tree) 54 | add_subdirectory(${SOURCE_DIR}/util) 55 | add_subdirectory(${SOURCE_DIR}/learn) 56 | add_subdirectory(${SOURCE_DIR}/EDA) 57 | 58 | #add_executable("${PROJECT_NAME}" ${SOURCE_FILES}) 59 | ADD_LIBRARY("${PROJECT_NAME}" SHARED ${SOURCE_FILES}) 60 | target_link_libraries(${PROJECT_NAME} data_fold_ tree_ util_ learn_ eda_) 61 | # SET(TEST_DIR "tests") 62 | # SET(TESTS ${SOURCES} 63 | # "${TEST_DIR}/test_main.cpp" 64 | # "${TEST_DIR}/test_math.cpp") 65 | 66 | # Generate a test executable 67 | # include_directories(lib/catch/include) 68 | # add_executable("${PROJECT_NAME}_test" ${SOURCE_FILES}) 69 | 70 | 71 | # Generate python module 72 | # add_subdirectory(lib/pybind11) 73 | # pybind11_add_module(python_cpp_example ${SOURCES} "${SOURCE_DIR}/bindings.cpp") 74 | -------------------------------------------------------------------------------- /tests/python_package_test/test_basic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import os 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgb 8 | #from litemort import LiteMORT 9 | import numpy as np 10 | from sklearn.datasets import load_breast_cancer, dump_svmlight_file 11 | from sklearn.model_selection import train_test_split 12 | 13 | 14 | class TestBasic(unittest.TestCase): 15 | 16 | def test(self): 17 | X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2) 18 | train_data = lgb.Dataset(X_train, label=y_train) 19 | valid_data = train_data.create_valid(X_test, label=y_test) 20 | 21 | params = { 22 | "objective": "binary", 23 | "metric": "auc", 24 | "min_data": 10, 25 | "num_leaves": 15, 26 | "verbose": -1, 27 | "num_threads": 1, 28 | "max_bin": 255 29 | } 30 | bst = lgb.Booster(params, train_data) 31 | bst.add_valid(valid_data, "valid_1") 32 | 33 | for i in range(30): 34 | bst.update() 35 | if i % 10 == 0: 36 | print(bst.eval_train(), bst.eval_valid()) 37 | #bst.save_model("model.txt") 38 | pred_from_matr = bst.predict(X_test) 39 | with tempfile.NamedTemporaryFile() as f: 40 | tname = f.name 41 | with open(tname, "w+b") as f: 42 | dump_svmlight_file(X_test, y_test, f) 43 | pred_from_file = bst.predict(tname) 44 | os.remove(tname) 45 | self.assertEqual(len(pred_from_matr), len(pred_from_file)) 46 | for preds in zip(pred_from_matr, pred_from_file): 47 | self.assertAlmostEqual(*preds, places=15) 48 | 49 | # check saved model persistence 50 | bst = lgb.Booster(params, model_file="model.txt") 51 | pred_from_model_file = 
bst.predict(X_test) 52 | self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) 53 | for preds in zip(pred_from_matr, pred_from_model_file): 54 | # we need to check the consistency of model file here, so test for exact equal 55 | self.assertEqual(*preds) 56 | 57 | # check early stopping is working. Make it stop very early, so the scores should be very close to zero 58 | pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} 59 | pred_early_stopping = bst.predict(X_test, **pred_parameter) 60 | self.assertEqual(len(pred_from_matr), len(pred_early_stopping)) 61 | for preds in zip(pred_early_stopping, pred_from_matr): 62 | # scores likely to be different, but prediction should still be the same 63 | self.assertEqual(preds[0] > 0, preds[1] > 0) 64 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/pcg_basic.h: -------------------------------------------------------------------------------- 1 | /* 2 | * PCG Random Number Generation for C. 3 | * 4 | * Copyright 2014 Melissa O'Neill 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * For additional information about the PCG random number generation scheme, 19 | * including its license and other licensing options, visit 20 | * 21 | * http://www.pcg-random.org 22 | */ 23 | 24 | /* 25 | * This code is derived from the full C implementation, which is in turn 26 | * derived from the canonical C++ PCG implementation. The C++ version 27 | * has many additional features and is preferable if you can use C++ in 28 | * your project. 29 | */ 30 | 31 | #ifndef PCG_BASIC_H_INCLUDED 32 | #define PCG_BASIC_H_INCLUDED 1 33 | 34 | #include <stdint.h> 35 | 36 | #if __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | struct pcg_state_setseq_64 { // Internals are *Private*. 41 | uint64_t state; // RNG state. All values are possible. 42 | uint64_t inc; // Controls which RNG sequence (stream) is 43 | // selected. Must *always* be odd. 44 | }; 45 | typedef struct pcg_state_setseq_64 pcg32_random_t; 46 | 47 | // If you *must* statically initialize it, here's one. 48 | 49 | #define PCG32_INITIALIZER { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL } 50 | 51 | // pcg32_srandom(initstate, initseq) 52 | // pcg32_srandom_r(rng, initstate, initseq): 53 | // Seed the rng. Specified in two parts, state initializer and a 54 | // sequence selection constant (a.k.a.
stream id) 55 | 56 | void pcg32_srandom(uint64_t initstate, uint64_t initseq); 57 | void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, 58 | uint64_t initseq); 59 | 60 | // pcg32_random() 61 | // pcg32_random_r(rng) 62 | // Generate a uniformly distributed 32-bit random number 63 | 64 | uint32_t pcg32_random(void); 65 | uint32_t pcg32_random_r(pcg32_random_t* rng); 66 | 67 | // pcg32_boundedrand(bound): 68 | // pcg32_boundedrand_r(rng, bound): 69 | // Generate a uniformly distributed number, r, where 0 <= r < bound 70 | 71 | uint32_t pcg32_boundedrand(uint32_t bound); 72 | uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound); 73 | 74 | #if __cplusplus 75 | } 76 | #endif 77 | 78 | #endif // PCG_BASIC_H_INCLUDED 79 | -------------------------------------------------------------------------------- /src/tree/old/RF_ShapeRegress.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include ".\GruST\learn\DecisionTree.hpp" 4 | #include ".\GruST\image\BMPfold.hpp" 5 | 6 | namespace Grusoft{ 7 | 8 | class RF_ShapeRegress : public RandomForest{ 9 | string sPre; 10 | char sLine[1000]; 11 | 12 | protected: 13 | bool isToCPP; 14 | FILE *fpC,*fpD,*fpT; 15 | 16 | float *cand_dis; 17 | arrPFNO featLines; 18 | void RandCandidate( int nCand,ShapeBMPfold &mean,ShapeBMPfold::PTFs &cands,int flag=0x0 ); 19 | void RandCandidate_2( int nCand,ShapeBMPfold &mean,ShapeBMPfold::PTFs &cands,int flag=0x0 ); 20 | virtual hBLIT GetBlit( WeakLearner *hWeak,int flag=0x0 ); 21 | virtual void BlitSamps( WeakLearner *hWeak,SAMPs &fnL,SAMPs &fnR,int flag=0x0 ); 22 | //virtual void ToCPP(WeakLearner *hWeak,int flag=0x0); 23 | virtual void Confi_Impuri( WeakLearner *hWeak,int flag ); 24 | virtual bool GetFeatDistri( WeakLearner *hWeak,float *distri=nullptr,int flag=0x0 ); 25 | //bool Confi_Regress( WeakLearner *hWeak,int flag ); 26 | virtual bool LeafModel( WeakLearner *hWeak,int flag=0x0 ); 27 | virtual double ErrorAt( arrPFNO& samps ); 28 | virtual void BootSample( DecisionTree *hTree,arrPFNO &boot,arrPFNO &oob,FeatData *hDat,int flag=0x0 ); 29 | void FeatLineBmp( string sPath,int flag=0x0 ); 30 | virtual int nPickAtSplit( WeakLearner *hWeak ){ 31 | return nPickWeak; 32 | } 33 | virtual void DumpTree( int nox,DecisionTree *hTree,int flag=0x0 ); 34 | virtual void UpdateFeat( int flag=0x0 ); 35 | virtual void OnMultiTree( int cas,int nMulti,int flag=0x0 ); 36 | virtual void AfterTrain( int cas,int nMulti,int flag=0x0 ); 37 | 38 | ShapeBMPfold& spMean; 39 | ShapeBMPfold::PTFs cands,*arrPTF; 40 | double nzWeak; 41 | bool isCalcErr; 42 | int nBlitThread; 43 | public: 44 | typedef enum{ 45 | SINGLE_TREE,MULTI_TREE 46 | }REGULAR; 47 | REGULAR regular; 48 | 49 | typedef enum{ 50 | BT_ALL,BT_MAX_ERR,BT_MIN_ERR,BT_RANDOM_3 51 | }BOOT; 52 | BOOT boot; 53 | ShapeBMPfold::PT_INDEX index; 54 | int nTree,dup,nOOB,no; 55 | // SHAPE_PtSet sp; 56 | arrPFNO Tests; 57 | double eta,lenda; 58 | //Eigen::MatrixXd mOff,mSum; 59 | ShapeBMPfold::VECT mOff,mSum; 60 | vector Trains; 61 | RF_ShapeRegress( ):RandomForest( ),nOOB(0),regular(SINGLE_TREE),boot(BT_MIN_ERR),spMean(ShapeBMPfold::VIRTU), 62 | isToCPP(false),fpC(NULL),fpD(NULL),fpT(NULL),arrPTF(nullptr){ ; } 63 | RF_ShapeRegress( vector&Trains,ShapeBMPfold &spMean,int nCand,int nStep,int nEach,int nOB,int cas,int flag=0x0); 64 | ~RF_ShapeRegress(); 65 | 66 | virtual int Train( string sTitle,int cas,int flag ); 67 | virtual void AfterTrain( FeatData *hData,int flag=0x0 ); 68 | void TraceBmp( string sPath,int type,int 
flag=0x0 ); 69 | 70 | bool InitCPP( char *pathC,char *pathD,char *pathT,int type,int flag=0x0 ); 71 | virtual void ToCPP( DecisionTree *hTree,int cas,int step,int tree,int flag=0x0 ); 72 | virtual void ToCPP( int cas,int flag=0x0 ); 73 | void CoreInCPP( int cas,int flag ); 74 | friend class RF_ConfiRegress; 75 | }; 76 | 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/xoshiro256starstar.c: -------------------------------------------------------------------------------- 1 | /* Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org) 2 | 3 | To the extent possible under law, the author has dedicated all copyright 4 | and related and neighboring rights to this software to the public domain 5 | worldwide. This software is distributed without any warranty. 6 | 7 | See <http://creativecommons.org/publicdomain/zero/1.0/>. */ 8 | 9 | #include <stdint.h> 10 | 11 | /* This is xoshiro256** 1.0, one of our all-purpose, rock-solid 12 | generators. It has excellent (sub-ns) speed, a state (256 bits) that is 13 | large enough for any parallel application, and it passes all tests we 14 | are aware of. 15 | 16 | For generating just floating-point numbers, xoshiro256+ is even faster. 17 | 18 | The state must be seeded so that it is not everywhere zero. If you have 19 | a 64-bit seed, we suggest to seed a splitmix64 generator and use its 20 | output to fill s. */ 21 | 22 | static inline uint64_t rotl(const uint64_t x, int k) { 23 | return (x << k) | (x >> (64 - k)); 24 | } 25 | 26 | 27 | static uint64_t s[4]; 28 | 29 | uint64_t next(void) { 30 | const uint64_t result = rotl(s[1] * 5, 7) * 9; 31 | 32 | const uint64_t t = s[1] << 17; 33 | 34 | s[2] ^= s[0]; 35 | s[3] ^= s[1]; 36 | s[1] ^= s[2]; 37 | s[0] ^= s[3]; 38 | 39 | s[2] ^= t; 40 | 41 | s[3] = rotl(s[3], 45); 42 | 43 | return result; 44 | } 45 | 46 | 47 | /* This is the jump function for the generator. It is equivalent 48 | to 2^128 calls to next(); it can be used to generate 2^128 49 | non-overlapping subsequences for parallel computations. */ 50 | 51 | void jump(void) { 52 | static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; 53 | 54 | uint64_t s0 = 0; 55 | uint64_t s1 = 0; 56 | uint64_t s2 = 0; 57 | uint64_t s3 = 0; 58 | for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) 59 | for(int b = 0; b < 64; b++) { 60 | if (JUMP[i] & UINT64_C(1) << b) { 61 | s0 ^= s[0]; 62 | s1 ^= s[1]; 63 | s2 ^= s[2]; 64 | s3 ^= s[3]; 65 | } 66 | next(); 67 | } 68 | 69 | s[0] = s0; 70 | s[1] = s1; 71 | s[2] = s2; 72 | s[3] = s3; 73 | } 74 | 75 | 76 | 77 | /* This is the long-jump function for the generator. It is equivalent to 78 | 2^192 calls to next(); it can be used to generate 2^64 starting points, 79 | from each of which jump() will generate 2^64 non-overlapping 80 | subsequences for parallel distributed computations.
*/ 81 | 82 | void long_jump(void) { 83 | static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; 84 | 85 | uint64_t s0 = 0; 86 | uint64_t s1 = 0; 87 | uint64_t s2 = 0; 88 | uint64_t s3 = 0; 89 | for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) 90 | for(int b = 0; b < 64; b++) { 91 | if (LONG_JUMP[i] & UINT64_C(1) << b) { 92 | s0 ^= s[0]; 93 | s1 ^= s[1]; 94 | s2 ^= s[2]; 95 | s3 ^= s[3]; 96 | } 97 | next(); 98 | } 99 | 100 | s[0] = s0; 101 | s[1] = s1; 102 | s[2] = s2; 103 | s[3] = s3; 104 | } 105 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/xoshiro256plusplus.c: -------------------------------------------------------------------------------- 1 | /* Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org) 2 | 3 | To the extent possible under law, the author has dedicated all copyright 4 | and related and neighboring rights to this software to the public domain 5 | worldwide. This software is distributed without any warranty. 6 | 7 | See <http://creativecommons.org/publicdomain/zero/1.0/>. */ 8 | 9 | #include <stdint.h> 10 | 11 | //We suggest to use a SplitMix64 to initialize the state of our generators starting from a 64-bit seed, as research has shown that initialization must be performed with a generator radically different in nature from the one initialized to avoid correlation on similar seeds. 12 | uint64_t SplitMix64_next() { 13 | static uint64_t x; /* The state can be seeded with any value. */ 14 | uint64_t z = (x += 0x9e3779b97f4a7c15); 15 | z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; 16 | z = (z ^ (z >> 27)) * 0x94d049bb133111eb; 17 | return z ^ (z >> 31); 18 | } 19 | 20 | /* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators. 21 | It has excellent (sub-ns) speed, a state (256 bits) that is large 22 | enough for any parallel application, and it passes all tests we are 23 | aware of. 24 | 25 | For generating just floating-point numbers, xoshiro256+ is even faster. 26 | 27 | The state must be seeded so that it is not everywhere zero. If you have 28 | a 64-bit seed, we suggest to seed a splitmix64 generator and use its 29 | output to fill s. */ 30 | 31 | static inline uint64_t xoroshiro_rotl(const uint64_t x, int k) { 32 | return (x << k) | (x >> (64 - k)); 33 | } 34 | 35 | 36 | static uint64_t s[4] = { 1,3,5,7 }; 37 | 38 | uint64_t xoroshiro_next(void) { 39 | const uint64_t result = xoroshiro_rotl(s[0] + s[3], 23) + s[0]; 40 | 41 | const uint64_t t = s[1] << 17; 42 | 43 | s[2] ^= s[0]; 44 | s[3] ^= s[1]; 45 | s[1] ^= s[2]; 46 | s[0] ^= s[3]; 47 | 48 | s[2] ^= t; 49 | 50 | s[3] = xoroshiro_rotl(s[3], 45); 51 | 52 | return result; 53 | } 54 | 55 | 56 | /* This is the jump function for the generator. It is equivalent 57 | to 2^128 calls to next(); it can be used to generate 2^128 58 | non-overlapping subsequences for parallel computations. */ 59 | 60 | void xoroshiro_jump(void) { 61 | static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; 62 | 63 | uint64_t s0 = 0; 64 | uint64_t s1 = 0; 65 | uint64_t s2 = 0; 66 | uint64_t s3 = 0; 67 | for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) 68 | for(int b = 0; b < 64; b++) { 69 | if (JUMP[i] & UINT64_C(1) << b) { 70 | s0 ^= s[0]; 71 | s1 ^= s[1]; 72 | s2 ^= s[2]; 73 | s3 ^= s[3]; 74 | } 75 | xoroshiro_next(); 76 | } 77 | 78 | s[0] = s0; 79 | s[1] = s1; 80 | s[2] = s2; 81 | s[3] = s3; 82 | } 83 | 84 | 85 | 86 | /* This is the long-jump function for the generator.
It is equivalent to 87 | 2^192 calls to next(); it can be used to generate 2^64 starting points, 88 | from each of which jump() will generate 2^64 non-overlapping 89 | subsequences for parallel distributed computations. */ 90 | 91 | void xoroshiro_long_jump(void) { 92 | static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; 93 | 94 | uint64_t s0 = 0; 95 | uint64_t s1 = 0; 96 | uint64_t s2 = 0; 97 | uint64_t s3 = 0; 98 | for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) 99 | for(int b = 0; b < 64; b++) { 100 | if (LONG_JUMP[i] & UINT64_C(1) << b) { 101 | s0 ^= s[0]; 102 | s1 ^= s[1]; 103 | s2 ^= s[2]; 104 | s3 ^= s[3]; 105 | } 106 | xoroshiro_next(); 107 | } 108 | 109 | s[0] = s0; 110 | s[1] = s1; 111 | s[2] = s2; 112 | s[3] = s3; 113 | } 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Gradient boosting is one of the most interesting and overlooked algorithms in machine learning. There are huge gaps between the simple theoretical formula and practical implementations, especially the histogram technique. The histogram-based feature representation not only greatly improves speed, it also improves accuracy. In some sense, the histogram is a sparse embedding technique that maps noisy features to a more compact and more robust space, and there is more to be gained along this direction. Based on a deep understanding of this feature embedding technique, we present LiteMORT, which uses much less memory than other GBDT libraries and reaches higher accuracy on some datasets. LiteMORT shows that the GBDT algorithm has much more potential than most people would expect. 2 | 3 | ## Some key features of LiteMORT 4 | 5 | #### 1. Faster than LightGBM with higher accuracy 6 | 7 | For example, in the recent Kaggle competition [IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection/overview) (a binary classification problem): 8 | 9 | 1) **LiteMORT is much faster than LightGBM**. LiteMORT needs only a quarter of the time of LightGBM. 10 | 11 | 2) **LiteMORT has higher AUC than LightGBM**. 12 | 13 | ![auc_8_fold](https://github.com/closest-git/ieee_fraud/raw/master/auc_8_fold.jpg) 14 | 15 | ![time_8_fold](https://github.com/closest-git/ieee_fraud/raw/master/time_8_fold.jpg) 16 | 17 | For a detailed comparison on this competition, please see https://github.com/closest-git/ieee_fraud. 18 | 19 | #### 2. Uses much less memory than other GBDT libs 20 | 21 | 1) **Shares memory with the data source** (pandas DataFrame, numpy ndarray, list, vector…) 22 | 23 | LiteMORT does not allocate extra memory for features stored in contiguous memory. During the gradient boosting process, nearly all data access goes through the original pointer plus some offsets. 24 | 25 | 2) **Implicit merging for the “merge overflow problem”** 26 | 27 | In real applications, we usually don't keep all the data in one big table; there are many smaller ones instead. But for data analysis or machine learning we still have to access all of the data, so the small datasets must be merged into huge ones, which are too large to be processed by many classical machine learning algorithms. We call this phenomenon the **“merge overflow problem”**. LiteMORT uses a smart implicit merging technique to deal with this problem: just send all the small datasets to LiteMORT and it will generate histograms for each merged feature. In the subsequent training process all operations work on these histograms, so the huge merged dataset never has to be materialized, unlike the classical method or other GBDT libs (LightGBM, XGBoost, ...). A sketch of the problem is shown below. 28 |
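To make the memory argument concrete, here is a small self-contained sketch (the tables, sizes, and `max_bin` value are hypothetical, chosen only for illustration; this is not LiteMORT's internal code). The classical approach materializes the merged table, whose size grows with the big table's row count times the number of merged columns, while a histogram view only needs the bin edges plus one small bin index per sample:

```python
# Illustrating the "merge overflow problem" vs. the histogram view.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_trans, n_users = 1_000_000, 1_000
trans = pd.DataFrame({"user_id": rng.integers(0, n_users, n_trans),   # big table
                      "amount": rng.lognormal(3.0, 1.0, n_trans)})
users = pd.DataFrame({"user_id": np.arange(n_users),                  # small table
                      "age": rng.integers(18, 80, n_users)})

# Classical approach: materialize the merge, one full copy of every
# merged column per row of the big table.
merged = trans.merge(users, on="user_id", how="left")
print("materialized merge: %.1f MiB" % (merged.memory_usage(deep=True).sum() / 2**20))

# Histogram view: the merged feature "age" is fully described by its
# quantile bin edges (here max_bin=255) plus one uint8 bin index per
# sample of the big table.
edges = np.unique(np.quantile(users["age"], np.linspace(0.0, 1.0, 255)))
age = users["age"].to_numpy()[trans["user_id"].to_numpy()]
bin_idx = np.digitize(age, edges).astype(np.uint8)
print("binned feature:     %.3f MiB" % (bin_idx.nbytes / 2**20))
```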
29 | #### 3. sklearn-like API interface. 30 | 31 | ```python 32 | from litemort import * 33 | model = LiteMORT(params).fit(train_x, train_y, eval_set=[(eval_x, eval_y)]) 34 | pred_val = model.predict(eval_x) 35 | pred_raw = model.predict_raw(eval_x) 36 | ``` 37 | 38 | #### 4. Just one line to switch from LightGBM to LiteMORT. 39 | 40 | LiteMORT supports the parameters of LightGBM. 41 | 42 | As shown below, just one more line switches from LightGBM to LiteMORT. 43 | 44 | ```python 45 | if model_type == 'mort': 46 | model = LiteMORT(params).fit_1(X_train, y_train, eval_set=[(X_valid, y_valid)]) 47 | if model_type == 'lgb': 48 | model = lgb.LGBMRegressor(**params, n_jobs=-1) 49 | model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)]) 50 | pred_test = model.predict(X_test) 51 | ``` 52 | 53 | 54 | 55 | ## Citation 56 | 57 | Please use the following bibtex entry: 58 | 59 | ``` 60 | [1] Chen, Yingshi. "LiteMORT: A memory efficient gradient boosting tree system on adaptive compact distributions." arXiv preprint arXiv:2001.09419 (2020). 61 | ``` 62 | 63 | ## Author 64 | 65 | LiteMORT was written by Yingshi Chen (gsp.cys@gmail.com) -------------------------------------------------------------------------------- /src/util/PY_obj.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <memory> //for shared_ptr 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "Object.hpp" 13 | #include "Float16.hpp" 14 | 15 | struct PY_COLUMN { 16 | char *name; 17 | void *data; 18 | char *dtype; 19 | char *type_x; 20 | double v_min; 21 | double v_max; 22 | float representive; 23 | 24 | bool isCategory() const { 25 | return type_x!=NULL && strcmp(type_x,"*")==0; 26 | } 27 | bool isDiscrete() const { 28 | return type_x != NULL && strcmp(type_x, "#") == 0; 29 | } 30 | bool isInt8() { 31 | std::string type = dtype; 32 | return type == "char" || type == "int8" || type == "uint8"; 33 | } 34 | bool isInt32() { 35 | std::string type = dtype; 36 | return type == "int" || type == "int32" || type == "uint32"; 37 | } 38 | bool isInt16() { 39 | std::string type = dtype; 40 | return type == "int16" || type == "uint16"; 41 | } 42 | bool isInt64() { 43 | std::string type = dtype; 44 | return type == "int64" || type == "uint64"; 45 | } 46 | bool isFloat() { 47 | std::string type = dtype; 48 | return type == "float32"; 49 | } 50 | bool isFloat16() { 51 | std::string type = dtype; 52 | return type == "float16"; 53 | } 54 | bool isDouble() { 55 | std::string type = dtype; 56 | return type == "float64"; 57 | } 58 | 59 | template<typename Tx> 60 | void CopyTo_(size_t nSamp, Tx* dst, int flag = 0x0) { 61 | if (isInt8()) { 62 | //assert(typeof(Tx) == typeof(int8_t)); 63 | //G_MEMCOPY_(nSamp, dst, (int8_t*)data, flag); 64 | int8_t *i8_ = (int8_t*)data; 65 | for (size_t i = 0; i < nSamp; i++) { 66 | dst[i] = i8_[i]; 67 | } 68 | } 69 | else if (isDouble()){ 70 | double *dbl = (double*)data; 71 | for (size_t i = 0; i < nSamp; i++) { 72 | dst[i] = dbl[i]; 73 | } 74 | } else if (isFloat()) { 75 | float *flt = (float*)data; 76 | for (size_t i = 0; i < nSamp; i++) { 77 | dst[i] = flt[i]; 78 | } 79 | } else if (isInt64()) { 80 | int64_t
*i64 = (int64_t*)data; 81 | for (size_t i = 0; i < nSamp; i++) { 82 | dst[i] = i64[i]; 83 | } 84 | } 85 | else if (isInt32()) { 86 | int32_t *i32 = (int32_t*)data; 87 | for (size_t i = 0; i < nSamp; i++) { 88 | dst[i] = i32[i]; 89 | } 90 | } 91 | else if (isFloat16()) { //https://stackoverflow.com/questions/22210684/16-bit-floats-and-gl-half-float 92 | int16_t *flt16= (int16_t*)data; 93 | float fRet; 94 | for (size_t i = 0; i < nSamp; i++, flt16++) { 95 | dst[i] = Float16::GLM_toFloat32(*flt16); 96 | /*int fltInt32 = (((*flt16) & 0x8000) << 16); 97 | fltInt32 |= (((*flt16) & 0x7fff) << 13) + 0x38000000; 98 | memcpy(&fRet, &fltInt32, sizeof(float)); 99 | dst[i] = fRet;*/ 100 | } 101 | } 102 | else { 103 | throw "PY_COLUMN::CopyTo_ is mismatch!!!"; 104 | } 105 | } 106 | }; 107 | 108 | struct PY_DATASET { 109 | char *name=nullptr; 110 | size_t nSamp; 111 | int ldFeat; 112 | int ldY; 113 | PY_COLUMN *columnX = nullptr; //PY_COLUMN 114 | PY_COLUMN *columnY = nullptr; //PY_COLUMN 115 | PY_COLUMN *merge_left = nullptr; 116 | //int merge_rigt = -1; 117 | int x; 118 | 119 | bool isValid() { 120 | return false; 121 | } 122 | 123 | PY_COLUMN* GetColumn(int id) { 124 | assert(id >= 0 && id < ldFeat); 125 | return columnX + id; 126 | } 127 | }; 128 | 129 | struct PY_DATASET_LIST { 130 | char *name=nullptr; 131 | int nSet=0; 132 | PY_DATASET *list = nullptr; //PY_COLUMN 133 | int x=0; 134 | 135 | static PY_DATASET* GetSet(PY_DATASET_LIST *data_list,int no=0x0) { 136 | assert(data_list!=nullptr && data_list->nSet > no); 137 | PY_DATASET *set = data_list->list+no; 138 | assert(set->ldFeat > 0); 139 | assert(set->ldY > 0); 140 | assert(set->nSamp > 0); 141 | return set; 142 | } 143 | }; 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /python-package/LiteMORT/compat.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = C0103 3 | """Compatibility""" 4 | from __future__ import absolute_import 5 | 6 | import inspect 7 | import sys 8 | 9 | import numpy as np 10 | 11 | is_py3 = (sys.version_info[0] == 3) 12 | 13 | """compatibility between python2 and python3""" 14 | if is_py3: 15 | zip_ = zip 16 | string_type = str 17 | numeric_types = (int, float, bool) 18 | integer_types = (int, ) 19 | range_ = range 20 | 21 | def argc_(func): 22 | """return number of arguments of a function""" 23 | return len(inspect.signature(func).parameters) 24 | 25 | def decode_string(bytestring): 26 | return bytestring.decode('utf-8') 27 | else: 28 | from itertools import izip as zip_ 29 | string_type = basestring 30 | numeric_types = (int, long, float, bool) 31 | integer_types = (int, long) 32 | range_ = xrange 33 | 34 | def argc_(func): 35 | """return number of arguments of a function""" 36 | return len(inspect.getargspec(func).args) 37 | 38 | def decode_string(bytestring): 39 | return bytestring 40 | 41 | """json""" 42 | try: 43 | import simplejson as json 44 | except (ImportError, SyntaxError): 45 | # simplejson does not support Python 3.2, it throws a SyntaxError 46 | # because of u'...' Unicode literals. 
47 | import json 48 | 49 | 50 | def json_default_with_numpy(obj): 51 | if isinstance(obj, (np.integer, np.floating, np.bool_)): 52 | return obj.item() 53 | elif isinstance(obj, np.ndarray): 54 | return obj.tolist() 55 | else: 56 | return obj 57 | 58 | 59 | """pandas""" 60 | try: 61 | from pandas import Series, DataFrame 62 | PANDAS_INSTALLED = True 63 | except ImportError: 64 | PANDAS_INSTALLED = False 65 | 66 | class Series(object): 67 | pass 68 | 69 | class DataFrame(object): 70 | pass 71 | 72 | """matplotlib""" 73 | try: 74 | import matplotlib 75 | MATPLOTLIB_INSTALLED = True 76 | except ImportError: 77 | MATPLOTLIB_INSTALLED = False 78 | 79 | """graphviz""" 80 | try: 81 | import graphviz 82 | GRAPHVIZ_INSTALLED = True 83 | except ImportError: 84 | GRAPHVIZ_INSTALLED = False 85 | 86 | """sklearn""" 87 | try: 88 | from sklearn.base import BaseEstimator 89 | from sklearn.base import RegressorMixin, ClassifierMixin 90 | from sklearn.preprocessing import LabelEncoder 91 | from sklearn.utils.class_weight import compute_sample_weight 92 | from sklearn.utils.multiclass import check_classification_targets 93 | from sklearn.utils.validation import check_X_y, check_array, check_consistent_length 94 | try: 95 | from sklearn.model_selection import StratifiedKFold, GroupKFold 96 | from sklearn.exceptions import NotFittedError 97 | except ImportError: 98 | from sklearn.cross_validation import StratifiedKFold, GroupKFold 99 | from sklearn.utils.validation import NotFittedError 100 | SKLEARN_INSTALLED = True 101 | _MortModelBase = BaseEstimator 102 | _MortRegressorBase = RegressorMixin 103 | _MortClassifierBase = ClassifierMixin 104 | _MortLabelEncoder = LabelEncoder 105 | MortNotFittedError = NotFittedError 106 | _MortStratifiedKFold = StratifiedKFold 107 | _MortGroupKFold = GroupKFold 108 | _MortCheckXY = check_X_y 109 | _MortCheckArray = check_array 110 | _MortCheckConsistentLength = check_consistent_length 111 | _MortCheckClassificationTargets = check_classification_targets 112 | _MortComputeSampleWeight = compute_sample_weight 113 | except ImportError: 114 | SKLEARN_INSTALLED = False 115 | _MortModelBase = object 116 | _MortClassifierBase = object 117 | _MortRegressorBase = object 118 | _MortLabelEncoder = None 119 | MortNotFittedError = ValueError 120 | _MortStratifiedKFold = None 121 | _MortGroupKFold = None 122 | _MortCheckXY = None 123 | _MortCheckArray = None 124 | _MortCheckConsistentLength = None 125 | _MortCheckClassificationTargets = None 126 | _MortComputeSampleWeight = None 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/util/pcg_oneil/pcg_basic.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PCG Random Number Generation for C. 3 | * 4 | * Copyright 2014 Melissa O'Neill 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | * 18 | * For additional information about the PCG random number generation scheme, 19 | * including its license and other licensing options, visit 20 | * 21 | * http://www.pcg-random.org 22 | */ 23 | 24 | /* 25 | * This code is derived from the full C implementation, which is in turn 26 | * derived from the canonical C++ PCG implementation. The C++ version 27 | * has many additional features and is preferable if you can use C++ in 28 | * your project. 29 | */ 30 | 31 | #include "pcg_basic.h" 32 | 33 | // state for global RNGs 34 | 35 | static pcg32_random_t pcg32_global = PCG32_INITIALIZER; 36 | 37 | // pcg32_srandom(initstate, initseq) 38 | // pcg32_srandom_r(rng, initstate, initseq): 39 | // Seed the rng. Specified in two parts, state initializer and a 40 | // sequence selection constant (a.k.a. stream id) 41 | 42 | void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, uint64_t initseq) 43 | { 44 | rng->state = 0U; 45 | rng->inc = (initseq << 1u) | 1u; 46 | pcg32_random_r(rng); 47 | rng->state += initstate; 48 | pcg32_random_r(rng); 49 | } 50 | 51 | void pcg32_srandom(uint64_t seed, uint64_t seq) 52 | { 53 | pcg32_srandom_r(&pcg32_global, seed, seq); 54 | } 55 | 56 | // pcg32_random() 57 | // pcg32_random_r(rng) 58 | // Generate a uniformly distributed 32-bit random number 59 | 60 | uint32_t pcg32_random_r(pcg32_random_t* rng) 61 | { 62 | uint64_t oldstate = rng->state; 63 | rng->state = oldstate * 6364136223846793005ULL + rng->inc; 64 | uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; 65 | uint32_t rot = oldstate >> 59u; 66 | return (xorshifted >> rot) | (xorshifted << ((-(int32_t)(rot)) & 31)); 67 | } 68 | 69 | uint32_t pcg32_random() 70 | { 71 | return pcg32_random_r(&pcg32_global); 72 | } 73 | 74 | 75 | // pcg32_boundedrand(bound): 76 | // pcg32_boundedrand_r(rng, bound): 77 | // Generate a uniformly distributed number, r, where 0 <= r < bound 78 | 79 | uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound) 80 | { 81 | // To avoid bias, we need to make the range of the RNG a multiple of 82 | // bound, which we do by dropping output less than a threshold. 83 | // A naive scheme to calculate the threshold would be to do 84 | // 85 | // uint32_t threshold = 0x100000000ull % bound; 86 | // 87 | // but 64-bit div/mod is slower than 32-bit div/mod (especially on 88 | // 32-bit platforms). In essence, we do 89 | // 90 | // uint32_t threshold = (0x100000000ull-bound) % bound; 91 | // 92 | // because this version will calculate the same modulus, but the LHS 93 | // value is less than 2^32. 94 | 95 | uint32_t threshold = -(int32_t)(bound) % bound; 96 | 97 | // Uniformity guarantees that this loop will terminate. In practice, it 98 | // should usually terminate quickly; on average (assuming all bounds are 99 | // equally likely), 82.25% of the time, we can expect it to require just 100 | // one iteration. In the worst case, someone passes a bound of 2^31 + 1 101 | // (i.e., 2147483649), which invalidates almost 50% of the range. In 102 | // practice, bounds are typically small and only a tiny amount of the range 103 | // is eliminated. 
104 | for (;;) { 105 | uint32_t r = pcg32_random_r(rng); 106 | if (r >= threshold) 107 | return r % bound; 108 | } 109 | } 110 | 111 | 112 | uint32_t pcg32_boundedrand(uint32_t bound) 113 | { 114 | return pcg32_boundedrand_r(&pcg32_global, bound); 115 | } 116 | 117 | -------------------------------------------------------------------------------- /python-package/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = 'litemort' 16 | DESCRIPTION = 'Fastest gradient boosting library with higher accuracy' 17 | URL = 'https://github.com/closest-git/LiteMORT' 18 | EMAIL = 'gsp.cys@gmail.com' 19 | AUTHOR = 'Yingshi Chen' 20 | REQUIRES_PYTHON = '>=3.0.0' 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = [ 25 | 'numpy', 'scipy', 'scikit-learn', 26 | ] 27 | 28 | # What packages are optional? 29 | EXTRAS = { 30 | # 'fancy feature': ['django'], 31 | } 32 | 33 | # The rest you shouldn't have to touch too much :) 34 | # ------------------------------------------------ 35 | # Except, perhaps the License and Trove Classifiers! 36 | # If you do change the License, remember to change the Trove Classifier for that! 37 | 38 | here = os.path.abspath(os.path.dirname(__file__)) 39 | 40 | # Import the README and use it as the long-description. 41 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 42 | try: 43 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 44 | long_description = '\n' + f.read() 45 | except FileNotFoundError: 46 | long_description = DESCRIPTION 47 | 48 | # Load the package's __version__.py module as a dictionary. 49 | about = {} 50 | if not VERSION: 51 | project_slug = NAME.lower().replace("-", "_").replace(" ", "_") 52 | with open(os.path.join(here, project_slug, '__version__.py')) as f: 53 | exec(f.read(), about) 54 | else: 55 | about['__version__'] = VERSION 56 | 57 | 58 | class UploadCommand(Command): 59 | """Support setup.py upload.""" 60 | 61 | description = 'Build and publish the package.' 
62 | user_options = [] 63 | 64 | @staticmethod 65 | def status(s): 66 | """Prints things in bold.""" 67 | print('\033[1m{0}\033[0m'.format(s)) 68 | 69 | def initialize_options(self): 70 | pass 71 | 72 | def finalize_options(self): 73 | pass 74 | 75 | def run(self): 76 | try: 77 | self.status('Removing previous builds…') 78 | rmtree(os.path.join(here, 'dist')) 79 | except OSError: 80 | pass 81 | 82 | self.status('Building Source and Wheel (universal) distribution…') 83 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 84 | 85 | self.status('Uploading the package to PyPI via Twine…') 86 | os.system('twine upload dist/*') 87 | 88 | self.status('Pushing git tags…') 89 | os.system('git tag v{0}'.format(about['__version__'])) 90 | os.system('git push --tags') 91 | 92 | sys.exit() 93 | 94 | 95 | # Where the magic happens: 96 | setup( 97 | name=NAME, 98 | version=about['__version__'], 99 | description=DESCRIPTION, 100 | long_description=long_description, 101 | long_description_content_type='text/markdown', 102 | author=AUTHOR, 103 | author_email=EMAIL, 104 | python_requires=REQUIRES_PYTHON, 105 | url=URL, 106 | packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), 107 | data_files=[('./litemort',["./litemort/libLiteMORT.so","./litemort/LiteMORT.dll"])], 108 | 109 | # If your package is a single module, use this instead of 'packages': 110 | # py_modules=['mypackage'], 111 | 112 | # entry_points={ 113 | # 'console_scripts': ['mycli=mymodule:cli'], 114 | # }, 115 | install_requires=REQUIRED, 116 | extras_require=EXTRAS, 117 | include_package_data=True, 118 | license='MIT', 119 | classifiers=[ 120 | # Trove classifiers 121 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 122 | 'License :: OSI Approved :: MIT License', 123 | 'Programming Language :: Python', 124 | 'Programming Language :: Python :: 3', 125 | 'Programming Language :: Python :: 3.6', 126 | 'Programming Language :: Python :: Implementation :: CPython', 127 | 'Programming Language :: Python :: Implementation :: PyPy', 128 | 'Operating System :: Microsoft :: Windows', 129 | 'Operating System :: Unix', 130 | ], 131 | # $ setup.py publish support. 
132 | cmdclass={ 133 | 'upload': UploadCommand, 134 | }, 135 | ) 136 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_problems.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.preprocessing import LabelEncoder 5 | from sklearn.linear_model import Lasso 6 | from sklearn.linear_model import Ridge,ElasticNet 7 | from .compat import (_MortModelBase,_MortClassifierBase,_MortRegressorBase) 8 | 9 | 10 | class Mort_Problems(_MortModelBase): 11 | def __init__(self, **kwargs): 12 | pass 13 | 14 | def get_params(self, deep=True): 15 | params = super(_MortModelBase, self).get_params(deep=deep) 16 | params.update(self._other_params) 17 | return params 18 | 19 | # minor change to support `**kwargs` 20 | def set_params(self, **params): 21 | for key, value in params.items(): 22 | setattr(self, key, value) 23 | if hasattr(self, '_' + key): 24 | setattr(self, '_' + key, value) 25 | self._other_params[key] = value 26 | return self 27 | 28 | # Note that Y_t is not the same as y_train 29 | def OnY(self, y_train, np_type): 30 | # print(type(y_train)) 31 | if type(y_train) is pd.Series: 32 | np_target = y_train.values.astype(np_type) 33 | elif isinstance(y_train, pd.DataFrame): 34 | np_target = y_train.values.astype(np_type) 35 | else: 36 | np_target = y_train.astype(np_type) 37 | return np_target 38 | 39 | def BeforeFit(self,train_set,eval_set): 40 | return False,None,None 41 | 42 | def AfterPredict(self, X_, Y_): 43 | return Y_ 44 | 45 | 46 | def OnResult(self,result_,pred_leaf=False, pred_contrib=False,raw_score=False): 47 | return result_ 48 | 49 | class Mort_BinaryClass(Mort_Problems): 50 | ''' 51 | or LogisticRegression 52 | ''' 53 | def __init__(self,params, **kwargs): 54 | super(Mort_BinaryClass, self).__init__() 55 | self._labelOfY=None 56 | 57 | def OnY(self, y_train, np_type): 58 | if self._labelOfY is None: 59 | self._labelOfY=LabelEncoder() 60 | self._labelOfY.fit(y_train) 61 | transformed_labels = self._labelOfY.transform(y_train) 62 | self._classes = self._labelOfY.classes_ 63 | self._n_classes = len(self._classes) 64 | if self._n_classes != 2: 65 | raise ValueError("Y has {} classes. Not a binary-classification problem!!!".format(self._n_classes) ) 66 | else: 67 | transformed_labels = self._labelOfY.transform(y_train) 68 | return super(Mort_BinaryClass, self).OnY(transformed_labels, np_type) 69 | 70 | """LiteMORT Binary classifier. https://en.wikipedia.org/wiki/Binary_classification""" 71 | def OnResult(self,result_,pred_leaf=False, pred_contrib=False,raw_score=False): 72 | # the predicted probabilities of the 2 classes 73 | result_ = np.vstack((1.
- result_, result_)).transpose() 74 | if raw_score or pred_leaf or pred_contrib: 75 | return result_ 76 | else: 77 | class_index = np.argmax(result_, axis=1) 78 | if self._labelOfY is not None: 79 | return self._labelOfY.inverse_transform(class_index) 80 | else: 81 | return class_index 82 | pass 83 | 84 | class Mort_MultiClass(Mort_Problems, _MortClassifierBase): 85 | pass 86 | 87 | class Mort_Regressor(Mort_Problems, _MortRegressorBase): 88 | """LiteMORT regressor.""" 89 | def __init__(self,params, **kwargs): 90 | super(Mort_Regressor, self).__init__() 91 | self.alpha = 1 92 | self.gressor = None; self.alg="None" 93 | if 'cascade' in params and params['cascade']=="lasso": 94 | self.gressor = Lasso(alpha=self.alpha, normalize=True) 95 | self.alg = params['cascade'] 96 | #self.gressor = Ridge(alpha=0.05, normalize=True) 97 | #self.gressor = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False) 98 | self.mse = 0 99 | 100 | 101 | def BeforeFit(self,train_set,eval_set): 102 | if self.gressor is None: 103 | return False,None,None 104 | 105 | print(f"====== Mort_Regressor::BeforeFit@{self.gressor} alpha={self.alpha}") 106 | x_train, y_train = train_set 107 | self.gressor.fit(x_train, y_train) 108 | pred = self.gressor.predict(x_train) 109 | self.mse = np.mean((pred - y_train)**2) 110 | y_train = y_train - pred 111 | 112 | y_eval = None 113 | if (eval_set is not None and len(eval_set) > 0): 114 | X_eval, y_eval = eval_set[0] 115 | y_pred = self.gressor.predict(X_eval) 116 | y_eval = y_eval-y_pred 117 | return True,y_train,[y_eval] 118 | 119 | def AfterPredict(self,X_,Y_): 120 | if self.gressor is not None: 121 | y_pred = self.gressor.predict(X_) 122 | Y_= Y_+y_pred 123 | return Y_ 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /src/learn/DCRIMI_.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | //#include 5 | #include 6 | #define _USE_MATH_DEFINES 7 | #include 8 | #include 9 | #include 10 | #include "DCRIMI_.hpp" 11 | #include "../util/GST_def.h" 12 | 13 | using namespace Grusoft; 14 | using namespace std; 15 | 16 | //double DCRIMI_::tX = 0; 17 | double DCRIMI_2::tX = 0; 18 | 19 | /* 20 | Copyright 2008-present, Grusoft. 
21 | v0.1 cys 22 | 6/13/2015 23 | */ 24 | //int DCRIMI_::nSPAN = 1000; 25 | DCRIMI_::DCRIMI_(void *hB, int span, int flag) : D_span(span), hBase(hB), dump(1), isSaveFalse(false), isBidui(false) { 26 | D_span = span; hBase = hB; dump = 1; 27 | assert(span >= 100 && hBase != nullptr); 28 | D_inter = new float[D_span + 1]; D_intra = new float[D_span + 1]; 29 | for (int i = 0; i <= D_span; i++) { 30 | D_inter[i] = 0.0; D_intra[i] = 0.0; 31 | } 32 | } 33 | DCRIMI_::DCRIMI_(const DCRIMI_& dcri) :isSaveFalse(dcri.isSaveFalse) { 34 | memset(this, 0x0, sizeof(DCRIMI_)); 35 | 36 | rFAR = dcri.rFAR, rFRR = dcri.rFRR, rEER = dcri.rEER, 37 | D_ = dcri.D_, sep = dcri.sep, eer_sep = dcri.eer_sep; 38 | mean_a = dcri.mean_a, mean_r = dcri.mean_r; 39 | devia_a = dcri.devia_a, devia_r = dcri.devia_r; 40 | dump = dcri.dump; 41 | } 42 | 43 | void DCRIMI_::Init(int flag) { 44 | for (int i = 0; i <= D_span; i++) { 45 | D_inter[i] = 0.0; D_intra[i] = 0.0; 46 | } 47 | rFAR = 0.0, rFRR = 0.0, rEER = 0.0, D_ = 0.0, sep = 0.0, eer_sep = 0.0; 48 | rTop_1 = 0.0; rTop_5 = 0.0; 49 | } 50 | 51 | 52 | void DCRIMI_::Insert_1(float dis, bool isIntra, int flag) { 53 | assert(dis>-0.001 && dis<1.001); 54 | int pos = dis*D_span; 55 | pos = max(pos, 0); pos = min(pos, D_span); 56 | if (isIntra) { 57 | D_intra[pos]++; 58 | } 59 | else { 60 | D_inter[pos]++; 61 | } 62 | 63 | } 64 | void DCRIMI_::Analyze(const string &sTitle, int flag) { 65 | assert(D_inter != nullptr && D_intra != nullptr); 66 | int i, grid = 0; 67 | double s, D_s, f_ar, f_rr, w_a, w_r, f_ar_g = 1.0e-7; 68 | 69 | for (i = 0; i<8; i++) { 70 | f_ar_8[i] = -1.0; f_rr_8[i] = -1.0; hd_8[i] = -1.0; 71 | } 72 | D_s = 1.0 / D_span; //only for hamming distance 73 | 74 | mean_a = 0.0; mean_r = 0; 75 | nz_a = 0.0; nz_r = 0.0; 76 | max_a = max_r = 0.0; min_a = min_r = 1.0; 77 | for (i = 0; i <= D_span; i++) { 78 | mean_a += i*D_intra[i]; nz_a += D_intra[i]; 79 | if (D_intra[i]>0) { 80 | max_a = MAX2(max_a, i*D_s); min_a = MIN2(min_a, i*D_s); 81 | } 82 | mean_r += i*D_inter[i]; nz_r += D_inter[i]; 83 | if (D_inter[i]>0) { 84 | max_r = MAX2(max_r, i*D_s); min_r = MIN2(min_r, i*D_s); 85 | } 86 | } 87 | // nz_a = nz_a; nz_r = nz_r; 88 | mean_a = nz_a == 0 ? 0.0 : mean_a / nz_a*D_s; 89 | mean_r = nz_r == 0 ? 0.0 : mean_r / nz_r*D_s; 90 | // mean_a = mean_a; mean_r = mean_r; 91 | 92 | devia_a = 0.0; devia_r = 0; 93 | w_a = 0.0; w_r = 0.0; 94 | for (i = 0; i <= D_span; i++) { 95 | w_a += D_intra[i]; w_r += D_inter[i]; 96 | f_ar = w_r*1.0 / nz_r; 97 | f_rr = (nz_a - w_a)*1.0 / nz_a; 98 | while (f_ar >= f_ar_g && f_ar_8[grid] == -1.0) { 99 | f_ar_8[grid] = f_ar; 100 | f_rr_8[grid] = f_rr; hd_8[grid] = i*1.0 / D_span; 101 | f_ar_g *= 10; grid++; 102 | if (f_ar < f_ar_g) 103 | break; 104 | } 105 | if (f_ar >= f_rr && rEER == 0.0) { 106 | rEER = (f_ar + f_rr) / 2.0; 107 | eer_sep = i*1.0 / D_span;//sep; 108 | } 109 | if (f_ar>1.0e-3 && sep == 0.0) { 110 | //if( f_ar>1.0e-2 && sep==0.0 ) { 111 | sep = i*1.0 / D_span; 112 | rFAR = f_ar; 113 | rFRR = f_rr; 114 | } 115 | if (D_intra[i] != 0) { 116 | s = (i*D_s - mean_a); 117 | devia_a += s*s*D_intra[i]; 118 | } 119 | s = (i*D_s - mean_r); 120 | devia_r += s*s*D_inter[i]; 121 | } 122 | devia_a = nz_a == 0 ? 0.0 : sqrt(devia_a / nz_a); 123 | devia_r = nz_r == 0 ? 0.0 : sqrt(devia_r / nz_r); 124 | // devia_a = devian_a; devia_r = devian_r; 125 | 126 | s = sqrt((devia_a*devia_a) + (devia_r*devia_r)) / 2.0; 127 | D_ = s == 0 ? 
0.0 : fabs(mean_a - mean_r) / s; 128 | double accu = (1.0 - rFRR)*100.0; 129 | if (rFAR>2 * 1.0e-2) { accu /= (rFAR / 1.0e-2); } //rFAR=1% 130 | if (dump != 0) { 131 | printf("\n@@@\"%s\" nz=(%g,%g) intra=(%.3g,%.3g,%.3g,%.3g),inter=(%.3g,%.3g,%.3g,%.3g)" 132 | "\n@@@\taccu=%.3g%%(T=%g,frr=%.3g far=%.2g%%) EER=%.3g(%.3g) _DCRIMI_\n" 133 | , sTitle.c_str(), nz_a, nz_r, mean_a, devia_a, max_a, min_a, mean_r, devia_r, max_r, min_r, 134 | accu, sep, rFRR, rFAR * 100, rEER, eer_sep); 135 | for (i = 0; i<8; i++) { 136 | printf("(%.1e,%.3g)", f_ar_8[i], f_rr_8[i]); 137 | } 138 | printf("\n"); 139 | } 140 | 141 | } 142 | 143 | void DCRIMI_::GetRoc(float *roc, int flag) { 144 | int i; 145 | double s, D_s, f_ar, f_rr, w_a = 0.0, w_r = 0.0, f_ar_g = 1.0e-7; 146 | for (i = 0; i < D_span; i++) { 147 | w_a += D_intra[i]; w_r += D_inter[i]; 148 | roc[2 * i] = w_r*1.0 / nz_r; 149 | roc[2 * i + 1] = (nz_a - w_a)*1.0 / nz_a; 150 | } 151 | } 152 | 153 | double DCRIMI_::T_intra(int flag) { 154 | if (nz_a == 0) 155 | return 1.0; 156 | double t = mean_a + 7 * devia_a; 157 | t = min(t, 1.0); 158 | assert(t>0 && t <= 1.0); 159 | return t; 160 | } -------------------------------------------------------------------------------- /python-package/case_poct.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold 3 | from sklearn.metrics import mean_absolute_error,mean_squared_error 4 | from sklearn.metrics import roc_auc_score, roc_curve,auc 5 | import time 6 | import numpy as np 7 | from litemort import * 8 | import sys 9 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 10 | #isMORT = True 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | import gc 14 | import seaborn as sns 15 | import pickle 16 | ''' 17 | histo->RandomCompress() seems to be able to improve accuracy by searching more of the space 18 | 19 | ''' 20 | 21 | def ROC_plot(features,X_,y_, pred_,title): 22 | fpr_, tpr_, thresholds = roc_curve(y_, pred_) 23 | optimal_idx = np.argmax(tpr_ - fpr_) 24 | #https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python 25 | optimal_threshold = thresholds[optimal_idx] 26 | auc_ = auc(fpr_, tpr_) 27 | title = "{} auc=".format(title) 28 | print("{} auc={} OT={:.4g}".format(title, auc_,optimal_threshold)) 29 | plt.plot(fpr_, tpr_, label="{}:{:.4g}".format(title, auc_)) 30 | plt.xlabel('False positive rate') 31 | plt.ylabel('True positive rate') 32 | plt.title('SAMPLES={} Features={} OT={:.4g}'.format(X_.shape[0],len(features),optimal_threshold)) 33 | plt.legend(loc='best') 34 | plt.savefig("./_auc_[{}].jpg".format(features)) 35 | plt.show() 36 | return auc_,optimal_threshold 37 | 38 | def runLgb(X, y, test=None, num_rounds=10000, max_depth=-1, eta=0.01, subsample=0.8, 39 | colsample=0.8, min_child_weight=1, early_stopping_rounds=500, seeds_val=2017): 40 | plot_feature_importance = True 41 | features = list(X.columns) 42 | print("X={} y={}".format(X.shape,y.shape)) 43 | params = {'task': 'train', 44 | 'max_bin': 256, 45 | 'salp_bins':32, 46 | #'elitism': 2, # not applicable to this case 47 | 'min_data_in_leaf': 32, 48 | 'boosting_type': 'gbdt', 49 | 'objective': 'binary', 50 | 'learning_rate': eta, 51 | # 'metric': {'multi_logloss'}, 52 | 'metric': 'auc', 53 | 'early_stop':early_stopping_rounds, 54 | 'max_depth': max_depth, 55 | # 'min_child_weight':min_child_weight, 56 | 'bagging_fraction': subsample, 57 | 'feature_fraction': colsample, 58 | 'bagging_seed': seeds_val, 59 | 'num_iterations': num_rounds, 60 |
'num_leaves': 32, 61 | 'lambda_l1': 1.0, 62 | 'verbose': 0, 63 | 'nthread': -1} 64 | n_fold = 5 65 | folds = KFold(n_splits=n_fold, shuffle=True, random_state=11) 66 | y_pred=np.zeros(y.shape[0]) 67 | feature_importance = None 68 | if not isMORT: 69 | feature_importance = pd.DataFrame() 70 | for fold_n, (train_index, valid_index) in enumerate(folds.split(X)): 71 | t0 = time.time() 72 | 73 | if type(X) == np.ndarray: 74 | X_train, X_valid = X[train_index], X[valid_index] 75 | y_train, y_valid = y[train_index], y[valid_index] 76 | else: 77 | X_train, X_valid = X.iloc[train_index], X.iloc[valid_index] 78 | y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] 79 | 80 | if isMORT: 81 | model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)]) 82 | pred_val = model.predict(X_valid) 83 | pred_raw = model.predict_raw(X_valid) 84 | y_pred[valid_index] = pred_raw 85 | fold_score = roc_auc_score(y_valid, pred_raw) 86 | else: 87 | lgtrain = lgb.Dataset(X_train, y_train) 88 | lgval = lgb.Dataset(X_valid, y_valid) 89 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=lgval, 90 | early_stopping_rounds=early_stopping_rounds, verbose_eval=100) 91 | plt.figure(figsize=(12, 6)) 92 | lgb.plot_importance(model, max_num_features=30) 93 | plt.title("Feature importances") 94 | plt.show() 95 | 96 | fold_importance = pd.DataFrame() 97 | fold_importance["feature"] = X.columns 98 | fold_importance["importance"] = model.feature_importance() 99 | fold_importance["fold"] = fold_n + 1 100 | feature_importance = pd.concat([feature_importance, fold_importance], axis=0) 101 | model.save_model(f'model_lgb_poct_{fold_n}_.txt') 102 | pred_val = model.predict(X_valid) 103 | y_pred[valid_index] = pred_val 104 | fold_score = roc_auc_score(y_valid, pred_val) 105 | 106 | print("fold n°{} time={:.3g} score={:.4g}".format(fold_n, time.time() - t0, fold_score)) 107 | if test is not None: 108 | pred_test = model.predict(test, num_iteration=model.best_iteration) 109 | 110 | else: 111 | pred_test = None 112 | #break 113 | auc = roc_auc_score(y, y_pred) 114 | if feature_importance is not None: 115 | feature_importance["importance"] /= n_fold 116 | if plot_feature_importance: 117 | cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values( 118 | by="importance", ascending=False)[:].index 119 | best_features = feature_importance.loc[feature_importance.feature.isin(cols)] 120 | plt.figure(figsize=(5, 3)); 121 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) 122 | plt.xlabel("importance of each feature") 123 | plt.title('AUC={:.3f} ({}-folds)'.format(auc,n_fold)) 124 | plt.savefig("./_importance_[{}].jpg".format(features)) 125 | plt.show() 126 | 127 | ROC_plot(features,X, y, y_pred, "") 128 | 129 | print("CV score: {:<8.5f}".format(auc)) 130 | return auc 131 | 132 | pkl_path=f"E:/POCTx/poct_InHospital.pkl" 133 | with open(pkl_path, "rb") as fp: # Pickling 134 | X = pickle.load(fp) 135 | y = pickle.load(fp) 136 | score = runLgb(X, y) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons.
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Xx]64/ 19 | [Xx]86/ 20 | [Bb]uild/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | docs/_build 25 | tests/bin 26 | lib 27 | data 28 | _000 29 | python-package/dist 30 | .pytest_cache/v/cache 31 | 32 | # Visual Studio 2015 cache/options directory 33 | .vs/ 34 | # Uncomment if you have tasks that create the project's static files in wwwroot 35 | #wwwroot/ 36 | 37 | # MSTest test Results 38 | [Tt]est[Rr]esult*/ 39 | [Bb]uild[Ll]og.* 40 | 41 | # NUNIT 42 | *.VisualState.xml 43 | TestResult.xml 44 | 45 | # Build Results of an ATL Project 46 | [Dd]ebugPS/ 47 | [Rr]eleasePS/ 48 | dlldata.c 49 | 50 | # DNX 51 | project.lock.json 52 | artifacts/ 53 | 54 | # Python 55 | *.egg-info 56 | __pycache__ 57 | .eggs 58 | 59 | # VS Code 60 | .vscode 61 | 62 | # Prerequisites 63 | *.d 64 | 65 | # Compiled Object files 66 | *.slo 67 | *.lo 68 | *.o 69 | *.obj 70 | 71 | # Precompiled Headers 72 | *.gch 73 | 74 | *_i.c 75 | *_p.c 76 | *_i.h 77 | *.ilk 78 | *.meta 79 | *.obj 80 | *.pch 81 | *.pdb 82 | *.pgc 83 | *.pgd 84 | *.rsp 85 | *.sbr 86 | *.tlb 87 | *.tli 88 | *.tlh 89 | *.tmp 90 | *.tmp_proj 91 | *.log 92 | *.vspscc 93 | *.vssscc 94 | .builds 95 | *.pidb 96 | *.svclog 97 | *.scc 98 | *.rar 99 | *.ym 100 | *.model 101 | 102 | 103 | # Chutzpah Test files 104 | _Chutzpah* 105 | 106 | # Visual C++ cache files 107 | ipch/ 108 | *.aps 109 | *.ncb 110 | *.opendb 111 | *.opensdf 112 | *.sdf 113 | *.cachefile 114 | *.VC.db 115 | 116 | # Visual Studio profiler 117 | *.psess 118 | *.vsp 119 | *.vspx 120 | *.sap 121 | 122 | # TFS 2012 Local Workspace 123 | $tf/ 124 | 125 | # Guidance Automation Toolkit 126 | *.gpState 127 | 128 | # ReSharper is a .NET coding add-in 129 | _ReSharper*/ 130 | *.[Rr]e[Ss]harper 131 | *.DotSettings.user 132 | 133 | # JustCode is a .NET coding add-in 134 | .JustCode 135 | 136 | # TeamCity is a build add-in 137 | _TeamCity* 138 | 139 | # DotCover is a Code Coverage Tool 140 | *.dotCover 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | 174 | # TODO: Un-comment the next line if you do not want to checkin 175 | # your web deploy settings because they may include unencrypted 176 | # passwords 177 | #*.pubxml 178 | *.publishproj 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/packages/* 184 | # except build/, which is used as an MSBuild target. 
185 | !**/packages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/packages/repositories.config 188 | # NuGet v3's project.json files produces more ignoreable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directory 201 | AppPackages/ 202 | BundleArtifacts/ 203 | 204 | # Visual Studio cache files 205 | # files ending in .cache can be ignored 206 | *.[Cc]ache 207 | # but keep track of directories ending in .cache 208 | !*.[Cc]ache/ 209 | 210 | # Others 211 | ClientBin/ 212 | [Ss]tyle[Cc]op.* 213 | ~$* 214 | *~ 215 | *.dbmdl 216 | *.dbproj.schemaview 217 | *.pfx 218 | *.publishsettings 219 | node_modules/ 220 | orleans.codegen.cs 221 | 222 | # RIA/Silverlight projects 223 | Generated_Code/ 224 | 225 | # Backup & report files from converting an old project file 226 | # to a newer Visual Studio version. Backup files are not needed, 227 | # because we have git ;-) 228 | _UpgradeReport_Files/ 229 | Backup*/ 230 | UpgradeLog*.XML 231 | UpgradeLog*.htm 232 | 233 | # SQL Server files 234 | *.mdf 235 | *.ldf 236 | 237 | # Business Intelligence projects 238 | *.rdl.data 239 | *.bim.layout 240 | *.bim_*.settings 241 | 242 | # Microsoft Fakes 243 | FakesAssemblies/ 244 | 245 | # GhostDoc plugin setting file 246 | *.GhostDoc.xml 247 | 248 | # Node.js Tools for Visual Studio 249 | .ntvs_analysis.dat 250 | 251 | # Visual Studio 6 build log 252 | *.plg 253 | 254 | # Visual Studio 6 workspace options file 255 | *.opt 256 | 257 | # Visual Studio LightSwitch build output 258 | **/*.HTMLClient/GeneratedArtifacts 259 | **/*.DesktopClient/GeneratedArtifacts 260 | **/*.DesktopClient/ModelManifest.xml 261 | **/*.Server/GeneratedArtifacts 262 | **/*.Server/ModelManifest.xml 263 | _Pvt_Extensions 264 | 265 | # LightSwitch generated files 266 | GeneratedArtifacts/ 267 | ModelManifest.xml 268 | 269 | # Paket dependency manager 270 | .paket/paket.exe 271 | 272 | # FAKE - F# Make 273 | .fake/ 274 | *.lai 275 | *.la 276 | *.a 277 | *.lib 278 | *.zip 279 | *.info 280 | *.dll 281 | *.so 282 | *.dylib 283 | *.mA_bin 284 | *.dat 285 | *.avi 286 | *.ogv 287 | *.asv 288 | *.code 289 | /tests/python_package_test/.pytest_cache/v/cache 290 | /tests/python_package_test/categorical.model 291 | /python-package/geo_test.py 292 | *.csv 293 | /python-package/.pytest_cache/v/cache/lastfailed 294 | /python-package/.pytest_cache/v/cache/nodeids 295 | /python-package/case_qq2019.py 296 | *.txt 297 | *.jpg 298 | /src/learn/discpy.py 299 | /src/learn/sparsipy.py 300 | /doc/Gradient boosting on adpative distrubutions.docx 301 | /python-package/litemort/桌面.lnk 302 | /python-package/LiteMORT_hyppo.py 303 | /python-package/shap_test.py 304 | *.pickle 305 | *.gz 306 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_hyppo.py: -------------------------------------------------------------------------------- 1 | #hyperparameter optimization 2 | from bayes_opt import BayesianOptimization 3 | from sklearn.model_selection import KFold, train_test_split 4 | from sklearn.metrics import mean_squared_error 5 | from litemort import * 6 | import numpy as np 7 | 8 | def hyparam_search(func_core,pds,n_init=5, n_iter=12): 9 | optimizer = BayesianOptimization(func_core, pds, random_state=7) 10 | optimizer.maximize(init_points=n_init, n_iter=n_iter) 11 |
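# maximize() first evaluates n_init randomly sampled points, then runs n_iter
# steps of Bayesian optimization; the best result found so far is kept in
# optimizer.max as a dict of the form {'target': ..., 'params': ...}.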
print(optimizer.max) 12 | for i, res in enumerate(optimizer.res): 13 | print("Iteration {}: \n\t{}".format(i, res)) 14 | input(f"......BayesianOptimization is OK......") 15 | return optimizer.max 16 | 17 | def _feat_select_core_(**kwargs): 18 | print(kwargs) 19 | feats=[] 20 | nFeat = len(kwargs) 21 | no=0 22 | feat_factor = np.zeros(nFeat) 23 | for k,v in kwargs.items(): 24 | feats.append(k) 25 | feat_factor[no]=v; no=no+1 26 | train_tmp = train_data[feats] 27 | print('Training with {} features'.format(train_tmp.shape[1])) 28 | x_train, x_val, y_train, y_val = train_test_split(train_tmp, train_target, test_size = 0.2, random_state = 42) 29 | feat_factor=feat_factor.astype(np.float32) 30 | param_mort["feat_factor"]=feat_factor 31 | mort = LiteMORT(param_mort).fit(x_train, y_train,eval_set=[(x_val, y_val)]) 32 | eval_pred = mort.predict(x_val) 33 | score = np.sqrt(mean_squared_error(eval_pred, y_val)) 34 | return -score 35 | 36 | #10/20/2019 In practice the results were poor; BayesianOptimization is not well suited to very large numbers of parameters 37 | def MORT_feat_bayesian_search(train,target,feat_fix,feat_select,n_init=5, n_iter=12): 38 | global train_data,train_target 39 | train_data,train_target = train,target 40 | print(f"train={train_data.shape} target={train_target.shape}") 41 | feat_useful=[] 42 | pds = {} 43 | for feat in feat_fix: 44 | pds[feat]=(1, 1) 45 | for feat in feat_select: 46 | pds[feat]=(0, 1) 47 | optimizer = BayesianOptimization(_feat_select_core_, pds, random_state=42) 48 | optimizer.maximize(init_points=n_init, n_iter=n_iter) 49 | print(optimizer.max) 50 | for i, res in enumerate(optimizer.res): 51 | print("Iteration {}: \n\t{}".format(i, res)) 52 | input(f"......BayesianOptimization is OK......") 53 | return feat_useful 54 | 55 | def MORT_feat_select_(dataX,dataY,feat_fix,feat_select,select_params,nMostSelect=10): 56 | nFeat=len(feat_fix)+len(feat_select) 57 | feats = [] 58 | no = 0 59 | feat_factor = np.zeros(nFeat) 60 | for feat in feat_fix: 61 | feats.append(feat) 62 | feat_factor[no] = 1 63 | no = no + 1 64 | for feat in feat_select: 65 | feats.append(feat) 66 | feat_factor[no] = 0 67 | no = no + 1 68 | data_tmp = dataX[feats] 69 | 70 | #select_params['learning_rate'] = select_params['learning_rate']*2 71 | #select_params['early_stopping_rounds'] = 100 72 | #select_params['verbose'] = 0 73 | 74 | print(f"======MORT_feat_select_ nFix={len(feat_fix)} nSelect={len(feat_select)} ") 75 | feat_useful_ = [] 76 | if True: 77 | for loop in range(nMostSelect): 78 | select_params["feat_factor"] = feat_factor 79 | if('split_idxs' in select_params): 80 | assert(len(select_params['split_idxs'])>0) 81 | tr_idx, val_idx=select_params['split_idxs'][0] 82 | y_train = dataY[tr_idx] 83 | x_train =data_tmp.iloc[tr_idx, :] 84 | x_val, y_val = data_tmp.iloc[val_idx, :], dataY[val_idx] 85 | else: 86 | x_train, x_val, y_train, y_val = train_test_split(data_tmp, dataY, test_size=0.2, random_state=42) 87 | cat_features = select_params['category_features'] if 'category_features' in select_params else None 88 | mort = LiteMORT(select_params).fit(x_train, y_train, eval_set=[(x_val, y_val)], categorical_feature=cat_features) 89 | feat_factor_1 = mort.params.feat_factor 90 | rank = np.argsort(feat_factor_1)[::-1] 91 | nAdd=0 92 | for no in rank: 93 | if feat_factor[no]==1: 94 | continue 95 | if feat_factor_1[no] > 0: 96 | feat_useful_.append(feats[no]); nAdd=nAdd+1 97 | print(f"___MORT_feat_select___@{loop}:\t{feats[no]}={feat_factor_1[no]:.5g}" ) 98 | feat_factor[no]=1 99 | if nAdd==0: 100 | print(f"___MORT_feat_select___@{loop} break out") 101 |
print(f"___MORT_feat_select___@{loop} feat_useful_={feat_useful_}") 102 | input(f"......MORT_feat_select_ is OK......") 103 | 104 | else: #original forward feature selection 105 | x_train, x_val, y_train, y_val = train_test_split(dataX[feat_fix], dataY, test_size = 0.2, random_state = 42) 106 | mort = LiteMORT(param_mort).fit(x_train, y_train, eval_set=[(x_val, y_val)]) 107 | predictions = mort.predict(x_val) 108 | rmse_score = np.sqrt(mean_squared_error(y_val, predictions)) 109 | print("RMSE baseline val score: ", rmse_score) 110 | best_score = rmse_score 111 | train_columns = list(dataX.columns[13:]) 112 | for num, i in enumerate(train_columns): 113 | train_tmp = dataX[feat_fix + feat_useful_ + [i]] 114 | x_train, x_val, y_train, y_val = train_test_split(train_tmp, dataY, test_size=0.2, random_state=42) 115 | mort = LiteMORT(param_mort).fit(x_train, y_train, eval_set=[(x_val, y_val)]) 116 | predictions = mort.predict(x_val) 117 | rmse_score = np.sqrt(mean_squared_error(y_val, predictions)) 118 | percent = (best_score-rmse_score) / best_score*100.0; 119 | if rmse_score < best_score: 120 | print(f'------ \"{i}\" is usefull {percent:.3g}% [{best_score:.7g}=>{rmse_score:.7g}]------') 121 | best_score = rmse_score 122 | feat_useful_.append(i) 123 | else: 124 | pass #rint('Column {} is not usefull'.format(i)) 125 | print(feat_useful_) 126 | return feat_useful_ -------------------------------------------------------------------------------- /python-package/case_higgs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | https://archive.ics.uci.edu/ml/datasets/HIGGS 3 | This is a classification problem to distinguish between a signal process which produces Higgs bosons and a background process which does not. 4 | The data has been produced using Monte Carlo simulations. The first 21 features (columns 2-22) are kinematic properties measured by the particle detectors in the accelerator. The last seven features are functions of the first 21 features; these are high-level features derived by physicists to help discriminate between the two classes. There is an interest in using deep learning methods to obviate the need for physicists to manually develop such features. Benchmark results using Bayesian Decision Trees from a standard physics package and 5-layer neural networks are presented in the original paper. The last 500,000 examples are used as a test set. 5 | 6 | https://github.com/Laurae2/boosting_tree_benchmarks/tree/master/data 7 | https://github.com/guolinke/boosting_tree_benchmarks/tree/master/data 8 | https://blog.bigml.com/2017/09/28/case-study-finding-higgs-bosons-with-deepnets/ 9 | 10 | 5/19/2019 需要确定是regression 或 binary classification 11 | 8/23/2019 subsample subfeature 似乎都没用(2000000测试) 12 | lesome_rows=2000000 iter=2000 auc=0.83775(1,1) auc=0.83847(0.8,1);auc=0.83618(0.8,0.5) 13 | 14 | ''' 15 | import lightgbm as lgb 16 | import time 17 | import sys 18 | import os 19 | import gc 20 | import pandas as pd 21 | import numpy as np 22 | from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold 23 | import pickle 24 | from litemort import * 25 | #from LiteMORT_EDA import * 26 | 27 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 28 | #isMORT = True 29 | model_type = 'mort' if isMORT else 'lgb' 30 | #some_rows= 200000 31 | #some_rows= 2000000 32 | some_rows= 10500000 33 | nTotal = 11000000 34 | nLastForTest = 500000 #The last 500,000 examples are used as a test set. 
35 | 36 | #some_rows=None 37 | 38 | def read_higgs_data(path): 39 | pkl_path = 'F:/Datasets/HIGGS_/higgs_data_{}.pickle'.format(some_rows) 40 | if os.path.isfile(pkl_path): 41 | print("====== Load pickle @{} ......".format(pkl_path)) 42 | with open(pkl_path, "rb") as fp: 43 | [X, y, X_test,y_test] = pickle.load(fp) 44 | else: 45 | assert(some_rows<=nTotal-nLastForTest) 46 | print("====== Read last {} examples as training set ......".format(some_rows)) 47 | df = pd.read_csv(path, nrows=some_rows,header=None) 48 | y=pd.Series(df.iloc[:,0]) 49 | X=df.iloc[:,1:] 50 | print("====== Read last {} examples as testing set ......".format(nLastForTest)) 51 | df = pd.read_csv(path, skiprows = nTotal-nLastForTest,nrows=nLastForTest, header=None) 52 | y_test = pd.Series(df.iloc[:, 0]) 53 | X_test = df.iloc[:,1:] 54 | del df 55 | gc.collect() 56 | print("====== Save pickle @{} ......".format(pkl_path)) 57 | with open(pkl_path, "wb") as fp: # Pickling 58 | pickle.dump([X, y, X_test,y_test], fp) 59 | print("====== read_higgs_data X={}, y={}, X_test={} ...... OK".format(X.shape, y.shape, X_test.shape)) 60 | return X,y,X_test 61 | 62 | X,y,X_test = read_higgs_data("F:/Datasets/HIGGS_/HIGGS.csv") 63 | #X = Unique_Expand(X) 64 | #X_test = Unique_Expand(X_test) 65 | num_rounds = 10001 66 | params = { 67 | "objective": "binary", 68 | "metric": "auc", #"binary_logloss" 69 | "adaptive":'weight', 70 | 'max_bin': 256, 71 | 'num_leaves': 64, 72 | 'learning_rate': 0.1, 73 | 'tree_learner': 'serial', 74 | 'task': 'train', 75 | 'is_training_metric': 'false', 76 | 'min_data_in_leaf': 512, 77 | #'min_sum_hessian_in_leaf': 100, 78 | #'bagging_fraction': 1,#0.2, 79 | 'subsample': 1, 'bagging_freq': 1, 80 | 'feature_fraction': 1, 81 | #'ndcg_eval_at': [1, 3, 5, 10], 82 | #'sparse_threshold': 1.0, 83 | 'n_estimators':num_rounds, 84 | 'early_stopping_rounds': 500, 85 | 'verbose':667, 86 | #'device': 'cpu' 87 | #'device': 'gpu', 88 | #'gpu_platform_id': 0, 89 | #'gpu_device_id': 0 90 | } 91 | n_fold = 5 92 | folds = KFold(n_splits=n_fold, shuffle=True, random_state=11) 93 | for fold_n, (train_index, valid_index) in enumerate(folds.split(X)): 94 | t0 = time.time() 95 | 96 | if type(X) == np.ndarray: 97 | X_train, X_valid = X[train_index], X[valid_index] 98 | y_train, y_valid = y[train_index], y[valid_index] 99 | else: 100 | X_train, X_valid = X.iloc[train_index], X.iloc[valid_index] 101 | y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] 102 | if False: 103 | mean = y_train.mean(); 104 | d_train = pd.concat([y_train, X_train], ignore_index=True, axis=1) 105 | print("X_train={}, y_train={} d_train={}".format(X_train.shape, y_train.shape, d_train.shape)) 106 | np.savetxt("D:/LightGBM-master/examples/regression/geo_test.csv", d_train, delimiter='\t') 107 | 108 | if model_type == 'mort': 109 | model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)]) 110 | #y_pred_valid = model.predict(X_valid) 111 | #y_pred = model.predict(X_test) 112 | 113 | if model_type == 'lgb': 114 | model = lgb.LGBMRegressor(**params, n_jobs=-1) 115 | model.fit(X_train, y_train, 116 | eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',verbose=5) 117 | model.booster_.save_model('geo_test_.model') 118 | #y_pred_valid = model.predict(X_valid) 119 | #y_pred = model.predict(X_test, num_iteration=model.best_iteration_) 120 | break 121 | 122 | input("loss is {} time={:.3g} model={}...".format(0,time.time()-t0,model_type)) 123 | sys.exit(-1) 124 | 125 | t0 = time.time() 126 | gbm = lgb.train(params, train_set=dtrain, 
num_boost_round=10, 127 | valid_sets=None, valid_names=None, 128 | fobj=None, feval=None, init_model=None, 129 | feature_name='auto', categorical_feature='auto', 130 | early_stopping_rounds=None, evals_result=None, 131 | verbose_eval=True, 132 | keep_training_booster=False, callbacks=None) 133 | t1 = time.time() 134 | 135 | print('cpu version elapse time: {}'.format(t1 - t0)) 136 | -------------------------------------------------------------------------------- /src/util/Float16.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | //extern short FloatToFloat16(float value); 5 | //extern float Float16ToFloat(short value); 6 | 7 | class Float16 8 | { 9 | protected: 10 | short mValue; 11 | 12 | short FloatToFloat16(float value) { 13 | short fltInt16; 14 | int fltInt32; 15 | memcpy(&fltInt32, &value, sizeof(float)); 16 | fltInt16 = ((fltInt32 & 0x7fffffff) >> 13) - (0x38000000 >> 13); 17 | fltInt16 |= ((fltInt32 & 0x80000000) >> 16); 18 | 19 | return fltInt16; 20 | } 21 | 22 | float Float16ToFloat(short fltInt16) const { 23 | int fltInt32 = ((fltInt16 & 0x8000) << 16); 24 | fltInt32 |= ((fltInt16 & 0x7fff) << 13) + 0x38000000; 25 | 26 | float fRet; 27 | memcpy(&fRet, &fltInt32, sizeof(float)); 28 | return fRet; 29 | } 30 | 31 | public: 32 | Float16(); 33 | Float16(float value); 34 | Float16(const Float16& value); 35 | 36 | operator float(); 37 | operator float() const; 38 | 39 | friend Float16 operator + (const Float16& val1, const Float16& val2); 40 | friend Float16 operator - (const Float16& val1, const Float16& val2); 41 | friend Float16 operator * (const Float16& val1, const Float16& val2); 42 | friend Float16 operator / (const Float16& val1, const Float16& val2); 43 | 44 | Float16& operator =(const Float16& val); 45 | Float16& operator +=(const Float16& val); 46 | Float16& operator -=(const Float16& val); 47 | Float16& operator *=(const Float16& val); 48 | Float16& operator /=(const Float16& val); 49 | Float16& operator -(); 50 | 51 | //https://codedocs.xyz/HipsterSloth/PSMoveService/__detail_8hpp_source.html 52 | union uif32 { 53 | float f; 54 | unsigned int i; 55 | }; 56 | 57 | //https://github.com/g-truc/glm/blob/0.9.5/glm/detail/type_half.inl 58 | static float GLM_toFloat32(const short& value, int flag = 0x0) { 59 | int s = (value >> 15) & 0x00000001; 60 | int e = (value >> 10) & 0x0000001f; 61 | int m = value & 0x000003ff; 62 | uif32 result; 63 | if (e == 0) { 64 | if (m == 0) { 65 | result.i = (unsigned int)(s << 31); 66 | return result.f; 67 | } 68 | else { 69 | // 70 | // Denormalized number -- renormalize it 71 | // 72 | while (!(m & 0x00000400)) { 73 | m <<= 1; 74 | e -= 1; 75 | } 76 | e += 1; 77 | m &= ~0x00000400; 78 | } 79 | } 80 | else if (e == 31) { 81 | if (m == 0) { 82 | // 83 | // Positive or negative infinity 84 | // 85 | result.i = (unsigned int)((s << 31) | 0x7f800000); 86 | return result.f; 87 | } else { 88 | // 89 | // Nan -- preserve sign and significand bits 90 | // 91 | uif32 result; 92 | result.i = (unsigned int)((s << 31) | 0x7f800000 | (m << 13)); 93 | return result.f; 94 | } 95 | } 96 | 97 | e = e + (127 - 15); 98 | m = m << 13; 99 | uif32 Result; 100 | Result.i = (unsigned int)((s << 31) | (e << 23) | m); 101 | return Result.f; 102 | }; 103 | }; 104 | 105 | /*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/ 106 | 107 | inline Float16::Float16() 108 | { 109 | } 110 | 111 | 
/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::Float16(float value){
	mValue = FloatToFloat16(value);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::Float16(const Float16 &value){
	mValue = value.mValue;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::operator float()
{
	return Float16ToFloat(mValue);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16::operator float() const
{
	return Float16ToFloat(mValue);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator =(const Float16& val)
{
	mValue = val.mValue;
	return *this;	// was missing: falling off the end of a non-void function is undefined behavior
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator +=(const Float16& val)
{
	*this = *this + val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator -=(const Float16& val)
{
	*this = *this - val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator *=(const Float16& val)
{
	*this = *this * val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator /=(const Float16& val)
{
	*this = *this / val;
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16& Float16::operator -()
{
	// note: unusually for unary minus, this negates *this in place
	*this = Float16(-(float)*this);
	return *this;
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/
/*+----+                                   Friends                                      +----+*/
/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator + (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 + (float)val2);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator - (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 - (float)val2);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator * (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 * (float)val2);
}

/*+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+*/

inline Float16 operator / (const Float16& val1, const Float16& val2)
{
	return Float16((float)val1 / (float)val2);
}

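/*
 * Editor's note (added): the FloatToFloat16/Float16ToFloat pair above is the
 * quick bit-shift round trip only; unlike GLM_toFloat32 it does not special-case
 * denormals, infinities, or NaN, so it is reliable only for normal-range values.
 */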
-------------------------------------------------------------------------------- /python-package/case_future_sales.py: --------------------------------------------------------------------------------
#https://www.kaggle.com/dhimananubhav/feature-engineering-xgboost

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
import time
import sys
import gc
import pickle
import random
from litemort import *

from bayes_opt import BayesianOptimization


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

isMORT = len(sys.argv) > 1 and sys.argv[1] == "mort"
isMORT = True  # hard-coded override of the CLI flag
alg = 'MORT' if isMORT else 'LGB'
#some_rows = 5000
some_rows = None
#data_root = '../input/'
data_root = "F:/Datasets/future_sales"

test = pd.read_csv(f'{data_root}/test.csv').set_index('ID')
data = pd.read_pickle(f'{data_root}/data.pkl')
if some_rows is not None:
    nMost = data.shape[0]
    random.seed(42)
    subset = random.sample(range(nMost), some_rows)
    data = data.iloc[subset, :].reset_index(drop=True)
    print('====== Some Samples ... data={}'.format(data.shape))

data = data[[
    'date_block_num',
    'shop_id',
    'item_id',
    'item_cnt_month',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    #'date_shop_type_avg_item_cnt_lag_1',
    #'date_shop_subtype_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    #'date_type_avg_item_cnt_lag_1',
    #'date_subtype_avg_item_cnt_lag_1',
    'delta_price_lag',
    'month',
    'days',
    'item_shop_last_sale',
    'item_last_sale',
    'item_shop_first_sale',
    'item_first_sale',
]]

X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)
print(f"X_train={X_train.shape} Y_train={Y_train.shape}")
print(f"X_valid={X_valid.shape} Y_valid={Y_valid.shape}")
print(f"X_test={X_test.shape} ")
del data
gc.collect()

params = {'num_leaves': 550, 'n_estimators': 1000, 'early_stopping_rounds': 20,
          'feature_fraction': 1, 'bagging_fraction': 1,
          'max_bin': 512,
          "adaptive": 'weight1',  # note: this setting had no effect, frustratingly
          #"learning_schedule":"adaptive",
          'max_depth':
10, 106 | 'min_child_weight': 300, #'min_data_in_leaf': 300, 107 | 'learning_rate': 0.1, 108 | 'objective': 'regression', 109 | 'boosting_type': 'gbdt', 110 | 'verbose': 1, 111 | 'metric': {'rmse'} 112 | } 113 | 114 | 115 | def hyparam_core(num_leaves, feature_fraction, bagging_fraction, max_depth, learning_rate, min_data_in_leaf,max_bin): 116 | param_1 = params 117 | param_1['verbose']=0 118 | param_1["num_leaves"] = int(round(num_leaves)) 119 | param_1['feature_fraction'] = max(min(feature_fraction, 1), 0) 120 | param_1['bagging_fraction'] = max(min(bagging_fraction, 1), 0) 121 | param_1['max_depth'] = int(round(max_depth)) 122 | param_1['learning_rate'] = learning_rate 123 | param_1['min_data_in_leaf'] = int(round(min_data_in_leaf)) 124 | param_1['max_bin'] = int(round(max_bin)) 125 | 126 | model = LiteMORT(param_1).fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)]) 127 | Y_pred = model.predict(X_valid).clip(0, 20) 128 | score = np.sqrt(mean_squared_error(Y_pred, Y_valid)) 129 | return -score 130 | 131 | if isMORT: 132 | if False: #BayesianOptimization 133 | pds = {'num_leaves': (547, 547), 134 | 'feature_fraction': (1, 1), 135 | 'bagging_fraction': (1, 1), 136 | 'max_depth': (10,10), 137 | 'learning_rate': (0.1, 0.1), 138 | 'min_data_in_leaf': (20, 20), 139 | 'max_bin': (128, 1024), 140 | } 141 | 142 | optimizer = BayesianOptimization(hyparam_core, pds, random_state=7) 143 | optimizer.maximize(init_points=5, n_iter=12) 144 | print(optimizer.max) 145 | for i, res in enumerate(optimizer.res): 146 | print("Iteration {}: \n\t{}".format(i, res)) 147 | input(f"......BayesianOptimization is OK......") 148 | 149 | model = LiteMORT(params).fit(X_train,Y_train,eval_set=[(X_valid, Y_valid)]) 150 | else: 151 | model = XGBRegressor( 152 | max_depth=8, 153 | n_estimators=1000, 154 | min_child_weight=300, 155 | colsample_bytree=0.8, 156 | subsample=0.8, 157 | eta=0.3, 158 | seed=42) 159 | 160 | model.fit( 161 | X_train, 162 | Y_train, 163 | eval_metric="rmse", 164 | eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 165 | verbose=True, 166 | early_stopping_rounds = 10) 167 | alg = 'xgboost' 168 | 169 | Y_pred = model.predict(X_valid).clip(0, 20) 170 | score = np.sqrt(mean_squared_error(Y_pred, Y_valid)) 171 | Y_test = model.predict(X_test).clip(0, 20) 172 | 173 | if not isMORT: 174 | plot_features(model, (10, 14)) 175 | 176 | path="" 177 | if some_rows is None: 178 | submission = pd.DataFrame({ 179 | "ID": test.index, 180 | "item_cnt_month": Y_test 181 | }) 182 | path = f'{data_root}/{alg}_[{score:.5g}].csv' 183 | submission.to_csv(path, index=False) 184 | 185 | # save predictions for an ensemble 186 | #pickle.dump(Y_pred, open(f'{data_root}/xgb_train.pickle', 'wb')) 187 | #pickle.dump(Y_test, open(f'{data_root}/xgb_test.pickle', 'wb')) 188 | input(f"......Save submit @{path}......") 189 | -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_ERA.py: -------------------------------------------------------------------------------- 1 | #Exploratory result analysis 2 | import math 3 | import seaborn as sns; sns.set(style="ticks", color_codes=True) 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | import numpy as np 7 | import random 8 | import gc 9 | import time 10 | from sklearn.metrics import roc_auc_score 11 | from scipy.stats import rankdata 12 | 13 | def auc_u_test(vec, len_A, len_B): 14 | rank_value = rankdata(vec) 15 | rank_sum = sum(rank_value[0:len_A]) 16 | u_value = rank_sum - (len_A*(len_A+1))/2 17 | auc = u_value / 
(len_A * len_B) 18 | if auc < 0.50: 19 | auc = 1.0 - auc 20 | return(auc) 21 | 22 | # from https://gist.github.com/mattsgithub/dedaa017adc1f30d9833175a5c783221 23 | def roc_auc_alternative(y_true, y_score): 24 | # Total number of observations 25 | N = y_true.shape[0] 26 | I = np.arange(1, N + 1) 27 | N_pos = np.sum(y_true) 28 | N_neg = N - N_pos 29 | I = y_score.argsort()[::-1][:N] 30 | y_pred = y_true[I] 31 | I = np.arange(1, N + 1) 32 | return 1. + ((N_pos + 1.) / (2 * N_neg)) - (1. / (N_pos * N_neg)) * I.dot(y_pred) 33 | 34 | def Robert_M_Johnson_test( ): 35 | np.random.seed(42) 36 | N = np.arange(start=20, stop=1000000, step=10000) 37 | 38 | t_sklearn = [] 39 | t_dot = [] 40 | for n in N: 41 | N_pos = np.random.randint(low=1, high=n + 1) 42 | y_true = np.concatenate((np.ones(N_pos), np.zeros(n - N_pos))) 43 | random.shuffle(y_true) 44 | y_true = np.array([0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.]) 45 | y_score = np.random.random(size=n) 46 | 47 | # Timeit 48 | t0 = time.time() 49 | y1 = roc_auc_score(y_true=y_true, y_score=y_score) 50 | t1 = time.time() 51 | t_sklearn.append(t1 - t0) 52 | 53 | # Timeit 54 | t0 = time.time() 55 | y2 = roc_auc_alternative(y_true=y_true, y_score=y_score) 56 | t1 = time.time() 57 | t_dot.append(t1 - t0) 58 | 59 | # Proves their equality 60 | # Raises error if not almost equal (up to 14 decimal places) 61 | np.testing.assert_almost_equal(y1, y2, decimal=14) 62 | 63 | class Feature_Importance(object): 64 | def __init__(self, columns): 65 | self.columns = columns 66 | self.df = pd.DataFrame() 67 | 68 | def OnFold(self,fold,f_importance): 69 | fold_importance_df = pd.DataFrame() 70 | fold_importance_df["Feature"] = self.columns 71 | fold_importance_df["importance"] = f_importance 72 | fold_importance_df["fold"] = fold 73 | self.df = pd.concat([self.df, fold_importance_df], axis=0) 74 | 75 | def SomePlot(self): 76 | cols = (self.df[["Feature", "importance"]].groupby("Feature").mean() 77 | .sort_values(by="importance", ascending=False)[:32].index) 78 | best_features = self.df.loc[self.df.Feature.isin(cols)] 79 | plt.figure(figsize=(14, 25)) 80 | sns.barplot(x="importance", y="Feature", 81 | data=best_features.sort_values(by="importance", ascending=False)) 82 | plt.title('LightGBM Features (avg over folds)') 83 | plt.tight_layout() 84 | plt.savefig('lgbm_importances.png') 85 | plt.show(block=True) 86 | 87 | def ERA_pair(features,train,test_0,predict_0, target_0): 88 | predict = pd.Series(predict_0) 89 | target = target_0.reset_index(drop=True) 90 | err = (predict-target).abs() 91 | a0,a1=err.min(),err.max() 92 | thrsh = a0+(a1-a0)/10 93 | idx_1 = err.index[err > thrsh] 94 | idx_2 = err.index[err <= thrsh] 95 | test = test_0.reset_index(drop=True) 96 | df1 = test[test.index.isin(idx_1)][features] 97 | df2 = test[test.index.isin(idx_2)][features] 98 | df1.fillna(0.0, inplace=True) 99 | df2.fillna(0.0, inplace=True) 100 | g = sns.pairplot(df1) 101 | g.fig.suptitle("df={} err=[{:.3g}-{:.3g}]".format(df1.shape,thrsh,a1)) 102 | g = sns.pairplot(df2) 103 | g.fig.suptitle("df={} err=[{:.3g}-{:.3g}]".format(df2.shape, a0,thrsh)) 104 | plt.show() 105 | plt.show(block=True) 106 | #plt.close() 107 | del df1,df2 108 | gc.collect() 109 | return 110 | 111 | # 'h_mean','log_mean'必须不是0 112 | def df_mix_(df,cols,alg='exp_mean'): 113 | mix_lg, mix_hm = 0, 0 114 | if alg=='log_mean': 115 | gc.collect() 116 | elif alg == 'exp_mean': 117 | for col in cols: 118 | mix_hm += np.exp(df[col]) 119 | mix_hm = np.log(mix_hm) 120 | df['mix'] = mix_hm 121 | 
        gc.collect()
    elif alg == 'h_mean':
        for col in cols:
            mix_hm += 1 / df[col]
        mix_hm = 1 / mix_hm
        df['mix'] = mix_hm
        gc.collect()
    else:
        df['mix'] = df[cols].max(axis=1)
    return df['mix']

def cys_mix_ID_TRAGET_(ID, TARGET, path, files, alg='h_mean'):
    #path='H:/Project/fraud_click/bagging/'
    #files = [path+'{{{[H]_7_0.05.txt}}}_cys_.csv',path+'{{{[H]_8_eta.txt}}}_cys_.csv',path+'{{{[H]_9_eta.txt}}}_cys_.csv']
    mix_lg, mix_hm = 0, 0
    #alg='log_mean'       # works very well, surprisingly
    #alg='h_mean'         # harmonic mean
    # alg='max_out'       #
    # alg='log_rank_mean' # worth trying
    out = '{}[{}]_BAG{}.csv'.format(path, alg, len(files))
    df = pd.DataFrame()
    cols = []
    if alg == 'log_mean':
        for idx, fp in enumerate(files):
            print('====== Load {}...'.format(fp))
            tmp = pd.read_csv(fp, nrows=10000)  # , nrows=10000
            mix_lg += np.log(tmp.TARGET)
            df[ID] = tmp[ID]
        mix_lg = np.exp(mix_lg / len(files))
        df[TARGET] = mix_lg
        del tmp
        gc.collect()
    elif alg == 'h_mean':
        for idx, fp in enumerate(files):
            print('====== Load {}...'.format(fp))
            tmp = pd.read_csv(fp)  # , nrows=10000
            mix_hm += 1 / (tmp.TARGET)
            df[ID] = tmp[ID]
        mix_hm = 1 / mix_hm
        df[TARGET] = mix_hm
        del tmp
        gc.collect()
    else:
        df = pd.DataFrame()
        cols = []
        for idx, fp in enumerate(files):
            print('====== Load {}...'.format(fp))
            tmp = pd.read_csv(fp)  # , nrows=10000
            title = 'att_{}'.format(idx)
            cols.append(title)
            df[title] = tmp[TARGET]
        df[ID] = tmp[ID]
        df[TARGET] = df[cols].max(axis=1)
        out = path + '{{{' + 'maxout' + '}}}_bag.csv'
    nN = df.isnull().sum().sum()
    print('======{} out={} shape={},NAN={} ...\n{}'.format(alg, out, df.shape, nN, df.head()))
    df[[ID, TARGET]].to_csv(out, index=False, float_format='%.8f')
    print( '======{} ... 
OK\n'.format(alg,out,df.shape,nN) ) 179 | 180 | if __name__=='__main__': 181 | Robert_M_Johnson_test() -------------------------------------------------------------------------------- /vs/LiteMORT/LiteMORT.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | {e17e50fe-dad5-49cb-9b89-e0f946d8426b} 18 | 19 | 20 | {163fefa2-bee8-49cb-8d14-10a13710ebf3} 21 | 22 | 23 | {4ccc43f1-1162-462e-b590-9dc7aa994b7a} 24 | 25 | 26 | {fd9e0871-c601-475d-b095-e87773b68644} 27 | 28 | 29 | {c2f85e65-ddf3-484e-82ce-966e3214f827} 30 | 31 | 32 | {52705ab1-350d-4f26-a4db-89374027eba6} 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 源文件 41 | 42 | 43 | 源文件\data_fold 44 | 45 | 46 | 源文件\data_fold 47 | 48 | 49 | 源文件\data_fold 50 | 51 | 52 | 源文件\learn 53 | 54 | 55 | 源文件\learn 56 | 57 | 58 | 源文件\tree 59 | 60 | 61 | 源文件\tree 62 | 63 | 64 | 源文件\tree 65 | 66 | 67 | 源文件\tree 68 | 69 | 70 | 源文件\python 71 | 72 | 73 | 源文件\EDA 74 | 75 | 76 | 源文件\EDA 77 | 78 | 79 | 源文件\util 80 | 81 | 82 | 源文件\util 83 | 84 | 85 | 源文件\learn 86 | 87 | 88 | 源文件\util 89 | 90 | 91 | 源文件\data_fold 92 | 93 | 94 | 95 | 96 | 源文件\data_fold 97 | 98 | 99 | 源文件\data_fold 100 | 101 | 102 | 源文件\data_fold 103 | 104 | 105 | 源文件\data_fold 106 | 107 | 108 | 源文件\data_fold 109 | 110 | 111 | 源文件\data_fold 112 | 113 | 114 | 源文件\data_fold 115 | 116 | 117 | 源文件\data_fold 118 | 119 | 120 | 源文件\data_fold 121 | 122 | 123 | 源文件\data_fold 124 | 125 | 126 | 源文件\learn 127 | 128 | 129 | 源文件\tree 130 | 131 | 132 | 源文件\tree 133 | 134 | 135 | 源文件\tree 136 | 137 | 138 | 源文件\tree 139 | 140 | 141 | 源文件\python 142 | 143 | 144 | 源文件\util 145 | 146 | 147 | 源文件\util 148 | 149 | 150 | 源文件\util 151 | 152 | 153 | 源文件\util 154 | 155 | 156 | 源文件\util 157 | 158 | 159 | 头文件 160 | 161 | 162 | 源文件\EDA 163 | 164 | 165 | 源文件\EDA 166 | 167 | 168 | 源文件\util 169 | 170 | 171 | 源文件\learn 172 | 173 | 174 | 源文件\util 175 | 176 | 177 | 源文件\data_fold 178 | 179 | 180 | 源文件\util 181 | 182 | 183 | 源文件\data_fold 184 | 185 | 186 | 源文件\data_fold 187 | 188 | 189 | -------------------------------------------------------------------------------- /python-package/case_earthquake.py: -------------------------------------------------------------------------------- 1 | # https://www.kaggle.com/tocha4/lanl-master-s-approach 2 | 3 | import numpy as np # linear algebra 4 | import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) 5 | import scipy as sc 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import gc 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | warnings.simplefilter(action='ignore', category=FutureWarning) 12 | from tqdm import tqdm_notebook 13 | import datetime 14 | import time 15 | import random 16 | from joblib import Parallel, delayed 17 | 18 | 19 | import lightgbm as lgb 20 | from tensorflow import keras 21 | from gplearn.genetic import SymbolicRegressor 22 | #from catboost import Pool, CatBoostRegressor 23 | from litemort import * 24 | from sklearn.pipeline import Pipeline 25 | from sklearn.preprocessing import StandardScaler 26 | from sklearn.metrics import mean_absolute_error,mean_squared_error 27 | from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV 28 | from sklearn.feature_selection import RFECV, SelectFromModel 29 | import os 30 | import sys 31 | import pickle 32 | from sklearn.linear_model import LinearRegression, Ridge 33 | from sklearn.tree import DecisionTreeRegressor 34 | from sklearn.svm import NuSVR, SVR 35 | from sklearn.kernel_ridge import KernelRidge 36 | from sklearn.ensemble import AdaBoostRegressor 37 | from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 38 | 39 | today = datetime.date.today().strftime('%m%d') 40 | 41 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 42 | #isMORT = True 43 | #some_rows=3000 44 | some_rows=None 45 | model_type='mort' if isMORT else 'lgb' 46 | nVerbose = 500 47 | pkl_path = 'G:/kaggle/Earthquake/data/anton_2_{}.pickle'.format(some_rows) 48 | pkl_path = 'G:/kaggle/Earthquake/data/anton_cys0_{}.pickle'.format(some_rows) 49 | eval_metric='l1' 50 | min_error = mean_squared_error if eval_metric=='l1' else mean_absolute_error 51 | params = { 52 | 'n_estimators':50000, #减少n_estimators 并不能控制overfit 53 | 'early_stopping_rounds': 200, 54 | 'num_leaves': 256, #128 55 | #'max_bin': 64, 56 | 'min_data_in_leaf': 32, #79 57 | 'objective': 'tweedie', #'regression', 58 | 'max_depth': -1, 59 | 'learning_rate': 0.01, 60 | #"boosting": "gbdt", 61 | "bagging_freq": 5, 62 | "bagging_fraction": 1,#0.8126672064208567, #0.8126672064208567, 63 | "bagging_seed": 11, 64 | "metric": 'mae', 65 | "verbosity": nVerbose, 66 | #'reg_alpha': 0.1302650970728192, 67 | #'reg_lambda': 0.3603427518866501, 68 | 'colsample_bytree': 0.05 69 | } 70 | print("params=\n{}\n".format(params)) 71 | submission = pd.read_csv('G:/kaggle/Earthquake/input/sample_submission.csv') 72 | 73 | def Load_MoreDatas(paths): 74 | train_s=[] 75 | y_s=[] 76 | for path,nFile in paths: 77 | for i in range(nFile): 78 | path_X,path_y="{}/train_X_features_{}.csv".format(path,i+1),"{}/train_y_{}.csv".format(path,i+1) 79 | X_ = pd.read_csv(path_X) 80 | y_ = pd.read_csv(path_y, index_col=False, header=None) 81 | train_s.append(X_) 82 | y_s.append(y_) 83 | print("X_[{}]@{}\ny_[{}]@{}".format(X_.shape,path_X,y_.shape,path_y)) 84 | if len(train_s)>0: 85 | train_X = pd.concat(train_s, axis=0) 86 | y = pd.concat(y_s, axis=0) 87 | train_X = train_X.reset_index(drop=True) 88 | y = y.reset_index(drop=True) 89 | print("Load_MoreDatas X_[{}] y_[{}]".format(train_X.shape, y.shape)) 90 | return train_X,y 91 | 92 | if os.path.isfile(pkl_path): 93 | print("\n======load pickle file from {} ...".format(pkl_path)) 94 | with open(pkl_path, "rb") as fp: # Pickling 95 | [train_X, test_X, train_y] = pickle.load(fp) 96 | if some_rows is not None: 97 | train_X = train_X[:some_rows] 98 | test_X = test_X[:some_rows] 99 | train_y = train_y[:some_rows] 
100 | print("\n======train_X={} test_X={} train_y={} \n".format(train_X.shape, test_X.shape, train_y.shape)) 101 | else: 102 | #train_X_2,y_2 = Load_MoreDatas([('G:/kaggle/Earthquake/data/cys/15000', 14), 103 | # ('G:/kaggle/Earthquake/data/cys/17000', 15)]) 104 | train_X_0 = pd.read_csv("G:/kaggle/Earthquake/data/train_X_features_865_0.csv") 105 | train_X_1 = pd.read_csv("G:/kaggle/Earthquake/data/train_X_features_865_1.csv") 106 | y_0 = pd.read_csv("G:/kaggle/Earthquake/data/train_y_0.csv", index_col=False, header=None) 107 | y_1 = pd.read_csv("G:/kaggle/Earthquake/data/train_y_1.csv", index_col=False, header=None) 108 | train_X = pd.concat([train_X_0, train_X_1], axis=0) 109 | y = pd.concat([y_0, y_1], axis=0) 110 | 111 | train_X = train_X.reset_index(drop=True) 112 | print(train_X.shape) 113 | print(train_X.head()) 114 | 115 | y = y.reset_index(drop=True) 116 | print(y[0].shape) 117 | train_y = pd.Series(y[0].values) 118 | test_X = pd.read_csv("G:/kaggle/Earthquake/data/test_X_features_10.csv") 119 | scaler = StandardScaler() 120 | train_columns = train_X.columns 121 | 122 | train_X[train_columns] = scaler.fit_transform(train_X[train_columns]) 123 | test_X[train_columns] = scaler.transform(test_X[train_columns]) 124 | with open(pkl_path, "wb") as fp: # Pickling 125 | pickle.dump([train_X, test_X, train_y], fp) 126 | print("Save pickle file at {} train_X={} test_X={} train_y={}".format(pkl_path,train_X.shape, test_X.shape, train_y.shape)) 127 | sys.exit(-2) 128 | 129 | train_columns = train_X.columns 130 | n_fold = 5 #n_fold=10 只是增加了过拟合,莫名其妙 131 | folds = KFold(n_splits=n_fold, shuffle=True, random_state=42) 132 | 133 | oof = np.zeros(len(train_X)) 134 | train_score = [] 135 | fold_idxs = [] 136 | # if PREDICTION: 137 | predictions = np.zeros(len(test_X)) 138 | 139 | feature_importance_df = pd.DataFrame() 140 | #run model 141 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X,train_y.values)): 142 | t0=time.time() 143 | strLog = "fold {}".format(fold_) 144 | print(strLog) 145 | fold_idxs.append(val_idx) 146 | fold_importance_df = pd.DataFrame() 147 | fold_importance_df["Feature"] = train_columns 148 | 149 | X_train, X_valid = train_X[train_columns].iloc[trn_idx], train_X[train_columns].iloc[val_idx] 150 | y_train, y_valid = train_y.iloc[trn_idx], train_y.iloc[val_idx] 151 | if model_type == 'mort': 152 | params['objective'] = 'regression' 153 | # model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)]) 154 | model = LiteMORT(params).fit_1(X_train, y_train, eval_set=[(X_valid, y_valid)]) 155 | if model_type == 'cat': 156 | model = CatBoostRegressor(n_estimators=25000, verbose=-1, objective="MAE", loss_function="MAE", boosting_type="Ordered", task_type="GPU") 157 | model.fit(X_tr, 158 | y_tr, 159 | eval_set=[(X_val, y_val)], 160 | # eval_metric='mae', 161 | verbose=2500, 162 | early_stopping_rounds=500) 163 | if model_type == 'lgb': 164 | model = lgb.LGBMRegressor(**params, n_jobs=-1)#n_estimators=50000, 165 | model.fit(X_train, y_train, 166 | eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae', 167 | verbose=nVerbose, early_stopping_rounds=200) # 168 | fold_importance_df["importance"] = model.feature_importances_[:len(train_columns)] 169 | fold_importance_df["fold"] = fold_ + 1 170 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 171 | 172 | oof[val_idx] = model.predict(X_valid) 173 | fold_score = mean_absolute_error(oof[val_idx], y_valid) 174 | print("{}\tscore={:.4g} 
time={:.4g}".format(strLog,fold_score,time.time()-t0)) 175 | 176 | #predictions 177 | predictions += model.predict(test_X[train_columns]) / folds.n_splits 178 | train_score.append(fold_score) 179 | 180 | cv_score = mean_absolute_error(train_y, oof) 181 | print(f"\n======After {n_fold} score = {cv_score:.3f}, CV_fold = {np.mean(train_score):.3f} | {np.std(train_score):.3f}", end=" ") 182 | 183 | 184 | 185 | 186 | submission["time_to_failure"] = predictions 187 | submission.to_csv(f'G:/kaggle/Earthquake/result/{model_type}_{today}_[{cv_score:.3f},{np.std(train_score):.3f}].csv', index=False) 188 | submission.head() -------------------------------------------------------------------------------- /tests/python_package_test/test_sklearn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: skip-file 3 | import math 4 | import os 5 | import unittest 6 | 7 | from litemort import (LiteMORT,Mort_Preprocess) 8 | import lightgbm as lgb 9 | import numpy as np 10 | from sklearn.base import clone 11 | from sklearn import preprocessing 12 | from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, 13 | load_iris, load_svmlight_file) 14 | from sklearn.externals import joblib 15 | from sklearn.metrics import log_loss, mean_squared_error 16 | from sklearn.model_selection import GridSearchCV, train_test_split 17 | from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest, 18 | check_parameters_default_constructible) 19 | isMORT=True 20 | 21 | try: 22 | from sklearn.utils.estimator_checks import check_no_fit_attributes_set_in_init 23 | sklearn_at_least_019 = True 24 | except ImportError: 25 | sklearn_at_least_019 = False 26 | 27 | 28 | def multi_error(y_true, y_pred): 29 | return np.mean(y_true != y_pred) 30 | 31 | 32 | def multi_logloss(y_true, y_pred): 33 | return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) 34 | 35 | 36 | class TestSklearn(unittest.TestCase): 37 | 38 | def test_binary_breast(self): 39 | params = { 40 | "objective": "binary", "metric": "logloss",'early_stop': 5, 'num_boost_round': 50, 41 | "verbosity": 1, 42 | } 43 | X, y = load_breast_cancer(True) 44 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) 45 | if isMORT: 46 | mort = LiteMORT(params) 47 | mort.fit(X_train, y_train, eval_set=[(X_test, y_test)], params=params) 48 | result = mort.predict(X_test) 49 | ret = log_loss(y_test, mort.predict_proba(X_test)) 50 | else: 51 | gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) 52 | gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) 53 | result = gbm.predict(X_test) 54 | ret = log_loss(y_test, gbm.predict_proba(X_test)) 55 | self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1],places=5) 56 | self.assertLess(ret, 0.15) 57 | 58 | def ttest_binary_digits(self): 59 | from sklearn.datasets import load_digits 60 | from sklearn.model_selection import KFold 61 | rng = np.random.RandomState(1994) 62 | params = { 63 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50, 64 | "verbosity": 1, 65 | } 66 | digits = load_digits(2) 67 | y = digits['target'] 68 | X = digits['data'] 69 | kf = KFold(n_splits=2, shuffle=True, random_state=rng) 70 | for train_index, test_index in kf.split(X, y): 71 | #xgb_model = cls(random_state=42).fit(X[train_index], y[train_index]) 72 | #xgb_model.predict(X[test_index]) 73 | mort = 
LiteMORT(params).fit(X[train_index], y[train_index]) 74 | preds = mort.predict(X[test_index]) 75 | labels = y[test_index] 76 | err = sum(1 for i in range(len(preds)) 77 | if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) 78 | assert err < 0.1 79 | 80 | 81 | def ttest_regression(self): 82 | params = { 83 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 84 | } 85 | X, y = load_boston(True) 86 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) 87 | if isMORT: 88 | mort = LiteMORT(params) 89 | mort.fit(X_train, y_train, eval_set=[(X_test, y_test)], params=params) 90 | ret = mean_squared_error(y_test, mort.predict(X_test)) 91 | else: 92 | gbm = lgb.LGBMRegressor(n_estimators=50, silent=True) 93 | gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) 94 | ret = mean_squared_error(y_test, gbm.predict(X_test)) 95 | self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5) 96 | self.assertLess(ret, 16) 97 | 98 | def ttest_regression_boston_housing(self): 99 | rng = np.random.RandomState(1994) 100 | params = { 101 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 102 | } 103 | from sklearn.metrics import mean_squared_error 104 | from sklearn.datasets import load_boston 105 | from sklearn.model_selection import KFold 106 | params = { 107 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 108 | } 109 | boston = load_boston() 110 | y = boston['target'] 111 | X = boston['data'] 112 | kf = KFold(n_splits=2, shuffle=True, random_state=rng) 113 | for train_index, test_index in kf.split(X, y): 114 | #xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) 115 | mort = LiteMORT(params) 116 | mort.fit(X[train_index], y[train_index], params=params) 117 | preds = mort.predict(X[test_index]) 118 | labels = y[test_index] 119 | assert mean_squared_error(preds, labels) < 25 120 | 121 | 122 | #@unittest.skipIf(not litemort.combat.PANDAS_INSTALLED, 'pandas is not installed') 123 | def test_pandas_categorical(self): 124 | params = { #需要更详细的的测试 125 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50, 126 | "verbosity": 1, 127 | } 128 | import pandas as pd 129 | X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str 130 | "B": np.random.permutation([1, 2, 3] * 100), # int 131 | "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float 132 | "D": np.random.permutation([True, False] * 150)}) # bool 133 | y = np.random.permutation([0, 1] * 150) 134 | X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), 135 | "B": np.random.permutation([1, 3] * 30), 136 | "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), 137 | "D": np.random.permutation([True, False] * 30)}) 138 | if True: 139 | X, X_test = Mort_Preprocess.OrdinalEncode_(X,X_test) 140 | for col in ["A", "B", "C", "D"]: 141 | X[col] = X[col].astype('category') 142 | X_test[col] = X_test[col].astype('category') 143 | #trn_data = lgb.Dataset(X, label=y) 144 | 145 | if isMORT: 146 | mort0 = LiteMORT(params).fit(X, y) 147 | pred0 = list(mort0.predict(X_test)) 148 | mort1 = LiteMORT(params).fit(X, y, categorical_feature=[0]) 149 | pred1 = list(mort1.predict(X_test)) 150 | mort2 = LiteMORT(params).fit(X, y, categorical_feature=['A']) 151 | pred2 = list(mort2.predict(X_test)) 152 | mort3 = LiteMORT(params).fit(X, y, categorical_feature=['A', 'B', 
'C', 'D'])
            pred3 = list(mort3.predict(X_test))
        else:
            clf = lgb.sklearn.LGBMClassifier()
            gbm_ = clf.fit(X, y)
            gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
            pred0 = list(gbm0.predict(X_test))
            gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
            pred1 = list(gbm1.predict(X_test))
            gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
            pred2 = list(gbm2.predict(X_test))
            gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
            pred3 = list(gbm3.predict(X_test))
            gbm3.booster_.save_model('categorical.model')
            gbm4 = lgb.Booster(model_file='categorical.model')
            pred4 = list(gbm4.predict(X_test))
            pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
            np.testing.assert_almost_equal(pred_prob, pred4)
        input("...")
        #np.testing.assert_almost_equal(pred0, pred1)
        #np.testing.assert_almost_equal(pred0, pred2)
        #np.testing.assert_almost_equal(pred0, pred3)
-------------------------------------------------------------------------------- /python-package/lgb_kim.py: --------------------------------------------------------------------------------
# A good test case: X_train_0=(159999, 200) y_train=(159999,)......
# https://www.kaggle.com/chocozzz/santander-lightgbm-baseline-lb-0-899
# http://www.stat.ucdavis.edu/~chohsieh/teaching/STA141C_Spring2017/final_project_proposal.pdf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
import gc
import time
import sys
import datetime
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
from sklearn import metrics
from litemort import *

isMORT = len(sys.argv) > 1 and sys.argv[1] == "mort"
#isMORT = True
#some_rows=10000
some_rows = None
pick_samples = None
#pick_samples=1500
early_stop = 500  # 0.898 -> 0.899
max_bin = 256     # raised from 119 up toward 1280
lr = 0.02         # 0.01 improves slightly over 0.02: 0.89826 -> 0.89842
nFold, nLeaves = 5, 10
n_round = 35000
min_child = 32    # 11 (0.89830) -> 32 (0.89882)
#x_feat,x_sub=1,0.15
x_feat, x_sub = 0.15, 0.3
#x_feat,x_sub=1,1; nLeaves=2;n_round=5   # for quick testing only
#n_round,nLeaves=50,2

print('argv={}\nsome_rows={} pick_samples={}'.format(sys.argv, some_rows, pick_samples))
plt.style.use('seaborn')
sns.set(font_scale=1)
pd.set_option('display.max_columns', 500)

train_df = pd.read_csv("E:/kaggle/Santander/input/train.csv", nrows=some_rows)
test_df = pd.read_csv("E:/kaggle/Santander/input/test.csv", nrows=some_rows)

features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
target = train_df['target']
cat_cols = []
if True:
    #see_all_2(train_df, test_df, features, target, [0, 256, 1024])
    #plot_binary_dist(train_df, test_df, ['var_81','var_68','var_139'],bins=1024) #','var_139','var_12','var_53','var_110'
    #[train_df,test_df],features=feat_extend([train_df,test_df],features)
    print("")
else:
    #[train_df,test_df],features=feat_extend([train_df,test_df],features)
may_cols=['var_68','var_6','var_108','var_13','var_33','var_146','var_21','var_80','var_139','var_81'] 60 | may_cols=['var_68'] #var_68 is date? 61 | #may_cols = features 62 | #[train_df,test_df],features,cat_cols=df2category_hisogram([train_df,test_df],features,may_cols) 63 | #train_df,test_df,features,cat_cols = df2category_rf(train_df,target,test_df,features) 64 | 65 | from sklearn.metrics import roc_auc_score, roc_curve 66 | #Target Encoding 67 | TE_folds, TE_inner_folds=10,5 68 | if True: 69 | for var_name in cat_cols: 70 | #train_df, test_df, feat_T = TE_cross(5, 2, train_df, 'target', test_df, var_name) 71 | train_df, test_df, feat_T = TE_expm(train_df, 'target', test_df, var_name) 72 | features.append(feat_T) 73 | 74 | 75 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019) 76 | oof = np.zeros(len(train_df)) 77 | predictions = np.zeros(len(test_df)) 78 | feature_importance_df = pd.DataFrame() 79 | 80 | start = time.time() 81 | 82 | param = { 83 | 'num_leaves': nLeaves, 84 | 'num_round':n_round, 85 | #'num_leaves': 32, 86 | #'max_bin': 119, 87 | 'max_bin': max_bin, 88 | #"adaptive":'weight', 89 | 'min_data_in_leaf': min_child, 90 | 'learning_rate': lr, 91 | #'learning_rate': 0.5, 92 | 'min_sum_hessian_in_leaf': 0.00245, 93 | 'bagging_fraction': x_sub, 94 | 'bagging_freq': 5, 95 | 'feature_fraction': x_feat, 96 | 'lambda_l1': 4.972, 97 | 'lambda_l2': 2.276, 98 | 'min_gain_to_split': 0.65, 99 | 'max_depth': 14, 100 | 'save_binary': True, 101 | 'seed': 1337, 102 | 'feature_fraction_seed': 1337, 103 | 'bagging_seed': 1337, 104 | 'drop_seed': 1337, 105 | 'data_random_seed': 1337, 106 | 'objective': 'binary', 107 | 'boosting_type': 'gbdt', 108 | 'verbose': 1, 109 | 'metric': 'auc', 110 | #'metric': 'auc', 111 | 'is_unbalance': True, 112 | 'boost_from_average': False, 113 | } 114 | N_min = 100 # N_min 越大,regularization效果越强 smoothing term, minimum sample size, if sample size is less than N_min, add up to N_min 115 | # 116 | 117 | alg="LGBMRegressor" 118 | print("LightGBM training... 
train={} test={} \nparam={}".format(train_df.shape,test_df.shape,param)) 119 | features_0=features.copy() 120 | for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_df.values, target.values)): 121 | t0=time.time() 122 | print("fold n°{}".format(fold_)) 123 | num_round = 10000 124 | features = features_0.copy() 125 | X_train = train_df.iloc[trn_idx][features].astype(np.float32) 126 | Y_train = target.iloc[trn_idx].astype(np.double) 127 | X_test = train_df.iloc[val_idx][features].astype(np.float32) 128 | Y_test = target.iloc[val_idx].astype(np.double) 129 | if isMORT: 130 | #mort = LiteMORT(param).fit(X_train, Y_train, eval_set=[(X_test, Y_test)]) 131 | mort = LiteMORT(param).fit_1(X_train, Y_train, eval_set=[(X_test, Y_test)]) 132 | oof[val_idx] = mort.predict_raw(X_test) 133 | fold_score = roc_auc_score(Y_test, oof[val_idx]) 134 | #print("\nFold ", fold_, " score: ", fold_score) 135 | predictions += mort.predict_raw(test_df[features]) / 5 136 | else: 137 | if alg=="LGBMRegressor": 138 | dev=X_train 139 | val=X_test 140 | target_col='target_col'; dev[target_col]=Y_train 141 | clf = lgb.LGBMRegressor(num_boost_round=num_round, early_stopping_rounds=early_stop,**param) 142 | if True: 143 | print("features={} X_train={} Y_train={} X_test={} Y_test={} ".format( len(features), 144 | X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)) 145 | clf.fit(X_train[features], Y_train, eval_set=[(X_train[features], Y_train),(X_test[features], Y_test)],eval_metric="auc",categorical_feature=cat_cols, verbose=1000) 146 | feat_importance = clf.feature_importances_ 147 | best_iteration = clf.best_iteration_ 148 | if best_iteration is None: 149 | best_iteration = -1 150 | oof[val_idx] = clf.predict(val[features],num_iteration=best_iteration) 151 | else: 152 | gLR = GBDT_LR(clf) 153 | gLR.fit(X_train, Y_train, eval_set=[(X_test, Y_test)],eval_metric="auc", verbose=1000) 154 | feat_importance = gLR.feature_importance() 155 | best_iteration = -1 156 | clf=gLR 157 | oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], y_=target.iloc[val_idx], 158 | num_iteration=best_iteration) 159 | 160 | else: #lambda ranker 161 | gbr = lgb.LGBMRanker() 162 | gbr.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], 163 | eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False, 164 | callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)]) 165 | 166 | fold_importance_df = pd.DataFrame() 167 | fold_importance_df["feature"] = features 168 | fold_importance_df["importance"] = feat_importance 169 | fold_importance_df["fold"] = fold_ + 1 170 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 171 | 172 | predictions += clf.predict(test_df[features], num_iteration=best_iteration) / 5 173 | fold_score = roc_auc_score(Y_test, oof[val_idx]) 174 | print("fold n°{} time={} score={}".format(fold_,time.time()-t0,fold_score)) 175 | #break 176 | cv_score = roc_auc_score(target, oof) 177 | print("CV score: {:<8.5f}".format(cv_score)) 178 | 179 | if feature_importance_df.size>0: 180 | #if False: 181 | cols = (feature_importance_df[["feature", "importance"]] 182 | .groupby("feature") 183 | .mean() 184 | .sort_values(by="importance", ascending=False)[:32].index) 185 | best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)] 186 | 187 | plt.figure(figsize=(14,26)) 188 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False)) 189 | plt.title('LightGBM Features (averaged 
over folds)') 190 | plt.tight_layout() 191 | plt.savefig('lgbm_importances.png') 192 | 193 | 194 | input("Press Enter to continue...") -------------------------------------------------------------------------------- /python-package/case_ieee_fraud.py: -------------------------------------------------------------------------------- 1 | #https://www.kaggle.com/kyakovlev/ieee-simple-lgbm 2 | 3 | # General imports 4 | import numpy as np 5 | import pandas as pd 6 | import os, sys, gc, warnings, random, datetime 7 | import time 8 | import pickle 9 | from sklearn import metrics 10 | from sklearn.model_selection import train_test_split, KFold 11 | from sklearn.preprocessing import LabelEncoder 12 | from litemort import * 13 | from tqdm import tqdm 14 | import math 15 | warnings.filterwarnings('ignore') 16 | 17 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 18 | isMORT = True 19 | model='MORT' if isMORT else 'LGB' 20 | NFOLDS = 8 21 | #some_rows = 5000 22 | some_rows = None 23 | data_root = 'E:/Kaggle/ieee_fraud/input/' 24 | #data_root = '../input/' 25 | pkl_path = f'{data_root}/_kyakovlev_{some_rows}.pickle' 26 | 27 | def M_PickSamples(pick_samples,df_train,df_test): 28 | nMost = min(df_train.shape[0], df_test.shape[0]) 29 | random.seed(42) 30 | subset = random.sample(range(nMost), pick_samples) 31 | df_train = df_train.iloc[subset, :].reset_index(drop=True) 32 | df_test = df_test.iloc[subset, :].reset_index(drop=True) 33 | print('====== Mort_PickSamples ... df_train={} df_test={}'.format(df_train.shape, df_test.shape)) 34 | return df_train,df_test 35 | 36 | def seed_everything(seed=0): 37 | random.seed(seed) 38 | os.environ['PYTHONHASHSEED'] = str(seed) 39 | np.random.seed(seed) 40 | 41 | def reduce_mem_usage(df, verbose=True): 42 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 43 | start_mem = df.memory_usage().sum() / 1024**2 44 | for col in df.columns: 45 | col_type = df[col].dtypes 46 | if col_type in numerics: 47 | c_min = df[col].min() 48 | c_max = df[col].max() 49 | if str(col_type)[:3] == 'int': 50 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 51 | df[col] = df[col].astype(np.int8) 52 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 53 | df[col] = df[col].astype(np.int16) 54 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 55 | df[col] = df[col].astype(np.int32) 56 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 57 | df[col] = df[col].astype(np.int64) 58 | else: 59 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 60 | df[col] = df[col].astype(np.float16) 61 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 62 | df[col] = df[col].astype(np.float32) 63 | else: 64 | df[col] = df[col].astype(np.float64) 65 | end_mem = df.memory_usage().sum() / 1024**2 66 | if verbose: print('Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem)) 67 | return df 68 | 69 | import lightgbm as lgb 70 | 71 | 72 | def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2): 73 | print(f'train_df={tr_df.shape} test_df={tt_df.shape} \nlgb_params={lgb_params}') 74 | folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED) 75 | 76 | #X, y = tr_df[features_columns], tr_df[target] 77 | #P, P_y = tt_df[features_columns], tt_df[target] 78 | y, P_y = tr_df[target], tt_df[target] 79 | 80 | 81 | predictions = np.zeros(len(tt_df)) 82 | 83 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(tr_df[features_columns], y)): 84 | t0=time.time() 85 | print('Fold:', fold_) 86 | tr_x, tr_y = tr_df[features_columns].iloc[trn_idx, :], y[trn_idx] 87 | vl_x, vl_y = tr_df[features_columns].iloc[val_idx, :], y[val_idx] 88 | print(len(tr_x), len(vl_x)) 89 | 90 | if isMORT: 91 | model = LiteMORT(lgb_params).fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)]) 92 | best_iter = 1000 93 | # pred_val = model.predict(vl_x) 94 | pred_raw = model.predict_raw(vl_x) 95 | # y_pred[val_idx] = pred_raw 96 | fold_score = metrics.roc_auc_score(vl_y, pred_raw) 97 | pp_p = model.predict_raw(tt_df[features_columns]) 98 | else: 99 | tr_data = lgb.Dataset(tr_x, label=tr_y) 100 | if LOCAL_TEST: 101 | vl_data = lgb.Dataset(tt_df[features_columns], label=P_y) 102 | else: 103 | vl_data = lgb.Dataset(vl_x, label=vl_y) 104 | estimator = lgb.train( 105 | lgb_params, 106 | tr_data, 107 | valid_sets=[tr_data, vl_data], 108 | verbose_eval=200, 109 | ) 110 | pred_raw = estimator.predict(vl_x) 111 | fold_score = metrics.roc_auc_score(vl_y, pred_raw) 112 | pp_p = estimator.predict(tt_df[features_columns]) 113 | del tr_data, vl_data 114 | 115 | predictions += pp_p / NFOLDS 116 | 117 | if LOCAL_TEST: 118 | feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X.columns)), 119 | columns=['Value', 'Feature']) 120 | print(feature_imp) 121 | print(f'Fold:{fold_} score={fold_score} time={time.time() - t0:.4g} tr_x={tr_x.shape} val_x={vl_x.shape}') 122 | del tr_x, tr_y, vl_x, vl_y 123 | gc.collect() 124 | #break 125 | tt_df = tt_df[['TransactionID', target]] 126 | tt_df['prediction'] = predictions 127 | gc.collect() 128 | 129 | return tt_df,fold_score 130 | 131 | SEED = 42 132 | seed_everything(SEED) 133 | LOCAL_TEST = False 134 | TARGET = 'isFraud' 135 | START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d') 136 | lgb_params = { 137 | 'objective':'binary', 138 | 'boosting_type':'gbdt', 139 | 'metric':'auc', 140 | 'n_jobs':-1, 141 | 'learning_rate':0.01, 142 | "adaptive":'weight', 143 | "prune":0, 144 | 'num_leaves': 2**8, 145 | 'max_depth':-1, 146 | 'tree_learner':'serial', 147 | 'colsample_bytree': 0.7, 148 | 'subsample_freq':1, 149 | 'subsample':0.7, 150 | 'n_estimators':800, 151 | 'max_bin':255, 152 | 'verbose':666, 153 | 'seed': SEED, 154 | 'early_stopping_rounds':100, 155 | } 156 | 157 | if os.path.isfile(pkl_path): 158 | print("====== Load pickle @{} ......".format(pkl_path)) 159 | with open(pkl_path, "rb") as fp: 160 | [train_df, test_df, features_columns] = pickle.load(fp) 161 | else: 162 | print('Load Data......') 163 | train_df = pd.read_pickle(f'{data_root}/ieee-fe-with-some-eda/train_df.pkl') 164 | 165 | if LOCAL_TEST: 166 | test_df = train_df[train_df['DT_M']==train_df['DT_M'].max()].reset_index(drop=True) 167 | train_df = train_df[train_df['DT_M']<(train_df['DT_M'].max()-1)].reset_index(drop=True) 168 | else: 169 | test_df = 
pd.read_pickle(f'{data_root}/ieee-fe-with-some-eda/test_df.pkl') 170 | 171 | remove_features = pd.read_pickle(f'{data_root}/ieee-fe-with-some-eda/remove_features.pkl') 172 | remove_features = list(remove_features['features_to_remove'].values) 173 | print('Load Data OK\nShape control:', train_df.shape, test_df.shape) 174 | 175 | features_columns = [col for col in list(train_df) if col not in remove_features] 176 | 177 | ########################### Final Minification 178 | print('reduce_mem_usage......') 179 | train_df = reduce_mem_usage(train_df) 180 | test_df = reduce_mem_usage(test_df) 181 | print('reduce_mem_usage......OK!!!') 182 | if some_rows is not None: 183 | train_df,test_df = M_PickSamples(some_rows,train_df,test_df) 184 | with open(pkl_path, "wb") as fp: # Pickling 185 | pickle.dump([train_df, test_df, features_columns], fp) 186 | print("====== Dump pickle @{} ......OK".format(pkl_path)) 187 | 188 | 189 | if LOCAL_TEST: 190 | lgb_params['learning_rate'] = 0.01 191 | lgb_params['n_estimators'] = 20000 192 | lgb_params['early_stopping_rounds'] = 100 193 | test_predictions,fold_score = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params)   # the function returns a (DataFrame, score) tuple 194 | print(metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction'])) 195 | else: 196 | lgb_params['learning_rate'] = 0.005 197 | lgb_params['n_estimators'] = 5000 198 | lgb_params['early_stopping_rounds'] = 100 199 | test_predictions,fold_score = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params, NFOLDS=NFOLDS) 200 | test_predictions['isFraud'] = test_predictions['prediction'] 201 | if some_rows is None: 202 | # test_predictions[['TransactionID', 'isFraud']].to_csv(f'submit_{some_rows}_{0.5}.csv', index=False,compression='gzip') 203 | path = f'E:/Kaggle/ieee_fraud/result/[{model}]_{some_rows}_{fold_score:.5f}_F{NFOLDS}_.csv' 204 | test_predictions[['TransactionID', 'isFraud']].to_csv(path, index=False) # ,compression='gzip' 205 | print(f"test_predictions[['TransactionID', 'isFraud']] to_csv @{path}") 206 | input("Press Enter to exit...") -------------------------------------------------------------------------------- /python-package/LiteMORT/LiteMORT_EDA.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.gridspec as gridspec 3 | import numpy as np 4 | import seaborn as sns; sns.set() 5 | import math 6 | import time 7 | from pandas.api.types import is_string_dtype 8 | from pandas.api.types import is_numeric_dtype 9 | import pandas as pd 10 | 11 | def Unique_Expand(df): 12 | unique_samples = [] 13 | unique_count = np.zeros_like(df) 14 | if True: 15 | ndf=df.values 16 | #for feature in tqdm(range(df.shape[1])): 17 | for feature in (range(ndf.shape[1])): 18 | _,index_, count_ = np.unique(ndf[:, feature], return_counts=True, return_index=True) 19 | unique_count[index_[count_ == 1], feature] += 1 20 | real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) ==2)[:, 0] 21 | synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0] 22 | df['unique']=0 23 | df.iloc[real_samples_indexes, df.columns.get_loc('unique')] = 1   # positional write; avoids chained assignment 24 | else: 25 | for f in df.columns: 26 | new_feat = [] 27 | v = dict(pd.value_counts(df[f])) 28 | for el in df[f].values: 29 | new_feat.append(v[el]) 30 | df["{}_counts".format(f)] = new_feat 31 | print("Unique_Expand::{}_counts...".format(f)) 32 | 33 | return df 34 | 35 | def all_element_values(user_data,col,tMost=60*10): 36 | t0=time.time() 37 | nz,nDump=0,100000 38 | nAllRow = 
user_data.shape[0] 39 | # df = user_data[col].str.split(',') 40 | if is_numeric_dtype(user_data[col]): 41 | elements=user_data[col].unique() 42 | else: 43 | elements=set() 44 | for row in user_data[col]: 45 | tokens = row.strip().split(',') 46 | elements = elements | set(tokens) 47 | nz = nz+1 48 | if nz>nDump and nz%nDump==0: 49 | print("\r{}({:.3g})\t time={:.3g}...".format(nz,nz*1.0/nAllRow,time.time()-t0),end="") 50 | if time.time()-t0>tMost: # bail out: this should never take more than ten minutes 51 | print("\n{}({:.3g})\t time={:.3g}...BREAK!!!\n".format(nz,nz*1.0/nAllRow,time.time()-t0),end="") 52 | break 53 | elements = list(elements) 54 | nz = min(1000,len(elements)) 55 | print("{} elements@\'{}\' type={} time={:.3g}\n elements={} :\n".format(len(elements),col, type(elements[0]), 56 | time.time()-t0,elements[0:nz])) 57 | return elements 58 | 59 | #only for binary classification 60 | def see_all_2(train, test,features,target,bins,dump_root="../see_all/"): 61 | nBinLevel = len(bins); assert nBinLevel>1 62 | values, counts = np.unique(train.target, return_counts=True) 63 | for f_name in features: 64 | n = 0 65 | print("see_all_2@{}...".format(f_name), end="") 66 | fig, ax = plt.subplots(nBinLevel, 2, figsize=(20, 10)) 67 | #a = train[f_name].loc[train.target == 0] 68 | fig.suptitle("\"{}\" V={} N={} 0={} 1={}".format(f_name,values, counts,"Blue","Red")) 69 | for bin in bins: 70 | bin = bin if bin >0 else None 71 | sns.distplot(train[f_name].loc[train.target == 0],ax=ax[n,0], color="Blue", bins=bin,norm_hist=True) 72 | sns.distplot(train.loc[train.target == 1, f_name],ax=ax[n,0], color="Red", bins=bin, norm_hist=True) 73 | sns.distplot(test.loc[:, f_name],ax=ax[n,1], color="Mediumseagreen", bins=bin, norm_hist=True) 74 | #ax[0].set_xlabel("") 75 | #ax[1].set_xlabel("") 76 | n=n+1 77 | #plt.show(block=True) 78 | plt.savefig("{}_[{}]_.jpg".format(dump_root, f_name)) 79 | plt.clf(); plt.cla(); plt.close() 80 | 81 | def plot_binary_dist(train,test,feature_names,bins=None): 82 | n_top = max(2,len(feature_names)) # keep at least 2 rows so plt.subplots returns a 2-D array of axes 
83 | fig, ax = plt.subplots(n_top, 2, figsize=(10, 5 * n_top)) #, figsize=(10, 5 * n_top) 84 | n=0 85 | for f_name in feature_names: 86 | a = train[f_name].loc[train.target == 0] 87 | sns.distplot(train[f_name].loc[train.target == 0], ax=ax[n, 0], color="Blue", bins=bins,norm_hist=True) 88 | sns.distplot(train.loc[train.target == 1, f_name], ax=ax[n, 0], color="Red", bins=bins, norm_hist=True) 89 | sns.distplot(test.loc[:, f_name], ax=ax[n, 1], color="Mediumseagreen", bins=bins, norm_hist=True) 90 | ax[n, 0].set_title("Train {}".format(f_name)) 91 | ax[n, 1].set_title("Test {}".format(f_name)) 92 | ax[n, 0].set_xlabel("") 93 | ax[n, 1].set_xlabel("") 94 | n=n+1 95 | plt.show(block=True) 96 | 97 | def ann(row,col_A,col_B,axis=None): 98 | ind = row[0] 99 | r = row[1] 100 | info = "{}:{:.2g}".format(r[col_A],r[col_B]) #ind 101 | info = "{:.2g}".format(r[col_B]) 102 | plt.gca().annotate(info, xy=(r[col_A], r[col_B]), xytext=(2,2) , textcoords ="offset points" ) 103 | 104 | def plot_join_distri(df,listDict): 105 | no,nFig = 0,len(listDict) 106 | listG=[] 107 | for dict in listDict: 108 | x,y,title=dict['x'], dict['y'], dict['title'] 109 | sns.set(style="darkgrid", color_codes=True) 110 | #marginal = dict(bins=15, rug=True) 111 | marginal = {'bins':150, 'rug':True} 112 | g = sns.jointplot(x, y, data=df, kind="reg",size=10,marginal_kws=marginal) #kind= 113 | if False: 114 | g = g.plot_joint(plt.scatter, color="m", edgecolor="white") 115 | _ = g.ax_marg_x.hist(x, color="b", alpha=.6) 116 | _ = g.ax_marg_y.hist(y, color="r", alpha=.6,orientation="horizontal") 117 | head = df.sort_values(by=[x], ascending=[False]).head(5) 118 | #tail = tips.sort_values(by=['resid'], ascending=[False]).tail(5) 119 | for row in head.iterrows(): 120 | ann(row,x,y) 121 | plt.title(title) 122 | path = "{}_{}.png".format(title, no) 123 | g.savefig(path); listG.append(path) 124 | #plt.close() # must close here, otherwise plt.show() repeats the figure 125 | no = no + 1 126 | 127 | if False: # subplots migration 128 | fig = plt.figure(figsize=(2, 2)) # plt.figure(figsize=(12, 8)) 129 | no=0 130 | for path in listG: 131 | no = no + 1 132 | img = plt.imread(path)   # plt.imread: matplotlib.image (mpimg) is never imported here 133 | fig.add_subplot(2, 2, no) 134 | plt.imshow(img) 135 | 136 | plt.show() 137 | print("listG={}".format(len(listG))) 138 | 139 | #https://stackoverflow.com/questions/43010462/annotate-outliers-on-seaborn-jointplot 140 | # many problems with this approach: the axes get lost 141 | def plot_join_distri_0(df,listDict): 142 | no,nFig = 0,len(listDict) 143 | nRow=int(math.sqrt(nFig)) 144 | nCol=(int)(math.ceil(nFig*1.0/nRow)) 145 | fig, axs = plt.subplots(nRow,nCol) 146 | for dict in listDict: 147 | x,y,title=dict['x'], dict['y'], dict['title'] 148 | row,col=(int)(no/nCol),(int)(no%nCol) 149 | axis = axs[row,col] 150 | g = sns.jointplot(x, y, data=df, kind="reg",size=7, ax=axis) 151 | head = df.sort_values(by=[x], ascending=[False]).head(5) 152 | #tail = tips.sort_values(by=['resid'], ascending=[False]).tail(5) 153 | for row in head.iterrows(): 154 | ann(row,x,y,axis) 155 | no=no+1 156 | plt.title(title) 157 | plt.close() # must close, otherwise plt.show() repeats the figure 158 | 159 | plt.show() 160 | print("listG={}".format(0)) 161 | # https://stackoverflow.com/questions/35042255/how-to-plot-multiple-seaborn-jointplot-in-subplot 162 | class SeabornFig2Grid(): 163 | 164 | def __init__(self, seaborngrid, fig, subplot_spec): 165 | self.fig = fig 166 | self.sg = seaborngrid 167 | self.subplot = subplot_spec 168 | if isinstance(self.sg, sns.axisgrid.FacetGrid) or \ 169 | isinstance(self.sg, 
sns.axisgrid.PairGrid): 170 | self._movegrid() 171 | elif isinstance(self.sg, sns.axisgrid.JointGrid): 172 | self._movejointgrid() 173 | self._finalize() 174 | 175 | def _movegrid(self): 176 | """ Move PairGrid or FacetGrid """ 177 | self._resize() 178 | n = self.sg.axes.shape[0] 179 | m = self.sg.axes.shape[1] 180 | self.subgrid = gridspec.GridSpecFromSubplotSpec(n,m, subplot_spec=self.subplot) 181 | for i in range(n): 182 | for j in range(m): 183 | self._moveaxes(self.sg.axes[i,j], self.subgrid[i,j]) 184 | 185 | def _movejointgrid(self): 186 | """ Move JointGrid """ 187 | h= self.sg.ax_joint.get_position().height 188 | h2= self.sg.ax_marg_x.get_position().height 189 | r = int(np.round(h/h2)) 190 | self._resize() 191 | self.subgrid = gridspec.GridSpecFromSubplotSpec(r+1,r+1, subplot_spec=self.subplot) 192 | 193 | self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1]) 194 | self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1]) 195 | self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1]) 196 | 197 | def _moveaxes(self, ax, gs): 198 | #https://stackoverflow.com/a/46906599/4124317 199 | ax.remove() 200 | ax.figure=self.fig 201 | self.fig.axes.append(ax) 202 | self.fig.add_axes(ax) 203 | ax._subplotspec = gs 204 | ax.set_position(gs.get_position(self.fig)) 205 | ax.set_subplotspec(gs) 206 | 207 | def _finalize(self): 208 | plt.close(self.sg.fig) 209 | self.fig.canvas.mpl_connect("resize_event", self._resize) 210 | self.fig.canvas.draw() 211 | 212 | def _resize(self, evt=None): 213 | self.sg.fig.set_size_inches(self.fig.get_size_inches()) 214 | 215 | if __name__ == "__main__": 216 | iris = sns.load_dataset("iris") 217 | tips = sns.load_dataset("tips") 218 | 219 | # An lmplot 220 | g0 = sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips,palette=dict(Yes="g", No="m")) 221 | # A PairGrid 222 | g1 = sns.PairGrid(iris, hue="species") 223 | g1.map(plt.scatter, s=5) 224 | # A FacetGrid 225 | g2 = sns.FacetGrid(tips, col="time", hue="smoker") 226 | g2.map(plt.scatter, "total_bill", "tip", edgecolor="w") 227 | # A JointGrid 228 | g3 = sns.jointplot("sepal_width", "petal_length", data=iris,kind="kde", space=0, color="g") 229 | fig = plt.figure(figsize=(13,8)) 230 | gs = gridspec.GridSpec(2, 2) 231 | mg0 = SeabornFig2Grid(g0, fig, gs[0])   # the class is defined in this module, so call it directly 232 | mg1 = SeabornFig2Grid(g1, fig, gs[1]) 233 | mg2 = SeabornFig2Grid(g2, fig, gs[3]) 234 | mg3 = SeabornFig2Grid(g3, fig, gs[2]) 235 | gs.tight_layout(fig) 236 | #gs.update(top=0.7) 237 | plt.show() -------------------------------------------------------------------------------- /python-package/pycharm_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | http://www.cnblogs.com/amazement/p/10341328.html 3 | Do not execute a module that uses relative imports directly with the interpreter (when run directly, the __name__ of the module for this .py file becomes '__main__') 4 | ''' 5 | import gc 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn import preprocessing 9 | import os 10 | from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,load_iris, load_svmlight_file) 11 | import time 12 | import pickle 13 | from sklearn.metrics import log_loss, mean_squared_error 14 | import matplotlib.pyplot as plt 15 | from sklearn.model_selection import GridSearchCV, train_test_split 16 | import shap 17 | import sys 18 | from litemort import * 19 | import lightgbm as lgb 20 | from sklearn import metrics 21 | 22 | isMORT = len(sys.argv)>1 and sys.argv[1] == "mort" 23 | #isMORT = True 24 | 25 | def auc2(m, train, test,y_train,y_test): 26 | return 
(metrics.roc_auc_score(y_train,m.predict(train)), 27 | metrics.roc_auc_score(y_test,m.predict(test))) 28 | 29 | # https://www.kdnuggets.com/2018/03/catboost-vs-light-gbm-vs-xgboost.html 30 | def test_fly_( ): 31 | import pandas as pd, numpy as np, time 32 | from sklearn.model_selection import train_test_split 33 | frac=0.1 34 | pkl_path = 'G:/kaggle/flight/flight_{}.pickle'.format(frac) 35 | if os.path.isfile(pkl_path): 36 | with open(pkl_path, "rb") as fp: # Pickling 37 | [data] = pickle.load(fp) 38 | else: 39 | data = pd.read_csv("G:/kaggle/flight/flights.csv") 40 | data = data.sample(frac=frac, random_state=10) 41 | data = data[["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", 42 | "ORIGIN_AIRPORT", "AIR_TIME", "DEPARTURE_TIME", "DISTANCE", "ARRIVAL_DELAY"]] 43 | data.dropna(inplace=True) 44 | data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1 45 | with open(pkl_path, "wb") as fp: # Pickling 46 | pickle.dump([data], fp) 47 | os._exit(-1)   # cache written; rerun the script to load from the pickle 48 | 49 | cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"] 50 | for item in cols: 51 | data[item] = data[item].astype("category").cat.codes + 1 52 | train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"], random_state=10, test_size=0.25) 53 | 54 | if False: 55 | lg = lgb.LGBMClassifier(silent=False) 56 | param_dist = {"max_depth": [25,50, 75], 57 | "learning_rate" : [0.01,0.05,0.1], 58 | "num_leaves": [300,900,1200], 59 | "n_estimators": [200] 60 | } 61 | grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=5) 62 | grid_search.fit(train,y_train) 63 | print(grid_search.best_estimator_) 64 | params = { "objective": "binary",'subsample': 1, 65 | "metric": "binary_logloss",#""binary_logloss", 66 | "max_depth": 50, "learning_rate": 0.1, "num_leaves": 900, "n_estimators": 300} 67 | cate_features_name = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "DESTINATION_AIRPORT","ORIGIN_AIRPORT"] 68 | t0=time.time() 69 | a1,a2=0,0 70 | if isMORT: 71 | model2 = LiteMORT(params).fit(train, y_train) 72 | if False: 73 | y_predict = model2.predict(test,raw_score=True)[:,1] 74 | a1 = metrics.roc_auc_score(y_test,model2.predict(test,raw_score=True)[:,1]) 75 | print("------ No categorical auc={}".format(a1)) 76 | #model2 = LiteMORT(params).fit(train, y_train, categorical_feature = cate_features_name) 77 | #a2 = metrics.roc_auc_score(y_test,model2.predict(test,raw_score=True)[:,1]) 78 | print("------ With Categorical auc={}".format(a2)) 79 | elif True: 80 | model2 = lgb.LGBMClassifier( **params ) 81 | model2.fit(train, y_train, eval_set=[(train, y_train)], verbose=True) 82 | result = model2.predict_proba(test) 83 | else: 84 | d_train = lgb.Dataset(train, label=y_train,free_raw_data=False) 85 | # Without Categorical Features 86 | model2 = lgb.train(params, d_train,valid_sets=[d_train]) 87 | model2.save_model('gbm_test_fly_.model') 88 | a1=auc2(model2, train, test,y_train,y_test) 89 | print("------ No categorical auc2={}".format(a1)) 90 | 91 | #With Categorical Features 92 | #model2 = lgb.train(params, d_train, categorical_feature = cate_features_name) 93 | #a2=auc2(model2, train, test,y_train,y_test) 94 | print("------ With categorical auc2={}".format(a2)) 95 | if 'd_train' in locals(): del d_train   # d_train exists only on the lgb.train path 96 | gc.collect() 97 | input("auc@test_fly_ is {} time={} model={}...".format(a1,time.time()-t0,model2)) 98 | os._exit(-98) 99 | 100 | def test_shap_adult_(): 101 | shap.initjs() 102 | X,y = shap.datasets.adult() 103 | X_display,y_display = 
shap.datasets.adult(display=True) 104 | # create a train/test split 105 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7) 106 | 107 | params = { 108 | "max_bin": 512, 109 | "learning_rate": 0.05, 110 | "boosting_type": "gbdt", 111 | "objective": "binary", 112 | "metric": "binary_logloss", 113 | "num_leaves": 10, 114 | "verbose": 1000, 115 | "min_data": 100, 116 | "boost_from_average": True, 117 | 'early_stop': 50, 'num_boost_round': 10000, 118 | } 119 | if isMORT: 120 | model = LiteMORT(params).fit(X_train, y_train,eval_set=[(X_test,y_test)]) 121 | result = model.predict(X_test) 122 | result = model.predict(X_test,raw_score=True) 123 | elif True: 124 | gbm = lgb.LGBMClassifier(n_estimators=10000, silent=True) 125 | gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False) 126 | result = gbm.predict(X_test) #predict_proba+_le.inverse_transform 127 | result = gbm.predict_proba(X_test) 128 | else: # careful: LGBMClassifier and lgb.train return results in different formats 129 | d_train = lgb.Dataset(X_train, label=y_train) 130 | d_test = lgb.Dataset(X_test, label=y_test) 131 | model = lgb.train(params, d_train, 10000, valid_sets=[d_test], early_stopping_rounds=50, verbose_eval=1000) 132 | if False:#https://slundberg.github.io/shap/notebooks/Census%20income%20classification%20with%20LightGBM.html 133 | explainer = shap.TreeExplainer(model) 134 | shap_values = explainer.shap_values(X) 135 | shap.force_plot(explainer.expected_value, shap_values[0, :], X_display.iloc[0, :]) 136 | shap.force_plot(explainer.expected_value, shap_values[:1000, :], X_display.iloc[:1000, :]) 137 | shap.summary_plot(shap_values, X) 138 | plt.show() 139 | result = model.predict(X_test) 140 | loss = log_loss(y_test, result) 141 | input("loss@test_shap_adult_ is {}...".format(loss))   # 'model' is undefined on the LGBMClassifier path, so it is not printed here 142 | os._exit(-99) 143 | 144 | def test_1(): 145 | from sklearn.metrics import mean_squared_error 146 | from sklearn.datasets import load_boston 147 | from sklearn.model_selection import KFold 148 | 149 | params = { 150 | "objective": "regression", 'early_stop': 5, 'num_boost_round': 50, "verbosity": 1, 151 | } 152 | boston = load_boston() 153 | y = boston['target'] 154 | X = boston['data'] 155 | kf = KFold(n_splits=2, shuffle=True, random_state=rng) 156 | for train_index, test_index in kf.split(X, y): 157 | # xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) 158 | mort = LiteMORT(params) 159 | mort.fit(X[train_index], y[train_index], params=params) 160 | preds = mort.predict(X[test_index]) 161 | labels = y[test_index] 162 | assert mean_squared_error(preds, labels) < 25 163 | 164 | params = { 165 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50, 166 | "verbosity": 1, 'subsample': 1, 167 | } 168 | if __name__ == "__main__": 169 | test_fly_() 170 | #test_shap_adult_() 171 | nTree=100 #100 172 | 173 | rng = np.random.RandomState(1994) 174 | np.random.seed(42) 175 | params = { 176 | "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': nTree, 177 | "verbosity": 1, 178 | } 179 | X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str 180 | "B": np.random.permutation([1, 2, 3] * 100), # int 181 | "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float 182 | "D": np.random.permutation([True, False] * 150)}) # bool 183 | 184 | y = np.random.permutation([0, 1] * 150) 185 | X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), 186 | "B": np.random.permutation([1, 3] 
* 30), 187 | "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), 188 | "D": np.random.permutation([True, False] * 30)}) 189 | if True: 190 | #preprocess = Mort_Preprocess() 191 | X, X_test = Mort_Preprocess.OrdinalEncode_(X, X_test) 192 | ''' 193 | for col in ["A", "B", "C", "D"]: 194 | X[col] = X[col].astype('category') 195 | X_test[col] = X_test[col].astype('category') 196 | ''' 197 | 198 | if True: 199 | isLabel=True 200 | gbm0 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y) 201 | gbm0.booster_.save_model('gbm0.model') 202 | result = gbm0.predict(X_test,raw_score=isLabel)   # n_estimators belongs to the constructor, not to predict() 203 | pred0 = list(gbm0.predict(X_test,raw_score=isLabel)) 204 | gbm1 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y, categorical_feature=[0]) 205 | gbm1.booster_.save_model('gbm1.model') 206 | pred1 = list(gbm1.predict(X_test,raw_score=isLabel)) 207 | gbm2 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y, categorical_feature=['A']) 208 | pred2 = list(gbm2.predict(X_test,raw_score=isLabel)) 209 | gbm3 = lgb.sklearn.LGBMClassifier(n_estimators = nTree).fit(X, y, categorical_feature=['A', 'B', 'C', 'D']) 210 | pred3 = list(gbm3.predict(X_test,raw_score=isLabel)) 211 | np.testing.assert_almost_equal(pred0, pred1) 212 | np.testing.assert_almost_equal(pred0, pred2) 213 | np.testing.assert_almost_equal(pred0, pred3) 214 | ''' 215 | gbm3.booster_.save_model('categorical.model') 216 | gbm4 = lgb.Booster(model_file='categorical.model') 217 | pred4 = list(gbm4.predict(X_test)) 218 | pred_prob = list(gbm0.predict_proba(X_test)[:, 1]) 219 | np.testing.assert_almost_equal(pred_prob, pred4) 220 | ''' 221 | 222 | if False: 223 | mort0 = LiteMORT(params).fit(X, y) 224 | pred0 = list(mort0.predict(X_test)) 225 | else: 226 | mort1 = LiteMORT(params).fit(X, y, categorical_feature=[0]) 227 | pred1 = list(mort1.predict(X_test)) 228 | 229 | mort2 = LiteMORT(params).fit(X, y, categorical_feature=['A']) 230 | pred2 = list(mort2.predict(X_test)) 231 | mort3 = LiteMORT(params).fit(X, y, categorical_feature=['A', 'B', 'C', 'D']) 232 | pred3 = list(mort3.predict(X_test)) 233 | #np.testing.assert_almost_equal(pred1, pred1) 234 | np.testing.assert_almost_equal(pred1, pred2) 235 | #np.testing.assert_almost_equal(pred1, pred3) 236 | input("...") 237 | # gc.collect() 238 | #ret = log_loss(y_test, mort.predict_proba(X_test)) -------------------------------------------------------------------------------- /src/util/FastExpLog.c: -------------------------------------------------------------------------------- 1 | //http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.1569&rep=rep1&type=pdf 2 | 3 | /* 4 | * See http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/ 5 | * 6 | * All of these rely on being on a little endian machine, such as an Intel box. 7 | * 8 | * These can be _quite_ inaccurate. ~20% in many cases, but being much faster (~7x) may 9 | * permit more loop iterations of tuning algorithms that only need approximate powers. 10 | * 11 | * This version of Ankerl's algorithm has been extended to provide optionally conservative (lower) bounds 12 | * and also to generate a full linear interpolation across the entire significand rather than 'stair-step' 13 | * at the expense of performing a 64 bit operation rather than a 32 bit one. This is cheap these days. 
14 | * 15 | * 'exp' is further improved by using a suggestion by Nic Schraudolph: 16 | * 17 | * "You can get a much better approximation (piecewise rational instead of linear) at 18 | * the cost of a single floating-point division by using better_exp(x) = exp(x/2)/exp(-x/2), 19 | * where exp() is my published approximation but you don't need the additive constant anymore, 20 | * you can use c=0. On machines with hardware division this is very attractive." -- Nic Schraudolph 21 | * 22 | * --Edward Kmett 23 | * 24 | * TODO: Incorporate the techniques from https://code.google.com/p/fastapprox/ to enable us 25 | * to calculate more interesting approximate functions. They might need to be generalized to work on 26 | * Double values where appropriate I suppose. 27 | * 28 | * Magic numbers: 29 | * float /int : round(1<<23/log(2)) = 12102203, 127<<23 = 1065353216 30 | * double/int : round(1<<20/log(2)) = 1512775, 1023<<20 = 1072693248 31 | * double/long long: round(1<<52/log(2)) = 6497320848556798, 1023<<52 = 4607182418800017408 32 | * 33 | * The fudge factors such that exp y <= exp_fast y: 34 | * >>> ceiling (2^23 * (1 - (log (log 2) + 1)/log 2)) 35 | * 722019 36 | * >>> ceiling (2^20 * (1 - (log (log 2) + 1)/log 2)) 37 | * 90253 38 | * >>> ceiling (2^52 * (1 - (log (log 2) + 1)/log 2)) 39 | * 387630818974388 40 | * 41 | * The fudge factor such that exp_fast y <= exp y is uniformly -1 42 | * 43 | * TODO: perform exponential doubling for pow based on better_exp_fast instead for better accuracy. 44 | */ 45 | 46 | /* Schraudolph's published algorithm extended into the least significant bits to avoid the stair step. 47 | double long long approximation: round 1<<52/log(2) 6497320848556798, 48 | mask = 0x3ff0000000000000LL = 4607182418800017408LL 49 | double approximation: round(1<<20/log(2)) = 1512775, 1023<<20 = 1072693248 50 | */ 51 | 52 | /* 4607182418800017408 - 387630818974388 = 4606794787981043020 53 | 54 | Exponent mask adapted to full 64 bit precision: 55 | >>> 1023 * 2^52 56 | 4607182418800017408 57 | 58 | The fudge factor for conservative lower bound adapted to full 64 bit precision: 59 | >>> round (2^52 * (1 - (log (log 2) + 1)/log 2)) 60 | 387630818974388 61 | 62 | As a lower bound this is suitable for use when generating Mass and Precision estimates. 
63 | */ 64 | double exp_fast_lb(double a) { 65 | union { double d; long long x; } u; 66 | u.x = (long long)(6497320848556798LL * a + 4606794787981043020); 67 | return u.d; 68 | } 69 | 70 | /* 4607182418800017408 + 1 */ 71 | double exp_fast_ub(double a) { 72 | union { double d; long long x; } u; 73 | u.x = (long long)(6497320848556798LL * a + 4607182418800017409); 74 | return u.d; 75 | } 76 | 77 | double exp_fast(double a) { 78 | union { double d; long long x; } u; 79 | u.x = (long long)(6497320848556798LL * a + 0x3fef127e83d16f12LL); 80 | return u.d; 81 | } 82 | 83 | double better_exp_fast(double a) { 84 | union { double d; long long x; } u, v; 85 | u.x = (long long)(3248660424278399LL * a + 0x3fdf127e83d16f12LL); 86 | v.x = (long long)(0x3fdf127e83d16f12LL - 3248660424278399LL * a); 87 | return u.d / v.d; 88 | } 89 | 90 | /* Schraudolph's published algorithm */ 91 | double exp_fast_schraudolph(double a) { 92 | union { double d; int x[2]; } u; 93 | u.x[1] = (int)(1512775 * a + 1072632447); 94 | u.x[0] = 0; 95 | return u.d; 96 | } 97 | 98 | /* 1065353216 + 1 */ 99 | float expf_fast_ub(float a) { 100 | union { float f; int x; } u; 101 | u.x = (int)(12102203 * a + 1065353217); 102 | return u.f; 103 | } 104 | 105 | /* Schraudolph's published algorithm with John's constants */ 106 | /* 1065353216 - 486411 = 1064866805 */ 107 | float expf_fast(float a) { 108 | union { float f; int x; } u; 109 | u.x = (int)(12102203 * a + 1064866805); 110 | return u.f; 111 | } 112 | 113 | // 1056478197 114 | double better_expf_fast(float a) { 115 | union { float f; int x; } u, v; 116 | u.x = (long long)(6051102 * a + 1056478197); 117 | v.x = (long long)(1056478197 - 6051102 * a); 118 | return u.f / v.f; 119 | } 120 | 121 | /* 1065353216 - 722019 */ 122 | float expf_fast_lb(float a) { 123 | union { float f; int x; } u; 124 | u.x = (int)(12102203 * a + 1064631197); 125 | return u.f; 126 | } 127 | 128 | /* Ankerl's inversion of Schraudolph's published algorithm, converted to explicit multiplication */ 129 | double log_fast_ankerl(double a) { 130 | union { double d; int x[2]; } u = { a }; 131 | return (u.x[1] - 1072632447) * 6.610368362777016e-7; /* 1 / 1512775.0; */ 132 | } 133 | 134 | double log_fast_ub(double a) { 135 | union { double d; long long x; } u = { a }; 136 | return (u.x - 4606794787981043020) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */ 137 | } 138 | 139 | /* Ankerl's inversion of Schraudolph's published algorithm with my constants */ 140 | double log_fast(double a) { 141 | union { double d; long long x; } u = { a }; 142 | return (u.x - 4606921278410026770) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */ 143 | } 144 | 145 | double log_fast_lb(double a) { 146 | union { double d; long long x; } u = { a }; 147 | return (u.x - 4607182418800017409) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */ 148 | } 149 | 150 | 151 | /* 1065353216 - 722019 */ 152 | float logf_fast_ub(float a) { 153 | union { float f; int x; } u = { a }; 154 | return (u.x - 1064631197) * 8.262958405176314e-8f; /* 1 / 12102203.0; */ 155 | } 156 | 157 | /* Ankerl's adaptation of Schraudolph's published algorithm with John's constants */ 158 | /* 1065353216 - 486411 = 1064866805 */ 159 | float logf_fast(float a) { 160 | union { float f; int x; } u = { a }; 161 | return (u.x - 1064866805) * 8.262958405176314e-8f; /* 1 / 12102203.0; */ 162 | } 163 | 164 | /* 1065353216 + 1 */ 165 | float logf_fast_lb(float a) { 166 | union { float f; int x; } u = { a }; 167 | return (u.x - 1065353217) * 8.262958405176314e-8f; /* 1 / 
12102203.0 */ 168 | } 169 | 170 | /* Ankerl's version of Schraudolph's approximation. */ 171 | double pow_fast_ankerl(double a, double b) { 172 | union { double d; int x[2]; } u = { a }; 173 | u.x[1] = (int)(b * (u.x[1] - 1072632447) + 1072632447); 174 | u.x[0] = 0; 175 | return u.d; 176 | } 177 | 178 | /* 179 | These constants are based loosely on the following comment off of Ankerl's blog: 180 | 181 | "I have used the same trick for float, not double, with some slight modification to the constants to suite IEEE754 float format. The first constant for float is 1<<23/log(2) and the second is 127<<23 (for double they are 1<<20/log(2) and 1023<<20)." -- John 182 | */ 183 | 184 | /* 1065353216 + 1 = 1065353217 ub */ 185 | /* 1065353216 - 486411 = 1064866805 min RMSE */ 186 | /* 1065353216 - 722019 = 1064631197 lb */ 187 | float powf_fast(float a, float b) { 188 | union { float d; int x; } u = { a }; 189 | u.x = (int)(b * (u.x - 1064866805) + 1064866805); 190 | return u.d; 191 | } 192 | 193 | float powf_fast_lb(float a, float b) { 194 | union { float d; int x; } u = { a }; 195 | u.x = (int)(b * (u.x - 1065353217) + 1064631197); 196 | return u.d; 197 | } 198 | 199 | float powf_fast_ub(float a, float b) { 200 | union { float d; int x; } u = { a }; 201 | u.x = (int)(b * (u.x - 1064631197) + 1065353217); 202 | return u.d; 203 | } 204 | 205 | /* 206 | Now that 64 bit arithmetic is cheap we can (try to) improve on Ankerl's algorithm. 207 | 208 | double long long approximation: round 1<<52/log(2) 6497320848556798, 209 | mask = 0x3ff0000000000000LL = 4607182418800017408LL 210 | 211 | >>> round (2**52 * log (3 / (8 * log 2) + 1/2) / log 2 - 1/2) 212 | 261140389990638 213 | >>> 0x3ff0000000000000 - round (2**52 * log (3 / (8 * log 2) + 1/2) / log 2 - 1/2) 214 | 4606921278410026770 215 | 216 | */ 217 | 218 | double pow_fast_ub(double a, double b) { 219 | union { double d; long long x; } u = { a }; 220 | u.x = (long long)(b * (u.x - 4606794787981043020LL) + 4607182418800017409LL); 221 | return u.d; 222 | } 223 | 224 | double pow_fast(double a, double b) { 225 | union { double d; long long x; } u = { a }; 226 | u.x = (long long)(b * (u.x - 4606921278410026770LL) + 4606921278410026770LL); 227 | return u.d; 228 | } 229 | 230 | double pow_fast_lb(double a, double b) { 231 | union { double d; long long x; } u = { a }; 232 | u.x = (long long)(b * (u.x - 4607182418800017409LL) + 4606794787981043020LL); 233 | return u.d; 234 | } 235 | 236 | /* should be much more precise with large b, still ~3.3x faster. */ 237 | double pow_fast_precise_ankerl(double a, double b) { 238 | int flipped = 0; 239 | if (b < 0) { 240 | flipped = 1; 241 | b = -b; 242 | } 243 | 244 | /* calculate approximation with fraction of the exponent */ 245 | int e = (int)b; 246 | union { double d; int x[2]; } u = { a }; 247 | u.x[1] = (int)((b - e) * (u.x[1] - 1072632447) + 1072632447); 248 | u.x[0] = 0; 249 | 250 | double r = 1.0; 251 | while (e) { 252 | if (e & 1) { 253 | r *= a; 254 | } 255 | a *= a; 256 | e >>= 1; 257 | } 258 | 259 | r *= u.d; 260 | return flipped ? 1.0 / r : r; 261 | } 262 | 263 | /* should be much more precise with large b, still ~3.3x faster. 
*/ 264 | double pow_fast_precise(double a, double b) { 265 | int flipped = 0; 266 | if (b < 0) { 267 | flipped = 1; 268 | b = -b; 269 | } 270 | 271 | /* calculate approximation with fraction of the exponent */ 272 | int e = (int)b; 273 | double d = exp_fast(b - e); 274 | 275 | double r = 1.0; 276 | while (e) { 277 | if (e & 1) r *= a; 278 | a *= a; 279 | e >>= 1; 280 | } 281 | 282 | r *= d; 283 | return flipped ? 1.0 / r : r; 284 | } 285 | 286 | double better_pow_fast_precise(double a, double b) { 287 | int flipped = 0; 288 | if (b < 0) { 289 | flipped = 1; 290 | b = -b; 291 | } 292 | 293 | /* calculate approximation with fraction of the exponent */ 294 | int e = (int)b; 295 | double d = better_exp_fast(b - e); 296 | 297 | double r = 1.0; 298 | while (e) { 299 | if (e & 1) r *= a; 300 | a *= a; 301 | e >>= 1; 302 | } 303 | 304 | r *= d; 305 | return flipped ? 1.0 / r : r; 306 | } 307 | 308 | 309 | /* should be much more precise with large b */ 310 | float powf_fast_precise(float a, float b) { 311 | int flipped = 0; 312 | if (b < 0) { 313 | flipped = 1; 314 | b = -b; 315 | } 316 | 317 | /* calculate approximation with fraction of the exponent */ 318 | int e = (int)b; 319 | union { float f; int x; } u = { a }; 320 | u.x = (int)((b - e) * (u.x - 1065353216) + 1065353216); 321 | 322 | float r = 1.0f; 323 | while (e) { 324 | if (e & 1) { 325 | r *= a; 326 | } 327 | a *= a; 328 | e >>= 1; 329 | } 330 | 331 | r *= u.f; 332 | return flipped ? 1.0f / r : r; 333 | } 334 | 335 | /* should be much more precise with large b */ 336 | float better_powf_fast_precise(float a, float b) { 337 | int flipped = 0; 338 | if (b < 0) { 339 | flipped = 1; 340 | b = -b; 341 | } 342 | 343 | /* calculate approximation with fraction of the exponent */ 344 | int e = (int)b; 345 | float f = better_expf_fast(b - e); 346 | 347 | float r = 1.0f; 348 | while (e) { 349 | if (e & 1) { 350 | r *= a; 351 | } 352 | a *= a; 353 | e >>= 1; 354 | } 355 | 356 | r *= f; 357 | return flipped ? 1.0f / r : r; 358 | } 359 | --------------------------------------------------------------------------------
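A note on the constants in src/util/FastExpLog.c: Schraudolph's trick exploits the IEEE-754 double layout. Computing i = round(2^52/ln 2 * x) + bias in integer arithmetic and reinterpreting i as a double approximates exp(x), because writing into the exponent field scales the value by powers of two; the different bias constants in the file select an upper bound, a lower bound, or minimum-RMSE behaviour, exactly as the header comment's magic-number table describes. The standalone driver below is a minimal sketch for experimentation only (the file name and main() are illustrative, not part of the repository); it copies exp_fast() verbatim from src/util/FastExpLog.c and prints its relative error against libm's exp():

/* demo_fast_exp.c -- illustrative sketch, not part of LiteMORT.
 * Build: cc demo_fast_exp.c -o demo_fast_exp -lm
 */
#include <stdio.h>
#include <math.h>

/* exp_fast() copied from src/util/FastExpLog.c:
 * 6497320848556798 = round(2^52 / ln 2); the additive constant is the
 * RMSE-centred bias discussed in the file's header comment. */
static double exp_fast(double a) {
    union { double d; long long x; } u;
    u.x = (long long)(6497320848556798LL * a + 0x3fef127e83d16f12LL);
    return u.d;
}

int main(void) {
    const double xs[] = { -5.0, -1.0, -0.1, 0.0, 0.1, 1.0, 5.0 };
    for (int i = 0; i < (int)(sizeof xs / sizeof xs[0]); ++i) {
        double approx = exp_fast(xs[i]);
        double exact  = exp(xs[i]);   /* libm reference */
        printf("x=%6.2f  exp_fast=%.6g  exp=%.6g  rel.err=%.3g\n",
               xs[i], approx, exact, fabs(approx - exact) / exact);
    }
    return 0;
}

As the header comment warns, these approximations assume a little-endian machine and can be off by several percent; they trade accuracy for speed and are not drop-in replacements for the libm functions.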