├── .gitignore ├── readme.ipynb ├── readme.md ├── requirements.txt ├── stacking_classifier.py └── stacking_regressor.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | .ipynb_checkpoints/ 4 | catboost_info/ -------------------------------------------------------------------------------- /readme.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 特点\n", 8 | "(1)方便扩展,比如扩展Sklearn,Keras,CatBoost等工具(只需继承stacking_classifier中的Classifier类,并实现相应方法即可); \n", 9 | "(2)可以构建很深,很复杂的stacking结构 \n", 10 | "(3)支持离散变量(为了方便lightgbm,catboost) \n", 11 | "(4)支持并行/并发训练 \n", 12 | "\n", 13 | "接下来,我在手写数值识别上演示api使用示例: " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from stacking_classifier import *\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "from sklearn.datasets import load_digits\n", 26 | "digits = load_digits()\n", 27 | "X, y = digits['data'], digits['target']\n", 28 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### 一.基本分类器的使用\n", 36 | "这里所有的分类器都需要实现Classifier类的接口,如果你是使用的Sklearn风格的分类器,只需要做如下操作即可,stacking_classifier中默认封装了SVMClassifier,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier,LogisticRegression,NaiveBayesClassifier,LightGBMClassifier,CatBoostClassifier等分类器" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "class LogisticRegression(SklearnClassifier):\n", 46 | " def __init__(self, train_params=None, subsample_features_rate=None, 
subsample_features_indices=None,\n", 47 | " categorical_feature_indices=None, n_jobs=1):\n", 48 | " from sklearn.linear_model import LogisticRegression\n", 49 | " SklearnClassifier.__init__(self, train_params, LogisticRegression, subsample_features_rate,\n", 50 | " subsample_features_indices, categorical_feature_indices, n_jobs)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "0.9454545086848583\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "classifier = LogisticRegression()\n", 68 | "classifier.build_model()\n", 69 | "classifier.fit(X_train, y_train)\n", 70 | "p_test = classifier.predict(X_test)\n", 71 | "print(f1_score(y_test, p_test, average='macro'))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### 二.KFolds_Classifier_Training_Wrapper包装器的使用\n", 79 | "```KFolds_Classifier_Training_Wrapper```可以将数据切分成```k_fold```份,并训练```k_fold```个分类器" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "0.9385512218861211\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "classifier = RandomForestClassifier()\n", 97 | "classifier = KFolds_Classifier_Training_Wrapper(classifier,k_fold=5)#这里封装一下即可,默认k_fold=5\n", 98 | "classifier.build_model()\n", 99 | "classifier.fit(X_train, y_train)\n", 100 | "p_test = classifier.predict(X_test)\n", 101 | "print(f1_score(y_test, p_test, average='macro'))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "0.9420283908932341\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "classifier = RandomForestClassifier()\n", 119 | 
"#KFolds_Classifier_Training_Wrapper也可以嵌套封装,这样下面就有25个基分类器\n", 120 | "classifier = KFolds_Classifier_Training_Wrapper(KFolds_Classifier_Training_Wrapper(classifier))\n", 121 | "classifier.build_model()\n", 122 | "classifier.fit(X_train, y_train)\n", 123 | "p_test = classifier.predict(X_test)\n", 124 | "print(f1_score(y_test, p_test, average='macro'))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### 三.StackingClassifier分类器的使用\n", 132 | "```StackingClassifier```中的基分类器和元分类器可以是任意继承了Classifier类的子类,由于```KFolds_Classifier_Training_Wrapper```以及```StackingClassifier```都继承了```Classifier```类,所以意味着你可以任意嵌套..." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "0.9368902197269791\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "classifier = StackingClassifier(\n", 150 | " base_classifiers=[\n", 151 | " RandomForestClassifier(),\n", 152 | " AdaBoostClassifier(),\n", 153 | " BaggingClassifier(),\n", 154 | " SVMClassifier(),\n", 155 | " ],\n", 156 | " meta_classifier=LogisticRegression(),\n", 157 | " force_cv=False#默认为True,会对base_classifiers,meta_classifier进行KFolds_Classifier_Training_Wrapper包装\n", 158 | ")\n", 159 | "classifier.build_model()\n", 160 | "classifier.fit(train_x=X_train, train_y=y_train)\n", 161 | "p_test = classifier.predict(X_test)\n", 162 | "print(f1_score(y_test, p_test, average='macro'))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "0.9573682188285277\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "classifier = StackingClassifier(\n", 180 | " base_classifiers=[\n", 181 | " RandomForestClassifier(),\n", 182 | " AdaBoostClassifier(),\n", 183 | " BaggingClassifier(),\n", 184 | " 
SVMClassifier(),\n", 185 | " StackingClassifier(\n", 186 | " base_classifiers=[\n", 187 | " LogisticRegression(),\n", 188 | " RandomForestClassifier(),\n", 189 | " ],\n", 190 | " meta_classifier=GradientBoostingClassifier(),\n", 191 | " )\n", 192 | " ],\n", 193 | " meta_classifier=LogisticRegression(),\n", 194 | " base_k_fold=5,#基分类器分拆份数,force_cv=True时生效,\n", 195 | " meta_k_fold=5,#元分类器分拆份数,force_cv=True时生效,\n", 196 | ")\n", 197 | "classifier.build_model()\n", 198 | "classifier.fit(train_x=X_train, train_y=y_train)\n", 199 | "p_test = classifier.predict(X_test)\n", 200 | "print(f1_score(y_test, p_test, average='macro'))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### 四.随机/指定选择训练和预测的feature\n", 208 | "可以随机选择,通过```subsample_features_indices```指定选择训练的feature,```subsample_features_rate```随机选择训练的feature" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 8, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "0.9527371565889977\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "classifier = StackingClassifier(\n", 226 | " base_classifiers=[\n", 227 | " RandomForestClassifier(subsample_features_indices=[1,4,7,8]),#指定只使用第1,4,7,8列特征用于训练和预测,上层的参数不会覆盖此参数\n", 228 | " AdaBoostClassifier(subsample_features_rate=0.1),#随机选择10%的特征用于训练和预测,上层的参数不会覆盖此参数\n", 229 | " BaggingClassifier(),\n", 230 | " SVMClassifier(),\n", 231 | " StackingClassifier(\n", 232 | " base_classifiers=[\n", 233 | " LogisticRegression(),\n", 234 | " RandomForestClassifier(),\n", 235 | " ],\n", 236 | " meta_classifier=GradientBoostingClassifier(),\n", 237 | " )\n", 238 | " ],\n", 239 | " meta_classifier=LogisticRegression(),\n", 240 | " subsample_features_rate=0.5#该参数会向下传递到最底层的所有未指定subsample_features_rate参数的分类器,subsample_features_indices同理\n", 241 | ")\n", 242 | "classifier.build_model()\n", 243 | "classifier.fit(train_x=X_train, train_y=y_train)\n", 244 
| "p_test = classifier.predict(X_test)\n", 245 | "print(f1_score(y_test, p_test, average='macro'))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### 五.支持离散变量的输入\n", 253 | "这里为了方便lightgbm,catboost操作而支持离散变量类型,注意: \n", 254 | "(1)**必须在最顶层指定str/object类型的变量**(这样底层不支持str/object类型的分类器才能过滤掉这些特征); \n", 255 | "(2)lightgbm不支持'x','y','z'这种类型的离散变量,只支持‘1’,'2','3'或者int/float类型的离散变量,所以有时需要单独指定; \n", 256 | "(3)如果指定了```categorical_feature_indices```参数,```subsample_features_rate,subsample_features_indices```退化为只对剩余的非```categorical_feature_indices```特征生效" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 9, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "#为原始数据添加两列:一列是数值的字符串,一列是随意的字符串\n", 266 | "import numpy as np\n", 267 | "new_column = np.asarray(['1'] * 1797)\n", 268 | "new_column2 = np.asarray(['x'] * 1797)\n", 269 | "X_new = np.concatenate([X, new_column.reshape(1797, 1), new_column2.reshape(1797, 1)], axis=1)\n", 270 | "X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, y, test_size=0.7, random_state=42)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "array([['0.0', '0.0', '10.0', '13.0', '9.0', '1.0', '0.0', '0.0', '0.0',\n", 282 | " '2.0', '16.0', '7.0', '10.0', '8.0', '0.0', '0.0', '0.0', '0.0',\n", 283 | " '12.0', '12.0', '7.0', '11.0', '0.0', '0.0', '0.0', '3.0',\n", 284 | " '16.0', '16.0', '16.0', '7.0', '0.0', '0.0', '0.0', '0.0', '5.0',\n", 285 | " '8.0', '12.0', '10.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0',\n", 286 | " '0.0', '11.0', '7.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0',\n", 287 | " '3.0', '15.0', '0.0', '0.0', '0.0', '11.0', '16.0', '16.0',\n", 288 | " '16.0', '8.0', '0.0', '1', 'x']], dtype=' 1:\n", 477 | " return probas\n", 478 | " else:\n", 479 | " return np.asarray([[1 - proba, proba] for proba in 
probas])" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 14, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "0.9561217803277486\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "#然后就可以嵌入到Stacking中了\n", 497 | "classifier = StackingClassifier(\n", 498 | " base_classifiers=[\n", 499 | " LightGBMClassifier(),\n", 500 | " CatBoostClassifier(),\n", 501 | " RandomForestClassifier(),\n", 502 | " AdaBoostClassifier(),\n", 503 | " BaggingClassifier(),\n", 504 | " SVMClassifier(),\n", 505 | " StackingClassifier(\n", 506 | " base_classifiers=[\n", 507 | " SimpleMLPClassifer(train_params={'input_num':64,'class_num':10}),#比如放这儿\n", 508 | " RandomForestClassifier(),\n", 509 | " ],\n", 510 | " meta_classifier=GradientBoostingClassifier(),\n", 511 | " )\n", 512 | " ],\n", 513 | " meta_classifier=LogisticRegression()\n", 514 | ")\n", 515 | "classifier.build_model()\n", 516 | "classifier.fit(train_x=X_train, train_y=y_train)\n", 517 | "p_test = classifier.predict(X_test)\n", 518 | "print(f1_score(y_test, p_test, average='macro'))" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "### 八.并行/并发训练\n", 526 | "在Linux中采用多进程并行的方式训练,在Windows中采用多线程并发的方式训练,目前仅在Windows中做过简单测试,能比串行训练提速70%+左右(视具体Stacking结构的不同,提速效率也不一样,不建议将meta_classifier定义为复杂的结构,这部分没有做过多优化),使用方式很简单,在最顶层设置```n_jobs=-1```即可,该模块后面还会持续优化..." 
527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 15, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "0.9522582080173111\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "classifier = StackingClassifier(\n", 544 | " base_classifiers=[\n", 545 | " RandomForestClassifier(subsample_features_indices=[1,4,7,8],train_params={'n_estimators':200}),\n", 546 | " AdaBoostClassifier(subsample_features_rate=0.1),\n", 547 | " LogisticRegression(train_params={'penalty':'l2','C':1.0}),\n", 548 | " LightGBMClassifier(),\n", 549 | " CatBoostClassifier(train_params={'depth': 3, 'iterations': 50}),\n", 550 | " StackingClassifier(\n", 551 | " base_classifiers=[\n", 552 | " LogisticRegression(train_params={'C':2.0}),\n", 553 | " RandomForestClassifier(),\n", 554 | " ],\n", 555 | " meta_classifier=GradientBoostingClassifier(),\n", 556 | " )\n", 557 | " ],\n", 558 | " meta_classifier=LogisticRegression(),\n", 559 | " subsample_features_rate=0.5,\n", 560 | " n_jobs=-1#这里\n", 561 | ")\n", 562 | "classifier.build_model()\n", 563 | "classifier.fit(train_x=X_train, train_y=y_train)\n", 564 | "p_test = classifier.predict(X_test)\n", 565 | "print(f1_score(y_test, p_test, average='macro'))" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "### 九.模型保存与加载" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 16, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "#保存\n", 582 | "classifier.save_model('stacking.model')" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 17, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "#加载\n", 592 | "new_classifier=Classifier.load_model('stacking.model')#注意是Classifier类,不是classifier对象" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 18, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | 
"name": "stdout", 602 | "output_type": "stream", 603 | "text": [ 604 | "0.9522582080173111\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "p_test = new_classifier.predict(X_test)\n", 610 | "print(f1_score(y_test, p_test, average='macro'))" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "### 十.回归\n", 618 | "回归的操作与Classifier类似,不再赘述,下面列一下对应关系: \n", 619 | "stacking_classifier->stacking_regressor \n", 620 | "Classifier->Regressor \n", 621 | "SklearnClassifier->SklearnRegressor \n", 622 | "KFolds_Classifier_Training_Wrapper->KFolds_Regressor_Training_Wrapper \n", 623 | "StackingClassifier->StackingRegressor \n", 624 | "\n", 625 | "```subsample_features_rate,subsample_features_indices,categorical_feature_indices,n_jobs```的相关内容还未在回归中实现,后续更新..." 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | } 635 | ], 636 | "metadata": { 637 | "kernelspec": { 638 | "display_name": "Python 3", 639 | "language": "python", 640 | "name": "python3" 641 | }, 642 | "language_info": { 643 | "codemirror_mode": { 644 | "name": "ipython", 645 | "version": 3 646 | }, 647 | "file_extension": ".py", 648 | "mimetype": "text/x-python", 649 | "name": "python", 650 | "nbconvert_exporter": "python", 651 | "pygments_lexer": "ipython3", 652 | "version": "3.6.5" 653 | } 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 2 657 | } 658 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### 特点 3 | (1)方便扩展,比如扩展Sklearn,Keras,CatBoost等工具(只需继承stacking_classifier中的Classifier类,并实现相应方法即可); 4 | (2)可以构建很深,很复杂的stacking结构 5 | (3)支持离散变量(为了方便lightgbm,catboost) 6 | (4)支持并行/并发训练 7 | 8 | 接下来,我在手写数值识别上演示api使用示例: 9 | 10 | 11 | ```python 12 | from stacking_classifier import * 13 | from sklearn.model_selection import train_test_split 14 | 
from sklearn.metrics import f1_score 15 | from sklearn.datasets import load_digits 16 | digits = load_digits() 17 | X, y = digits['data'], digits['target'] 18 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42) 19 | ``` 20 | 21 | ### 一.基本分类器的使用 22 | 这里所有的分类器都需要实现Classifier类的接口,如果你是使用的Sklearn风格的分类器,只需要做如下操作即可,stacking_classifier中默认封装了SVMClassifier,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier,LogisticRegression,NaiveBayesClassifier,LightGBMClassifier,CatBoostClassifier等分类器 23 | 24 | 25 | ```python 26 | class LogisticRegression(SklearnClassifier): 27 | def __init__(self, train_params=None, subsample_features_rate=None, subsample_features_indices=None, 28 | categorical_feature_indices=None, n_jobs=1): 29 | from sklearn.linear_model import LogisticRegression 30 | SklearnClassifier.__init__(self, train_params, LogisticRegression, subsample_features_rate, 31 | subsample_features_indices, categorical_feature_indices, n_jobs) 32 | ``` 33 | 34 | 35 | ```python 36 | classifier = LogisticRegression() 37 | classifier.build_model() 38 | classifier.fit(X_train, y_train) 39 | p_test = classifier.predict(X_test) 40 | print(f1_score(y_test, p_test, average='macro')) 41 | ``` 42 | 43 | 0.9454545086848583 44 | 45 | 46 | ### 二.KFolds_Classifier_Training_Wrapper包装器的使用 47 | ```KFolds_Classifier_Training_Wrapper```可以将数据切分成```k_fold```份,并训练```k_fold```个分类器 48 | 49 | 50 | ```python 51 | classifier = RandomForestClassifier() 52 | classifier = KFolds_Classifier_Training_Wrapper(classifier,k_fold=5)#这里封装一下即可,默认k_fold=5 53 | classifier.build_model() 54 | classifier.fit(X_train, y_train) 55 | p_test = classifier.predict(X_test) 56 | print(f1_score(y_test, p_test, average='macro')) 57 | ``` 58 | 59 | 0.9385512218861211 60 | 61 | 62 | 63 | ```python 64 | classifier = RandomForestClassifier() 65 | #KFolds_Classifier_Training_Wrapper也可以嵌套封装,这样下面就有25个基分类器 66 | classifier = 
KFolds_Classifier_Training_Wrapper(KFolds_Classifier_Training_Wrapper(classifier)) 67 | classifier.build_model() 68 | classifier.fit(X_train, y_train) 69 | p_test = classifier.predict(X_test) 70 | print(f1_score(y_test, p_test, average='macro')) 71 | ``` 72 | 73 | 0.9420283908932341 74 | 75 | 76 | ### 三.StackingClassifier分类器的使用 77 | ```StackingClassifier```中的基分类器和元分类器可以是任意继承了Classifier类的子类,由于```KFolds_Classifier_Training_Wrapper```以及```StackingClassifier```都继承了```Classifier```类,所以意味着你可以任意嵌套... 78 | 79 | 80 | ```python 81 | classifier = StackingClassifier( 82 | base_classifiers=[ 83 | RandomForestClassifier(), 84 | AdaBoostClassifier(), 85 | BaggingClassifier(), 86 | SVMClassifier(), 87 | ], 88 | meta_classifier=LogisticRegression(), 89 | force_cv=False#默认为True,会对base_classifiers,meta_classifier进行KFolds_Classifier_Training_Wrapper包装 90 | ) 91 | classifier.build_model() 92 | classifier.fit(train_x=X_train, train_y=y_train) 93 | p_test = classifier.predict(X_test) 94 | print(f1_score(y_test, p_test, average='macro')) 95 | ``` 96 | 97 | 0.9368902197269791 98 | 99 | 100 | 101 | ```python 102 | classifier = StackingClassifier( 103 | base_classifiers=[ 104 | RandomForestClassifier(), 105 | AdaBoostClassifier(), 106 | BaggingClassifier(), 107 | SVMClassifier(), 108 | StackingClassifier( 109 | base_classifiers=[ 110 | LogisticRegression(), 111 | RandomForestClassifier(), 112 | ], 113 | meta_classifier=GradientBoostingClassifier(), 114 | ) 115 | ], 116 | meta_classifier=LogisticRegression(), 117 | base_k_fold=5,#基分类器分拆份数,force_cv=True时生效, 118 | meta_k_fold=5,#元分类器分拆份数,force_cv=True时生效, 119 | ) 120 | classifier.build_model() 121 | classifier.fit(train_x=X_train, train_y=y_train) 122 | p_test = classifier.predict(X_test) 123 | print(f1_score(y_test, p_test, average='macro')) 124 | ``` 125 | 126 | 0.9573682188285277 127 | 128 | 129 | ### 四.随机/指定选择训练和预测的feature 130 | 可以随机选择,通过```subsample_features_indices```指定选择训练的feature,```subsample_features_rate```随机选择训练的feature 131 | 132 | 
133 | ```python 134 | classifier = StackingClassifier( 135 | base_classifiers=[ 136 | RandomForestClassifier(subsample_features_indices=[1,4,7,8]),#指定只使用第1,4,7,8列特征用于训练和预测,上层的参数不会覆盖此参数 137 | AdaBoostClassifier(subsample_features_rate=0.1),#随机选择10%的特征用于训练和预测,上层的参数不会覆盖此参数 138 | BaggingClassifier(), 139 | SVMClassifier(), 140 | StackingClassifier( 141 | base_classifiers=[ 142 | LogisticRegression(), 143 | RandomForestClassifier(), 144 | ], 145 | meta_classifier=GradientBoostingClassifier(), 146 | ) 147 | ], 148 | meta_classifier=LogisticRegression(), 149 | subsample_features_rate=0.5#该参数会向下传递到最底层的所有未指定subsample_features_rate参数的分类器,subsample_features_indices同理 150 | ) 151 | classifier.build_model() 152 | classifier.fit(train_x=X_train, train_y=y_train) 153 | p_test = classifier.predict(X_test) 154 | print(f1_score(y_test, p_test, average='macro')) 155 | ``` 156 | 157 | 0.9527371565889977 158 | 159 | 160 | ### 五.支持离散变量的输入 161 | 这里为了方便lightgbm,catboost操作而支持离散变量类型,注意: 162 | (1)**必须在最顶层指定str/object类型的变量**(这样底层不支持str/object类型的分类器才能过滤掉这些特征); 163 | (2)lightgbm不支持'x','y','z'这种类型的离散变量,只支持‘1’,'2','3'或者int/float类型的离散变量,所以有时需要单独指定; 164 | (3)如果指定了```categorical_feature_indices```参数,```subsample_features_rate,subsample_features_indices```退化为只对剩余的非```categorical_feature_indices```特征生效 165 | 166 | 167 | ```python 168 | #为原始数据添加两列:一列是数值的字符串,一列是随意的字符串 169 | import numpy as np 170 | new_column = np.asarray(['1'] * 1797) 171 | new_column2 = np.asarray(['x'] * 1797) 172 | X_new = np.concatenate([X, new_column.reshape(1797, 1), new_column2.reshape(1797, 1)], axis=1) 173 | X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, y, test_size=0.7, random_state=42) 174 | ``` 175 | 176 | 177 | ```python 178 | X_new_train[0:1] 179 | ``` 180 | 181 | 182 | 183 | 184 | array([['0.0', '0.0', '10.0', '13.0', '9.0', '1.0', '0.0', '0.0', '0.0', 185 | '2.0', '16.0', '7.0', '10.0', '8.0', '0.0', '0.0', '0.0', '0.0', 186 | '12.0', '12.0', '7.0', '11.0', '0.0', '0.0', '0.0', '3.0', 187 | 
'16.0', '16.0', '16.0', '7.0', '0.0', '0.0', '0.0', '0.0', '5.0', 188 | '8.0', '12.0', '10.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', 189 | '0.0', '11.0', '7.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', 190 | '3.0', '15.0', '0.0', '0.0', '0.0', '11.0', '16.0', '16.0', 191 | '16.0', '8.0', '0.0', '1', 'x']], dtype=' 1: 332 | return probas 333 | else: 334 | return np.asarray([[1 - proba, proba] for proba in probas]) 335 | ``` 336 | 337 | Using TensorFlow backend. 338 | 339 | 340 | 341 | ```python 342 | #然后就可以嵌入到Stacking中了 343 | classifier = StackingClassifier( 344 | base_classifiers=[ 345 | LightGBMClassifier(), 346 | CatBoostClassifier(), 347 | RandomForestClassifier(), 348 | AdaBoostClassifier(), 349 | BaggingClassifier(), 350 | SVMClassifier(), 351 | StackingClassifier( 352 | base_classifiers=[ 353 | SimpleMLPClassifer(train_params={'input_num':64,'class_num':10}),#比如放这儿 354 | RandomForestClassifier(), 355 | ], 356 | meta_classifier=GradientBoostingClassifier(), 357 | ) 358 | ], 359 | meta_classifier=LogisticRegression() 360 | ) 361 | classifier.build_model() 362 | classifier.fit(train_x=X_train, train_y=y_train) 363 | p_test = classifier.predict(X_test) 364 | print(f1_score(y_test, p_test, average='macro')) 365 | ``` 366 | 367 | 0.9561217803277486 368 | 369 | 370 | ### 八.并行/并发训练 371 | 在Linux中采用多进程并行的方式训练,在Windows中采用多线程并发的方式训练,目前仅在Windows中做过简单测试,能比串行训练提速70%+左右(视具体Stacking结构的不同,提速效率也不一样,不建议将meta_classifier定义为复杂的结构,这部分没有做过多优化),使用方式很简单,在最顶层设置```n_jobs=-1```即可,该模块后面还会持续优化... 
372 | 373 | 374 | ```python 375 | classifier = StackingClassifier( 376 | base_classifiers=[ 377 | RandomForestClassifier(subsample_features_indices=[1,4,7,8],train_params={'n_estimators':200}), 378 | AdaBoostClassifier(subsample_features_rate=0.1), 379 | LogisticRegression(train_params={'penalty':'l2','C':1.0}), 380 | LightGBMClassifier(), 381 | CatBoostClassifier(train_params={'depth': 3, 'iterations': 50}), 382 | StackingClassifier( 383 | base_classifiers=[ 384 | LogisticRegression(train_params={'C':2.0}), 385 | RandomForestClassifier(), 386 | ], 387 | meta_classifier=GradientBoostingClassifier(), 388 | ) 389 | ], 390 | meta_classifier=LogisticRegression(), 391 | subsample_features_rate=0.5, 392 | n_jobs=-1#这里 393 | ) 394 | classifier.build_model() 395 | classifier.fit(train_x=X_train, train_y=y_train) 396 | p_test = classifier.predict(X_test) 397 | print(f1_score(y_test, p_test, average='macro')) 398 | ``` 399 | 400 | 0.9522582080173111 401 | 402 | 403 | ### 九.模型保存与加载 404 | 405 | 406 | ```python 407 | #保存 408 | classifier.save_model('stacking.model') 409 | ``` 410 | 411 | 412 | ```python 413 | #加载 414 | new_classifier=Classifier.load_model('stacking.model')#注意是Classifier类,不是classifier对象 415 | ``` 416 | 417 | 418 | ```python 419 | p_test = new_classifier.predict(X_test) 420 | print(f1_score(y_test, p_test, average='macro')) 421 | ``` 422 | 423 | 0.9522582080173111 424 | 425 | 426 | ### 十.回归 427 | 回归的操作与Classifier类似,不再赘述,下面列一下对应关系: 428 | stacking_classifier->stacking_regressor 429 | Classifier->Regressor 430 | SklearnClassifier->SklearnRegressor 431 | KFolds_Classifier_Training_Wrapper->KFolds_Regressor_Training_Wrapper 432 | StackingClassifier->StackingRegressor 433 | 434 | ```subsample_features_rate,subsample_features_indices,categorical_feature_indices,n_jobs```的相关内容还未在回归中实现,后续更新... 
def to_categorical(y, num_classes=None, dtype='float32'):
    """Convert integer class labels into a one-hot encoded array.

    Adapted from the Keras utility of the same name.

    :param y: array-like of class labels; converted with ``np.array(y, dtype='int')``
    :param num_classes: width of the encoding; when falsy (``None`` or ``0``)
        it is taken as ``max(y) + 1``
    :param dtype: dtype of the returned array
    :return: array shaped ``y.shape + (num_classes,)``; for inputs of rank > 1
        a trailing axis of length 1 is dropped before the class axis is added
    """
    labels = np.array(y, dtype='int')
    leading_shape = labels.shape
    # A column vector shaped (..., 1) is encoded like its squeezed form (...,).
    if leading_shape and len(leading_shape) > 1 and leading_shape[-1] == 1:
        leading_shape = tuple(leading_shape[:-1])
    flat_labels = labels.ravel()
    if not num_classes:
        num_classes = np.max(flat_labels) + 1
    sample_count = flat_labels.shape[0]
    one_hot = np.zeros((sample_count, num_classes), dtype=dtype)
    # Fancy indexing sets exactly one cell per row: (row i, column label_i).
    one_hot[np.arange(sample_count), flat_labels] = 1
    return np.reshape(one_hot, leading_shape + (num_classes,))
np.asarray(args[1]) 61 | elif args[1].__class__.__name__ == 'ndarray': 62 | inputs_0 = args[1] 63 | else: 64 | raise RuntimeError('未知数据类型:', args[1].__class__.__name__) 65 | 66 | if len(args) == 3: 67 | if args[2].__class__.__name__ == 'Series': 68 | inputs_1 = args[2].values 69 | elif args[2].__class__.__name__ == 'list': 70 | inputs_1 = np.asarray(args[2]) 71 | elif args[2].__class__.__name__ == 'ndarray': 72 | inputs_1 = args[2] 73 | else: 74 | raise RuntimeError('未知数据类型:', args[2].__class__.__name__) 75 | if len(args) == 2: 76 | return fn(args[0], inputs_0) 77 | else: 78 | return fn(args[0], inputs_0, inputs_1) 79 | 80 | return clean_data 81 | 82 | 83 | """ 84 | 分类器接口 85 | """ 86 | 87 | 88 | class Classifier(object): 89 | """ 90 | 定义分类器接口 91 | """ 92 | 93 | def __init__(self, train_params=None, subsample_features_rate=None, subsample_features_indices=None, 94 | categorical_feature_indices=None, n_jobs=1): 95 | """ 96 | :param train_params: 训练参数 97 | """ 98 | self.train_params = {} if train_params is None else train_params 99 | self.subsample_features_rate = subsample_features_rate 100 | self.subsample_features_indices = subsample_features_indices 101 | self.categorical_feature_indices = categorical_feature_indices 102 | self.n_jobs = n_jobs 103 | 104 | def reshape_features(self, features): 105 | """ 106 | 读取features指定列用于训练或者随机选择某几列训练 107 | :param features: 108 | :return: 109 | """ 110 | _, columns = features.shape 111 | indices = list(range(0, columns)) 112 | # 默认会排除字符串变量 113 | no_categorical_feature_indices = [] 114 | if self.categorical_feature_indices is not None: 115 | for index in indices: 116 | if index not in self.categorical_feature_indices: 117 | no_categorical_feature_indices.append(index) 118 | else: 119 | no_categorical_feature_indices = indices 120 | 121 | if self.subsample_features_indices is None and self.subsample_features_rate is not None: 122 | random.shuffle(no_categorical_feature_indices) 123 | self.subsample_features_indices = 
no_categorical_feature_indices[ 124 | :int(len(no_categorical_feature_indices) * self.subsample_features_rate)] 125 | if self.subsample_features_indices is not None: 126 | return features[:, self.subsample_features_indices] 127 | return features[:, no_categorical_feature_indices] 128 | 129 | @staticmethod 130 | def update_params(current_classifier, subsample_features_rate, subsample_features_indices, 131 | categorical_feature_indices): 132 | ''' 133 | 递归向下更新参数 134 | :return: 135 | ''' 136 | if current_classifier.subsample_features_rate is None: 137 | current_classifier.subsample_features_rate = subsample_features_rate 138 | if current_classifier.subsample_features_indices is None: 139 | current_classifier.subsample_features_indices = subsample_features_indices 140 | if current_classifier.categorical_feature_indices is None: 141 | current_classifier.categorical_feature_indices = categorical_feature_indices 142 | 143 | if current_classifier.__class__.__name__ == 'KFolds_Classifier_Training_Wrapper': 144 | Classifier.update_params(current_classifier.base_classifier, current_classifier.subsample_features_rate, 145 | current_classifier.subsample_features_indices, 146 | current_classifier.categorical_feature_indices) 147 | if current_classifier.__class__.__name__ == 'StackingClassifier': 148 | for base_classifier in current_classifier.base_classifiers: 149 | Classifier.update_params(base_classifier, current_classifier.subsample_features_rate, 150 | current_classifier.subsample_features_indices, 151 | current_classifier.categorical_feature_indices) 152 | 153 | def build_model(self): 154 | """ 155 | 创建模型 156 | :return: 157 | """ 158 | raise RuntimeError("need to implement!") 159 | 160 | def fit(self, train_x, train_y): 161 | """ 162 | 拟合数据 163 | :return: 164 | """ 165 | raise RuntimeError("need to implement!") 166 | 167 | def predict(self, test_x): 168 | """ 169 | 预测标签 170 | :param test_x: 171 | :return: 172 | """ 173 | raise RuntimeError("need to implement!") 174 | 175 | 
def predict_proba(self, test_x):
    """
    Predict class probabilities for ``test_x``.

    Abstract hook: concrete classifiers must override it.

    :param test_x: features to score
    :return: never returns in this base implementation
    :raises RuntimeError: always, to signal the missing override
    """
    # build_model/fit/predict/predict_categorical all raise here; returning
    # None instead would only fail later, far from the real cause.
    raise RuntimeError("need to implement!")
class SVMClassifier(SklearnClassifier):
    """
    ``sklearn.svm.SVC`` wrapped in the ``SklearnClassifier`` interface.

    ``probability=True`` is always forced into the training parameters so the
    wrapped model can serve the ``predict_proba`` based methods.
    """

    def __init__(self, train_params=None, subsample_features_rate=None, subsample_features_indices=None,
                 categorical_feature_indices=None, n_jobs=1):
        """
        :param train_params: keyword arguments for ``SVC``; not modified
        :param subsample_features_rate: fraction of features to sample
        :param subsample_features_indices: explicit feature columns to use
        :param categorical_feature_indices: categorical feature columns
        :param n_jobs: forwarded to ``SklearnClassifier``
        """
        from sklearn.svm import SVC
        # Work on a copy: writing 'probability' into the caller's dict would
        # leak into every other model built from the same params object.
        train_params = {} if train_params is None else dict(train_params)
        train_params['probability'] = True
        SklearnClassifier.__init__(self, train_params, SVC, subsample_features_rate, subsample_features_indices,
                                   categorical_feature_indices, n_jobs)
class NaiveBayesClassifier(SklearnClassifier):
    """``sklearn.naive_bayes.GaussianNB`` wrapped in the ``SklearnClassifier`` interface."""

    def __init__(self, train_params=None, subsample_features_rate=None, subsample_features_indices=None,
                 categorical_feature_indices=None, n_jobs=1):
        """
        :param train_params: keyword arguments for ``GaussianNB``
        :param subsample_features_rate: fraction of features to sample
        :param subsample_features_indices: explicit feature columns to use
        :param categorical_feature_indices: categorical feature columns
        :param n_jobs: forwarded to ``SklearnClassifier``
        """
        # Imported inside the constructor, as the other wrappers do.
        from sklearn.naive_bayes import GaussianNB as model_class
        SklearnClassifier.__init__(
            self,
            train_params=train_params,
            classifier_class=model_class,
            subsample_features_rate=subsample_features_rate,
            subsample_features_indices=subsample_features_indices,
            categorical_feature_indices=categorical_feature_indices,
            n_jobs=n_jobs,
        )
@force2ndarray
def fit(self, train_x, train_y):
    """
    Fit each fold classifier on its own training split.

    When ``n_jobs`` is anything other than ``None``, ``0`` or ``1`` the work
    is delegated to ``MultiProcessTrainer``; otherwise the classifiers in
    ``self.extend_classifiers`` are fitted one after another.

    :param train_x: training features
    :param train_y: training labels
    :return: None
    """
    if self.n_jobs not in [None, 0, 1]:
        # Parallel training.
        mpt = MultiProcessTrainer(self.n_jobs)
        mpt.build_trainer_tree(self, train_x, train_y)
        mpt.fit()
        return

    # random_state is deliberately not forwarded: with shuffle=False it has
    # no effect on the splits, and scikit-learn >= 0.24 rejects the
    # combination with a ValueError.
    kf = KFold(n_splits=self.k_fold, shuffle=False)
    for index, (train_index, _) in enumerate(kf.split(train_x)):
        self.extend_classifiers[index].fit(train_x[train_index], train_y[train_index])
@force2ndarray
def predict(self, test_x):
    """
    Predict labels by majority vote over the fold classifiers.

    The one-hot predictions of every fold classifier are summed; for each
    sample the first column holding the largest vote count is returned.

    :param test_x: features to predict
    :return: list with one winning class index per sample
    """
    vote_totals = self.extend_classifiers[0].predict_categorical(test_x)
    for fold_classifier in self.extend_classifiers[1:]:
        vote_totals += fold_classifier.predict_categorical(test_x)

    winners = []
    for sample_votes in vote_totals:
        votes = sample_votes.tolist()
        # list.index returns the first match, so ties go to the lowest class index.
        winners.append(votes.index(max(votes)))
    return winners
def __init__(self, base_classifiers=None, meta_classifier=None, use_probas=True, force_cv=True, base_k_fold=5,
             meta_k_fold=5, subsample_features_rate=None, subsample_features_indices=None,
             categorical_feature_indices=None, n_jobs=1):
    """
    Build a stacking ensemble with optional forced cross-validated training.

    :param base_classifiers: iterable of base classifiers (``None`` means
        no base classifiers); the caller's container is not modified
    :param meta_classifier: meta classifier trained on the base classifiers'
        predictions
    :param use_probas: if True the meta classifier is trained on the base
        classifiers' class probabilities, otherwise on their one-hot labels
    :param force_cv: if True, every base classifier and the meta classifier
        that is not already a ``KFolds_Classifier_Training_Wrapper`` is
        wrapped in one
    :param base_k_fold: ``k_fold`` used when wrapping base classifiers
    :param meta_k_fold: ``k_fold`` used when wrapping the meta classifier
    :param subsample_features_rate: passed down via ``Classifier.update_params``
    :param subsample_features_indices: passed down via ``Classifier.update_params``
    :param categorical_feature_indices: passed down via ``Classifier.update_params``
    :param n_jobs: stored on the instance
    """
    Classifier.__init__(self)
    # Keep a private list. The former ``list()`` default was created once and
    # shared by every instance, and wrapping in place rewrote the caller's
    # own list.
    self.base_classifiers = [] if base_classifiers is None else list(base_classifiers)
    self.meta_classifier = meta_classifier
    self.use_probas = use_probas
    self.n_jobs = n_jobs
    self.force_cv = force_cv
    if self.force_cv:
        self.base_classifiers = [
            classifier if isinstance(classifier, KFolds_Classifier_Training_Wrapper)
            else KFolds_Classifier_Training_Wrapper(classifier, k_fold=base_k_fold)
            for classifier in self.base_classifiers
        ]
        if not isinstance(self.meta_classifier, KFolds_Classifier_Training_Wrapper):
            self.meta_classifier = KFolds_Classifier_Training_Wrapper(self.meta_classifier, k_fold=meta_k_fold)

    # The three feature-selection settings are handed down recursively to the
    # concrete base classifiers.
    Classifier.update_params(self, subsample_features_rate, subsample_features_indices, categorical_feature_indices)
@force2ndarray
def get_base_classifier_training_categorical_proba(self, train_x):
    """
    Build the meta classifier's training features from class probabilities.

    Base classifiers that provide
    ``extract_k_fold_data_catogorical_proba_features`` (the k-fold wrapper)
    contribute out-of-fold probabilities; all others contribute
    ``predict_categorical_proba`` on the training data.

    :param train_x: training features given to the base classifiers
    :return: the per-classifier probability blocks concatenated along the
        last axis
    """
    _all_categorical_probas = []
    for classifier in self.base_classifiers:
        # Check for the capability explicitly. A bare ``except`` here hid
        # real failures inside the extractor and silently substituted
        # predictions made on data the classifier was trained on.
        extract_out_of_fold = getattr(classifier, 'extract_k_fold_data_catogorical_proba_features', None)
        if extract_out_of_fold is not None:
            current_category_labels = extract_out_of_fold(train_x)
        else:
            current_category_labels = classifier.predict_categorical_proba(train_x)
        _all_categorical_probas.append(current_category_labels)
    return np.concatenate(_all_categorical_probas, axis=-1)
@force2ndarray
def predict(self, test_x):
    """
    Predict labels with the meta classifier.

    The meta classifier is fed the combined base-classifier probabilities
    when ``use_probas`` is set, otherwise their combined one-hot labels.

    :param test_x: features to predict
    :return: whatever ``meta_classifier.predict`` returns
    """
    if self.use_probas:
        meta_features = self.combine_base_classifier_predict_categorical_proba(test_x)
    else:
        meta_features = self.combine_base_classifier_predict_categorical(test_x)
    return self.meta_classifier.predict(meta_features)
def reshape_features(self, features):
    """
    Select and order the feature columns handed to the model.

    Categorical columns (the classifier's own list if set, otherwise the
    inherited one) are placed first; they are followed either by the
    explicitly chosen / previously sampled columns or by every
    non-categorical column. ``self.training_categorical_feature_indices`` is
    reset on each call and, when categorical columns exist, set to their
    positions in the returned matrix.

    :param features: 2-D feature array
    :return: array holding the selected columns
    """
    self.training_categorical_feature_indices = None
    _, columns = features.shape

    inherited_cats = self.categorical_feature_indices
    own_cats = self.self_define_categorical_feature_indices

    # Columns that are not categorical under either list.
    if inherited_cats is None and own_cats is None:
        plain_columns = list(range(0, columns))
    else:
        excluded = set([] if inherited_cats is None else inherited_cats) | set(
            [] if own_cats is None else own_cats)
        plain_columns = [column for column in range(0, columns) if column not in excluded]

    # Draw the random subset once and remember it, so later calls reuse it.
    if self.subsample_features_indices is None and self.subsample_features_rate is not None:
        random.shuffle(plain_columns)
        keep_count = int(len(plain_columns) * self.subsample_features_rate)
        self.subsample_features_indices = plain_columns[:keep_count]

    # The classifier's own categorical list wins over the inherited one.
    leading_cats = own_cats if own_cats is not None else inherited_cats

    if self.subsample_features_indices is not None:
        trailing_columns = self.subsample_features_indices
    else:
        trailing_columns = plain_columns

    if leading_cats is None:
        return features[:, trailing_columns]

    # Categorical columns sit at the front, so their new positions are 0..k-1.
    self.training_categorical_feature_indices = list(range(0, len(leading_cats)))
    return np.concatenate([features[:, leading_cats], features[:, trailing_columns]], axis=1)
subsample_features_indices, categorical_feature_indices=None) 722 | self.self_define_categorical_feature_indices = categorical_feature_indices 723 | 724 | # 由于CatBoostClassifier允许字符串变量,这里需要重写reshape_features 725 | def reshape_features(self, features): 726 | """ 727 | 读取features指定列用于训练或者随机选择某几列训练 728 | :param features: 729 | :return: 730 | """ 731 | self.training_categorical_feature_indices = None 732 | _, columns = features.shape 733 | indices = list(range(0, columns)) 734 | # 默认会排除字符串变量 735 | no_categorical_feature_indices = [] 736 | if self.categorical_feature_indices is not None or self.self_define_categorical_feature_indices is not None: 737 | combine_categorical_feature_indices = set( 738 | [] if self.categorical_feature_indices is None else self.categorical_feature_indices) | set( 739 | [] if self.self_define_categorical_feature_indices is None else self.self_define_categorical_feature_indices) 740 | for index in indices: 741 | if index not in combine_categorical_feature_indices: 742 | no_categorical_feature_indices.append(index) 743 | else: 744 | no_categorical_feature_indices = indices 745 | 746 | if self.subsample_features_indices is None and self.subsample_features_rate is not None: 747 | random.shuffle(no_categorical_feature_indices) 748 | self.subsample_features_indices = no_categorical_feature_indices[ 749 | :int(len(no_categorical_feature_indices) * self.subsample_features_rate)] 750 | # 单独将categorical_feature放到最前面 751 | if self.self_define_categorical_feature_indices is not None: 752 | top_categorical_feature_indices = self.self_define_categorical_feature_indices 753 | else: 754 | top_categorical_feature_indices = self.categorical_feature_indices 755 | 756 | if self.subsample_features_indices is not None: 757 | if top_categorical_feature_indices is None: 758 | return features[:, self.subsample_features_indices] 759 | else: 760 | self.training_categorical_feature_indices = list(range(0, len(top_categorical_feature_indices))) 761 | return 
@force2ndarray
def fit(self, train_x, train_y):
    """
    Fit the CatBoost model, holding part of the data out as an eval set.

    The reshaped features are split with ``train_test_split``; the held-out
    part is passed as ``eval_set`` with ``use_best_model=True``.
    ``cat_features`` is supplied only when ``reshape_features`` reported
    categorical columns.

    :param train_x: training features
    :param train_y: training labels
    :return: None
    """
    self.class_num = len(set(train_y))
    prepared_x = self.reshape_features(train_x)
    # Carve out a slice of the training data to act as evaluation data.
    fit_x, eval_x, fit_y, eval_y = train_test_split(prepared_x, train_y)

    fit_options = {'eval_set': (eval_x, eval_y), 'use_best_model': True, 'verbose': False}
    categorical_columns = self.training_categorical_feature_indices
    if categorical_columns is not None:
        fit_options['cat_features'] = categorical_columns
    self.classifier_model.fit(fit_x, fit_y, **fit_options)
def train(self):
    """
    Train the classifier attached to this node.

    A non-stacking node simply fits its classifier on the node's data. A
    stacking node assumes its base classifiers are already fitted: it builds
    the meta features from them and trains the meta classifier, through a
    nested ``MultiProcessTrainer`` when the meta classifier is itself a
    k-fold wrapper or a stacking classifier.

    :return: None
    """
    if self.if_stacking is False:
        self.classifier.fit(self.train_x, self.train_y)
        return

    stacking = self.classifier
    # Compute meta_train_x from the base classifiers' outputs.
    if stacking.use_probas:
        build_meta_features = stacking.get_base_classifier_training_categorical_proba
    else:
        build_meta_features = stacking.get_base_classifier_training_categorical
    meta_train_x = build_meta_features(self.train_x)

    meta_classifier = stacking.meta_classifier
    composite_names = ['KFolds_Classifier_Training_Wrapper', 'StackingClassifier']
    if meta_classifier.__class__.__name__ in composite_names:
        # Composite meta classifiers get their own trainer tree.
        nested_trainer = MultiProcessTrainer(meta_classifier.n_jobs)
        nested_trainer.build_trainer_tree(meta_classifier, meta_train_x, self.train_y)
        nested_trainer.fit()
    else:
        meta_classifier.fit(meta_train_x, self.train_y)
def fit(self):
    """
    Train every node of the trainer tree, deepest level first.

    Nodes are grouped by depth with ``search_trainer_level``; each level is
    trained concurrently and must finish before the level above it starts,
    so a stacking node only runs after its base classifiers. Level 1 (the
    placeholder root) is never trained.

    Training runs on a thread pool capped by ``n_jobs`` (``-1`` means all
    CPUs). Worker processes are not used: ``TrainerNode.train`` fits its
    classifier in place and returns nothing, so a model fitted in a child
    process would never reach the parent.

    :return: None
    :raises Exception: re-raises the first exception raised by a node's
        ``train`` call
    """
    from concurrent.futures import ThreadPoolExecutor

    if self.n_jobs == -1:
        max_cpu_count = cpu_count()
    else:
        max_cpu_count = min(cpu_count(), self.n_jobs)

    # Index of the tree by depth: {level: [nodes]}.
    self.trainer_level_dict = {}
    self.search_trainer_level(1, self.root_node)

    # Walk the levels that actually exist instead of a fixed 99..2 range, so
    # arbitrarily deep trees are trained completely.
    for level in sorted(self.trainer_level_dict, reverse=True):
        if level <= 1:
            continue  # the root is an empty placeholder
        trainers = self.trainer_level_dict[level]
        worker_count = max(1, min(max_cpu_count, len(trainers)))
        # The context manager joins all workers before the next level starts
        # and always releases the pool.
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            # list() drains the result iterator so worker errors surface here.
            list(executor.map(lambda node: node.train(), trainers))
class Regressor(object):
    """
    Interface every regressor wrapper implements.

    Subclasses override ``build_model``, ``fit`` and ``predict``; saving and
    loading via pickle is provided here.
    """

    def __init__(self, train_params=None):
        """
        :param train_params: training parameters; ``None`` becomes an empty dict
        """
        self.train_params = {} if train_params is None else train_params

    def build_model(self):
        """
        Create the underlying model.

        :raises NotImplementedError: always; subclasses must override
        """
        # NotImplementedError subclasses RuntimeError, so handlers written
        # for the previous RuntimeError keep working.
        raise NotImplementedError("need to implement!")

    def fit(self, train_x, train_y):
        """
        Fit the model to the training data.

        :param train_x: training features
        :param train_y: training targets
        :raises NotImplementedError: always; subclasses must override
        """
        raise NotImplementedError("need to implement!")

    def predict(self, test_x):
        """
        Predict targets for ``test_x``.

        :param test_x: features to predict
        :raises NotImplementedError: always; subclasses must override
        """
        raise NotImplementedError("need to implement!")

    def save_model(self, model_path):
        """
        Pickle this regressor to ``model_path``.

        :param model_path: destination file path
        :return: None
        """
        with open(model_path, 'wb') as model_file:
            pickle.dump(self, model_file)

    @staticmethod
    def load_model(model_path):
        """
        Load a regressor previously written by ``save_model``.

        :param model_path: path of the pickle file
        :return: the unpickled object
        """
        # NOTE(security): unpickling can execute arbitrary code embedded in
        # the file; only load model files from a trusted source.
        with open(model_path, 'rb') as model_file:
            return pickle.load(model_file)
89 | from sklearn.linear_model import LinearRegression 90 | SklearnRegressor.__init__(self, train_params, LinearRegression) 91 | 92 | 93 | class KNeighborsRegressor(SklearnRegressor): 94 | def __init__(self, train_params=None): 95 | from sklearn.neighbors import KNeighborsRegressor 96 | SklearnRegressor.__init__(self, train_params, KNeighborsRegressor) 97 | 98 | 99 | class AdaBoostRegressor(SklearnRegressor): 100 | def __init__(self, train_params=None): 101 | from sklearn.ensemble import AdaBoostRegressor 102 | SklearnRegressor.__init__(self, train_params, AdaBoostRegressor) 103 | 104 | 105 | class GradientBoostingRegressor(SklearnRegressor): 106 | def __init__(self, train_params=None): 107 | from sklearn.ensemble import GradientBoostingRegressor 108 | SklearnRegressor.__init__(self, train_params, GradientBoostingRegressor) 109 | 110 | 111 | class BaggingRegressor(SklearnRegressor): 112 | def __init__(self, train_params=None): 113 | from sklearn.ensemble import BaggingRegressor 114 | SklearnRegressor.__init__(self, train_params, BaggingRegressor) 115 | 116 | 117 | class ExtraTreeRegressor(SklearnRegressor): 118 | def __init__(self, train_params=None): 119 | from sklearn.tree import ExtraTreeRegressor 120 | SklearnRegressor.__init__(self, train_params, ExtraTreeRegressor) 121 | 122 | 123 | class SVRRegressor(SklearnRegressor): 124 | def __init__(self, train_params=None): 125 | from sklearn.svm import SVR 126 | SklearnRegressor.__init__(self, train_params, SVR) 127 | 128 | 129 | class LinearSVR(SklearnRegressor): 130 | def __init__(self, train_params=None): 131 | from sklearn.svm import LinearSVR 132 | SklearnRegressor.__init__(self, train_params, LinearSVR) 133 | 134 | 135 | class ElasticNet(SklearnRegressor): 136 | def __init__(self, train_params=None): 137 | from sklearn.linear_model import ElasticNet 138 | SklearnRegressor.__init__(self, train_params, ElasticNet) 139 | 140 | 141 | class ElasticNetCV(SklearnRegressor): 142 | def __init__(self, train_params=None): 
143 | from sklearn.linear_model import ElasticNetCV 144 | SklearnRegressor.__init__(self, train_params, ElasticNetCV) 145 | 146 | 147 | class BayesianRidge(SklearnRegressor): 148 | def __init__(self, train_params=None): 149 | from sklearn.linear_model import BayesianRidge 150 | SklearnRegressor.__init__(self, train_params, BayesianRidge) 151 | 152 | 153 | class Lasso(SklearnRegressor): 154 | def __init__(self, train_params=None): 155 | from sklearn.linear_model import Lasso 156 | SklearnRegressor.__init__(self, train_params, Lasso) 157 | 158 | 159 | class KFolds_Regressor_Training_Wrapper(Regressor): 160 | ''' 161 | 对训练的回归器进行交叉式训练,是对原始回归器的扩展,可独立使用 162 | ''' 163 | 164 | def __init__(self, base_regressor=None, k_fold=5,random_state=42): 165 | """ 166 | 167 | :param base_regressor: 168 | :param k_fold: 169 | """ 170 | Regressor.__init__(self) 171 | self.base_regressor = base_regressor 172 | self.k_fold = k_fold 173 | self.random_state=random_state 174 | 175 | def build_model(self): 176 | """ 177 | 创建模型 178 | :return: 179 | """ 180 | self.extend_regressors = [] 181 | for _ in range(0, self.k_fold): 182 | new_regressor = copy.deepcopy(self.base_regressor) 183 | new_regressor.build_model() 184 | self.extend_regressors.append(new_regressor) 185 | 186 | def fit(self, train_x, train_y): 187 | """ 188 | 拟合数据:切分数据并训练 189 | :return: 190 | """ 191 | self.train_x = train_x 192 | self.train_y = train_y 193 | kf = KFold(n_splits=self.k_fold, shuffle=False, random_state=self.random_state) 194 | index = 0 195 | for train_index, _ in kf.split(train_x): 196 | X_train = train_x[train_index] 197 | y_train = train_y[train_index] 198 | self.extend_regressors[index].fit(X_train, y_train) 199 | index += 1 200 | 201 | def _extract_k_fold_data_features(self): 202 | """ 203 | 抽取每个回归器的预测结果,并组合 204 | :return: 205 | """ 206 | regression_results = [] 207 | kf = KFold(n_splits=self.k_fold, shuffle=False, random_state=self.random_state) 208 | kf.get_n_splits(self.train_x) 209 | index = 0 210 | for 
_, test_index in kf.split(self.train_x): 211 | X_test = self.train_x[test_index] 212 | regression_results.append(self.extend_regressors[index].predict(X_test)) 213 | index += 1 214 | return np.concatenate(regression_results, axis=0) 215 | 216 | def predict(self, test_x): 217 | """ 218 | 预测 219 | :param test_x: 220 | :return: 221 | """ 222 | regression_result = self.extend_regressors[0].predict(test_x) 223 | for regressor_id in range(1, len(self.extend_regressors)): 224 | regression_result += self.extend_regressors[regressor_id].predict(test_x) 225 | return regression_result / (1.0 * self.k_fold) 226 | 227 | 228 | class StackingRegressor(Regressor): 229 | def __init__(self, base_regressors=list(), meta_regressor=None, force_cv=True, base_k_fold=5, meta_k_fold=5): 230 | """ 231 | 为cv训练方式提供更好的支持 232 | 233 | :param regressors: 回归器 234 | :param meta_regressor: 元回归器(基于基回归器的结果再次训练) 235 | :param force_cv 是否强制使用cv的方式训练所有基回归器以及元回归器(建议直接True),如果基回归器和未被KFolds_Regreesor_Training_Warpper包装,会被强制包装一次 236 | :param base_k_fold:基回归器的k_fold 237 | :param meta_k_fold:元回归器的k_fold 238 | """ 239 | Regressor.__init__(self) 240 | self.base_regressors = base_regressors 241 | self.meta_regressor = meta_regressor 242 | self.meta_train_x = None 243 | self.meta_train_y = None 244 | self.force_cv = force_cv 245 | self._suffix_for_cv = None # 被KFolds_Regressor_Training_Warpper包装时,存放添加的后缀 246 | if self.force_cv: 247 | for index in range(0, len(self.base_regressors)): 248 | if not isinstance(self.base_regressors[index], KFolds_Regressor_Training_Wrapper): 249 | self.base_regressors[index] = KFolds_Regressor_Training_Wrapper(self.base_regressors[index], 250 | k_fold=base_k_fold) 251 | if not isinstance(self.meta_regressor, KFolds_Regressor_Training_Wrapper): 252 | self.meta_regressor = KFolds_Regressor_Training_Wrapper(self.meta_regressor, k_fold=meta_k_fold) 253 | 254 | def _build_base_regressor_models(self): 255 | """ 256 | 构建基回归器 257 | :return: 258 | """ 259 | for regressor in self.base_regressors: 
260 | regressor.build_model() 261 | 262 | def _build_meta_regressor_model(self): 263 | """ 264 | 构建元回归器 265 | :return: 266 | """ 267 | self.meta_regressor.build_model() 268 | 269 | def build_model(self): 270 | """ 271 | 构建全部回归器 272 | :return: 273 | """ 274 | self._build_base_regressor_models() 275 | self._build_meta_regressor_model() 276 | 277 | def _fit_base_regressors(self, train_x, train_y): 278 | """ 279 | 训练基回归器 280 | :return: 281 | """ 282 | for regressor in self.base_regressors: 283 | regressor.fit(train_x, train_y) 284 | 285 | def _fit_meta_regressor(self): 286 | """ 287 | 训练元回归器 288 | :return: 289 | 290 | """ 291 | self.meta_regressor.fit(self.meta_train_x, self.meta_train_y) 292 | 293 | def fit(self, train_x, train_y): 294 | """ 295 | 训练全部回归器 296 | :param train_x: 297 | :param train_y: 298 | :return: 299 | """ 300 | self._fit_base_regressors(train_x, train_y) 301 | self.meta_train_x = self._get_base_regressors_training_data(train_x) 302 | self.meta_train_y = train_y 303 | self._fit_meta_regressor() 304 | 305 | def _get_base_regressors_training_data(self, train_x): 306 | """ 307 | 获取基回归器的训练数据 308 | :return: 309 | """ 310 | _all_regression_results = [] 311 | for regressor in self.base_regressors: 312 | try: 313 | current_regressor_result = regressor._extract_k_fold_data_features() # 使用KFolds_Regressor_Training_wrapper包装过的回归器会调用该api 314 | except: 315 | current_regressor_result = regressor.predict(train_x) 316 | _all_regression_results.append(current_regressor_result.reshape(-1, 1)) 317 | return np.concatenate(_all_regression_results, axis=-1) 318 | 319 | def _combine_base_regressor_predict(self, test_x=None): 320 | """ 321 | 基回归器预测结果 322 | :param test_x: 323 | :return: 324 | """ 325 | _all_regression_results = [(regressor.predict(test_x)).reshape(-1, 1) for regressor in self.base_regressors] 326 | return np.concatenate(_all_regression_results, axis=-1) 327 | 328 | def predict(self, test_x): 329 | """ 330 | 预测结果 331 | :param test_x: 332 | :return: 333 | """ 
334 | return self.meta_regressor.predict(self._combine_base_regressor_predict(test_x)) 335 | --------------------------------------------------------------------------------