├── Marketing_Campaign_Response_Prediction.ipynb ├── README.md └── data ├── order.xlsx └── order_predict_result.xlsx /Marketing_Campaign_Response_Prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 客户营销响应预测" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 背景" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "某企业业务部门准备基于以往产品营销活动的经验,对现有的客户有选择性地进行营销活动。受制于营销活动预算的限制,业务部门希望能够通过以前类似的营销活动找出能够响应此次营销活动的客户名单和响应概率。能够在有限的成本控制中得到较高的客户响应率,以提高此次营销活动的效果。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## 目的" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "通过数据预测在下一次营销活动时,响应活动会员的具体名单和响应概率,以此来制定针对性的营销策略。" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 数据" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "- 训练数据:3999条\n", 50 | "- 测试数据:8843条\n", 51 | "\n", 52 | "字段名|字段含义|变量类型\n", 53 | ":-|:-|:-\n", 54 | "age|年龄|数值\t\n", 55 | "total_pageviews|总页面浏览数|数值\n", 56 | "edu|教育程度|分类[1,10]\t\n", 57 | "edu_ages|受教育年限|数值\n", 58 | "user_level|会员等级|分类[1,7]\n", 59 | "industry|用户行业|分类[1,15]\t\n", 60 | "value_level|用户价值|分类[1,6]\n", 61 | "act_level|用户活跃度|分类[1,5]\t\n", 62 | "sex|性别|分类[0,1] 1表示男性\t\n", 63 | "blue_money|历史蓝色优惠券使用金额|数值\n", 64 | "red_money|历史红色优惠券使用金额|数值\n", 65 | "work_hours|在线时长|数值\n", 66 | "region|地区|分类[1,41]\n", 67 | "response|是否响应|分类[0,1] 1表示响应" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## 1. 数据加载" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 1, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "import numpy as np \n", 86 | "import pandas as pd \n", 87 | "from sklearn.preprocessing import OneHotEncoder \n", 88 | "from sklearn.model_selection import StratifiedKFold, cross_val_score \n", 89 | "from sklearn.feature_selection import SelectPercentile, f_classif \n", 90 | "from sklearn.ensemble import AdaBoostClassifier \n", 91 | "from sklearn.pipeline import Pipeline\n", 92 | "from sklearn.metrics import accuracy_score\n", 93 | "import warnings\n", 94 | "warnings.filterwarnings(\"ignore\")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "train = pd.read_excel('order.xlsx', sheetname=0) \n", 106 | "test = pd.read_excel('order.xlsx', sheetname=1) " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 3, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/html": [ 117 | "
\n", 118 | "\n", 131 | "\n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | "
agetotal_pageviewseduedu_agesuser_levelindustryvalue_levelact_levelsexblue_moneyred_moneywork_hoursregionresponse
039.077516.01.013.01.01.011.01.021740.0401.00
150.083311.01.013.02.02.021.01.000.0131.00
238.0215646.02.09.03.03.011.01.000.0401.00
353.0234721.02.07.02.03.022.01.000.0401.00
428.0338409.01.013.02.04.032.00.000.0402.00
\n", 239 | "
" 240 | ], 241 | "text/plain": [ 242 | " age total_pageviews edu edu_ages user_level industry value_level \\\n", 243 | "0 39.0 77516.0 1.0 13.0 1.0 1.0 1 \n", 244 | "1 50.0 83311.0 1.0 13.0 2.0 2.0 2 \n", 245 | "2 38.0 215646.0 2.0 9.0 3.0 3.0 1 \n", 246 | "3 53.0 234721.0 2.0 7.0 2.0 3.0 2 \n", 247 | "4 28.0 338409.0 1.0 13.0 2.0 4.0 3 \n", 248 | "\n", 249 | " act_level sex blue_money red_money work_hours region response \n", 250 | "0 1.0 1.0 2174 0.0 40 1.0 0 \n", 251 | "1 1.0 1.0 0 0.0 13 1.0 0 \n", 252 | "2 1.0 1.0 0 0.0 40 1.0 0 \n", 253 | "3 2.0 1.0 0 0.0 40 1.0 0 \n", 254 | "4 2.0 0.0 0 0.0 40 2.0 0 " 255 | ] 256 | }, 257 | "execution_count": 3, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "train.head()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 4, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/html": [ 274 | "
\n", 275 | "\n", 288 | "\n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | "
agetotal_pageviewseduedu_agesuser_levelindustryvalue_levelact_levelsexblue_moneyred_moneywork_hoursregionresponse
3999424.0194102.01.013.01.02.041.01.000.0401.00
3999535.0295127.02.010.03.09.051.01.000.0501.00
3999660.0102310.05.012.03.012.011.00.000.04511.00
3999748.0240175.02.07.05.05.052.01.000.0221.00
3999841.0145441.02.09.02.010.021.01.000.0401.01
\n", 396 | "
" 397 | ], 398 | "text/plain": [ 399 | " age total_pageviews edu edu_ages user_level industry \\\n", 400 | "39994 24.0 194102.0 1.0 13.0 1.0 2.0 \n", 401 | "39995 35.0 295127.0 2.0 10.0 3.0 9.0 \n", 402 | "39996 60.0 102310.0 5.0 12.0 3.0 12.0 \n", 403 | "39997 48.0 240175.0 2.0 7.0 5.0 5.0 \n", 404 | "39998 41.0 145441.0 2.0 9.0 2.0 10.0 \n", 405 | "\n", 406 | " value_level act_level sex blue_money red_money work_hours region \\\n", 407 | "39994 4 1.0 1.0 0 0.0 40 1.0 \n", 408 | "39995 5 1.0 1.0 0 0.0 50 1.0 \n", 409 | "39996 1 1.0 0.0 0 0.0 45 11.0 \n", 410 | "39997 5 2.0 1.0 0 0.0 22 1.0 \n", 411 | "39998 2 1.0 1.0 0 0.0 40 1.0 \n", 412 | "\n", 413 | " response \n", 414 | "39994 0 \n", 415 | "39995 0 \n", 416 | "39996 0 \n", 417 | "39997 0 \n", 418 | "39998 1 " 419 | ] 420 | }, 421 | "execution_count": 4, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "train.tail()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 5, 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "name": "stdout", 437 | "output_type": "stream", 438 | "text": [ 439 | "\n", 440 | "RangeIndex: 39999 entries, 0 to 39998\n", 441 | "Data columns (total 14 columns):\n", 442 | "age 39998 non-null float64\n", 443 | "total_pageviews 39998 non-null float64\n", 444 | "edu 39998 non-null float64\n", 445 | "edu_ages 39998 non-null float64\n", 446 | "user_level 39998 non-null float64\n", 447 | "industry 39997 non-null float64\n", 448 | "value_level 39999 non-null int64\n", 449 | "act_level 39998 non-null float64\n", 450 | "sex 39998 non-null float64\n", 451 | "blue_money 39999 non-null int64\n", 452 | "red_money 39998 non-null float64\n", 453 | "work_hours 39999 non-null int64\n", 454 | "region 39997 non-null float64\n", 455 | "response 39999 non-null int64\n", 456 | "dtypes: float64(10), int64(4)\n", 457 | "memory usage: 4.3 MB\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "train.info()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 6, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "\n", 475 | "RangeIndex: 8843 entries, 0 to 8842\n", 476 | "Data columns (total 14 columns):\n", 477 | "age 8843 non-null int64\n", 478 | "total_pageviews 8843 non-null int64\n", 479 | "edu 8843 non-null int64\n", 480 | "edu_ages 8843 non-null int64\n", 481 | "user_level 8841 non-null float64\n", 482 | "industry 8841 non-null float64\n", 483 | "value_level 8843 non-null int64\n", 484 | "act_level 8843 non-null int64\n", 485 | "sex 8843 non-null int64\n", 486 | "blue_money 8843 non-null int64\n", 487 | "red_money 8843 non-null int64\n", 488 | "work_hours 8843 non-null int64\n", 489 | "region 8838 non-null float64\n", 490 | "final_response 8843 non-null int64\n", 491 | "dtypes: float64(3), int64(11)\n", 492 | "memory usage: 967.3 KB\n" 493 | ] 494 | } 495 | ], 496 | "source": [ 497 | "test.info()" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 7, 503 | "metadata": {}, 504 | "outputs": [ 505 | { 506 | "data": { 507 | "text/html": [ 508 | "
\n", 509 | "\n", 522 | "\n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | "
agetotal_pageviewseduedu_agesuser_levelindustryvalue_levelact_levelsexblue_moneyred_moneywork_hoursregionresponse
count39998.0000003.999800e+0439998.00000039998.00000039998.00000039997.00000039999.00000039998.00000039998.00000039999.00000039998.00000039999.00000039997.00000039999.000000
mean38.5896541.895136e+052.51162610.0767542.0870045.6771262.5462891.2210360.6680831089.14252987.37939440.4424862.2515190.239606
std13.6634901.053109e+051.6381102.5733841.2609923.3959481.4432100.6266180.4709077491.275548402.93035012.3760334.9134820.426848
min17.0000001.228500e+041.0000001.0000001.0000001.0000001.0000001.0000000.0000000.0000000.0000001.0000001.0000000.000000
25%28.0000001.175282e+052.0000009.0000001.0000003.0000001.0000001.0000000.0000000.0000000.00000040.0000001.0000000.000000
50%37.0000001.783410e+052.00000010.0000002.0000005.0000002.0000001.0000001.0000000.0000000.00000040.0000001.0000000.000000
75%48.0000002.372685e+052.00000012.0000002.0000008.0000004.0000001.0000001.0000000.0000000.00000045.0000001.0000000.000000
max90.0000001.484705e+0610.00000016.0000007.00000015.0000006.0000005.0000001.00000099999.0000004356.00000099.00000041.0000001.000000
\n", 681 | "
" 682 | ], 683 | "text/plain": [ 684 | " age total_pageviews edu edu_ages \\\n", 685 | "count 39998.000000 3.999800e+04 39998.000000 39998.000000 \n", 686 | "mean 38.589654 1.895136e+05 2.511626 10.076754 \n", 687 | "std 13.663490 1.053109e+05 1.638110 2.573384 \n", 688 | "min 17.000000 1.228500e+04 1.000000 1.000000 \n", 689 | "25% 28.000000 1.175282e+05 2.000000 9.000000 \n", 690 | "50% 37.000000 1.783410e+05 2.000000 10.000000 \n", 691 | "75% 48.000000 2.372685e+05 2.000000 12.000000 \n", 692 | "max 90.000000 1.484705e+06 10.000000 16.000000 \n", 693 | "\n", 694 | " user_level industry value_level act_level sex \\\n", 695 | "count 39998.000000 39997.000000 39999.000000 39998.000000 39998.000000 \n", 696 | "mean 2.087004 5.677126 2.546289 1.221036 0.668083 \n", 697 | "std 1.260992 3.395948 1.443210 0.626618 0.470907 \n", 698 | "min 1.000000 1.000000 1.000000 1.000000 0.000000 \n", 699 | "25% 1.000000 3.000000 1.000000 1.000000 0.000000 \n", 700 | "50% 2.000000 5.000000 2.000000 1.000000 1.000000 \n", 701 | "75% 2.000000 8.000000 4.000000 1.000000 1.000000 \n", 702 | "max 7.000000 15.000000 6.000000 5.000000 1.000000 \n", 703 | "\n", 704 | " blue_money red_money work_hours region response \n", 705 | "count 39999.000000 39998.000000 39999.000000 39997.000000 39999.000000 \n", 706 | "mean 1089.142529 87.379394 40.442486 2.251519 0.239606 \n", 707 | "std 7491.275548 402.930350 12.376033 4.913482 0.426848 \n", 708 | "min 0.000000 0.000000 1.000000 1.000000 0.000000 \n", 709 | "25% 0.000000 0.000000 40.000000 1.000000 0.000000 \n", 710 | "50% 0.000000 0.000000 40.000000 1.000000 0.000000 \n", 711 | "75% 0.000000 0.000000 45.000000 1.000000 0.000000 \n", 712 | "max 99999.000000 4356.000000 99.000000 41.000000 1.000000 " 713 | ] 714 | }, 715 | "execution_count": 7, 716 | "metadata": {}, 717 | "output_type": "execute_result" 718 | } 719 | ], 720 | "source": [ 721 | "train.describe()" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "## 2. 数据预处理" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "### 去重" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 9, 741 | "metadata": { 742 | "collapsed": true 743 | }, 744 | "outputs": [], 745 | "source": [ 746 | "train.drop_duplicates(inplace=True)" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "### 缺失值处理" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 27, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/plain": [ 764 | "response 0\n", 765 | "region 0\n", 766 | "work_hours 0\n", 767 | "red_money 0\n", 768 | "blue_money 0\n", 769 | "sex 0\n", 770 | "act_level 0\n", 771 | "value_level 0\n", 772 | "industry 0\n", 773 | "user_level 0\n", 774 | "edu_ages 0\n", 775 | "edu 0\n", 776 | "total_pageviews 0\n", 777 | "age 0\n", 778 | "dtype: int64" 779 | ] 780 | }, 781 | "execution_count": 27, 782 | "metadata": {}, 783 | "output_type": "execute_result" 784 | } 785 | ], 786 | "source": [ 787 | "train.isnull().sum().sort_values(ascending=False)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 11, 793 | "metadata": {}, 794 | "outputs": [ 795 | { 796 | "data": { 797 | "text/plain": [ 798 | "region 5\n", 799 | "industry 2\n", 800 | "user_level 2\n", 801 | "final_response 0\n", 802 | "work_hours 0\n", 803 | "red_money 0\n", 804 | "blue_money 0\n", 805 | "sex 0\n", 806 | "act_level 0\n", 807 | "value_level 0\n", 808 | "edu_ages 0\n", 809 | "edu 0\n", 810 | "total_pageviews 0\n", 811 | "age 0\n", 812 | "dtype: int64" 813 | ] 814 | }, 815 | "execution_count": 11, 816 | "metadata": {}, 817 | "output_type": "execute_result" 818 | } 819 | ], 820 | "source": [ 821 | "test.isnull().sum().sort_values(ascending=False)" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": 12, 827 | "metadata": { 828 | "collapsed": true 829 | }, 830 | "outputs": [], 831 | "source": [ 832 | "def na_replace(df):\n", 833 | " na_rules = {'age': df['age'].mean(),\n", 834 | " 'total_pageviews': df['total_pageviews'].mean(),\n", 835 | " 'edu': df['edu'].median(),\n", 836 | " 'edu_ages': df['edu_ages'].median(),\n", 837 | " 'user_level': df['user_level'].median(),\n", 838 | " 'industry': df['user_level'].median(),\n", 839 | " 'act_level': df['act_level'].median(),\n", 840 | " 'sex': df['sex'].median(),\n", 841 | " 'red_money': df['red_money'].mean(),\n", 842 | " 'region': df['region'].median()\n", 843 | " } \n", 844 | " df = df.fillna(na_rules)\n", 845 | " return df" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": 13, 851 | "metadata": { 852 | "collapsed": true 853 | }, 854 | "outputs": [], 855 | "source": [ 856 | "train = na_replace(train) \n", 857 | "test = na_replace(test)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "### 独热编码" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 14, 870 | "metadata": { 871 | "collapsed": true 872 | }, 873 | "outputs": [], 874 | "source": [ 875 | "def symbol_con(df, enc_object=None, train=True):\n", 876 | " convert_cols = ['edu', 'user_level', 'industry', 'value_level', 'act_level', 'sex', 'region'] \n", 877 | " df_con = df[convert_cols] \n", 878 | " df_org = df[['age', 'total_pageviews', 'edu_ages', 'blue_money', 'red_money', 'work_hours']].values \n", 879 | " if train == True: \n", 880 | " enc = OneHotEncoder() \n", 881 | " enc.fit(df_con) \n", 882 | " df_con_new = enc.transform(df_con).toarray() \n", 883 | " new_matrix = np.hstack((df_con_new, df_org)) \n", 884 | " return new_matrix, enc\n", 885 | " else:\n", 886 | " df_con_new = enc_object.transform(df_con).toarray() \n", 887 | " new_matrix = np.hstack((df_con_new, df_org))\n", 888 | " return new_matrix\n" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": 15, 894 | "metadata": { 895 | "collapsed": true 896 | }, 897 | "outputs": [], 898 | "source": [ 899 | "X_train, enc = symbol_con(train, enc_object=None, train=True) \n", 900 | "y_train = train['response']" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": 16, 906 | "metadata": { 907 | "collapsed": true 908 | }, 909 | "outputs": [], 910 | "source": [ 911 | "X_test = symbol_con(test, enc_object=enc, train=False)\n", 912 | "y_test = test['final_response']" 913 | ] 914 | }, 915 | { 916 | "cell_type": "markdown", 917 | "metadata": {}, 918 | "source": [ 919 | "## 3. 建模" 920 | ] 921 | }, 922 | { 923 | "cell_type": "markdown", 924 | "metadata": {}, 925 | "source": [ 926 | "### 参数优化选择" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 17, 932 | "metadata": { 933 | "collapsed": true 934 | }, 935 | "outputs": [], 936 | "source": [ 937 | "transform = SelectPercentile(f_classif, percentile=50) " 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 18, 943 | "metadata": { 944 | "collapsed": true 945 | }, 946 | "outputs": [], 947 | "source": [ 948 | "model_adaboost = AdaBoostClassifier() " 949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": 19, 954 | "metadata": {}, 955 | "outputs": [ 956 | { 957 | "name": "stdout", 958 | "output_type": "stream", 959 | "text": [ 960 | "[mean: 0.85406, std: 0.00282, params: {'model_adaboost__learning_rate': 0.5, 'model_adaboost__n_estimators': 20}, mean: 0.85834, std: 0.00353, params: {'model_adaboost__learning_rate': 0.5, 'model_adaboost__n_estimators': 50}, mean: 0.86090, std: 0.00346, params: {'model_adaboost__learning_rate': 0.5, 'model_adaboost__n_estimators': 100}, mean: 0.85369, std: 0.00463, params: {'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 20}, mean: 0.85982, std: 0.00404, params: {'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 50}, mean: 0.86335, std: 0.00406, params: {'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 100}]\n", 961 | "------------------------------\n", 962 | "{'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 100} 0.863349684653\n" 963 | ] 964 | } 965 | ], 966 | "source": [ 967 | "from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve\n", 968 | "pipe=Pipeline([('select',transform), \n", 969 | " ('model_adaboost', model_adaboost)])\n", 970 | "\n", 971 | "param_test = {'model_adaboost__n_estimators':[20,50,100], \n", 972 | " 'model_adaboost__learning_rate':[0.5,1]\n", 973 | " }\n", 974 | "gsearch = GridSearchCV(estimator = pipe, param_grid = param_test, scoring='accuracy', cv=5)\n", 975 | "gsearch.fit(X_train,y_train)\n", 976 | "\n", 977 | "print(gsearch.grid_scores_)\n", 978 | "print('-'*30)\n", 979 | "print(gsearch.best_params_, gsearch.best_score_)" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 20, 985 | "metadata": {}, 986 | "outputs": [ 987 | { 988 | "data": { 989 | "text/plain": [ 990 | "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n", 991 | " learning_rate=1.0, n_estimators=100, random_state=None)" 992 | ] 993 | }, 994 | "execution_count": 20, 995 | "metadata": {}, 996 | "output_type": "execute_result" 997 | } 998 | ], 999 | "source": [ 1000 | "transform.fit(X_train, y_train)\n", 1001 | "reduce_X_train = transform.transform(X_train)\n", 1002 | "final_model = AdaBoostClassifier(n_estimators=100) \n", 1003 | "final_model.fit(reduce_X_train, y_train)" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | "### 预测" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": 21, 1016 | "metadata": { 1017 | "collapsed": true 1018 | }, 1019 | "outputs": [], 1020 | "source": [ 1021 | "reduce_X_test = transform.transform(X_test) " 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "code", 1026 | "execution_count": 23, 1027 | "metadata": { 1028 | "collapsed": true 1029 | }, 1030 | "outputs": [], 1031 | "source": [ 1032 | "data = test.drop('final_response', axis=1) \n", 1033 | "predict_labels = pd.DataFrame(final_model.predict(reduce_X_test), columns=['labels']) \n", 1034 | "predict_proba = pd.DataFrame(final_model.predict_proba(reduce_X_test), columns=['noproba', 'yesproba']) \n", 1035 | "predict_pd = pd.concat((data, predict_labels, predict_proba), axis=1) " 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 25, 1041 | "metadata": {}, 1042 | "outputs": [ 1043 | { 1044 | "name": "stdout", 1045 | "output_type": "stream", 1046 | "text": [ 1047 | "final accuracy: 0.862150853782653\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "accuracy_score(y_test, predict_labels)" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "execution_count": 24, 1058 | "metadata": { 1059 | "collapsed": true 1060 | }, 1061 | "outputs": [], 1062 | "source": [ 1063 | "predict_pd.to_excel('order_predict_result.xlsx', 'Sheet1') " 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "metadata": { 1069 | "collapsed": true 1070 | }, 1071 | "source": [ 1072 | "## 总结与思考" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "markdown", 1077 | "metadata": {}, 1078 | "source": [ 1079 | "### 模型实施" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "markdown", 1084 | "metadata": {}, 1085 | "source": [ 1086 | "- 制定了营销响应率不低于80%的KPI作为本次营销活动的绩效考核目标。\n", 1087 | "- 结合历史销售订单数据计算本次活动的预期收益,制定ROI目标。\n", 1088 | "- 基于预期的订单金额和订单数量,以及关联的用券数量和金额,申请对应的营销资源用于促销用户购买转化。" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": null, 1094 | "metadata": { 1095 | "collapsed": true 1096 | }, 1097 | "outputs": [], 1098 | "source": [] 1099 | } 1100 | ], 1101 | "metadata": { 1102 | "kernelspec": { 1103 | "display_name": "Python 3", 1104 | "language": "python", 1105 | "name": "python3" 1106 | }, 1107 | "language_info": { 1108 | "codemirror_mode": { 1109 | "name": "ipython", 1110 | "version": 3 1111 | }, 1112 | "file_extension": ".py", 1113 | "mimetype": "text/x-python", 1114 | "name": "python", 1115 | "nbconvert_exporter": "python", 1116 | "pygments_lexer": "ipython3", 1117 | "version": "3.6.2" 1118 | }, 1119 | "toc": { 1120 | "base_numbering": 1, 1121 | "nav_menu": {}, 1122 | "number_sections": false, 1123 | "sideBar": true, 1124 | "skip_h1_title": false, 1125 | "title_cell": "Table of Contents", 1126 | "title_sidebar": "Contents", 1127 | "toc_cell": false, 1128 | "toc_position": { 1129 | "height": "calc(100% - 180px)", 1130 | "left": "10px", 1131 | "top": "150px", 1132 | "width": "165px" 1133 | }, 1134 | "toc_section_display": true, 1135 | "toc_window_display": true 1136 | } 1137 | }, 1138 | "nbformat": 4, 1139 | "nbformat_minor": 2 1140 | } 1141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Marketing_Campaign_Response_Prediction 2 | 基于Adaboost算法,预测在下一次营销活动时,响应活动会员的具体名单和响应概率。 3 | 4 | -------------------------------------------------------------------------------- /data/order.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weizhuang1113/Marketing_Campaign_Response_Prediction/29cf732c8f7e9bee60a463bbb6e9c4b64e2e76a7/data/order.xlsx -------------------------------------------------------------------------------- /data/order_predict_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weizhuang1113/Marketing_Campaign_Response_Prediction/29cf732c8f7e9bee60a463bbb6e9c4b64e2e76a7/data/order_predict_result.xlsx --------------------------------------------------------------------------------