└── ABtest.ipynb /ABtest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "#### 数据集解释:\n", 20 | "#### emb_tb_2.csv:用户特征数据集\n", 21 | "#### effect_tb.csv:广告点击情况数据集\n", 22 | "#### seed_cand_tb.csv:用户类型数据集\n", 23 | "#### 本报告主要使用广告点击情况数据,主要涉及字段如下:\n", 24 | "#### dmp_id:营销策略编号(源数据文档未做说明,这个根据情况设定为1.对照组,2.营销策略一,3.营销策略二)\n", 25 | "#### user_id:支付宝用户id\n", 26 | "#### label:用户当天是否点击活动广告(0:未点击,1:点击)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | "
user_idlabeldmp_id
0101
1100000401
2100000402
\n", 81 | "
" 82 | ], 83 | "text/plain": [ 84 | " user_id label dmp_id\n", 85 | "0 1 0 1\n", 86 | "1 1000004 0 1\n", 87 | "2 1000004 0 2" 88 | ] 89 | }, 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "#加载数据\n", 97 | "data = pd.read_csv('F:/数据分析/ABtest_data/effect_tb.csv',header = None)\n", 98 | "data.columns = [\"dt\",\"user_id\",\"label\",\"dmp_id\"]\n", 99 | "#日志天数属性用不上,删除该列\n", 100 | "data = data.drop(columns = \"dt\")\n", 101 | "data.head(3)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/html": [ 112 | "
\n", 113 | "\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
user_idlabeldmp_id
count2.645958e+062.645958e+062.645958e+06
mean3.112995e+061.456297e-021.395761e+00
std1.828262e+061.197952e-016.920480e-01
min1.000000e+000.000000e+001.000000e+00
25%1.526772e+060.000000e+001.000000e+00
50%3.062184e+060.000000e+001.000000e+00
75%4.721132e+060.000000e+002.000000e+00
max6.265402e+061.000000e+003.000000e+00
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " user_id label dmp_id\n", 190 | "count 2.645958e+06 2.645958e+06 2.645958e+06\n", 191 | "mean 3.112995e+06 1.456297e-02 1.395761e+00\n", 192 | "std 1.828262e+06 1.197952e-01 6.920480e-01\n", 193 | "min 1.000000e+00 0.000000e+00 1.000000e+00\n", 194 | "25% 1.526772e+06 0.000000e+00 1.000000e+00\n", 195 | "50% 3.062184e+06 0.000000e+00 1.000000e+00\n", 196 | "75% 4.721132e+06 0.000000e+00 2.000000e+00\n", 197 | "max 6.265402e+06 1.000000e+00 3.000000e+00" 198 | ] 199 | }, 200 | "execution_count": 4, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "# 查看计数,平均数,方差,最小值和四分位数,最大值\n", 207 | "data.describe()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### 重复值处理" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 5, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "(2645958, 3)" 226 | ] 227 | }, 228 | "execution_count": 5, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "data.shape" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "user_id 2410683\n", 246 | "label 2\n", 247 | "dmp_id 3\n", 248 | "dtype: int64" 249 | ] 250 | }, 251 | "execution_count": 6, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "data.nunique()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "数据行与独立用户数不统一,检查是否存在重复行。" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 7, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/html": [ 275 | "
\n", 276 | "\n", 289 | "\n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | "
user_idlabeldmp_id
8529102701
1485546102701
1579415147101
127827147101
404862246801
............
1382121626463301
1382245626494001
2575140626494001
1382306626508203
2575171626508203
\n", 367 | "

25966 rows × 3 columns

\n", 368 | "
" 369 | ], 370 | "text/plain": [ 371 | " user_id label dmp_id\n", 372 | "8529 1027 0 1\n", 373 | "1485546 1027 0 1\n", 374 | "1579415 1471 0 1\n", 375 | "127827 1471 0 1\n", 376 | "404862 2468 0 1\n", 377 | "... ... ... ...\n", 378 | "1382121 6264633 0 1\n", 379 | "1382245 6264940 0 1\n", 380 | "2575140 6264940 0 1\n", 381 | "1382306 6265082 0 3\n", 382 | "2575171 6265082 0 3\n", 383 | "\n", 384 | "[25966 rows x 3 columns]" 385 | ] 386 | }, 387 | "execution_count": 7, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "data[data.duplicated(keep = False)].sort_values(by=[\"user_id\"])" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 8, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/html": [ 404 | "
\n", 405 | "\n", 418 | "\n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | "
user_idlabeldmp_id
\n", 430 | "
" 431 | ], 432 | "text/plain": [ 433 | "Empty DataFrame\n", 434 | "Columns: [user_id, label, dmp_id]\n", 435 | "Index: []" 436 | ] 437 | }, 438 | "execution_count": 8, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "#删除重复值\n", 445 | "data = data.drop_duplicates()\n", 446 | "data[data.duplicated(keep = False)]" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "### 3.空值处理" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 9, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "name": "stdout", 463 | "output_type": "stream", 464 | "text": [ 465 | "\n", 466 | "Int64Index: 2632975 entries, 0 to 2645957\n", 467 | "Data columns (total 3 columns):\n", 468 | " # Column Non-Null Count Dtype\n", 469 | "--- ------ -------------- -----\n", 470 | " 0 user_id 2632975 non-null int64\n", 471 | " 1 label 2632975 non-null int64\n", 472 | " 2 dmp_id 2632975 non-null int64\n", 473 | "dtypes: int64(3)\n", 474 | "memory usage: 80.4 MB\n" 475 | ] 476 | } 477 | ], 478 | "source": [ 479 | "data.info(null_counts = True)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 10, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "#### 数据集没有空值可以不用处理" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "### 4.异常值检查\n", 496 | "##### 通过透视表格检查各属性字段是否存在不合理取值" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 11, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "text/html": [ 507 | "
\n", 508 | "\n", 521 | "\n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | "
label01All
dmp_id
11881745239181905663
24048116296411107
33079238282316205
All2594479384962632975
\n", 563 | "
" 564 | ], 565 | "text/plain": [ 566 | "label 0 1 All\n", 567 | "dmp_id \n", 568 | "1 1881745 23918 1905663\n", 569 | "2 404811 6296 411107\n", 570 | "3 307923 8282 316205\n", 571 | "All 2594479 38496 2632975" 572 | ] 573 | }, 574 | "execution_count": 11, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "data.pivot_table(index = \"dmp_id\",columns = \"label\",values = \"user_id\",aggfunc = \"count\",margins = True)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "属性字段没有发现异常值,无需进行处理" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "### 5.数据类型" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 12, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "data": { 604 | "text/plain": [ 605 | "user_id int64\n", 606 | "label int64\n", 607 | "dmp_id int64\n", 608 | "dtype: object" 609 | ] 610 | }, 611 | "execution_count": 12, 612 | "metadata": {}, 613 | "output_type": "execute_result" 614 | } 615 | ], 616 | "source": [ 617 | "data.dtypes" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "数据类型正常不需要转换" 625 | ] 626 | }, 627 | { 628 | "cell_type": "markdown", 629 | "metadata": {}, 630 | "source": [ 631 | "### 2.2样本容量检验\n", 632 | "一般采用样本量计算工具:sample size calculator.使用可以看自己写的ABtest学习文档" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 13, 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "0.012551012429794775" 644 | ] 645 | }, 646 | "execution_count": 13, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "data[data[\"dmp_id\"] == 1][\"label\"].mean()" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "这个数字是对照组的点击率,为1.26%,我们需要的新的营销策略能让广告点击率至少提高1个百分点,那么\n", 660 | 
"我们在网站中https://www.evanmiller.org/ab-testing/sample-size.html\n", 661 | " baseline conversion rate 框中输入1.26\n", 662 | " minimum detectable effect 框中输入 1\n", 663 | " 计算得出我们实验所需要的最小样本量为2167" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 14, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "data": { 673 | "text/plain": [ 674 | "1 1905663\n", 675 | "2 411107\n", 676 | "3 316205\n", 677 | "Name: dmp_id, dtype: int64" 678 | ] 679 | }, 680 | "execution_count": 14, 681 | "metadata": {}, 682 | "output_type": "execute_result" 683 | } 684 | ], 685 | "source": [ 686 | "#查看每个组中的样本数\n", 687 | "data[\"dmp_id\"].value_counts()" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "可以看到三种策略的样本数都比我们最小样本数量大,因此样本合适" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 15, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "#保存清洗好的数据备用\n", 704 | "data.to_csv(\"F:/数据分析/ABtest_data/output.csv\",index = False)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": {}, 710 | "source": [ 711 | "### 3.假设验证\n", 712 | "先观察几组实验的点击情况" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 16, 718 | "metadata": {}, 719 | "outputs": [ 720 | { 721 | "name": "stdout", 722 | "output_type": "stream", 723 | "text": [ 724 | "对照组: 0.012551012429794775\n", 725 | "策略一组: 0.015314747742072015\n", 726 | "策略二组: 0.026191869198779274\n" 727 | ] 728 | } 729 | ], 730 | "source": [ 731 | "print(\"对照组:\",data[data[\"dmp_id\"] ==1][\"label\"].mean())\n", 732 | "print(\"策略一组:\",data[data[\"dmp_id\"] ==2][\"label\"].mean())\n", 733 | "print(\"策略二组:\",data[data[\"dmp_id\"] ==3][\"label\"].mean())" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "从点击率来看,策略一和策略二在对照组的基础上都有一定的提升。\n", 741 | "其中策略一提高了约0.28个百分点,策略二提高了约1.36个百分点,只有策略二满足了我们对点击率提升最小值的要求\n", 742 | "接下来需要进行假设验证,来看看策略二的点击率提升是否显著" 743 | ] 744 | }, 745 | 
{ 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "#### a.零假设和备择假设\n", 750 | "##### 记对照组点击率为p1,策略二点击率为p2,则:\n", 751 | "零假设设为H0:p1>=p2\n", 752 | "备择假设为H1:p1< p2\n", 753 | "##### b.分布类型,检验类型和显著性水平\n", 754 | "样本服从两点分布,独立双样本,样本大小为n>30,总体均值和标准差未知,所以采用z检验,显著性水平取0.05" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "metadata": {}, 760 | "source": [ 761 | "#### 3.1公式计算" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": 20, 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "name": "stdout", 771 | "output_type": "stream", 772 | "text": [ 773 | "总和点击率: 0.014492310074225832\n" 774 | ] 775 | }, 776 | { 777 | "name": "stderr", 778 | "output_type": "stream", 779 | "text": [ 780 | "C:\\Users\\18042\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", 781 | " \n", 782 | "C:\\Users\\18042\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", 783 | " import sys\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "# Sample sizes (number of users) per group\n", 789 | "n_old = len(data[data.dmp_id == 1])  # control group\n", 790 | "n_new = len(data[data.dmp_id == 3])  # strategy 2\n", 791 | "\n", 792 | "# Click counts; one combined mask avoids the 'Boolean Series key will be reindexed' UserWarning\n", 793 | "c_old = len(data[(data.dmp_id == 1) & (data.label == 1)])\n", 794 | "c_new = len(data[(data.dmp_id == 3) & (data.label == 1)])\n", 795 | "\n", 796 | "# Click rate per group. The divisors are the group sizes, so guard on those and\n", 797 | "# fail loudly: printing and continuing would leave r_old / r_new undefined.\n", 798 | "if n_old == 0 or n_new == 0:\n", 799 | "    raise ValueError(\"除数为0:对照组或策略二组没有样本\")\n", 800 | "r_old = c_old / n_old\n", 801 | "# Divide by the group size n_new; c_new / c_new would always evaluate to 1.0\n", 802 | "r_new = c_new / n_new\n", 803 | "\n", 804 | "# Pooled click rate of both groups\n", 805 | "# (used for the standard error of the z statistic)\n", 806 | "r = (c_old + c_new) / (n_old + n_new)\n", 807 | "print(\"总和点击率:\", r)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 22, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "name": "stdout", 817 | "output_type": "stream", 818 | "text": [ 819 | "检验统计量z: -4302.928619508961\n" 820 | ] 821 
| } 822 | ], 823 | "source": [ 824 | "#计算检验统计量Z\n", 825 | "z = (r_old - r_new) / np.sqrt(r*(1-r)*(1/n_old + 1/n_new))\n", 826 | "print(\"检验统计量z:\", z)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 23, 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/plain": [ 837 | "-1.6448536269514729" 838 | ] 839 | }, 840 | "execution_count": 23, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "#看显著水平0.05对应的Z的分位数\n", 847 | "from scipy.stats import norm\n", 848 | "z_alpha = norm.ppf(0.05)\n", 849 | "z_alpha" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "z_alpha = -1.645,检验统计量z为-4302.93,该检验为左侧单尾检验,拒绝域为{z