└── ABtest.ipynb
/ABtest.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "%matplotlib inline"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "#### 数据集解释:\n",
20 | "#### emb_tb_2.csv:用户特征数据集\n",
21 | "#### effect_tb.csv:广告点击情况数据集\n",
22 | "#### seed_cand_tb.csv:用户类型数据集\n",
23 | "#### 本报告主要使用广告点击情况数据,主要涉及字段如下:\n",
24 | "#### dmp_id:营销策略编号(源数据文档未做说明,这个根据情况设定为1.对照组,2.营销策略一,3.营销策略二)\n",
25 | "#### user_id:支付宝用户id\n",
26 | "#### label:用户当天是否点击活动广告(0:未点击,1:点击)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "
\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " | \n",
55 | " user_id | \n",
56 | " label | \n",
57 | " dmp_id | \n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " \n",
62 | " | 0 | \n",
63 | " 1 | \n",
64 | " 0 | \n",
65 | " 1 | \n",
66 | "
\n",
67 | " \n",
68 | " | 1 | \n",
69 | " 1000004 | \n",
70 | " 0 | \n",
71 | " 1 | \n",
72 | "
\n",
73 | " \n",
74 | " | 2 | \n",
75 | " 1000004 | \n",
76 | " 0 | \n",
77 | " 2 | \n",
78 | "
\n",
79 | " \n",
80 | "
\n",
81 | "
"
82 | ],
83 | "text/plain": [
84 | " user_id label dmp_id\n",
85 | "0 1 0 1\n",
86 | "1 1000004 0 1\n",
87 | "2 1000004 0 2"
88 | ]
89 | },
90 | "execution_count": 3,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "# Load the ad click log; the CSV has no header row, so column names are supplied on read.\n",
97 | "data = pd.read_csv('F:/数据分析/ABtest_data/effect_tb.csv', header=None, names=[\"dt\", \"user_id\", \"label\", \"dmp_id\"])\n",
98 | "# The dt (log day) column is not needed for this analysis, so drop it.\n",
99 | "data = data.drop(columns=[\"dt\"])\n",
100 | "\n",
101 | "data.head(3)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 4,
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "data": {
111 | "text/html": [
112 | "\n",
113 | "\n",
126 | "
\n",
127 | " \n",
128 | " \n",
129 | " | \n",
130 | " user_id | \n",
131 | " label | \n",
132 | " dmp_id | \n",
133 | "
\n",
134 | " \n",
135 | " \n",
136 | " \n",
137 | " | count | \n",
138 | " 2.645958e+06 | \n",
139 | " 2.645958e+06 | \n",
140 | " 2.645958e+06 | \n",
141 | "
\n",
142 | " \n",
143 | " | mean | \n",
144 | " 3.112995e+06 | \n",
145 | " 1.456297e-02 | \n",
146 | " 1.395761e+00 | \n",
147 | "
\n",
148 | " \n",
149 | " | std | \n",
150 | " 1.828262e+06 | \n",
151 | " 1.197952e-01 | \n",
152 | " 6.920480e-01 | \n",
153 | "
\n",
154 | " \n",
155 | " | min | \n",
156 | " 1.000000e+00 | \n",
157 | " 0.000000e+00 | \n",
158 | " 1.000000e+00 | \n",
159 | "
\n",
160 | " \n",
161 | " | 25% | \n",
162 | " 1.526772e+06 | \n",
163 | " 0.000000e+00 | \n",
164 | " 1.000000e+00 | \n",
165 | "
\n",
166 | " \n",
167 | " | 50% | \n",
168 | " 3.062184e+06 | \n",
169 | " 0.000000e+00 | \n",
170 | " 1.000000e+00 | \n",
171 | "
\n",
172 | " \n",
173 | " | 75% | \n",
174 | " 4.721132e+06 | \n",
175 | " 0.000000e+00 | \n",
176 | " 2.000000e+00 | \n",
177 | "
\n",
178 | " \n",
179 | " | max | \n",
180 | " 6.265402e+06 | \n",
181 | " 1.000000e+00 | \n",
182 | " 3.000000e+00 | \n",
183 | "
\n",
184 | " \n",
185 | "
\n",
186 | "
"
187 | ],
188 | "text/plain": [
189 | " user_id label dmp_id\n",
190 | "count 2.645958e+06 2.645958e+06 2.645958e+06\n",
191 | "mean 3.112995e+06 1.456297e-02 1.395761e+00\n",
192 | "std 1.828262e+06 1.197952e-01 6.920480e-01\n",
193 | "min 1.000000e+00 0.000000e+00 1.000000e+00\n",
194 | "25% 1.526772e+06 0.000000e+00 1.000000e+00\n",
195 | "50% 3.062184e+06 0.000000e+00 1.000000e+00\n",
196 | "75% 4.721132e+06 0.000000e+00 2.000000e+00\n",
197 | "max 6.265402e+06 1.000000e+00 3.000000e+00"
198 | ]
199 | },
200 | "execution_count": 4,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "# 查看计数,平均数,方差,最小值和四分位数,最大值\n",
207 | "data.describe()"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "### 重复值处理"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 5,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/plain": [
225 | "(2645958, 3)"
226 | ]
227 | },
228 | "execution_count": 5,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "data.shape"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 6,
240 | "metadata": {},
241 | "outputs": [
242 | {
243 | "data": {
244 | "text/plain": [
245 | "user_id 2410683\n",
246 | "label 2\n",
247 | "dmp_id 3\n",
248 | "dtype: int64"
249 | ]
250 | },
251 | "execution_count": 6,
252 | "metadata": {},
253 | "output_type": "execute_result"
254 | }
255 | ],
256 | "source": [
257 | "data.nunique()"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "数据行与独立用户数不统一,检查是否存在重复行。"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 7,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/html": [
275 | "\n",
276 | "\n",
289 | "
\n",
290 | " \n",
291 | " \n",
292 | " | \n",
293 | " user_id | \n",
294 | " label | \n",
295 | " dmp_id | \n",
296 | "
\n",
297 | " \n",
298 | " \n",
299 | " \n",
300 | " | 8529 | \n",
301 | " 1027 | \n",
302 | " 0 | \n",
303 | " 1 | \n",
304 | "
\n",
305 | " \n",
306 | " | 1485546 | \n",
307 | " 1027 | \n",
308 | " 0 | \n",
309 | " 1 | \n",
310 | "
\n",
311 | " \n",
312 | " | 1579415 | \n",
313 | " 1471 | \n",
314 | " 0 | \n",
315 | " 1 | \n",
316 | "
\n",
317 | " \n",
318 | " | 127827 | \n",
319 | " 1471 | \n",
320 | " 0 | \n",
321 | " 1 | \n",
322 | "
\n",
323 | " \n",
324 | " | 404862 | \n",
325 | " 2468 | \n",
326 | " 0 | \n",
327 | " 1 | \n",
328 | "
\n",
329 | " \n",
330 | " | ... | \n",
331 | " ... | \n",
332 | " ... | \n",
333 | " ... | \n",
334 | "
\n",
335 | " \n",
336 | " | 1382121 | \n",
337 | " 6264633 | \n",
338 | " 0 | \n",
339 | " 1 | \n",
340 | "
\n",
341 | " \n",
342 | " | 1382245 | \n",
343 | " 6264940 | \n",
344 | " 0 | \n",
345 | " 1 | \n",
346 | "
\n",
347 | " \n",
348 | " | 2575140 | \n",
349 | " 6264940 | \n",
350 | " 0 | \n",
351 | " 1 | \n",
352 | "
\n",
353 | " \n",
354 | " | 1382306 | \n",
355 | " 6265082 | \n",
356 | " 0 | \n",
357 | " 3 | \n",
358 | "
\n",
359 | " \n",
360 | " | 2575171 | \n",
361 | " 6265082 | \n",
362 | " 0 | \n",
363 | " 3 | \n",
364 | "
\n",
365 | " \n",
366 | "
\n",
367 | "
25966 rows × 3 columns
\n",
368 | "
"
369 | ],
370 | "text/plain": [
371 | " user_id label dmp_id\n",
372 | "8529 1027 0 1\n",
373 | "1485546 1027 0 1\n",
374 | "1579415 1471 0 1\n",
375 | "127827 1471 0 1\n",
376 | "404862 2468 0 1\n",
377 | "... ... ... ...\n",
378 | "1382121 6264633 0 1\n",
379 | "1382245 6264940 0 1\n",
380 | "2575140 6264940 0 1\n",
381 | "1382306 6265082 0 3\n",
382 | "2575171 6265082 0 3\n",
383 | "\n",
384 | "[25966 rows x 3 columns]"
385 | ]
386 | },
387 | "execution_count": 7,
388 | "metadata": {},
389 | "output_type": "execute_result"
390 | }
391 | ],
392 | "source": [
393 | "data.loc[data.duplicated(keep=False)].sort_values(\"user_id\")  # every copy of each duplicated row, ordered by user"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 8,
399 | "metadata": {},
400 | "outputs": [
401 | {
402 | "data": {
403 | "text/html": [
404 | "\n",
405 | "\n",
418 | "
\n",
419 | " \n",
420 | " \n",
421 | " | \n",
422 | " user_id | \n",
423 | " label | \n",
424 | " dmp_id | \n",
425 | "
\n",
426 | " \n",
427 | " \n",
428 | " \n",
429 | "
\n",
430 | "
"
431 | ],
432 | "text/plain": [
433 | "Empty DataFrame\n",
434 | "Columns: [user_id, label, dmp_id]\n",
435 | "Index: []"
436 | ]
437 | },
438 | "execution_count": 8,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "# Remove exact duplicate rows, then confirm none remain (an empty frame is expected)\n",
445 | "data = data.drop_duplicates(keep=\"first\")\n",
446 | "data.loc[data.duplicated(keep=False)]"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "### 3.空值处理"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 9,
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "name": "stdout",
463 | "output_type": "stream",
464 | "text": [
465 | "\n",
466 | "Int64Index: 2632975 entries, 0 to 2645957\n",
467 | "Data columns (total 3 columns):\n",
468 | " # Column Non-Null Count Dtype\n",
469 | "--- ------ -------------- -----\n",
470 | " 0 user_id 2632975 non-null int64\n",
471 | " 1 label 2632975 non-null int64\n",
472 | " 2 dmp_id 2632975 non-null int64\n",
473 | "dtypes: int64(3)\n",
474 | "memory usage: 80.4 MB\n"
475 | ]
476 | }
477 | ],
478 | "source": [
479 | "data.info(show_counts=True)  # show_counts replaces null_counts (deprecated in pandas 1.2, removed in 2.0)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 10,
485 | "metadata": {},
486 | "outputs": [],
487 | "source": [
488 | "#### 数据集没有空值可以不用处理"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {},
494 | "source": [
495 | "### 4.异常值检查\n",
496 | "##### 通过透视表格检查各属性字段是否存在不合理取值"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 11,
502 | "metadata": {},
503 | "outputs": [
504 | {
505 | "data": {
506 | "text/html": [
507 | "\n",
508 | "\n",
521 | "
\n",
522 | " \n",
523 | " \n",
524 | " | label | \n",
525 | " 0 | \n",
526 | " 1 | \n",
527 | " All | \n",
528 | "
\n",
529 | " \n",
530 | " | dmp_id | \n",
531 | " | \n",
532 | " | \n",
533 | " | \n",
534 | "
\n",
535 | " \n",
536 | " \n",
537 | " \n",
538 | " | 1 | \n",
539 | " 1881745 | \n",
540 | " 23918 | \n",
541 | " 1905663 | \n",
542 | "
\n",
543 | " \n",
544 | " | 2 | \n",
545 | " 404811 | \n",
546 | " 6296 | \n",
547 | " 411107 | \n",
548 | "
\n",
549 | " \n",
550 | " | 3 | \n",
551 | " 307923 | \n",
552 | " 8282 | \n",
553 | " 316205 | \n",
554 | "
\n",
555 | " \n",
556 | " | All | \n",
557 | " 2594479 | \n",
558 | " 38496 | \n",
559 | " 2632975 | \n",
560 | "
\n",
561 | " \n",
562 | "
\n",
563 | "
"
564 | ],
565 | "text/plain": [
566 | "label 0 1 All\n",
567 | "dmp_id \n",
568 | "1 1881745 23918 1905663\n",
569 | "2 404811 6296 411107\n",
570 | "3 307923 8282 316205\n",
571 | "All 2594479 38496 2632975"
572 | ]
573 | },
574 | "execution_count": 11,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "data.pivot_table(values=\"user_id\", index=\"dmp_id\", columns=\"label\", aggfunc=\"count\", margins=True)  # row counts per dmp_id x label"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "metadata": {},
586 | "source": [
587 | "属性字段没有发现异常值,无需进行处理"
588 | ]
589 | },
590 | {
591 | "cell_type": "markdown",
592 | "metadata": {},
593 | "source": [
594 | "### 5.数据类型"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 12,
600 | "metadata": {},
601 | "outputs": [
602 | {
603 | "data": {
604 | "text/plain": [
605 | "user_id int64\n",
606 | "label int64\n",
607 | "dmp_id int64\n",
608 | "dtype: object"
609 | ]
610 | },
611 | "execution_count": 12,
612 | "metadata": {},
613 | "output_type": "execute_result"
614 | }
615 | ],
616 | "source": [
617 | "data.dtypes"
618 | ]
619 | },
620 | {
621 | "cell_type": "markdown",
622 | "metadata": {},
623 | "source": [
624 | "数据类型正常不需要转换"
625 | ]
626 | },
627 | {
628 | "cell_type": "markdown",
629 | "metadata": {},
630 | "source": [
631 | "### 2.2样本容量检验\n",
632 | "一般采用样本量计算工具:sample size calculator.使用可以看自己写的ABtest学习文档"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": 13,
638 | "metadata": {},
639 | "outputs": [
640 | {
641 | "data": {
642 | "text/plain": [
643 | "0.012551012429794775"
644 | ]
645 | },
646 | "execution_count": 13,
647 | "metadata": {},
648 | "output_type": "execute_result"
649 | }
650 | ],
651 | "source": [
652 | "data.loc[data[\"dmp_id\"] == 1, \"label\"].mean()  # mean of label for dmp_id 1"
653 | ]
654 | },
655 | {
656 | "cell_type": "markdown",
657 | "metadata": {},
658 | "source": [
659 | "这个数字是对照组的点击率,为1.26%,我们需要的新的营销策略能让广告点击率至少提高1个百分点,那么\n",
660 | "我们在网站中https://www.evanmiller.org/ab-testing/sample-size.html\n",
661 | " baseline conversion rate 框中输入1.26\n",
662 | " minimum detectable effect 框中输入 1\n",
663 | " 计算得出我们实验所需要的最小样本量为2167"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 14,
669 | "metadata": {},
670 | "outputs": [
671 | {
672 | "data": {
673 | "text/plain": [
674 | "1 1905663\n",
675 | "2 411107\n",
676 | "3 316205\n",
677 | "Name: dmp_id, dtype: int64"
678 | ]
679 | },
680 | "execution_count": 14,
681 | "metadata": {},
682 | "output_type": "execute_result"
683 | }
684 | ],
685 | "source": [
686 | "# Number of samples in each dmp_id group\n",
687 | "data.dmp_id.value_counts()"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {},
693 | "source": [
694 | "可以看到三种策略的样本数都比我们最小样本数量大,因此样本合适"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 15,
700 | "metadata": {},
701 | "outputs": [],
702 | "source": [
703 | "#保存清洗好的数据备用\n",
704 | "data.to_csv(\"F:/数据分析/ABtest_data/output.csv\",index = False)"
705 | ]
706 | },
707 | {
708 | "cell_type": "markdown",
709 | "metadata": {},
710 | "source": [
711 | "### 3.假设验证\n",
712 | "先观察几组实验的点击情况"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 16,
718 | "metadata": {},
719 | "outputs": [
720 | {
721 | "name": "stdout",
722 | "output_type": "stream",
723 | "text": [
724 | "对照组: 0.012551012429794775\n",
725 | "策略一组: 0.015314747742072015\n",
726 | "策略二组: 0.026191869198779274\n"
727 | ]
728 | }
729 | ],
730 | "source": [
731 | "# Click-through rate (mean of label) for each dmp_id group\n",
732 | "for group_label, dmp in [(\"对照组:\", 1), (\"策略一组:\", 2), (\"策略二组:\", 3)]:\n",
733 | "    print(group_label, data.loc[data[\"dmp_id\"] == dmp, \"label\"].mean())"
734 | ]
735 | },
736 | {
737 | "cell_type": "markdown",
738 | "metadata": {},
739 | "source": [
740 | "从点击率来看,策略一和策略二在对照组的基础上都有一定的提升。\n",
741 | "其中策略一提高了约0.3个百分点,策略二提高了约1.4个百分点,只有策略二满足了我们对点击率提升最小值的要求\n",
742 | "接下来需要进行假设验证,来看看策略二的点击率提升是否显著"
743 | ]
744 | },
745 | {
746 | "cell_type": "markdown",
747 | "metadata": {},
748 | "source": [
749 | "#### a.零假设和备择假设\n",
750 | "##### 记对照组点击率为p1,策略二点击率为p2,则:\n",
751 | "零假设设为H0:p1>=p2\n",
752 | "备择假设为H1:p1< p2\n",
753 | "##### b.分布类型,检验类型和显著性水平\n",
754 | "样本服从两点分布,独立双样本,样本大小为n>30,总体均值和标准差未知,所以采用z检验,显著性水平取0.05"
755 | ]
756 | },
757 | {
758 | "cell_type": "markdown",
759 | "metadata": {},
760 | "source": [
761 | "#### 3.1公式计算"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": 20,
767 | "metadata": {},
768 | "outputs": [
769 | {
770 | "name": "stdout",
771 | "output_type": "stream",
772 | "text": [
773 | "总和点击率: 0.014492310074225832\n"
774 | ]
775 | },
776 | {
777 | "name": "stderr",
778 | "output_type": "stream",
779 | "text": [
780 | "C:\\Users\\18042\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
781 | " \n",
782 | "C:\\Users\\18042\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
783 | " import sys\n"
784 | ]
785 | }
786 | ],
787 | "source": [
788 | "# Sample sizes (number of rows in each group)\n",
789 | "n_old = len(data[data.dmp_id == 1])  # control group\n",
790 | "n_new = len(data[data.dmp_id == 3])  # strategy 2\n",
791 | "\n",
792 | "# Click counts; one combined boolean mask avoids the pandas\n",
793 | "# \"Boolean Series key will be reindexed\" UserWarning caused by chained filtering\n",
794 | "c_old = len(data[(data.dmp_id == 1) & (data.label == 1)])\n",
795 | "c_new = len(data[(data.dmp_id == 3) & (data.label == 1)])\n",
796 | "\n",
797 | "# Click-through rate = clicks / sample size of the same group.\n",
798 | "# Fail loudly on an empty group instead of leaving r_old / r_new undefined for later cells.\n",
799 | "if n_old == 0 or n_new == 0:\n",
800 | "    raise ValueError(\"除数为0\")\n",
801 | "\n",
802 | "r_old = c_old / n_old\n",
803 | "r_new = c_new / n_new\n",
804 | "\n",
805 | "# Pooled click-through rate across both groups\n",
806 | "r = (c_old + c_new) / (n_old + n_new)\n",
807 | "print(\"总和点击率:\", r)"
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": 22,
813 | "metadata": {},
814 | "outputs": [
815 | {
816 | "name": "stdout",
817 | "output_type": "stream",
818 | "text": [
819 | "检验统计量z: -4302.928619508961\n"
820 | ]
821 | }
822 | ],
823 | "source": [
824 | "#计算检验统计量Z\n",
825 | "z = (r_old - r_new) / np.sqrt(r*(1-r)*(1/n_old + 1/n_new))\n",
826 | "print(\"检验统计量z:\", z)"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": 23,
832 | "metadata": {},
833 | "outputs": [
834 | {
835 | "data": {
836 | "text/plain": [
837 | "-1.6448536269514729"
838 | ]
839 | },
840 | "execution_count": 23,
841 | "metadata": {},
842 | "output_type": "execute_result"
843 | }
844 | ],
845 | "source": [
846 | "#看显著水平0.05对应的Z的分位数\n",
847 | "from scipy.stats import norm\n",
848 | "z_alpha = norm.ppf(0.05)\n",
849 | "z_alpha"
850 | ]
851 | },
852 | {
853 | "cell_type": "markdown",
854 | "metadata": {},
855 | "source": [
856 | "z_aloha = -1.644,检验统计量z为-4032 ,该检验为左侧单尾检验,拒绝域为{z