├── main.py
├── readme.md
├── run.sh
├── temp
│   └── clean.sh
├── tests
│   └── test_pearson_draw.py
├── utils
│   ├── data_analysis.py
│   └── data_parse.py
└── xgb
    └── xgb_model.py

/main.py:
--------------------------------------------------------------------------------
from utils.data_parse import Data_Parser
from utils.data_analysis import Data_Analysis
from xgb.xgb_model import XGB_Model
import os
import sys
import getopt
import numpy as np
import pandas as pd
import time


class Main():
    '''
    Pipeline for the game churn analysis.
    TODO: adapt the code so it covers all ops, not only the key ops.
    '''

    def __init__(self, day, sql_in, k):
        self.day = day
        self.sql_in = sql_in
        self.k = k
        self.training_data = {
            1: './temp/fc_train.txt',
            2: './temp/sc_train.txt',
            3: './temp/tc_train.txt'
        }
        self.training_label = {
            1: './temp/fc_label.pkl',
            2: './temp/sc_label.pkl',
            3: './temp/tc_label.pkl'
        }

        self.x = None
        self.y = None
        self.op = None

        # generate the training data and the per-op statistics
        self._load_data()
        self.da = Data_Analysis(
            self.training_data[self.day], self.training_label[self.day], self.sql_in)
        self.op_churn = self.da.statistics_op_churn()
        self.op_clicks = self.da.statistics_op_clicks()
        temp_intervals = self.da.statistics_op_intervals()
        self.op_intervals = temp_intervals[0]
        self.op_median_intervals = temp_intervals[1]
        self.op_stages = self.da.statistics_op_stage()

        # op code -> human-readable description, filled by _get_op_verbose()
        self.op_verbose = {}

        self.key_ops = None

    def _load_data(self):
        dp = Data_Parser(self.sql_in)
        if os.path.exists(self.training_data[self.day]) and os.path.exists(self.training_label[self.day]):
            pass
        else:
            dp.parse()
            dp.write_in([
                './temp/fc_train.txt',
                './temp/sc_train.txt',
                './temp/tc_train.txt',
                './temp/fc_label.pkl',
                './temp/sc_label.pkl',
                './temp/tc_label.pkl'
            ])
        self.x, self.y, self.op = dp.load_tfidf(
            self.training_data[self.day], self.training_label[self.day])

    def _get_key_ops(self):
        xgb = XGB_Model(self.x, self.y, self.op, 0.2, 0.1)
        xgb.model()
        print(len(xgb.key_ops))
        self.key_ops = xgb.key_ops

    def _get_op_verbose(self):
        '''
        Load the op descriptions from ./data/动作说明.xlsx into self.op_verbose.
        '''
        import xlrd
        data = xlrd.open_workbook('./data/动作说明.xlsx')
        table = data.sheet_by_name('Sheet1')

        nrows = table.nrows

        for i in range(nrows):
            line = table.row_values(i)
            if line[0] not in self.op_verbose:
                self.op_verbose[line[0]] = line[1]

    def ops_analysis(self):
        self._get_key_ops()
        self._get_op_verbose()
        sorted_keyops = np.argsort(self.key_ops)
        print(len(self.key_ops))

        st_op_churn = {}
        st_op_clicks = {}
        st_op_stage = {}
        st_op_churnnum = {}

        for k, v in self.op_churn.items():
            if v[0] == 0:
                st_op_churn[k] = -1
            else:
                st_op_churn[k] = v[1] * 1.0 / v[0]
            st_op_churnnum[k] = v[1]

        for k, v in self.op_clicks.items():
            if v[1] == 0:
                st_op_clicks[k] = -1 * v[0]
            else:
                st_op_clicks[k] = v[0] * 1.0 / v[1]

        for k, v in self.op_stages.items():
            if len(v) > 4:
                # drop the largest and smallest first-occurrence time as outliers
                v.remove(max(v))
                v.remove(min(v))
            st_op_stage[k] = np.mean(v)

        print('动作平均时间间隔: {}'.format(self.da.statistics_op_avg_intervals()))
        print('动作平均点击比例: {}'.format(self.da.statistics_op_avg_clicks_ratio()))
        print('动作平均时间间隔和动作留存比之间的pearson系数: {}'.format(
            self.da.statistics_pearson_clicks_intervals()))
        print('动作点击比值和动作阶段之间的pearson系数: {}'.format(
            self.da.statistics_pearson_clicks_stage()))
        self.draw()

        if self.k is True:
            # analyse only the key ops extracted by the classifier
            data = []
            for opid in sorted_keyops[::-1]:
                op_name = self.op[opid]
                if self.key_ops[opid] == 0:
                    break
                # op name, description, churn ratio, churned-user count, click-count ratio,
                # stage, mean / median follow-up interval, importance
                verbose = self.op_verbose[op_name] if op_name in self.op_verbose else ""
                print('{}|{}|{:.5f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.4f}'.format(
                    op_name, verbose, st_op_churn[op_name], st_op_churnnum[op_name],
                    st_op_clicks[op_name], st_op_stage[op_name], self.op_intervals[op_name],
                    self.op_median_intervals[op_name], self.key_ops[opid]))
                data.append(
                    [op_name,
                     verbose,
                     "{:.5f}".format(st_op_churn[op_name]),
                     "{:.2f}".format(st_op_churnnum[op_name]),
                     "{:.2f}".format(st_op_clicks[op_name]),
                     "{:.2f}".format(st_op_stage[op_name]),
                     "{:.2f}".format(self.op_intervals[op_name]),
                     "{:.2f}".format(self.op_median_intervals[op_name]),
                     "{:.4f}".format(self.key_ops[opid])]
                )

            df = pd.DataFrame(data=data, columns=[
                '动作', '动作详细说明', '留存比', '动作流失人数', '点击次数比', '动作时段', '随后时间间隔', '随后中位数时间间隔', '重要性'])

            # the key-op report is written as an Excel file
            self.writeTo(parent_dir='output', path='',
                         file_name='xlsx', pd_file=df, output_format='xlsx')
        else:
            # analyse all the ops
            data = []
            for op_name, _ in st_op_clicks.items():
                # op name, churn ratio, churned-user count, click-count ratio,
                # stage, mean / median follow-up interval
                print('{}|{:.5f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}'.format(
                    op_name, st_op_churn[op_name], st_op_churnnum[op_name],
                    st_op_clicks[op_name], st_op_stage[op_name],
                    self.op_intervals[op_name], self.op_median_intervals[op_name]))
                data.append(
                    [op_name,
                     "{:.5f}".format(st_op_churn[op_name]),
                     "{:.2f}".format(st_op_churnnum[op_name]),
                     "{:.2f}".format(st_op_clicks[op_name]),
                     "{:.2f}".format(st_op_stage[op_name]),
                     "{:.2f}".format(self.op_intervals[op_name]),
                     "{:.2f}".format(self.op_median_intervals[op_name])]
                )

            df = pd.DataFrame(data=data, columns=[
                '动作', '留存比', '动作流失人数', '点击次数比', '动作时段', '随后平均时间间隔', '随后中位数时间间隔'])

            self.writeTo(parent_dir='output', path='',
                         file_name='csv', pd_file=df)

    def writeTo(self, parent_dir, path, file_name, pd_file, output_format='csv'):
        full_path = os.path.join(parent_dir, path)
        os.makedirs(full_path, exist_ok=True)

        timeFlag = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        time_stamp = '_'.join(timeFlag.split())
        file_name = file_name + "_" + time_stamp
        full_path = os.path.join(full_path, file_name)

        if output_format.lower() == 'csv':
            full_path = full_path + "." + output_format.lower()
            print(full_path)
            pd_file.to_csv(full_path)
        else:
            full_path = full_path + ".xlsx"
            pd_file.to_excel(full_path, "sheet1",
                             index=False, engine='xlsxwriter')

    def draw(self):
        from pyecharts import Line
        churn_rates = []
        intervals = []
        for k, v in self.op_churn.items():
            churn_rate = v[1] * 1.0 / v[0]
            churn_rates.append(churn_rate)
            intervals.append(self.op_intervals[k])
        churnrates = [rate * 1000 for rate in churn_rates]

        # keep every 10th point so the two curves stay readable
        new_intervals = [i for i in intervals[::10]]
        new_churnrates = [c for c in churnrates[::10]]
        attr = [i for i in range(0, len(intervals), 10)]
        line = Line("")
        line.add("动作随后时间间隔", attr, new_intervals)
        line.add("动作留存比 * 1000", attr, new_churnrates)
        line.show_config()
        line.render('./output/render.html')


def usage():
    print('-i: input file; must be a sqlite database file')
    print('-d: day number to analyse; must be 1, 2 or 3')
    print('-k: t to analyse only the key actions, f to analyse all actions')


if __name__ == '__main__':
    opts, args = getopt.getopt(sys.argv[1:], 'hi:d:k:')
    for op, value in opts:
        if op == '-i':
            sql_in = value
        if op == '-d':
            day = int(value)
        if op == '-k':
            k = (value == 't')
        if op == '-h':
            usage()
    main = Main(day=day, sql_in=sql_in, k=k)
    main.ops_analysis()
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Churn Analysis SDK
## Introduction

This program analyses player churn: it parses event-tracking data stored as a sqlite database and performs a churn analysis for a game's players. It runs on Ubuntu 16.04 with Python 3.5 and requires packages such as sklearn, pandas, numpy and XGBoost.

To make it easier to use, the environment and the code have been packaged into a Docker image and pushed to the Aliyun registry under the name
**registry.cn-hangzhou.aliyuncs.com/jingchunzhen/churn_analysis**

## Input and Output

#### Step 1

Create a folder named data in the project root and put the sqlite database file to be parsed into it. The database must contain at least the following fields (a minimal creation sketch follows the table):

Field|Type|Meaning
-----|-----------|---------
user_id|int|user identifier
op|text (string)|name of the action
current_day|int|the day of play on which the action happened
num_days_played|int|total number of days the user played
relative_timestamp|float|time at which the action happened
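
If you need to assemble such a database yourself, the sketch below shows one way to create a table with the expected columns using Python's built-in sqlite3 module. The table name `maidian` is the one queried by `utils/data_parse.py` and `utils/data_analysis.py`; the file name and the sample row are placeholders, not part of the project.

```
import sqlite3

# The parser and the analysis code both read from a table named `maidian`.
conn = sqlite3.connect('./data/example.db2')  # placeholder file name
conn.execute('''
    CREATE TABLE IF NOT EXISTS maidian (
        user_id            INTEGER,
        op                 TEXT,
        current_day        INTEGER,
        num_days_played    INTEGER,
        relative_timestamp REAL
    )
''')
# Placeholder row: user 1 performed op "op_0001" 3.5 seconds into day 1
# and played 2 days in total.
conn.execute("INSERT INTO maidian VALUES (?, ?, ?, ?, ?)", (1, 'op_0001', 1, 2, 3.5))
conn.commit()
conn.close()
```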

#### Step 2

```
python3 main.py -i XXX.db2 -d 1 -k t
```
-i: path of the database file
-d: which day to analyse, i.e. day-1 churn, day-2 retention churn or day-3 retention churn
-k: t analyses only the key actions extracted by the classifier; f analyses all actions

#### Output

The output is a report file stored in the output folder: a CSV for the all-actions analysis, an Excel file for the key-actions analysis. It can be post-processed directly, as shown in the sketch after the table. The report contains the following fields:

Field|Description
----|---------
动作|the op code as it appears in the event-tracking data
留存比|number of users who churned on this op divided by the number of users who performed it without churning
动作流失人数|number of users who churned on this op
点击次数比|average click count of retained players on this op divided by the average click count of churned players
动作时段|average time of the op's first occurrence, used to judge whether it is an early-, mid- or late-game action
随后平均时间间隔|stuck-point analysis: mean follow-up interval, used to judge whether the op is a potential stuck point
随后中位数时间间隔|stuck-point analysis: median follow-up interval
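
The timestamped report can be inspected directly with pandas. As a minimal sketch (the file name below is a placeholder, since main.py appends the current timestamp to it), the same correlation check performed in `tests/test_pearson_draw.py` looks like this:

```
import pandas as pd

# Placeholder path: main.py writes files such as output/csv_<date>_<time>.csv
df = pd.read_csv('./output/csv_2018-01-01_00:00:00.csv', encoding='utf-8')

# Pearson coefficient between the churn ratio of an op and its median follow-up
# interval, i.e. whether ops that players get stuck on are also ops they churn on.
print(df['留存比'].corr(df['随后中位数时间间隔']))

# The ten ops with the highest churn ratio.
print(df.sort_values('留存比', ascending=False)[['动作', '留存比', '动作流失人数']].head(10))
```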

In addition, the tool also computes:
* the Pearson coefficient between an op's churn ratio and its follow-up interval, to check the correlation between churn and stuck points, and plots the corresponding curves
* the average stuck time over all ops
* the average click-count ratio over all ops, to check whether a given op is preferred by retained or by churned players


## Project structure

#### utils

- data_parse.py parses the raw data and writes the intermediate files into the temp folder.
- data_analysis.py computes statistics of the data, such as the stuck-point analysis and the correlation coefficient between churn rate and stuck points.

#### temp

Stores the generated intermediate files (including the parsed data).

- clean.sh cleans up the temp folder.

#### xgb

- xgb_model.py the algorithm used for the churn analysis.

#### main.py

The pipeline that ties the steps together.

#### output

Stores the generated report files and the plotted curves.

## Using the Docker image

#### Step 1

Install Docker.

#### Step 2

```
docker pull registry.cn-hangzhou.aliyuncs.com/jingchunzhen/churn_analysis

docker run -t -i --name temp registry.cn-hangzhou.aliyuncs.com/jingchunzhen/churn_analysis /bin/bash

docker cp <path of the event data on the host> temp:/home/workspace/data
```

#### Step 3

Inside the container:

```
cd /home/workspace

bash run.sh
```

#### Step 4

```
docker cp temp:/home/workspace/output/<generated report file> <path on the host>
```

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
python3 main.py -i ./data/an.db2 -d 1 -k f
--------------------------------------------------------------------------------
/temp/clean.sh:
--------------------------------------------------------------------------------
# If arguments are given, delete the files they name.
# If no arguments are given, delete all generated temp files.

if [ $# -eq 0 ]
then
    rm *.txt
    rm *.pkl
else
    for arg in "$@";do
        if [ ! -f "$arg" ];
        then
            echo "$arg does not exist"
        else
            rm $arg
            echo "$arg cleaned up"
        fi
    done
fi
--------------------------------------------------------------------------------
/tests/test_pearson_draw.py:
--------------------------------------------------------------------------------
import pandas as pd
from pyecharts import Line

'''
Sort one column of the report file,
split the sorted result into slices,
then plot the curves for each slice and compute its Pearson coefficient.
'''

def draw(file_in, file_out):
    '''
    Plot the churn ratio against the median follow-up interval and print their Pearson coefficient.
    '''
    df = pd.read_csv(file_in, encoding='utf-8')
    print(type(df['留存比']))
    print(df['留存比'].corr(df['随后中位数时间间隔']))

    # collect the two columns as plain lists
    l1 = df['留存比'].tolist()
    l1 = [e * 1000 for e in l1]
    l2 = df['随后中位数时间间隔'].tolist()

    attr = [i for i in range(len(l1))]

    line = Line("")
    line.add("流失留存比 * 1000", attr, l1)
    line.add("卡顿时间", attr, l2)

    line.show_config()
    line.render(file_out)

if __name__ == "__main__":
    '''
    0.309806861451
    0.465331387018
    0.0989582758376
    '''
    draw('../output/0-600.csv', '../output/0-600.html')
    draw('../output/600-1200.csv', '../output/600-1200.html')
    draw('../output/1200-2400.csv', '../output/1200-2400.html')
--------------------------------------------------------------------------------
/utils/data_analysis.py:
--------------------------------------------------------------------------------
import pickle
import sqlite3
import random
import numpy as np
import pandas as pd
import sklearn
import sys
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


class Data_Analysis():
    '''
    data analysis for churn rate, interval of each op
    the pearson between interval
and churn rate 16 | and so on 17 | ''' 18 | 19 | def __init__(self, *file_in): 20 | ''' 21 | Args: 22 | file_in: ops path (file), label path (pickle), sql file path (db file) 23 | ''' 24 | self.file_ops = file_in[0] 25 | self.file_labels = file_in[1] 26 | self.sql_in = file_in[2] 27 | with open(file_in[0], 'rb') as f_ops, open(file_in[1], 'rb') as f_label: 28 | self.user_labels = pickle.load(f_label) # get a label list 1 for churned 0 for not churned 29 | self.op_categories = set() 30 | self.op_churn = {} 31 | self.op_intervals = {} 32 | self.op_median_intervals = {} 33 | self.op_clicks = {} 34 | self.op_stage = {} 35 | with open(self.file_ops, 'rb') as f_ops: 36 | for ops in f_ops: 37 | ops = ops.decode('utf-8') 38 | ops_list = ops.strip().split(' ') 39 | [self.op_categories.add(op) for op in ops_list] 40 | 41 | def statistics_op_churn(self): 42 | ''' 43 | Return: 44 | (dict): key (string): the op's name, 45 | value (list, length is 2) 46 | [0] for how many user not churned when did this op 47 | [1] for how many user churned when did this op 48 | ''' 49 | with open(self.file_ops, 'rb') as f_ops: 50 | i = 0 51 | for ops in f_ops: 52 | ops = ops.decode('utf-8') 53 | ops_list = ops.strip().split(' ') 54 | if self.user_labels[i] == 0: 55 | # how many user not churned at this op 56 | for op in set(ops_list): 57 | if op not in self.op_churn: 58 | self.op_churn[op] = [0, 0] 59 | self.op_churn[op][0] += 1 60 | else: 61 | # how many user churned at this op 62 | for op in set(ops_list[:-1]): 63 | if op not in self.op_churn: 64 | self.op_churn[op] = [0, 0] 65 | self.op_churn[op][0] += 1 66 | if ops_list[-1] not in self.op_churn: 67 | self.op_churn[ops_list[-1]] = [0, 0] 68 | self.op_churn[ops_list[-1]][1] += 1 69 | i += 1 70 | return self.op_churn 71 | 72 | def statistics_op_clicks(self): 73 | ''' 74 | Return: 75 | (dict): key(string): op's name 76 | value (float list of length of 2): 77 | [0] for the average clicks of the unchurned user, 78 | [1] for the average clicks of the churned user 79 | 80 | ''' 81 | sum_users = [0] * 2 82 | with open(self.file_ops, 'rb') as f_ops: 83 | i = 0 84 | for ops in f_ops: 85 | sum_users[self.user_labels[i]] += 1 86 | ops = ops.decode('utf-8') 87 | ops_list = ops.strip().split(' ') 88 | for op in ops_list: 89 | if op not in self.op_clicks: 90 | self.op_clicks[op] = [0, 0] 91 | self.op_clicks[op][self.user_labels[i]] += 1 92 | i += 1 93 | for k, v in self.op_clicks.items(): 94 | a = v[0] * 1.0 / sum_users[0] 95 | b = v[1] * 1.0 / sum_users[1] 96 | self.op_clicks[k] = [a, b] 97 | return self.op_clicks 98 | 99 | def statistics_op_avg_clicks_ratio(self): 100 | ''' 101 | Return: 102 | (float): ratio the total clilcks of unchurn user / the total clicks of churn user 103 | ''' 104 | total_clicks = [[], []] 105 | with open(self.file_ops, 'rb') as f_ops: 106 | i = 0 107 | for ops in f_ops: 108 | ops = ops.decode('utf-8') 109 | ops_list = ops.strip().split(' ') 110 | total_clicks[self.user_labels[i]].append(len(ops_list)) 111 | i += 1 112 | total_clicks[0].remove(max(total_clicks[0])) 113 | total_clicks[0].remove(min(total_clicks[0])) 114 | total_clicks[1].remove(max(total_clicks[1])) 115 | total_clicks[1].remove(min(total_clicks[1])) 116 | return np.mean(total_clicks[0]) * 1.0 / np.mean(total_clicks[1]) 117 | 118 | def statistics_op_intervals(self): 119 | ''' 120 | Returns: 121 | (dict): key (string): op's name, value (float): the mean interval of this op 122 | (dict): key (string): op's name, value (float): the median interval of this op 123 | ''' 124 | conn = 
sqlite3.connect(self.sql_in)
        c = conn.cursor()
        query_sql = "SELECT user_id, op, current_day, num_days_played, relative_timestamp \
            FROM maidian ORDER BY user_id, relative_timestamp"

        previous_relativetimestamp = 0
        previous_userid = None
        previous_op = None
        intervals = []
        for row in c.execute(query_sql):
            user_id = row[0]
            op = row[1].strip().replace(' ', '')
            current_day = row[2]
            num_days_played = row[3]
            relative_timestamp = row[4]
            # interval between this event and the previous event of the same user
            interval = relative_timestamp - previous_relativetimestamp
            if previous_userid == user_id:
                if previous_op not in self.op_intervals:
                    self.op_intervals[previous_op] = []
                self.op_intervals[previous_op].append(interval)
            else:
                pass
            previous_userid = user_id
            previous_relativetimestamp = relative_timestamp
            previous_op = op

        for k, intervals in self.op_intervals.items():
            self.op_median_intervals[k] = np.median(intervals)
            if len(intervals) >= 10:
                # trim the two largest and two smallest intervals as outliers
                intervals.remove(max(intervals))
                intervals.remove(max(intervals))
                intervals.remove(min(intervals))
                intervals.remove(min(intervals))
            self.op_intervals[k] = np.mean(intervals)
        return self.op_intervals, self.op_median_intervals

    def statistics_op_avg_intervals(self):
        '''
        Return:
            (float): the mean interval over all ops (largest and smallest removed)
        '''
        intervals = []
        for _, interval in self.op_intervals.items():
            intervals.append(interval)
        intervals.remove(max(intervals))
        intervals.remove(min(intervals))
        return np.mean(intervals)

    def statistics_op_median_intervals(self):
        '''
        Return:
            (float): the median interval over all ops
        '''
        intervals = []
        for _, interval in self.op_intervals.items():
            intervals.append(interval)
        return np.median(intervals)

    def statistics_op_stage(self):
        '''
        Return:
            (dict): key (string): op's name
                    value (float list): the op's first-occurrence times, relative to each user's session start
        '''
        conn = sqlite3.connect(self.sql_in)
        c = conn.cursor()
        query_sql = "SELECT user_id, op, relative_timestamp \
            FROM maidian ORDER BY user_id, relative_timestamp ASC"

        previous_userid = None
        start_time = None

        for row in c.execute(query_sql):
            user_id = row[0]
            op = row[1].strip().replace(' ', '')
            relative_timestamp = row[2]

            if previous_userid is None:
                start_time = relative_timestamp
                temp_dict = {}
                temp_dict[op] = relative_timestamp
            elif previous_userid == user_id:
                # only the first occurrence of each op is recorded
                if op not in temp_dict:
                    temp_dict[op] = relative_timestamp
            else:
                # the user changed: flush the previous user's first occurrences
                for prev_op, rt in temp_dict.items():
                    if prev_op not in self.op_stage:
                        self.op_stage[prev_op] = []
                    self.op_stage[prev_op].append(rt - start_time)
                temp_dict = {}
                start_time = relative_timestamp
                temp_dict[op] = relative_timestamp

            previous_userid = user_id
        return self.op_stage

    def statistics_pearson_clicks_intervals(self):
        '''
        Return:
            (float): the Pearson coefficient between the op's interval and the churn rate of this op
        '''
        churn_rates = []
        intervals = []
        for k, v in self.op_churn.items():
            churn_rate = v[1] * 1.0 / v[0]
            churn_rates.append(churn_rate)
            intervals.append(self.op_intervals[k])

236 | s1 = pd.Series(intervals) 237 | s2 = pd.Series(churn_rates) 238 | return s1.corr(s2) 239 | 240 | def statistics_pearson_clicks_stage(self): 241 | ''' 242 | deprecated 243 | used for test 244 | ''' 245 | stages = [] 246 | clicks = [] 247 | 248 | for k, _ in self.op_clicks.items(): 249 | if self.op_clicks[k][1] == 0: 250 | clicks.append(-1) 251 | else: 252 | clicks.append(self.op_clicks[k][0] 253 | * 1.0 / self.op_clicks[k][1]) 254 | 255 | v = self.op_stage[k] 256 | if len(v) > 4: 257 | v.remove(max(v)) 258 | v.remove(min(v)) 259 | stages.append(np.mean(v) * 10) 260 | 261 | s1 = pd.Series(stages) 262 | s2 = pd.Series(clicks) 263 | return s1.corr(s2) 264 | -------------------------------------------------------------------------------- /utils/data_parse.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sqlite3 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | import sys 8 | from sklearn import preprocessing 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 10 | 11 | 12 | class Data_Parser(object): 13 | 14 | def __init__(self, sql_file): 15 | self.sql_in = sql_file 16 | self.fc_user_ops = {} 17 | self.fc_user_label = {} 18 | self.sc_user_ops = {} 19 | self.sc_user_label = {} 20 | self.tc_user_ops = {} 21 | self.tc_user_label = {} 22 | pass 23 | 24 | def parse(self): 25 | ''' 26 | The player's operation sequence is divided by day to analyze the churn in the first, second, third days. 27 | ''' 28 | conn = sqlite3.connect(self.sql_in) 29 | c = conn.cursor() 30 | query_sql = "SELECT user_id, op, current_day, num_days_played, relative_timestamp \ 31 | FROM maidian ORDER BY user_id, relative_timestamp ASC" 32 | for row in c.execute(query_sql): 33 | user_id = row[0] 34 | op = row[1].strip().replace(" ", '') 35 | current_day = row[2] 36 | num_days_played = row[3] 37 | 38 | if current_day == 1: 39 | self.fc_user_label[user_id] = 1 if num_days_played == 1 else 0 40 | if user_id not in self.fc_user_ops: 41 | self.fc_user_ops[user_id] = [] 42 | self.fc_user_ops[user_id].append(op) 43 | elif current_day == 2: 44 | self.sc_user_label[user_id] = 1 if num_days_played == 2 else 0 45 | if user_id not in self.sc_user_ops: 46 | self.sc_user_ops[user_id] = [] 47 | self.sc_user_ops[user_id].append(op) 48 | elif current_day == 3: 49 | self.tc_user_label[user_id] = 1 if num_days_played == 3 else 0 50 | if user_id not in self.tc_user_ops: 51 | self.tc_user_ops[user_id] = [] 52 | self.tc_user_ops[user_id].append(op) 53 | else: 54 | pass 55 | pass 56 | 57 | def write_in(self, file_out): 58 | with open(file_out[0], 'a') as f_fc_train, open(file_out[1], 'a') as f_sc_train, open(file_out[2], 'a') as f_tc_train, \ 59 | open(file_out[3], 'wb') as f_fc_label, open(file_out[4], 'wb') as f_sc_label, open(file_out[5], 'wb') as f_tc_label: 60 | fc_labels = [] 61 | sc_labels = [] 62 | tc_labels = [] 63 | for user in self.fc_user_ops: 64 | s = ' '.join(self.fc_user_ops[user]) 65 | f_fc_train.write(s + '\n') 66 | fc_labels.append(self.fc_user_label[user]) 67 | for user in self.sc_user_ops: 68 | s = ' '.join(self.sc_user_ops[user]) 69 | f_sc_train.write(s + '\n') 70 | sc_labels.append(self.sc_user_label[user]) 71 | for user in self.tc_user_ops: 72 | s = ' '.join(self.tc_user_ops[user]) 73 | f_tc_train.write(s + '\n') 74 | tc_labels.append(self.tc_user_label[user]) 75 | 76 | pickle.dump(fc_labels, f_fc_label) 77 | pickle.dump(sc_labels, f_sc_label) 78 | pickle.dump(tc_labels, f_tc_label) 79 | 80 | 
def load_tfidf(self, *file_in, minimum_support=5, sample_rate=0, method='tfidf'): 81 | ''' 82 | get the tfidf of the op's sequence 83 | Args: 84 | file_in: training data and label data 85 | minimum_support (int): minimum count for op 86 | sample rate (int [0, 10)): if 0 not sample 87 | method (string): can either be 'count' or 'tfidf' 88 | Returns: 89 | X: (np.array) 90 | Y: (list) 91 | op: (list) 92 | ''' 93 | assert len(file_in) == 2 94 | corpus = [] 95 | new_corpus = [] 96 | new_labels = [] 97 | op_counts = {} 98 | 99 | with open(file_in[0], 'rb') as f_train: 100 | for line in f_train: 101 | ops = set(line.decode('utf-8').strip().split(' ')) 102 | for op in ops: 103 | if op not in op_counts: 104 | op_counts[op] = 1 105 | else: 106 | op_counts[op] += 1 107 | 108 | with open(file_in[0], 'rb') as f_train, open(file_in[1], 'rb') as f_label: 109 | for line in f_train: 110 | ops = line.decode('utf-8').strip().split(' ') 111 | [ops.remove(op) 112 | for op in ops if op_counts[op] <= minimum_support] 113 | line = ' '.join(ops) 114 | corpus.append(line) 115 | 116 | labels = pickle.load(f_label) 117 | 118 | if sample_rate != 0: 119 | sampled_corpus = [] 120 | sampled_labels = [] 121 | sample_index = [] 122 | for i in range(len(labels)): 123 | if labels[i] == 0: 124 | sampled_corpus.append(corpus[i]) 125 | sampled_labels.append(labels[i]) 126 | else: 127 | if random.randint(0, 100) > sample_rate * 100: 128 | sampled_corpus.append(corpus[i]) 129 | sampled_labels.append(labels[i]) 130 | new_corpus = sampled_corpus 131 | new_labels = sampled_labels 132 | else: 133 | new_corpus = corpus 134 | new_labels = labels 135 | pass 136 | 137 | vectorizer = CountVectorizer(analyzer=str.split) 138 | 139 | if method.lower() == 'count': 140 | X = vectorizer.fit_transform( 141 | new_corpus).toarray() # 142 | elif method.lower() == 'tfidf': 143 | transformer = TfidfTransformer() 144 | tfidf = transformer.fit_transform( 145 | vectorizer.fit_transform(new_corpus)) 146 | X = tfidf.toarray() 147 | Y = new_labels 148 | op = vectorizer.get_feature_names() 149 | 150 | return X, Y, op 151 | -------------------------------------------------------------------------------- /xgb/xgb_model.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | import sklearn 3 | from sklearn import metrics 4 | import numpy as np 5 | import pickle 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.metrics import precision_recall_fscore_support 9 | from sklearn.feature_selection import SelectKBest 10 | 11 | 12 | class XGB_Model(): 13 | ''' 14 | ''' 15 | 16 | def __init__(self, X, Y, op_name, validate_size, test_size): 17 | self.X = X 18 | self.Y = Y 19 | self.op_name = op_name 20 | self.key_ops = None 21 | X_train, self.X_test, Y_train, self.Y_test = train_test_split( 22 | X, Y, test_size=test_size) 23 | self.X_train, self.X_validate, self.Y_train, self.Y_validate = train_test_split( 24 | X_train, Y_train, test_size=validate_size) 25 | 26 | def model(self): 27 | model = xgb.XGBClassifier( 28 | learning_rate=0.1, n_estimators=20, max_depth=3, subsample=1) 29 | eval_set = [(self.X_validate, self.Y_validate)] 30 | model.fit(self.X_train, self.Y_train, early_stopping_rounds=20, 31 | eval_metric="logloss", eval_set=eval_set, verbose=True) 32 | 33 | # Y_pred -> np.ndarray Y_train -> list 34 | Y_pred = model.predict(self.X_train) 35 | print('training score {}'.format(accuracy_score(self.Y_train, Y_pred))) 36 | Y_pred = 
model.predict(self.X_validate)
        print('validate score {}'.format(
            accuracy_score(self.Y_validate, Y_pred)))
        Y_pred = model.predict(self.X_test)
        print('test score {}'.format(accuracy_score(self.Y_test, Y_pred)))
        print(precision_recall_fscore_support(
            self.Y_test, Y_pred, average=None))

        print(np.shape(model.feature_importances_))
        self.key_ops = list(model.feature_importances_)
--------------------------------------------------------------------------------