├── main.py
├── readme.md
├── run.sh
├── temp
│   └── clean.sh
├── tests
│   └── test_pearson_draw.py
├── utils
│   ├── data_analysis.py
│   └── data_parse.py
└── xgb
    └── xgb_model.py

/main.py:
--------------------------------------------------------------------------------
from utils.data_parse import Data_Parser
from utils.data_analysis import Data_Analysis
from xgb.xgb_model import XGB_Model
import os
import sys
import getopt
import numpy as np
import pandas as pd
import time


class Main():
    '''
    Pipeline for the game churn analysis.
    TODO: adapt the code so it covers all ops, not only the key ops.
    '''

    def __init__(self, day, sql_in, k):
        self.day = day
        self.sql_in = sql_in
        self.k = k
        self.training_data = {
            1: './temp/fc_train.txt',
            2: './temp/sc_train.txt',
            3: './temp/tc_train.txt'
        }
        self.training_label = {
            1: './temp/fc_label.pkl',
            2: './temp/sc_label.pkl',
            3: './temp/tc_label.pkl'
        }

        self.x = None
        self.y = None
        self.op = None

        # generate the training data and the per-op statistics
        self._load_data()
        self.da = Data_Analysis(
            self.training_data[self.day], self.training_label[self.day], self.sql_in)
        self.op_churn = self.da.statistics_op_churn()
        self.op_clicks = self.da.statistics_op_clicks()
        temp_intervals = self.da.statistics_op_intervals()
        self.op_intervals = temp_intervals[0]
        self.op_median_intervals = temp_intervals[1]
        self.op_stages = self.da.statistics_op_stage()

        # op code -> human-readable description, filled by _get_op_verbose()
        self.op_verbose = {}

        self.key_ops = None

    def _load_data(self):
        dp = Data_Parser(self.sql_in)
        if os.path.exists(self.training_data[self.day]) and os.path.exists(self.training_label[self.day]):
            pass
        else:
            dp.parse()
            dp.write_in([
                './temp/fc_train.txt',
                './temp/sc_train.txt',
                './temp/tc_train.txt',
                './temp/fc_label.pkl',
                './temp/sc_label.pkl',
                './temp/tc_label.pkl'
            ])
        self.x, self.y, self.op = dp.load_tfidf(
            self.training_data[self.day], self.training_label[self.day])

    def _get_key_ops(self):
        xgb = XGB_Model(self.x, self.y, self.op, 0.2, 0.1)
        xgb.model()
        print(len(xgb.key_ops))
        self.key_ops = xgb.key_ops

    def _get_op_verbose(self):
        '''
        Load the op descriptions from ./data/动作说明.xlsx into self.op_verbose.
        '''
        import xlrd
        data = xlrd.open_workbook('./data/动作说明.xlsx')
        table = data.sheet_by_name('Sheet1')

        nrows = table.nrows

        for i in range(nrows):
            line = table.row_values(i)
            if line[0] not in self.op_verbose:
                self.op_verbose[line[0]] = line[1]

    def ops_analysis(self):
        self._get_key_ops()
        self._get_op_verbose()
        sorted_keyops = np.argsort(self.key_ops)
        print(len(self.key_ops))

        st_op_churn = {}
        st_op_clicks = {}
        st_op_stage = {}
        st_op_churnnum = {}

        for k, v in self.op_churn.items():
            if v[0] == 0:
                st_op_churn[k] = -1
            else:
                st_op_churn[k] = v[1] * 1.0 / v[0]
            st_op_churnnum[k] = v[1]

        for k, v in self.op_clicks.items():
            if v[1] == 0:
                st_op_clicks[k] = -1 * v[0]
            else:
                st_op_clicks[k] = v[0] * 1.0 / v[1]

        for k, v in self.op_stages.items():
            if len(v) > 4:
                # drop the largest and smallest first-occurrence time as outliers
                v.remove(max(v))
                v.remove(min(v))
            st_op_stage[k] = np.mean(v)

        print('动作平均时间间隔: {}'.format(self.da.statistics_op_avg_intervals()))
        print('动作平均点击比例: {}'.format(self.da.statistics_op_avg_clicks_ratio()))
        print('动作平均时间间隔和动作留存比之间的pearson系数: {}'.format(
            self.da.statistics_pearson_clicks_intervals()))
        print('动作点击比值和动作阶段之间的pearson系数: {}'.format(
            self.da.statistics_pearson_clicks_stage()))
        self.draw()

        if self.k is True:
            # analyse only the key ops extracted by the classifier
            data = []
            for opid in sorted_keyops[::-1]:
                op_name = self.op[opid]
                if self.key_ops[opid] == 0:
                    break
                # op name, description, churn ratio, churned-user count, click-count ratio,
                # stage, mean / median follow-up interval, importance
                verbose = self.op_verbose[op_name] if op_name in self.op_verbose else ""
                print('{}|{}|{:.5f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.4f}'.format(
                    op_name, verbose, st_op_churn[op_name], st_op_churnnum[op_name],
                    st_op_clicks[op_name], st_op_stage[op_name], self.op_intervals[op_name],
                    self.op_median_intervals[op_name], self.key_ops[opid]))
                data.append(
                    [op_name,
                     verbose,
                     "{:.5f}".format(st_op_churn[op_name]),
                     "{:.2f}".format(st_op_churnnum[op_name]),
                     "{:.2f}".format(st_op_clicks[op_name]),
                     "{:.2f}".format(st_op_stage[op_name]),
                     "{:.2f}".format(self.op_intervals[op_name]),
                     "{:.2f}".format(self.op_median_intervals[op_name]),
                     "{:.4f}".format(self.key_ops[opid])]
                )

            df = pd.DataFrame(data=data, columns=[
                '动作', '动作详细说明', '留存比', '动作流失人数', '点击次数比', '动作时段', '随后时间间隔', '随后中位数时间间隔', '重要性'])

            # the key-op report is written as an Excel file
            self.writeTo(parent_dir='output', path='',
                         file_name='xlsx', pd_file=df, output_format='xlsx')
        else:
            # analyse all the ops
            data = []
            for op_name, _ in st_op_clicks.items():
                # op name, churn ratio, churned-user count, click-count ratio,
                # stage, mean / median follow-up interval
                print('{}|{:.5f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}'.format(
                    op_name, st_op_churn[op_name], st_op_churnnum[op_name],
                    st_op_clicks[op_name], st_op_stage[op_name],
                    self.op_intervals[op_name], self.op_median_intervals[op_name]))
                data.append(
                    [op_name,
                     "{:.5f}".format(st_op_churn[op_name]),
                     "{:.2f}".format(st_op_churnnum[op_name]),
                     "{:.2f}".format(st_op_clicks[op_name]),
                     "{:.2f}".format(st_op_stage[op_name]),
                     "{:.2f}".format(self.op_intervals[op_name]),
                     "{:.2f}".format(self.op_median_intervals[op_name])]
                )

            df = pd.DataFrame(data=data, columns=[
                '动作', '留存比', '动作流失人数', '点击次数比', '动作时段', '随后平均时间间隔', '随后中位数时间间隔'])

            self.writeTo(parent_dir='output', path='',
                         file_name='csv', pd_file=df)

    def writeTo(self, parent_dir, path, file_name, pd_file, output_format='csv'):
        full_path = os.path.join(parent_dir, path)
        os.makedirs(full_path, exist_ok=True)

        timeFlag = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        time_stamp = '_'.join(timeFlag.split())
        file_name = file_name + "_" + time_stamp
        full_path = os.path.join(full_path, file_name)

        if output_format.lower() == 'csv':
            full_path = full_path + "." + output_format.lower()
            print(full_path)
            pd_file.to_csv(full_path)
        else:
            full_path = full_path + ".xlsx"
            pd_file.to_excel(full_path, "sheet1",
                             index=False, engine='xlsxwriter')

    def draw(self):
        from pyecharts import Line
        churn_rates = []
        intervals = []
        for k, v in self.op_churn.items():
            churn_rate = v[1] * 1.0 / v[0]
            churn_rates.append(churn_rate)
            intervals.append(self.op_intervals[k])
        churnrates = [rate * 1000 for rate in churn_rates]

        # keep every 10th point so the two curves stay readable
        new_intervals = [i for i in intervals[::10]]
        new_churnrates = [c for c in churnrates[::10]]
        attr = [i for i in range(0, len(intervals), 10)]
        line = Line("")
        line.add("动作随后时间间隔", attr, new_intervals)
        line.add("动作留存比 * 1000", attr, new_churnrates)
        line.show_config()
        line.render('./output/render.html')


def usage():
    print('-i: input file; must be a sqlite database file')
    print('-d: day number to analyse; must be 1, 2 or 3')
    print('-k: t to analyse only the key actions, f to analyse all actions')


if __name__ == '__main__':
    opts, args = getopt.getopt(sys.argv[1:], 'hi:d:k:')
    for op, value in opts:
        if op == '-i':
            sql_in = value
        if op == '-d':
            day = int(value)
        if op == '-k':
            k = (value == 't')
        if op == '-h':
            usage()
    main = Main(day=day, sql_in=sql_in, k=k)
    main.ops_analysis()
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Churn Analysis SDK
## Introduction

This program analyses player churn: it parses event-tracking data stored as a sqlite database and performs a churn analysis for a game's players. It runs on Ubuntu 16.04 with Python 3.5 and requires packages such as sklearn, pandas, numpy and XGBoost.

To make it easier to use, the environment and the code have been packaged into a Docker image and pushed to the Aliyun registry under the name
**registry.cn-hangzhou.aliyuncs.com/jingchunzhen/churn_analysis**

## Input and Output

#### Step 1

Create a folder named data in the project root and put the sqlite database file to be parsed into it. The database must contain at least the following fields (a minimal creation sketch follows the table):

Field|Type|Meaning
-----|-----------|---------
user_id|int|user identifier
op|text (string)|name of the action
current_day|int|the day of play on which the action happened
num_days_played|int|total number of days the user played
relative_timestamp|float|time at which the action happened
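
If you need to assemble such a database yourself, the sketch below shows one way to create a table with the expected columns using Python's built-in sqlite3 module. The table name `maidian` is the one queried by `utils/data_parse.py` and `utils/data_analysis.py`; the file name and the sample row are placeholders, not part of the project.

```
import sqlite3

# The parser and the analysis code both read from a table named `maidian`.
conn = sqlite3.connect('./data/example.db2')  # placeholder file name
conn.execute('''
    CREATE TABLE IF NOT EXISTS maidian (
        user_id            INTEGER,
        op                 TEXT,
        current_day        INTEGER,
        num_days_played    INTEGER,
        relative_timestamp REAL
    )
''')
# Placeholder row: user 1 performed op "op_0001" 3.5 seconds into day 1
# and played 2 days in total.
conn.execute("INSERT INTO maidian VALUES (?, ?, ?, ?, ?)", (1, 'op_0001', 1, 2, 3.5))
conn.commit()
conn.close()
```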

#### Step 2

```
python3 main.py -i XXX.db2 -d 1 -k t
```
-i: path of the database file
-d: which day to analyse, i.e. day-1 churn, day-2 retention churn or day-3 retention churn
-k: t analyses only the key actions extracted by the classifier; f analyses all actions

#### Output

The output is a report file stored in the output folder: a CSV for the all-actions analysis, an Excel file for the key-actions analysis. It can be post-processed directly, as shown in the sketch after the table. The report contains the following fields:

Field|Description
----|---------
动作|the op code as it appears in the event-tracking data
留存比|number of users who churned on this op divided by the number of users who performed it without churning
动作流失人数|number of users who churned on this op
点击次数比|average click count of retained players on this op divided by the average click count of churned players
动作时段|average time of the op's first occurrence, used to judge whether it is an early-, mid- or late-game action
随后平均时间间隔|stuck-point analysis: mean follow-up interval, used to judge whether the op is a potential stuck point
随后中位数时间间隔|stuck-point analysis: median follow-up interval
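
The timestamped report can be inspected directly with pandas. As a minimal sketch (the file name below is a placeholder, since main.py appends the current timestamp to it), the same correlation check performed in `tests/test_pearson_draw.py` looks like this:

```
import pandas as pd

# Placeholder path: main.py writes files such as output/csv_<date>_<time>.csv
df = pd.read_csv('./output/csv_2018-01-01_00:00:00.csv', encoding='utf-8')

# Pearson coefficient between the churn ratio of an op and its median follow-up
# interval, i.e. whether ops that players get stuck on are also ops they churn on.
print(df['留存比'].corr(df['随后中位数时间间隔']))

# The ten ops with the highest churn ratio.
print(df.sort_values('留存比', ascending=False)[['动作', '留存比', '动作流失人数']].head(10))
```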

In addition, the tool also computes:
* the Pearson coefficient between an op's churn ratio and its follow-up interval, to check the correlation between churn and stuck points, and plots the corresponding curves
* the average stuck time over all ops
* the average click-count ratio over all ops, to check whether a given op is preferred by retained or by churned players


## Project structure

#### utils

- data_parse.py parses the raw data and writes the intermediate files into the temp folder.
- data_analysis.py computes statistics of the data, such as the stuck-point analysis and the correlation coefficient between churn rate and stuck points.

#### temp

Stores the generated intermediate files (including the parsed data).

- clean.sh cleans up the temp folder.

#### xgb

- xgb_model.py the algorithm used for the churn analysis.

#### main.py

The pipeline that ties the steps together.

#### output

Stores the generated report files and the plotted curves.

## Using the Docker image

#### Step 1

Install Docker.

#### Step 2

```
docker pull registry.cn-hangzhou.aliyuncs.com/jingchunzhen/churn_analysis

docker run -t -i --name temp registry.cn-hangzhou.aliyuncs.com/jingchunzhen/churn_analysis /bin/bash

docker cp <path of the event data on the host> temp:/home/workspace/data
```

#### Step 3

Inside the container:

```
cd /home/workspace

bash run.sh
```

#### Step 4

```
docker cp temp:/home/workspace/output/<generated report file> <path on the host>
```

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
python3 main.py -i ./data/an.db2 -d 1 -k f
--------------------------------------------------------------------------------
/temp/clean.sh:
--------------------------------------------------------------------------------
# If arguments are given, delete the files they name.
# If no arguments are given, delete all generated temp files.

if [ $# -eq 0 ]
then
    rm *.txt
    rm *.pkl
else
    for arg in "$@";do
        if [ ! -f "$arg" ];
        then
            echo "$arg does not exist"
        else
            rm $arg
            echo "$arg cleaned up"
        fi
    done
fi
--------------------------------------------------------------------------------
/tests/test_pearson_draw.py:
--------------------------------------------------------------------------------
import pandas as pd
from pyecharts import Line

'''
Sort one column of the report file,
split the sorted result into slices,
then plot the curves for each slice and compute its Pearson coefficient.
'''

def draw(file_in, file_out):
    '''
    Plot the churn ratio against the median follow-up interval and print their Pearson coefficient.
    '''
    df = pd.read_csv(file_in, encoding='utf-8')
    print(type(df['留存比']))
    print(df['留存比'].corr(df['随后中位数时间间隔']))

    # collect the two columns as plain lists
    l1 = df['留存比'].tolist()
    l1 = [e * 1000 for e in l1]
    l2 = df['随后中位数时间间隔'].tolist()

    attr = [i for i in range(len(l1))]

    line = Line("")
    line.add("流失留存比 * 1000", attr, l1)
    line.add("卡顿时间", attr, l2)

    line.show_config()
    line.render(file_out)

if __name__ == "__main__":
    '''
    0.309806861451
    0.465331387018
    0.0989582758376
    '''
    draw('../output/0-600.csv', '../output/0-600.html')
    draw('../output/600-1200.csv', '../output/600-1200.html')
    draw('../output/1200-2400.csv', '../output/1200-2400.html')
--------------------------------------------------------------------------------
/utils/data_analysis.py:
--------------------------------------------------------------------------------
import pickle
import sqlite3
import random
import numpy as np
import pandas as pd
import sklearn
import sys
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


class Data_Analysis():
    '''
    data analysis for churn rate, interval of each op
    the pearson between interval
and churn rate 16 | and so on 17 | ''' 18 | 19 | def __init__(self, *file_in): 20 | ''' 21 | Args: 22 | file_in: ops path (file), label path (pickle), sql file path (db file) 23 | ''' 24 | self.file_ops = file_in[0] 25 | self.file_labels = file_in[1] 26 | self.sql_in = file_in[2] 27 | with open(file_in[0], 'rb') as f_ops, open(file_in[1], 'rb') as f_label: 28 | self.user_labels = pickle.load(f_label) # get a label list 1 for churned 0 for not churned 29 | self.op_categories = set() 30 | self.op_churn = {} 31 | self.op_intervals = {} 32 | self.op_median_intervals = {} 33 | self.op_clicks = {} 34 | self.op_stage = {} 35 | with open(self.file_ops, 'rb') as f_ops: 36 | for ops in f_ops: 37 | ops = ops.decode('utf-8') 38 | ops_list = ops.strip().split(' ') 39 | [self.op_categories.add(op) for op in ops_list] 40 | 41 | def statistics_op_churn(self): 42 | ''' 43 | Return: 44 | (dict): key (string): the op's name, 45 | value (list, length is 2) 46 | [0] for how many user not churned when did this op 47 | [1] for how many user churned when did this op 48 | ''' 49 | with open(self.file_ops, 'rb') as f_ops: 50 | i = 0 51 | for ops in f_ops: 52 | ops = ops.decode('utf-8') 53 | ops_list = ops.strip().split(' ') 54 | if self.user_labels[i] == 0: 55 | # how many user not churned at this op 56 | for op in set(ops_list): 57 | if op not in self.op_churn: 58 | self.op_churn[op] = [0, 0] 59 | self.op_churn[op][0] += 1 60 | else: 61 | # how many user churned at this op 62 | for op in set(ops_list[:-1]): 63 | if op not in self.op_churn: 64 | self.op_churn[op] = [0, 0] 65 | self.op_churn[op][0] += 1 66 | if ops_list[-1] not in self.op_churn: 67 | self.op_churn[ops_list[-1]] = [0, 0] 68 | self.op_churn[ops_list[-1]][1] += 1 69 | i += 1 70 | return self.op_churn 71 | 72 | def statistics_op_clicks(self): 73 | ''' 74 | Return: 75 | (dict): key(string): op's name 76 | value (float list of length of 2): 77 | [0] for the average clicks of the unchurned user, 78 | [1] for the average clicks of the churned user 79 | 80 | ''' 81 | sum_users = [0] * 2 82 | with open(self.file_ops, 'rb') as f_ops: 83 | i = 0 84 | for ops in f_ops: 85 | sum_users[self.user_labels[i]] += 1 86 | ops = ops.decode('utf-8') 87 | ops_list = ops.strip().split(' ') 88 | for op in ops_list: 89 | if op not in self.op_clicks: 90 | self.op_clicks[op] = [0, 0] 91 | self.op_clicks[op][self.user_labels[i]] += 1 92 | i += 1 93 | for k, v in self.op_clicks.items(): 94 | a = v[0] * 1.0 / sum_users[0] 95 | b = v[1] * 1.0 / sum_users[1] 96 | self.op_clicks[k] = [a, b] 97 | return self.op_clicks 98 | 99 | def statistics_op_avg_clicks_ratio(self): 100 | ''' 101 | Return: 102 | (float): ratio the total clilcks of unchurn user / the total clicks of churn user 103 | ''' 104 | total_clicks = [[], []] 105 | with open(self.file_ops, 'rb') as f_ops: 106 | i = 0 107 | for ops in f_ops: 108 | ops = ops.decode('utf-8') 109 | ops_list = ops.strip().split(' ') 110 | total_clicks[self.user_labels[i]].append(len(ops_list)) 111 | i += 1 112 | total_clicks[0].remove(max(total_clicks[0])) 113 | total_clicks[0].remove(min(total_clicks[0])) 114 | total_clicks[1].remove(max(total_clicks[1])) 115 | total_clicks[1].remove(min(total_clicks[1])) 116 | return np.mean(total_clicks[0]) * 1.0 / np.mean(total_clicks[1]) 117 | 118 | def statistics_op_intervals(self): 119 | ''' 120 | Returns: 121 | (dict): key (string): op's name, value (float): the mean interval of this op 122 | (dict): key (string): op's name, value (float): the median interval of this op 123 | ''' 124 | conn = 
sqlite3.connect(self.sql_in)
        c = conn.cursor()
        query_sql = "SELECT user_id, op, current_day, num_days_played, relative_timestamp \
            FROM maidian ORDER BY user_id, relative_timestamp"

        previous_relativetimestamp = 0
        previous_userid = None
        previous_op = None
        intervals = []
        for row in c.execute(query_sql):
            user_id = row[0]
            op = row[1].strip().replace(' ', '')
            current_day = row[2]
            num_days_played = row[3]
            relative_timestamp = row[4]
            # interval between this event and the previous event of the same user
            interval = relative_timestamp - previous_relativetimestamp
            if previous_userid == user_id:
                if previous_op not in self.op_intervals:
                    self.op_intervals[previous_op] = []
                self.op_intervals[previous_op].append(interval)
            else:
                pass
            previous_userid = user_id
            previous_relativetimestamp = relative_timestamp
            previous_op = op

        for k, intervals in self.op_intervals.items():
            self.op_median_intervals[k] = np.median(intervals)
            if len(intervals) >= 10:
                # trim the two largest and two smallest intervals as outliers
                intervals.remove(max(intervals))
                intervals.remove(max(intervals))
                intervals.remove(min(intervals))
                intervals.remove(min(intervals))
            self.op_intervals[k] = np.mean(intervals)
        return self.op_intervals, self.op_median_intervals

    def statistics_op_avg_intervals(self):
        '''
        Return:
            (float): the mean interval over all ops (largest and smallest removed)
        '''
        intervals = []
        for _, interval in self.op_intervals.items():
            intervals.append(interval)
        intervals.remove(max(intervals))
        intervals.remove(min(intervals))
        return np.mean(intervals)

    def statistics_op_median_intervals(self):
        '''
        Return:
            (float): the median interval over all ops
        '''
        intervals = []
        for _, interval in self.op_intervals.items():
            intervals.append(interval)
        return np.median(intervals)

    def statistics_op_stage(self):
        '''
        Return:
            (dict): key (string): op's name
                    value (float list): the op's first-occurrence times, relative to each user's session start
        '''
        conn = sqlite3.connect(self.sql_in)
        c = conn.cursor()
        query_sql = "SELECT user_id, op, relative_timestamp \
            FROM maidian ORDER BY user_id, relative_timestamp ASC"

        previous_userid = None
        start_time = None

        for row in c.execute(query_sql):
            user_id = row[0]
            op = row[1].strip().replace(' ', '')
            relative_timestamp = row[2]

            if previous_userid is None:
                start_time = relative_timestamp
                temp_dict = {}
                temp_dict[op] = relative_timestamp
            elif previous_userid == user_id:
                # only the first occurrence of each op is recorded
                if op not in temp_dict:
                    temp_dict[op] = relative_timestamp
            else:
                # the user changed: flush the previous user's first occurrences
                for prev_op, rt in temp_dict.items():
                    if prev_op not in self.op_stage:
                        self.op_stage[prev_op] = []
                    self.op_stage[prev_op].append(rt - start_time)
                temp_dict = {}
                start_time = relative_timestamp
                temp_dict[op] = relative_timestamp

            previous_userid = user_id
        return self.op_stage

    def statistics_pearson_clicks_intervals(self):
        '''
        Return:
            (float): the Pearson coefficient between the op's interval and the churn rate of this op
        '''
        churn_rates = []
        intervals = []
        for k, v in self.op_churn.items():
            churn_rate = v[1] * 1.0 / v[0]
            churn_rates.append(churn_rate)
            intervals.append(self.op_intervals[k])

236 | s1 = pd.Series(intervals) 237 | s2 = pd.Series(churn_rates) 238 | return s1.corr(s2) 239 | 240 | def statistics_pearson_clicks_stage(self): 241 | ''' 242 | deprecated 243 | used for test 244 | ''' 245 | stages = [] 246 | clicks = [] 247 | 248 | for k, _ in self.op_clicks.items(): 249 | if self.op_clicks[k][1] == 0: 250 | clicks.append(-1) 251 | else: 252 | clicks.append(self.op_clicks[k][0] 253 | * 1.0 / self.op_clicks[k][1]) 254 | 255 | v = self.op_stage[k] 256 | if len(v) > 4: 257 | v.remove(max(v)) 258 | v.remove(min(v)) 259 | stages.append(np.mean(v) * 10) 260 | 261 | s1 = pd.Series(stages) 262 | s2 = pd.Series(clicks) 263 | return s1.corr(s2) 264 | -------------------------------------------------------------------------------- /utils/data_parse.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sqlite3 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | import sys 8 | from sklearn import preprocessing 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 10 | 11 | 12 | class Data_Parser(object): 13 | 14 | def __init__(self, sql_file): 15 | self.sql_in = sql_file 16 | self.fc_user_ops = {} 17 | self.fc_user_label = {} 18 | self.sc_user_ops = {} 19 | self.sc_user_label = {} 20 | self.tc_user_ops = {} 21 | self.tc_user_label = {} 22 | pass 23 | 24 | def parse(self): 25 | ''' 26 | The player's operation sequence is divided by day to analyze the churn in the first, second, third days. 27 | ''' 28 | conn = sqlite3.connect(self.sql_in) 29 | c = conn.cursor() 30 | query_sql = "SELECT user_id, op, current_day, num_days_played, relative_timestamp \ 31 | FROM maidian ORDER BY user_id, relative_timestamp ASC" 32 | for row in c.execute(query_sql): 33 | user_id = row[0] 34 | op = row[1].strip().replace(" ", '') 35 | current_day = row[2] 36 | num_days_played = row[3] 37 | 38 | if current_day == 1: 39 | self.fc_user_label[user_id] = 1 if num_days_played == 1 else 0 40 | if user_id not in self.fc_user_ops: 41 | self.fc_user_ops[user_id] = [] 42 | self.fc_user_ops[user_id].append(op) 43 | elif current_day == 2: 44 | self.sc_user_label[user_id] = 1 if num_days_played == 2 else 0 45 | if user_id not in self.sc_user_ops: 46 | self.sc_user_ops[user_id] = [] 47 | self.sc_user_ops[user_id].append(op) 48 | elif current_day == 3: 49 | self.tc_user_label[user_id] = 1 if num_days_played == 3 else 0 50 | if user_id not in self.tc_user_ops: 51 | self.tc_user_ops[user_id] = [] 52 | self.tc_user_ops[user_id].append(op) 53 | else: 54 | pass 55 | pass 56 | 57 | def write_in(self, file_out): 58 | with open(file_out[0], 'a') as f_fc_train, open(file_out[1], 'a') as f_sc_train, open(file_out[2], 'a') as f_tc_train, \ 59 | open(file_out[3], 'wb') as f_fc_label, open(file_out[4], 'wb') as f_sc_label, open(file_out[5], 'wb') as f_tc_label: 60 | fc_labels = [] 61 | sc_labels = [] 62 | tc_labels = [] 63 | for user in self.fc_user_ops: 64 | s = ' '.join(self.fc_user_ops[user]) 65 | f_fc_train.write(s + '\n') 66 | fc_labels.append(self.fc_user_label[user]) 67 | for user in self.sc_user_ops: 68 | s = ' '.join(self.sc_user_ops[user]) 69 | f_sc_train.write(s + '\n') 70 | sc_labels.append(self.sc_user_label[user]) 71 | for user in self.tc_user_ops: 72 | s = ' '.join(self.tc_user_ops[user]) 73 | f_tc_train.write(s + '\n') 74 | tc_labels.append(self.tc_user_label[user]) 75 | 76 | pickle.dump(fc_labels, f_fc_label) 77 | pickle.dump(sc_labels, f_sc_label) 78 | pickle.dump(tc_labels, f_tc_label) 79 | 80 | 
def load_tfidf(self, *file_in, minimum_support=5, sample_rate=0, method='tfidf'): 81 | ''' 82 | get the tfidf of the op's sequence 83 | Args: 84 | file_in: training data and label data 85 | minimum_support (int): minimum count for op 86 | sample rate (int [0, 10)): if 0 not sample 87 | method (string): can either be 'count' or 'tfidf' 88 | Returns: 89 | X: (np.array) 90 | Y: (list) 91 | op: (list) 92 | ''' 93 | assert len(file_in) == 2 94 | corpus = [] 95 | new_corpus = [] 96 | new_labels = [] 97 | op_counts = {} 98 | 99 | with open(file_in[0], 'rb') as f_train: 100 | for line in f_train: 101 | ops = set(line.decode('utf-8').strip().split(' ')) 102 | for op in ops: 103 | if op not in op_counts: 104 | op_counts[op] = 1 105 | else: 106 | op_counts[op] += 1 107 | 108 | with open(file_in[0], 'rb') as f_train, open(file_in[1], 'rb') as f_label: 109 | for line in f_train: 110 | ops = line.decode('utf-8').strip().split(' ') 111 | [ops.remove(op) 112 | for op in ops if op_counts[op] <= minimum_support] 113 | line = ' '.join(ops) 114 | corpus.append(line) 115 | 116 | labels = pickle.load(f_label) 117 | 118 | if sample_rate != 0: 119 | sampled_corpus = [] 120 | sampled_labels = [] 121 | sample_index = [] 122 | for i in range(len(labels)): 123 | if labels[i] == 0: 124 | sampled_corpus.append(corpus[i]) 125 | sampled_labels.append(labels[i]) 126 | else: 127 | if random.randint(0, 100) > sample_rate * 100: 128 | sampled_corpus.append(corpus[i]) 129 | sampled_labels.append(labels[i]) 130 | new_corpus = sampled_corpus 131 | new_labels = sampled_labels 132 | else: 133 | new_corpus = corpus 134 | new_labels = labels 135 | pass 136 | 137 | vectorizer = CountVectorizer(analyzer=str.split) 138 | 139 | if method.lower() == 'count': 140 | X = vectorizer.fit_transform( 141 | new_corpus).toarray() # 142 | elif method.lower() == 'tfidf': 143 | transformer = TfidfTransformer() 144 | tfidf = transformer.fit_transform( 145 | vectorizer.fit_transform(new_corpus)) 146 | X = tfidf.toarray() 147 | Y = new_labels 148 | op = vectorizer.get_feature_names() 149 | 150 | return X, Y, op 151 | -------------------------------------------------------------------------------- /xgb/xgb_model.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | import sklearn 3 | from sklearn import metrics 4 | import numpy as np 5 | import pickle 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.metrics import precision_recall_fscore_support 9 | from sklearn.feature_selection import SelectKBest 10 | 11 | 12 | class XGB_Model(): 13 | ''' 14 | ''' 15 | 16 | def __init__(self, X, Y, op_name, validate_size, test_size): 17 | self.X = X 18 | self.Y = Y 19 | self.op_name = op_name 20 | self.key_ops = None 21 | X_train, self.X_test, Y_train, self.Y_test = train_test_split( 22 | X, Y, test_size=test_size) 23 | self.X_train, self.X_validate, self.Y_train, self.Y_validate = train_test_split( 24 | X_train, Y_train, test_size=validate_size) 25 | 26 | def model(self): 27 | model = xgb.XGBClassifier( 28 | learning_rate=0.1, n_estimators=20, max_depth=3, subsample=1) 29 | eval_set = [(self.X_validate, self.Y_validate)] 30 | model.fit(self.X_train, self.Y_train, early_stopping_rounds=20, 31 | eval_metric="logloss", eval_set=eval_set, verbose=True) 32 | 33 | # Y_pred -> np.ndarray Y_train -> list 34 | Y_pred = model.predict(self.X_train) 35 | print('training score {}'.format(accuracy_score(self.Y_train, Y_pred))) 36 | Y_pred = 
model.predict(self.X_validate)
        print('validate score {}'.format(
            accuracy_score(self.Y_validate, Y_pred)))
        Y_pred = model.predict(self.X_test)
        print('test score {}'.format(accuracy_score(self.Y_test, Y_pred)))
        print(precision_recall_fscore_support(
            self.Y_test, Y_pred, average=None))

        print(np.shape(model.feature_importances_))
        self.key_ops = list(model.feature_importances_)
--------------------------------------------------------------------------------