# /docker_file/lr_dockerfile
############################################################
# Dockerfile for the LR (logistic regression) build environment
# (original comment said "MongoDB" — copy-paste error)
# Based on Ubuntu
############################################################

FROM ubuntu

# MAINTAINER is deprecated; LABEL is the supported replacement
LABEL maintainer="wuwei"

################## BEGIN INSTALLATION ######################
# Run update + install in a single layer so a cached `apt-get update`
# can never pair with a fresh install list (stale-index hazard),
# and clean the package lists to keep the image small.
RUN apt-get update && apt-get install -y \
        git \
        openssh-server \
        gcc \
        g++ \
        automake autoconf libtool make \
    && rm -rf /var/lib/apt/lists/*

#RUN /usr/sbin/sshd -D
##################### INSTALLATION END #####################

# /docker_file/xgboost_Dockerfile
############################################################
# Dockerfile for the xgboost build environment
# (original comment said "MongoDB" — copy-paste error)
# Based on CentOS 7
############################################################

FROM centos:centos7

LABEL maintainer="wuwei"

################## BEGIN INSTALLATION ######################
# Single layer: one yum transaction plus cache cleanup
RUN yum install -y \
        git \
        openssh-server \
        gcc gcc-c++ kernel-devel \
        automake autoconf libtool make \
    && yum clean all

# Sources are vendored into the build context instead of cloned at build
# time (the commented-out git clone / per-subdir make steps were superseded
# by build.sh, which builds dmlc-core and rabit itself).
#RUN mkdir -p /github/wormhole
COPY ./git_hub_store/xgboost /github/wormhole/xgboost

RUN cd /github/wormhole/xgboost/ && \
    sh build.sh

#RUN /usr/sbin/sshd -D
##################### INSTALLATION END #####################
# -*- coding: utf-8 -*-
"""Filter a libsvm-format training file, keeping only the features an LR model knows about.

Usage: python load_lr_model_filter.py <model_file> <input_file>
Writes the filtered lines to stdout.
"""

import sys


def dict_from_model(model_file):
    """Return {feature_id(str): 1} for every non-blank line of model_file.

    The feature id is the first tab-separated field of each line.
    (Original opened the file without a context manager, leaking the
    handle on errors; the dead `oldLine` dedup — never updated — is dropped.)
    """
    feature_ids = {}
    with open(model_file) as fh:
        for line in fh:
            if line.strip():
                feature_ids[line.strip().split("\t")[0]] = 1
    return feature_ids


def file_filter(train_file, feature_dict):
    """Print each line of train_file keeping only features present in feature_dict.

    Output format: '<label> id:1 id:1 ...' with ids sorted ascending.
    A line whose features are all filtered out prints as '<label> '
    (trailing space kept for byte-compatibility with the original output).
    """
    with open(train_file) as fh:
        for line in fh:
            if not line.strip():
                continue
            parts = line.strip().split(" ")
            label = parts[0]
            # keep a feature iff its (string) id is a key in feature_dict
            kept = sorted(
                int(feat.split(":")[0])
                for feat in parts[1:]
                if feature_dict.get(feat.split(":")[0], -1) > 0
            )
            print(label + " " + " ".join(str(fid) + ":1" for fid in kept))


if __name__ == "__main__":  # guard so importing the module has no side effects
    model_file = sys.argv[1]
    input_file = sys.argv[2]
    file_filter(input_file, dict_from_model(model_file))
# -*- coding: utf-8 -*-
"""Count per-column value frequencies of the criteo train.txt and build a feature-id file.

Usage: python criteo_stat.py <input_file> <out_file>
Produces '<out_file>.txt' with lines 'feature_id:<col>_<value>:count'.
"""

import os
import sys
import time

DISTANCE = 100000   # progress-report interval, in lines
CLS_NUM = 40        # total columns: label + 13 integer + 26 categorical
THRESHOLD = 1       # keep a value only if seen strictly more than this many times


def read_from_data(input_file, cls):
    """Count the distinct values of tab-separated column `cls` in input_file.

    Returns {'<cls>_<value>': count} for values whose count exceeds THRESHOLD.
    Consecutive duplicate lines are counted once (original de-dup behavior).
    """
    start = time.perf_counter()  # time.clock was removed in Python 3.8
    value_counts = {}
    prev_line = '0'
    line_count = 0
    with open(input_file) as fh:  # original handle was never closed
        for line in fh:
            if line.strip() and line != prev_line:
                value = line.split("\t")[cls]
                # dict.get replaces dict.has_key (removed in Python 3)
                value_counts[value] = value_counts.get(value, 0) + 1
                prev_line = line
                line_count += 1
                if line_count % DISTANCE == 0:
                    print("now have read %s lines" % line_count)
    kept = {}
    for value, cnt in value_counts.items():
        if cnt > THRESHOLD:  # drop rare values
            kept[str(cls) + "_" + str(value)] = cnt
    print("deal %s lines" % line_count)
    print("spent time:" + str(time.perf_counter() - start))
    return kept


def write_dict_to_file(output_file, input_dict):
    """Write input_dict as 'key:value' lines to output_file."""
    with open(output_file, 'w') as fh:
        for key in input_dict:
            fh.write(str(key) + ":" + str(input_dict[key]) + "\n")


def get_featue_id_file(out_file, input_file, max_cls):
    """Build '<out_file>.txt' with globally numbered 'id:cls_value:cnt' lines.

    Writes one intermediate stats file per column (columns 1..max_cls-1), shells
    out to cat/awk to concatenate and number them, then removes the
    intermediates.  (Function name keeps the original 'featue' typo so
    existing callers keep working.)
    """
    for col in range(1, max_cls):
        print(col)
        write_dict_to_file(out_file + "_" + str(col), read_from_data(input_file, col))
    # NOTE(review): out_file is interpolated into shell commands unquoted —
    # an attacker-controlled or unusual file name is a shell-injection risk;
    # kept as-is to preserve behavior, flagged for replacement with subprocess.
    command_str = "ls|grep \"" + out_file + "\"|xargs cat |awk '{printf(\"%d:%s\\n\",NR,$0)}'> " + out_file + ".txt"
    print(command_str)
    os.system(command_str)
    rm_command = "ls|grep \"" + out_file + "\"|grep -v \".txt\"|xargs rm"
    print(rm_command)
    os.system(rm_command)  # delete the per-column intermediate files


if __name__ == "__main__":  # guard so importing the module has no side effects
    input_file = sys.argv[1]
    out_file = sys.argv[2]
    get_featue_id_file(out_file, input_file, CLS_NUM)
# -*- coding: utf-8 -*-
"""One-hot encode criteo train/test data using a precomputed feature-id file.

Usage: python criteo_one_hot.py <feature_id_file> <train_file>
Writes encoded lines to stdout, e.g.:
    nohup python criteo_one_hot.py feature_id.txt train.txt > train_one_hot.txt &
"""

import sys

CLS_NUM = 40  # label column + 13 integer + 26 categorical columns


def dict_from_meta(dict_file):
    """Parse the feature-id file into {cls_value: hash_id}.

    File format, one entry per line: 'hash_id:cls_value:cnt', e.g. '1:1_44:9260'.
    hash_id is kept as a string, exactly as the original code did.
    """
    mapping = {}
    prev_line = '0'
    with open(dict_file) as fh:  # original handle was never closed
        for line in fh:
            if line.strip() and line != prev_line:  # skip blanks / consecutive dups
                parts = line.split(":")
                mapping[parts[1]] = parts[0]
                prev_line = line
    return mapping


def from_criteo_to_format(meta_dict, input_file):
    """Print each tab-separated input line as '<label> id:1 id:1 ...' (ids ascending).

    meta_dict maps '<col>_<value>' to the feature id assigned by criteo_stat;
    values missing from meta_dict are silently dropped.  Returns an empty dict,
    kept for interface compatibility with the original.
    """
    prev_line = '0'
    with open(input_file) as fh:
        for line in fh:
            if not line.strip() or line == prev_line:
                continue
            fields = line.split("\t")
            ids = {}
            for col in range(1, CLS_NUM):
                # BUG FIX: the original compared the (string) hash_id with
                # `> 0`, which is always true on Python 2 and a TypeError on
                # Python 3; test against the sentinel explicitly instead.
                hash_id = meta_dict.get(str(col) + "_" + fields[col], -1)
                if hash_id != -1:
                    ids[int(hash_id)] = 1
            out = fields[0] + " " + " ".join(str(i) + ":1" for i in sorted(ids))
            print(out.strip())
            prev_line = line
    return dict()


if __name__ == "__main__":  # guard so importing the module has no side effects
    dict_file = sys.argv[1]
    input_file = sys.argv[2]
    from_criteo_to_format(dict_from_meta(dict_file), input_file)
# -*- coding: utf-8 -*-
"""Parse an xgboost text model dump and append the reached-leaf features
(one new binary feature per leaf) to a one-hot libsvm file.

Usage: python xgboost_model_parser.py <feature_id_file> <model_file> <libsvm_file>
"""

import sys


def dict_from_model(model_file, max_feature_id):
    """Parse the xgboost dump into a list, indexed by tree id, of {node_id: content}.

    A 'booster[k]:' line opens tree k; every following 'node_id:content' line
    belongs to that tree.  Each leaf line gets ':new_feature=<id>' appended,
    with ids allocated sequentially starting at max_feature_id.

    model_file -- path to the text dump (xgboost dump_model output)
    max_feature_id -- largest feature id already used by the input data
    """
    trees = []
    current_tree_id = -1
    next_new_id = max_feature_id
    prev_line = '0'
    with open(model_file) as fh:
        for line in fh:
            if not line.strip() or line == prev_line:  # skip blanks / consecutive dups
                continue
            if line.startswith("booster["):            # start of a new tree
                current_tree_id = int(line.replace("booster[", "").replace("]:", ""))
                trees.insert(current_tree_id, {})
                continue
            parts = line.split(":")
            node_id = int(parts[0])
            content = parts[1]
            if "leaf=" in content:                     # allocate a fresh feature for this leaf
                content = content.replace("\n", "") + ":new_feature=" + str(next_new_id)
                next_new_id += 1
            trees[current_tree_id][node_id] = content.replace("\n", "")
            prev_line = line
    return trees


class Node:
    """One tree node.

    left_node_id  -- child taken when the tested feature is absent (yes/missing branch)
    right_node_id -- child taken when the tested feature is present
    feature_id    -- > 0: original input dimension tested at this node;
                     < 0: negated id of the new GBDT leaf feature
    """

    def __init__(self, left_node_id=0, right_node_id=0, feature_id=0):
        self.left_node_id = left_node_id
        self.right_node_id = right_node_id
        self.feature_id = feature_id


def list_from_dict(model_list_dict):
    """Convert the per-tree content dicts into per-tree {node_id: Node} dicts."""
    trees = []
    for tree_id in range(len(model_list_dict)):
        nodes = {}
        for node_id, content in model_list_dict[tree_id].items():
            if content.find("<") > 0:               # interior: "[fNN<thr] yes=a,no=b,missing=a"
                node = Node()
                node.feature_id = int(content.split("<")[0].split("[f")[1])
                node.left_node_id = int(content.split("yes=")[1].split(",")[0])
                node.right_node_id = int(content.split("no=")[1].split(",")[0])
                nodes[node_id] = node
            elif content.find("new_feature") > 0:   # leaf annotated by dict_from_model
                node = Node()
                node.feature_id = -int(content.split("new_feature=")[1])
                nodes[node_id] = node
        trees.insert(tree_id, nodes)
    return trees


def list_from_dict_test(model_list, tree_num, sampled_tree_id):
    """Debug helper: print every node of tree sampled_tree_id.

    BUG FIX: the original iterated `for tree_id in model_list`, which yields
    the per-tree dicts themselves, so `tree_id == sampled_tree_id` never
    matched and nothing was ever printed.  Iterate indices instead.
    (tree_num is unused; kept for call compatibility.)
    """
    for tree_id in range(len(model_list)):
        if tree_id == sampled_tree_id:
            for node_id in range(len(model_list[tree_id])):
                node = model_list[tree_id][node_id]
                print(str(tree_id), node_id,
                      str(node.left_node_id), str(node.right_node_id), str(node.feature_id))


def max_feature_id(feature_id_file):
    """Return the largest feature id (integer before the first ':') in the file.

    Expected line format: 'feature_id:cls_value:cnt', e.g. '1:1_44:9260'.
    """
    best = 0
    with open(feature_id_file) as fh:
        for line in fh:
            if line.strip():
                best = max(best, int(line.split(":")[0]))
    return best


def find_feature_id(line_feature_dict, tree_list):
    """Run one sample through one tree; return the reached leaf's new feature id.

    line_feature_dict -- {feature_id(int): 1} for the sample's active features
    tree_list -- one element of list_from_dict's result ({node_id: Node})
    Returns the positive new-feature id of the leaf reached, or 0 when no
    interior test ever matched (an all-missing path carries no information).
    """
    idx = 0
    reached = tree_list[idx].feature_id
    hits = 0
    while reached >= 0:  # leaves carry negative ids; stop when one is reached
        # BUG FIX: the original called .get() without a default, producing
        # None, and `None > 0` is a TypeError on Python 3 (silently False on
        # Python 2); default to 0 to make the "absent" branch explicit.
        if line_feature_dict.get(tree_list[idx].feature_id, 0) > 0:
            idx = tree_list[idx].right_node_id   # feature present -> go right
            hits += 1
        else:
            idx = tree_list[idx].left_node_id    # absent/missing -> go left
        reached = tree_list[idx].feature_id
    if hits == 0:
        reached = 0
    return reached * -1


def add_gdbt_feature(one_hot_libsvm_file, list_from_dict):
    """Print each libsvm line with the per-tree reached-leaf features appended.

    Only lines that gain at least one new feature are printed (original
    behavior, preserved).  BUG FIXES versus the original:
      * the tree loop ran `range(len - 1)`, silently skipping the last tree;
      * the new features were concatenated onto the line without a separating
        space, producing malformed libsvm output.
    (The parameter name `list_from_dict` shadows the function of the same
    name; kept for call compatibility.)
    """
    with open(one_hot_libsvm_file) as fh:
        for line in fh:
            if not line.strip():
                continue
            present = {}
            for item in line.strip().split(" ")[1:]:
                present[int(item.split(":")[0])] = 1  # existing features as dict keys
            additions = []
            for tree_id in range(len(list_from_dict)):
                new_id = find_feature_id(present, list_from_dict[tree_id])
                if new_id > 0:
                    additions.insert(0, str(new_id) + ":1")  # original prepended newest first
            if additions:
                print(line.replace("\n", "") + " " + " ".join(additions))


if __name__ == "__main__":  # guard so importing the module has no side effects
    feature_id_file = sys.argv[1]
    model_file = sys.argv[2]
    libsvm_file_path = sys.argv[3]

    max_id = max_feature_id(feature_id_file)  # original shadowed the function name here
    model_list = list_from_dict(dict_from_model(model_file, max_id))
    add_gdbt_feature(libsvm_file_path, model_list)
    #list_from_dict_test(model_list, 100, 93)