# /docker_file/lr_dockerfile
############################################################
# Dockerfile for the LR (logistic regression) build environment
# (original comment said "MongoDB" — copy-paste error)
# Based on Ubuntu
############################################################

FROM ubuntu

# MAINTAINER is deprecated; LABEL is the supported replacement
LABEL maintainer="wuwei"

################## BEGIN INSTALLATION ######################
# Run update + install in a single layer so a cached `apt-get update`
# can never pair with a fresh install list (stale-index hazard),
# and clean the package lists to keep the image small.
RUN apt-get update && apt-get install -y \
        git \
        openssh-server \
        gcc \
        g++ \
        automake autoconf libtool make \
    && rm -rf /var/lib/apt/lists/*

#RUN /usr/sbin/sshd -D
##################### INSTALLATION END #####################

# /docker_file/xgboost_Dockerfile
############################################################
# Dockerfile for the xgboost build environment
# (original comment said "MongoDB" — copy-paste error)
# Based on CentOS 7
############################################################

FROM centos:centos7

LABEL maintainer="wuwei"

################## BEGIN INSTALLATION ######################
# Single layer: one yum transaction plus cache cleanup
RUN yum install -y \
        git \
        openssh-server \
        gcc gcc-c++ kernel-devel \
        automake autoconf libtool make \
    && yum clean all

# Sources are vendored into the build context instead of cloned at build
# time (the commented-out git clone / per-subdir make steps were superseded
# by build.sh, which builds dmlc-core and rabit itself).
#RUN mkdir -p /github/wormhole
COPY ./git_hub_store/xgboost /github/wormhole/xgboost

RUN cd /github/wormhole/xgboost/ && \
    sh build.sh

#RUN /usr/sbin/sshd -D
##################### INSTALLATION END #####################
# -*- coding: utf-8 -*-
"""Filter a libsvm-format training file, keeping only the features an LR model knows about.

Usage: python load_lr_model_filter.py <model_file> <input_file>
Writes the filtered lines to stdout.
"""

import sys


def dict_from_model(model_file):
    """Return {feature_id(str): 1} for every non-blank line of model_file.

    The feature id is the first tab-separated field of each line.
    (Original opened the file without a context manager, leaking the
    handle on errors; the dead `oldLine` dedup — never updated — is dropped.)
    """
    feature_ids = {}
    with open(model_file) as fh:
        for line in fh:
            if line.strip():
                feature_ids[line.strip().split("\t")[0]] = 1
    return feature_ids


def file_filter(train_file, feature_dict):
    """Print each line of train_file keeping only features present in feature_dict.

    Output format: '<label> id:1 id:1 ...' with ids sorted ascending.
    A line whose features are all filtered out prints as '<label> '
    (trailing space kept for byte-compatibility with the original output).
    """
    with open(train_file) as fh:
        for line in fh:
            if not line.strip():
                continue
            parts = line.strip().split(" ")
            label = parts[0]
            # keep a feature iff its (string) id is a key in feature_dict
            kept = sorted(
                int(feat.split(":")[0])
                for feat in parts[1:]
                if feature_dict.get(feat.split(":")[0], -1) > 0
            )
            print(label + " " + " ".join(str(fid) + ":1" for fid in kept))


if __name__ == "__main__":  # guard so importing the module has no side effects
    model_file = sys.argv[1]
    input_file = sys.argv[2]
    file_filter(input_file, dict_from_model(model_file))
# -*- coding: utf-8 -*-
"""Count per-column value frequencies of the criteo train.txt and build a feature-id file.

Usage: python criteo_stat.py <input_file> <out_file>
Produces '<out_file>.txt' with lines 'feature_id:<col>_<value>:count'.
"""

import os
import sys
import time

DISTANCE = 100000   # progress-report interval, in lines
CLS_NUM = 40        # total columns: label + 13 integer + 26 categorical
THRESHOLD = 1       # keep a value only if seen strictly more than this many times


def read_from_data(input_file, cls):
    """Count the distinct values of tab-separated column `cls` in input_file.

    Returns {'<cls>_<value>': count} for values whose count exceeds THRESHOLD.
    Consecutive duplicate lines are counted once (original de-dup behavior).
    """
    start = time.perf_counter()  # time.clock was removed in Python 3.8
    value_counts = {}
    prev_line = '0'
    line_count = 0
    with open(input_file) as fh:  # original handle was never closed
        for line in fh:
            if line.strip() and line != prev_line:
                value = line.split("\t")[cls]
                # dict.get replaces dict.has_key (removed in Python 3)
                value_counts[value] = value_counts.get(value, 0) + 1
                prev_line = line
                line_count += 1
                if line_count % DISTANCE == 0:
                    print("now have read %s lines" % line_count)
    kept = {}
    for value, cnt in value_counts.items():
        if cnt > THRESHOLD:  # drop rare values
            kept[str(cls) + "_" + str(value)] = cnt
    print("deal %s lines" % line_count)
    print("spent time:" + str(time.perf_counter() - start))
    return kept


def write_dict_to_file(output_file, input_dict):
    """Write input_dict as 'key:value' lines to output_file."""
    with open(output_file, 'w') as fh:
        for key in input_dict:
            fh.write(str(key) + ":" + str(input_dict[key]) + "\n")


def get_featue_id_file(out_file, input_file, max_cls):
    """Build '<out_file>.txt' with globally numbered 'id:cls_value:cnt' lines.

    Writes one intermediate stats file per column (columns 1..max_cls-1), shells
    out to cat/awk to concatenate and number them, then removes the
    intermediates.  (Function name keeps the original 'featue' typo so
    existing callers keep working.)
    """
    for col in range(1, max_cls):
        print(col)
        write_dict_to_file(out_file + "_" + str(col), read_from_data(input_file, col))
    # NOTE(review): out_file is interpolated into shell commands unquoted —
    # an attacker-controlled or unusual file name is a shell-injection risk;
    # kept as-is to preserve behavior, flagged for replacement with subprocess.
    command_str = "ls|grep \"" + out_file + "\"|xargs cat |awk '{printf(\"%d:%s\\n\",NR,$0)}'> " + out_file + ".txt"
    print(command_str)
    os.system(command_str)
    rm_command = "ls|grep \"" + out_file + "\"|grep -v \".txt\"|xargs rm"
    print(rm_command)
    os.system(rm_command)  # delete the per-column intermediate files


if __name__ == "__main__":  # guard so importing the module has no side effects
    input_file = sys.argv[1]
    out_file = sys.argv[2]
    get_featue_id_file(out_file, input_file, CLS_NUM)
# -*- coding: utf-8 -*-
"""One-hot encode criteo train/test data using a precomputed feature-id file.

Usage: python criteo_one_hot.py <feature_id_file> <train_file>
Writes encoded lines to stdout, e.g.:
    nohup python criteo_one_hot.py feature_id.txt train.txt > train_one_hot.txt &
"""

import sys

CLS_NUM = 40  # label column + 13 integer + 26 categorical columns


def dict_from_meta(dict_file):
    """Parse the feature-id file into {cls_value: hash_id}.

    File format, one entry per line: 'hash_id:cls_value:cnt', e.g. '1:1_44:9260'.
    hash_id is kept as a string, exactly as the original code did.
    """
    mapping = {}
    prev_line = '0'
    with open(dict_file) as fh:  # original handle was never closed
        for line in fh:
            if line.strip() and line != prev_line:  # skip blanks / consecutive dups
                parts = line.split(":")
                mapping[parts[1]] = parts[0]
                prev_line = line
    return mapping


def from_criteo_to_format(meta_dict, input_file):
    """Print each tab-separated input line as '<label> id:1 id:1 ...' (ids ascending).

    meta_dict maps '<col>_<value>' to the feature id assigned by criteo_stat;
    values missing from meta_dict are silently dropped.  Returns an empty dict,
    kept for interface compatibility with the original.
    """
    prev_line = '0'
    with open(input_file) as fh:
        for line in fh:
            if not line.strip() or line == prev_line:
                continue
            fields = line.split("\t")
            ids = {}
            for col in range(1, CLS_NUM):
                # BUG FIX: the original compared the (string) hash_id with
                # `> 0`, which is always true on Python 2 and a TypeError on
                # Python 3; test against the sentinel explicitly instead.
                hash_id = meta_dict.get(str(col) + "_" + fields[col], -1)
                if hash_id != -1:
                    ids[int(hash_id)] = 1
            out = fields[0] + " " + " ".join(str(i) + ":1" for i in sorted(ids))
            print(out.strip())
            prev_line = line
    return dict()


if __name__ == "__main__":  # guard so importing the module has no side effects
    dict_file = sys.argv[1]
    input_file = sys.argv[2]
    from_criteo_to_format(dict_from_meta(dict_file), input_file)
# -*- coding: utf-8 -*-
"""Parse an xgboost text model dump and append the reached-leaf features
(one new binary feature per leaf) to a one-hot libsvm file.

Usage: python xgboost_model_parser.py <feature_id_file> <model_file> <libsvm_file>
"""

import sys


def dict_from_model(model_file, max_feature_id):
    """Parse the xgboost dump into a list, indexed by tree id, of {node_id: content}.

    A 'booster[k]:' line opens tree k; every following 'node_id:content' line
    belongs to that tree.  Each leaf line gets ':new_feature=<id>' appended,
    with ids allocated sequentially starting at max_feature_id.

    model_file -- path to the text dump (xgboost dump_model output)
    max_feature_id -- largest feature id already used by the input data
    """
    trees = []
    current_tree_id = -1
    next_new_id = max_feature_id
    prev_line = '0'
    with open(model_file) as fh:
        for line in fh:
            if not line.strip() or line == prev_line:  # skip blanks / consecutive dups
                continue
            if line.startswith("booster["):            # start of a new tree
                current_tree_id = int(line.replace("booster[", "").replace("]:", ""))
                trees.insert(current_tree_id, {})
                continue
            parts = line.split(":")
            node_id = int(parts[0])
            content = parts[1]
            if "leaf=" in content:                     # allocate a fresh feature for this leaf
                content = content.replace("\n", "") + ":new_feature=" + str(next_new_id)
                next_new_id += 1
            trees[current_tree_id][node_id] = content.replace("\n", "")
            prev_line = line
    return trees


class Node:
    """One tree node.

    left_node_id  -- child taken when the tested feature is absent (yes/missing branch)
    right_node_id -- child taken when the tested feature is present
    feature_id    -- > 0: original input dimension tested at this node;
                     < 0: negated id of the new GBDT leaf feature
    """

    def __init__(self, left_node_id=0, right_node_id=0, feature_id=0):
        self.left_node_id = left_node_id
        self.right_node_id = right_node_id
        self.feature_id = feature_id


def list_from_dict(model_list_dict):
    """Convert the per-tree content dicts into per-tree {node_id: Node} dicts."""
    trees = []
    for tree_id in range(len(model_list_dict)):
        nodes = {}
        for node_id, content in model_list_dict[tree_id].items():
            if content.find("<") > 0:               # interior: "[fNN<thr] yes=a,no=b,missing=a"
                node = Node()
                node.feature_id = int(content.split("<")[0].split("[f")[1])
                node.left_node_id = int(content.split("yes=")[1].split(",")[0])
                node.right_node_id = int(content.split("no=")[1].split(",")[0])
                nodes[node_id] = node
            elif content.find("new_feature") > 0:   # leaf annotated by dict_from_model
                node = Node()
                node.feature_id = -int(content.split("new_feature=")[1])
                nodes[node_id] = node
        trees.insert(tree_id, nodes)
    return trees


def list_from_dict_test(model_list, tree_num, sampled_tree_id):
    """Debug helper: print every node of tree sampled_tree_id.

    BUG FIX: the original iterated `for tree_id in model_list`, which yields
    the per-tree dicts themselves, so `tree_id == sampled_tree_id` never
    matched and nothing was ever printed.  Iterate indices instead.
    (tree_num is unused; kept for call compatibility.)
    """
    for tree_id in range(len(model_list)):
        if tree_id == sampled_tree_id:
            for node_id in range(len(model_list[tree_id])):
                node = model_list[tree_id][node_id]
                print(str(tree_id), node_id,
                      str(node.left_node_id), str(node.right_node_id), str(node.feature_id))


def max_feature_id(feature_id_file):
    """Return the largest feature id (integer before the first ':') in the file.

    Expected line format: 'feature_id:cls_value:cnt', e.g. '1:1_44:9260'.
    """
    best = 0
    with open(feature_id_file) as fh:
        for line in fh:
            if line.strip():
                best = max(best, int(line.split(":")[0]))
    return best


def find_feature_id(line_feature_dict, tree_list):
    """Run one sample through one tree; return the reached leaf's new feature id.

    line_feature_dict -- {feature_id(int): 1} for the sample's active features
    tree_list -- one element of list_from_dict's result ({node_id: Node})
    Returns the positive new-feature id of the leaf reached, or 0 when no
    interior test ever matched (an all-missing path carries no information).
    """
    idx = 0
    reached = tree_list[idx].feature_id
    hits = 0
    while reached >= 0:  # leaves carry negative ids; stop when one is reached
        # BUG FIX: the original called .get() without a default, producing
        # None, and `None > 0` is a TypeError on Python 3 (silently False on
        # Python 2); default to 0 to make the "absent" branch explicit.
        if line_feature_dict.get(tree_list[idx].feature_id, 0) > 0:
            idx = tree_list[idx].right_node_id   # feature present -> go right
            hits += 1
        else:
            idx = tree_list[idx].left_node_id    # absent/missing -> go left
        reached = tree_list[idx].feature_id
    if hits == 0:
        reached = 0
    return reached * -1


def add_gdbt_feature(one_hot_libsvm_file, list_from_dict):
    """Print each libsvm line with the per-tree reached-leaf features appended.

    Only lines that gain at least one new feature are printed (original
    behavior, preserved).  BUG FIXES versus the original:
      * the tree loop ran `range(len - 1)`, silently skipping the last tree;
      * the new features were concatenated onto the line without a separating
        space, producing malformed libsvm output.
    (The parameter name `list_from_dict` shadows the function of the same
    name; kept for call compatibility.)
    """
    with open(one_hot_libsvm_file) as fh:
        for line in fh:
            if not line.strip():
                continue
            present = {}
            for item in line.strip().split(" ")[1:]:
                present[int(item.split(":")[0])] = 1  # existing features as dict keys
            additions = []
            for tree_id in range(len(list_from_dict)):
                new_id = find_feature_id(present, list_from_dict[tree_id])
                if new_id > 0:
                    additions.insert(0, str(new_id) + ":1")  # original prepended newest first
            if additions:
                print(line.replace("\n", "") + " " + " ".join(additions))


if __name__ == "__main__":  # guard so importing the module has no side effects
    feature_id_file = sys.argv[1]
    model_file = sys.argv[2]
    libsvm_file_path = sys.argv[3]

    max_id = max_feature_id(feature_id_file)  # original shadowed the function name here
    model_list = list_from_dict(dict_from_model(model_file, max_id))
    add_gdbt_feature(libsvm_file_path, model_list)
    #list_from_dict_test(model_list, 100, 93)