├── CookieLibrary.py
├── README.md
├── Variables.py
├── VariablesTST.py
├── predict.py
└── train.py
/CookieLibrary.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2015 Roberto Diaz Morales
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
 14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 |
17 | import csv
18 | import numpy as np
19 | import re
20 | from collections import Counter,defaultdict
21 | import os
22 | import inspect
23 | import sys
24 | import sklearn
25 | from sklearn import cross_validation
26 | import pickle
27 | import xgboost as xgb
28 |
29 |
30 |
31 | #######################################################################################
32 | # THIS FUNCTION PARSES THE FILES WITH THE INFORMATION ABOUT DEVICES AND COOKIES #
33 | # AND CREATES LISTS WITH THE IDENTIFIERS OF THE CATEGORICAL FEATURES. #
34 | # THE INDEX OF THE LIST WILL BE USED AS THE VALUE OF THE FEATURE IN THE NUMPY MATRICES#
35 | #######################################################################################
36 |
def GetIdentifiers(trainfile, testfile, cookiefile):
    """Parse the device (train/test) and cookie CSV files and collect the
    unique identifiers of every categorical feature.

    The position of each identifier inside its returned list is later used
    as the numerical value of that feature in the numpy matrices (see
    list2Dict).

    Parameters
    ----------
    trainfile, testfile : str
        Device CSV files.  Columns used: 0=handle, 1=device id,
        2=device type, 3=device OS, 4=country, 6/7=anonymous categories.
    cookiefile : str
        Cookie CSV file.  Columns used: 0=handle, 1=cookie id,
        2=computer OS, 3=computer version, 4=country, 6/7=anonymous cats.

    Returns
    -------
    tuple of lists
        (DeviceList, CookieList, HandleList, DevTypeList, DevOsList,
         ComputerOsList, ComputerVList, CountryList, annC1List, annC2List)
    """
    # Accumulate directly into sets: the original appended to lists and
    # re-ran list(set(...)) on every list after each of the three files,
    # which was redundant work (CookieList was even "deduplicated" twice
    # before any cookie had been added).
    devices, cookies, handles = set(), set(), set()
    devtypes, devos = set(), set()
    computeros, computerv = set(), set()
    countries, annc1, annc2 = set(), set(), set()

    # The two device files share the same layout.
    for devicefile in (trainfile, testfile):
        with open(devicefile, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header; py2-only reader.next() replaced
            for row in reader:
                handles.add(row[0])
                devices.add(row[1])
                devtypes.add(row[2])
                devos.add(row[3])
                countries.add(row[4])
                annc1.add(row[6])
                annc2.add(row[7])

    with open(cookiefile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header
        for row in reader:
            handles.add(row[0])
            cookies.add(row[1])
            computeros.add(row[2])
            computerv.add(row[3])
            countries.add(row[4])
            annc1.add(row[6])
            annc2.add(row[7])

    # Convert back to lists: list2Dict() assigns each identifier the index
    # it has here.  Order is arbitrary, exactly as with the original
    # list(set(...)) round-trips.
    return (list(devices), list(cookies), list(handles), list(devtypes),
            list(devos), list(computeros), list(computerv), list(countries),
            list(annc1), list(annc2))
118 |
119 |
120 | ############################################################################################################
121 | # THIS FUNCTION RECEIVES A LIST AND CREATES A DICTIONARY TO GET THE INDEX WHEN THE VALUE IS GIVEN AS A KEY #
122 | ############################################################################################################
123 |
def list2Dict(lista):
    """Map each element of *lista* to its index.

    Given a list of identifiers, return a dict whose keys are the elements
    and whose values their positions, so a categorical text value can be
    translated to the numerical code used in the numpy matrices.
    """
    # enumerate() replaces the manual range(len(...)) index loop.
    return {value: index for index, value in enumerate(lista)}
129 |
130 |
131 | ##############################################################################
132 | # THIS FUNCTION CREATES A NUMPY MATRIX WITH THE INFORMATION OF A DEVICE FILE #
133 | ##############################################################################
134 |
def loadDevices(trainfile, DictHandle, DictDevice, DictDevType, DictDevOs,
                DictCountry, DictAnnC1, DictAnnC2):
    """Load a device CSV file into a numpy matrix.

    Each categorical column is translated to its numerical code with the
    corresponding dictionary; columns 5 and 8-10 are already numeric.

    Returns
    -------
    numpy.ndarray of shape (n_devices, 11)
    """
    rows = []
    with open(trainfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header; py2-only reader.next() replaced
        # Single pass: the original read the whole file twice, once only to
        # count the rows and once to fill the matrix.
        for row in reader:
            rows.append([DictHandle[row[0]],
                         DictDevice[row[1]],
                         DictDevType[row[2]],
                         DictDevOs[row[3]],
                         DictCountry[row[4]],
                         float(row[5]),   # np.float_ was removed in NumPy 2.0
                         DictAnnC1[row[6]],
                         DictAnnC2[row[7]],
                         float(row[8]),
                         float(row[9]),
                         float(row[10])])
    # reshape keeps the (0, 11) shape for an empty file, matching np.zeros.
    return np.asarray(rows, dtype=float).reshape(-1, 11)
166 |
167 |
168 | ##############################################################################
169 | # THIS FUNCTION CREATES A NUMPY MATRIX WITH THE INFORMATION OF A COOKIE FILE #
170 | ##############################################################################
171 |
def loadCookies(cookiefile, DictHandle, DictCookie, DictComputerOs,
                DictComputerV, DictCountry, DictAnnC1, DictAnnC2):
    """Load the cookie CSV file into a numpy matrix.

    Unlike loadDevices, the row index of the returned matrix is the
    numerical code of the cookie itself (DictCookie value), so the matrix
    can be indexed directly by cookie code; unreferenced rows stay zero.

    Returns
    -------
    numpy.ndarray of shape (max cookie code + 1, 11)
    """
    # Builtin max() works on dict views in both py2 and py3 (np.max on a
    # py3 dict_values object does not); np.int/np.float_ were removed in
    # NumPy 1.24/2.0, so builtin int()/float() are used instead.
    maxindex = int(max(DictCookie.values()))

    XCookies = np.zeros((maxindex + 1, 11))

    with open(cookiefile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header; py2-only reader.next() replaced
        for row in reader:
            fila = int(DictCookie[row[1]])  # destination row = cookie code
            XCookies[fila, 0] = DictHandle[row[0]]
            XCookies[fila, 1] = DictCookie[row[1]]
            XCookies[fila, 2] = DictComputerOs[row[2]]
            XCookies[fila, 3] = DictComputerV[row[3]]
            XCookies[fila, 4] = DictCountry[row[4]]
            XCookies[fila, 5] = float(row[5])
            XCookies[fila, 6] = DictAnnC1[row[6]]
            XCookies[fila, 7] = DictAnnC2[row[7]]
            XCookies[fila, 8] = float(row[8])
            XCookies[fila, 9] = float(row[9])
            XCookies[fila, 10] = float(row[10])

    return XCookies
196 |
197 | ####################################################################################################
198 | # THIS FUNCTION CREATES A DICTIONARY WHERE THE KEYS ARE THE IP ADDRESSES OF THE IP AGGREGATED FILE #
199 | # AND THE VALUE A NUMPY ARRAY WITH ITS INFORMATION. #
200 | ####################################################################################################
201 |
def loadIPAGG(ipaggfile):
    """Load the aggregated-IP CSV file.

    Returns a dict mapping each IP address (column 0) to a numpy array with
    its five aggregate statistics (columns 1-5).
    """
    XIPS = dict()

    with open(ipaggfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header; py2-only reader.next() replaced
        for row in reader:
            # np.float_ was removed in NumPy 2.0; building the 5-element
            # vector in one call replaces the element-by-element fill.
            XIPS[row[0]] = np.array(row[1:6], dtype=float)

    return XIPS
221 |
222 | #####################################################################
223 | # THIS FUNCTION CREATES A DICTIONARY WHERE THE KEYS ARE THE DEVICES #
224 | # AND THE VALUE DICTIONARY OF THE PROPERTIES AND ITS INFORMATION #
225 | #####################################################################
226 |
def loadPROPS(fileprops, DictDevice, DictCookie):
    """Parse the properties file and build a per-device property dict.

    Only lines whose second field is '0' (device lines) and whose identifier
    is present in DictDevice are kept.

    Returns
    -------
    dict : device code -> {property name: numeric value}
    """
    DevProps = dict()

    with open(fileprops) as fp:
        fp.readline()  # skip header

        for line in fp:
            matchObj = re.match(r'([a-zA-Z0-9_]*),([0-9\-]*),{([(a-zA-Z0-9.(),\-_]*)}',
                                line, flags=0)
            # Guard: the original raised AttributeError on any line the
            # regex did not match; such lines are now skipped.
            if matchObj is None:
                continue

            if matchObj.group(2) == '0':
                # group(3) holds '(name,value),(name,value),...'
                props = re.findall(r'\((.*?)\)', matchObj.group(3))
                ValProps = dict()
                for prop in props:
                    propV = prop.split(',')
                    # np.float_ was removed in NumPy 2.0.
                    ValProps[propV[0]] = float(propV[1])
                Devic = DictDevice.get(matchObj.group(1), -1)
                if Devic > -1:
                    DevProps[Devic] = ValProps

    return DevProps
249 |
250 | #################################################################################################
251 | # THIS FUNCTION CREATES: #
252 | # A DICTIONARY WHERE THE KEYS ARE THE DEVICES OF THE TRAINING SET AND THE VALUES THEIR COOKIES #
253 | # A DICTIONARY WHERE THE KEYS ARE THE COOKIES AND THE VALUES OTHER COOKIES WITH THE SAME HANDLE #
254 | # A DICTIONARY WHERE THE KEYS ARE THE COOKIES AND THE VALUES THE DEVICES WITH THE SAME HANDLE #
255 | #################################################################################################
256 |
def creatingLabels(XDevices, XCookies, DictHandle):
    """Build the supervised-learning lookup structures from the handles.

    Returns a 3-tuple of dicts:
      - Labels: device code -> set of cookie codes sharing its handle
      - Groups: cookie code -> set of cookies with the same handle (a cookie
        with the unknown handle forms a singleton group of itself)
      - WhosDevice: cookie code -> set of devices with the same handle
    """
    HDC = dict()
    unknown = DictHandle['-1']  # numerical code of the 'unknown handle' marker

    # One entry per known handle present in the cookie matrix.
    for handle in np.unique(XCookies[:, 0]):
        if handle != unknown:
            HDC[handle] = {'Devices': set(), 'Cookies': set()}

    # Attach every device to its handle.  As in the original, a device whose
    # handle never appears in XCookies raises KeyError here.
    (NDevices, _) = XDevices.shape
    for i in range(NDevices):
        HDC[XDevices[i, 0]]['Devices'].add(XDevices[i, 1])

    # Attach every known-handle cookie to its handle.
    (NCookies, _) = XCookies.shape
    for i in range(NCookies):
        if XCookies[i, 0] != unknown:
            HDC[XCookies[i, 0]]['Cookies'].add(XCookies[i, 1])

    Labels = dict()
    Groups = dict()
    WhosDevice = dict()

    # dict.items() replaces the py2-only iteritems().
    for k, v in HDC.items():
        for dev in v['Devices']:
            Labels[dev] = v['Cookies']
        for coo in v['Cookies']:
            Groups[coo] = v['Cookies']
            WhosDevice[coo] = v['Devices']

    # Cookies with the unknown handle get a singleton group of themselves.
    for i in range(NCookies):
        if XCookies[i, 0] == unknown:
            name = XCookies[i, 1]
            Groups[name] = {name}

    return (Labels, Groups, WhosDevice)
300 |
301 | ############################################################################
302 | # THIS FUNCTION EVALUATES THE F05 SCORE ON THE RESULTS OF A VALIDATION SET #
303 | ############################################################################
304 |
def calculateF05(Results, Target):
    """Mean F0.5 score of a validation run.

    Parameters
    ----------
    Results, Target : dict
        Both map a device key to a set of cookies (predicted / true).
        Every key of Results must be present in Target.

    Returns
    -------
    float : F0.5 averaged over all devices in Results.
    """
    BetaQ = 0.5 * 0.5  # beta squared; beta=0.5 weights precision over recall

    F05 = list()

    for k in Results.keys():
        pos = Results[k]
        tla = Target[k]

        tp = float(len(pos & tla))
        fp = float(len(pos)) - tp
        fn = float(len(tla)) - tp
        # The original divided with numpy scalars, where 0/0 yields nan and
        # the subsequent p*r>0 test mapped it to 0.  With plain floats
        # (np.float_ was removed in NumPy 2.0) the zero denominators must be
        # guarded explicitly; tp>0 implies both denominators are positive
        # and produces the same scores as before.
        if tp > 0.0:
            p = tp / (tp + fp)
            r = tp / (tp + fn)
            f = (1.0 + BetaQ) * p * r / (BetaQ * p + r)
        else:
            f = 0.0
        F05.append(f)
    return np.mean(F05)
326 |
327 | #################################################
328 | # THIS FUNCTION CREATES THE DATA STRUCTURES TO: #
329 | # FIND THE IP ADDRESSES OF EVERY DEVICE #
330 | # FIND THE IP ADDRESSES OF EVERY COOKIE #
331 | # FIND THE DEVICES OF EVERY IP ADDRESS #
 332 | # FIND THE COOKIES OF EVERY IP ADDRESS           #
333 | #################################################
334 |
def loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups):
    """Parse the per-entity IP file and build four lookup structures.

    Returns (IPDev, IPCoo, DeviceIPS, CookieIPS):
      - DeviceIPS: device -> {ip: 11-element feature array}
      - CookieIPS: cookie code -> {ip: 11-element feature array}
      - IPDev: ip -> set of devices seen on it
      - IPCoo: ip -> set of cookie codes seen on it

    Each feature array holds the 6 per-(entity, ip) counters from this file
    followed by the 5 per-ip aggregate statistics taken from XIPS.  After
    parsing, cookies that share a handle (same Groups entry) also inherit
    each other's IP entries.
    """

    DeviceIPS=dict()
    CookieIPS=dict()
    IPDev=defaultdict(set)
    IPCoo=defaultdict(set)

    with open(ipfile) as fp:
        fp.readline()  # skip header

        for line in fp:
            # Line layout: '<id>,<type>,{(ip,c1,...,c6),(ip,...),...}' where
            # type '0' means device, anything else means cookie.
            # NOTE(review): a line that does not match the pattern raises
            # AttributeError on matchObj.group below — presumably every
            # data line matches; confirm against the input files.
            matchObj = re.match( r'([a-zA-Z0-9_]*),([0-9\-]*),{([(a-zA-Z0-9(),\-_]*)}', line, flags = 0)
            ips = re.findall(r'(\w*,\w*,\w*,\w*,\w*,\w*,\w*)',matchObj.group(3))

            ValIPS=dict()
            for ip in ips:
                Indiv = ip.split(',')
                # 6 per-entity counters followed by the 5 aggregate
                # statistics of this IP (from loadIPAGG).
                arr=np.zeros(11)
                arr[0]=np.float_(Indiv[1])
                arr[1]=np.float_(Indiv[2])
                arr[2]=np.float_(Indiv[3])
                arr[3]=np.float_(Indiv[4])
                arr[4]=np.float_(Indiv[5])
                arr[5]=np.float_(Indiv[6])
                dIP=XIPS[Indiv[0]]
                arr[6]=np.float_(dIP[0])
                arr[7]=np.float_(dIP[1])
                arr[8]=np.float_(dIP[2])
                arr[9]=np.float_(dIP[3])
                arr[10]=np.float_(dIP[4])

                ValIPS[Indiv[0]]=arr

            if(matchObj.group(2)=='0'):
                # Device line: key by numerical code when the identifier is
                # known, otherwise by the raw identifier string.
                Device=DictDevice.get(matchObj.group(1),-1)
                if Device>-1:
                    DeviceIPS[Device]=ValIPS
                    for k in ValIPS.keys():
                        IPDev[k].add(Device)
                else:
                    DeviceIPS[matchObj.group(1)]=ValIPS
                    for k in ValIPS.keys():
                        IPDev[k].add(matchObj.group(1))


            else:
                # Cookie line: cookies are always expected in DictCookie
                # (a missing one raises KeyError).
                Cookie=DictCookie[matchObj.group(1)]
                CookieIPS[Cookie]=ValIPS
                for k in ValIPS.keys():
                    IPCoo[k].add(Cookie)



    # Propagate IPs among cookies of the same handle: every cookie in a
    # group inherits the IP entries (and IPCoo membership) of its mates.
    for k,v in Groups.iteritems():
        if len(v)>1:
            for cook1 in v:
                for cook2 in v:
                    if cook1 != cook2:
                        d1=CookieIPS[cook1]
                        d2=CookieIPS[cook2]
                        for n1,n2 in d1.iteritems():
                            if n1 not in d2.keys():
                                d2[n1]=n2
                                IPCoo[n1].add(cook2)

    return (IPDev,IPCoo,DeviceIPS,CookieIPS)
401 |
402 | ################################################################################
403 | # THIS FUNCTION FOR A GIVEN DEVICE CREATES: #
404 | # A SET OF COOKIES WITH KNOWN HANDLE THAT SHARE IP ADDRESSES WITH THE DEVICE #
405 | # A SET OF COOKIES WITH UNKNOWN HANDLE THAT SHARE IP ADDRESSES WITH THE DEVICE #
406 | ################################################################################
407 |
def fullCandidates(device, XDevices, XCookies, IPDev, IPCoo, DeviceIPS, DictHandle):
    """For one device, collect every cookie that shares an IP address with it.

    Returns a pair of single-entry dicts ({device: set}, {device: set}):
    cookies whose handle is known, and cookies whose handle is unknown.
    The first pass only considers IPs used by at most 30 devices; if that
    yields no known-handle cookie, every IP of the device is considered.
    """
    Unknown = DictHandle['-1']

    candidatestotalKnown = set()
    candidatestotalUnknown = set()

    def _collect(ip_list):
        # Classify each candidate cookie on the given IPs by its handle.
        for ip in ip_list:
            for candidate in IPCoo[ip]:
                # np.int was removed from NumPy; builtin int() is equivalent.
                if XCookies[int(candidate), 0] != Unknown:
                    candidatestotalKnown.add(candidate)
                else:
                    candidatestotalUnknown.add(candidate)

    ips = DeviceIPS[device].keys()

    # First pass: ignore very popular IPs (shared by more than 30 devices).
    _collect([ip for ip in ips if len(IPDev.get(ip, set())) <= 30])

    # Fallback: when no known-handle candidate was found, use every IP.
    if len(candidatestotalKnown) == 0:
        _collect(ips)

    CandidatesKnown = {device: candidatestotalKnown}
    CandidatesUnknown = {device: candidatestotalUnknown}

    return (CandidatesKnown, CandidatesUnknown)
443 |
444 | ###############################################################################
445 | # THIS FUNCTION CREATES THE INITIAL SELECTION OF CANDIDATES FOR EVERY DEVICE #
446 | ###############################################################################
447 |
def selectCandidates(XDevices, XCookies, IPDev, IPCoo, DeviceIPS, CookieIPS, DictHandle):
    """Initial candidate-cookie selection for every device.

    For each device, progressively looser filters are tried until at least
    one candidate is found:
      1. IPs shared by <=10 devices and <=20 cookies, known-handle cookies;
      2. IPs shared by <=25 devices and <=50 cookies, known-handle cookies;
      3. every IP, known-handle cookies only;
      4. every IP, every cookie.

    Returns a dict: device code -> set of candidate cookie codes.
    """
    Unknown = DictHandle['-1']
    inf = float('inf')

    def _gather(ips, maxdev, maxcoo, known_only):
        # One filtering tier: IP popularity limits plus optional handle check.
        found = set()
        for ip in ips:
            if len(IPDev.get(ip, set())) <= maxdev and len(IPCoo.get(ip, set())) <= maxcoo:
                for candidate in IPCoo[ip]:
                    # np.int was removed from NumPy; builtin int() is equivalent.
                    if (not known_only) or XCookies[int(candidate), 0] != Unknown:
                        found.add(candidate)
        return found

    # The four tiers of the original copy-pasted cascade, in order.
    tiers = ((10, 20, True), (25, 50, True), (inf, inf, True), (inf, inf, False))

    Candidates = dict()
    for device in np.unique(XDevices[:, 1]):
        ips = DeviceIPS[device].keys()

        candidatestotal = set()
        for maxdev, maxcoo, known_only in tiers:
            candidatestotal = _gather(ips, maxdev, maxcoo, known_only)
            if candidatestotal:
                break

        Candidates[device] = candidatestotal

    return Candidates
494 |
495 |
496 | ###########################################
 497 | # THIS CREATES THE TRAINING OR TEST SET   #
498 | ###########################################
499 |
500 |
def createDataSet(Candidates,XDevice,XCookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProps):
    """Build the feature matrix for every (device, candidate cookie) pair.

    For each device k in Candidates and each of its candidate cookies, one
    feature row is built from: the device's columns 2-10, the cookie's
    columns 2-10, group/IP/property overlap statistics, and the summed and
    mean per-IP feature vectors over the IPs shared by device and cookie.

    Returns
    -------
    (XTR, OriginalIndex): XTR is the (numpatterns, n_features) matrix;
    OriginalIndex maps device -> {cookie: row index in XTR}.
    """

    OriginalIndex=dict()
    numdifs=0  # NOTE(review): never used
    # Total number of (device, cookie) rows, to pre-allocate XTR.
    numpatterns=0
    for k,v in Candidates.iteritems():
        numpatterns=numpatterns+len(v)


    Added=0
    for k,v in Candidates.iteritems():
        # Device features: columns 2-10 of the device's row in XDevice.
        Device=XDevice[XDevice[:,1]==k,np.array([2,3,4,5,6,7,8,9,10])]

        IndivIndex=dict()

        setk=set()
        setk.add(k)
        setdevips=set(DeviceIPS.get(k,dict()).keys())
        setdevpro=set(DevProps.get(k,dict()).keys())

        for coo in v:

            # Cookie features: columns 2-10 of the cookie's row.
            Cookie=XCookies[np.int(coo),np.array([2,3,4,5,6,7,8,9,10])]

            row=np.concatenate((Device,Cookie))


            setcooips=set(CookieIPS.get(coo,dict()).keys())

            PROPS=setdevpro
            mipro=PROPS  # NOTE(review): never used afterwards


            # IPs shared by device and cookie; prefer unpopular IPs
            # (<=10 devices and <=20 cookies), fall back to all shared IPs.
            IPS=(setdevips & setcooips)
            miips=set()
            for ip in IPS:
                if(len(IPDev.get(ip,set()))<=10 and len(IPCoo.get(ip,set()))<=20):
                    miips.add(ip)
            if len(miips)==0:
                for ip in IPS:
                    miips.add(ip)

            # Devices other than k that share the cookie's handle.
            OtherDevices=set(WhosDevice.get(coo,set()))-setk

            devp=set()
            devi=set()

            for odev in OtherDevices:
                # NOTE(review): the default dict().keys() (instead of
                # dict()) is odd but harmless — it is only wrapped in set().
                devp=devp | set(DevProps.get(odev,dict().keys()))
                devi=devi | set(DeviceIPS.get(odev,dict().keys()))


            # Overlap of this device's properties/IPs with those of the
            # cookie's other devices.
            intersec=np.float_(len(devp & setdevpro))
            interseci=np.float_(len(devi & setdevips))


            if intersec>0:
                intersec=intersec/np.float_(len(setdevpro))

            # NOTE(review): this branch divides `intersec` again instead of
            # `interseci` — looks like a bug (interseci is never normalised),
            # but the model shipped with this feature definition, so it is
            # left byte-identical.
            if interseci>0:
                intersec=intersec/np.float_(len(setdevips))


            # Scalar overlap features, appended one at a time.
            row=np.concatenate((row,np.array([np.float_(len(OtherDevices))])))
            row=np.concatenate((row,np.array([np.float_(intersec)])))

            row=np.concatenate((row,np.array([np.float_(interseci)])))


            row=np.concatenate((row,np.array([np.float_(len(IPS))])))
            row=np.concatenate((row,np.array([np.float_(len(setdevips))])))
            row=np.concatenate((row,np.array([np.float_(len(setcooips))])))

            row=np.concatenate((row,np.array([np.float_(len(PROPS))])))
            row=np.concatenate((row,np.array([np.float_(len(setdevpro))])))


            row=np.concatenate((row,np.array([np.float_(len(Groups.get(coo,set())))])))
            row=np.concatenate((row,np.array([np.float_(len(Groups.get(coo,set()) & v))])))

            row=np.concatenate((row,np.array([np.float_(len(miips))])))


            # Sum of the 11+11 per-IP feature vectors over the selected
            # shared IPs, plus their mean.
            iprow=np.zeros(22)
            niprows=0
            for ip in miips:
                iprow=iprow+np.concatenate((DeviceIPS[k][ip].reshape(-1),CookieIPS[coo][ip].reshape(-1)))
                niprows=niprows+1

            if niprows>0:
                meaniprows=iprow/np.float_(niprows)
            else:
                meaniprows=iprow


            row=np.concatenate((row.reshape(-1),iprow.reshape(-1)))
            row=np.concatenate((row.reshape(-1),meaniprows.reshape(-1)))
            # Difference between the device's and the cookie's summed
            # per-IP counters (elements 0-5 vs 11-16).
            row=np.concatenate((row.reshape(-1),(iprow[0:6]-iprow[11:-5]).reshape(-1)))


            # Allocate XTR lazily, once the row length is known.
            if Added==0:
                XTR=np.zeros((numpatterns,len(row)))

            IndivIndex[coo]=Added

            XTR[Added,:]=row

            Added=Added+1
        OriginalIndex[k]=IndivIndex
    return (XTR,OriginalIndex)
611 |
612 | #####################################################
 613 | # THIS CREATES THE LABELS FOR SUPERVISED LEARNING   #
614 | #####################################################
615 |
def createTrainingLabels(Candidates, Labels):
    """Build the 0/1 label vector matching the rows built by createDataSet.

    For every (device, candidate cookie) pair — enumerated in the same
    dict/set iteration order createDataSet uses — the label is 1.0 when the
    cookie truly belongs to the device's handle (is in Labels[device]).
    """
    # dict.values()/items() replace the py2-only iteritems().
    numpatterns = sum(len(v) for v in Candidates.values())

    YTR = np.zeros(numpatterns)

    Added = 0
    for k, v in Candidates.items():
        for coo in v:
            if coo in Labels[k]:
                YTR[Added] = 1.0
            Added = Added + 1

    return YTR
633 |
634 |
635 | ######################################################
 636 | # THIS FUNCTION SELECTS THE COOKIES FOR EVERY DEVICE #
637 | # GIVEN THE PREDICTIONS OF THE CLASSIFIER #
638 | ######################################################
639 |
def bestSelection(predictions, OriginalIndex, values,Groups):
    """Pick the final cookie set for every device from classifier scores.

    Parameters
    ----------
    predictions : per-row scores, indexed through OriginalIndex.
    OriginalIndex : dict, device -> {cookie code: row index in predictions}.
    values : array of relative-score thresholds used to accept additional
        candidates beyond the best one.
    Groups : dict, cookie -> set of cookies sharing its handle.

    Returns
    -------
    (result, threshold): result maps device -> selected cookie set;
    threshold maps device -> score of its best candidate.
    """

    result=dict()

    threshold=dict()

    for k,v in OriginalIndex.iteritems():

        cook=set()
        maxval=0.0
        cookies=v.keys()

        # Score of each candidate cookie of this device.
        scores=np.zeros(len(cookies))

        for i in range(len(cookies)):
            scores[i]=predictions[v[cookies[i]]]


        # Candidate indices sorted by decreasing score.
        Orden=sorted(range(len(scores)),key=lambda x:-scores[x])

        # Always take the whole handle-group of the best-scoring cookie.
        if len(cookies)>0:
            if Groups.get(cookies[Orden[0]],-100) != -100:
                maxval=scores[Orden[0]]
                cook= (cook | Groups[cookies[Orden[0]]])

        # When the best score is not confident, also accept candidates whose
        # score is close enough to it (relative thresholds from `values`).
        # NOTE(review): this section is corrupted in the source we received —
        # `i1`, `tam1` and `tam2` are referenced but never defined (the
        # original presumably computed the group sizes and compared them
        # here).  Left byte-identical pending recovery of the original text;
        # do not trust this branch's semantics as written.
        if (maxval<0.9):
            for i in range(len(values)):
                if (i<= len(cook)):
                    if (i1 & tam2==1):
                        if(scores[Orden[i]]>maxval*(values[i]-0.15)):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))
                    elif (tam1>1 & tam2>1):
                        if(scores[Orden[i]]>maxval*(values[i]+0.1)):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))
                    elif (tam1==1 & tam2==1):
                        if(scores[Orden[i]]>maxval*(values[i])):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))


        result[k]=cook
        threshold[k]=maxval
    return (result,threshold)
685 |
686 | #####################################################
687 | # THIS FUNCTION TRAINS THE CLASSIFIER USING XGBOOST #
688 | #####################################################
689 |
def trainXGBoost(xtr,ytr,rounds,eta,xtst,ytst):
    """Train a binary-logistic XGBoost classifier.

    xtr/ytr are the training matrix and labels; xtst/ytst are an evaluation
    set watched (with classification error) during training; eta is the
    learning rate and rounds the number of boosting iterations.  Returns the
    trained booster.
    """
    # Wrap both sets as DMatrix so xgboost can consume them.
    dtrain = xgb.DMatrix(xtr, label=ytr)
    deval = xgb.DMatrix(xtst, label=ytst)

    # Fixed hyper-parameters shared by every bagger.
    param = {
        'eta': eta,
        'max_depth': 10,
        'subsample': 1.0,
        'nthread': 12,
        'min_child_weight': 4,
        'gamma': 5.0,
        'colsample_bytree': 1.0,
        'silent': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'error',
    }

    watchlist = [(dtrain, 'train'), (deval, 'test')]
    return xgb.train(param, dtrain, rounds, watchlist)
708 |
709 | #######################################
710 | # THIS FUNCTION MAKES THE PREDICTIONS #
711 | #######################################
712 |
def predictXGBoost(X,bst):
    """Return the booster's predictions for the rows of feature matrix X."""
    return bst.predict(xgb.DMatrix(X))
716 |
717 | #########################################################################
718 | # THIS FUNCTION TRAINS THE ALGORITHM USING 8 BAGGERS AND AVERAGING THEM #
719 | #########################################################################
720 |
def FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain,Groups,Labels):
    """Train the model as an 8-fold bagging ensemble.

    Devices are split into NFOLDS folds; each fold trains one XGBoost
    classifier on the other folds' (device, cookie) rows, scores its held-out
    rows (giving out-of-fold validation predictions) and the full test set.
    Test predictions are averaged over the NFOLDS baggers.

    Returns (out-of-fold validation scores, averaged test scores,
    OriginalIndexTR, OriginalIndexTST,
    list of (booster, train devices, test devices) per fold).
    """
    NFOLDS=8

    # KFold over devices (not rows) so all rows of one device stay together.
    skf = sklearn.cross_validation.KFold(len(OriginalIndexTR.keys()),n_folds=NFOLDS,random_state=0)

    resultadosVal=np.zeros(len(YTR))


    (tamTST,dTST)=XTST.shape
    resultadosTST=np.zeros(tamTST)


    classifiers=list()

    iteration=0
    for (train,test) in skf:

        iteration=iteration+1
        Originaltmp=dict()
        print "Training Bagger ",iteration, "of", NFOLDS


        # Translate device fold indices into row indices of XTR using
        # OriginalIndexTR (device -> {cookie: row index}).
        trainind=list()
        testind=list()
        traindev=list()
        testdev=list()

        for i in train:
            devtr=DevicesTrain[i,1]
            traindev.append(devtr)
            trainind.extend(OriginalIndexTR[devtr].values())

        for i in test:
            devtr=DevicesTrain[i,1]
            testdev.append(devtr)
            testind.extend(OriginalIndexTR[devtr].values())
            Originaltmp[devtr]=OriginalIndexTR[devtr]

        trainind=np.array(trainind)
        testind=np.array(testind)

        XvalTR=XTR[trainind,:]
        XvalTST=XTR[testind,:]

        YvalTR=YTR[trainind]
        YvalTST=YTR[testind]


        bst=trainXGBoost(XvalTR,YvalTR,200,0.10,XvalTST,YvalTST)

        classifiers.append((bst,traindev,testdev))

        # Out-of-fold predictions fill this fold's slice of the
        # validation score vector.
        pTT=predictXGBoost(XvalTR,bst)
        pTR=predictXGBoost(XvalTST,bst)

        resultadosVal[testind]=pTR

        # NOTE(review): validat/thTR are computed but never used here;
        # presumably kept to monitor the per-fold selection.
        (validat,thTR)=bestSelection(resultadosVal, Originaltmp, np.array([1.0]),Groups)

        pTST=predictXGBoost(XTST,bst)


        # Accumulate the test predictions of every bagger.
        resultadosTST=resultadosTST+pTST


    # Average the accumulated test predictions over the baggers.
    resultadosTST=resultadosTST/np.float_(NFOLDS)
    return(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)
788 |
789 | ###############################################################################################
790 | # THIS FUNCTION LOOKS FOR DEVICES WHOSE BEST CANDIDATE SCORES LESS THAN 0.05, #
791 | # CREATES A NEW SET OF CANDIDATES CONTAINING EVERY COOKIE THAT SHARES AN IP ADDRESS WITH HIM, #
792 | # SCORES THEM WITH XGBOOST AND SELECT THE CANDIDATES FOR THE SUBMISSION #
793 | ###############################################################################################
794 |
795 | def PostAnalysisTrain(validat,thTR,classifiers,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle,Labels):
796 |
797 | itn=0
798 | for k,v in validat.iteritems():
799 | itn=itn+1
800 | if thTR[k]<0.05:
801 | (fcandK,fcandU)=fullCandidates(k,DevicesTrain,Cookies,IPDev,IPCoo,DeviceIPS,DictHandle)
802 |
803 | validatTHK=dict()
804 | thTHK=dict()
805 | if(len(fcandK[k])>0):
806 | (XTHK,OriginalIndexTHK)=createDataSet(fcandK,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
807 | YTHK=createTrainingLabels(fcandK,Labels)
808 | estimK=np.zeros(len(YTHK))
809 |
810 | for (classifier,traindev,testdev) in classifiers:
811 | if k in testdev:
812 | estimK=predictXGBoost(XTHK,classifier)
813 |
814 | (validatTHK,thTHK)=bestSelection(estimK, OriginalIndexTHK, np.array([1.0,0.9]),Groups)
815 |
816 | validatTHU=dict()
817 | thTHU=dict()
818 | if(len(fcandU[k])>0):
819 | (XTHU,OriginalIndexTHU)=createDataSet(fcandU,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
820 | YTHU=createTrainingLabels(fcandU,Labels)
821 | estimU=np.zeros(len(YTHU))
822 |
823 | for (classifier,traindev,testdev) in classifiers:
824 | if k in testdev:
825 | estimU=predictXGBoost(XTHU,classifier)
826 |
827 | (validatTHU,thTHU)=bestSelection(estimU, OriginalIndexTHU, np.array([1.0,0.9]),Groups)
828 |
829 | if len(validatTHK)>0:
830 | if len(validatTHU)>0:
831 | if(thTHU[k]>(thTHK[k]+0.7)):
832 | validat[k]=validatTHU[k]
833 | thTR[k]=thTHU[k]
834 | else:
835 | if thTR[k]<=0.025:
836 | validat[k]=validatTHK[k]
837 | thTR[k]=thTHK[k]
838 | else:
839 | if thTR[k]+0.30:
920 | if Groups.get(cookies[Orden[0]],-100) != -100:
921 | cook= (cook | Groups[cookies[Orden[0]]])
922 | ValorMax=scores[Orden[0]]
923 |
924 | Segun=-1
925 | Terminado='NO'
926 |
927 | for i in range(len(cookies)):
928 | if i>0:
929 | if Terminado=='NO':
930 | if (cookies[Orden[i]] not in cook):
931 | Segun=scores[Orden[i]]
932 | Terminado='SI'
933 |
934 | if (Segun<0.05 and ValorMax>0.4):
935 | probCandidates[k]=Groups[cookies[Orden[0]]]
936 |
937 | return probCandidates
938 |
939 | #########################################
940 | # THIS FUNCTION MERGES THE DICTIONARIES #
941 | # FOR THE SEMI SUPERVISED LEARNING #
942 | #########################################
943 |
def createOtherDevicesDict(dict1, dict2, dict3):
    """Merge three {device: cookie set} dicts into the inverse mapping.

    Returns a defaultdict(set) mapping each cookie to the set of devices it
    was selected for in any of the three inputs (used by the semi-supervised
    step).
    """
    OtherDevices = defaultdict(set)
    # dict.items() replaces the py2-only iteritems(); the three sources are
    # processed identically, so one loop replaces the triplicated code.
    for source in (dict1, dict2, dict3):
        for device, cookies in source.items():
            for cookie in cookies:
                OtherDevices[cookie].add(device)

    return OtherDevices
958 |
959 |
960 |
961 | ######################################################
962 | # THIS FUNCTION SAVE THE FINAL PREDICTIONS IN A FILE #
963 | ######################################################
964 |
def writeSolution(file, selected, DeviceList, CookieList):
    """Write the final submission CSV ('device_id,cookie_id').

    Parameters
    ----------
    file : str
        Output path.  (Name kept for interface compatibility even though it
        shadows the py2 builtin.)
    selected : dict
        device code -> set of selected cookie codes.
    DeviceList, CookieList : list
        Code -> original text identifier (inverse of the list2Dict dicts).

    A device with no selected cookie is written with the placeholder cookie
    'id_10', since the submission format requires a value.
    """
    # Text mode with newline='' is the csv-module requirement in py3 (the
    # original opened 'wb', which py3's csv.writer cannot use); np.int was
    # removed from NumPy, builtin int() is equivalent.
    with open(file, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(['device_id', 'cookie_id'])

        for device, cookies in selected.items():
            items = [CookieList[int(elem)] for elem in cookies]
            if len(cookies) == 0:
                items.append('id_10')
            spamwriter.writerow([DeviceList[int(device)], ' '.join(items)])
986 |
987 |
988 | ###############################################################################################
989 | # THIS FUNCTION MAKES THE POST PROCESSING ON A TEST #
990 | # IT LOOKS FOR DEVICES WHOSE BEST CANDIDATE SCORES LESS THAN 0.05, #
991 | # CREATES A NEW SET OF CANDIDATES CONTAINING EVERY COOKIE THAT SHARES AN IP ADDRESS WITH HIM, #
992 | # SCORES THEM WITH XGBOOST AND SELECT THE CANDIDATES FOR THE SUBMISSION #
993 | ###############################################################################################
994 |
995 | def PostAnalysisTest(validatTST,thTST,classifiers,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle):
996 |
997 | itn=0
998 | for k,v in validatTST.iteritems():
999 | itn=itn+1
1000 | if thTST[k]<0.05:
1001 |
1002 | (fcandK,fcandU)=fullCandidates(k,DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,DictHandle)
1003 |
1004 | validatTHK=dict()
1005 | thTHK=dict()
1006 | if(len(fcandK[k])>0):
1007 |
1008 | (XTHK,OriginalIndexTHK)=createDataSet(fcandK,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
1009 |
1010 | (tmxK,dmxK)=XTHK.shape
1011 | estimK=np.zeros(tmxK)
1012 |
1013 | for (classifier,traindev,testdev) in classifiers:
1014 | estimK=estimK+predictXGBoost(XTHK,classifier)
1015 |
1016 | estimK=estimK/np.float_(len(classifiers))
1017 |
1018 | (validatTHK,thTHK)=bestSelection(estimK, OriginalIndexTHK, np.array([1.0,0.90]),Groups)
1019 |
1020 | validatTHU=dict()
1021 | thTHU=dict()
1022 | if(len(fcandU[k])>0):
1023 | (XTHU,OriginalIndexTHU)=createDataSet(fcandU,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
1024 | (tmxU,dmxU)=XTHU.shape
1025 | estimU=np.zeros(tmxU)
1026 |
1027 | for (classifier,traindev,testdev) in classifiers:
1028 | estimU=estimU+predictXGBoost(XTHU,classifier)
1029 |
1030 | estimU=estimU/np.float_(len(classifiers))
1031 |
1032 | (validatTHU,thTHU)=bestSelection(estimU, OriginalIndexTHU, np.array([1.0,0.90]),Groups)
1033 |
1034 |
1035 | if len(validatTHK)>0:
1036 | if len(validatTHU)>0:
1037 | if(thTHU[k]>(thTHK[k]+0.7)):
1038 | validatTST[k]=validatTHU[k]
1039 | thTST[k]=thTHU[k]
1040 | else:
1041 | if thTST[k]<=0.025:
1042 | validatTST[k]=validatTHK[k]
1043 | thTST[k]=thTHK[k]
1044 | else:
1045 | if thTST[k]+0.3.
15 |
16 |
17 |
18 | import csv
19 | import os
20 | import re
21 | import numpy as np
22 | import inspect
23 | import sys
24 | import sklearn
25 | from sklearn import cross_validation
26 | import pickle
27 |
28 |
29 | from Variables import *
30 | from VariablesTST import *
31 |
32 |
# Put the local XGBoost build on the module search path before importing it.
code_path = os.path.join(pathXGBoost)
sys.path.append(code_path)
import xgboost as xgb
from CookieLibrary import *

#################################################################################
# PARSING THE FILES PROVIDED FOR THE CHALLENGE AND CREATING THE DATA STRUCTURES #
# THAT THE ALGORITHM NEEDS                                                      #
#################################################################################

# Several fields describing cookies and devices are categorical text values
# (e.g. 'country_147', 'handle_1301101').  The dictionaries built below map
# each text value onto an integer code so it can live in a numpy matrix.
print('Loading Dictionaries')
(DeviceList, CookieList, HandleList, DevTypeList, DevOsList,
 ComputerOsList, ComputerVList, CountryList,
 annC1List, annC2List) = GetIdentifiers(trainfile, testfile, cookiefile)

# One text->code dictionary per categorical feature (same call order as the
# identifier lists above).
(DictHandle, DictDevice, DictCookie, DictDevType, DictDevOs,
 DictComputerOs, DictComputerV, DictCountry, DictAnnC1, DictAnnC2) = [
    list2Dict(identifiers) for identifiers in
    (HandleList, DeviceList, CookieList, DevTypeList, DevOsList,
     ComputerOsList, ComputerVList, CountryList, annC1List, annC2List)]

# Device records -> numpy matrix, with text fields replaced by their codes.
print('Loading Devices Files')
DevicesTrain = loadDevices(trainfile, DictHandle, DictDevice, DictDevType,
                           DictDevOs, DictCountry, DictAnnC1, DictAnnC2)

# Cookie records -> numpy matrix, same encoding scheme.
print('Loading Cookies File')
Cookies = loadCookies(cookiefile, DictHandle, DictCookie, DictComputerOs,
                      DictComputerV, DictCountry, DictAnnC1, DictAnnC2)

# Extra per-device property information.
print('Loading Properties File')
DevProperties = loadPROPS(propfile, DictDevice, DictCookie)

# From the training data: the cookies of every device, the sibling cookies
# sharing a handle, and the devices of every cookie.
(Labels, Groups, WhosDevice) = creatingLabels(DevicesTrain, Cookies, DictHandle)

# Dictionary keyed by IP address whose value is a numpy array of IP info.
print('Loading IP Files')
XIPS = loadIPAGG(ipaggfile)

# Four lookups derived from the IP file: devices per IP, cookies per IP,
# IPs per device, and IPs per cookie.
(IPDev, IPCoo, DeviceIPS, CookieIPS) = loadIPS(ipfile, DictDevice,
                                               DictCookie, XIPS, Groups)

#########################
# LOADING THE TEST FILE #
#########################

print('STEP: Loading test file')
DevicesTest = loadDevices(predictFile, DictHandle, DictDevice, DictDevType,
                          DictDevOs, DictCountry, DictAnnC1, DictAnnC2)

###################################
# INITIAL SELECTION OF CANDIDATES #
###################################

print('STEP: Initial selection of candidates')
CandidatesTST = selectCandidates(DevicesTest, Cookies, IPDev, IPCoo,
                                 DeviceIPS, CookieIPS, DictHandle)

#####################
# LOADING THE MODEL #
#####################

print('Loading the model')
(classifiers, DictOtherDevices) = loadModel(modelpath)

########################
# CREATING THE DATASET #
########################

print('STEP: Creating the dataset')
(XTST, OriginalIndexTST) = createDataSet(CandidatesTST, DevicesTest, Cookies,
                                         DeviceIPS, CookieIPS, IPDev, IPCoo,
                                         Groups, DictOtherDevices, DevProperties)

########################
# USING THE CLASSIFIER #
########################

print('STEP: Using the classifier')
resultadosTST = Predict(XTST, classifiers)

########################
# POST PROCESSING STEP #
########################

print('STEP: Post Processing')
(validatTST, thTST) = bestSelection(resultadosTST, OriginalIndexTST,
                                    np.array([1.0, 0.9]), Groups)

(validatTST, thTST) = PostAnalysisTest(validatTST, thTST, classifiers,
                                       DevicesTest, Cookies, DeviceIPS,
                                       CookieIPS, IPDev, IPCoo, Groups,
                                       WhosDevice, DevProperties, DictHandle)

##########################################
# WRITING THE FINAL SOLUTION IN THE FILE #
##########################################

print('Writing the file with the result')
writeSolution(resultFile, validatTST, DeviceList, CookieList)
138 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2015 Roberto Diaz Morales
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 |
17 |
18 | import csv
19 | import os
20 | import re
21 | import numpy as np
22 | import inspect
23 | import sys
24 | import sklearn
25 | from sklearn import cross_validation
26 | import pickle
27 |
28 | from Variables import *
29 |
# Put the local XGBoost build on the module search path before importing it.
code_path = os.path.join(pathXGBoost)
sys.path.append(code_path)
import xgboost as xgb
from CookieLibrary import *

#################################################################################
# PARSING THE FILES PROVIDED FOR THE CHALLENGE AND CREATING THE DATA STRUCTURES #
# THAT THE ALGORITHM NEEDS                                                      #
#################################################################################

# Some features in the files that describe the cookies and the devices are
# categorical text values (e.g. 'country_147', 'handle_1301101').  These
# dictionaries map that text onto numerical codes so the data can be stored
# in numpy matrices.
print('Loading Dictionaries')
(DeviceList, CookieList, HandleList, DevTypeList, DevOsList,ComputerOsList,ComputerVList,CountryList,annC1List,annC2List)=GetIdentifiers(trainfile,testfile,cookiefile)

# One text->code dictionary per categorical feature.
DictHandle = list2Dict(HandleList)
DictDevice = list2Dict(DeviceList)
DictCookie = list2Dict(CookieList)
DictDevType = list2Dict(DevTypeList)
DictDevOs = list2Dict(DevOsList)
DictComputerOs = list2Dict(ComputerOsList)
DictComputerV = list2Dict(ComputerVList)
DictCountry = list2Dict(CountryList)
DictAnnC1 = list2Dict(annC1List)
DictAnnC2 = list2Dict(annC2List)


# Load the device descriptions into numpy matrices, translating text values
# into their numerical codes via the dictionaries above.
print('Loading Devices Files')
DevicesTrain = loadDevices(trainfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)
DevicesTest = loadDevices(testfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)

# Load the cookie descriptions into a numpy matrix, same encoding scheme.
print('Loading Cookies File')
Cookies = loadCookies(cookiefile,DictHandle,DictCookie,DictComputerOs,DictComputerV,DictCountry,DictAnnC1,DictAnnC2)

# Load the extra per-device property information.
print('Loading Properties File')
DevProperties=loadPROPS(propfile,DictDevice,DictCookie)

# From the training data build: the cookies of every device, the sibling
# cookies sharing a handle, and the devices of every cookie.
(Labels,Groups,WhosDevice)=creatingLabels(DevicesTrain,Cookies,DictHandle)


# Dictionary keyed by IP address whose value is a numpy array of IP info.
print('Loading IP Files')
XIPS=loadIPAGG(ipaggfile)

# Four lookups derived from the IP file: devices per IP, cookies per IP,
# IPs per device, and IPs per cookie.
(IPDev,IPCoo,DeviceIPS,CookieIPS)=loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups)


#################################################################################################
# PROCEDURE WITH THE INITIAL SELECTION OF CANDIDATES (PROCEDURE DESCRIBED IN THE DOCUMENTATION) #
#################################################################################################
print('STEP: Initial selection of candidates')
# Simple rules select a set of candidate cookies for every device.
CandidatesTR=selectCandidates(DevicesTrain,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)
CandidatesTST=selectCandidates(DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)


#####################################################
# CREATION OF THE TRAINING AND TEST SET             #
# (THE FEATURES ARE DESCRIBED IN THE DOCUMENTATION) #
#####################################################

print('STEP: Creating the dataset')

# Build the supervised-learning training/test sets from (device, cookie)
# pairs taken from the selected candidates.
(XTR,OriginalIndexTR)=createDataSet(CandidatesTR,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
YTR=createTrainingLabels(CandidatesTR,Labels)
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)

######################################
# TRAINING USING BAGGING AND XGBOOST #
######################################

print('STEP: Training Supervised Learning')
(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)=FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain, Groups, Labels)

#############################################################################
# UPDATING THE DATA STRUCTURES DATASETS WITH NEW INFORMATION OF THE RESULTS #
#############################################################################

print('Updating features for with semisupervised learning information')
# SECOND LOOP FOR SEMI-SUPERVISED LEARNING
# Repeat the training procedure, adding the cookies predicted with high
# probability to the WhosDevice structure.
uniqueCand=uniqueCandidates(DevicesTest,Cookies,IPCoo,DeviceIPS,DictHandle,Groups)
probCand=mostProbable(resultadosTST, OriginalIndexTST, Groups)
DictOtherDevices=createOtherDevicesDict(Labels,uniqueCand,probCand)


# Rebuild the training and test sets, now including the semi-supervised
# device information (DictOtherDevices replaces WhosDevice).
(XTR,OriginalIndexTR)=createDataSet(CandidatesTR,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)

#####################################################################################################
# SECOND TRAINING USING XGBOOST AND BAGGING INCLUDING THE NEW INFORMATION (SEMI-SUPERVISED LEARNING) #
#####################################################################################################

# Train with 10-fold CV; the predictions are the average of the classifiers
# of every fold.
print('STEP: Training Semi-Supervised Learning')
(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)=FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain, Groups, Labels)

############################################################
# POST PROCESSING PROCEDURE DESCRIBED IN THE DOCUMENTATION #
############################################################

print('STEP: Post Processing')
# Initial selection of the cookies associated to every device.
(validat,thTR)=bestSelection(resultadosVal, OriginalIndexTR, np.array([1.0,0.9]),Groups)

# Increase the number of candidates for devices whose best candidate does
# not have a good likelihood.
(validat,thTR) = PostAnalysisTrain(validat,thTR,classifiers,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle,Labels)

F05=calculateF05(validat,Labels)
# FIX: this was a Python-2-only print statement ('print "F05 Validation",F05'),
# which is a syntax error on Python 3 and inconsistent with the print() calls
# used throughout the rest of the file.  The %s form produces the same output
# on Python 2 and also runs on Python 3.
print('F05 Validation %s' % F05)

####################
# SAVING THE MODEL #
####################

print('Saving model')
saveModel(modelpath,classifiers,DictOtherDevices)
157 |
--------------------------------------------------------------------------------