├── CookieLibrary.py ├── README.md ├── Variables.py ├── VariablesTST.py ├── predict.py └── train.py /CookieLibrary.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015 Roberto Diaz Morales 2 | # 3 | # This program is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU General Public License 14 | # along with this program. If not, see . 15 | 16 | 17 | import csv 18 | import numpy as np 19 | import re 20 | from collections import Counter,defaultdict 21 | import os 22 | import inspect 23 | import sys 24 | import sklearn 25 | from sklearn import cross_validation 26 | import pickle 27 | import xgboost as xgb 28 | 29 | 30 | 31 | ####################################################################################### 32 | # THIS FUNCTION PARSES THE FILES WITH THE INFORMATION ABOUT DEVICES AND COOKIES # 33 | # AND CREATES LISTS WITH THE IDENTIFIERS OF THE CATEGORICAL FEATURES. 
# THE INDEX OF THE LIST WILL BE USED AS THE VALUE OF THE FEATURE IN THE NUMPY MATRICES#
#######################################################################################

def _readCsvRows(csvpath):
    """Yield the data rows of a comma-separated file, skipping the header line.

    Opens in text mode and uses the builtin next() so the code runs under
    Python 3 as well (the old reader.next() / 'rb' mode combination was
    Python-2 only).
    """
    with open(csvpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # discard the header row
        for row in reader:
            yield row


def GetIdentifiers(trainfile, testfile, cookiefile):
    """Collect the distinct values of every categorical column.

    Parses the train/test device files and the cookie file and returns ten
    lists of unique identifiers (device, cookie, handle, device type,
    device os, computer os, computer version, country, anonymous_c1,
    anonymous_c2).  The position of each value in its list is later used as
    its numeric encoding (see list2Dict).
    """
    DeviceList = list()
    CookieList = list()
    HandleList = list()
    DevTypeList = list()
    DevOsList = list()
    ComputerOsList = list()
    ComputerVList = list()
    CountryList = list()
    annC1List = list()
    annC2List = list()

    # The train and test device files share the same column layout, so one
    # loop replaces the two duplicated parsing blocks of the original code.
    for devfile in (trainfile, testfile):
        for row in _readCsvRows(devfile):
            HandleList.append(row[0])
            DeviceList.append(row[1])
            DevTypeList.append(row[2])
            DevOsList.append(row[3])
            CountryList.append(row[4])
            annC1List.append(row[6])
            annC2List.append(row[7])

    # Cookie file: same first/fifth columns, but columns 2-3 are the
    # computer os/version instead of the device type/os.
    for row in _readCsvRows(cookiefile):
        HandleList.append(row[0])
        CookieList.append(row[1])
        ComputerOsList.append(row[2])
        ComputerVList.append(row[3])
        CountryList.append(row[4])
        annC1List.append(row[6])
        annC2List.append(row[7])

    # Deduplicate once at the end instead of after every file (the repeated
    # list(set(...)) passes of the original were redundant work).
    DeviceList = list(set(DeviceList))
    CookieList = list(set(CookieList))
    HandleList = list(set(HandleList))
    DevTypeList = list(set(DevTypeList))
    DevOsList = list(set(DevOsList))
    ComputerOsList = list(set(ComputerOsList))
    ComputerVList = list(set(ComputerVList))
    CountryList = list(set(CountryList))
    annC1List = list(set(annC1List))
    annC2List = list(set(annC2List))

    return (DeviceList, CookieList, HandleList, DevTypeList, DevOsList, ComputerOsList, ComputerVList, CountryList, annC1List, annC2List)


############################################################################################################
# THIS FUNCTION RECEIVES A LIST AND CREATES A DICTIONARY TO GET THE INDEX WHEN THE VALUE IS GIVEN AS A KEY #
############################################################################################################

def list2Dict(lista):
    """Return a dict mapping each element of *lista* to its position."""
    return dict((value, index) for index, value in enumerate(lista))


##############################################################################
# THIS FUNCTION CREATES A NUMPY MATRIX WITH THE INFORMATION OF A DEVICE FILE #
##############################################################################

def loadDevices(trainfile, DictHandle, DictDevice, DictDevType, DictDevOs, DictCountry, DictAnnC1, DictAnnC2):
    """Load a device csv into an (n_rows, 11) float matrix.

    Categorical columns are encoded through the supplied dictionaries;
    columns 5 and 8-10 are already numeric.  The original implementation
    read the file twice (once only to count rows); reading the rows into a
    list once avoids that.
    """
    rows = list(_readCsvRows(trainfile))
    XDevices = np.zeros((len(rows), 11))

    for i, row in enumerate(rows):
        XDevices[i, 0] = DictHandle[row[0]]
        XDevices[i, 1] = DictDevice[row[1]]
        XDevices[i, 2] = DictDevType[row[2]]
        XDevices[i, 3] = DictDevOs[row[3]]
        XDevices[i, 4] = DictCountry[row[4]]
        XDevices[i, 5] = float(row[5])
        XDevices[i, 6] = DictAnnC1[row[6]]
        XDevices[i, 7] = DictAnnC2[row[7]]
        XDevices[i, 8] = float(row[8])
        XDevices[i, 9] = float(row[9])
        XDevices[i, 10] = float(row[10])

    return XDevices


##############################################################################
# THIS FUNCTION CREATES A NUMPY MATRIX WITH THE INFORMATION OF A COOKIE FILE #
##############################################################################

def loadCookies(cookiefile, DictHandle, DictCookie, DictComputerOs, DictComputerV, DictCountry, DictAnnC1, DictAnnC2):
    """Load the cookie csv into a matrix indexed by the cookie's encoding.

    Row *i* of the result describes the cookie whose DictCookie value is
    *i*, so the matrix is sized by the largest encoding rather than by the
    file's row count.  np.int/np.float_ (removed in modern NumPy) are
    replaced by the builtin equivalents.
    """
    maxindex = int(max(DictCookie.values()))

    XCookies = np.zeros((maxindex + 1, 11))

    for row in _readCsvRows(cookiefile):
        fila = int(DictCookie[row[1]])
        XCookies[fila, 0] = DictHandle[row[0]]
        XCookies[fila, 1] = DictCookie[row[1]]
        XCookies[fila, 2] = DictComputerOs[row[2]]
        XCookies[fila, 3] = DictComputerV[row[3]]
        XCookies[fila, 4] = DictCountry[row[4]]
        XCookies[fila, 5] = float(row[5])
        XCookies[fila, 6] = DictAnnC1[row[6]]
        XCookies[fila, 7] = DictAnnC2[row[7]]
        XCookies[fila, 8] = float(row[8])
        XCookies[fila, 9] = float(row[9])
        XCookies[fila, 10] = float(row[10])

    return XCookies


####################################################################################################
# THIS FUNCTION CREATES A DICTIONARY WHERE
# THE KEYS ARE THE IP ADDRESSES OF THE IP AGGREGATED FILE AND THE VALUE A NUMPY ARRAY             #
# WITH ITS INFORMATION.                                                                           #
####################################################################################################

def loadIPAGG(ipaggfile):
    """Parse the aggregated IP csv into a dict: ip address -> np.array(5).

    Columns 1..5 of each row are the numeric aggregate features of the IP.
    Uses text mode and the builtin next() so it also runs on Python 3.
    """
    XIPS = dict()

    with open(ipaggfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # discard header
        for row in reader:
            XIPS[row[0]] = np.array([float(v) for v in row[1:6]])

    return XIPS

#####################################################################
# THIS FUNCTION CREATES A DICTIONARY WHERE THE KEYS ARE THE DEVICES #
# AND THE VALUE DICTIONARY OF THE PROPERTIES AND ITS INFORMATION    #
#####################################################################

def loadPROPS(fileprops, DictDevice, DictCookie):
    """Parse the properties file into {device encoding: {property: value}}.

    Only lines whose second field is '0' (device rows) are kept, and only
    when the device name is present in DictDevice.  DictCookie is accepted
    for interface compatibility but unused.  A None-guard is added so a
    malformed line is skipped instead of raising AttributeError on
    matchObj.group(...).
    """
    DevProps = dict()

    with open(fileprops) as fp:
        fp.readline()  # discard header

        for line in fp:
            matchObj = re.match(r'([a-zA-Z0-9_]*),([0-9\-]*),{([(a-zA-Z0-9.(),\-_]*)}', line, flags=0)
            if matchObj is None:
                # Defensive: skip lines the expected format does not cover.
                continue

            if matchObj.group(2) == '0':
                props = re.findall(r'\((.*?)\)', matchObj.group(3))
                ValProps = dict()
                for prop in props:
                    propV = prop.split(',')
                    ValProps[propV[0]] = float(propV[1])
                Devic = DictDevice.get(matchObj.group(1), -1)
                if Devic > -1:
                    DevProps[Devic] = ValProps

    return DevProps

#################################################################################################
# THIS FUNCTION CREATES:                                                                        #
# A DICTIONARY WHERE THE KEYS ARE THE DEVICES OF THE TRAINING SET AND THE VALUES THEIR COOKIES  #
# A DICTIONARY WHERE THE KEYS ARE THE COOKIES AND THE VALUES OTHER COOKIES WITH THE SAME HANDLE #
# A DICTIONARY WHERE THE KEYS ARE THE COOKIES AND THE VALUES THE DEVICES WITH THE SAME HANDLE   #
#################################################################################################

def creatingLabels(XDevices,XCookies,DictHandle):
    """Build the supervision structures from the device/cookie matrices.

    Returns (Labels, Groups, WhosDevice):
      Labels     - device encoding -> set of cookies sharing its handle
      Groups     - cookie encoding -> set of cookies with the same handle
      WhosDevice - cookie encoding -> set of devices with the same handle
    Column 0 of both matrices is the handle encoding; column 1 the
    device/cookie encoding.  Handle '-1' means "unknown".
    """
    HDC=dict()
    unknown = DictHandle['-1']
    Handles=np.unique(XCookies[:,0])
    # One bucket per known handle, holding its devices and cookies.
    for i in range(len(Handles)):
        if Handles[i] != unknown:
            HDC[Handles[i]]=dict()
            HDC[Handles[i]]['Devices']=set()
            HDC[Handles[i]]['Cookies']=set()

    (NDevices,NDim)=XDevices.shape

    for i in range(NDevices):
        HDC[XDevices[i,0]]['Devices'].add(XDevices[i,1])

    (NCookies,NDim)=XCookies.shape

    for i in range(NCookies):
        if XCookies[i,0] != unknown:
            mdic=HDC.get(XCookies[i,0])
            mdic['Cookies'].add(XCookies[i,1])

    Labels=dict()
    Groups = dict()
    WhosDevice=dict()

    # Flatten the per-handle buckets into the three lookup dictionaries.
    for k,v in HDC.iteritems():
        for dev in v['Devices']:
            Labels[dev]=v['Cookies']
        for coo in v['Cookies']:
            Groups[coo]=v['Cookies']
            WhosDevice[coo]=v['Devices']

    # Cookies with unknown handle form singleton groups.
    for i in range(NCookies):
        if XCookies[i,0] == unknown:
            name=XCookies[i,1]
            setcoo=set()
            setcoo.add(name)
            Groups[name]=setcoo

    return (Labels,Groups,WhosDevice)

############################################################################
# THIS FUNCTION EVALUATES THE F05 SCORE ON THE RESULTS OF A VALIDATION SET #
############################################################################

def calculateF05(Results,Target):
    """Mean F0.5 score of predicted cookie sets vs. target cookie sets.

    Results and Target map device -> set of cookies; both must share keys.
    """
    BetaQ=0.5*0.5

    F05=list()

    for k in Results.keys():
        pos=Results[k]
        tla=Target[k]

        tp=np.float_(len(pos & tla))
        fp=np.float_(len(pos)-tp)
        fn=np.float_(len(tla)-tp)
        p=tp/(tp+fp)
        r=tp/(tp+fn)
        # F-beta with beta^2 = 0.25; zero when precision*recall is zero.
        if p*r>0.0:
            f=(1.0+BetaQ)*p*r/(BetaQ*p+r)
        else:
            f=0.0
        F05.append(f)
    return np.mean(F05)

#################################################
# THIS FUNCTION CREATES THE DATA STRUCTURES TO: #
# FIND THE IP ADDRESSES OF EVERY DEVICE         #
# FIND THE IP ADDRESSES OF EVERY COOKIE         #
# FIND THE DEVICES OF EVERY IP ADDRESS          #
# FIND THE COOKIES OF EVERY IP ADDRESS          #
#################################################

def loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups):
    """Parse the IP file into four lookup structures.

    Returns (IPDev, IPCoo, DeviceIPS, CookieIPS):
      IPDev     - ip -> set of devices seen on it
      IPCoo     - ip -> set of cookies seen on it
      DeviceIPS - device -> {ip: np.array(11) of per-ip + aggregate feats}
      CookieIPS - cookie -> {ip: np.array(11)}
    Afterwards, cookies in the same handle group share each other's IPs.
    """
    DeviceIPS=dict()
    CookieIPS=dict()
    IPDev=defaultdict(set)
    IPCoo=defaultdict(set)

    with open(ipfile) as fp:
        fp.readline()

        for line in fp:
            matchObj = re.match( r'([a-zA-Z0-9_]*),([0-9\-]*),{([(a-zA-Z0-9(),\-_]*)}', line, flags = 0)
            ips = re.findall(r'(\w*,\w*,\w*,\w*,\w*,\w*,\w*)',matchObj.group(3))

            ValIPS=dict()
            for ip in ips:
                Indiv = ip.split(',')
                arr=np.zeros(11)
                # Positions 0-5: the six per-(entity,ip) counters.
                arr[0]=np.float_(Indiv[1])
                arr[1]=np.float_(Indiv[2])
                arr[2]=np.float_(Indiv[3])
                arr[3]=np.float_(Indiv[4])
                arr[4]=np.float_(Indiv[5])
                arr[5]=np.float_(Indiv[6])
                # Positions 6-10: the aggregate features of the ip itself.
                dIP=XIPS[Indiv[0]]
                arr[6]=np.float_(dIP[0])
                arr[7]=np.float_(dIP[1])
                arr[8]=np.float_(dIP[2])
                arr[9]=np.float_(dIP[3])
                arr[10]=np.float_(dIP[4])

                ValIPS[Indiv[0]]=arr

            # Second field '0' marks a device row, otherwise a cookie row.
            if(matchObj.group(2)=='0'):
                Device=DictDevice.get(matchObj.group(1),-1)
                if Device>-1:
                    DeviceIPS[Device]=ValIPS
                    for k in ValIPS.keys():
                        IPDev[k].add(Device)
                else:
                    # Device not in the dictionary: keyed by raw name.
                    DeviceIPS[matchObj.group(1)]=ValIPS
                    for k in ValIPS.keys():
                        IPDev[k].add(matchObj.group(1))
            else:
                Cookie=DictCookie[matchObj.group(1)]
                CookieIPS[Cookie]=ValIPS
                for k in ValIPS.keys():
                    IPCoo[k].add(Cookie)

    # Propagate IPs between cookies that share a handle group.
    for k,v in Groups.iteritems():
        if len(v)>1:
            for cook1 in v:
                for cook2 in v:
                    if cook1 != cook2:
                        d1=CookieIPS[cook1]
                        d2=CookieIPS[cook2]
                        for n1,n2 in d1.iteritems():
                            if n1 not in d2.keys():
                                d2[n1]=n2
                                IPCoo[n1].add(cook2)

    return (IPDev,IPCoo,DeviceIPS,CookieIPS)

################################################################################
# THIS FUNCTION FOR A GIVEN DEVICE CREATES:                                    #
# A SET OF COOKIES WITH KNOWN HANDLE THAT SHARE IP ADDRESSES WITH THE DEVICE   #
# A SET OF COOKIES WITH UNKNOWN HANDLE THAT SHARE IP ADDRESSES WITH THE DEVICE #
################################################################################

def fullCandidates(device,XDevices,XCookies,IPDev,IPCoo,DeviceIPS,DictHandle):
    """Exhaustive candidate search for one device (post-processing step).

    First pass restricts to IPs shared by at most 30 devices; if no known
    candidate appears, a second unrestricted pass runs.  Returns two
    single-key dicts {device: set of cookies} (known / unknown handle).
    """
    CandidatesKnown=dict()
    CandidatesUnknown=dict()

    candidatestotalKnown=set()
    candidatestotalUnknown=set()

    Unknown = DictHandle['-1']

    ips=DeviceIPS[device].keys()

    for ip in ips:
        # Skip very crowded IPs (shared by more than 30 devices).
        if(len(IPDev.get(ip,set()))<=30):
            candidates=IPCoo[ip]
            for candidate in candidates:
                if(XCookies[np.int(candidate),0] != Unknown):
                    candidatestotalKnown.add(candidate)
                else:
                    candidatestotalUnknown.add(candidate)

    if (len(candidatestotalKnown)==0):
        # Fallback: no filtering on IP popularity.
        for ip in ips:
            candidates=IPCoo[ip]
            for candidate in candidates:
                if(XCookies[np.int(candidate),0] != Unknown):
                    candidatestotalKnown.add(candidate)
                else:
                    candidatestotalUnknown.add(candidate)

    CandidatesKnown[device]=candidatestotalKnown
    CandidatesUnknown[device]=candidatestotalUnknown

    return (CandidatesKnown,CandidatesUnknown)

###############################################################################
# THIS FUNCTION CREATES THE INITIAL SELECTION OF CANDIDATES FOR EVERY DEVICE  #
###############################################################################

def selectCandidates(XDevices,XCookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle):
    """Initial candidate cookies per device, with progressive relaxation.

    Tries increasingly permissive IP-popularity thresholds (10/20, then
    25/50, then none, finally also allowing unknown-handle cookies) until
    at least one candidate is found.  Returns {device: set of cookies}.
    """
    devices = np.unique(XDevices[:,1])
    Candidates=dict()

    Unknown=DictHandle['-1']

    for i in range(len(devices)):
        device = devices[i]

        candidatestotal=set()
        ips=DeviceIPS[device].keys()
        for ip in ips:
            if(len(IPDev.get(ip,set()))<=10 and len(IPCoo.get(ip,set()))<=20):
                candidates=IPCoo[ip]
                for candidate in candidates:
                    if(XCookies[np.int(candidate),0] != Unknown):
                        candidatestotal.add(candidate)

        if len(candidatestotal)==0:
            for ip in ips:
                if(len(IPDev.get(ip,set()))<=25 and len(IPCoo.get(ip,set()))<=50):
                    candidates=IPCoo[ip]
                    for candidate in candidates:
                        if(XCookies[np.int(candidate),0] != Unknown):
                            candidatestotal.add(candidate)

        if len(candidatestotal)==0:
            for ip in ips:
                candidates=IPCoo[ip]
                for candidate in candidates:
                    if(XCookies[np.int(candidate),0] != Unknown):
                        candidatestotal.add(candidate)

        if len(candidatestotal)==0:
            # Last resort: accept unknown-handle cookies too.
            for ip in ips:
                candidates=IPCoo[ip]
                for candidate in candidates:
                    candidatestotal.add(candidate)

        Candidates[device]=candidatestotal

    return Candidates

###########################################
# THIS CREATES THE TRAINING OR TEST SET   #
###########################################

def createDataSet(Candidates,XDevice,XCookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProps):
    """Build the feature matrix for every (device, candidate cookie) pair.

    Returns (XTR, OriginalIndex) where XTR has one row per pair and
    OriginalIndex maps device -> {cookie: row index in XTR}.  Row layout:
    9 device columns, 9 cookie columns, 11 scalar overlap features, then
    22 summed per-ip features, their mean, and a 6-element difference.
    """
    OriginalIndex=dict()
    numdifs=0
    numpatterns=0
    for k,v in Candidates.iteritems():
        numpatterns=numpatterns+len(v)

    Added=0
    for k,v in Candidates.iteritems():
        # Columns 2-10 of the device row (skip handle and device id).
        Device=XDevice[XDevice[:,1]==k,np.array([2,3,4,5,6,7,8,9,10])]

        IndivIndex=dict()

        setk=set()
        setk.add(k)
        setdevips=set(DeviceIPS.get(k,dict()).keys())
        setdevpro=set(DevProps.get(k,dict()).keys())

        for coo in v:

            Cookie=XCookies[np.int(coo),np.array([2,3,4,5,6,7,8,9,10])]

            row=np.concatenate((Device,Cookie))

            setcooips=set(CookieIPS.get(coo,dict()).keys())

            PROPS=setdevpro
            mipro=PROPS

            # IPs shared by device and cookie, preferring uncrowded ones.
            IPS=(setdevips & setcooips)
            miips=set()
            for ip in IPS:
                if(len(IPDev.get(ip,set()))<=10 and len(IPCoo.get(ip,set()))<=20):
                    miips.add(ip)
            if len(miips)==0:
                for ip in IPS:
                    miips.add(ip)

            # Other devices that share the cookie's handle.
            OtherDevices=set(WhosDevice.get(coo,set()))-setk

            devp=set()
            devi=set()

            for odev in OtherDevices:
                devp=devp | set(DevProps.get(odev,dict().keys()))
                devi=devi | set(DeviceIPS.get(odev,dict().keys()))

            intersec=np.float_(len(devp & setdevpro))
            interseci=np.float_(len(devi & setdevips))

            if intersec>0:
                intersec=intersec/np.float_(len(setdevpro))

            # NOTE(review): this assigns to intersec, not interseci — looks
            # like a copy-paste slip in the original; preserved as-is.
            if interseci>0:
                intersec=intersec/np.float_(len(setdevips))

            # Scalar overlap features appended one by one.
            row=np.concatenate((row,np.array([np.float_(len(OtherDevices))])))
            row=np.concatenate((row,np.array([np.float_(intersec)])))
            row=np.concatenate((row,np.array([np.float_(interseci)])))

            row=np.concatenate((row,np.array([np.float_(len(IPS))])))
            row=np.concatenate((row,np.array([np.float_(len(setdevips))])))
            row=np.concatenate((row,np.array([np.float_(len(setcooips))])))

            row=np.concatenate((row,np.array([np.float_(len(PROPS))])))
            row=np.concatenate((row,np.array([np.float_(len(setdevpro))])))

            row=np.concatenate((row,np.array([np.float_(len(Groups.get(coo,set())))])))
            row=np.concatenate((row,np.array([np.float_(len(Groups.get(coo,set()) & v))])))

            row=np.concatenate((row,np.array([np.float_(len(miips))])))

            # Sum of the 11 device-side + 11 cookie-side per-ip features.
            iprow=np.zeros(22)
            niprows=0
            for ip in miips:
                iprow=iprow+np.concatenate((DeviceIPS[k][ip].reshape(-1),CookieIPS[coo][ip].reshape(-1)))
                niprows=niprows+1

            if niprows>0:
                meaniprows=iprow/np.float_(niprows)
            else:
                meaniprows=iprow

            row=np.concatenate((row.reshape(-1),iprow.reshape(-1)))
            row=np.concatenate((row.reshape(-1),meaniprows.reshape(-1)))
            row=np.concatenate((row.reshape(-1),(iprow[0:6]-iprow[11:-5]).reshape(-1)))

            # Allocate the matrix lazily, once the row width is known.
            if Added==0:
                XTR=np.zeros((numpatterns,len(row)))

            IndivIndex[coo]=Added

            XTR[Added,:]=row

            Added=Added+1
        OriginalIndex[k]=IndivIndex
    return (XTR,OriginalIndex)

#####################################################
# THIS CREATES THE LABELS FOR SUPERVISED LEARNING   #
#####################################################

def createTrainingLabels(Candidates,Labels):
    """Return a 0/1 vector aligned with createDataSet's row order.

    A row is 1.0 when the candidate cookie truly belongs to the device's
    handle (per Labels).  Relies on dict iteration order matching the
    order used in createDataSet for the same Candidates dict.
    """
    numpatterns=0

    for k,v in Candidates.iteritems():
        numpatterns=numpatterns+len(v)

    YTR=np.zeros(numpatterns)

    Added=0
    for k,v in Candidates.iteritems():
        for coo in v:
            if(coo in Labels[k]):
                YTR[Added]=1.0
            Added=Added+1

    return YTR


######################################################
# THIS FUNCTION SELECTS THE COOKIES FOR EVERY DEVICE #
# GIVEN THE PREDICTIONS OF THE CLASSIFIER            #
######################################################

def bestSelection(predictions, OriginalIndex, values,Groups):
    """Pick the final cookie set per device from per-row scores.

    Takes the top-scoring cookie's whole handle group, then, when the top
    score is below 0.9, admits further groups whose scores clear
    thresholds relative to the best score.  Returns ({device: cookies},
    {device: best score}).
    """
    result=dict()

    threshold=dict()

    for k,v in OriginalIndex.iteritems():

        cook=set()
        maxval=0.0
        cookies=v.keys()

        scores=np.zeros(len(cookies))

        for i in range(len(cookies)):
            scores[i]=predictions[v[cookies[i]]]

        # Candidate indices sorted by descending score.
        Orden=sorted(range(len(scores)),key=lambda x:-scores[x])

        if len(cookies)>0:
            if Groups.get(cookies[Orden[0]],-100) != -100:
                maxval=scores[Orden[0]]
                cook= (cook | Groups[cookies[Orden[0]]])

        if (maxval<0.9):
            for i in range(len(values)):
                if (i<= len(cook)):
                    # NOTE(review): the extracted source lost text here
                    # (original lines 668-671, likely a bounds check and the
                    # tam1/tam2 group-size assignments eaten between '<' and
                    # '>').  The remnant below is preserved verbatim; tam1
                    # and tam2 are therefore undefined in this view —
                    # recover the original file before editing this branch.
                    if (i1 & tam2==1):
                        if(scores[Orden[i]]>maxval*(values[i]-0.15)):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))
                    elif (tam1>1 & tam2>1):
                        if(scores[Orden[i]]>maxval*(values[i]+0.1)):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))
                    elif (tam1==1 & tam2==1):
                        if(scores[Orden[i]]>maxval*(values[i])):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))

        result[k]=cook
        threshold[k]=maxval
    return (result,threshold)

#####################################################
# THIS FUNCTION TRAINS THE CLASSIFIER USING XGBOOST #
#####################################################

def trainXGBoost(xtr,ytr,rounds,eta,xtst,ytst):
    """Train a binary logistic xgboost booster and return it.

    xtst/ytst are only used as the evaluation watchlist.
    """
    xgmat = xgb.DMatrix( xtr, label=ytr)
    xgmat2 = xgb.DMatrix( xtst, label=ytst)
    param = {}
    param['eta'] = eta
    param['max_depth'] = 10
    param['subsample'] = 1.0
    param['nthread'] = 12
    param['min_child_weight']=4
    param['gamma']=5.0
    param['colsample_bytree']=1.0
    param['silent']=1
    param['objective'] = 'binary:logistic'
    param['eval_metric']='error'
    watchlist = [ (xgmat,'train') ,(xgmat2,'test')]
    num_round = rounds
    bst = xgb.train( param, xgmat, num_round, watchlist );
    return bst

#######################################
# THIS FUNCTION MAKES THE PREDICTIONS #
#######################################

def predictXGBoost(X,bst):
    """Return bst's probability predictions for feature matrix X."""
    xgmat = xgb.DMatrix( X)
    return bst.predict( xgmat )

#########################################################################
# THIS FUNCTION TRAINS THE ALGORITHM USING 8 BAGGERS AND AVERAGING THEM #
#########################################################################

def FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain,Groups,Labels):
    """8-fold bagged training: out-of-fold validation scores + averaged test scores.

    Folds are taken over devices (not rows); each fold's booster predicts
    its held-out rows and the whole test set.  Returns (resultadosVal,
    resultadosTST, OriginalIndexTR, OriginalIndexTST, classifiers) where
    classifiers is a list of (booster, train devices, test devices).
    """
    NFOLDS=8

    skf = sklearn.cross_validation.KFold(len(OriginalIndexTR.keys()),n_folds=NFOLDS,random_state=0)

    resultadosVal=np.zeros(len(YTR))

    (tamTST,dTST)=XTST.shape
    resultadosTST=np.zeros(tamTST)

    classifiers=list()

    iteration=0
    for (train,test) in skf:

        iteration=iteration+1
        Originaltmp=dict()
        print "Training Bagger ",iteration, "of", NFOLDS

        trainind=list()
        testind=list()
        traindev=list()
        testdev=list()

        # Map fold indices (over devices) to the row indices of XTR.
        for i in train:
            devtr=DevicesTrain[i,1]
            traindev.append(devtr)
            trainind.extend(OriginalIndexTR[devtr].values())

        for i in test:
            devtr=DevicesTrain[i,1]
            testdev.append(devtr)
            testind.extend(OriginalIndexTR[devtr].values())
            Originaltmp[devtr]=OriginalIndexTR[devtr]

        trainind=np.array(trainind)
        testind=np.array(testind)

        XvalTR=XTR[trainind,:]
        XvalTST=XTR[testind,:]

        YvalTR=YTR[trainind]
        YvalTST=YTR[testind]

        bst=trainXGBoost(XvalTR,YvalTR,200,0.10,XvalTST,YvalTST)

        classifiers.append((bst,traindev,testdev))

        pTT=predictXGBoost(XvalTR,bst)
        pTR=predictXGBoost(XvalTST,bst)

        # Out-of-fold predictions accumulate into the validation vector.
        resultadosVal[testind]=pTR

        (validat,thTR)=bestSelection(resultadosVal, Originaltmp, np.array([1.0]),Groups)

        pTST=predictXGBoost(XTST,bst)

        resultadosTST=resultadosTST+pTST

    # Average the NFOLDS test predictions.
    resultadosTST=resultadosTST/np.float_(NFOLDS)
    return(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)

###############################################################################################
# THIS FUNCTION LOOKS FOR DEVICES WHOSE BEST CANDIDATE SCORES LESS THAN 0.05,
# CREATES A NEW SET OF CANDIDATES CONTAINING
# EVERY COOKIE THAT SHARES AN IP ADDRESS WITH HIM,                                            #
# SCORES THEM WITH XGBOOST AND SELECT THE CANDIDATES FOR THE SUBMISSION                       #
###############################################################################################

def PostAnalysisTrain(validat,thTR,classifiers,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle,Labels):
    """Re-score low-confidence devices (best score < 0.05) on the train side.

    For each such device it rebuilds candidates via fullCandidates, scores
    the known-handle and unknown-handle sets with the fold classifier that
    held the device out, and replaces the selection when the new scores
    clearly win.  Mutates validat and thTR in place.

    NOTE(review): the extracted source is truncated inside this function —
    original lines 840-919 (its tail plus, presumably, saveModel/loadModel/
    Predict helpers referenced by predict.py) were lost between '<' and '>'
    during extraction.  Recover the original file before modifying.
    """
    itn=0
    for k,v in validat.iteritems():
        itn=itn+1
        if thTR[k]<0.05:
            (fcandK,fcandU)=fullCandidates(k,DevicesTrain,Cookies,IPDev,IPCoo,DeviceIPS,DictHandle)

            # Known-handle candidates, scored by the fold that held k out.
            validatTHK=dict()
            thTHK=dict()
            if(len(fcandK[k])>0):
                (XTHK,OriginalIndexTHK)=createDataSet(fcandK,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
                YTHK=createTrainingLabels(fcandK,Labels)
                estimK=np.zeros(len(YTHK))

                for (classifier,traindev,testdev) in classifiers:
                    if k in testdev:
                        estimK=predictXGBoost(XTHK,classifier)

                (validatTHK,thTHK)=bestSelection(estimK, OriginalIndexTHK, np.array([1.0,0.9]),Groups)

            # Unknown-handle candidates, same treatment.
            validatTHU=dict()
            thTHU=dict()
            if(len(fcandU[k])>0):
                (XTHU,OriginalIndexTHU)=createDataSet(fcandU,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
                YTHU=createTrainingLabels(fcandU,Labels)
                estimU=np.zeros(len(YTHU))

                for (classifier,traindev,testdev) in classifiers:
                    if k in testdev:
                        estimU=predictXGBoost(XTHU,classifier)

                (validatTHU,thTHU)=bestSelection(estimU, OriginalIndexTHU, np.array([1.0,0.9]),Groups)

            if len(validatTHK)>0:
                if len(validatTHU)>0:
                    # Unknown-handle selection wins only with a clear margin.
                    if(thTHU[k]>(thTHK[k]+0.7)):
                        validat[k]=validatTHU[k]
                        thTR[k]=thTHU[k]
                    else:
                        if thTR[k]<=0.025:
                            validat[k]=validatTHK[k]
                            thTR[k]=thTHK[k]
                else:
                    # NOTE(review): the source breaks off here — the line
                    # below is a corrupted remnant ("if thTR[k]+0.3<..." with
                    # the comparison eaten) and original lines 840-919 are
                    # missing entirely.  Preserved verbatim:
                    if thTR[k]+0.30:
                        pass  # NOTE(review): placeholder for the lost body.

# NOTE(review): the following statements are the orphaned tail of a function
# whose definition (def line, docstring, setup of probCandidates/scores/
# cookies/Orden) was lost in the extraction gap above.  From its return value
# it selected "probable" candidate groups for semi-supervised learning.
# Preserved verbatim for reconstruction against the original repository:
#
#     if Groups.get(cookies[Orden[0]],-100) != -100:
#         cook= (cook | Groups[cookies[Orden[0]]])
#         ValorMax=scores[Orden[0]]
#
#     Segun=-1
#     Terminado='NO'
#
#     for i in range(len(cookies)):
#         if i>0:
#             if Terminado=='NO':
#                 if (cookies[Orden[i]] not in cook):
#                     Segun=scores[Orden[i]]
#                     Terminado='SI'
#
#     if (Segun<0.05 and ValorMax>0.4):
#         probCandidates[k]=Groups[cookies[Orden[0]]]
#
#     return probCandidates

#########################################
# THIS FUNCTION MERGES THE DICTIONARIES #
# FOR THE SEMI SUPERVISED LEARNING      #
#########################################

def createOtherDevicesDict(dict1,dict2,dict3):
    """Merge three {device: cookies} dicts into {cookie: set of devices}."""
    OtherDevices=defaultdict(set)
    for k,v in dict1.iteritems():
        for cookie in v:
            OtherDevices[cookie].add(k)
    for k,v in dict2.iteritems():
        for cookie in v:
            OtherDevices[cookie].add(k)
    for k,v in dict3.iteritems():
        for cookie in v:
            OtherDevices[cookie].add(k)

    return OtherDevices


######################################################
# THIS FUNCTION SAVE THE FINAL PREDICTIONS IN A FILE #
######################################################

def writeSolution(file,selected,DeviceList,CookieList):
    """Write the submission csv: one row per device, cookies space-joined.

    Devices with no selected cookie get the literal 'id_10' placeholder.
    """
    header=list()
    header.append('device_id')
    header.append('cookie_id')

    with open(file, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(header)

        for k,v in selected.iteritems():
            row=list()
            items=list()
            # Translate numeric encodings back to the original identifiers.
            row.append(DeviceList[np.int(k)])
            for elem in (v):
                items.append(CookieList[np.int(elem)])
            if len(v)==0:
                items.append('id_10')
            row.append(' '.join(items))
            spamwriter.writerow(row)


###############################################################################################
# THIS FUNCTION MAKES THE POST PROCESSING ON A TEST
# IT
# LOOKS FOR DEVICES WHOSE BEST CANDIDATE SCORES LESS THAN 0.05,                               #
# CREATES A NEW SET OF CANDIDATES CONTAINING EVERY COOKIE THAT SHARES AN IP ADDRESS WITH HIM, #
# SCORES THEM WITH XGBOOST AND SELECT THE CANDIDATES FOR THE SUBMISSION                       #
###############################################################################################

def PostAnalysisTest(validatTST,thTST,classifiers,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle):
    """Re-score low-confidence test devices (best score < 0.05).

    Test-side twin of PostAnalysisTrain: candidates are rescored with the
    average of ALL fold classifiers (no held-out fold exists at test time)
    and the selection replaced when the new scores clearly win.  Mutates
    validatTST and thTST in place.

    NOTE(review): the end of this function (final branches and its return)
    was lost in the extraction — recover the original file before editing.
    """
    itn=0
    for k,v in validatTST.iteritems():
        itn=itn+1
        if thTST[k]<0.05:

            (fcandK,fcandU)=fullCandidates(k,DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,DictHandle)

            # Known-handle candidates, averaged over every fold classifier.
            validatTHK=dict()
            thTHK=dict()
            if(len(fcandK[k])>0):

                (XTHK,OriginalIndexTHK)=createDataSet(fcandK,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)

                (tmxK,dmxK)=XTHK.shape
                estimK=np.zeros(tmxK)

                for (classifier,traindev,testdev) in classifiers:
                    estimK=estimK+predictXGBoost(XTHK,classifier)

                estimK=estimK/np.float_(len(classifiers))

                (validatTHK,thTHK)=bestSelection(estimK, OriginalIndexTHK, np.array([1.0,0.90]),Groups)

            # Unknown-handle candidates, same treatment.
            validatTHU=dict()
            thTHU=dict()
            if(len(fcandU[k])>0):
                (XTHU,OriginalIndexTHU)=createDataSet(fcandU,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
                (tmxU,dmxU)=XTHU.shape
                estimU=np.zeros(tmxU)

                for (classifier,traindev,testdev) in classifiers:
                    estimU=estimU+predictXGBoost(XTHU,classifier)

                estimU=estimU/np.float_(len(classifiers))

                (validatTHU,thTHU)=bestSelection(estimU, OriginalIndexTHU, np.array([1.0,0.90]),Groups)

            if len(validatTHK)>0:
                if len(validatTHU)>0:
                    # Unknown-handle selection wins only with a clear margin.
                    if(thTHU[k]>(thTHK[k]+0.7)):
                        validatTST[k]=validatTHU[k]
                        thTST[k]=thTHU[k]
                    else:
                        if thTST[k]<=0.025:
                            validatTST[k]=validatTHK[k]
                            thTST[k]=thTHK[k]
                else:
                    # NOTE(review): source truncated here — the remnant below
                    # lost its comparison operator and everything after it
                    # ("if thTST[k]+0.3<..."), along with this function's
                    # return statement and the first 14 lines of predict.py.
                    # Preserved as a comment because it is not parseable:
                    #     if thTST[k]+0.3
                    pass


# ============================= /predict.py ===================================
# NOTE(review): the extracted dump lost predict.py's license header (its
# lines 1-14, GPL boilerplate like CookieLibrary.py's) in the same gap that
# swallowed the end of PostAnalysisTest above.

import csv
import os
import re
import numpy as np
import inspect
import sys
import sklearn
from sklearn import cross_validation
import pickle


from Variables import *
from VariablesTST import *


# pathXGBoost comes from Variables/VariablesTST (star imports above).
code_path = os.path.join(pathXGBoost)
sys.path.append(code_path)
import xgboost as xgb
from CookieLibrary import *

#################################################################################
# PARSING THE FILES PROVIDED FOR THE CHALLENGE AND CREATING THE DATA STRUCTURES #
# THAT THE ALGORITHM NEEDS                                                      #
#################################################################################

# Some features in the files that describe the cookies and the devices are categorical features in test mode.
# For example, the countries are like: 'country_147', or the handle is like 'handle_1301101'.
# This function creates dictionaries to transform that text into a numerical value to load them in a numpy matrix.
print('Loading Dictionaries')
# Some features describing cookies and devices are categorical text values
# (e.g. 'country_147', 'handle_1301101').  GetIdentifiers collects every
# distinct identifier from the train/test/cookie files, and list2Dict maps
# each one to its list index so the values fit in numpy matrices.
(DeviceList, CookieList, HandleList, DevTypeList, DevOsList,ComputerOsList,ComputerVList,CountryList,annC1List,annC2List)=GetIdentifiers(trainfile,testfile,cookiefile)

DictHandle = list2Dict(HandleList)
DictDevice = list2Dict(DeviceList)
DictCookie = list2Dict(CookieList)
DictDevType = list2Dict(DevTypeList)
DictDevOs = list2Dict(DevOsList)
DictComputerOs = list2Dict(ComputerOsList)
DictComputerV = list2Dict(ComputerVList)
DictCountry = list2Dict(CountryList)
DictAnnC1 = list2Dict(annC1List)
DictAnnC2 = list2Dict(annC2List)


# Load the device descriptions into a numpy matrix, translating the text
# values into numerical ones with the dictionaries built above.
print('Loading Devices Files')
DevicesTrain = loadDevices(trainfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)

# Load the cookie descriptions into a numpy matrix in the same way.
print('Loading Cookies File')
Cookies = loadCookies(cookiefile,DictHandle,DictCookie,DictComputerOs,DictComputerV,DictCountry,DictAnnC1,DictAnnC2)

# Load the properties of the devices.
print('Loading Properties File')
DevProperties=loadPROPS(propfile,DictDevice,DictCookie)

# From the train information build: the cookies of every device (Labels),
# for every cookie the other cookies sharing its handle (Groups), and for
# every cookie its devices (WhosDevice).
(Labels,Groups,WhosDevice)=creatingLabels(DevicesTrain,Cookies,DictHandle)


# Dictionary keyed by ip address whose value is a numpy array with the IP info.
print('Loading IP Files')
XIPS=loadIPAGG(ipaggfile)

# Four dictionaries: the devices of every ip, the cookies of every ip,
# the ips of every device and the ips of every cookie.
(IPDev,IPCoo,DeviceIPS,CookieIPS)=loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups)


#########################
# LOADING THE TEST FILE #
#########################

print('STEP: Loading test file')
DevicesTest = loadDevices(predictFile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)


###################################
# INITIAL SELECTION OF CANDIDATES #
###################################

# Simple rules select an initial set of candidate cookies for every device.
print('STEP: Initial selection of candidates')
CandidatesTST=selectCandidates(DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)

#####################
# LOADING THE MODEL #
#####################

print('Loading the model')
(classifiers,DictOtherDevices) = loadModel(modelpath)

########################
# CREATING THE DATASET #
########################

# One feature row per candidate (device, cookie) pair.
print('STEP: Creating the dataset')
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)

########################
# USING THE CLASSIFIER #
########################

print('STEP: Using the classifier')
resultadosTST = Predict(XTST,classifiers)

########################
# POST PROCESSING STEP #
########################

print('STEP: Post Processing')
(validatTST,thTST)=bestSelection(resultadosTST, OriginalIndexTST, np.array([1.0,0.9]),Groups)

# Devices whose best candidate scored poorly get a widened candidate set and
# are re-scored (PostAnalysisTest in CookieLibrary).
(validatTST,thTST) = PostAnalysisTest(validatTST,thTST,classifiers,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle)

##########################################
# WRITING THE FINAL SOLUTION IN THE FILE #
##########################################

print('Writing the file with the result')
writeSolution(resultFile,validatTST,DeviceList,CookieList)
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# Copyright (C) 2015 Roberto Diaz Morales
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .


import csv
import os
import re
import numpy as np
import inspect
import sys
import sklearn
from sklearn import cross_validation
import pickle

from Variables import *

# pathXGBoost (defined in Variables) points at the local xgboost build.
code_path = os.path.join(pathXGBoost)
sys.path.append(code_path)
import xgboost as xgb
from CookieLibrary import *

#################################################################################
# PARSING THE FILES PROVIDED FOR THE CHALLENGE AND CREATING THE DATA STRUCTURES #
# THAT THE ALGORITHM NEEDS                                                      #
#################################################################################

# Some features in the files that describe the cookies and the devices are
# categorical text features.  For example, the countries are like
# 'country_147', and the handle is like 'handle_1301101'.
# list2Dict turns each identifier list into a text -> numeric-index mapping
# so the categorical values can be stored in numpy matrices.

print('Loading Dictionaries')
(DeviceList, CookieList, HandleList, DevTypeList, DevOsList,ComputerOsList,ComputerVList,CountryList,annC1List,annC2List)=GetIdentifiers(trainfile,testfile,cookiefile)

DictHandle = list2Dict(HandleList)
DictDevice = list2Dict(DeviceList)
DictCookie = list2Dict(CookieList)
DictDevType = list2Dict(DevTypeList)
DictDevOs = list2Dict(DevOsList)
DictComputerOs = list2Dict(ComputerOsList)
DictComputerV = list2Dict(ComputerVList)
DictCountry = list2Dict(CountryList)
DictAnnC1 = list2Dict(annC1List)
DictAnnC2 = list2Dict(annC2List)


# Load the device descriptions into numpy matrices, translating the text
# values into numerical ones with the dictionaries built above.
print('Loading Devices Files')
DevicesTrain = loadDevices(trainfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)
DevicesTest = loadDevices(testfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)

# Load the cookie descriptions into a numpy matrix in the same way.
print('Loading Cookies File')
Cookies = loadCookies(cookiefile,DictHandle,DictCookie,DictComputerOs,DictComputerV,DictCountry,DictAnnC1,DictAnnC2)

# Load the properties of the devices.
print('Loading Properties File')
DevProperties=loadPROPS(propfile,DictDevice,DictCookie)

# From the train information build: the cookies of every device (Labels),
# for every cookie the other cookies sharing its handle (Groups), and for
# every cookie its devices (WhosDevice).
(Labels,Groups,WhosDevice)=creatingLabels(DevicesTrain,Cookies,DictHandle)


# Dictionary keyed by ip address whose value is a numpy array with the IP info.
print('Loading IP Files')
XIPS=loadIPAGG(ipaggfile)

# Four dictionaries: the devices of every ip, the cookies of every ip,
# the ips of every device and the ips of every cookie.
(IPDev,IPCoo,DeviceIPS,CookieIPS)=loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups)


#################################################################################################
# PROCEDURE WITH THE INITIAL SELECTION OF CANDIDATES (PROCEDURE DESCRIBED IN THE DOCUMENTATION) #
#################################################################################################
print('STEP: Initial selection of candidates')
# Using simple rules we select a set of candidate cookies for every device.
CandidatesTR=selectCandidates(DevicesTrain,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)
CandidatesTST=selectCandidates(DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)


#####################################################
# CREATION OF THE TRAINING AND TEST SET             #
# (THE FEATURES ARE DESCRIBED IN THE DOCUMENTATION) #
#####################################################

print('STEP: Creating the dataset')

# Training and test sets for supervised learning: one feature row per
# candidate (device, cookie) pair.
(XTR,OriginalIndexTR)=createDataSet(CandidatesTR,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
YTR=createTrainingLabels(CandidatesTR,Labels)
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)

######################################
# TRAINING USING BAGGING AND XGBOOST #
######################################

print('STEP: Training Supervised Learning')
(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)=FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain, Groups, Labels)

#############################################################################
# UPDATING THE DATA STRUCTURES DATASETS WITH NEW INFORMATION OF THE RESULTS #
#############################################################################

print('Updating features with semisupervised learning information')
# SECOND LOOP FOR SEMISUPERVISED LEARNING:
# repeat the training procedure after adding the cookies predicted with high
# probability to the known device-ownership information.
uniqueCand=uniqueCandidates(DevicesTest,Cookies,IPCoo,DeviceIPS,DictHandle,Groups)
probCand=mostProbable(resultadosTST, OriginalIndexTST, Groups)
DictOtherDevices=createOtherDevicesDict(Labels,uniqueCand,probCand)


# Rebuild both datasets, now passing DictOtherDevices instead of WhosDevice
# so the semi-supervised information enters the features.
(XTR,OriginalIndexTR)=createDataSet(CandidatesTR,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)

######################################################################################################
# SECOND TRAINING USING XGBOOST AND BAGGING INCLUDING THE NEW INFORMATION (SEMI-SUPERVISED LEARNING) #
######################################################################################################

# It trains using 10 fold CV and the predictions are the average of the
# classifiers of every fold.
print('STEP: Training Semi-Supervised Learning')
(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)=FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain, Groups, Labels)

############################################################
# POST PROCESSING PROCEDURE DESCRIBED IN THE DOCUMENTATION #
############################################################

print('STEP: Post Processing')
# Initial selection of the cookies associated to every device.
(validat,thTR)=bestSelection(resultadosVal, OriginalIndexTR, np.array([1.0,0.9]),Groups)

# Increase the number of candidates of the devices whose best candidate does
# not have a good likelihood.
(validat,thTR) = PostAnalysisTrain(validat,thTR,classifiers,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle,Labels)

F05=calculateF05(validat,Labels)
# Paren-call print so this line parses under Python 3 as well; the %s
# formatting reproduces the exact output of the old Python 2 statement
# form (print "F05 Validation",F05).
print('F05 Validation %s' % F05)

####################
# SAVING THE MODEL #
####################

print('Saving model')
saveModel(modelpath,classifiers,DictOtherDevices)
--------------------------------------------------------------------------------