├── CookieLibrary.py ├── README.md ├── Variables.py ├── VariablesTST.py ├── predict.py └── train.py /CookieLibrary.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015 Roberto Diaz Morales 2 | # 3 | # This program is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU General Public License 14 | # along with this program. If not, see . 15 | 16 | 17 | import csv 18 | import numpy as np 19 | import re 20 | from collections import Counter,defaultdict 21 | import os 22 | import inspect 23 | import sys 24 | import sklearn 25 | from sklearn import cross_validation 26 | import pickle 27 | import xgboost as xgb 28 | 29 | 30 | 31 | ####################################################################################### 32 | # THIS FUNCTION PARSES THE FILES WITH THE INFORMATION ABOUT DEVICES AND COOKIES # 33 | # AND CREATES LISTS WITH THE IDENTIFIERS OF THE CATEGORICAL FEATURES. 
# THE INDEX OF THE LIST WILL BE USED AS THE VALUE OF THE FEATURE IN THE NUMPY MATRICES#
#######################################################################################

def _readCsvRows(csvpath):
    """Yield the data rows of a comma-separated file, skipping the header line.

    Opens in text mode and uses the builtin next() so the code runs under
    Python 3 as well (the old reader.next() / 'rb' mode combination was
    Python-2 only).
    """
    with open(csvpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # discard the header row
        for row in reader:
            yield row


def GetIdentifiers(trainfile, testfile, cookiefile):
    """Collect the distinct values of every categorical column.

    Parses the train/test device files and the cookie file and returns ten
    lists of unique identifiers (device, cookie, handle, device type,
    device os, computer os, computer version, country, anonymous_c1,
    anonymous_c2).  The position of each value in its list is later used as
    its numeric encoding (see list2Dict).
    """
    DeviceList = list()
    CookieList = list()
    HandleList = list()
    DevTypeList = list()
    DevOsList = list()
    ComputerOsList = list()
    ComputerVList = list()
    CountryList = list()
    annC1List = list()
    annC2List = list()

    # The train and test device files share the same column layout, so one
    # loop replaces the two duplicated parsing blocks of the original code.
    for devfile in (trainfile, testfile):
        for row in _readCsvRows(devfile):
            HandleList.append(row[0])
            DeviceList.append(row[1])
            DevTypeList.append(row[2])
            DevOsList.append(row[3])
            CountryList.append(row[4])
            annC1List.append(row[6])
            annC2List.append(row[7])

    # Cookie file: same first/fifth columns, but columns 2-3 are the
    # computer os/version instead of the device type/os.
    for row in _readCsvRows(cookiefile):
        HandleList.append(row[0])
        CookieList.append(row[1])
        ComputerOsList.append(row[2])
        ComputerVList.append(row[3])
        CountryList.append(row[4])
        annC1List.append(row[6])
        annC2List.append(row[7])

    # Deduplicate once at the end instead of after every file (the repeated
    # list(set(...)) passes of the original were redundant work).
    DeviceList = list(set(DeviceList))
    CookieList = list(set(CookieList))
    HandleList = list(set(HandleList))
    DevTypeList = list(set(DevTypeList))
    DevOsList = list(set(DevOsList))
    ComputerOsList = list(set(ComputerOsList))
    ComputerVList = list(set(ComputerVList))
    CountryList = list(set(CountryList))
    annC1List = list(set(annC1List))
    annC2List = list(set(annC2List))

    return (DeviceList, CookieList, HandleList, DevTypeList, DevOsList, ComputerOsList, ComputerVList, CountryList, annC1List, annC2List)


############################################################################################################
# THIS FUNCTION RECEIVES A LIST AND CREATES A DICTIONARY TO GET THE INDEX WHEN THE VALUE IS GIVEN AS A KEY #
############################################################################################################

def list2Dict(lista):
    """Return a dict mapping each element of *lista* to its position."""
    return dict((value, index) for index, value in enumerate(lista))


##############################################################################
# THIS FUNCTION CREATES A NUMPY MATRIX WITH THE INFORMATION OF A DEVICE FILE #
##############################################################################

def loadDevices(trainfile, DictHandle, DictDevice, DictDevType, DictDevOs, DictCountry, DictAnnC1, DictAnnC2):
    """Load a device csv into an (n_rows, 11) float matrix.

    Categorical columns are encoded through the supplied dictionaries;
    columns 5 and 8-10 are already numeric.  The original implementation
    read the file twice (once only to count rows); reading the rows into a
    list once avoids that.
    """
    rows = list(_readCsvRows(trainfile))
    XDevices = np.zeros((len(rows), 11))

    for i, row in enumerate(rows):
        XDevices[i, 0] = DictHandle[row[0]]
        XDevices[i, 1] = DictDevice[row[1]]
        XDevices[i, 2] = DictDevType[row[2]]
        XDevices[i, 3] = DictDevOs[row[3]]
        XDevices[i, 4] = DictCountry[row[4]]
        XDevices[i, 5] = float(row[5])
        XDevices[i, 6] = DictAnnC1[row[6]]
        XDevices[i, 7] = DictAnnC2[row[7]]
        XDevices[i, 8] = float(row[8])
        XDevices[i, 9] = float(row[9])
        XDevices[i, 10] = float(row[10])

    return XDevices


##############################################################################
# THIS FUNCTION CREATES A NUMPY MATRIX WITH THE INFORMATION OF A COOKIE FILE #
##############################################################################

def loadCookies(cookiefile, DictHandle, DictCookie, DictComputerOs, DictComputerV, DictCountry, DictAnnC1, DictAnnC2):
    """Load the cookie csv into a matrix indexed by the cookie's encoding.

    Row *i* of the result describes the cookie whose DictCookie value is
    *i*, so the matrix is sized by the largest encoding rather than by the
    file's row count.  np.int/np.float_ (removed in modern NumPy) are
    replaced by the builtin equivalents.
    """
    maxindex = int(max(DictCookie.values()))

    XCookies = np.zeros((maxindex + 1, 11))

    for row in _readCsvRows(cookiefile):
        fila = int(DictCookie[row[1]])
        XCookies[fila, 0] = DictHandle[row[0]]
        XCookies[fila, 1] = DictCookie[row[1]]
        XCookies[fila, 2] = DictComputerOs[row[2]]
        XCookies[fila, 3] = DictComputerV[row[3]]
        XCookies[fila, 4] = DictCountry[row[4]]
        XCookies[fila, 5] = float(row[5])
        XCookies[fila, 6] = DictAnnC1[row[6]]
        XCookies[fila, 7] = DictAnnC2[row[7]]
        XCookies[fila, 8] = float(row[8])
        XCookies[fila, 9] = float(row[9])
        XCookies[fila, 10] = float(row[10])

    return XCookies


####################################################################################################
# THIS FUNCTION CREATES A DICTIONARY WHERE
# THE KEYS ARE THE IP ADDRESSES OF THE IP AGGREGATED FILE AND THE VALUE A NUMPY ARRAY             #
# WITH ITS INFORMATION.                                                                           #
####################################################################################################

def loadIPAGG(ipaggfile):
    """Parse the aggregated IP csv into a dict: ip address -> np.array(5).

    Columns 1..5 of each row are the numeric aggregate features of the IP.
    Uses text mode and the builtin next() so it also runs on Python 3.
    """
    XIPS = dict()

    with open(ipaggfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # discard header
        for row in reader:
            XIPS[row[0]] = np.array([float(v) for v in row[1:6]])

    return XIPS

#####################################################################
# THIS FUNCTION CREATES A DICTIONARY WHERE THE KEYS ARE THE DEVICES #
# AND THE VALUE DICTIONARY OF THE PROPERTIES AND ITS INFORMATION    #
#####################################################################

def loadPROPS(fileprops, DictDevice, DictCookie):
    """Parse the properties file into {device encoding: {property: value}}.

    Only lines whose second field is '0' (device rows) are kept, and only
    when the device name is present in DictDevice.  DictCookie is accepted
    for interface compatibility but unused.  A None-guard is added so a
    malformed line is skipped instead of raising AttributeError on
    matchObj.group(...).
    """
    DevProps = dict()

    with open(fileprops) as fp:
        fp.readline()  # discard header

        for line in fp:
            matchObj = re.match(r'([a-zA-Z0-9_]*),([0-9\-]*),{([(a-zA-Z0-9.(),\-_]*)}', line, flags=0)
            if matchObj is None:
                # Defensive: skip lines the expected format does not cover.
                continue

            if matchObj.group(2) == '0':
                props = re.findall(r'\((.*?)\)', matchObj.group(3))
                ValProps = dict()
                for prop in props:
                    propV = prop.split(',')
                    ValProps[propV[0]] = float(propV[1])
                Devic = DictDevice.get(matchObj.group(1), -1)
                if Devic > -1:
                    DevProps[Devic] = ValProps

    return DevProps

#################################################################################################
# THIS FUNCTION CREATES:                                                                        #
# A DICTIONARY WHERE THE KEYS ARE THE DEVICES OF THE TRAINING SET AND THE VALUES THEIR COOKIES  #
# A DICTIONARY WHERE THE KEYS ARE THE COOKIES AND THE VALUES OTHER COOKIES WITH THE SAME HANDLE #
# A DICTIONARY WHERE THE KEYS ARE THE COOKIES AND THE VALUES THE DEVICES WITH THE SAME HANDLE   #
#################################################################################################

def creatingLabels(XDevices,XCookies,DictHandle):
    """Build the supervision structures from the device/cookie matrices.

    Returns (Labels, Groups, WhosDevice):
      Labels     - device encoding -> set of cookies sharing its handle
      Groups     - cookie encoding -> set of cookies with the same handle
      WhosDevice - cookie encoding -> set of devices with the same handle
    Column 0 of both matrices is the handle encoding; column 1 the
    device/cookie encoding.  Handle '-1' means "unknown".
    """
    HDC=dict()
    unknown = DictHandle['-1']
    Handles=np.unique(XCookies[:,0])
    # One bucket per known handle, holding its devices and cookies.
    for i in range(len(Handles)):
        if Handles[i] != unknown:
            HDC[Handles[i]]=dict()
            HDC[Handles[i]]['Devices']=set()
            HDC[Handles[i]]['Cookies']=set()

    (NDevices,NDim)=XDevices.shape

    for i in range(NDevices):
        HDC[XDevices[i,0]]['Devices'].add(XDevices[i,1])

    (NCookies,NDim)=XCookies.shape

    for i in range(NCookies):
        if XCookies[i,0] != unknown:
            mdic=HDC.get(XCookies[i,0])
            mdic['Cookies'].add(XCookies[i,1])

    Labels=dict()
    Groups = dict()
    WhosDevice=dict()

    # Flatten the per-handle buckets into the three lookup dictionaries.
    for k,v in HDC.iteritems():
        for dev in v['Devices']:
            Labels[dev]=v['Cookies']
        for coo in v['Cookies']:
            Groups[coo]=v['Cookies']
            WhosDevice[coo]=v['Devices']

    # Cookies with unknown handle form singleton groups.
    for i in range(NCookies):
        if XCookies[i,0] == unknown:
            name=XCookies[i,1]
            setcoo=set()
            setcoo.add(name)
            Groups[name]=setcoo

    return (Labels,Groups,WhosDevice)

############################################################################
# THIS FUNCTION EVALUATES THE F05 SCORE ON THE RESULTS OF A VALIDATION SET #
############################################################################

def calculateF05(Results,Target):
    """Mean F0.5 score of predicted cookie sets vs. target cookie sets.

    Results and Target map device -> set of cookies; both must share keys.
    """
    BetaQ=0.5*0.5

    F05=list()

    for k in Results.keys():
        pos=Results[k]
        tla=Target[k]

        tp=np.float_(len(pos & tla))
        fp=np.float_(len(pos)-tp)
        fn=np.float_(len(tla)-tp)
        p=tp/(tp+fp)
        r=tp/(tp+fn)
        # F-beta with beta^2 = 0.25; zero when precision*recall is zero.
        if p*r>0.0:
            f=(1.0+BetaQ)*p*r/(BetaQ*p+r)
        else:
            f=0.0
        F05.append(f)
    return np.mean(F05)

#################################################
# THIS FUNCTION CREATES THE DATA STRUCTURES TO: #
# FIND THE IP ADDRESSES OF EVERY DEVICE         #
# FIND THE IP ADDRESSES OF EVERY COOKIE         #
# FIND THE DEVICES OF EVERY IP ADDRESS          #
# FIND THE COOKIES OF EVERY IP ADDRESS          #
#################################################

def loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups):
    """Parse the IP file into four lookup structures.

    Returns (IPDev, IPCoo, DeviceIPS, CookieIPS):
      IPDev     - ip -> set of devices seen on it
      IPCoo     - ip -> set of cookies seen on it
      DeviceIPS - device -> {ip: np.array(11) of per-ip + aggregate feats}
      CookieIPS - cookie -> {ip: np.array(11)}
    Afterwards, cookies in the same handle group share each other's IPs.
    """
    DeviceIPS=dict()
    CookieIPS=dict()
    IPDev=defaultdict(set)
    IPCoo=defaultdict(set)

    with open(ipfile) as fp:
        fp.readline()

        for line in fp:
            matchObj = re.match( r'([a-zA-Z0-9_]*),([0-9\-]*),{([(a-zA-Z0-9(),\-_]*)}', line, flags = 0)
            ips = re.findall(r'(\w*,\w*,\w*,\w*,\w*,\w*,\w*)',matchObj.group(3))

            ValIPS=dict()
            for ip in ips:
                Indiv = ip.split(',')
                arr=np.zeros(11)
                # Positions 0-5: the six per-(entity,ip) counters.
                arr[0]=np.float_(Indiv[1])
                arr[1]=np.float_(Indiv[2])
                arr[2]=np.float_(Indiv[3])
                arr[3]=np.float_(Indiv[4])
                arr[4]=np.float_(Indiv[5])
                arr[5]=np.float_(Indiv[6])
                # Positions 6-10: the aggregate features of the ip itself.
                dIP=XIPS[Indiv[0]]
                arr[6]=np.float_(dIP[0])
                arr[7]=np.float_(dIP[1])
                arr[8]=np.float_(dIP[2])
                arr[9]=np.float_(dIP[3])
                arr[10]=np.float_(dIP[4])

                ValIPS[Indiv[0]]=arr

            # Second field '0' marks a device row, otherwise a cookie row.
            if(matchObj.group(2)=='0'):
                Device=DictDevice.get(matchObj.group(1),-1)
                if Device>-1:
                    DeviceIPS[Device]=ValIPS
                    for k in ValIPS.keys():
                        IPDev[k].add(Device)
                else:
                    # Device not in the dictionary: keyed by raw name.
                    DeviceIPS[matchObj.group(1)]=ValIPS
                    for k in ValIPS.keys():
                        IPDev[k].add(matchObj.group(1))
            else:
                Cookie=DictCookie[matchObj.group(1)]
                CookieIPS[Cookie]=ValIPS
                for k in ValIPS.keys():
                    IPCoo[k].add(Cookie)

    # Propagate IPs between cookies that share a handle group.
    for k,v in Groups.iteritems():
        if len(v)>1:
            for cook1 in v:
                for cook2 in v:
                    if cook1 != cook2:
                        d1=CookieIPS[cook1]
                        d2=CookieIPS[cook2]
                        for n1,n2 in d1.iteritems():
                            if n1 not in d2.keys():
                                d2[n1]=n2
                                IPCoo[n1].add(cook2)

    return (IPDev,IPCoo,DeviceIPS,CookieIPS)

################################################################################
# THIS FUNCTION FOR A GIVEN DEVICE CREATES:                                    #
# A SET OF COOKIES WITH KNOWN HANDLE THAT SHARE IP ADDRESSES WITH THE DEVICE   #
# A SET OF COOKIES WITH UNKNOWN HANDLE THAT SHARE IP ADDRESSES WITH THE DEVICE #
################################################################################

def fullCandidates(device,XDevices,XCookies,IPDev,IPCoo,DeviceIPS,DictHandle):
    """Exhaustive candidate search for one device (post-processing step).

    First pass restricts to IPs shared by at most 30 devices; if no known
    candidate appears, a second unrestricted pass runs.  Returns two
    single-key dicts {device: set of cookies} (known / unknown handle).
    """
    CandidatesKnown=dict()
    CandidatesUnknown=dict()

    candidatestotalKnown=set()
    candidatestotalUnknown=set()

    Unknown = DictHandle['-1']

    ips=DeviceIPS[device].keys()

    for ip in ips:
        # Skip very crowded IPs (shared by more than 30 devices).
        if(len(IPDev.get(ip,set()))<=30):
            candidates=IPCoo[ip]
            for candidate in candidates:
                if(XCookies[np.int(candidate),0] != Unknown):
                    candidatestotalKnown.add(candidate)
                else:
                    candidatestotalUnknown.add(candidate)

    if (len(candidatestotalKnown)==0):
        # Fallback: no filtering on IP popularity.
        for ip in ips:
            candidates=IPCoo[ip]
            for candidate in candidates:
                if(XCookies[np.int(candidate),0] != Unknown):
                    candidatestotalKnown.add(candidate)
                else:
                    candidatestotalUnknown.add(candidate)

    CandidatesKnown[device]=candidatestotalKnown
    CandidatesUnknown[device]=candidatestotalUnknown

    return (CandidatesKnown,CandidatesUnknown)

###############################################################################
# THIS FUNCTION CREATES THE INITIAL SELECTION OF CANDIDATES FOR EVERY DEVICE  #
###############################################################################

def selectCandidates(XDevices,XCookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle):
    """Initial candidate cookies per device, with progressive relaxation.

    Tries increasingly permissive IP-popularity thresholds (10/20, then
    25/50, then none, finally also allowing unknown-handle cookies) until
    at least one candidate is found.  Returns {device: set of cookies}.
    """
    devices = np.unique(XDevices[:,1])
    Candidates=dict()

    Unknown=DictHandle['-1']

    for i in range(len(devices)):
        device = devices[i]

        candidatestotal=set()
        ips=DeviceIPS[device].keys()
        for ip in ips:
            if(len(IPDev.get(ip,set()))<=10 and len(IPCoo.get(ip,set()))<=20):
                candidates=IPCoo[ip]
                for candidate in candidates:
                    if(XCookies[np.int(candidate),0] != Unknown):
                        candidatestotal.add(candidate)

        if len(candidatestotal)==0:
            for ip in ips:
                if(len(IPDev.get(ip,set()))<=25 and len(IPCoo.get(ip,set()))<=50):
                    candidates=IPCoo[ip]
                    for candidate in candidates:
                        if(XCookies[np.int(candidate),0] != Unknown):
                            candidatestotal.add(candidate)

        if len(candidatestotal)==0:
            for ip in ips:
                candidates=IPCoo[ip]
                for candidate in candidates:
                    if(XCookies[np.int(candidate),0] != Unknown):
                        candidatestotal.add(candidate)

        if len(candidatestotal)==0:
            # Last resort: accept unknown-handle cookies too.
            for ip in ips:
                candidates=IPCoo[ip]
                for candidate in candidates:
                    candidatestotal.add(candidate)

        Candidates[device]=candidatestotal

    return Candidates

###########################################
# THIS CREATES THE TRAINING OR TEST SET   #
###########################################

def createDataSet(Candidates,XDevice,XCookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProps):
    """Build the feature matrix for every (device, candidate cookie) pair.

    Returns (XTR, OriginalIndex) where XTR has one row per pair and
    OriginalIndex maps device -> {cookie: row index in XTR}.  Row layout:
    9 device columns, 9 cookie columns, 11 scalar overlap features, then
    22 summed per-ip features, their mean, and a 6-element difference.
    """
    OriginalIndex=dict()
    numdifs=0
    numpatterns=0
    for k,v in Candidates.iteritems():
        numpatterns=numpatterns+len(v)

    Added=0
    for k,v in Candidates.iteritems():
        # Columns 2-10 of the device row (skip handle and device id).
        Device=XDevice[XDevice[:,1]==k,np.array([2,3,4,5,6,7,8,9,10])]

        IndivIndex=dict()

        setk=set()
        setk.add(k)
        setdevips=set(DeviceIPS.get(k,dict()).keys())
        setdevpro=set(DevProps.get(k,dict()).keys())

        for coo in v:

            Cookie=XCookies[np.int(coo),np.array([2,3,4,5,6,7,8,9,10])]

            row=np.concatenate((Device,Cookie))

            setcooips=set(CookieIPS.get(coo,dict()).keys())

            PROPS=setdevpro
            mipro=PROPS

            # IPs shared by device and cookie, preferring uncrowded ones.
            IPS=(setdevips & setcooips)
            miips=set()
            for ip in IPS:
                if(len(IPDev.get(ip,set()))<=10 and len(IPCoo.get(ip,set()))<=20):
                    miips.add(ip)
            if len(miips)==0:
                for ip in IPS:
                    miips.add(ip)

            # Other devices that share the cookie's handle.
            OtherDevices=set(WhosDevice.get(coo,set()))-setk

            devp=set()
            devi=set()

            for odev in OtherDevices:
                devp=devp | set(DevProps.get(odev,dict().keys()))
                devi=devi | set(DeviceIPS.get(odev,dict().keys()))

            intersec=np.float_(len(devp & setdevpro))
            interseci=np.float_(len(devi & setdevips))

            if intersec>0:
                intersec=intersec/np.float_(len(setdevpro))

            # NOTE(review): this assigns to intersec, not interseci — looks
            # like a copy-paste slip in the original; preserved as-is.
            if interseci>0:
                intersec=intersec/np.float_(len(setdevips))

            # Scalar overlap features appended one by one.
            row=np.concatenate((row,np.array([np.float_(len(OtherDevices))])))
            row=np.concatenate((row,np.array([np.float_(intersec)])))
            row=np.concatenate((row,np.array([np.float_(interseci)])))

            row=np.concatenate((row,np.array([np.float_(len(IPS))])))
            row=np.concatenate((row,np.array([np.float_(len(setdevips))])))
            row=np.concatenate((row,np.array([np.float_(len(setcooips))])))

            row=np.concatenate((row,np.array([np.float_(len(PROPS))])))
            row=np.concatenate((row,np.array([np.float_(len(setdevpro))])))

            row=np.concatenate((row,np.array([np.float_(len(Groups.get(coo,set())))])))
            row=np.concatenate((row,np.array([np.float_(len(Groups.get(coo,set()) & v))])))

            row=np.concatenate((row,np.array([np.float_(len(miips))])))

            # Sum of the 11 device-side + 11 cookie-side per-ip features.
            iprow=np.zeros(22)
            niprows=0
            for ip in miips:
                iprow=iprow+np.concatenate((DeviceIPS[k][ip].reshape(-1),CookieIPS[coo][ip].reshape(-1)))
                niprows=niprows+1

            if niprows>0:
                meaniprows=iprow/np.float_(niprows)
            else:
                meaniprows=iprow

            row=np.concatenate((row.reshape(-1),iprow.reshape(-1)))
            row=np.concatenate((row.reshape(-1),meaniprows.reshape(-1)))
            row=np.concatenate((row.reshape(-1),(iprow[0:6]-iprow[11:-5]).reshape(-1)))

            # Allocate the matrix lazily, once the row width is known.
            if Added==0:
                XTR=np.zeros((numpatterns,len(row)))

            IndivIndex[coo]=Added

            XTR[Added,:]=row

            Added=Added+1
        OriginalIndex[k]=IndivIndex
    return (XTR,OriginalIndex)

#####################################################
# THIS CREATES THE LABELS FOR SUPERVISED LEARNING   #
#####################################################

def createTrainingLabels(Candidates,Labels):
    """Return a 0/1 vector aligned with createDataSet's row order.

    A row is 1.0 when the candidate cookie truly belongs to the device's
    handle (per Labels).  Relies on dict iteration order matching the
    order used in createDataSet for the same Candidates dict.
    """
    numpatterns=0

    for k,v in Candidates.iteritems():
        numpatterns=numpatterns+len(v)

    YTR=np.zeros(numpatterns)

    Added=0
    for k,v in Candidates.iteritems():
        for coo in v:
            if(coo in Labels[k]):
                YTR[Added]=1.0
            Added=Added+1

    return YTR


######################################################
# THIS FUNCTION SELECTS THE COOKIES FOR EVERY DEVICE #
# GIVEN THE PREDICTIONS OF THE CLASSIFIER            #
######################################################

def bestSelection(predictions, OriginalIndex, values,Groups):
    """Pick the final cookie set per device from per-row scores.

    Takes the top-scoring cookie's whole handle group, then, when the top
    score is below 0.9, admits further groups whose scores clear
    thresholds relative to the best score.  Returns ({device: cookies},
    {device: best score}).
    """
    result=dict()

    threshold=dict()

    for k,v in OriginalIndex.iteritems():

        cook=set()
        maxval=0.0
        cookies=v.keys()

        scores=np.zeros(len(cookies))

        for i in range(len(cookies)):
            scores[i]=predictions[v[cookies[i]]]

        # Candidate indices sorted by descending score.
        Orden=sorted(range(len(scores)),key=lambda x:-scores[x])

        if len(cookies)>0:
            if Groups.get(cookies[Orden[0]],-100) != -100:
                maxval=scores[Orden[0]]
                cook= (cook | Groups[cookies[Orden[0]]])

        if (maxval<0.9):
            for i in range(len(values)):
                if (i<= len(cook)):
                    # NOTE(review): the extracted source lost text here
                    # (original lines 668-671, likely a bounds check and the
                    # tam1/tam2 group-size assignments eaten between '<' and
                    # '>').  The remnant below is preserved verbatim; tam1
                    # and tam2 are therefore undefined in this view —
                    # recover the original file before editing this branch.
                    if (i1 & tam2==1):
                        if(scores[Orden[i]]>maxval*(values[i]-0.15)):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))
                    elif (tam1>1 & tam2>1):
                        if(scores[Orden[i]]>maxval*(values[i]+0.1)):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))
                    elif (tam1==1 & tam2==1):
                        if(scores[Orden[i]]>maxval*(values[i])):
                            cook= (cook | Groups.get(cookies[Orden[i]],set()))

        result[k]=cook
        threshold[k]=maxval
    return (result,threshold)

#####################################################
# THIS FUNCTION TRAINS THE CLASSIFIER USING XGBOOST #
#####################################################

def trainXGBoost(xtr,ytr,rounds,eta,xtst,ytst):
    """Train a binary logistic xgboost booster and return it.

    xtst/ytst are only used as the evaluation watchlist.
    """
    xgmat = xgb.DMatrix( xtr, label=ytr)
    xgmat2 = xgb.DMatrix( xtst, label=ytst)
    param = {}
    param['eta'] = eta
    param['max_depth'] = 10
    param['subsample'] = 1.0
    param['nthread'] = 12
    param['min_child_weight']=4
    param['gamma']=5.0
    param['colsample_bytree']=1.0
    param['silent']=1
    param['objective'] = 'binary:logistic'
    param['eval_metric']='error'
    watchlist = [ (xgmat,'train') ,(xgmat2,'test')]
    num_round = rounds
    bst = xgb.train( param, xgmat, num_round, watchlist );
    return bst

#######################################
# THIS FUNCTION MAKES THE PREDICTIONS #
#######################################

def predictXGBoost(X,bst):
    """Return bst's probability predictions for feature matrix X."""
    xgmat = xgb.DMatrix( X)
    return bst.predict( xgmat )

#########################################################################
# THIS FUNCTION TRAINS THE ALGORITHM USING 8 BAGGERS AND AVERAGING THEM #
#########################################################################

def FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain,Groups,Labels):
    """8-fold bagged training: out-of-fold validation scores + averaged test scores.

    Folds are taken over devices (not rows); each fold's booster predicts
    its held-out rows and the whole test set.  Returns (resultadosVal,
    resultadosTST, OriginalIndexTR, OriginalIndexTST, classifiers) where
    classifiers is a list of (booster, train devices, test devices).
    """
    NFOLDS=8

    skf = sklearn.cross_validation.KFold(len(OriginalIndexTR.keys()),n_folds=NFOLDS,random_state=0)

    resultadosVal=np.zeros(len(YTR))

    (tamTST,dTST)=XTST.shape
    resultadosTST=np.zeros(tamTST)

    classifiers=list()

    iteration=0
    for (train,test) in skf:

        iteration=iteration+1
        Originaltmp=dict()
        print "Training Bagger ",iteration, "of", NFOLDS

        trainind=list()
        testind=list()
        traindev=list()
        testdev=list()

        # Map fold indices (over devices) to the row indices of XTR.
        for i in train:
            devtr=DevicesTrain[i,1]
            traindev.append(devtr)
            trainind.extend(OriginalIndexTR[devtr].values())

        for i in test:
            devtr=DevicesTrain[i,1]
            testdev.append(devtr)
            testind.extend(OriginalIndexTR[devtr].values())
            Originaltmp[devtr]=OriginalIndexTR[devtr]

        trainind=np.array(trainind)
        testind=np.array(testind)

        XvalTR=XTR[trainind,:]
        XvalTST=XTR[testind,:]

        YvalTR=YTR[trainind]
        YvalTST=YTR[testind]

        bst=trainXGBoost(XvalTR,YvalTR,200,0.10,XvalTST,YvalTST)

        classifiers.append((bst,traindev,testdev))

        pTT=predictXGBoost(XvalTR,bst)
        pTR=predictXGBoost(XvalTST,bst)

        # Out-of-fold predictions accumulate into the validation vector.
        resultadosVal[testind]=pTR

        (validat,thTR)=bestSelection(resultadosVal, Originaltmp, np.array([1.0]),Groups)

        pTST=predictXGBoost(XTST,bst)

        resultadosTST=resultadosTST+pTST

    # Average the NFOLDS test predictions.
    resultadosTST=resultadosTST/np.float_(NFOLDS)
    return(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)

###############################################################################################
# THIS FUNCTION LOOKS FOR DEVICES WHOSE BEST CANDIDATE SCORES LESS THAN 0.05,
# CREATES A NEW SET OF CANDIDATES CONTAINING
# EVERY COOKIE THAT SHARES AN IP ADDRESS WITH HIM,                                            #
# SCORES THEM WITH XGBOOST AND SELECT THE CANDIDATES FOR THE SUBMISSION                       #
###############################################################################################

def PostAnalysisTrain(validat,thTR,classifiers,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle,Labels):
    """Re-score low-confidence devices (best score < 0.05) on the train side.

    For each such device it rebuilds candidates via fullCandidates, scores
    the known-handle and unknown-handle sets with the fold classifier that
    held the device out, and replaces the selection when the new scores
    clearly win.  Mutates validat and thTR in place.

    NOTE(review): the extracted source is truncated inside this function —
    original lines 840-919 (its tail plus, presumably, saveModel/loadModel/
    Predict helpers referenced by predict.py) were lost between '<' and '>'
    during extraction.  Recover the original file before modifying.
    """
    itn=0
    for k,v in validat.iteritems():
        itn=itn+1
        if thTR[k]<0.05:
            (fcandK,fcandU)=fullCandidates(k,DevicesTrain,Cookies,IPDev,IPCoo,DeviceIPS,DictHandle)

            # Known-handle candidates, scored by the fold that held k out.
            validatTHK=dict()
            thTHK=dict()
            if(len(fcandK[k])>0):
                (XTHK,OriginalIndexTHK)=createDataSet(fcandK,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
                YTHK=createTrainingLabels(fcandK,Labels)
                estimK=np.zeros(len(YTHK))

                for (classifier,traindev,testdev) in classifiers:
                    if k in testdev:
                        estimK=predictXGBoost(XTHK,classifier)

                (validatTHK,thTHK)=bestSelection(estimK, OriginalIndexTHK, np.array([1.0,0.9]),Groups)

            # Unknown-handle candidates, same treatment.
            validatTHU=dict()
            thTHU=dict()
            if(len(fcandU[k])>0):
                (XTHU,OriginalIndexTHU)=createDataSet(fcandU,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
                YTHU=createTrainingLabels(fcandU,Labels)
                estimU=np.zeros(len(YTHU))

                for (classifier,traindev,testdev) in classifiers:
                    if k in testdev:
                        estimU=predictXGBoost(XTHU,classifier)

                (validatTHU,thTHU)=bestSelection(estimU, OriginalIndexTHU, np.array([1.0,0.9]),Groups)

            if len(validatTHK)>0:
                if len(validatTHU)>0:
                    # Unknown-handle selection wins only with a clear margin.
                    if(thTHU[k]>(thTHK[k]+0.7)):
                        validat[k]=validatTHU[k]
                        thTR[k]=thTHU[k]
                    else:
                        if thTR[k]<=0.025:
                            validat[k]=validatTHK[k]
                            thTR[k]=thTHK[k]
                else:
                    # NOTE(review): the source breaks off here — the line
                    # below is a corrupted remnant ("if thTR[k]+0.3<..." with
                    # the comparison eaten) and original lines 840-919 are
                    # missing entirely.  Preserved verbatim:
                    if thTR[k]+0.30:
                        pass  # NOTE(review): placeholder for the lost body.

# NOTE(review): the following statements are the orphaned tail of a function
# whose definition (def line, docstring, setup of probCandidates/scores/
# cookies/Orden) was lost in the extraction gap above.  From its return value
# it selected "probable" candidate groups for semi-supervised learning.
# Preserved verbatim for reconstruction against the original repository:
#
#     if Groups.get(cookies[Orden[0]],-100) != -100:
#         cook= (cook | Groups[cookies[Orden[0]]])
#         ValorMax=scores[Orden[0]]
#
#     Segun=-1
#     Terminado='NO'
#
#     for i in range(len(cookies)):
#         if i>0:
#             if Terminado=='NO':
#                 if (cookies[Orden[i]] not in cook):
#                     Segun=scores[Orden[i]]
#                     Terminado='SI'
#
#     if (Segun<0.05 and ValorMax>0.4):
#         probCandidates[k]=Groups[cookies[Orden[0]]]
#
#     return probCandidates

#########################################
# THIS FUNCTION MERGES THE DICTIONARIES #
# FOR THE SEMI SUPERVISED LEARNING      #
#########################################

def createOtherDevicesDict(dict1,dict2,dict3):
    """Merge three {device: cookies} dicts into {cookie: set of devices}."""
    OtherDevices=defaultdict(set)
    for k,v in dict1.iteritems():
        for cookie in v:
            OtherDevices[cookie].add(k)
    for k,v in dict2.iteritems():
        for cookie in v:
            OtherDevices[cookie].add(k)
    for k,v in dict3.iteritems():
        for cookie in v:
            OtherDevices[cookie].add(k)

    return OtherDevices


######################################################
# THIS FUNCTION SAVE THE FINAL PREDICTIONS IN A FILE #
######################################################

def writeSolution(file,selected,DeviceList,CookieList):
    """Write the submission csv: one row per device, cookies space-joined.

    Devices with no selected cookie get the literal 'id_10' placeholder.
    """
    header=list()
    header.append('device_id')
    header.append('cookie_id')

    with open(file, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(header)

        for k,v in selected.iteritems():
            row=list()
            items=list()
            # Translate numeric encodings back to the original identifiers.
            row.append(DeviceList[np.int(k)])
            for elem in (v):
                items.append(CookieList[np.int(elem)])
            if len(v)==0:
                items.append('id_10')
            row.append(' '.join(items))
            spamwriter.writerow(row)


###############################################################################################
# THIS FUNCTION MAKES THE POST PROCESSING ON A TEST
# IT
# LOOKS FOR DEVICES WHOSE BEST CANDIDATE SCORES LESS THAN 0.05,                               #
# CREATES A NEW SET OF CANDIDATES CONTAINING EVERY COOKIE THAT SHARES AN IP ADDRESS WITH HIM, #
# SCORES THEM WITH XGBOOST AND SELECT THE CANDIDATES FOR THE SUBMISSION                       #
###############################################################################################

def PostAnalysisTest(validatTST,thTST,classifiers,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle):
    """Re-score low-confidence test devices (best score < 0.05).

    Test-side twin of PostAnalysisTrain: candidates are rescored with the
    average of ALL fold classifiers (no held-out fold exists at test time)
    and the selection replaced when the new scores clearly win.  Mutates
    validatTST and thTST in place.

    NOTE(review): the end of this function (final branches and its return)
    was lost in the extraction — recover the original file before editing.
    """
    itn=0
    for k,v in validatTST.iteritems():
        itn=itn+1
        if thTST[k]<0.05:

            (fcandK,fcandU)=fullCandidates(k,DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,DictHandle)

            # Known-handle candidates, averaged over every fold classifier.
            validatTHK=dict()
            thTHK=dict()
            if(len(fcandK[k])>0):

                (XTHK,OriginalIndexTHK)=createDataSet(fcandK,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)

                (tmxK,dmxK)=XTHK.shape
                estimK=np.zeros(tmxK)

                for (classifier,traindev,testdev) in classifiers:
                    estimK=estimK+predictXGBoost(XTHK,classifier)

                estimK=estimK/np.float_(len(classifiers))

                (validatTHK,thTHK)=bestSelection(estimK, OriginalIndexTHK, np.array([1.0,0.90]),Groups)

            # Unknown-handle candidates, same treatment.
            validatTHU=dict()
            thTHU=dict()
            if(len(fcandU[k])>0):
                (XTHU,OriginalIndexTHU)=createDataSet(fcandU,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
                (tmxU,dmxU)=XTHU.shape
                estimU=np.zeros(tmxU)

                for (classifier,traindev,testdev) in classifiers:
                    estimU=estimU+predictXGBoost(XTHU,classifier)

                estimU=estimU/np.float_(len(classifiers))

                (validatTHU,thTHU)=bestSelection(estimU, OriginalIndexTHU, np.array([1.0,0.90]),Groups)

            if len(validatTHK)>0:
                if len(validatTHU)>0:
                    # Unknown-handle selection wins only with a clear margin.
                    if(thTHU[k]>(thTHK[k]+0.7)):
                        validatTST[k]=validatTHU[k]
                        thTST[k]=thTHU[k]
                    else:
                        if thTST[k]<=0.025:
                            validatTST[k]=validatTHK[k]
                            thTST[k]=thTHK[k]
                else:
                    # NOTE(review): source truncated here — the remnant below
                    # lost its comparison operator and everything after it
                    # ("if thTST[k]+0.3<..."), along with this function's
                    # return statement and the first 14 lines of predict.py.
                    # Preserved as a comment because it is not parseable:
                    #     if thTST[k]+0.3
                    pass


# ============================= /predict.py ===================================
# NOTE(review): the extracted dump lost predict.py's license header (its
# lines 1-14, GPL boilerplate like CookieLibrary.py's) in the same gap that
# swallowed the end of PostAnalysisTest above.

import csv
import os
import re
import numpy as np
import inspect
import sys
import sklearn
from sklearn import cross_validation
import pickle


from Variables import *
from VariablesTST import *


# pathXGBoost comes from Variables/VariablesTST (star imports above).
code_path = os.path.join(pathXGBoost)
sys.path.append(code_path)
import xgboost as xgb
from CookieLibrary import *

#################################################################################
# PARSING THE FILES PROVIDED FOR THE CHALLENGE AND CREATING THE DATA STRUCTURES #
# THAT THE ALGORITHM NEEDS                                                      #
#################################################################################

# Some features in the files that describe the cookies and the devices are categorical features in test mode.
# For example, the countries are like: 'country_147', or the handle is like 'handle_1301101'.
# This function creates dictionaries to transform that text into a numerical value to load them in a numpy matrix.
print('Loading Dictionaries')
# Some features describing cookies and devices are categorical text values
# (e.g. 'country_147', 'handle_1301101').  GetIdentifiers collects every
# distinct identifier from the train/test/cookie files, and list2Dict maps
# each one to its list index so the values fit in numpy matrices.
(DeviceList, CookieList, HandleList, DevTypeList, DevOsList,ComputerOsList,ComputerVList,CountryList,annC1List,annC2List)=GetIdentifiers(trainfile,testfile,cookiefile)

DictHandle = list2Dict(HandleList)
DictDevice = list2Dict(DeviceList)
DictCookie = list2Dict(CookieList)
DictDevType = list2Dict(DevTypeList)
DictDevOs = list2Dict(DevOsList)
DictComputerOs = list2Dict(ComputerOsList)
DictComputerV = list2Dict(ComputerVList)
DictCountry = list2Dict(CountryList)
DictAnnC1 = list2Dict(annC1List)
DictAnnC2 = list2Dict(annC2List)


# Load the device descriptions into a numpy matrix, translating the text
# values into numerical ones with the dictionaries built above.
print('Loading Devices Files')
DevicesTrain = loadDevices(trainfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)

# Load the cookie descriptions into a numpy matrix in the same way.
print('Loading Cookies File')
Cookies = loadCookies(cookiefile,DictHandle,DictCookie,DictComputerOs,DictComputerV,DictCountry,DictAnnC1,DictAnnC2)

# Load the properties of the devices.
print('Loading Properties File')
DevProperties=loadPROPS(propfile,DictDevice,DictCookie)

# From the train information build: the cookies of every device (Labels),
# for every cookie the other cookies sharing its handle (Groups), and for
# every cookie its devices (WhosDevice).
(Labels,Groups,WhosDevice)=creatingLabels(DevicesTrain,Cookies,DictHandle)


# Dictionary keyed by ip address whose value is a numpy array with the IP info.
print('Loading IP Files')
XIPS=loadIPAGG(ipaggfile)

# Four dictionaries: the devices of every ip, the cookies of every ip,
# the ips of every device and the ips of every cookie.
(IPDev,IPCoo,DeviceIPS,CookieIPS)=loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups)


#########################
# LOADING THE TEST FILE #
#########################

print('STEP: Loading test file')
DevicesTest = loadDevices(predictFile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)


###################################
# INITIAL SELECTION OF CANDIDATES #
###################################

# Simple rules select an initial set of candidate cookies for every device.
print('STEP: Initial selection of candidates')
CandidatesTST=selectCandidates(DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)

#####################
# LOADING THE MODEL #
#####################

print('Loading the model')
(classifiers,DictOtherDevices) = loadModel(modelpath)

########################
# CREATING THE DATASET #
########################

# One feature row per candidate (device, cookie) pair.
print('STEP: Creating the dataset')
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)

########################
# USING THE CLASSIFIER #
########################

print('STEP: Using the classifier')
resultadosTST = Predict(XTST,classifiers)

########################
# POST PROCESSING STEP #
########################

print('STEP: Post Processing')
(validatTST,thTST)=bestSelection(resultadosTST, OriginalIndexTST, np.array([1.0,0.9]),Groups)

# Devices whose best candidate scored poorly get a widened candidate set and
# are re-scored (PostAnalysisTest in CookieLibrary).
(validatTST,thTST) = PostAnalysisTest(validatTST,thTST,classifiers,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle)

##########################################
# WRITING THE FINAL SOLUTION IN THE FILE #
##########################################

print('Writing the file with the result')
writeSolution(resultFile,validatTST,DeviceList,CookieList)
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# Copyright (C) 2015 Roberto Diaz Morales
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .


import csv
import os
import re
import numpy as np
import inspect
import sys
import sklearn
from sklearn import cross_validation
import pickle

from Variables import *

# pathXGBoost (defined in Variables) points at the local xgboost build.
code_path = os.path.join(pathXGBoost)
sys.path.append(code_path)
import xgboost as xgb
from CookieLibrary import *

#################################################################################
# PARSING THE FILES PROVIDED FOR THE CHALLENGE AND CREATING THE DATA STRUCTURES #
# THAT THE ALGORITHM NEEDS                                                      #
#################################################################################

# Some features in the files that describe the cookies and the devices are
# categorical text features.  For example, the countries are like
# 'country_147', and the handle is like 'handle_1301101'.
# list2Dict turns each identifier list into a text -> numeric-index mapping
# so the categorical values can be stored in numpy matrices.

print('Loading Dictionaries')
(DeviceList, CookieList, HandleList, DevTypeList, DevOsList,ComputerOsList,ComputerVList,CountryList,annC1List,annC2List)=GetIdentifiers(trainfile,testfile,cookiefile)

DictHandle = list2Dict(HandleList)
DictDevice = list2Dict(DeviceList)
DictCookie = list2Dict(CookieList)
DictDevType = list2Dict(DevTypeList)
DictDevOs = list2Dict(DevOsList)
DictComputerOs = list2Dict(ComputerOsList)
DictComputerV = list2Dict(ComputerVList)
DictCountry = list2Dict(CountryList)
DictAnnC1 = list2Dict(annC1List)
DictAnnC2 = list2Dict(annC2List)


# Load the device descriptions into numpy matrices, translating the text
# values into numerical ones with the dictionaries built above.
print('Loading Devices Files')
DevicesTrain = loadDevices(trainfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)
DevicesTest = loadDevices(testfile,DictHandle,DictDevice,DictDevType,DictDevOs,DictCountry,DictAnnC1,DictAnnC2)

# Load the cookie descriptions into a numpy matrix in the same way.
print('Loading Cookies File')
Cookies = loadCookies(cookiefile,DictHandle,DictCookie,DictComputerOs,DictComputerV,DictCountry,DictAnnC1,DictAnnC2)

# Load the properties of the devices.
print('Loading Properties File')
DevProperties=loadPROPS(propfile,DictDevice,DictCookie)

# From the train information build: the cookies of every device (Labels),
# for every cookie the other cookies sharing its handle (Groups), and for
# every cookie its devices (WhosDevice).
(Labels,Groups,WhosDevice)=creatingLabels(DevicesTrain,Cookies,DictHandle)


# Dictionary keyed by ip address whose value is a numpy array with the IP info.
print('Loading IP Files')
XIPS=loadIPAGG(ipaggfile)

# Four dictionaries: the devices of every ip, the cookies of every ip,
# the ips of every device and the ips of every cookie.
(IPDev,IPCoo,DeviceIPS,CookieIPS)=loadIPS(ipfile,DictDevice,DictCookie,XIPS,Groups)


#################################################################################################
# PROCEDURE WITH THE INITIAL SELECTION OF CANDIDATES (PROCEDURE DESCRIBED IN THE DOCUMENTATION) #
#################################################################################################
print('STEP: Initial selection of candidates')
# Using simple rules we select a set of candidate cookies for every device.
CandidatesTR=selectCandidates(DevicesTrain,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)
CandidatesTST=selectCandidates(DevicesTest,Cookies,IPDev,IPCoo,DeviceIPS,CookieIPS,DictHandle)


#####################################################
# CREATION OF THE TRAINING AND TEST SET             #
# (THE FEATURES ARE DESCRIBED IN THE DOCUMENTATION) #
#####################################################

print('STEP: Creating the dataset')

# Training and test sets for supervised learning: one feature row per
# candidate (device, cookie) pair.
(XTR,OriginalIndexTR)=createDataSet(CandidatesTR,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)
YTR=createTrainingLabels(CandidatesTR,Labels)
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties)

######################################
# TRAINING USING BAGGING AND XGBOOST #
######################################

print('STEP: Training Supervised Learning')
(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)=FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain, Groups, Labels)

#############################################################################
# UPDATING THE DATA STRUCTURES DATASETS WITH NEW INFORMATION OF THE RESULTS #
#############################################################################

print('Updating features with semisupervised learning information')
# SECOND LOOP FOR SEMISUPERVISED LEARNING:
# repeat the training procedure after adding the cookies predicted with high
# probability to the known device-ownership information.
uniqueCand=uniqueCandidates(DevicesTest,Cookies,IPCoo,DeviceIPS,DictHandle,Groups)
probCand=mostProbable(resultadosTST, OriginalIndexTST, Groups)
DictOtherDevices=createOtherDevicesDict(Labels,uniqueCand,probCand)


# Rebuild both datasets, now passing DictOtherDevices instead of WhosDevice
# so the semi-supervised information enters the features.
(XTR,OriginalIndexTR)=createDataSet(CandidatesTR,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)
(XTST,OriginalIndexTST)=createDataSet(CandidatesTST,DevicesTest,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,DictOtherDevices,DevProperties)

######################################################################################################
# SECOND TRAINING USING XGBOOST AND BAGGING INCLUDING THE NEW INFORMATION (SEMI-SUPERVISED LEARNING) #
######################################################################################################

# It trains using 10 fold CV and the predictions are the average of the
# classifiers of every fold.
print('STEP: Training Semi-Supervised Learning')
(resultadosVal,resultadosTST, OriginalIndexTR,OriginalIndexTST, classifiers)=FullTraining(YTR,XTR,XTST,OriginalIndexTR,OriginalIndexTST,DevicesTrain, Groups, Labels)

############################################################
# POST PROCESSING PROCEDURE DESCRIBED IN THE DOCUMENTATION #
############################################################

print('STEP: Post Processing')
# Initial selection of the cookies associated to every device.
(validat,thTR)=bestSelection(resultadosVal, OriginalIndexTR, np.array([1.0,0.9]),Groups)

# Increase the number of candidates of the devices whose best candidate does
# not have a good likelihood.
(validat,thTR) = PostAnalysisTrain(validat,thTR,classifiers,DevicesTrain,Cookies,DeviceIPS,CookieIPS,IPDev,IPCoo,Groups,WhosDevice,DevProperties,DictHandle,Labels)

F05=calculateF05(validat,Labels)
# Paren-call print so this line parses under Python 3 as well; the %s
# formatting reproduces the exact output of the old Python 2 statement
# form (print "F05 Validation",F05).
print('F05 Validation %s' % F05)

####################
# SAVING THE MODEL #
####################

print('Saving model')
saveModel(modelpath,classifiers,DictOtherDevices)
--------------------------------------------------------------------------------