├── .gitattributes
├── .gitignore
├── README.md
├── csv_to_vw.py
├── vw_command_line.txt
└── vw_to_kaggle.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | *.sln    merge=union
 7 | *.csproj merge=union
 8 | *.vbproj merge=union
 9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 | 
12 | # Standard to msysgit
13 | *.doc	 diff=astextplain
14 | *.DOC	 diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot  diff=astextplain
18 | *.DOT  diff=astextplain
19 | *.pdf  diff=astextplain
20 | *.PDF	 diff=astextplain
21 | *.rtf	 diff=astextplain
22 | *.RTF	 diff=astextplain
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # =========================
18 | # Operating System Files
19 | # =========================
20 | 
21 | # OSX
22 | # =========================
23 | 
24 | .DS_Store
25 | .AppleDouble
26 | .LSOverride
27 | 
28 | # Icon must end with two \r.
29 | Icon
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Kaggle Criteo : Display Advertising Challenge
 2 | =============
 3 | Code to beat the logistic regression benchmark using Python and Vowpal Wabbit.
 4 | 
 5 | Competition page: https://www.kaggle.com/c/criteo-display-ad-challenge
 6 | 
 7 | Forum share post: http://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/9583/beat-the-benchmark-with-vowpal-wabbit
 8 | 
 9 | Full description at MLWave.com: http://mlwave.com/predicting-click-through-rates-with-online-machine-learning/
10 | 
11 | Vowpal Wabbit repo: https://github.com/JohnLangford/vowpal_wabbit
12 | 


--------------------------------------------------------------------------------
/csv_to_vw.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | ########################################################
 4 | # __Author__: Triskelion <info@mlwave.com>             #
 5 | # Kaggle competition "Display Advertising Challenge":  #
 6 | # http://www.kaggle.com/c/criteo-display-ad-challenge/ #
 7 | # Credit: Zygmunt Zając <zygmunt@fastml.com>           #
 8 | ########################################################
 9 | 
10 | from datetime import datetime
11 | from csv import DictReader
12 | 
def csv_to_vw(loc_csv, loc_output, train=True):
    """
    Munge a CSV file (loc_csv) into a Vowpal Wabbit file (loc_output).

    Every CSV row becomes one output line of the form

        <label> '<Id> |i <name>:<value> ... |c <value> ...

    Columns whose name contains "I" are written to the "i" namespace as
    name:value pairs; columns whose name contains "C" are written to the
    "c" namespace as bare tokens. The "Label" and "Id" columns never
    become features, and empty or missing values are skipped.

    Args:
      loc_csv: path of the CSV file to read; it needs a header row with
        an "Id" column (and a "Label" column when train is True).
      loc_output: path of the VW file to write (overwritten if present).
      train: when True the label is 1 for Label == "1" and -1 otherwise;
        set to False for a test set, where every row gets the label 1.

    TODO: Too slow for a daily cron job. Try optimize, Pandas or Go.
    """
    start = datetime.now()
    print("\nTurning %s into %s. Is_train_set? %s" % (loc_csv, loc_output, train))

    # Index of the last row processed; pre-set so that the final report
    # still works when the CSV contains no data rows.
    e = 0

    # newline="" is what the csv module expects for its input file;
    # newline="\n" keeps the output line endings identical on every platform.
    with open(loc_csv, newline="") as infile:
        with open(loc_output, "w", newline="\n") as outfile:
            for e, row in enumerate(DictReader(infile)):

                # Creating the features
                numerical_features = []
                categorical_features = []
                for k, v in row.items():
                    # Skip non-feature columns, surplus fields (key None)
                    # and empty or missing (None) values.
                    if not k or not v or k in ("Label", "Id"):
                        continue
                    if "I" in k:  # numerical feature, example: I5
                        numerical_features.append(" %s:%s" % (k, v))
                    if "C" in k:  # categorical feature, example: C2
                        categorical_features.append(" %s" % v)

                # Creating the labels
                if train:  # we care about labels
                    # negative examples get the label -1
                    label = 1 if row['Label'] == "1" else -1
                else:  # we dont care about labels, write a fixed one
                    label = 1
                outfile.write("%s '%s |i%s |c%s\n" % (
                    label,
                    row['Id'],
                    "".join(numerical_features),
                    "".join(categorical_features),
                ))

                # Reporting progress
                if e % 1000000 == 0:
                    print("%s\t%s" % (e, str(datetime.now() - start)))

    print("\n %s Task execution time:\n\t%s" % (e, str(datetime.now() - start)))
53 | 
54 | #csv_to_vw("d:\\Downloads\\train\\train.csv", "c:\\click.train.vw",train=True)
55 | #csv_to_vw("d:\\Downloads\\test\\test.csv", "d:\\click.test.vw",train=False)


--------------------------------------------------------------------------------
/vw_command_line.txt:
--------------------------------------------------------------------------------
1 | Training VW:
2 | 
3 | ./vw click.train.vw -f click.model.vw --loss_function logistic
4 | 
5 | Testing VW:
6 | 
7 | ./vw click.test.vw -t -i click.model.vw -p click.preds.txt


--------------------------------------------------------------------------------
/vw_to_kaggle.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | def zygmoid(x):
 4 | 	#I know it's a common Sigmoid feature, but that's why I probably found
 5 | 	#it on FastML too: https://github.com/zygmuntz/kaggle-stackoverflow/blob/master/sigmoid_mc.py
 6 | 	return 1 / (1 + math.exp(-x))
 7 | 
 8 | with open("kaggle.click.submission.csv","wb") as outfile:
 9 | 	outfile.write("Id,Predicted\n")
10 | 	for line in open("c:\\click.preds.txt"):
11 | 		row = line.strip().split(" ")
12 | 		outfile.write("%s,%f\n"%(row[1],zygmoid(float(row[0]))))
13 | 	


--------------------------------------------------------------------------------