├── .gitattributes ├── .gitignore ├── README.md ├── csv_to_vw.py ├── vw_command_line.txt └── vw_to_kaggle.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # ========================= 18 | # Operating System Files 19 | # ========================= 20 | 21 | # OSX 22 | # ========================= 23 | 24 | .DS_Store 25 | .AppleDouble 26 | .LSOverride 27 | 28 | # Icon must end with two \r. 29 | Icon 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Kaggle Criteo : Display Advertising Challenge 2 | ============= 3 | Code to beat the logistic regression benchmark using Python and Vowpal Wabbit. 
# -*- coding: UTF-8 -*-

########################################################
# __Author__: Triskelion                               #
# Kaggle competition "Display Advertising Challenge":  #
# http://www.kaggle.com/c/criteo-display-ad-challenge/ #
# Credit: Zygmunt Zając                                #
########################################################
#
# Competition page: https://www.kaggle.com/c/criteo-display-ad-challenge
# Forum share post: http://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/9583/beat-the-benchmark-with-vowpal-wabbit
# Full description at MLWave.com: http://mlwave.com/predicting-click-through-rates-with-online-machine-learning/
# Vowpal Wabbit repo: https://github.com/JohnLangford/vowpal_wabbit

from datetime import datetime
from csv import DictReader

# Columns that identify or label a row; they are never emitted as features.
_NON_FEATURE_COLUMNS = ("Label", "Id")

# A progress line is printed every this many rows.
_PROGRESS_EVERY = 1000000


def _row_to_features(row):
    """Return ``(numerical, categorical)`` VW feature strings for one CSV row.

    Each emitted feature is prefixed with a single space, so either string is
    empty when the row has no features of that kind. Columns whose name
    contains "I" become ``name:value`` pairs; columns whose name contains "C"
    contribute their bare value.
    """
    numerical = []
    categorical = []
    for k, v in row.items():
        # Skip empty cells, cells DictReader filled with None because the row
        # was short, the None key it uses for surplus cells, and the
        # id/label columns.
        if not k or not v or k in _NON_FEATURE_COLUMNS:
            continue
        if "I" in k:  # numerical feature, example: I5
            numerical.append(" %s:%s" % (k, v))
        if "C" in k:  # categorical feature, example: C2
            categorical.append(" %s" % v)
    return "".join(numerical), "".join(categorical)


def csv_to_vw(loc_csv, loc_output, train=True):
    """
    Munges a CSV file (loc_csv) to a VW file (loc_output).

    Every data row becomes one line of the form
    ``<label> '<Id> |i<numerical features> |c<categorical features>``.

    Args:
        loc_csv: Path of the CSV file to read. It needs an "Id" column and,
            when ``train`` is true, a "Label" column.
        loc_output: Path of the VW file to write (overwritten if it exists).
        train: Set to False when munging a test set. Training rows get the
            label 1 when "Label" is "1" and -1 otherwise; test rows always
            get the placeholder label 1.

    Returns:
        None. Progress and total run time are printed to stdout.

    TODO: Too slow for a daily cron job. Try to optimize, e.g. with Pandas
    or Go.
    """
    start = datetime.now()
    print("\nTurning %s into %s. Is_train_set? %s" % (loc_csv, loc_output, train))

    # Index of the last row processed; stays -1 when the CSV has no data
    # rows, so the final report cannot fail on an unbound name.
    e = -1

    # Text mode so str can be written on Python 3. newline="" is what the csv
    # module asks for on input; newline="\n" keeps the output LF-only on
    # every platform, which the original "wb" mode was providing.
    with open(loc_csv, newline="") as infile, \
            open(loc_output, "w", newline="\n") as outfile:
        for e, row in enumerate(DictReader(infile)):
            numerical_features, categorical_features = _row_to_features(row)

            if train:  # we care about labels; the negative label is -1
                label = 1 if row["Label"] == "1" else -1
            else:  # we don't care about labels, but VW still needs one
                label = 1

            outfile.write("%s '%s |i%s |c%s\n" % (
                label, row["Id"], numerical_features, categorical_features))

            # Reporting progress
            if e % _PROGRESS_EVERY == 0:
                print("%s\t%s" % (e, str(datetime.now() - start)))

    print("\n %s Task execution time:\n\t%s" % (e, str(datetime.now() - start)))


# Example usage:
#csv_to_vw("d:\\Downloads\\train\\train.csv", "c:\\click.train.vw",train=True)
#csv_to_vw("d:\\Downloads\\test\\test.csv", "d:\\click.test.vw",train=False)
#
# The generated files are then fed to Vowpal Wabbit.
# Training VW:
#   ./vw click.train.vw -f click.model.vw --loss_function logistic
# Testing VW:
#   ./vw click.test.vw -t -i click.model.vw -p click.preds.txt
import math


def zygmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of ``x``.

    The negative branch uses the algebraically equal ``e**x / (1 + e**x)``
    so that very negative inputs give 0.0 instead of raising OverflowError
    from ``math.exp``.
    """
    #I know it's a common Sigmoid feature, but that's why I probably found
    #it on FastML too: https://github.com/zygmuntz/kaggle-stackoverflow/blob/master/sigmoid_mc.py
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)


def vw_to_kaggle(loc_preds="c:\\click.preds.txt",
                 loc_submission="kaggle.click.submission.csv"):
    """Turn a VW predictions file into a Kaggle submission CSV.

    Each non-blank input line is expected to hold a raw prediction followed
    by the example id, separated by whitespace. The prediction is passed
    through ``zygmoid`` and written as ``<id>,<probability>`` under an
    ``Id,Predicted`` header.

    Args:
        loc_preds: Path of the predictions file to read.
        loc_submission: Path of the submission CSV to write (overwritten if
            it exists).

    Raises:
        IndexError: If a non-blank line has no id after the prediction.
        ValueError: If a prediction is not a valid float.
    """
    # Text mode so str can be written on Python 3; newline="\n" keeps the
    # output LF-only on every platform, as the original "wb" mode did.
    with open(loc_preds) as infile, \
            open(loc_submission, "w", newline="\n") as outfile:
        outfile.write("Id,Predicted\n")
        for line in infile:
            row = line.split()
            if not row:  # tolerate blank lines, e.g. a trailing newline
                continue
            outfile.write("%s,%f\n" % (row[1], zygmoid(float(row[0]))))


if __name__ == "__main__":
    vw_to_kaggle()