├── README.md
├── optional
├── README.md
├── run_sub.py
├── split.py
├── mlogloss.py
└── bag.py
├── .gitattributes
├── sigmoid_mc.py
├── LICENSE
├── csv2vw.py
├── extract.py
└── .gitignore
/README.md:
--------------------------------------------------------------------------------
1 | See [http://fastml.com/predicting-closed-questions-on-stack-overflow/](http://fastml.com/predicting-closed-questions-on-stack-overflow/) for description.
2 |
--------------------------------------------------------------------------------
/optional/README.md:
--------------------------------------------------------------------------------
1 | bag.py - average your solutions to get a better one
2 | mlogloss.py - compute multiclass log loss using test file in CSV format (pre-processed)
3 | and predictions file
4 | run_sub.py - train, predict and output predictions in submission format
5 | split.py - split a file into two - useful for creating a validation set from a training set
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 |
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 |
--------------------------------------------------------------------------------
/optional/run_sub.py:
--------------------------------------------------------------------------------
'train, predict and output predictions in submission format'

import os

# inputs in vw format
train_file = 'train.vw'
test_file = 'test.vw'

# artifacts produced along the way
model_file = 'model'
r_file = 'raw_predictions.txt'
p_file = 'p.txt'

# The three pipeline steps, in execution order:
#   1. train a model on train_file and store it in model_file
#   2. apply that model to test_file, writing raw predictions to r_file
#   3. turn the raw predictions into the submission file p_file
# NOTE(review): step 3 resolves sigmoid_mc.py relative to the current working
# directory - confirm the script is started from a directory that contains it.
commands = (
    'vw --loss_function logistic --oaa 5 -d %s -f %s' % ( train_file, model_file ),
    'vw --loss_function logistic --oaa 5 -i %s -t -d %s -r %s' % ( model_file, test_file, r_file ),
    'python sigmoid_mc.py %s %s' % ( r_file, p_file ),
)

for cmd in commands:
    # echo each command before running it; the exit status is not inspected
    print( cmd )
    os.system( cmd )
23 |
24 |
--------------------------------------------------------------------------------
/sigmoid_mc.py:
--------------------------------------------------------------------------------
1 | 'read vw raw predictions file, compute and normalize probabilities, write in submission format'
2 |
3 | import sys, csv, math
4 |
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a number.

    The result lies in [0.0, 1.0]. The computation is split by the sign
    of x so that math.exp() is only ever called with a non-positive
    argument; the naive formula raises OverflowError for strongly
    negative x (roughly x < -709).
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # for x < 0 use the algebraically equivalent e^x / (1 + e^x)
    z = math.exp(x)
    return z / (1 + z)
7 |
def normalize( predictions ):
    """Rescale a list of numbers so that they add up to one.

    Each value is divided by the sum of all values and the results are
    returned as a new list in the same order; the input is not modified.
    An empty input yields an empty list. A zero sum raises
    ZeroDivisionError.
    """
    total = sum( predictions )
    return [ value / total for value in predictions ]
14 |
15 | ###
16 |
# usage: sigmoid_mc.py <raw predictions file> <output file>
input_file = sys.argv[1]
output_file = sys.argv[2]

# Both files are opened in a with-block so they are flushed and closed even
# if a malformed line raises part-way through. 'wb' is kept from the original:
# it is the output mode the Python 2 csv module expects.
with open( input_file ) as i, open( output_file, 'wb' ) as o:

    reader = csv.reader( i, delimiter = " " )
    writer = csv.writer( o )

    for line in reader:

        # The input is consumed two lines at a time: `line` holds the
        # space-separated "label:score" items and the line right after it
        # carries the post id in its second field.
        # NOTE(review): presumably this mirrors the layout of the vw raw
        # predictions file this script was written for - confirm against
        # the vw version in use.
        post_id = next( reader )[1]

        # raw score -> probability, one per "label:score" item
        probs = [ sigmoid( float( element.split( ":" )[1] )) for element in line ]

        # submission row: post id followed by probabilities summing to one
        writer.writerow( [post_id] + normalize( probs ))
--------------------------------------------------------------------------------
/optional/split.py:
--------------------------------------------------------------------------------
1 | 'split file lines randomly. Usage: split.py