├── README.md
├── optional
│   ├── README.md
│   ├── run_sub.py
│   ├── split.py
│   ├── mlogloss.py
│   └── bag.py
├── .gitattributes
├── sigmoid_mc.py
├── LICENSE
├── csv2vw.py
├── extract.py
└── .gitignore

/README.md:
--------------------------------------------------------------------------------
1 | See [http://fastml.com/predicting-closed-questions-on-stack-overflow/](http://fastml.com/predicting-closed-questions-on-stack-overflow/) for description.
2 | 
--------------------------------------------------------------------------------
/optional/README.md:
--------------------------------------------------------------------------------
1 | bag.py - average your solutions to get a better one
2 | mlogloss.py - compute multiclass log loss using test file in CSV format (pre-processed)
3 | and predictions file
4 | run_sub.py - train, predict and output predictions in submission format
5 | split.py - split a file into two - useful for creating a validation set from a training set
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 | 
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 | 
--------------------------------------------------------------------------------
/optional/run_sub.py:
--------------------------------------------------------------------------------
1 | 'train, predict and output predictions in submission format'
2 | 
3 | train_file = 'train.vw'
4 | test_file = 'test.vw'
5 | 
6 | model_file = 'model'
7 | r_file = 'raw_predictions.txt'
8 | p_file = 'p.txt'
9 | 
10 | import os
11 | 
12 | cmd = 'vw --loss_function logistic --oaa 5 -d %s -f %s' % ( train_file, model_file )
13 | print cmd
14 | os.system( cmd )
15 | 
16 | cmd = 'vw --loss_function logistic --oaa 5 -i %s -t -d %s -r %s' % ( model_file, test_file, r_file )
17 | print cmd
18 | os.system( cmd )
19 | 
20 | cmd = 'python sigmoid_mc.py %s %s' % ( r_file, p_file )
21 | print cmd
22 | os.system( cmd )
23 | 
24 | 
--------------------------------------------------------------------------------
/sigmoid_mc.py:
--------------------------------------------------------------------------------
1 | 'read vw raw predictions file, compute and normalize probabilities, write in submission format'
2 | 
3 | import sys, csv, math
4 | 
5 | def sigmoid(x):
6 |     return 1 / (1 + math.exp(-x))
7 | 
8 | def normalize( predictions ):
9 |     s = sum( predictions )
10 |     normalized = []
11 |     for p in predictions:
12 |         normalized.append( p / s )
13 |     return normalized
14 | 
15 | ###
16 | 
17 | input_file = sys.argv[1]
18 | output_file = sys.argv[2]
19 | 
20 | i = open( input_file )
21 | o = open( output_file, 'wb' )
22 | 
23 | reader = csv.reader( i, delimiter = " " )
24 | writer = csv.writer( o )
25 | 
26 | for line in reader:
27 | 
28 |     post_id = reader.next()[1]
29 | 
30 |     probs = []
31 |     for element in line:
32 |         prediction = element.split( ":" )[1]
33 |         prob = sigmoid( float( prediction ))
34 |         probs.append( prob )
35 | 
36 |     new_line = normalize( probs )
37 | 
38 |     writer.writerow( [post_id] + new_line )
--------------------------------------------------------------------------------
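
For illustration only, here is a minimal standalone sketch of the sigmoid-then-normalize step that sigmoid_mc.py applies to each line of raw predictions; the five scores below are invented and simply show how raw class scores become probabilities that sum to one:

import math

def sigmoid( x ):
    return 1 / (1 + math.exp( -x ))

raw_scores = [-2.1, -0.4, -1.7, 1.3, -3.0]   # made-up raw scores for the five classes
probs = [ sigmoid( s ) for s in raw_scores ]
total = sum( probs )
probs = [ p / total for p in probs ]         # normalize so the probabilities sum to 1
print probs
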
/optional/split.py:
--------------------------------------------------------------------------------
1 | 'split file lines randomly. Usage: split.py <input_file> <output_file_1> <output_file_2> [<P>]'
2 | 
3 | import csv
4 | import sys
5 | import random
6 | 
7 | try:
8 |     P = float( sys.argv[4] )
9 | except IndexError:
10 |     P = 0.9
11 | 
12 | print "P = %s" % ( P )
13 | 
14 | input_file = sys.argv[1]
15 | output_file1 = sys.argv[2]
16 | output_file2 = sys.argv[3]
17 | 
18 | i = open( input_file )
19 | o1 = open( output_file1, 'wb' )
20 | o2 = open( output_file2, 'wb' )
21 | 
22 | reader = csv.reader( i )
23 | writer1 = csv.writer( o1 )
24 | writer2 = csv.writer( o2 )
25 | 
26 | #headers = reader.next()
27 | #writer1.writerow( headers )
28 | #writer2.writerow( headers )
29 | 
30 | for line in reader:
31 |     r = random.random()
32 |     if r > P:
33 |         writer2.writerow( line )
34 |     else:
35 |         writer1.writerow( line )
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
--------------------------------------------------------------------------------
/optional/mlogloss.py:
--------------------------------------------------------------------------------
1 | 'compute multiclass log loss from test file in CSV format (pre-processed) and predictions file'
2 | 
3 | import sys, csv
4 | from math import log
5 | 
6 | statuses = ['not a real question', 'not constructive', 'off topic', 'open', 'too localized']
7 | 
8 | test_file = sys.argv[1]
9 | predictions_file = sys.argv[2]
10 | 
11 | test_reader = csv.reader( open( test_file ))
12 | p_reader = csv.reader( open( predictions_file ))
13 | 
14 | logs = []
15 | n = 0
16 | 
17 | for p_line in p_reader:
18 |     test_line = test_reader.next()
19 |     p_line.pop( 0 ) # get rid of post id
20 | 
21 |     n += 1
22 | 
23 |     status = test_line[1]
24 |     true_index = statuses.index( status )
25 | 
26 |     prediction_for_true = p_line[true_index]
27 |     # print prediction_for_true
28 | 
29 |     log_p = log( float( prediction_for_true ))
30 |     logs.append( log_p )
31 | 
32 | logs = sum( logs )
33 | logloss = - logs / n * 1.0
34 | 
35 | print "%s %s" % ( test_file, predictions_file )
36 | print logloss
37 | print
--------------------------------------------------------------------------------
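
For reference, the quantity mlogloss.py computes is the average negative log of the probability assigned to the true class of each post. A tiny self-contained illustration on two made-up rows (the probabilities and true labels below are invented, not taken from the data):

from math import log

# hypothetical predictions for two posts over the five classes,
# in the same order as the statuses list in mlogloss.py
predictions = [
    [0.05, 0.05, 0.10, 0.70, 0.10],   # true class: 'open' (index 3)
    [0.60, 0.10, 0.10, 0.15, 0.05],   # true class: 'not a real question' (index 0)
]
true_indexes = [3, 0]

logloss = -sum( log( p[i] ) for p, i in zip( predictions, true_indexes )) / len( predictions )
print logloss   # roughly 0.434
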
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2013 Zygmunt Zając
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/csv2vw.py:
--------------------------------------------------------------------------------
1 | 'convert from [stackoverflow-specific] CSV to VW format'
2 | 
3 | import sys, csv, re
4 | 
5 | test_label = '1'
6 | 
7 | def get_label( status ):
8 |     statuses = ['not a real question', 'not constructive', 'off topic', 'open', 'too localized']
9 |     label = statuses.index( status ) + 1
10 |     return label
11 | 
12 | input_file = sys.argv[1]
13 | output_file = sys.argv[2]
14 | 
15 | reader = csv.reader( open( input_file ))
16 | o = open( output_file, 'wb' )
17 | 
18 | counter = 0
19 | for line in reader:
20 | 
21 |     counter += 1
22 | 
23 |     post_id = line[0]
24 |     status = line[1]
25 |     reputation = line[2]
26 |     good_posts = line[3]
27 |     words = line[4]
28 |     tags = line[5:10]
29 |     tags = " ".join( tags ).strip()
30 | 
31 |     body = line[10]
32 | 
33 |     if status != '0':
34 |         label = get_label( status )
35 |     else:
36 |         label = test_label
37 | 
38 |     output_line = "%s %s %s" % ( label, 1, post_id ) # weight is 1
39 |     output_line += "|n %s %s" % ( reputation, good_posts )
40 |     output_line += "|w %s |t %s |b %s" % ( words, tags, body )
41 |     output_line += "\n"
42 | 
43 |     o.write( output_line )
44 | 
45 |     if counter % 100000 == 0:
46 |         print counter
47 | 
--------------------------------------------------------------------------------
/optional/bag.py:
--------------------------------------------------------------------------------
1 | 'a script for averaging your [pretty good] solutions (in submission format) to get a better one'
2 | 
3 | # edit this:
4 | 
5 | input_files = [ "p_sub%s.csv" % ( x ) for x in range( 1, 6 ) ]
6 | input_files.append( "p_sub_num.csv" )
7 | 
8 | output_file = "p_sub_bagged.csv"
9 | 
10 | print "%s ---> '%s'" % ( input_files, output_file )
11 | 
12 | ###########################################################
13 | 
14 | import csv
15 | 
16 | num_files = len( input_files )
17 | 
18 | readers = {}
19 | for i in range( num_files ):
20 |     input = open( input_files[i] )
21 |     readers[i] = csv.reader( input )
22 | 
23 | writer = csv.writer( open( output_file, 'wb' ))
24 | reader_0 = readers[0]
25 | 
26 | for line in reader_0:
27 |     lines = [ line ] + [ readers[i].next() for i in range( 1, num_files ) ]
28 |     #print lines
29 | 
30 |     post_id = line[0]
31 |     new_line = [ post_id ]
32 | 
33 |     for column in range( 1, 6 ): # columns in sub file
34 |         votes = []
35 |         for l in range( num_files ):
36 |             value = float( lines[l][column] )
37 |             votes.append( value )
38 | 
39 |         prediction = sum( votes ) / num_files
40 |         new_line.append( prediction )
41 | 
42 |     writer.writerow( new_line )
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
1 | 'pre-process a CSV file'
2 | 
3 | import sys, csv, re
4 | 
5 | input_file = sys.argv[1]
6 | output_file = sys.argv[2]
7 | 
8 | def get_words( text ):
9 |     text = text.replace( "'", "" )
10 |     text = re.sub( r'\W+', ' ', text )
11 |     text = text.lower()
12 | 
13 |     text = text.split()
14 |     words = []
15 |     for w in text:
16 |         if w in words:
17 |             continue
18 |         words.append( w )
19 | 
20 |     words = " ".join( words )
21 |     return words
22 | 
23 | def prepare_tag( tag ):
24 |     tag = re.sub( r'\W+', '', tag )
25 |     tag = tag.lower()
26 |     return tag
27 | 
28 | def get_unique_tags( tags ):
29 |     unique_tags = []
30 |     for tag in tags:
31 |         if tag in unique_tags:
32 |             unique_tags.append( '' )
33 |         else:
34 |             unique_tags.append( tag )
35 |     return unique_tags
36 | 
37 | reader = csv.reader( open( input_file ))
38 | writer = csv.writer( open( output_file, 'wb' ))
39 | 
40 | headers = reader.next()
41 | 
42 | counter = 0
43 | for line in reader:
44 | 
45 |     post_id = line[0]
46 |     try:
47 |         post_status = line[14]
48 |     except IndexError:
49 |         post_status = 0
50 | 
51 |     reputation = line[4]
52 |     good_posts = line[5]
53 | 
54 |     post_title = get_words( line[6] )
55 |     post_body = get_words( line[7] )
56 |     tags = line[8:13]
57 |     tags = map( prepare_tag, tags )
58 | 
59 |     unique_tags = get_unique_tags( tags )
60 | 
61 | 
62 |     writer.writerow( [ post_id, post_status, reputation, good_posts, post_title] + unique_tags + [ post_body ] )
63 | 
64 |     counter += 1
65 |     if counter % 10000 == 0:
66 |         print counter
67 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #################
2 | ## Eclipse
3 | #################
4 | 
5 | *.pydevproject
6 | .project
7 | .metadata
8 | bin/
9 | tmp/
10 | *.tmp
11 | *.bak
12 | *.swp
13 | *~.nib
14 | local.properties
15 | .classpath
16 | .settings/
17 | .loadpath
18 | 
19 | # External tool builders
20 | .externalToolBuilders/
21 | 
22 | # Locally stored "Eclipse launch configurations"
23 | *.launch
24 | 
25 | # CDT-specific
26 | .cproject
27 | 
28 | # PDT-specific
29 | .buildpath
30 | 
31 | 
32 | #################
33 | ## Visual Studio
34 | #################
35 | 
36 | ## Ignore Visual Studio temporary files, build results, and
37 | ## files generated by popular Visual Studio add-ons.
38 | 
39 | # User-specific files
40 | *.suo
41 | *.user
42 | *.sln.docstates
43 | 
44 | # Build results
45 | [Dd]ebug/
46 | [Rr]elease/
47 | *_i.c
48 | *_p.c
49 | *.ilk
50 | *.meta
51 | *.obj
52 | *.pch
53 | *.pdb
54 | *.pgc
55 | *.pgd
56 | *.rsp
57 | *.sbr
58 | *.tlb
59 | *.tli
60 | *.tlh
61 | *.tmp
62 | *.vspscc
63 | .builds
64 | *.dotCover
65 | 
66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this
67 | #packages/
68 | 
69 | # Visual C++ cache files
70 | ipch/
71 | *.aps
72 | *.ncb
73 | *.opensdf
74 | *.sdf
75 | 
76 | # Visual Studio profiler
77 | *.psess
78 | *.vsp
79 | 
80 | # ReSharper is a .NET coding add-in
81 | _ReSharper*
82 | 
83 | # Installshield output folder
84 | [Ee]xpress
85 | 
86 | # DocProject is a documentation generator add-in
87 | DocProject/buildhelp/
88 | DocProject/Help/*.HxT
89 | DocProject/Help/*.HxC
90 | DocProject/Help/*.hhc
91 | DocProject/Help/*.hhk
92 | DocProject/Help/*.hhp
93 | DocProject/Help/Html2
94 | DocProject/Help/html
95 | 
96 | # Click-Once directory
97 | publish
98 | 
99 | # Others
100 | [Bb]in
101 | [Oo]bj
102 | sql
103 | TestResults
104 | *.Cache
105 | ClientBin
106 | stylecop.*
107 | ~$*
108 | *.dbmdl
109 | Generated_Code #added for RIA/Silverlight projects
110 | 
111 | # Backup & report files from converting an old project file to a newer
112 | # Visual Studio version. Backup files are not needed, because we have git ;-)
113 | _UpgradeReport_Files/
114 | Backup*/
115 | UpgradeLog*.XML
116 | 
117 | 
118 | 
119 | ############
120 | ## Windows
121 | ############
122 | 
123 | # Windows image file caches
124 | Thumbs.db
125 | 
126 | # Folder config file
127 | Desktop.ini
128 | 
129 | 
130 | #############
131 | ## Python
132 | #############
133 | 
134 | *.py[co]
135 | 
136 | # Packages
137 | *.egg
138 | *.egg-info
139 | dist
140 | build
141 | eggs
142 | parts
143 | bin
144 | var
145 | sdist
146 | develop-eggs
147 | .installed.cfg
148 | 
149 | # Installer logs
150 | pip-log.txt
151 | 
152 | # Unit test / coverage reports
153 | .coverage
154 | .tox
155 | 
156 | #Translations
157 | *.mo
158 | 
159 | #Mr Developer
160 | .mr.developer.cfg
161 | 
162 | # Mac crap
163 | .DS_Store
164 | 
--------------------------------------------------------------------------------
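
For orientation, here is a hypothetical end-to-end run pieced together from the scripts above. The raw and intermediate file names (train.csv, test.csv, train_pp.csv, test_pp.csv) are assumptions for illustration; the repository itself only fixes train.vw, test.vw and p.txt inside run_sub.py:

import os

# pre-process the raw Kaggle CSVs, convert them to VW format, then train,
# predict and score; run from the repository root so run_sub.py finds sigmoid_mc.py
cmds = [
    'python extract.py train.csv train_pp.csv',
    'python extract.py test.csv test_pp.csv',
    'python csv2vw.py train_pp.csv train.vw',
    'python csv2vw.py test_pp.csv test.vw',
    'python optional/run_sub.py',                     # trains vw and writes p.txt
    'python optional/mlogloss.py test_pp.csv p.txt',  # only if the test file has true labels
]

for cmd in cmds:
    print cmd
    os.system( cmd )
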