├── README.md
├── optional
│   ├── README.md
│   ├── run_sub.py
│   ├── split.py
│   ├── mlogloss.py
│   └── bag.py
├── .gitattributes
├── sigmoid_mc.py
├── LICENSE
├── csv2vw.py
├── extract.py
└── .gitignore

/README.md:
--------------------------------------------------------------------------------
1 | See [http://fastml.com/predicting-closed-questions-on-stack-overflow/](http://fastml.com/predicting-closed-questions-on-stack-overflow/) for description.
2 | 
--------------------------------------------------------------------------------
/optional/README.md:
--------------------------------------------------------------------------------
1 | bag.py - average your solutions to get a better one
2 | mlogloss.py - compute multiclass log loss using test file in CSV format (pre-processed)
3 | and predictions file
4 | run_sub.py - train, predict and output predictions in submission format
5 | split.py - split a file into two - useful for creating a validation set from a training set
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 | 
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 | 
--------------------------------------------------------------------------------
/optional/run_sub.py:
--------------------------------------------------------------------------------
1 | 'train, predict and output predictions in submission format'
2 | 
3 | train_file = 'train.vw'
4 | test_file = 'test.vw'
5 | 
6 | model_file = 'model'
7 | r_file = 'raw_predictions.txt'
8 | p_file = 'p.txt'
9 | 
10 | import os
11 | 
12 | cmd = 'vw --loss_function logistic --oaa 5 -d %s -f %s' % ( train_file, model_file )
13 | print cmd
14 | os.system( cmd )
15 | 
16 | cmd = 'vw --loss_function logistic --oaa 5 -i %s -t -d %s -r %s' % ( model_file, test_file, r_file )
17 | print cmd
18 | os.system( cmd )
19 | 
20 | cmd = 'python sigmoid_mc.py %s %s' % ( r_file, p_file )
21 | print cmd
22 | os.system( cmd )
23 | 
24 | 
--------------------------------------------------------------------------------
/sigmoid_mc.py:
--------------------------------------------------------------------------------
1 | 'read vw raw predictions file, compute and normalize probabilities, write in submission format'
2 | 
3 | import sys, csv, math
4 | 
5 | def sigmoid(x):
6 |     return 1 / (1 + math.exp(-x))
7 | 
8 | def normalize( predictions ):
9 |     s = sum( predictions )
10 |     normalized = []
11 |     for p in predictions:
12 |         normalized.append( p / s )
13 |     return normalized
14 | 
15 | ###
16 | 
17 | input_file = sys.argv[1]
18 | output_file = sys.argv[2]
19 | 
20 | i = open( input_file )
21 | o = open( output_file, 'wb' )
22 | 
23 | reader = csv.reader( i, delimiter = " " )
24 | writer = csv.writer( o )
25 | 
26 | for line in reader:
27 | 
28 |     post_id = reader.next()[1]
29 | 
30 |     probs = []
31 |     for element in line:
32 |         prediction = element.split( ":" )[1]
33 |         prob = sigmoid( float( prediction ))
34 |         probs.append( prob )
35 | 
36 |     new_line = normalize( probs )
37 | 
38 |     writer.writerow( [post_id] + new_line )
--------------------------------------------------------------------------------
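
For illustration only, here is a minimal standalone sketch of the sigmoid-then-normalize step that sigmoid_mc.py applies to each line of raw predictions; the five scores below are invented and simply show how raw class scores become probabilities that sum to one:

import math

def sigmoid( x ):
    return 1 / (1 + math.exp( -x ))

raw_scores = [-2.1, -0.4, -1.7, 1.3, -3.0]   # made-up raw scores for the five classes
probs = [ sigmoid( s ) for s in raw_scores ]
total = sum( probs )
probs = [ p / total for p in probs ]         # normalize so the probabilities sum to 1
print probs
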
/optional/split.py:
--------------------------------------------------------------------------------
1 | 'split file lines randomly. Usage: split.py <input_file> <output_file_1> <output_file_2> [<P>]'
2 | 
3 | import csv
4 | import sys
5 | import random
6 | 
7 | try:
8 |     P = float( sys.argv[4] )
9 | except IndexError:
10 |     P = 0.9
11 | 
12 | print "P = %s" % ( P )
13 | 
14 | input_file = sys.argv[1]
15 | output_file1 = sys.argv[2]
16 | output_file2 = sys.argv[3]
17 | 
18 | i = open( input_file )
19 | o1 = open( output_file1, 'wb' )
20 | o2 = open( output_file2, 'wb' )
21 | 
22 | reader = csv.reader( i )
23 | writer1 = csv.writer( o1 )
24 | writer2 = csv.writer( o2 )
25 | 
26 | #headers = reader.next()
27 | #writer1.writerow( headers )
28 | #writer2.writerow( headers )
29 | 
30 | for line in reader:
31 |     r = random.random()
32 |     if r > P:
33 |         writer2.writerow( line )
34 |     else:
35 |         writer1.writerow( line )
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
--------------------------------------------------------------------------------
/optional/mlogloss.py:
--------------------------------------------------------------------------------
1 | 'compute multiclass log loss from test file in CSV format (pre-processed) and predictions file'
2 | 
3 | import sys, csv
4 | from math import log
5 | 
6 | statuses = ['not a real question', 'not constructive', 'off topic', 'open', 'too localized']
7 | 
8 | test_file = sys.argv[1]
9 | predictions_file = sys.argv[2]
10 | 
11 | test_reader = csv.reader( open( test_file ))
12 | p_reader = csv.reader( open( predictions_file ))
13 | 
14 | logs = []
15 | n = 0
16 | 
17 | for p_line in p_reader:
18 |     test_line = test_reader.next()
19 |     p_line.pop( 0 ) # get rid of post id
20 | 
21 |     n += 1
22 | 
23 |     status = test_line[1]
24 |     true_index = statuses.index( status )
25 | 
26 |     prediction_for_true = p_line[true_index]
27 |     # print prediction_for_true
28 | 
29 |     log_p = log( float( prediction_for_true ))
30 |     logs.append( log_p )
31 | 
32 | logs = sum( logs )
33 | logloss = - logs / n * 1.0
34 | 
35 | print "%s %s" % ( test_file, predictions_file )
36 | print logloss
37 | print
--------------------------------------------------------------------------------
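
For reference, the quantity mlogloss.py computes is the average negative log of the probability assigned to the true class of each post. A tiny self-contained illustration on two made-up rows (the probabilities and true labels below are invented, not taken from the data):

from math import log

# hypothetical predictions for two posts over the five classes,
# in the same order as the statuses list in mlogloss.py
predictions = [
    [0.05, 0.05, 0.10, 0.70, 0.10],   # true class: 'open' (index 3)
    [0.60, 0.10, 0.10, 0.15, 0.05],   # true class: 'not a real question' (index 0)
]
true_indexes = [3, 0]

logloss = -sum( log( p[i] ) for p, i in zip( predictions, true_indexes )) / len( predictions )
print logloss   # roughly 0.434
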
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2013 Zygmunt Zając
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/csv2vw.py:
--------------------------------------------------------------------------------
1 | 'convert from [stackoverflow-specific] CSV to VW format'
2 | 
3 | import sys, csv, re
4 | 
5 | test_label = '1'
6 | 
7 | def get_label( status ):
8 |     statuses = ['not a real question', 'not constructive', 'off topic', 'open', 'too localized']
9 |     label = statuses.index( status ) + 1
10 |     return label
11 | 
12 | input_file = sys.argv[1]
13 | output_file = sys.argv[2]
14 | 
15 | reader = csv.reader( open( input_file ))
16 | o = open( output_file, 'wb' )
17 | 
18 | counter = 0
19 | for line in reader:
20 | 
21 |     counter += 1
22 | 
23 |     post_id = line[0]
24 |     status = line[1]
25 |     reputation = line[2]
26 |     good_posts = line[3]
27 |     words = line[4]
28 |     tags = line[5:10]
29 |     tags = " ".join( tags ).strip()
30 | 
31 |     body = line[10]
32 | 
33 |     if status != '0':
34 |         label = get_label( status )
35 |     else:
36 |         label = test_label
37 | 
38 |     output_line = "%s %s %s" % ( label, 1, post_id ) # weight is 1
39 |     output_line += "|n %s %s" % ( reputation, good_posts )
40 |     output_line += "|w %s |t %s |b %s" % ( words, tags, body )
41 |     output_line += "\n"
42 | 
43 |     o.write( output_line )
44 | 
45 |     if counter % 100000 == 0:
46 |         print counter
47 | 
--------------------------------------------------------------------------------
/optional/bag.py:
--------------------------------------------------------------------------------
1 | 'a script for averaging your [pretty good] solutions (in submission format) to get a better one'
2 | 
3 | # edit this:
4 | 
5 | input_files = [ "p_sub%s.csv" % ( x ) for x in range( 1, 6 ) ]
6 | input_files.append( "p_sub_num.csv" )
7 | 
8 | output_file = "p_sub_bagged.csv"
9 | 
10 | print "%s ---> '%s'" % ( input_files, output_file )
11 | 
12 | ###########################################################
13 | 
14 | import csv
15 | 
16 | num_files = len( input_files )
17 | 
18 | readers = {}
19 | for i in range( num_files ):
20 |     input = open( input_files[i] )
21 |     readers[i] = csv.reader( input )
22 | 
23 | writer = csv.writer( open( output_file, 'wb' ))
24 | reader_0 = readers[0]
25 | 
26 | for line in reader_0:
27 |     lines = [ line ] + [ readers[i].next() for i in range( 1, num_files ) ]
28 |     #print lines
29 | 
30 |     post_id = line[0]
31 |     new_line = [ post_id ]
32 | 
33 |     for column in range( 1, 6 ): # columns in sub file
34 |         votes = []
35 |         for l in range( num_files ):
36 |             value = float( lines[l][column] )
37 |             votes.append( value )
38 | 
39 |         prediction = sum( votes ) / num_files
40 |         new_line.append( prediction )
41 | 
42 |     writer.writerow( new_line )
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
1 | 'pre-process a CSV file'
2 | 
3 | import sys, csv, re
4 | 
5 | input_file = sys.argv[1]
6 | output_file = sys.argv[2]
7 | 
8 | def get_words( text ):
9 |     text = text.replace( "'", "" )
10 |     text = re.sub( r'\W+', ' ', text )
11 |     text = text.lower()
12 | 
13 |     text = text.split()
14 |     words = []
15 |     for w in text:
16 |         if w in words:
17 |             continue
18 |         words.append( w )
19 | 
20 |     words = " ".join( words )
21 |     return words
22 | 
23 | def prepare_tag( tag ):
24 |     tag = re.sub( r'\W+', '', tag )
25 |     tag = tag.lower()
26 |     return tag
27 | 
28 | def get_unique_tags( tags ):
29 |     unique_tags = []
30 |     for tag in tags:
31 |         if tag in unique_tags:
32 |             unique_tags.append( '' )
33 |         else:
34 |             unique_tags.append( tag )
35 |     return unique_tags
36 | 
37 | reader = csv.reader( open( input_file ))
38 | writer = csv.writer( open( output_file, 'wb' ))
39 | 
40 | headers = reader.next()
41 | 
42 | counter = 0
43 | for line in reader:
44 | 
45 |     post_id = line[0]
46 |     try:
47 |         post_status = line[14]
48 |     except IndexError:
49 |         post_status = 0
50 | 
51 |     reputation = line[4]
52 |     good_posts = line[5]
53 | 
54 |     post_title = get_words( line[6] )
55 |     post_body = get_words( line[7] )
56 |     tags = line[8:13]
57 |     tags = map( prepare_tag, tags )
58 | 
59 |     unique_tags = get_unique_tags( tags )
60 | 
61 | 
62 |     writer.writerow( [ post_id, post_status, reputation, good_posts, post_title] + unique_tags + [ post_body ] )
63 | 
64 |     counter += 1
65 |     if counter % 10000 == 0:
66 |         print counter
67 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #################
2 | ## Eclipse
3 | #################
4 | 
5 | *.pydevproject
6 | .project
7 | .metadata
8 | bin/
9 | tmp/
10 | *.tmp
11 | *.bak
12 | *.swp
13 | *~.nib
14 | local.properties
15 | .classpath
16 | .settings/
17 | .loadpath
18 | 
19 | # External tool builders
20 | .externalToolBuilders/
21 | 
22 | # Locally stored "Eclipse launch configurations"
23 | *.launch
24 | 
25 | # CDT-specific
26 | .cproject
27 | 
28 | # PDT-specific
29 | .buildpath
30 | 
31 | 
32 | #################
33 | ## Visual Studio
34 | #################
35 | 
36 | ## Ignore Visual Studio temporary files, build results, and
37 | ## files generated by popular Visual Studio add-ons.
38 | 
39 | # User-specific files
40 | *.suo
41 | *.user
42 | *.sln.docstates
43 | 
44 | # Build results
45 | [Dd]ebug/
46 | [Rr]elease/
47 | *_i.c
48 | *_p.c
49 | *.ilk
50 | *.meta
51 | *.obj
52 | *.pch
53 | *.pdb
54 | *.pgc
55 | *.pgd
56 | *.rsp
57 | *.sbr
58 | *.tlb
59 | *.tli
60 | *.tlh
61 | *.tmp
62 | *.vspscc
63 | .builds
64 | *.dotCover
65 | 
66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this
67 | #packages/
68 | 
69 | # Visual C++ cache files
70 | ipch/
71 | *.aps
72 | *.ncb
73 | *.opensdf
74 | *.sdf
75 | 
76 | # Visual Studio profiler
77 | *.psess
78 | *.vsp
79 | 
80 | # ReSharper is a .NET coding add-in
81 | _ReSharper*
82 | 
83 | # Installshield output folder
84 | [Ee]xpress
85 | 
86 | # DocProject is a documentation generator add-in
87 | DocProject/buildhelp/
88 | DocProject/Help/*.HxT
89 | DocProject/Help/*.HxC
90 | DocProject/Help/*.hhc
91 | DocProject/Help/*.hhk
92 | DocProject/Help/*.hhp
93 | DocProject/Help/Html2
94 | DocProject/Help/html
95 | 
96 | # Click-Once directory
97 | publish
98 | 
99 | # Others
100 | [Bb]in
101 | [Oo]bj
102 | sql
103 | TestResults
104 | *.Cache
105 | ClientBin
106 | stylecop.*
107 | ~$*
108 | *.dbmdl
109 | Generated_Code #added for RIA/Silverlight projects
110 | 
111 | # Backup & report files from converting an old project file to a newer
112 | # Visual Studio version. Backup files are not needed, because we have git ;-)
113 | _UpgradeReport_Files/
114 | Backup*/
115 | UpgradeLog*.XML
116 | 
117 | 
118 | 
119 | ############
120 | ## Windows
121 | ############
122 | 
123 | # Windows image file caches
124 | Thumbs.db
125 | 
126 | # Folder config file
127 | Desktop.ini
128 | 
129 | 
130 | #############
131 | ## Python
132 | #############
133 | 
134 | *.py[co]
135 | 
136 | # Packages
137 | *.egg
138 | *.egg-info
139 | dist
140 | build
141 | eggs
142 | parts
143 | bin
144 | var
145 | sdist
146 | develop-eggs
147 | .installed.cfg
148 | 
149 | # Installer logs
150 | pip-log.txt
151 | 
152 | # Unit test / coverage reports
153 | .coverage
154 | .tox
155 | 
156 | #Translations
157 | *.mo
158 | 
159 | #Mr Developer
160 | .mr.developer.cfg
161 | 
162 | # Mac crap
163 | .DS_Store
164 | 
--------------------------------------------------------------------------------
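
For orientation, here is a hypothetical end-to-end run pieced together from the scripts above. The raw and intermediate file names (train.csv, test.csv, train_pp.csv, test_pp.csv) are assumptions for illustration; the repository itself only fixes train.vw, test.vw and p.txt inside run_sub.py:

import os

# pre-process the raw Kaggle CSVs, convert them to VW format, then train,
# predict and score; run from the repository root so run_sub.py finds sigmoid_mc.py
cmds = [
    'python extract.py train.csv train_pp.csv',
    'python extract.py test.csv test_pp.csv',
    'python csv2vw.py train_pp.csv train.vw',
    'python csv2vw.py test_pp.csv test.vw',
    'python optional/run_sub.py',                     # trains vw and writes p.txt
    'python optional/mlogloss.py test_pp.csv p.txt',  # only if the test file has true labels
]

for cmd in cmds:
    print cmd
    os.system( cmd )
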