├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── adult_results.txt ├── batch.txt ├── csv_output_snippet.py ├── first.py ├── gensim_add_labels.py ├── gensim_lda.py ├── gensim_lsi.py ├── gensim_rp.py ├── gensim_tfidf.py ├── libsvm2csv.py ├── rf.r └── spams_nmf.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 
38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | [Dd]ebug/ 46 | [Rr]elease/ 47 | *_i.c 48 | *_p.c 49 | *.ilk 50 | *.meta 51 | *.obj 52 | *.pch 53 | *.pdb 54 | *.pgc 55 | *.pgd 56 | *.rsp 57 | *.sbr 58 | *.tlb 59 | *.tli 60 | *.tlh 61 | *.tmp 62 | *.vspscc 63 | .builds 64 | *.dotCover 65 | 66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 67 | #packages/ 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | 76 | # Visual Studio profiler 77 | *.psess 78 | *.vsp 79 | 80 | # ReSharper is a .NET coding add-in 81 | _ReSharper* 82 | 83 | # Installshield output folder 84 | [Ee]xpress 85 | 86 | # DocProject is a documentation generator add-in 87 | DocProject/buildhelp/ 88 | DocProject/Help/*.HxT 89 | DocProject/Help/*.HxC 90 | DocProject/Help/*.hhc 91 | DocProject/Help/*.hhk 92 | DocProject/Help/*.hhp 93 | DocProject/Help/Html2 94 | DocProject/Help/html 95 | 96 | # Click-Once directory 97 | publish 98 | 99 | # Others 100 | [Bb]in 101 | [Oo]bj 102 | sql 103 | TestResults 104 | *.Cache 105 | ClientBin 106 | stylecop.* 107 | ~$* 108 | *.dbmdl 109 | Generated_Code #added for RIA/Silverlight projects 110 | 111 | # Backup & report files from converting an old project file to a newer 112 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 113 | _UpgradeReport_Files/ 114 | Backup*/ 115 | UpgradeLog*.XML 116 | 117 | 118 | 119 | ############ 120 | ## Windows 121 | ############ 122 | 123 | # Windows image file caches 124 | Thumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | 130 | ############# 131 | ## Python 132 | ############# 133 | 134 | *.py[co] 135 | 136 | # Packages 137 | *.egg 138 | *.egg-info 139 | dist 140 | build 141 | eggs 142 | parts 143 | bin 144 | var 145 | sdist 146 | develop-eggs 147 | .installed.cfg 148 | 149 | # Installer logs 150 | pip-log.txt 151 | 152 | # Unit test / coverage reports 153 | .coverage 154 | .tox 155 | 156 | #Translations 157 | *.mo 158 | 159 | #Mr Developer 160 | .mr.developer.cfg 161 | 162 | # Mac crap 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Dimensionality reduction for sparse binary data 2 | =============================================== 3 | 4 | See [http://fastml.com/dimensionality-reduction-for-sparse-binary-data/](http://fastml.com/dimensionality-reduction-for-sparse-binary-data/) for description. 5 | 6 | adult_results.txt - results of testing on _adult_ dataset 7 | batch.txt - a batch file of commands for conversion 8 | csv_output_snippet.py - how to output csv from gensim 9 | first.py - extract some lines from a file, see batch.txt 10 | 11 | gensim_add_labels.py - add labels (lost during conversion) 12 | gensim_lda.py - perform LDA conversion 13 | gensim_lsi.py - perform LSI conversion 14 | gensim_rp.py - perform random projections conversion 15 | gensim_tfidf.py - perform TF-IDF preprocessing 16 | 17 | libsvm2csv.py - convert libsvm file to csv 18 | rf.r - random forest code used for testing 19 | 20 | spams_nmf.py - perform NMF conversion. Requires SPAMS and scikit-learn for tf-idf. 21 | -------------------------------------------------------------------------------- /adult_results.txt: -------------------------------------------------------------------------------- 1 | Scores from a random forest with 100 trees. 
2 | 3 | ORIG DATA 4 | 5 | # accuracy: 0.8485351 6 | # auc: 0.8792633 7 | 8 | 9 | LSI (+ TF-IDF) 10 | 11 | # accuracy: 0.8295559 12 | # auc: 0.8588338 13 | 14 | LSI 20 (+ TF-IDF) 15 | 16 | # accuracy: 0.8294945 17 | # auc: 0.8592035 18 | 19 | 20 | RP (+ TF-IDF) 21 | 22 | # accuracy: 0.806093 23 | # auc: 0.8047654 24 | 25 | 26 | LDA + TF-IDF 27 | 28 | # accuracy: 0.788895 29 | # auc: 0.8091261 30 | 31 | LDA 32 | 33 | # accuracy: 0.8308458 34 | # auc: 0.826304 35 | 36 | 37 | 38 | ASPECT 10 39 | 40 | # accuracy: 0.8231681 41 | # auc: 0.8653033 42 | 43 | ASPECT 50 44 | 45 | # accuracy: 0.8366808 46 | # auc: 0.8866067 47 | 48 | SPAMS NMF 49 | 50 | # accuracy: 0.8220625 51 | # auc: 0.8639657 52 | 53 | SPAMS NMF + TF-IDF 54 | 55 | # accuracy: 0.8258092 56 | # auc: 0.865312 57 | 58 | CTM 59 | 60 | # accuracy: 0.8247036 61 | # auc: 0.8534664 62 | 63 | ICA 10/10 64 | 65 | # accuracy: 0.8259935 66 | # auc: 0.8647878 67 | 68 | ICA (+ TFIDF) 10/10 w/stabilization 69 | 70 | # accuracy: 0.8273448 71 | # auc: 0.8653916 72 | -------------------------------------------------------------------------------- /batch.txt: -------------------------------------------------------------------------------- 1 | # a list of commands for transforming data 2 | # you can use it as a shell script, or a batch file on Windows (change # to rem) 3 | 4 | # run LSI, save and split back into train and test, convert to csv 5 | 6 | gensim_lsi.py data/orig/all.txt data/lsi_all_.txt 10 7 | 8 | gensim_add_labels.py data/orig/all.txt data/lsi_all_.txt data/lsi_all.txt 9 | 10 | first.py data/lsi_all.txt data/lsi_train.txt 32561 11 | first.py data/lsi_all.txt data/lsi_test.txt 99999 32561 12 | 13 | libsvm2csv.py data/lsi_train.txt data/lsi_train.csv 10 14 | libsvm2csv.py data/lsi_test.txt data/lsi_test.csv 10 15 | -------------------------------------------------------------------------------- /csv_output_snippet.py: -------------------------------------------------------------------------------- 1 | from gensim 
import matutils 2 | 3 | output_file = 'output.csv' 4 | 5 | # http://docs.scipy.org/doc/numpy/reference/generated/numpy.savetxt.html 6 | numpy.savetxt( output_file, matutils.corpus2dense( transformed_corpus, num_cols ), fmt = '%.6f', delimiter = ',' ) 7 | -------------------------------------------------------------------------------- /first.py: -------------------------------------------------------------------------------- 1 | 'Take n first lines from input file and save them to output file' 2 | 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | lines = int( sys.argv[3] ) 9 | except IndexError: 10 | lines = 100 11 | 12 | try: 13 | offset = int( sys.argv[4] ) 14 | except IndexError: 15 | offset = 0 16 | 17 | 18 | i = open( input_file ) 19 | o = open( output_file, 'wb' ) 20 | 21 | count = 0 22 | for line in i: 23 | 24 | if offset > 0: 25 | offset -= 1 26 | continue 27 | 28 | o.write( line ) 29 | count += 1 30 | 31 | if count >= lines: 32 | break 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /gensim_add_labels.py: -------------------------------------------------------------------------------- 1 | 'add labels from the original libsvm file after gensim conversion' 2 | 3 | import sys, csv 4 | 5 | original_file = sys.argv[1] 6 | input_file = sys.argv[2] 7 | output_file = sys.argv[3] 8 | 9 | orig_reader = csv.reader( open( original_file ), delimiter = ' ' ) 10 | reader = csv.reader( open( input_file ), delimiter = ' ' ) 11 | writer = csv.writer( open( output_file, 'wb' ), delimiter = ' ' ) 12 | 13 | n = 0 14 | for line in orig_reader: 15 | label = line[0] 16 | 17 | line = reader.next() 18 | line[0] = label 19 | 20 | writer.writerow( line ) 21 | 22 | n += 1 23 | if n % 10000 == 0: 24 | print n 25 | -------------------------------------------------------------------------------- /gensim_lda.py: 
-------------------------------------------------------------------------------- 1 | import sys, time 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | num_topics = int( sys.argv[3] ) 9 | except IndexError: 10 | num_topics = 50 11 | 12 | ### 13 | 14 | print "loading data..." 15 | print time.strftime("%H:%M:%S", time.localtime()) 16 | print 17 | 18 | c = SvmLightCorpus( input_file ) 19 | 20 | print "running LDA..." 21 | print time.strftime("%H:%M:%S", time.localtime()) 22 | print 23 | 24 | lda = models.LdaModel( c, id2word = None, num_topics = num_topics ) 25 | 26 | print "converting corpus to LDA..." 27 | print time.strftime("%H:%M:%S", time.localtime()) 28 | print 29 | 30 | c_lda = lda[c] 31 | 32 | print "saving..." 33 | print 34 | 35 | SvmLightCorpus.serialize( output_file, c_lda ) 36 | 37 | print "done." 38 | print time.strftime("%H:%M:%S", time.localtime()) 39 | -------------------------------------------------------------------------------- /gensim_lsi.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | num_topics = int( sys.argv[3] ) 9 | except IndexError: 10 | num_topics = 50 11 | 12 | ### 13 | 14 | print "loading data..." 15 | print time.strftime("%H:%M:%S", time.localtime()) 16 | 17 | c = SvmLightCorpus( input_file ) 18 | 19 | print "starting tf-idf..." 20 | print 21 | 22 | tfidf = models.TfidfModel( c ) 23 | c_tfidf = tfidf[c] 24 | 25 | print "running LSI..." 26 | print time.strftime("%H:%M:%S", time.localtime()) 27 | print 28 | 29 | lsi = models.LsiModel( c_tfidf, id2word = None, num_topics = num_topics ) 30 | 31 | print "converting corpus to LSI..." 
32 | print time.strftime("%H:%M:%S", time.localtime()) 33 | 34 | c_lsi = lsi[c_tfidf] 35 | 36 | print "saving..." 37 | print 38 | 39 | SvmLightCorpus.serialize( output_file, c_lsi ) 40 | 41 | print "done." 42 | print time.strftime("%H:%M:%S", time.localtime()) 43 | -------------------------------------------------------------------------------- /gensim_rp.py: -------------------------------------------------------------------------------- 1 | import time, sys 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | num_topics = int( sys.argv[3] ) 9 | except IndexError: 10 | num_topics = 50 11 | 12 | ### 13 | 14 | print "loading data..." 15 | print time.strftime("%H:%M:%S", time.localtime()) 16 | 17 | c = SvmLightCorpus( input_file ) 18 | 19 | print "starting tf-idf..." 20 | print 21 | 22 | tfidf = models.TfidfModel( c ) 23 | c_tfidf = tfidf[c] 24 | 25 | print "running RP..." 26 | print time.strftime("%H:%M:%S", time.localtime()) 27 | print 28 | 29 | rp = models.RpModel( c_tfidf, num_topics = num_topics ) 30 | 31 | print "converting corpus to RP..." 32 | print time.strftime("%H:%M:%S", time.localtime()) 33 | 34 | c_rp = rp[c_tfidf] 35 | 36 | print "saving..." 37 | print 38 | 39 | SvmLightCorpus.serialize( output_file, c_rp ) 40 | 41 | print "done." 42 | print time.strftime("%H:%M:%S", time.localtime()) 43 | -------------------------------------------------------------------------------- /gensim_tfidf.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | ### 9 | 10 | print "loading data..." 11 | print time.strftime("%H:%M:%S", time.localtime()) 12 | 13 | c = SvmLightCorpus( input_file ) 14 | 15 | print "starting tf-idf..." 
16 | print 17 | 18 | tfidf = models.TfidfModel( c ) 19 | c_tfidf = tfidf[c] 20 | 21 | print "saving..." 22 | print 23 | 24 | SvmLightCorpus.serialize( output_file, c_tfidf ) 25 | 26 | print "done." 27 | print time.strftime("%H:%M:%S", time.localtime()) 28 | -------------------------------------------------------------------------------- /libsvm2csv.py: -------------------------------------------------------------------------------- 1 | 'convert libsvm file to csv' 2 | 'libsvm2csv.py ' 3 | 4 | import sys, csv 5 | #import numpy as np 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | d = int( sys.argv[3] ) 11 | assert ( d > 0 ) 12 | 13 | reader = csv.reader( open( input_file ), delimiter = " " ) 14 | writer = csv.writer( open( output_file, 'wb' )) 15 | 16 | for line in reader: 17 | label = line.pop( 0 ) 18 | if line[-1].strip() == '': 19 | line.pop( -1 ) 20 | 21 | # print line 22 | 23 | line = map( lambda x: tuple( x.split( ":" )), line ) 24 | #print line 25 | 26 | new_line = [ label ] + [ 0 ] * d 27 | for i, v in line: 28 | new_line[int(i)] = v 29 | 30 | writer.writerow( new_line ) 31 | -------------------------------------------------------------------------------- /rf.r: -------------------------------------------------------------------------------- 1 | # train a random forest on original or transformed data 2 | 3 | library( randomForest ) 4 | library( caTools ) 5 | 6 | ntrees = 100 7 | 8 | # change the file names 9 | train_file = 'data/train.csv' 10 | validation_file = 'data/test.csv' 11 | label_index = 1 12 | 13 | train <- read.csv( train_file, header = F ) 14 | validation <- read.csv( validation_file, header = F ) 15 | 16 | x_train = train[, -label_index] 17 | y_train = train[, label_index] 18 | 19 | x_validation = validation[, -label_index] 20 | y_validation = validation[, label_index] 21 | 22 | ### 23 | 24 | rf <- randomForest( x_train, as.factor( y_train ), ntree = ntrees, do.trace = 1 ) 25 | 26 | p <- predict( rf, x_validation, type = 
'prob' ) 27 | probs = p[,2] 28 | 29 | p_binary <- predict( rf, validation[,-1] ) 30 | 31 | accuracy = sum( p_binary == y_validation ) / length( p_binary ) 32 | cat( "accuracy:", accuracy, "\n" ) 33 | 34 | auc = colAUC( probs, ( y_validation + 1 ) / 2 ) 35 | auc = auc[1] 36 | cat( "auc:", auc, "\n" ) 37 | 38 | -------------------------------------------------------------------------------- /spams_nmf.py: -------------------------------------------------------------------------------- 1 | 'map input file (libsvm) to topics, output csv' 2 | 3 | from sklearn.datasets import load_svmlight_file 4 | import numpy as np 5 | import spams 6 | import sys 7 | 8 | input_file = sys.argv[1] 9 | output_file = sys.argv[2] 10 | try: 11 | num_topics = int( sys.argv[3] ) 12 | except IndexError: 13 | num_topics = 50 14 | 15 | x_train, y_train = load_svmlight_file( input_file ) 16 | x_train_t = np.transpose( x_train ) 17 | 18 | u = spams.nmf( x_train_t, return_lasso = False, K = num_topics ) 19 | 20 | mapped_x = x_train * u 21 | 22 | y_train.shape = y_train.shape[0], 1 23 | np.savetxt( output_file, np.hstack(( y_train, mapped_x )), delimiter = ",", fmt = '%.6f' ) 24 | --------------------------------------------------------------------------------