├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── adult_results.txt ├── batch.txt ├── csv_output_snippet.py ├── first.py ├── gensim_add_labels.py ├── gensim_lda.py ├── gensim_lsi.py ├── gensim_rp.py ├── gensim_tfidf.py ├── libsvm2csv.py ├── rf.r └── spams_nmf.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 
38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | [Dd]ebug/ 46 | [Rr]elease/ 47 | *_i.c 48 | *_p.c 49 | *.ilk 50 | *.meta 51 | *.obj 52 | *.pch 53 | *.pdb 54 | *.pgc 55 | *.pgd 56 | *.rsp 57 | *.sbr 58 | *.tlb 59 | *.tli 60 | *.tlh 61 | *.tmp 62 | *.vspscc 63 | .builds 64 | *.dotCover 65 | 66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 67 | #packages/ 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | 76 | # Visual Studio profiler 77 | *.psess 78 | *.vsp 79 | 80 | # ReSharper is a .NET coding add-in 81 | _ReSharper* 82 | 83 | # Installshield output folder 84 | [Ee]xpress 85 | 86 | # DocProject is a documentation generator add-in 87 | DocProject/buildhelp/ 88 | DocProject/Help/*.HxT 89 | DocProject/Help/*.HxC 90 | DocProject/Help/*.hhc 91 | DocProject/Help/*.hhk 92 | DocProject/Help/*.hhp 93 | DocProject/Help/Html2 94 | DocProject/Help/html 95 | 96 | # Click-Once directory 97 | publish 98 | 99 | # Others 100 | [Bb]in 101 | [Oo]bj 102 | sql 103 | TestResults 104 | *.Cache 105 | ClientBin 106 | stylecop.* 107 | ~$* 108 | *.dbmdl 109 | Generated_Code #added for RIA/Silverlight projects 110 | 111 | # Backup & report files from converting an old project file to a newer 112 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 113 | _UpgradeReport_Files/ 114 | Backup*/ 115 | UpgradeLog*.XML 116 | 117 | 118 | 119 | ############ 120 | ## Windows 121 | ############ 122 | 123 | # Windows image file caches 124 | Thumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | 130 | ############# 131 | ## Python 132 | ############# 133 | 134 | *.py[co] 135 | 136 | # Packages 137 | *.egg 138 | *.egg-info 139 | dist 140 | build 141 | eggs 142 | parts 143 | bin 144 | var 145 | sdist 146 | develop-eggs 147 | .installed.cfg 148 | 149 | # Installer logs 150 | pip-log.txt 151 | 152 | # Unit test / coverage reports 153 | .coverage 154 | .tox 155 | 156 | #Translations 157 | *.mo 158 | 159 | #Mr Developer 160 | .mr.developer.cfg 161 | 162 | # Mac crap 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Dimensionality reduction for sparse binary data 2 | =============================================== 3 | 4 | See [http://fastml.com/dimensionality-reduction-for-sparse-binary-data/](http://fastml.com/dimensionality-reduction-for-sparse-binary-data/) for description. 5 | 6 | adult_results.txt - results of testing on _adult_ dataset 7 | batch.txt - a batch file of commands for conversion 8 | csv_output_snippet.py - how to output csv from gensim 9 | first.py - extract some lines from a file, see batch.txt 10 | 11 | gensim_add_labels.py - add labels (lost during conversion) 12 | gensim_lda.py - perform LDA conversion 13 | gensim_lsi.py - perform LSI conversion 14 | gensim_rp.py - perform random projections conversion 15 | gensim_tfidf.py - perform TF-IDF preprocessing 16 | 17 | libsvm2csv.py - convert libsvm file to csv 18 | rf.r - random forest code used for testing 19 | 20 | spams_nmf.py - perform NMF conversion. Requires SPAMS and scikit-learn for tf-idf. 21 | -------------------------------------------------------------------------------- /adult_results.txt: -------------------------------------------------------------------------------- 1 | Scores from a random forest with 100 trees. 
2 | 3 | ORIG DATA 4 | 5 | # accuracy: 0.8485351 6 | # auc: 0.8792633 7 | 8 | 9 | LSI (+ TF-IDF) 10 | 11 | # accuracy: 0.8295559 12 | # auc: 0.8588338 13 | 14 | LSI 20 (+ TF-IDF) 15 | 16 | # accuracy: 0.8294945 17 | # auc: 0.8592035 18 | 19 | 20 | RP (+ TF-IDF) 21 | 22 | # accuracy: 0.806093 23 | # auc: 0.8047654 24 | 25 | 26 | LDA + TF-IDF 27 | 28 | # accuracy: 0.788895 29 | # auc: 0.8091261 30 | 31 | LDA 32 | 33 | # accuracy: 0.8308458 34 | # auc: 0.826304 35 | 36 | 37 | 38 | ASPECT 10 39 | 40 | # accuracy: 0.8231681 41 | # auc: 0.8653033 42 | 43 | ASPECT 50 44 | 45 | # accuracy: 0.8366808 46 | # auc: 0.8866067 47 | 48 | SPAMS NMF 49 | 50 | # accuracy: 0.8220625 51 | # auc: 0.8639657 52 | 53 | SPAMS NMF + TF-IDF 54 | 55 | # accuracy: 0.8258092 56 | # auc: 0.865312 57 | 58 | CTM 59 | 60 | # accuracy: 0.8247036 61 | # auc: 0.8534664 62 | 63 | ICA 10/10 64 | 65 | # accuracy: 0.8259935 66 | # auc: 0.8647878 67 | 68 | ICA (+ TFIDF) 10/10 w/stabilization 69 | 70 | # accuracy: 0.8273448 71 | # auc: 0.8653916 72 | -------------------------------------------------------------------------------- /batch.txt: -------------------------------------------------------------------------------- 1 | # a list of commands for transforming data 2 | # you can use it as a shell script, or a batch file on Windows (change # to rem) 3 | 4 | # run LSI, save and split back into train and test, convert to csv 5 | 6 | gensim_lsi.py data/orig/all.txt data/lsi_all_.txt 10 7 | 8 | gensim_add_labels.py data/orig/all.txt data/lsi_all_.txt data/lsi_all.txt 9 | 10 | first.py data/lsi_all.txt data/lsi_train.txt 32561 11 | first.py data/lsi_all.txt data/lsi_test.txt 99999 32561 12 | 13 | libsvm2csv.py data/lsi_train.txt data/lsi_train.csv 10 14 | libsvm2csv.py data/lsi_test.txt data/lsi_test.csv 10 15 | -------------------------------------------------------------------------------- /csv_output_snippet.py: -------------------------------------------------------------------------------- 1 | from gensim 
import matutils 2 | 3 | output_file = 'output.csv' 4 | 5 | # http://docs.scipy.org/doc/numpy/reference/generated/numpy.savetxt.html 6 | numpy.savetxt( output_file, matutils.corpus2dense( transformed_corpus, num_cols ), fmt = '%.6f', delimiter = ',' ) 7 | -------------------------------------------------------------------------------- /first.py: -------------------------------------------------------------------------------- 1 | 'Take n first lines from input file and save them to output file' 2 | 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | lines = int( sys.argv[3] ) 9 | except IndexError: 10 | lines = 100 11 | 12 | try: 13 | offset = int( sys.argv[4] ) 14 | except IndexError: 15 | offset = 0 16 | 17 | 18 | i = open( input_file ) 19 | o = open( output_file, 'wb' ) 20 | 21 | count = 0 22 | for line in i: 23 | 24 | if offset > 0: 25 | offset -= 1 26 | continue 27 | 28 | o.write( line ) 29 | count += 1 30 | 31 | if count >= lines: 32 | break 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /gensim_add_labels.py: -------------------------------------------------------------------------------- 1 | 'add labels from the original libsvm file after gensim conversion' 2 | 3 | import sys, csv 4 | 5 | original_file = sys.argv[1] 6 | input_file = sys.argv[2] 7 | output_file = sys.argv[3] 8 | 9 | orig_reader = csv.reader( open( original_file ), delimiter = ' ' ) 10 | reader = csv.reader( open( input_file ), delimiter = ' ' ) 11 | writer = csv.writer( open( output_file, 'wb' ), delimiter = ' ' ) 12 | 13 | n = 0 14 | for line in orig_reader: 15 | label = line[0] 16 | 17 | line = reader.next() 18 | line[0] = label 19 | 20 | writer.writerow( line ) 21 | 22 | n += 1 23 | if n % 10000 == 0: 24 | print n 25 | -------------------------------------------------------------------------------- /gensim_lda.py: 
-------------------------------------------------------------------------------- 1 | import sys, time 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | num_topics = int( sys.argv[3] ) 9 | except IndexError: 10 | num_topics = 50 11 | 12 | ### 13 | 14 | print "loading data..." 15 | print time.strftime("%H:%M:%S", time.localtime()) 16 | print 17 | 18 | c = SvmLightCorpus( input_file ) 19 | 20 | print "running LDA..." 21 | print time.strftime("%H:%M:%S", time.localtime()) 22 | print 23 | 24 | lda = models.LdaModel( c, id2word = None, num_topics = num_topics ) 25 | 26 | print "converting corpus to LDA..." 27 | print time.strftime("%H:%M:%S", time.localtime()) 28 | print 29 | 30 | c_lda = lda[c] 31 | 32 | print "saving..." 33 | print 34 | 35 | SvmLightCorpus.serialize( output_file, c_lda ) 36 | 37 | print "done." 38 | print time.strftime("%H:%M:%S", time.localtime()) 39 | -------------------------------------------------------------------------------- /gensim_lsi.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | num_topics = int( sys.argv[3] ) 9 | except IndexError: 10 | num_topics = 50 11 | 12 | ### 13 | 14 | print "loading data..." 15 | print time.strftime("%H:%M:%S", time.localtime()) 16 | 17 | c = SvmLightCorpus( input_file ) 18 | 19 | print "starting tf-idf..." 20 | print 21 | 22 | tfidf = models.TfidfModel( c ) 23 | c_tfidf = tfidf[c] 24 | 25 | print "running LSI..." 26 | print time.strftime("%H:%M:%S", time.localtime()) 27 | print 28 | 29 | lsi = models.LsiModel( c_tfidf, id2word = None, num_topics = num_topics ) 30 | 31 | print "converting corpus to LSI..." 
32 | print time.strftime("%H:%M:%S", time.localtime()) 33 | 34 | c_lsi = lsi[c_tfidf] 35 | 36 | print "saving..." 37 | print 38 | 39 | SvmLightCorpus.serialize( output_file, c_lsi ) 40 | 41 | print "done." 42 | print time.strftime("%H:%M:%S", time.localtime()) 43 | -------------------------------------------------------------------------------- /gensim_rp.py: -------------------------------------------------------------------------------- 1 | import time, sys 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | try: 8 | num_topics = int( sys.argv[3] ) 9 | except IndexError: 10 | num_topics = 50 11 | 12 | ### 13 | 14 | print "loading data..." 15 | print time.strftime("%H:%M:%S", time.localtime()) 16 | 17 | c = SvmLightCorpus( input_file ) 18 | 19 | print "starting tf-idf..." 20 | print 21 | 22 | tfidf = models.TfidfModel( c ) 23 | c_tfidf = tfidf[c] 24 | 25 | print "running RP..." 26 | print time.strftime("%H:%M:%S", time.localtime()) 27 | print 28 | 29 | rp = models.RpModel( c_tfidf, num_topics = num_topics ) 30 | 31 | print "converting corpus to RP..." 32 | print time.strftime("%H:%M:%S", time.localtime()) 33 | 34 | c_rp = rp[c_tfidf] 35 | 36 | print "saving..." 37 | print 38 | 39 | SvmLightCorpus.serialize( output_file, c_rp ) 40 | 41 | print "done." 42 | print time.strftime("%H:%M:%S", time.localtime()) 43 | -------------------------------------------------------------------------------- /gensim_tfidf.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | from gensim import corpora, models, similarities 3 | from gensim.corpora import SvmLightCorpus 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | ### 9 | 10 | print "loading data..." 11 | print time.strftime("%H:%M:%S", time.localtime()) 12 | 13 | c = SvmLightCorpus( input_file ) 14 | 15 | print "starting tf-idf..." 
16 | print 17 | 18 | tfidf = models.TfidfModel( c ) 19 | c_tfidf = tfidf[c] 20 | 21 | print "saving..." 22 | print 23 | 24 | SvmLightCorpus.serialize( output_file, c_tfidf ) 25 | 26 | print "done." 27 | print time.strftime("%H:%M:%S", time.localtime()) 28 | -------------------------------------------------------------------------------- /libsvm2csv.py: -------------------------------------------------------------------------------- 1 | 'convert libsvm file to csv' 2 | 'libsvm2csv.py ' 3 | 4 | import sys, csv 5 | #import numpy as np 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | d = int( sys.argv[3] ) 11 | assert ( d > 0 ) 12 | 13 | reader = csv.reader( open( input_file ), delimiter = " " ) 14 | writer = csv.writer( open( output_file, 'wb' )) 15 | 16 | for line in reader: 17 | label = line.pop( 0 ) 18 | if line[-1].strip() == '': 19 | line.pop( -1 ) 20 | 21 | # print line 22 | 23 | line = map( lambda x: tuple( x.split( ":" )), line ) 24 | #print line 25 | 26 | new_line = [ label ] + [ 0 ] * d 27 | for i, v in line: 28 | new_line[int(i)] = v 29 | 30 | writer.writerow( new_line ) 31 | -------------------------------------------------------------------------------- /rf.r: -------------------------------------------------------------------------------- 1 | # train a random forest on original or transformed data 2 | 3 | library( randomForest ) 4 | library( caTools ) 5 | 6 | ntrees = 100 7 | 8 | # change the file names 9 | train_file = 'data/train.csv' 10 | validation_file = 'data/test.csv' 11 | label_index = 1 12 | 13 | train <- read.csv( train_file, header = F ) 14 | validation <- read.csv( validation_file, header = F ) 15 | 16 | x_train = train[, -label_index] 17 | y_train = train[, label_index] 18 | 19 | x_validation = validation[, -label_index] 20 | y_validation = validation[, label_index] 21 | 22 | ### 23 | 24 | rf <- randomForest( x_train, as.factor( y_train ), ntree = ntrees, do.trace = 1 ) 25 | 26 | p <- predict( rf, x_validation, type = 
'prob' ) 27 | probs = p[,2] 28 | 29 | p_binary <- predict( rf, validation[,-1] ) 30 | 31 | accuracy = sum( p_binary == y_validation ) / length( p_binary ) 32 | cat( "accuracy:", accuracy, "\n" ) 33 | 34 | auc = colAUC( probs, ( y_validation + 1 ) / 2 ) 35 | auc = auc[1] 36 | cat( "auc:", auc, "\n" ) 37 | 38 | -------------------------------------------------------------------------------- /spams_nmf.py: -------------------------------------------------------------------------------- 1 | 'map input file (libsvm) to topics, output csv' 2 | 3 | from sklearn.datasets import load_svmlight_file 4 | import numpy as np 5 | import spams 6 | import sys 7 | 8 | input_file = sys.argv[1] 9 | output_file = sys.argv[2] 10 | try: 11 | num_topics = int( sys.argv[3] ) 12 | except IndexError: 13 | num_topics = 50 14 | 15 | x_train, y_train = load_svmlight_file( input_file ) 16 | x_train_t = np.transpose( x_train ) 17 | 18 | u = spams.nmf( x_train_t, return_lasso = False, K = num_topics ) 19 | 20 | mapped_x = x_train * u 21 | 22 | y_train.shape = y_train.shape[0], 1 23 | np.savetxt( output_file, np.hstack(( y_train, mapped_x )), delimiter = ",", fmt = '%.6f' ) 24 | --------------------------------------------------------------------------------