├── DESCRIPTION ├── Makefile ├── NAMESPACE ├── R ├── is.installed.R ├── read.afm.R ├── rface.filter.R ├── rface.load.R ├── rface.predict.R ├── rface.save.R └── rface.train.R ├── README ├── data ├── 6_num_features_X_10_cases.tsv ├── dataGenerator.m ├── scriptTestDataGenerator.m ├── test_4by10_feature1_splits_target0.tsv └── test_6by10_featurerows_matrix.tsv ├── doxy.cfg ├── install_R.sh ├── make_package.sh ├── make_win32.bat ├── make_win64.bat ├── man ├── is.installed.Rd ├── rface.filter.Rd ├── rface.predict.Rd ├── rface.train.Rd └── rfacer.Rd ├── matlab └── rface_filter.cpp ├── rf-ace-launcher.sh ├── rf_ace_batch.py ├── src ├── Makevars ├── argparse.hpp ├── datadefs.cpp ├── datadefs.hpp ├── densetreedata.cpp ├── densetreedata.hpp ├── distributions.cpp ├── distributions.hpp ├── errno.hpp ├── exceptions.hpp ├── feature.cpp ├── feature.hpp ├── math.cpp ├── math.hpp ├── mtrand.h ├── murmurhash3.cpp ├── murmurhash3.hpp ├── node.cpp ├── node.hpp ├── options.hpp ├── progress.cpp ├── progress.hpp ├── reader.cpp ├── reader.hpp ├── rf_ace.cpp ├── rf_ace.hpp ├── rf_ace_R.cpp ├── rootnode.cpp ├── rootnode.hpp ├── statistics.cpp ├── statistics.hpp ├── stochasticforest.cpp ├── stochasticforest.hpp ├── timer.hpp ├── treedata.hpp ├── utils.cpp └── utils.hpp ├── test ├── GBT_benchmark.cpp ├── R │ ├── benchmark.R │ ├── run_tests.R │ └── utils.R ├── bash │ └── treesize_vs_pmissing.sh ├── data │ ├── 12by21_categorical_matrix.arff │ ├── 3by8_mixed_NA_matrix.afm │ ├── 3by8_mixed_NA_transposed_matrix.afm │ └── 5by10_numeric_matrix.arff ├── datadefs_newtest.hpp ├── distributions_newtest.hpp ├── math_newtest.hpp ├── matlab │ ├── categoricalFeatureSplit.m │ ├── deltaImpurity.m │ ├── numericalFeatureSplit.m │ ├── readAFM.m │ └── writeAFM.m ├── newtest.hpp ├── node_newtest.hpp ├── python │ ├── ampute.py │ └── deltaImpurity.py ├── reader_newtest.hpp ├── rface_newtest.hpp ├── rootnode_newtest.hpp ├── run_newtests.cpp ├── treedata_newtest.hpp └── utils_newtest.hpp ├── 
test_103by300_mixed_matrix.afm ├── test_103by300_mixed_nan_matrix.afm ├── test_2by10_text_matrix.afm ├── test_2by8_numerical_matrix.tsv ├── test_3by10_categorical_matrix.tsv ├── test_6by10_mixed_matrix.tsv ├── test_fullSplitterSweep.txt ├── test_fullSplitterSweep_class.txt ├── test_predictor.sf ├── test_rfacer.R ├── testdata.tsv └── tmp ├── feature.cpp ├── feature.hpp └── treesizes.tsv /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rfacer 2 | Type: Package 3 | Title: Random Forests with Artificial Contrast Ensembles 4 | Version: 1.0.8 5 | Date: 2012-10-01 6 | Author: Timo Erkkila 7 | Maintainer: Timo Erkkila 8 | Description: Random Forests with Artificial Contrast Ensembles 9 | License: Apache 2.0 10 | Depends: Rcpp -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMPILER = g++ 2 | CFLAGS = -O3 -std=c++0x -Wall -Wextra -pedantic -Isrc/ -lz 3 | TFLAGS = -pthread 4 | SOURCEFILES = src/densetreedata.cpp src/murmurhash3.cpp src/datadefs.cpp src/progress.cpp src/statistics.cpp src/math.cpp src/stochasticforest.cpp src/rootnode.cpp src/node.cpp src/utils.cpp src/distributions.cpp src/reader.cpp src/feature.cpp 5 | STATICFLAGS = -static-libgcc -static 6 | TESTFILES = test/rface_test.hpp test/distributions_test.hpp test/argparse_test.hpp test/datadefs_test.hpp test/stochasticforest_test.hpp test/utils_test.hpp test/math_test.hpp test/rootnode_test.hpp test/node_test.hpp test/densetreedata_test.hpp 7 | TESTFLAGS = -std=c++0x -L${HOME}/lib/ -L/usr/local/lib -lcppunit -ldl -pedantic -I${HOME}/include/ -I/usr/local/include -Itest/ -Isrc/ 8 | .PHONY: all test clean # Squash directory checks for the usual suspects 9 | 10 | all: rf-ace 11 | 12 | rf-ace: $(SOURCEFILES) 13 | $(COMPILER) $(CFLAGS) src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace 14 | 15 | rf-ace-i386: $(SOURCEFILES) 16 | 
$(COMPILER) $(CFLAGS) -m32 src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace-i386 17 | 18 | rf-ace-amd64: $(SOURCEFILES) 19 | $(COMPILER) $(CFLAGS) -m64 src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace-amd64 20 | 21 | no-threads: $(SOURCEFILES) 22 | $(COMPILER) $(CFLAGS) -DNOTHREADS $(SOURCEFILES) src/rf_ace.cpp -o bin/rf-ace 23 | 24 | debug: $(SOURCEFILES) 25 | $(COMPILER) $(CFLAGS) src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace -g -ggdb -pg 26 | 27 | static: $(SOURCEFILES) 28 | $(COMPILER) $(CFLAGS) src/rf_ace.cpp $(STATICFLAGS) $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace 29 | 30 | static-no-threads: $(SOURCEFILES) 31 | $(COMPILER) $(CFLAGS) -DNOTHREADS src/rf_ace.cpp $(STATICFLAGS) $(SOURCEFILES) -o bin/rf-ace 32 | 33 | GBT_benchmark: test/GBT_benchmark.cpp $(SOURCEFILES) 34 | $(COMPILER) $(CFLAGS) test/GBT_benchmark.cpp $(SOURCEFILES) $(TFLAGS) -o bin/GBT_benchmark 35 | 36 | test: $(SOURCEFILES) 37 | rm -f bin/newtest; $(COMPILER) $(CFLAGS) test/run_newtests.cpp $(SOURCEFILES) $(TFLAGS) -o bin/newtest -ggdb; ./bin/newtest 38 | 39 | test-no-threads: $(SOURCEFILES) 40 | rm -f bin/newtest; $(COMPILER) $(CFLAGS) -DNOTHREADS test/run_newtests.cpp $(SOURCEFILES) -o bin/newtest -ggdb; ./bin/newtest 41 | 42 | clean: 43 | rm -rf bin/rf-ace bin/benchmark bin/GBT_benchmark bin/test bin/*.dSYM/ src/*.o 44 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | import("Rcpp") 3 | useDynLib("rfacer") 4 | -------------------------------------------------------------------------------- /R/is.installed.R: -------------------------------------------------------------------------------- 1 | is.installed <- 2 | function(mypkg) is.element(mypkg, installed.packages()[,1]) 3 | -------------------------------------------------------------------------------- /R/read.afm.R: 
-------------------------------------------------------------------------------- 1 | read.afm <- function(file) 2 | { 3 | 4 | trainData <- read.table(file,head=TRUE,sep="\t",row.names=1) 5 | 6 | trainData <- as.data.frame(t(trainData)) 7 | 8 | featureNames <- names(trainData) 9 | 10 | for( i in 1:length(featureNames) ) { 11 | if ( substr(featureNames[i],1,2) != "N:" ) { 12 | for( j in 1:length(row.names(trainData)) ) { 13 | trainData[j,i] <- as.character(trainData[j,i]) 14 | } 15 | } 16 | } 17 | return(trainData) 18 | } 19 | -------------------------------------------------------------------------------- /R/rface.filter.R: -------------------------------------------------------------------------------- 1 | rface.filter <- 2 | function(filterData, target, featureWeights = vector(length=0), nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 1000, nThreads = 1) { 3 | filterOutput <- .Call("rfaceFilter", filterData, as.character(target), featureWeights, nTrees, mTry, nodeSize, nMaxLeaves, nThreads) 4 | return(filterOutput) 5 | } 6 | -------------------------------------------------------------------------------- /R/rface.load.R: -------------------------------------------------------------------------------- 1 | rface.load <- 2 | function(predictorFile, nThreads = 1) { 3 | predictorObj <- .Call("rfaceLoad",predictorFile,nThreads) 4 | return(predictorObj) 5 | } -------------------------------------------------------------------------------- /R/rface.predict.R: -------------------------------------------------------------------------------- 1 | rface.predict <- 2 | function(predictorObj,testData,quantiles=vector(length=0),nSamplesForQuantiles=10,distributions=FALSE) { 3 | predictions <- .Call("rfacePredict",predictorObj,testData,quantiles,nSamplesForQuantiles,distributions); 4 | return(predictions) 5 | } 6 | -------------------------------------------------------------------------------- /R/rface.save.R: 
-------------------------------------------------------------------------------- 1 | rface.save <- 2 | function(predictorObj,fileName) { 3 | .Call("rfaceSave",predictorObj,fileName) 4 | } -------------------------------------------------------------------------------- /R/rface.train.R: -------------------------------------------------------------------------------- 1 | rface.train <- 2 | function(trainData, target, featureWeights = vector(length=0), forestType = "RF", nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 0, shrinkage = 0.01, noNABranching = FALSE, nThreads = 1) { 3 | predictorObj <- .Call("rfaceTrain", trainData, as.character(target), featureWeights, as.character(forestType), nTrees, mTry, nodeSize, nMaxLeaves, shrinkage, noNABranching, nThreads) 4 | return(predictorObj) 5 | } 6 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | #summary Manual pages. 2 | 3 | *The manual pages have been written on the basis of RF-ACE verson 0.5.5* 4 | 5 | = Description = 6 | 7 | RF-ACE is an efficient C++ implementation of a robust machine learning algorithm for uncovering multivariate associations from large and diverse data sets. RF-ACE natively handles numerical and categorical data with missing values, and potentially large quantities of noninformative features are handled gracefully utilizing artificial contrast features, bootstrapping, and p-value estimation. 8 | 9 | = Installation = 10 | 11 | Download the latest stable release from the [http://code.google.com/p/rf-ace/downloads/list download page], or checkout the latest development version (to directory rf-ace/) by typing 12 | {{{ 13 | svn checkout http://rf-ace.googlecode.com/svn/trunk/ rf-ace 14 | }}} 15 | 16 | Compiler makefiles for Linux (`Makefile`) and Visual Studio for Windows (`make.bat`) are provided in the package. 
In Linux, you can compile the program by typing 17 | {{{ 18 | make 19 | }}} 20 | or 21 | {{{ 22 | make rf_ace 23 | }}} 24 | 25 | In Windows and using Visual Studio, first open up the Visual Studio terminal and execute `make.bat` by typing 26 | {{{ 27 | make 28 | }}} 29 | Simple as that! If you feel lucky, check for compiled binaries at the [http://code.google.com/p/rf-ace/downloads/list download page]. 30 | 31 | = Supported data formats = 32 | RF-ACE currently supports two file formats, Annotated Feature Matrix (AFM) and Attribute-Relation File Format (ARFF). 33 | 34 | == Annotated Feature Matrix (AFM) == 35 | 36 | Annotated Feature Matrix represents the data as a tab-delimited table, where both columns and rows contain headers describing the samples and features. Based on the headers, the AFM reader is able to discern the right orientation (features as rows or columns in the matrix) of the matrix. Namely AFM feature headers must encode whether the feature is (`N`)umerical, (`C`)ategorical, (`O`)rdinal, or (`B`)inary, followed by colon and the actual name of the feature as follows: 37 | 38 | * `B:is_alive` 39 | * `N:age` 40 | * `C:tumor_grage` 41 | * `O:anatomic_organ_subdivision` 42 | 43 | In fact any string, even including colons, spaces, and other special characters, encodes a valid feature name as long as it starts with the preamble `N:`/`C:`/`O:`/`B:`. Thus, the following is a valid feature header: 44 | 45 | * `N:GEXP:TP53:chr17:123:456` 46 | 47 | Sample headers are not constrained, except that they must not contain preambles `N:`/`C:`/`O:`/`B:`, being reserved for the feature headers. 48 | 49 | == Attribute-Relation File Format (ARFF) == 50 | 51 | [http://www.cs.waikato.ac.nz/~ml/weka/arff.html ARFF specification]. 52 | 53 | = Usage = 54 | The following examples follow Linux syntax. 
Type 55 | {{{ 56 | bin/rf_ace --help 57 | }}} 58 | or 59 | {{{ 60 | bin/rf_ace -h 61 | }}} 62 | to bring up help: 63 | {{{ 64 | REQUIRED ARGUMENTS: 65 | -I / --input input feature file (AFM or ARFF) 66 | -i / --target target, specified as integer or string that is to be matched with the content of input 67 | -O / --output output association file 68 | 69 | OPTIONAL ARGUMENTS: 70 | -n / --ntrees number of trees per RF (default nsamples/nrealsamples) 71 | -m / --mtry number of randomly drawn features per node split (default sqrt(nfeatures)) 72 | -s / --nodesize minimum number of train samples per node, affects tree depth (default max{5,nsamples/20}) 73 | -p / --nperms number of Random Forests (default 50) 74 | -t / --pthreshold p-value threshold below which associations are listed (default 0.1) 75 | -g / --gbt Enable (1 == YES) Gradient Boosting Trees, a subsequent filtering procedure (default 0 == NO) 76 | }}} 77 | 78 | So all that is required is an input file (`-I/--input`), either of type `.arff` or `.afm`, and a target (`-i/--target`) to build the RF-ACE model upon. Target in this case corresponds to a feature in the input file, and it can be identified with an index corresponding to it's order of appearance in the file, or with it's name. 
Thus, if the target is `N:age` (we would be looking for features associated with age) existing on row `123` (0-base and omitting the header row), one execute RF-ACE by typing 79 | {{{ 80 | bin/rf_ace --input featurematrix.afm --target 123 --output associations.tsv 81 | }}} 82 | or with the short-hand notation equivalently as 83 | {{{ 84 | bin/rf_ace -I featurematrix.afm -i 123 -O associations.tsv 85 | }}} 86 | or by using the header "N:age" instead of the index by typing 87 | {{{ 88 | bin/rf_ace -I featurematrix.afm -i N:age -O associations.tsv 89 | }}} 90 | In case a provided (sub)string identifies multiple target candidates, RF-ACE will be executed serially for all target candidates, results catenated in the specified output file. 91 | 92 | The above will execute RF-ACE with the default parameters; as the help documentation points out, most of the parameters are estimated dynamically based on the data dimensions and content, so running RF-ACE with no information about the algorithm itself is possible. 93 | 94 | = Output = 95 | The following call (assuming now the substring `age` uniquely identifies just one feature, `N:age`) 96 | {{{ 97 | bin/rf_ace -I featurematrix.afm -i age -O associations.tsv 98 | }}} 99 | produces the output 100 | {{{ 101 | 102 | 103 | --------------------------------------------------------------- 104 | | RF-ACE -- efficient feature selection with heterogeneous data | 105 | | | 106 | | Version: RF-ACE v0.5.5, July 4th, 2011 | 107 | | Project page: http://code.google.com/p/rf-ace | 108 | | Contact: timo.p.erkkila@tut.fi | 109 | | kari.torkkola@gmail.com | 110 | | | 111 | | DEVELOPMENT VERSION, BUGS EXIST! 
| 112 | --------------------------------------------------------------- 113 | 114 | Reading file 'featurematrix.afm' 115 | File type is unknown -- defaulting to Annotated Feature Matrix (AFM) 116 | AFM orientation: features as rows 117 | 118 | RF-ACE parameter configuration: 119 | --input = featurematrix.afm 120 | --nsamples = 223 / 282 (20.922% missing) 121 | --nfeatures = 48912 122 | --targetidx = 123, header 'N:age' 123 | --ntrees = 356 124 | --mtry = 221 125 | --nodesize = 12 126 | --nperms = 50 127 | --pthresold = 0.1 128 | --output = associations.tsv 129 | 130 | Growing 50 Random Forests (RFs), please wait... 131 | RF 1: 4880 nodes (avg. 13.7079 nodes / tree) 132 | RF 2: 4810 nodes (avg. 13.5112 nodes / tree) 133 | RF 3: 4856 nodes (avg. 13.6404 nodes / tree) 134 | RF 4: 4994 nodes (avg. 14.0281 nodes / tree) 135 | RF 5: 5036 nodes (avg. 14.1461 nodes / tree) 136 | RF 6: 5016 nodes (avg. 14.0899 nodes / tree) 137 | RF 7: 5132 nodes (avg. 14.4157 nodes / tree) 138 | ... 139 | RF 47: 4736 nodes (avg. 13.3034 nodes / tree) 140 | RF 48: 5234 nodes (avg. 14.7022 nodes / tree) 141 | RF 49: 4582 nodes (avg. 12.8708 nodes / tree) 142 | RF 50: 5210 nodes (avg. 14.6348 nodes / tree) 143 | 50 RFs, 17800 trees, and 247516 nodes generated in 102.91 seconds (2405.17 nodes per second) 144 | Gradient Boosting Trees *DISABLED* 145 | 146 | Association file created. Format: 147 | TARGET PREDICTOR P-VALUE IMPORTANCE CORRELATION 148 | 149 | Done. 150 | }}} 151 | 152 | If there are no associations found, the program would end as follows: 153 | {{{ 154 | No significant associations found, quitting... 
155 | }}} 156 | 157 | = RF-ACE configuration = 158 | 159 | Information will be added in the future -------------------------------------------------------------------------------- /data/6_num_features_X_10_cases.tsv: -------------------------------------------------------------------------------- 1 | foo S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F1 1 8.5 3.4 7.2 5 6 7 11 9 1 3 | N:F2 2 3 4 5 6 1 1 9 1 10 4 | N:F3 1 1 1 1 1 1 1 2 2 2 5 | N:F4 10 9.9 8 7 6 5 4 3 2.4 1 6 | N:F5 3 3 3 4 4 5 3 2 2 2 7 | N:F6 9 8 7 9 8 7 3 2 1 1 8 | -------------------------------------------------------------------------------- /data/scriptTestDataGenerator.m: -------------------------------------------------------------------------------- 1 | 2 | % Script to test dataGenerator 3 | 4 | clear par; 5 | 6 | % basic parameters 7 | %par.dependency = 'linear'; % or 'nonlinear' 8 | par.dependency = 'nonlinear'; 9 | par.O = 1; % number of target variables 10 | par.N = 200; % number of samples generated 11 | par.n = 5; % number of relevant variables from which target is generated 12 | %par.seed = 1; % random number generator seed, if not given, generated from time 13 | par.sets = 10; % how many data sets to generate 14 | par.testFraction = 0; % fraction of each set written to test file 15 | 16 | % used by nonlinear dependency generation 17 | par.L = 5; % number of functions added together to construct the target 18 | 19 | % used for linear dependency generation, if not specified, will generate randomly 20 | par.P = 1:(-0.1):0.1; 21 | %par.P = [1 0.5 0.25 0.125 0.0625]; 22 | 23 | % post dependency generation 24 | par.Kn = 100; % number of additional noise variables concatenated to data 25 | par.maxClasses = 0; % discretize target, 0=regression 26 | par.mixedType = 0.0; % discretize this fraction of the input variables 27 | par.maxLevels = 6; % max num discrete levels 28 | par.randomizeTarget=0.02; % add noise to target with var 'randomizeTarget' 29 | par.missing = 0.1; % fraction of missing values 30 
| 31 | % uncomment one output option 32 | % par.fileFormat='R'; % samples as rows, tsv, cat levels are strings, (this is slow) 33 | par.fileFormat='x'; % features as rows, tsv, cat levels are numbers 34 | % par.fileFormat='arff'; % arff file 35 | % par.fileFormat='none'; % return a cell array of sets 36 | par.fileFormat={'none','R','x','arff'}; 37 | par.sampleHeader = 1; % generate a header for samples 38 | 39 | [traindata, testdata] = dataGenerator( par ); 40 | 41 | -------------------------------------------------------------------------------- /data/test_4by10_feature1_splits_target0.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F0 na 5.1 nA 3.8 4.2 1.4 8.2 9.1 4.5 na 3 | N:F1 4.2 5.2 7.2 3.9 4.3 1.5 8.3 9.2 4.6 NAN 4 | N:F2 4.2 nan nan 6.1 1.4 7.3 7.3 0.2 na naN 5 | N:F3 nan nan nan nan nan nan nan nan nan nan -------------------------------------------------------------------------------- /data/test_6by10_featurerows_matrix.tsv: -------------------------------------------------------------------------------- 1 | foo S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F1 nA 8.5 3.4 7.2 5 6 7 11 9 NA 3 | N:F2 2 3 4 5 6 NA NA 9 nan 10 4 | C:F3 NA nA naN NaN 1 1 1 2 2 2 5 | N:F4 10 9.9 8 7 6 5 4 3 2.4 1 6 | C:F5 3 3 3 4 4 5 3 2 2 2 7 | N:F6 9 8 7 9 8 7 3 2 1.0 99.23 -------------------------------------------------------------------------------- /install_R.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Prepare C++ compiler flags 4 | export PKG_CPPFLAGS="`Rscript -e 'Rcpp:::CxxFlags()'` -std=c++0x -Wall -Wextra -pedantic" 5 | 6 | ## Prepare library flags 7 | export PKG_LIBS=`Rscript -e "Rcpp:::LdFlags()"` 8 | 9 | ## Make shared library 10 | R CMD SHLIB -o lib/rf_ace_R.so src/rf_ace_R.cpp src/progress.cpp src/statistics.cpp src/math.cpp src/stochasticforest.cpp src/rootnode.cpp src/node.cpp src/treedata.cpp src/datadefs.cpp src/utils.cpp 
src/distributions.cpp -------------------------------------------------------------------------------- /make_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/sh 2 | 3 | package=$1 4 | 5 | tar -czf $package src/*.*pp bin/rf-ace* test/ test_*.* testdata.tsv Makefile make_win32.bat make_win64.bat doxy.cfg rf_ace_batch.py rf-ace-launcher.sh 6 | -------------------------------------------------------------------------------- /make_win32.bat: -------------------------------------------------------------------------------- 1 | mkdir bin 2 | 3 | SetEnv.cmd /x86 /Release 4 | 5 | cl /EHsc /O2 /analyze /DNOTHREADS /Febin\rf-ace-win32.exe src\murmurhash3.cpp src\rf_ace.cpp src\statistics.cpp src\distributions.cpp src\progress.cpp src\stochasticforest.cpp src\rootnode.cpp src\node.cpp src\treedata.cpp src\datadefs.cpp src\math.cpp src\utils.cpp src\reader.cpp src\feature.cpp 6 | 7 | del *.obj 8 | 9 | -------------------------------------------------------------------------------- /make_win64.bat: -------------------------------------------------------------------------------- 1 | mkdir bin 2 | 3 | SetEnv.cmd /x64 /Release 4 | 5 | cl /EHsc /O2 /analyze /DNOTHREADS /Febin\rf-ace-win64.exe src\murmurhash3.cpp src\rf_ace.cpp src\statistics.cpp src\distributions.cpp src\progress.cpp src\stochasticforest.cpp src\rootnode.cpp src\node.cpp src\treedata.cpp src\datadefs.cpp src\math.cpp src\utils.cpp src\reader.cpp src\feature.cpp 6 | 7 | del *.obj 8 | 9 | -------------------------------------------------------------------------------- /man/is.installed.Rd: -------------------------------------------------------------------------------- 1 | \name{is.installed} 2 | \alias{is.installed} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | foo bar 6 | } 7 | \description{ 8 | %% ~~ A concise (1-5 lines) description of what the function does. 
~~ 9 | } 10 | \usage{ 11 | is.installed(mypkg) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{mypkg}{ 16 | %% ~~Describe \code{mypkg} here~~ 17 | } 18 | } 19 | \details{ 20 | %% ~~ If necessary, more details than the description above ~~ 21 | } 22 | \value{ 23 | %% ~Describe the value returned 24 | %% If it is a LIST, use 25 | %% \item{comp1 }{Description of 'comp1'} 26 | %% \item{comp2 }{Description of 'comp2'} 27 | %% ... 28 | } 29 | \references{ 30 | %% ~put references to the literature/web site here ~ 31 | } 32 | \author{ 33 | %% ~~who you are~~ 34 | } 35 | \note{ 36 | %% ~~further notes~~ 37 | } 38 | 39 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 40 | 41 | \seealso{ 42 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 43 | } 44 | \examples{ 45 | ##---- Should be DIRECTLY executable !! ---- 46 | ##-- ==> Define data, use random, 47 | ##-- or do help(data=index) for the standard data sets. 48 | 49 | ## The function is currently defined as 50 | function (mypkg) 51 | is.element(mypkg, installed.packages()[, 1]) 52 | } 53 | % Add one or more standard keywords, see file 'KEYWORDS' in the 54 | % R documentation directory. 55 | \keyword{ ~kwd1 } 56 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 57 | -------------------------------------------------------------------------------- /man/rface.filter.Rd: -------------------------------------------------------------------------------- 1 | \name{rface.filter} 2 | \alias{rface.filter} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Apply feature selection with RF-ACE. 6 | } 7 | \description{ 8 | Apply feature selection with RF-ACE. 9 | } 10 | \usage{ 11 | associations <- rface.filter(filterData, target, featureWeights = vector(length=0), nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 1000, nThreads = 1) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 
14 | \arguments{ 15 | \item{filterData}{ 16 | A data.frame storing the data for feature selection. filterData[featureName] stores a vector of strings (categorical feature) or floats (numerical feature). 17 | } 18 | \item{target}{ 19 | An integer or string, pointing to a feature in filterData, i.e. filterData[target]. 20 | } 21 | \item{featureWeights}{ 22 | A vector of nonnegative weights for the features; affects the sampling distribution. By default all features get weight 1 corresponding uniform sampling. 23 | } 24 | \item{nTrees}{ 25 | Number of trees in the forest. Default 100. 26 | } 27 | \item{mTry}{ 28 | Number of randomly sampled candidate features per split. Default 10. 29 | } 30 | \item{nodeSize}{ 31 | Minimum number of train samples per node. Default 3. 32 | } 33 | \item{nMaxLeaves}{ 34 | Maximum number of leaves per tree. Default 1000. 35 | } 36 | \item{nThreads}{ 37 | Number of CPU threads to train the model with. Default 1. 38 | } 39 | } 40 | \details{ 41 | 42 | } 43 | \value{ 44 | %% ~Describe the value returned 45 | %% If it is a LIST, use 46 | %% \item{comp1 }{Description of 'comp1'} 47 | %% \item{comp2 }{Description of 'comp2'} 48 | %% ... 49 | } 50 | \references{ 51 | http://code.google.com/p/rf-ace 52 | } 53 | \author{ 54 | Timo Erkkila 55 | } 56 | \note{ 57 | %% ~~further notes~~ 58 | } 59 | 60 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 61 | 62 | \seealso{ 63 | \code{ \link{read.afm}, \link{rface.train}, \link{rface.predict}, \link{rface.save}, \link{rface.load} } 64 | } 65 | \examples{ 66 | 67 | afmFile <- "test_103by300_mixed_nan_matrix.afm"; 68 | target <- "N:output"; 69 | 70 | nTrees <- 100; 71 | mTry <- 30; 72 | 73 | nThreads <- 4; 74 | 75 | filterData <- read.afm(afmFile); 76 | 77 | associations <- rface.filter(filterData, target, nTrees = nTrees, mTry = mTry, nThreads = nThreads); 78 | 79 | } 80 | % Add one or more standard keywords, see file 'KEYWORDS' in the 81 | % R documentation directory. 
82 | \keyword{ read.afm } 83 | \keyword{ rface.filter } 84 | -------------------------------------------------------------------------------- /man/rface.predict.Rd: -------------------------------------------------------------------------------- 1 | \name{rface.predict} 2 | \alias{rface.predict} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | foo bar 6 | } 7 | \description{ 8 | %% ~~ A concise (1-5 lines) description of what the function does. ~~ 9 | } 10 | \usage{ 11 | rface.predict(predictor, testData) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{predictor}{ 16 | %% ~~Describe \code{predictor} here~~ 17 | } 18 | \item{testData}{ 19 | %% ~~Describe \code{testData} here~~ 20 | } 21 | } 22 | \details{ 23 | %% ~~ If necessary, more details than the description above ~~ 24 | } 25 | \value{ 26 | %% ~Describe the value returned 27 | %% If it is a LIST, use 28 | %% \item{comp1 }{Description of 'comp1'} 29 | %% \item{comp2 }{Description of 'comp2'} 30 | %% ... 31 | } 32 | \references{ 33 | %% ~put references to the literature/web site here ~ 34 | } 35 | \author{ 36 | %% ~~who you are~~ 37 | } 38 | \note{ 39 | %% ~~further notes~~ 40 | } 41 | 42 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 43 | 44 | \seealso{ 45 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 46 | } 47 | \examples{ 48 | ##---- Should be DIRECTLY executable !! ---- 49 | ##-- ==> Define data, use random, 50 | ##-- or do help(data=index) for the standard data sets. 51 | 52 | ## The function is currently defined as 53 | function (predictor, testData) 54 | { 55 | .Call("rfacePredict", predictor, testData) 56 | } 57 | } 58 | % Add one or more standard keywords, see file 'KEYWORDS' in the 59 | % R documentation directory. 
60 | \keyword{ ~kwd1 } 61 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 62 | -------------------------------------------------------------------------------- /man/rface.train.Rd: -------------------------------------------------------------------------------- 1 | \name{rface.train} 2 | \alias{rface.train} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Builds an RF-ACE predictor object. 6 | } 7 | \description{ 8 | Builds an RF-ACE predictor object. 9 | } 10 | \usage{ 11 | predictorObj <- rface.train(trainData, target, featureWeights = vector(length=0), nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 1000, quantiles = vector(length=0), nThreads = 1) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{trainData}{ 16 | A data.frame storing the training data. trainData[featureName] stores a vector of strings (categorical feature) or floats (numerical feature). 17 | } 18 | \item{target}{ 19 | An integer or string, pointing to a feature in trainData, i.e. trainData[target]. 20 | } 21 | \item{featureWeights}{ 22 | A vector of nonnegative weights for the features; affects the sampling distribution. By default all features get weight 1 corresponding uniform sampling. 23 | } 24 | \item{nTrees}{ 25 | Number of trees in the forest. Default 100. 26 | } 27 | \item{mTry}{ 28 | Number of randomly sampled candidate features per split. Default 10. 29 | } 30 | \item{nodeSize}{ 31 | Minimum number of train samples per node. Default 3. 32 | } 33 | \item{nMaxLeaves}{ 34 | Maximum number of leaves per tree. Default 1000. 35 | } 36 | \item{quantiles}{ 37 | A vector of quantile points to provide predictions for. If empty, mean prediction will be calculated. Quantiles are only applicable in regression. 38 | } 39 | \item{nThreads}{ 40 | Number of CPU threads to train the model with. Default 1. 
41 | } 42 | } 43 | \details{ 44 | 45 | } 46 | \value{ 47 | %% ~Describe the value returned 48 | %% If it is a LIST, use 49 | %% \item{comp1 }{Description of 'comp1'} 50 | %% \item{comp2 }{Description of 'comp2'} 51 | %% ... 52 | } 53 | \references{ 54 | http://code.google.com/p/rf-ace 55 | } 56 | \author{ 57 | Timo Erkkila 58 | } 59 | \note{ 60 | %% ~~further notes~~ 61 | } 62 | 63 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 64 | 65 | \seealso{ 66 | \code{ \link{read.afm}, \link{rface.filter}, \link{rface.predict}, \link{rface.save}, \link{rface.load} } 67 | } 68 | \examples{ 69 | 70 | afmFile <- "test_103by300_mixed_nan_matrix.afm"; 71 | target <- "N:output"; 72 | 73 | nTrees <- 100; 74 | mTry <- 30; 75 | 76 | nThreads <- 4; 77 | 78 | trainData <- read.afm(afmFile); 79 | 80 | predictorObj <- rface.train(trainData, target, nTrees = nTrees, mTry = mTry, nThreads = nThreads); 81 | 82 | } 83 | % Add one or more standard keywords, see file 'KEYWORDS' in the 84 | % R documentation directory. 
85 | \keyword{ read.afm } 86 | \keyword{ rface.predict } 87 | -------------------------------------------------------------------------------- /man/rfacer.Rd: -------------------------------------------------------------------------------- 1 | \name{skeleton-package} 2 | \alias{skeleton-package} 3 | \alias{skeleton} 4 | \docType{package} 5 | \title{ 6 | What the package does (short line) 7 | ~~ package title ~~ 8 | } 9 | \description{ 10 | More about what it does (maybe more than one line) 11 | ~~ A concise (1-5 lines) description of the package ~~ 12 | } 13 | \details{ 14 | \tabular{ll}{ 15 | Package: \tab skeleton\cr 16 | Type: \tab Package\cr 17 | Version: \tab 1.0\cr 18 | Date: \tab 2012-10-01\cr 19 | License: \tab What license is it under?\cr 20 | } 21 | ~~ An overview of how to use the package, including the most important ~~ 22 | ~~ functions ~~ 23 | } 24 | \author{ 25 | Who wrote it 26 | 27 | Maintainer: Who to complain to 28 | ~~ The author and/or maintainer of the package ~~ 29 | } 30 | \references{ 31 | ~~ Literature or other references for background information ~~ 32 | } 33 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ 34 | ~~ the R documentation directory ~~ 35 | \keyword{ package } 36 | \seealso{ 37 | ~~ Optional links to other man pages, e.g. 
~~ 38 | ~~ \code{\link[:-package]{}} ~~ 39 | } 40 | \examples{ 41 | ~~ simple examples of the most important functions ~~ 42 | } 43 | -------------------------------------------------------------------------------- /matlab/rface_filter.cpp: -------------------------------------------------------------------------------- 1 | #include "mex.h" 2 | 3 | void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { 4 | 5 | 6 | 7 | } 8 | -------------------------------------------------------------------------------- /rf-ace-launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Generic launcher for RF-ACE 3 | # 4 | # This script is used primarily with GenePattern, allowing us to specify 5 | # all options explicitly, with defaults provided by GenePattern's web 6 | # interface. In effect, this makes all parameters positional. 7 | # 8 | # The GenePattern command line for this is: 9 | # sh rf-ace-launcher.sh \ 10 | # \ 11 | # \ 12 | # "" 13 | 14 | export PATH=$1:$PATH 15 | 16 | chmod a+x $1/rf_ace 17 | echo "Running: \ 18 | rf_ace --input=$2 --target=$3 --output=$4 --RF_ntrees=$5 \ 19 | --RF_mtry=$6 --RF_nodesize=$7 --RF_nperms=$8 --RF_pthreshold=$9 \ 20 | --GBT_ntrees=${10} --GBT_maxleaves=${11} --GBT_shrinkage=${12} \ 21 | --GBT_samplesize=${13} ${14}" 22 | 23 | rf_ace --input=$2 --target=$3 --output=$4 --RF_ntrees=$5 \ 24 | --RF_mtry=$6 --RF_nodesize=$7 --RF_nperms=$8 --RF_pthreshold=$9 \ 25 | --GBT_ntrees=${10} --GBT_maxleaves=${11} --GBT_shrinkage=${12} \ 26 | --GBT_samplesize=${13} ${14} 27 | -------------------------------------------------------------------------------- /rf_ace_batch.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | assert sys.argv[1] != sys.argv[4] 4 | assert int(sys.argv[2]) <= int(sys.argv[3]) 5 | 6 | xlist = xrange(int(sys.argv[2]),int(sys.argv[3])+1) 7 | 8 | for targetidx in xlist: 9 | os.system('bin/rf_ace 
--traindata '+sys.argv[1]+' --target '+str(targetidx)+' --associations '+sys.argv[4]+'_'+str(targetidx)) 10 | 11 | os.system('cat '+sys.argv[4]+'_* > '+sys.argv[4]) 12 | os.system('rm '+sys.argv[4]+'_*') 13 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | ## Prepare C++ compiler flags 2 | PKG_CPPFLAGS=$(shell ${R_HOME}/bin/Rscript -e 'Rcpp:::CxxFlags()') -std=c++0x -Wall -Wextra -pedantic -DNOTHREADS 3 | 4 | ## Prepare library flags 5 | PKG_LIBS=$(shell ${R_HOME}/bin/Rscript -e 'Rcpp:::LdFlags()') 6 | 7 | ## Make shared library 8 | ## R CMD SHLIB -o lib/rf_ace_R.so src/rf_ace_R.cpp src/progress.cpp src/statistics.cpp src/math.cpp src/stochasticforest.cpp src/rootnode.cpp src/node.cpp src/treedata.cpp src/datadefs.cpp src/utils.cpp src/distributions.cpp -------------------------------------------------------------------------------- /src/argparse.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ARGPARSE_HPP 2 | #define ARGPARSE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "errno.hpp" 11 | //#include "exceptions.hpp" 12 | 13 | using namespace std; 14 | 15 | /** 16 | * Generic argument parser that allows options to be checked 17 | * speculatively. Currently uses a map representation of the argument tree. 18 | */ 19 | class ArgParse { 20 | 21 | public: 22 | 23 | ArgParse() {} 24 | 25 | ArgParse(const int argc, char* const argv[]) { 26 | if (argc < 1) { 27 | throw ERRNO::INVALID_ARGUMENT; 28 | //throw EXCEPTION_INVALID_ARGUMENT; 29 | } 30 | 31 | string currArg = ""; 32 | for (int i = 0; i < argc; ++i) { 33 | 34 | try { 35 | // !! Correctness Note: this strategy may attempt to dereference 36 | // !! corrupt memory. Thus, these runtime checks, while better than an 37 | // !! 
outright crash, may imply security vulnerabilities in dependent 38 | // !! code. Beware! 39 | if (argv[i] == NULL || argv[i][0] == 0x0) { 40 | throw ERRNO::INVALID_ARGUMENT; 41 | } else { 42 | if (!currArg.empty()) { 43 | mappedArgs[currArg] = string(argv[i]); 44 | currArg = string(""); 45 | //continue; 46 | } 47 | switch(argv[i][0]) { 48 | case '-': 49 | case '+': 50 | if (argv[i][1] == '\0') { break; } 51 | if (argv[i][1] == '-' || 52 | argv[i][1] == '+') { 53 | 54 | stringstream argSS; 55 | bool containsEquals = false; 56 | size_t idx; 57 | size_t len = strlen(argv[i]); 58 | for (idx = 2; idx < len; ++idx) { 59 | if (argv[i][idx] == '=') { 60 | containsEquals = true; 61 | break; 62 | } 63 | argSS << (char)argv[i][idx]; 64 | } 65 | 66 | currArg = argSS.str(); 67 | if (containsEquals) { 68 | stringstream valSS; 69 | for (++idx; idx < len; ++idx) { 70 | valSS << (char)argv[i][idx]; 71 | } 72 | mappedArgs[currArg] = valSS.str(); 73 | currArg = string(""); 74 | } 75 | 76 | 77 | } else { 78 | size_t len = strlen(argv[i]); 79 | for (size_t idx = 1; idx < len; ++idx) { 80 | char arg[] = {argv[i][idx], '\0'}; 81 | mappedArgs[string(arg)] = string(""); 82 | currArg = string(arg); 83 | } 84 | } 85 | 86 | break; 87 | default: 88 | extraArgs.push_back(string(argv[i])); 89 | break; 90 | } 91 | } 92 | } catch (...) { 93 | //assert(false); // Check if argv was corrupt or overstepped. Implies a 94 | // major FIXME if this is hit. (Disabled in lieu of 95 | // runtime checks during testing) 96 | 97 | throw ERRNO::ILLEGAL_MEMORY_ACCESS; // Perform a safer runtime check 98 | // that should never be hit by 99 | // correct code. 
100 | } 101 | } 102 | if (!currArg.empty()) { 103 | mappedArgs[currArg] = string(""); 104 | currArg = string(""); 105 | } 106 | 107 | /* 108 | cout << "Mapped args:" << endl; 109 | for (map::iterator it = mappedArgs.begin(); it != mappedArgs.end(); ++it) { 110 | cout << (*it).first << "->" << (*it).second << endl; 111 | } 112 | 113 | cout << "Extra args:" << endl; 114 | for (int i = 0; i < extraArgs.size(); ++i) { 115 | cout << extraArgs[i] << endl; 116 | }*/ 117 | } 118 | ~ArgParse() {} 119 | 120 | /** 121 | * Queries the backend map for the current argument-value pair. Extra 122 | * arguments passed positionally are not yet supported. 123 | * 124 | * Contractual guarantees: 125 | * 126 | * + The behavior of the extraction operator (>>) will be used with input 127 | * type T. This may cause unexpected results if your type explicitly 128 | * specifies an append instead of an overwrite for this 129 | * operation. Declare the contents of returnVal carefully or redefine 130 | * your type for these cases. 131 | * 132 | * + Certain types specifiable for T may cause memory access violations that 133 | * are difficult to debug. For example, specifying char* may throw a 134 | * memory access violation at 'SOptarg >> returnVal'. It is expected and 135 | * indeed required that your types be well-defined before passing them 136 | * to this method. 137 | * 138 | * + Attempting to pass a NULL pointer for any input value will not 139 | * work. Don't do it. 140 | * 141 | * + Duplicate arguments will prefer the last long specification over the 142 | * last short specification of that same argument. 143 | * 144 | * + Arguments are case-sensitive. 145 | * 146 | * Sets returnVal and returns true if an argument was found; false 147 | * otherwise. !! 
TODO return a unified status code instead 148 | */ 149 | template bool getArgument(const char* shortName, const char* longName, T& returnVal) { 150 | 151 | assert(shortName != NULL); 152 | assert(longName != NULL); 153 | assert(strlen(shortName) == 1); 154 | assert(*longName != 0); 155 | 156 | map::iterator it = mappedArgs.find(longName); 157 | if (it == mappedArgs.end()) { 158 | it = mappedArgs.find(shortName); 159 | } 160 | 161 | if (it != mappedArgs.end()) { 162 | string found = (*it).second; 163 | if (found.empty()) { 164 | throw ERRNO::INVALID_VALUE; 165 | } 166 | stringstream ss(found); 167 | ss >> returnVal; 168 | 169 | if (ss.fail() || !ss.eof()) { 170 | throw ERRNO::INVALID_VALUE; 171 | } 172 | return true; 173 | } 174 | 175 | return false; 176 | } 177 | 178 | template bool getArgument(const string& shortName, const string& longName, T& returnVal) { 179 | return getArgument(shortName.c_str(), longName.c_str(), returnVal); 180 | } 181 | 182 | /** 183 | * Queries the backend map for the presence of a flag. This can also be used 184 | * to check for the presence of an argument-value pair or non-presence of a 185 | * value, but abusing this functionality is not recommended. 186 | * 187 | * Contractual guarantees: 188 | * 189 | * + Attempting to pass a NULL pointer for any input value will not 190 | * work. Don't do it. 191 | * 192 | * + Duplicate flags are assumed to be one instance of the set 193 | * flag. Conflicting, non-duplicate flags are your problem. 194 | * 195 | * + Flags are case-sensitive. 
196 | * 197 | */ 198 | bool getFlag(const char* shortName, const char* longName, bool& returnVal) { 199 | 200 | assert(shortName != NULL); 201 | assert(longName != NULL); 202 | assert(strlen(shortName) == 1); 203 | assert(*longName != 0); 204 | 205 | map::iterator it = mappedArgs.find(longName); 206 | if (it == mappedArgs.end()) { 207 | it = mappedArgs.find(shortName); 208 | } 209 | 210 | if (it != mappedArgs.end()) { 211 | returnVal = true; 212 | return true; 213 | } 214 | 215 | return false; 216 | } 217 | 218 | bool getFlag(const string& shortName, const string& longName, bool& returnVal) { 219 | return getFlag(shortName.c_str(), longName.c_str(), returnVal); 220 | } 221 | 222 | 223 | private: 224 | map mappedArgs; 225 | vector extraArgs; 226 | }; 227 | 228 | #endif 229 | -------------------------------------------------------------------------------- /src/datadefs.cpp: -------------------------------------------------------------------------------- 1 | #include "datadefs.hpp" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef NOTHREADS 10 | #include 11 | #endif 12 | 13 | 14 | using namespace std; 15 | //using datadefs::ForestType; 16 | 17 | //////////////////////////////////////////////////////////// 18 | // CONSTANTS 19 | //////////////////////////////////////////////////////////// 20 | const datadefs::num_t datadefs::NUM_NAN = numeric_limits::quiet_NaN();//numeric_limits::infinity(); 21 | //const datadefs::cat_t datadefs::CAT_NAN = "NA"; 22 | const string datadefs::STR_NAN = "NA"; 23 | const datadefs::num_t datadefs::NUM_INF = numeric_limits::infinity(); 24 | const size_t datadefs::MAX_IDX = numeric_limits::max() - 1; 25 | const datadefs::num_t datadefs::EPS = 1e-18; //1e-12; 26 | const datadefs::num_t datadefs::NUM_PI = 3.1415926535; 27 | const datadefs::num_t datadefs::A = 0.140012; 28 | const datadefs::num_t datadefs::LOG_OF_MAX_NUM = 70.0; /** !! Potentially 29 | * spurious. 
Do you mean 30 | * the log of the 31 | * maximum number 32 | * expressible as a 33 | * num_t? */ 34 | 35 | const char datadefs::tokenDelimiters[] = " \t,.;:?!@'\"-\n\0"; 36 | 37 | // List of NaN's adapted from http://en.wikipedia.org/wiki/NaN#Display 38 | const set datadefs::NANs = {"NA","NAN","NULL","?"}; 39 | 40 | const string datadefs::CONTRAST = "CONTRAST"; 41 | 42 | #ifndef NOTHREADS 43 | const size_t datadefs::MAX_THREADS = thread::hardware_concurrency(); 44 | #endif 45 | 46 | #ifdef NOTHREADS 47 | const size_t datadefs::MAX_THREADS = 1; 48 | #endif 49 | 50 | //enum ForestType {RF, GBT, CART, UNKNOWN}; 51 | 52 | const map datadefs::forestTypeAssign = { {"RF",datadefs::forest_t::RF}, {"GBT",datadefs::forest_t::GBT}, {"QRF",datadefs::forest_t::QRF} }; 53 | 54 | const bool datadefs::SF_DEFAULT_NO_NA_BRANCHING = false; 55 | const vector datadefs::SF_DEFAULT_QUANTILES = {}; 56 | 57 | // Random Forest default configuration 58 | const size_t datadefs::RF_DEFAULT_N_TREES = 100; 59 | const size_t datadefs::RF_DEFAULT_M_TRY = 0; 60 | const size_t datadefs::RF_DEFAULT_N_MAX_LEAVES = datadefs::MAX_IDX; 61 | const size_t datadefs::RF_DEFAULT_NODE_SIZE = 3; 62 | const datadefs::num_t datadefs::RF_DEFAULT_IN_BOX_FRACTION = 1.0; 63 | const datadefs::num_t datadefs::RF_DEFAULT_SAMPLE_WITH_REPLACEMENT = true; 64 | const bool datadefs::RF_DEFAULT_USE_CONTRASTS = false; 65 | const datadefs::num_t datadefs::RF_DEFAULT_CONTRAST_FRACTION = 0.5; 66 | const bool datadefs::RF_DEFAULT_IS_RANDOM_SPLIT = true; 67 | const datadefs::num_t datadefs::RF_DEFAULT_SHRINKAGE = 0.0; 68 | const vector datadefs::RF_DEFAULT_QUANTILES = {}; 69 | const size_t datadefs::RF_DEFAULT_N_SAMPLES_FOR_QUANTILES = 0; 70 | 71 | // Random Forest default configuration 72 | const size_t datadefs::QRF_DEFAULT_N_TREES = 100; 73 | const size_t datadefs::QRF_DEFAULT_M_TRY = 0; 74 | const size_t datadefs::QRF_DEFAULT_N_MAX_LEAVES = datadefs::MAX_IDX; 75 | const size_t datadefs::QRF_DEFAULT_NODE_SIZE = 3; 76 | const 
datadefs::num_t datadefs::QRF_DEFAULT_IN_BOX_FRACTION = 1.0; 77 | const datadefs::num_t datadefs::QRF_DEFAULT_SAMPLE_WITH_REPLACEMENT = true; 78 | const bool datadefs::QRF_DEFAULT_USE_CONTRASTS = false; 79 | const datadefs::num_t datadefs::QRF_DEFAULT_CONTRAST_FRACTION = 0.5; 80 | const bool datadefs::QRF_DEFAULT_IS_RANDOM_SPLIT = true; 81 | const datadefs::num_t datadefs::QRF_DEFAULT_SHRINKAGE = 0.0; 82 | const vector datadefs::QRF_DEFAULT_QUANTILES = {0.25,0.5,0.75}; 83 | const size_t datadefs::QRF_DEFAULT_N_SAMPLES_FOR_QUANTILES = 10; 84 | 85 | // Gradient Boosting Trees default configuration 86 | const size_t datadefs::GBT_DEFAULT_N_TREES = 100; 87 | const size_t datadefs::GBT_DEFAULT_M_TRY = 0; 88 | const size_t datadefs::GBT_DEFAULT_N_MAX_LEAVES = 6; 89 | const size_t datadefs::GBT_DEFAULT_NODE_SIZE = 3; 90 | const datadefs::num_t datadefs::GBT_DEFAULT_IN_BOX_FRACTION = 0.5; 91 | const datadefs::num_t datadefs::GBT_DEFAULT_SAMPLE_WITH_REPLACEMENT = false; 92 | const bool datadefs::GBT_DEFAULT_USE_CONTRASTS = false; 93 | const datadefs::num_t datadefs::GBT_DEFAULT_CONTRAST_FRACTION = 0.5; 94 | const bool datadefs::GBT_DEFAULT_IS_RANDOM_SPLIT = false; 95 | const datadefs::num_t datadefs::GBT_DEFAULT_SHRINKAGE = 0.1; 96 | const vector datadefs::GBT_DEFAULT_QUANTILES = {}; 97 | const size_t datadefs::GBT_DEFAULT_N_SAMPLES_FOR_QUANTILES = 0; 98 | 99 | // Statistical test default configuration 100 | const size_t datadefs::FILTER_DEFAULT_N_PERMS = 20; 101 | const datadefs::num_t datadefs::FILTER_DEFAULT_P_VALUE_THRESHOLD = 0.05; 102 | const bool datadefs::FILTER_DEFAULT_IS_ADJUSTED_P_VALUE = false; 103 | const datadefs::num_t datadefs::FILTER_DEFAULT_IMPORTANCE_THRESHOLD = 10; 104 | const bool datadefs::FILTER_NORMALIZE_IMPORTANCE_VALUES = false; 105 | const bool datadefs::FILTER_DEFAULT_REPORT_NONEXISTENT_FEATURES = false; 106 | 107 | // Default general configuration 108 | const bool datadefs::GENERAL_DEFAULT_PRINT_HELP = false; 109 | const char 
datadefs::GENERAL_DEFAULT_DATA_DELIMITER = '\t'; 110 | const char datadefs::GENERAL_DEFAULT_HEADER_DELIMITER = ':'; 111 | const size_t datadefs::GENERAL_DEFAULT_MIN_SAMPLES = 5; 112 | const int datadefs::GENERAL_DEFAULT_SEED = -1; 113 | const size_t datadefs::GENERAL_DEFAULT_N_THREADS = 1; 114 | const bool datadefs::GENERAL_DEFAULT_IS_MAX_THREADS = false; 115 | const datadefs::num_t datadefs::GENERAL_DEFAULT_FEATURE_WEIGHT = 0; 116 | 117 | //////////////////////////////////////////////////////////// 118 | // HELPER FUNCTIONS 119 | //////////////////////////////////////////////////////////// 120 | 121 | /** 122 | * Promote each character in a sequence to uppercase. Effectively, a wrapper 123 | * around std::transform. 124 | */ 125 | 126 | string datadefs::toUpperCase(const string& str) { 127 | int (*pf)(int) = toupper; 128 | string strcopy(str); 129 | transform(strcopy.begin(), strcopy.end(), strcopy.begin(), pf); 130 | return(strcopy); 131 | } 132 | 133 | bool datadefs::isInteger(const string& str, int& integer) { 134 | stringstream ss(str); 135 | if(ss >> integer && ss.eof()) { 136 | return(true); 137 | } else { 138 | return(false); 139 | } 140 | } 141 | 142 | 143 | /** 144 | * Count all values that aren't transfinite 145 | !! Correctness: what about representations of infinity? And to be entirely 146 | pedantic: signaling NaN, post-trap? These should have specific non-guarantees. 
147 | */ 148 | void datadefs::countRealValues(vector const& data, size_t& nRealValues) { 149 | nRealValues = 0; 150 | for(size_t i = 0; i < data.size(); ++i) { 151 | if(!datadefs::isNAN(data[i])) { 152 | ++nRealValues; 153 | } 154 | } 155 | } 156 | 157 | -------------------------------------------------------------------------------- /src/densetreedata.hpp: -------------------------------------------------------------------------------- 1 | //densetreedata.hpp 2 | // 3 | // 4 | 5 | #ifndef DENSETREEDATA_HPP 6 | #define DENSETREEDATA_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "datadefs.hpp" 15 | #include "distributions.hpp" 16 | #include "options.hpp" 17 | #include "feature.hpp" 18 | #include "reader.hpp" 19 | #include "treedata.hpp" 20 | 21 | using namespace std; 22 | using datadefs::num_t; 23 | 24 | class DenseTreeData : public TreeData { 25 | public: 26 | 27 | // Initializes the object 28 | DenseTreeData(const vector& features, bool useContrasts = false, const vector& sampleHeaders = vector(0)); 29 | 30 | // Initializes the object and reads in a data matrix 31 | DenseTreeData(string fileName, const char dataDelimiter, const char headerDelimiter, const bool useContrasts = false); 32 | 33 | ~DenseTreeData(); 34 | 35 | // Reveals the Feature class interface to the user 36 | const Feature* feature(const size_t featureIdx) const { 37 | return( &features_[featureIdx] ); 38 | } 39 | 40 | // Returns the number of features 41 | size_t nFeatures() const; 42 | 43 | // Returns feature index, given the name 44 | size_t getFeatureIdx(const string& featureName) const; 45 | 46 | // A value denoting the "one-over-last" feature in matrix 47 | size_t end() const { return( datadefs::MAX_IDX ); } 48 | 49 | // Returns sample name, given sample index 50 | string getSampleName(const size_t sampleIdx); 51 | 52 | // Returns the number of samples 53 | size_t nSamples() const; 54 | 55 | vector getFeatureWeights() const; 56 | 57 | void 
separateMissingSamples(const size_t featureIdx, 58 | vector& sampleIcs, 59 | vector& missingIcs); 60 | 61 | num_t numericalFeatureSplit(const size_t targetIdx, 62 | const size_t featureIdx, 63 | const size_t minSamples, 64 | vector& sampleIcs_left, 65 | vector& sampleIcs_right, 66 | num_t& splitValue); 67 | 68 | num_t categoricalFeatureSplit(const size_t targetIdx, 69 | const size_t featureIdx, 70 | const vector& catOrder, 71 | const size_t minSamples, 72 | vector& sampleIcs_left, 73 | vector& sampleIcs_right, 74 | unordered_set& splitValues_left); 75 | 76 | num_t textualFeatureSplit(const size_t targetIdx, 77 | const size_t featureIdx, 78 | const uint32_t hashIdx, 79 | const size_t minSamples, 80 | vector& sampleIcs_left, 81 | vector& sampleIcs_right); 82 | 83 | //string getRawFeatureData(const size_t featureIdx, const size_t sampleIdx); 84 | //string getRawFeatureData(const size_t featureIdx, const num_t data); 85 | //vector getRawFeatureData(const size_t featureIdx); 86 | 87 | // Generates a bootstrap sample from the real samples of featureIdx. Samples not in the bootstrap sample will be stored in oob_ics, 88 | // and the number of oob samples is stored in noob. 
89 | void bootstrapFromRealSamples(distributions::Random* random, 90 | const bool withReplacement, 91 | const num_t sampleSize, 92 | const size_t featureIdx, 93 | vector& ics, 94 | vector& oobIcs); 95 | 96 | void createContrasts(); 97 | void permuteContrasts(distributions::Random* random); 98 | 99 | void replaceFeatureData(const size_t featureIdx, const vector& featureData); 100 | void replaceFeatureData(const size_t featureIdx, const vector& rawFeatureData); 101 | 102 | 103 | #ifndef TEST__ 104 | private: 105 | #endif 106 | 107 | enum FileType {UNKNOWN, AFM, ARFF}; 108 | 109 | FileType getFileType(const string& fileName); 110 | 111 | bool isRowsAsSamplesInAFM(Reader& reader, const char headerDelimiter); 112 | 113 | void readAFM(const string& fileName, const char dataDelimiter, const char headerDelimiter); 114 | //void readARFF(const string& fileName); 115 | 116 | //void parseARFFattribute(const string& str, string& attributeName, bool& isFeatureNumerical); 117 | 118 | bool isValidNumericalHeader(const string& str, const char headerDelimiter); 119 | bool isValidCategoricalHeader(const string& str, const char headerDelimiter); 120 | bool isValidTextHeader(const string& str, const char headerDelimiter); 121 | bool isValidFeatureHeader(const string& str, const char headerDelimiter); 122 | 123 | //template void transpose(vector >& mat); 124 | 125 | bool useContrasts_; 126 | 127 | vector features_; 128 | vector sampleHeaders_; 129 | 130 | unordered_map name2idx_; 131 | 132 | }; 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /src/distributions.cpp: -------------------------------------------------------------------------------- 1 | #include "distributions.hpp" 2 | 3 | #include "utils.hpp" 4 | 5 | using datadefs::num_t; 6 | 7 | distributions::Random::Random(): 8 | rand_(0,datadefs::MAX_IDX) { 9 | this->seed( distributions::generateSeed() ); 10 | } 11 | 12 | distributions::Random::Random(size_t seed): 13 | 
rand_(0,datadefs::MAX_IDX) { 14 | 15 | this->seed(seed); 16 | 17 | } 18 | 19 | distributions::Random::~Random() { 20 | 21 | } 22 | 23 | void distributions::Random::seed(size_t seed) { 24 | eng_.seed(seed); 25 | } 26 | 27 | size_t distributions::Random::integer() { 28 | return( rand_(eng_) ); 29 | } 30 | 31 | num_t distributions::Random::uniform() { 32 | 33 | return( 1.0 * rand_(eng_) / ( datadefs::MAX_IDX + 1 ) ); 34 | 35 | } 36 | 37 | distributions::PMF::PMF(const vector& weights) { 38 | 39 | size_t n = weights.size(); 40 | 41 | num_t sum = 0.0; 42 | 43 | for ( size_t i = 0; i < n; ++i ) { 44 | assert( weights[i] >= 0.0 ); 45 | sum += weights[i]; 46 | } 47 | 48 | prob_.resize(n); 49 | alias_.resize(n); 50 | 51 | vector HL(n); 52 | vector::iterator H(HL.begin()-1); 53 | vector::iterator L(HL.end()); 54 | 55 | for ( size_t i = 0; i < n; ++i ) { 56 | prob_[i] = weights[i] / sum * n; 57 | if ( prob_[i] < 1.0 ) { 58 | *(++H) = i; 59 | } else { 60 | *(--L) = i; 61 | } 62 | } 63 | 64 | for ( size_t k = 0; k < n-1; k++ ) { 65 | size_t i = HL[k]; 66 | size_t j = *L; 67 | alias_[i] = j; 68 | prob_[j] += prob_[i] - 1.0; 69 | if ( prob_[j] < 1.0 ) { 70 | L++; 71 | } 72 | if ( L >= HL.end() ) { 73 | break; 74 | } 75 | } 76 | 77 | } 78 | 79 | distributions::PMF::~PMF() { } 80 | 81 | size_t distributions::PMF::sample(distributions::Random* random) const { 82 | 83 | size_t i = random->integer() % prob_.size(); 84 | 85 | return( random->uniform() < prob_[i] ? 
i : alias_[i] ); 86 | 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/distributions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DISTRIBUTIONS_HPP 2 | #define DISTRIBUTIONS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "datadefs.hpp" 9 | 10 | using namespace std; 11 | 12 | namespace distributions { 13 | 14 | typedef tr1::mt19937 Engine; 15 | // typedef std::ranlux_base_01 Engine; 16 | 17 | inline unsigned int generateSeed() { return( clock() + time(0) ); } 18 | 19 | class Random { 20 | public: 21 | 22 | // Initialize the generator 23 | Random(); 24 | Random(size_t seed); 25 | 26 | // Destructor 27 | ~Random(); 28 | 29 | void seed(size_t seed); 30 | 31 | // Return random int 32 | size_t integer(); 33 | 34 | // Generate and normalize random int 35 | datadefs::num_t uniform(); 36 | 37 | size_t minIdx() { return( 0 ); } 38 | size_t maxIdx() { return( datadefs::MAX_IDX ); } 39 | 40 | private: 41 | 42 | Engine eng_; 43 | tr1::uniform_int rand_; 44 | 45 | }; 46 | 47 | class PMF { 48 | public: 49 | 50 | PMF(const vector& weights); 51 | ~PMF(); 52 | 53 | size_t sample(Random* random) const; 54 | 55 | #ifndef TEST__ 56 | private: 57 | #endif 58 | 59 | vector prob_; 60 | vector alias_; 61 | 62 | }; 63 | 64 | } 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /src/errno.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ERRNO_HPP 2 | #define ERRNO_HPP 3 | 4 | enum ERRNO { SUCCESS, 5 | UNDEFINED, 6 | INVALID_ARGUMENT, 7 | INVALID_VALUE, 8 | NULL_POINTER_DEREFENCE, 9 | ILLEGAL_MEMORY_ACCESS, 10 | NUMERIC_OVERFLOW, 11 | NUMERIC_UNDERFLOW, 12 | PARTIAL_READ, 13 | INVALID_READ, 14 | NAN_GIVEN_FOR_SORTING }; 15 | 16 | #endif // ERRNO_HPP 17 | -------------------------------------------------------------------------------- /src/exceptions.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef EXCEPTIONS_HPP 2 | #define EXCEPTIONS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "errno.hpp" 9 | 10 | class RFACE_EXCEPTION : public exception { 11 | 12 | public: 13 | 14 | RFACE_EXCEPTION(const ERRNO& errno, const std::string& note = "") throw():errno_(errno) { 15 | stringstream ss; 16 | ss << "ERRNO (" << errno_ << "): "; 17 | switch ( errno_ ) { 18 | case ERRNO::INVALID_ARGUMENT: 19 | ss << "invalid command-line argument."; 20 | break; 21 | case ERRNO::INVALID_VALUE: 22 | ss << "invalid command-line value."; 23 | break; 24 | case ERRNO::ILLEGAL_MEMORY_ACCESS: 25 | ss << "illegal memory access."; 26 | break; 27 | default: 28 | ss << "unknown exception!"; 29 | break; 30 | } 31 | ss << " " << note; 32 | msg_ = ss.str(); 33 | } 34 | 35 | ~RFACE_EXCEPTION() throw() {} 36 | 37 | virtual const char* what() const throw() { 38 | return msg_.c_str(); 39 | } 40 | 41 | private: 42 | std::string msg_; 43 | ERRNO errno_; 44 | 45 | }; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/feature.cpp: -------------------------------------------------------------------------------- 1 | #include "feature.hpp" 2 | 3 | #include 4 | 5 | #include "utils.hpp" 6 | 7 | 8 | Feature::Feature(): 9 | type_(Feature::Type::UNKNOWN) { 10 | } 11 | 12 | Feature::Feature(Feature::Type newType, const string& newName, const size_t nSamples): 13 | type_(newType), 14 | name_(newName) { 15 | 16 | if ( type_ == Feature::Type::NUM ) { 17 | numData.resize(nSamples); 18 | catData.clear(); 19 | txtData.clear(); 20 | } else if ( type_ == Feature::Type::CAT ) { 21 | numData.clear(); 22 | catData.resize(nSamples); 23 | txtData.clear(); 24 | } else { 25 | numData.clear(); 26 | catData.clear(); 27 | txtData.resize(nSamples); 28 | } 29 | 30 | } 31 | 32 | void Feature::setNumSampleValue(const size_t sampleIdx, const num_t val) { 33 | assert( type_ 
== Feature::Type::NUM ); 34 | numData[sampleIdx] = val; 35 | } 36 | 37 | void Feature::setCatSampleValue(const size_t sampleIdx, const cat_t& val) { 38 | assert( type_ == Feature::Type::CAT ); 39 | catData[sampleIdx] = val; 40 | } 41 | 42 | void Feature::setTxtSampleValue(const size_t sampleIdx, const string& str) { 43 | assert( type_ == Feature::Type::TXT ); 44 | if ( datadefs::isNAN(str) ) { 45 | txtData[sampleIdx] = utils::hashText(""); 46 | } else { 47 | txtData[sampleIdx] = utils::hashText(str); 48 | } 49 | } 50 | 51 | cat_t Feature::getCatData(const size_t sampleIdx) const { 52 | assert(type_ == Feature::Type::CAT); 53 | return(catData[sampleIdx]); 54 | } 55 | 56 | vector Feature::getCatData() const { 57 | assert(type_ == Feature::Type::CAT); 58 | return(catData); 59 | } 60 | 61 | vector Feature::getCatData(const vector& sampleIcs) const { 62 | assert(type_ == Feature::Type::CAT); 63 | vector data(sampleIcs.size()); 64 | for ( size_t i = 0; i < sampleIcs.size(); ++i ) { 65 | data[i] = catData[sampleIcs[i]]; 66 | } 67 | return(data); 68 | } 69 | 70 | num_t Feature::getNumData(const size_t sampleIdx) const { 71 | assert(type_ == Feature::Type::NUM); 72 | return(numData[sampleIdx]); 73 | } 74 | 75 | vector Feature::getNumData() const { 76 | assert(type_ == Feature::Type::NUM); 77 | return(numData); 78 | } 79 | 80 | vector Feature::getNumData(const vector& sampleIcs) const { 81 | assert(type_ == Feature::Type::NUM); 82 | vector data(sampleIcs.size()); 83 | for ( size_t i = 0; i < sampleIcs.size(); ++i ) { 84 | data[i] = numData[sampleIcs[i]]; 85 | } 86 | return(data); 87 | } 88 | 89 | unordered_set Feature::getTxtData(const size_t sampleIdx) const { 90 | assert(type_ == Feature::Type::TXT); 91 | return(txtData[sampleIdx]); 92 | } 93 | 94 | Feature::Feature(const vector& newNumData, const string& newName): 95 | type_(Feature::Type::NUM), 96 | name_(newName) { 97 | numData = newNumData; 98 | } 99 | 100 | Feature::Feature(const vector& newCatData, const string& 
newName): 101 | type_(Feature::Type::CAT), 102 | name_(newName) { 103 | catData = newCatData; 104 | } 105 | 106 | Feature::Feature(const vector& newTxtData, const string& newName, const bool doHash): 107 | type_(Feature::Type::TXT), 108 | name_(newName) { 109 | 110 | assert(doHash); 111 | 112 | size_t nSamples = newTxtData.size(); 113 | 114 | txtData.resize(nSamples); 115 | 116 | for ( size_t i = 0; i < nSamples; ++i ) { 117 | if ( datadefs::isNAN(newTxtData[i]) ) { 118 | txtData[i] = utils::hashText(""); 119 | } else { 120 | txtData[i] = utils::hashText(newTxtData[i]); 121 | } 122 | } 123 | 124 | } 125 | 126 | Feature::~Feature() { } 127 | 128 | bool Feature::isNumerical() const { 129 | return( type_ == Feature::Type::NUM ? true : false ); 130 | } 131 | 132 | bool Feature::isCategorical() const { 133 | return( type_ == Feature::Type::CAT ? true : false ); 134 | } 135 | 136 | bool Feature::isTextual() const { 137 | return( type_ == Feature::Type::TXT ? true : false ); 138 | } 139 | 140 | bool Feature::isMissing(const size_t sampleIdx) const { 141 | switch (type_) { 142 | case NUM: 143 | return( datadefs::isNAN(numData[sampleIdx]) ); 144 | case CAT: 145 | return( datadefs::isNAN(catData[sampleIdx]) ); 146 | case TXT: 147 | return( txtData[sampleIdx].size() == 0 ); 148 | case UNKNOWN: 149 | break; 150 | } 151 | 152 | cerr << "Feature::isMissing() -- tried to use with unset feature object!" << endl; 153 | exit(1); 154 | } 155 | 156 | size_t Feature::nSamples() const { 157 | switch ( type_ ) { 158 | case NUM: 159 | return( numData.size() ); 160 | case CAT: 161 | return( catData.size() ); 162 | case TXT: 163 | return( txtData.size() ); 164 | case UNKNOWN: 165 | break; 166 | } 167 | 168 | cerr << "Feature::nSamples() -- tried to use with unset feature object!" 
<< endl; 169 | exit(1); 170 | } 171 | 172 | size_t Feature::nRealSamples() const { 173 | 174 | size_t n = 0; 175 | 176 | for ( size_t i = 0; i < this->nSamples(); ++i ) { 177 | if ( !this->isMissing(i) ) { 178 | ++n; 179 | } 180 | } 181 | 182 | return(n); 183 | 184 | } 185 | 186 | string Feature::name() const { 187 | return( name_ ); 188 | } 189 | 190 | void Feature::setName(const string& newName) { 191 | assert( newName.length() > 0 ); 192 | name_ = newName; 193 | } 194 | 195 | 196 | vector Feature::categories() const { 197 | 198 | vector categories; 199 | 200 | if( this->isNumerical() || this->isTextual() ) { 201 | return( categories ); 202 | } 203 | 204 | unordered_set categoriesSet; 205 | 206 | for ( size_t i = 0; i < catData.size(); ++i ) { 207 | if ( !this->isMissing(i) ) { 208 | categoriesSet.insert(catData[i]); 209 | } 210 | } 211 | 212 | categories.resize(categoriesSet.size()); 213 | 214 | copy(categoriesSet.begin(),categoriesSet.end(),categories.begin()); 215 | 216 | return( categories ); 217 | 218 | } 219 | 220 | 221 | uint32_t Feature::getHash(const size_t sampleIdx, const size_t integer) const { 222 | 223 | assert( type_ == Feature::Type::TXT ); 224 | 225 | size_t pos = integer % this->txtData[sampleIdx].size(); 226 | 227 | unordered_set::const_iterator it(this->txtData[sampleIdx].begin()); 228 | for ( size_t i = 0; i < pos; ++i ) { 229 | it++; 230 | } 231 | 232 | return(*it); 233 | 234 | } 235 | 236 | bool Feature::hasHash(const size_t sampleIdx, const uint32_t hashIdx) const { 237 | 238 | return( this->txtData[sampleIdx].find(hashIdx) != this->txtData[sampleIdx].end() ); 239 | 240 | } 241 | 242 | unordered_map Feature::getHashKeyFrequency() const { 243 | 244 | size_t nSamples = txtData.size(); 245 | 246 | unordered_map visitedKeys; 247 | 248 | for ( size_t i = 0; i < nSamples; ++i ) { 249 | for ( unordered_set::const_iterator it(txtData[i].begin()); it != txtData[i].end(); ++it ) { 250 | visitedKeys[*it]++; 251 | } 252 | } 253 | 254 | 
return(visitedKeys); 255 | 256 | } 257 | 258 | num_t Feature::entropy() const { 259 | 260 | size_t nSamples = txtData.size(); 261 | 262 | unordered_map visitedKeys = getHashKeyFrequency(); 263 | 264 | unordered_map::const_iterator it(visitedKeys.begin()); 265 | 266 | num_t entropy = 0.0; 267 | 268 | for ( ; it != visitedKeys.end(); ++it ) { 269 | num_t f = static_cast(it->second) / static_cast(nSamples); 270 | if ( fabs(f) > 1e-5 && fabs(1-f) > 1e-5 ) { 271 | entropy -= (f * log(f) + (1-f)*log(1-f))/log(2); 272 | } 273 | } 274 | 275 | return(entropy); 276 | 277 | } 278 | 279 | void Feature::removeFrequentHashKeys(num_t fThreshold) { 280 | 281 | size_t nSamples = txtData.size(); 282 | 283 | const unordered_map visitedKeys = this->getHashKeyFrequency(); 284 | 285 | unordered_map::const_iterator it(visitedKeys.begin()); 286 | 287 | for ( ; it != visitedKeys.end(); ++it ) { 288 | num_t f = static_cast(it->second) / static_cast(nSamples); 289 | if ( f > fThreshold ) { 290 | for ( size_t sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { 291 | uint32_t hashKey = it->first; 292 | if ( txtData[sampleIdx].find(hashKey) != txtData[sampleIdx].end() ) { 293 | txtData[sampleIdx].erase(it->first); 294 | } 295 | } 296 | } 297 | } 298 | 299 | } 300 | -------------------------------------------------------------------------------- /src/feature.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FEATURE_HPP 2 | #define FEATURE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "datadefs.hpp" 11 | 12 | using namespace std; 13 | using datadefs::num_t; 14 | using datadefs::cat_t; 15 | 16 | class Feature { 17 | public: 18 | 19 | enum Type { NUM, CAT, TXT, UNKNOWN }; 20 | 21 | vector numData; 22 | vector catData; 23 | vector > txtData; 24 | 25 | Feature(); 26 | Feature(Type newType, const string& newName, const size_t nSamples); 27 | Feature(const vector& newNumData, const string& newName); 28 | 
Feature(const vector& newCatData, const string& newName); 29 | Feature(const vector& newTxtData, const string& newName, const bool doHash); 30 | ~Feature(); 31 | 32 | void setNumSampleValue(const size_t sampleIdx, const num_t val); 33 | void setCatSampleValue(const size_t sampleIdx, const cat_t& val); 34 | void setTxtSampleValue(const size_t sampleIdx, const string& str); 35 | 36 | num_t getNumData(const size_t sampleIdx) const; 37 | vector getNumData() const; 38 | vector getNumData(const vector& sampleIcs) const; 39 | 40 | cat_t getCatData(const size_t sampleIdx) const; 41 | vector getCatData() const; 42 | vector getCatData(const vector& sampleIcs) const; 43 | 44 | unordered_set getTxtData(const size_t sampleIdx) const; 45 | 46 | bool isNumerical() const; 47 | bool isCategorical() const; 48 | bool isTextual() const; 49 | 50 | bool isMissing(const size_t sampleIdx) const; 51 | 52 | size_t nSamples() const; 53 | size_t nRealSamples() const; 54 | 55 | string name() const; 56 | void setName(const string& newName); 57 | 58 | vector categories() const; 59 | 60 | uint32_t getHash(const size_t sampleIdx, const size_t integer) const; 61 | bool hasHash(const size_t sampleIdx, const uint32_t hashIdx) const; 62 | 63 | num_t entropy() const; 64 | 65 | unordered_map getHashKeyFrequency() const; 66 | 67 | void removeFrequentHashKeys(const num_t fThreshold); 68 | 69 | #ifndef TEST__ 70 | private: 71 | #endif 72 | 73 | Type type_; 74 | string name_; 75 | 76 | }; 77 | 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /src/math.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | //#include "gamma.hpp" 4 | #include "math.hpp" 5 | 6 | void math::transformLogistic(size_t nCategories, 7 | vector& prediction, 8 | vector& probability) { 9 | 10 | //size_t nCategories = trainData_->nCategories(); 11 | 12 | // Multiclass logistic transform of class probabilities from current 
probability estimates. 13 | assert(nCategories == prediction.size()); 14 | vector& expPrediction = probability; // just using the space by a different name 15 | 16 | // find maximum prediction 17 | vector::iterator maxPrediction = max_element(prediction.begin(),prediction.end()); 18 | // scale by maximum to prevent numerical errors 19 | 20 | num_t expSum = 0.0; 21 | size_t k; 22 | for (k = 0; k < nCategories; ++k) { 23 | expPrediction[k] = exp(prediction[k] - *maxPrediction); // scale by maximum 24 | expSum += expPrediction[k]; 25 | } 26 | for (k = 0; k < nCategories; ++k) { 27 | probability[k] = expPrediction[k] / expSum; 28 | } 29 | } 30 | 31 | 32 | void math::adjustPValues(vector& pValues, const size_t nTests) { 33 | 34 | num_t previousPValue = 0.0; 35 | 36 | for ( size_t i = 0; i < pValues.size(); ++i ) { 37 | 38 | pValues[i] *= nTests / ( i + 1 ); 39 | 40 | if ( pValues[i] > 1.0 ) { 41 | pValues[i] = 1.0; 42 | } 43 | 44 | if ( pValues[i] < previousPValue ) { 45 | pValues[i] = previousPValue; 46 | } else { 47 | previousPValue = pValues[i]; 48 | } 49 | 50 | } 51 | 52 | } 53 | 54 | 55 | /** 56 | Two-sample t-test 57 | */ 58 | num_t math::ttest(const vector& x, 59 | const vector& y, 60 | const bool WS) { 61 | 62 | // Sample mean and variance of x 63 | num_t mean_x = math::mean(x); 64 | num_t var_x = math::var(x,mean_x); 65 | size_t n_x = x.size(); 66 | 67 | // If sample size is too small, we exit 68 | if ( n_x < 2 ) { 69 | return( datadefs::NUM_NAN ); 70 | } 71 | 72 | // Sample mean and variance of y 73 | num_t mean_y = math::mean(y); 74 | num_t var_y = math::var(y,mean_y); 75 | size_t n_y = y.size(); 76 | 77 | // If sample size is too small, we exit 78 | if ( n_y < 2 ) { 79 | return( datadefs::NUM_NAN ); 80 | } 81 | 82 | // Degrees of freedom 83 | num_t v; 84 | 85 | // Standard deviation 86 | num_t s; 87 | 88 | if ( !WS ) { 89 | v = static_cast( n_x + n_y - 2 ); 90 | num_t sp = sqrt(((n_x-1) * var_x + (n_y-1) * var_y) / v); 91 | s = sp * sqrt(1.0 / n_x + 1.0 / 
// NOTE(review): comments only; template/cast arguments reconstructed where
// extraction stripped them (e.g. static_cast<num_t>, vector<num_t>).
                n_y);
  } else {
    // Welch's t: unpooled variances, Welch-Satterthwaite degrees of freedom.
    num_t h1 = pow(var_x / n_x + var_y / n_y,2);
    num_t h2 = pow( var_x / n_x, 2) / (n_x - 1) + pow(var_y/n_y,2)/(n_y-1);
    v = h1 / h2 ;
    s = sqrt( var_x / n_x + var_y / n_y );
  }

  // If pooled standard deviation is zero...
  if ( fabs(s) < datadefs::EPS ) {
    if ( mean_x > mean_y ) {
      return( datadefs::EPS ); // ... and x larger than y => p = EPS
    } else if ( fabs( mean_x - mean_y ) < datadefs::EPS ) {
      return( 0.5 ); // ... and x and y almost equal => p = 0.5
    } else {
      return( 1.0 ); // ... and x smaller than y => p = 1.0
    }
  }

  // T-test statistic
  num_t tvalue = (mean_x - mean_y) / s;

  // Transformed t-test statistic
  num_t ttrans = v / ( pow(tvalue,2) + v );

  // This variable will store the integral of the tail of the t-distribution
  num_t integral;

  // When ttrans > 0.9, we need to recast the integration in order to retain
  // accuracy. In other words we make use of the following identity:
  //
  // I(x,a,b) = 1 - I(1-x,b,a)
  if ( ttrans > 0.9 ) {

    // Calculate I(x,a,b) as 1 - I(1-x,b,a)
    integral = 1 - math::regularizedIncompleteBeta(1 - ttrans, 0.5, v/2);

  } else {

    // Calculate I(x,a,b) directly
    integral = math::regularizedIncompleteBeta(ttrans, v/2, 0.5);
  }

  // We need to be careful about which way to calculate the integral so that it represents
  // the tail of the t-distribution. The sign of the tvalue hints which way to integrate
  if ( tvalue > 0.0 ) {
    return( integral / 2 );
  } else {
    return( 1 - integral / 2 );
  }

}

/**
   Odd factors for the infinite continued fraction representation of the
   regularized incomplete beta function
*/
num_t dO(const num_t m,
	 const num_t x,
	 const num_t a,
	 const num_t b) {
  return( -1.0*(a+m)*(a+b+m)*x / ( (a+2*m)*(a+2*m+1) ) );
}

/**
   Even factors for the infinite continued fraction representation of the
   regularized incomplete beta function
*/
num_t dE(const num_t m,
	 const num_t x,
	 const num_t a,
	 const num_t b) {
  return( m*(b-m)*x / ((a+2*m-1)*(a+2*m)) );
}

/**
   Beta function, implemented as function of log-gamma functions implemented
   in "gamma.hpp"
*/
num_t beta(const num_t a, const num_t b) {
  return( exp( lgamma(a) + lgamma(b) - lgamma(a+b) ) );
  // return( exp( LogGamma(a) + LogGamma(b) - LogGamma(a+b) ) );
}

// http://en.wikipedia.org/wiki/Beta_function
// http://en.wikipedia.org/wiki/Student's_t-distribution
// http://www.boost.org/doc/libs/1_38_0/libs/math/doc/sf_and_dist/html/math_toolkit/special/sf_beta/ibeta_function.html
// http://www.mpi-hd.mpg.de/astrophysik/HEA/internal/Numerical_Recipes/f6-4.pdf
num_t math::regularizedIncompleteBeta(const num_t x,
				      const num_t a,
				      const num_t b) {

  // Number of factors in the infinite continued fraction representation
  size_t i = 50;

  num_t continuedFraction = 1;

  // Accumulate the continued fraction bottom-up (backward recurrence) --
  // the numerically stable direction for continued fractions.
  while ( i >= 1 ) {
    num_t m = static_cast<num_t>(i);
    continuedFraction = 1 + dE(m,x,a,b) / ( 1 + dO(m,x,a,b) / continuedFraction );
    --i;
  }

  return( pow(x,a)*pow(1-x,b) / ( a * beta(a,b) * ( 1 + dO(0,x,a,b) / continuedFraction ) ) );

}

// Closed-form approximation of the error function; datadefs::A is the fitted
// constant of the approximation.
num_t math::erf(const num_t x) {

  num_t x2 = x*x;

  // erf is odd: remember the sign, work with |x|^2.
  num_t sgn;
  if(x < 0.0) {
    sgn = -1.0;
  } else {
    sgn = 1.0;
  }

  return( sgn*sqrt(1.0 - exp(-x2*(4.0/datadefs::NUM_PI + datadefs::A*x2) / (1+datadefs::A*x2))) );

}

// Sample Pearson correlation coefficient; NUM_NAN for empty input.
// NOTE(review): divides by zero if either vector has zero variance -- confirm
// callers guard against constant inputs.
num_t math::pearsonCorrelation(const vector<num_t>& x,
			       const vector<num_t>& y) {

  assert( x.size() == y.size() );

  size_t n = x.size();

  if ( n == 0 ) {
    return( datadefs::NUM_NAN );
  }

  num_t corr = 0.0;

  // var() is the unbiased estimate, so var*(n-1) recovers the sum of squared
  // deviations needed for the normalization.
  num_t mu_x = math::mean(x);
  num_t se_x = math::var(x,mu_x) * (n - 1);
  num_t mu_y = math::mean(y);
  num_t se_y = math::var(y,mu_y) * (n - 1);

  for(size_t i = 0; i < n; ++i) {
    corr += ( x[i] - mu_x ) * ( y[i] - mu_y );
  }

  return( corr / sqrt(se_x*se_y) );

}

// "Gamma" line-search multiplier over x for a target with nCategories
// classes; saturates when the denominator vanishes.
num_t math::gamma(const vector<num_t>& x, const size_t nCategories) {

  size_t n = x.size();
  assert( n > 0 );
  assert( nCategories > 0 );

  num_t numerator = 0.0;
  num_t denominator = 0.0;

  for (size_t i = 0; i < n; ++i) {
    num_t abs_data_i = fabs( x[i] );
    denominator += abs_data_i * (1.0 - abs_data_i);
    numerator += x[i];
  }

  // Vanishing denominator: return a saturated value carrying the sign of the
  // numerator instead of dividing by ~0.
  if ( fabs(denominator) <= datadefs::EPS ) {
    return( datadefs::LOG_OF_MAX_NUM * numerator );
  } else {
    return( (numerator*(nCategories - 1)) / (denominator*nCategories) );
  }

}


// Root-mean-square error between two equal-length vectors; NUM_NAN for empty
// input.
num_t math::numericalError(const vector<num_t>& x, const vector<num_t>& y) {

  assert( x.size() == y.size() );

  size_t n = x.size();

  if ( n == 0 ) {
    return(datadefs::NUM_NAN);
  }

  num_t ret = 0.0;

  for ( size_t i = 0; i < n; ++i ) {
    ret += pow( x[i] - y[i], 2 ) / n;
  }

  return( sqrt(ret) );

}


// Unbiased sample variance; delegates to the two-argument overload.
num_t math::var(const vector<num_t>& x) {

  num_t mu =
math::mean(x); 288 | 289 | return( math::var(x,mu) ); 290 | 291 | } 292 | 293 | num_t math::var(const vector& x, const num_t& mu) { 294 | 295 | if ( x.size() < 2 ) { 296 | return( datadefs::NUM_NAN ); 297 | } 298 | 299 | size_t n = x.size(); 300 | 301 | num_t ret = 0.0; 302 | 303 | for(size_t i = 0; i < n; ++i) { 304 | ret += pow(x[i] - mu,2) / ( n - 1 ); 305 | } 306 | 307 | return( ret ); 308 | 309 | } 310 | 311 | -------------------------------------------------------------------------------- /src/math.hpp: -------------------------------------------------------------------------------- 1 | #ifndef MATH_HPP 2 | #define MATH_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "datadefs.hpp" 10 | #include "errno.hpp" 11 | 12 | 13 | using namespace std; 14 | using datadefs::num_t; 15 | 16 | 17 | namespace math { 18 | 19 | /** 20 | Returns the p'th percentile of the data vector x 21 | */ 22 | template 23 | T percentile(vector x, const num_t p) { 24 | 25 | // If the data vector has length 0, return 26 | if ( x.size() == 0 ) { 27 | cerr << "math::percentile() -- cannot compute with vector of length 0!" 
<< endl; 28 | exit(1); 29 | } 30 | 31 | T prc; 32 | 33 | // Sort data to increasing order 34 | sort(x.begin(),x.end()); 35 | 36 | // Exact index without rounding 37 | T k = ( x.size() - 1 ) * p; 38 | 39 | // Lower bound of the index 40 | T f = floor(k); 41 | 42 | // Upper bound of the index 43 | T c = ceil(k); 44 | 45 | // If the upper and lower bounds are equal, 46 | // we can calculate the percentile directly 47 | // by the index k 48 | if(fabs(f - c) < datadefs::EPS) { 49 | prc = x[static_cast(k)]; 50 | } else { 51 | 52 | // Otherwise we will interpolate linearly based on the 53 | // distances from the intermediate point (k) to both 54 | // bounds: ceil->k and k->floor 55 | T d0 = x[static_cast(f)] * (c - k); 56 | T d1 = x[static_cast(c)] * (k - f); 57 | 58 | // This operation equals to the weighted average, 59 | // which in other words is the interpolated percentile 60 | // we were after 61 | prc = d0 + d1; 62 | } 63 | 64 | // Finally return the calculated percentile 65 | return( prc ); 66 | 67 | } 68 | 69 | void transformLogistic(size_t nCategories, vector& prediction, vector& probability); 70 | 71 | /** 72 | Error function 73 | NOTE: see http://en.wikipedia.org/wiki/Error_function 74 | */ 75 | num_t erf(num_t x); 76 | 77 | void adjustPValues(vector& pValues, const size_t nTests); 78 | 79 | /** 80 | Two-sample t-test 81 | NOTE: see http://en.wikipedia.org/wiki/Student's_t-test 82 | */ 83 | num_t ttest(const vector& x, 84 | const vector& y, 85 | const bool WS = false); 86 | 87 | /** 88 | Regularized incomplete Beta function 89 | NOTE: see http://en.wikipedia.org/wiki/Beta_function 90 | */ 91 | num_t regularizedIncompleteBeta(const num_t x, 92 | const num_t a, 93 | const num_t b); 94 | 95 | 96 | 97 | num_t pearsonCorrelation(const vector& x, 98 | const vector& y); 99 | 100 | inline num_t mean(const vector& x) { 101 | 102 | if ( x.size() == 0 ) { 103 | return( datadefs::NUM_NAN ); 104 | } 105 | 106 | num_t mu = 0.0; 107 | 108 | for(size_t i = 0; i < x.size(); 
++i) { 109 | mu += x[i]; 110 | } 111 | 112 | return( mu / x.size() ); 113 | 114 | } 115 | 116 | template 117 | unordered_map frequency(const vector& x) { 118 | unordered_map freq; 119 | for(size_t i = 0; i < x.size(); ++i) { 120 | if( freq.find(x[i]) == freq.end() ) { 121 | freq[ x[i] ] = 1; 122 | } else { 123 | ++freq[ x[i] ]; 124 | } 125 | } 126 | return( freq ); 127 | } 128 | 129 | template 130 | T mode(const vector& x) { 131 | unordered_map freq = frequency(x); 132 | typename unordered_map::const_iterator maxElement( freq.begin() ); 133 | for ( typename unordered_map::const_iterator it(freq.begin()); it != freq.end(); ++it ) { 134 | if ( it->second > maxElement->second ) { 135 | maxElement = it; 136 | } 137 | } 138 | return( maxElement->first ); 139 | } 140 | 141 | template 142 | size_t nMismatches(const vector& x, const T& y) { 143 | size_t count = 0; 144 | for ( size_t i = 0; i < x.size(); ++i ) { 145 | if ( x[i] != y ) { 146 | ++count; 147 | } 148 | } 149 | return( count ); 150 | } 151 | 152 | template 153 | map > confusionMap(const vector& x, const vector& y) { 154 | 155 | assert(x.size() == y.size()); 156 | 157 | map > cMap; 158 | 159 | set allClasses; 160 | 161 | for ( size_t i = 0; i < x.size(); ++i ) { 162 | T a = x[i]; 163 | T b = y[i]; 164 | allClasses.insert(a); 165 | allClasses.insert(b); 166 | if ( cMap[a].find(b) == cMap[a].end() ) { 167 | cMap[a][b] = 1; 168 | } else { 169 | ++cMap[a][b]; 170 | } 171 | } 172 | 173 | } 174 | 175 | template 176 | num_t categoricalError(const vector& x, const vector& y) { 177 | 178 | assert( x.size() == y.size() ); 179 | 180 | size_t n = x.size(); 181 | 182 | if ( n == 0 ) { 183 | return(datadefs::NUM_NAN); 184 | } 185 | 186 | num_t ret = 0.0; 187 | 188 | for ( size_t i = 0; i < n; ++i ) { 189 | ret += static_cast( x[i] != y[i] ) / n; 190 | } 191 | 192 | return( ret ); 193 | 194 | } 195 | 196 | num_t numericalError(const vector& x, const vector& y); 197 | 198 | 199 | num_t gamma(const vector& x, const size_t 
nCategories); 200 | 201 | //num_t squaredError(const vector& x); 202 | 203 | //num_t squaredError(const vector& x, const num_t mu); 204 | 205 | // Unbiased variance estimate: 1/(n-1)*sum(y-y_i)^2 206 | num_t var(const vector& x); 207 | 208 | num_t var(const vector& x, const num_t& mu); 209 | 210 | /** 211 | Updates the squared frequency by ADDING x_n to the set 212 | NOTE: NANs will corrupt the data 213 | */ 214 | template 215 | inline void incrementSquaredFrequency(const T& x_n, 216 | unordered_map& freq, 217 | size_t& sqFreq) { 218 | 219 | 220 | // Check if the value already exists in the frequency map 221 | typename unordered_map::iterator it(freq.find(x_n)); 222 | if(it == freq.end()) { 223 | 224 | // If not, squared frequency becomes updated by 1 225 | sqFreq += 1; 226 | freq[x_n] = 1; 227 | 228 | } else { 229 | 230 | // Otherwise the squared frequency becomes updated by 231 | // 2*freq + 1 232 | sqFreq += 2*freq[x_n] + 1; 233 | it->second++; //freq[x_n]; 234 | 235 | } 236 | } 237 | 238 | /** 239 | Updates the squared frequency by REMOVING x_n 240 | from the set 241 | NOTE: NANs will corrupt the data 242 | */ 243 | template 244 | inline void decrementSquaredFrequency(const T& x_n, 245 | unordered_map& freq, 246 | size_t& sqFreq) { 247 | 248 | assert( freq.find(x_n) != freq.end() ); 249 | assert( freq[x_n] > 0); 250 | 251 | sqFreq -= 2*freq[x_n] - 1; 252 | --freq[x_n]; 253 | 254 | if(freq[x_n] == 0) { 255 | freq.erase(x_n); 256 | } 257 | } 258 | 259 | // Calculates decrease in impurity for a numerical target 260 | inline num_t deltaImpurity_regr(const num_t mu_tot, 261 | const size_t n_tot, 262 | const num_t mu_left, 263 | const size_t n_left, 264 | const num_t mu_right, 265 | const size_t n_right) { 266 | 267 | return( - mu_tot * mu_tot 268 | + mu_left * mu_left * n_left / n_tot 269 | + mu_right * mu_right * n_right / n_tot ); 270 | 271 | } 272 | 273 | inline num_t deltaImpurity_class(const size_t sf_tot, 274 | const size_t n_tot, 275 | const size_t sf_left, 
276 | const size_t n_left, 277 | const size_t sf_right, 278 | const size_t n_right) { 279 | 280 | //cout << - 1.0 * sf_tot / ( 1.0 * n_tot * n_tot ) << " + " << 1.0 * sf_left / ( 1.0 * n_tot * n_left ) << " + " << 1.0 * sf_right / ( 1.0 * n_tot * n_right ) << endl; 281 | 282 | return( - 1.0 * sf_tot / ( n_tot * n_tot ) 283 | + 1.0 * sf_left / ( n_tot * n_left ) 284 | + 1.0 * sf_right / ( n_tot * n_right ) ); 285 | 286 | } 287 | 288 | template 289 | inline void setUnion(set& baseSet, const set& newSet) { 290 | 291 | for ( typename set::const_iterator it( newSet.begin() ); it != newSet.end(); ++it ) { 292 | baseSet.insert(*it); 293 | } 294 | 295 | } 296 | 297 | 298 | } 299 | 300 | #endif 301 | -------------------------------------------------------------------------------- /src/mtrand.h: -------------------------------------------------------------------------------- 1 | // mtrand.h 2 | // C++ include file for MT19937, with initialization improved 2002/1/26. 3 | // Coded by Takuji Nishimura and Makoto Matsumoto. 4 | // Ported to C++ by Jasper Bedaux 2003/1/1 (see http://www.bedaux.net/mtrand/). 5 | // The generators returning floating point numbers are based on 6 | // a version by Isaku Wada, 2002/01/09 7 | // 8 | // Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | // All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions 13 | // are met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. 
The names of its contributors may not be used to endorse or promote 23 | // products derived from this software without specific prior written 24 | // permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Any feedback is very welcome. 39 | // http://www.math.keio.ac.jp/matumoto/emt.html 40 | // email: matumoto@math.keio.ac.jp 41 | // 42 | // Feedback about the C++ port should be sent to Jasper Bedaux, 43 | // see http://www.bedaux.net/mtrand/ for e-mail address and info. 
// NOTE(review): comments only; the <double> template arguments of the
// static_casts below were lost in extraction and are reconstructed.

#ifndef MTRAND_H
#define MTRAND_H

// Mersenne Twister (MT19937) base generator producing 32-bit integers.
// The state is *static*, so every instance in the process shares one stream;
// copying and assignment are deliberately disabled.
class MTRand_int32 { // Mersenne Twister random number generator
public:
  // default constructor: uses default seed only if this is the first instance
  MTRand_int32() { if (!init) seed(5489UL); init = true; }
  // constructor with 32 bit int as seed
  MTRand_int32(unsigned long s) { seed(s); init = true; }
  // constructor with array of size 32 bit ints as seed
  MTRand_int32(const unsigned long* array, int size) { seed(array, size); init = true; }
  // the two seed functions
  void seed(unsigned long); // seed with 32 bit integer
  void seed(const unsigned long*, int size); // seed with array
  // overload operator() to make this a generator (functor)
  unsigned long operator()() { return rand_int32(); }
  // 2007-02-11: made the destructor virtual; thanks "double more" for pointing this out
  virtual ~MTRand_int32() {} // destructor
protected: // used by derived classes, otherwise not accessible; use the ()-operator
  unsigned long rand_int32(); // generate 32 bit random integer
private:
  static const int n = 624, m = 397; // compile time constants
  // the variables below are static (no duplicates can exist)
  static unsigned long state[n]; // state vector array
  static int p; // position in state array
  static bool init; // true if init function is called
  // private functions used to generate the pseudo random numbers
  unsigned long twiddle(unsigned long, unsigned long); // used by gen_state()
  void gen_state(); // generate new state
  // make copy constructor and assignment operator unavailable, they don't make sense
  MTRand_int32(const MTRand_int32&); // copy constructor not defined
  void operator=(const MTRand_int32&); // assignment operator not defined
};

// inline for speed, must therefore reside in header file
inline unsigned long MTRand_int32::twiddle(unsigned long u, unsigned long v) {
  return (((u & 0x80000000UL) | (v & 0x7FFFFFFFUL)) >> 1)
    ^ ((v & 1UL) ? 0x9908B0DFUL : 0x0UL);
}

inline unsigned long MTRand_int32::rand_int32() { // generate 32 bit random int
  if (p == n) gen_state(); // new state vector needed
  // gen_state() is split off to be non-inline, because it is only called once
  // in every 624 calls and otherwise irand() would become too big to get inlined
  unsigned long x = state[p++];
  // MT19937 tempering transform.
  x ^= (x >> 11);
  x ^= (x << 7) & 0x9D2C5680UL;
  x ^= (x << 15) & 0xEFC60000UL;
  return x ^ (x >> 18);
}

// generates double floating point numbers in the half-open interval [0, 1)
class MTRand : public MTRand_int32 {
public:
  MTRand() : MTRand_int32() {}
  MTRand(unsigned long seed) : MTRand_int32(seed) {}
  MTRand(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand() {}
  double operator()() {
    return static_cast<double>(rand_int32()) * (1. / 4294967296.); } // divided by 2^32
private:
  MTRand(const MTRand&); // copy constructor not defined
  void operator=(const MTRand&); // assignment operator not defined
};

// generates double floating point numbers in the closed interval [0, 1]
class MTRand_closed : public MTRand_int32 {
public:
  MTRand_closed() : MTRand_int32() {}
  MTRand_closed(unsigned long seed) : MTRand_int32(seed) {}
  MTRand_closed(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand_closed() {}
  double operator()() {
    return static_cast<double>(rand_int32()) * (1. / 4294967295.); } // divided by 2^32 - 1
private:
  MTRand_closed(const MTRand_closed&); // copy constructor not defined
  void operator=(const MTRand_closed&); // assignment operator not defined
};

// generates double floating point numbers in the open interval (0, 1)
class MTRand_open : public MTRand_int32 {
public:
  MTRand_open() : MTRand_int32() {}
  MTRand_open(unsigned long seed) : MTRand_int32(seed) {}
  MTRand_open(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand_open() {}
  double operator()() {
    // +.5 shifts the lattice so 0 is never produced.
    return (static_cast<double>(rand_int32()) + .5) * (1. / 4294967296.); } // divided by 2^32
private:
  MTRand_open(const MTRand_open&); // copy constructor not defined
  void operator=(const MTRand_open&); // assignment operator not defined
};

// generates 53 bit resolution doubles in the half-open interval [0, 1)
class MTRand53 : public MTRand_int32 {
public:
  MTRand53() : MTRand_int32() {}
  MTRand53(unsigned long seed) : MTRand_int32(seed) {}
  MTRand53(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand53() {}
  double operator()() {
    // Combine the top 27 and 26 bits of two draws into a 53-bit mantissa.
    return (static_cast<double>(rand_int32() >> 5) * 67108864. +
	    static_cast<double>(rand_int32() >> 6)) * (1. / 9007199254740992.); }
private:
  MTRand53(const MTRand53&); // copy constructor not defined
  void operator=(const MTRand53&); // assignment operator not defined
};

#endif // MTRAND_H

// ---- src/murmurhash3.cpp ----
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// NOTE(review): comments only; the header name in the MSVC branch below was
// lost in extraction and <stdlib.h> (which provides _rotl/_rotl64) is
// reconstructed from the reference implementation.

// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.

#include "murmurhash3.hpp"

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

#define FORCE_INLINE __forceinline

#include <stdlib.h>

#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else // defined(_MSC_VER)

#define FORCE_INLINE inline __attribute__((always_inline))

inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << r) | (x >> (32 - r));
}

inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  return (x << r) | (x >> (64 - r));
}

#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
  return p[i];
}

FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
  return p[i];
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

FORCE_INLINE uint32_t fmix ( uint32_t h )
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

//----------

FORCE_INLINE uint64_t fmix ( uint64_t k )
{
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  k ^= k >> 33;

  return k;
}

//-----------------------------------------------------------------------------

// 32-bit hash of 'len' bytes at 'key'; result written through 'out'.
void MurmurHash3_x86_32 ( const void * key, int len,
			  uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 4;

  uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body: process 4-byte blocks, iterating backwards from the block tail

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i);

    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail: the remaining 0-3 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16;    // fall through
  case 2: k1 ^= tail[1] << 8;     // fall through
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len;

  h1 = fmix(h1);

  *(uint32_t*)out = h1;
}

//-----------------------------------------------------------------------------

// 128-bit hash optimized for 32-bit platforms; writes four uint32s to 'out'.
void MurmurHash3_x86_128 ( const void * key, const int len,
			   uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  const uint32_t c1 = 0x239b961b;
  const uint32_t c2 = 0xab0e9789;
  const uint32_t c3 = 0x38b34ae5;
  const uint32_t c4 = 0xa1e38b93;

  //----------
  // body: four interleaved 32-bit lanes per 16-byte block

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i*4+0);
    uint32_t k2 = getblock(blocks,i*4+1);
    uint32_t k3 = getblock(blocks,i*4+2);
    uint32_t k4 = getblock(blocks,i*4+3);

    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;

    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }

  //----------
  // tail: remaining 0-15 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;

  switch(len & 15)
  {
  case 15: k4 ^= tail[14] << 16;  // fall through
  case 14: k4 ^= tail[13] << 8;   // fall through
  case 13: k4 ^= tail[12] << 0;
           k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
	   // fall through
  case 12: k3 ^= tail[11] << 24;  // fall through
  case 11: k3 ^= tail[10] << 16;  // fall through
  case 10: k3 ^= tail[ 9] << 8;   // fall through
  case  9: k3 ^= tail[ 8] << 0;
           k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
	   // fall through
  case  8: k2 ^= tail[ 7] << 24;  // fall through
  case  7: k2 ^= tail[ 6] << 16;  // fall through
  case  6: k2 ^= tail[ 5] << 8;   // fall through
  case  5: k2 ^= tail[ 4] << 0;
           k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
	   // fall through
  case  4: k1 ^= tail[ 3] << 24;  // fall through
  case  3: k1 ^= tail[ 2] << 16;  // fall through
  case  2: k1 ^= tail[ 1] << 8;   // fall through
  case  1: k1 ^= tail[ 0] << 0;
           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  h1 = fmix(h1);
  h2 = fmix(h2);
  h3 = fmix(h3);
  h4 = fmix(h4);

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  ((uint32_t*)out)[0] = h1;
  ((uint32_t*)out)[1] = h2;
  ((uint32_t*)out)[2] = h3;
  ((uint32_t*)out)[3] = h4;
}

//-----------------------------------------------------------------------------

// 128-bit hash optimized for 64-bit platforms (two 64-bit lanes).
void MurmurHash3_x64_128 ( const void * key, const int len,
			   const uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint64_t h1 = seed;
  uint64_t h2 = seed;

  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

  //----------
  // body

  const uint64_t * blocks = (const uint64_t *)(data);

  for(int i = 0; i < nblocks; i++)
  {
    uint64_t k1 = getblock(blocks,i*2+0);
    uint64_t k2 = getblock(blocks,i*2+1);

    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;

    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
  }

  //----------
  // tail: remaining 0-15 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  switch(len & 15)
  {
  case 15: k2 ^= uint64_t(tail[14]) << 48;  // fall through
  case 14: k2 ^= uint64_t(tail[13]) << 40;  // fall through
  case 13: k2 ^= uint64_t(tail[12]) << 32;  // fall through
  case 12: k2 ^= uint64_t(tail[11]) << 24;  // fall through
  case 11: k2 ^= uint64_t(tail[10]) << 16;  // fall through
  case 10: k2 ^= uint64_t(tail[ 9]) << 8;   // fall through
  case  9: k2 ^= uint64_t(tail[ 8]) << 0;
           k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
304 | 305 | case 8: k1 ^= uint64_t(tail[ 7]) << 56; 306 | case 7: k1 ^= uint64_t(tail[ 6]) << 48; 307 | case 6: k1 ^= uint64_t(tail[ 5]) << 40; 308 | case 5: k1 ^= uint64_t(tail[ 4]) << 32; 309 | case 4: k1 ^= uint64_t(tail[ 3]) << 24; 310 | case 3: k1 ^= uint64_t(tail[ 2]) << 16; 311 | case 2: k1 ^= uint64_t(tail[ 1]) << 8; 312 | case 1: k1 ^= uint64_t(tail[ 0]) << 0; 313 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 314 | }; 315 | 316 | //---------- 317 | // finalization 318 | 319 | h1 ^= len; h2 ^= len; 320 | 321 | h1 += h2; 322 | h2 += h1; 323 | 324 | h1 = fmix(h1); 325 | h2 = fmix(h2); 326 | 327 | h1 += h2; 328 | h2 += h1; 329 | 330 | ((uint64_t*)out)[0] = h1; 331 | ((uint64_t*)out)[1] = h2; 332 | } 333 | 334 | //----------------------------------------------------------------------------- 335 | 336 | -------------------------------------------------------------------------------- /src/murmurhash3.hpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 
4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned long uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | 29 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 30 | 31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 32 | 33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | //----------------------------------------------------------------------------- 36 | 37 | #endif // _MURMURHASH3_H_ 38 | -------------------------------------------------------------------------------- /src/node.hpp: -------------------------------------------------------------------------------- 1 | //node.hpp 2 | // 3 | //A node class for CARTs 4 | 5 | #ifndef NODE_HPP 6 | #define NODE_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "datadefs.hpp" 15 | #include "treedata.hpp" 16 | #include "options.hpp" 17 | #include "utils.hpp" 18 | #include "distributions.hpp" 19 | 20 | using namespace std; 21 | using datadefs::num_t; 22 | 23 | class Node { 24 | public: 25 | 26 | struct Prediction { 27 | 28 | Feature::Type type; 29 | num_t numTrainPrediction; 30 | cat_t catTrainPrediction; 31 | vector numTrainData; 32 | vector catTrainData; 33 | }; 34 | 35 | struct Splitter { 36 | 37 | num_t fitness; 38 | string name; 39 | Feature::Type type; 40 | uint32_t hashValue; 41 | num_t leftLeqValue; 42 | unordered_set leftValues; 43 | 44 | Splitter(): fitness(0.0), 
name(""), type(Feature::Type::UNKNOWN) {} 45 | 46 | }; 47 | 48 | //Initializes node. 49 | Node(); 50 | ~Node(); 51 | 52 | //Gets the splitter for the node 53 | const string& splitterName() const { return( splitter_.name ); } 54 | 55 | //Sets a splitter feature for the node. 56 | //NOTE: splitter can be assigned only once! Subsequent setter calls will raise an assertion failure. 57 | void setSplitter(const num_t splitFitness, 58 | const string& splitterName, 59 | const num_t splitLeftLeqValue, 60 | Node& leftChild, 61 | Node& rightChild); 62 | 63 | void setSplitter(const num_t splitFitness, 64 | const string& splitterName, 65 | const unordered_set& leftSplitValues, 66 | Node& leftChild, 67 | Node& rightChild); 68 | 69 | void setSplitter(const num_t splitFitness, 70 | const string& splitterName, 71 | const uint32_t hashIdx, 72 | Node& leftChild, 73 | Node& rightChild); 74 | 75 | void setMissingChild(Node& missingChild); 76 | 77 | //Given a value, descends to either one of the child nodes, if existing, otherwise returns a pointer to the current node 78 | Node* percolate(TreeData* testData, const size_t sampleIdx, const size_t scrambleFeatureIdx = datadefs::MAX_IDX); 79 | 80 | void setNumTrainPrediction(const num_t& numTrainPrediction); 81 | void setCatTrainPrediction(const cat_t& catTrainPrediction); 82 | 83 | //Logic test whether the node has children or not 84 | inline bool hasChildren() const { return( this->leftChild() || this->rightChild() ); } 85 | 86 | Node* leftChild() const; 87 | Node* rightChild() const; 88 | Node* missingChild() const; 89 | 90 | vector getSubTreeLeaves(); 91 | 92 | void setNumTrainData(const vector& numTrainData); 93 | void setCatTrainData(const vector& catTrainData); 94 | 95 | const Prediction& getPrediction(); 96 | 97 | const Splitter& getSplitter(); 98 | 99 | void recursiveWriteTree(string& traversal, ofstream& toFile); 100 | 101 | enum PredictionFunctionType { MEAN, MODE, GAMMA }; 102 | 103 | #ifndef TEST__ 104 | protected: 105 | #endif 
106 | 107 | struct SplitCache { 108 | 109 | size_t nSamples; 110 | vector featureSampleIcs; 111 | 112 | vector sampleIcs_left; 113 | vector sampleIcs_right; 114 | vector sampleIcs_missing; 115 | uint32_t hashIdx; 116 | size_t splitFeatureIdx; 117 | num_t splitValue; 118 | unordered_set splitValues_left; 119 | num_t splitFitness; 120 | 121 | vector newSampleIcs_left; 122 | vector newSampleIcs_right; 123 | vector newSampleIcs_missing; 124 | uint32_t newHashIdx; 125 | size_t newSplitFeatureIdx; 126 | num_t newSplitValue; 127 | unordered_set newSplitValues_left; 128 | num_t newSplitFitness; 129 | 130 | }; 131 | 132 | void recursiveNodeSplit(TreeData* treeData, 133 | const size_t targetIdx, 134 | const ForestOptions* forestOptions, 135 | distributions::Random* random, 136 | const PredictionFunctionType& predictionFunctionType, 137 | const distributions::PMF* pmf, 138 | const vector& sampleIcs, 139 | size_t* nLeaves, 140 | size_t& childIdx, 141 | vector& children, 142 | SplitCache& splitCache); 143 | 144 | bool regularSplitterSeek(TreeData* treeData, 145 | const size_t targetIdx, 146 | const ForestOptions* forestOptions, 147 | distributions::Random* random, 148 | const vector& sampleIcs, 149 | size_t& childIdx, 150 | vector& children, 151 | SplitCache& splitCache); 152 | 153 | 154 | void recursiveGetSubTreeLeaves(vector& leaves); 155 | 156 | #ifndef TEST__ 157 | private: 158 | #endif 159 | 160 | Splitter splitter_; 161 | 162 | Prediction prediction_; 163 | 164 | Node* leftChild_; 165 | Node* rightChild_; 166 | Node* missingChild_; 167 | 168 | }; 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /src/progress.cpp: -------------------------------------------------------------------------------- 1 | #include "progress.hpp" 2 | 3 | 4 | Progress::Progress(): 5 | width_(3) { 6 | cout << setw(width_) << "0" << "%" << flush; 7 | } 8 | 9 | Progress::~Progress() { 10 | reset(); 11 | } 12 | 13 | void Progress::update(const 
num_t fraction) { 14 | 15 | reset(); 16 | 17 | cout << setw(width_) << static_cast(fraction*100) << "%" << flush; 18 | 19 | } 20 | 21 | void Progress::reset() { 22 | 23 | for(size_t i = 0; i <= width_; ++i) { 24 | cout << "\b"; 25 | } 26 | 27 | } 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/progress.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "datadefs.hpp" 6 | 7 | using namespace std; 8 | using datadefs::num_t; 9 | 10 | class Progress { 11 | public: 12 | Progress(); 13 | ~Progress(); 14 | 15 | void update(const num_t fraction); 16 | 17 | private: 18 | 19 | void reset(); 20 | 21 | size_t width_; 22 | 23 | }; 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/reader.cpp: -------------------------------------------------------------------------------- 1 | #include "reader.hpp" 2 | #include 3 | 4 | using namespace std; 5 | 6 | Reader::Reader(const string& fileName, const char delimiter): 7 | delimiter_(delimiter) { 8 | 9 | this->init(fileName); 10 | 11 | } 12 | 13 | Reader::~Reader() { 14 | 15 | if ( inStream_.is_open() ) { 16 | inStream_.close(); 17 | } 18 | 19 | } 20 | 21 | void Reader::init(const string& fileName) { 22 | 23 | inStream_.open(fileName.c_str()); 24 | 25 | if ( !inStream_.good() ) { 26 | cerr << "ERROR: failed to open file '" << fileName << "' for reading. Make sure the file exists. Quitting..." 
<< endl; 27 | exit(1); 28 | } 29 | 30 | this->setLineFeed(""); 31 | 32 | nLines_ = 0; 33 | 34 | string line; 35 | 36 | for ( nLines_ = 0; getline(inStream_,line); ++nLines_ ) { } 37 | 38 | this->rewind(); 39 | 40 | } 41 | 42 | bool Reader::endOfLine() const { 43 | return( lineFeed_.rdbuf()->in_avail() == 0 ); 44 | } 45 | 46 | bool Reader::nextLine() { 47 | 48 | string line; 49 | 50 | if ( getline(inStream_,line) ) { 51 | this->setLineFeed(line); 52 | return(true); 53 | } else { 54 | this->setLineFeed(line); 55 | return(false); 56 | } 57 | 58 | } 59 | 60 | bool Reader::skipField() { 61 | 62 | string field; 63 | 64 | if ( getline(lineFeed_,field,delimiter_) ) { 65 | return(true); 66 | } else { 67 | return(false); 68 | } 69 | 70 | } 71 | 72 | void Reader::rewind() { 73 | 74 | inStream_.clear(); 75 | inStream_.seekg(ios_base::beg); 76 | 77 | this->setLineFeed(""); 78 | 79 | } 80 | 81 | void Reader::checkLineFeed() const { 82 | 83 | if ( this->endOfLine() ) { 84 | cerr << "READ ERROR: tried to read from an empty linefeed. Did you forget Reader::nextLine()?" 
<< endl; 85 | exit(1); 86 | } 87 | 88 | } 89 | 90 | void Reader::setLineFeed(const string& str) { 91 | lineFeed_.clear(); 92 | lineFeed_.str(str); 93 | } 94 | -------------------------------------------------------------------------------- /src/reader.hpp: -------------------------------------------------------------------------------- 1 | #ifndef READER_HPP 2 | #define READER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.hpp" 11 | #include "datadefs.hpp" 12 | 13 | class Reader { 14 | public: 15 | 16 | Reader(const std::string& fileName, const char delimiter = '\t'); 17 | ~Reader(); 18 | 19 | template inline friend Reader& operator>>(Reader& reader, T& val) { 20 | reader.checkLineFeed(); 21 | std::string field; 22 | std::getline(reader.lineFeed_,field,reader.delimiter_); 23 | std::stringstream ss( utils::chomp(field) ); 24 | ss >> val; 25 | return(reader); 26 | } 27 | 28 | bool nextLine(); 29 | 30 | bool skipField(); 31 | 32 | void rewind(); 33 | 34 | bool endOfLine() const; 35 | 36 | size_t nLines() const { return( nLines_ ); } 37 | 38 | void setDelimiter(const char delimiter) { delimiter_ = delimiter; } 39 | 40 | #ifndef TEST__ 41 | private: 42 | #endif 43 | 44 | void init(const std::string& fileName); 45 | 46 | void checkLineFeed() const; 47 | 48 | void setLineFeed(const string& str); 49 | 50 | std::ifstream inStream_; 51 | 52 | char delimiter_; 53 | 54 | size_t nLines_; 55 | 56 | stringstream lineFeed_; 57 | 58 | }; 59 | 60 | template<> inline Reader& operator>>(Reader& reader, datadefs::num_t& val) { 61 | reader.checkLineFeed(); 62 | std::string field; 63 | std::getline(reader.lineFeed_,field,reader.delimiter_); 64 | field = utils::chomp(field); 65 | if ( datadefs::isNAN_STR(field) ) { 66 | val = datadefs::NUM_NAN; 67 | } else { 68 | std::stringstream ss( utils::chomp(field) ); 69 | ss >> val; 70 | } 71 | return(reader); 72 | } 73 | 74 | /* 75 | template<> inline Reader& operator>>(Reader& reader, 
datadefs::cat_t& val) { 76 | reader.checkLineFeed(); 77 | std::string field; 78 | std::getline(reader.lineFeed_,field,reader.delimiter_); 79 | field = utils::chomp(field); 80 | if ( datadefs::isNAN_STR(field) ) { 81 | val = datadefs::CAT_NAN; 82 | } else { 83 | std::stringstream ss( utils::chomp(field) ); 84 | ss >> val; 85 | } 86 | return(reader); 87 | } 88 | */ 89 | 90 | template<> inline Reader& operator>>(Reader& reader, string& str) { 91 | reader.checkLineFeed(); 92 | std::getline(reader.lineFeed_,str,reader.delimiter_); 93 | str = utils::chomp(str); 94 | return(reader); 95 | } 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /src/rf_ace_R.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "rf_ace.hpp" 6 | #include "treedata.hpp" 7 | #include "datadefs.hpp" 8 | #include "options.hpp" 9 | #include "utils.hpp" 10 | 11 | using namespace std; 12 | using datadefs::num_t; 13 | 14 | void parseDataFrame(SEXP dataFrameObj, vector& dataMatrix, vector& sampleHeaders) { 15 | 16 | Rcpp::DataFrame df(dataFrameObj); 17 | 18 | //Rcpp::CharacterVector colNames = df.attr("names"); 19 | //Rcpp::CharacterVector rowNames = df.attr("row.names"); 20 | 21 | vector featureHeaders = df.attr("names"); 22 | vector foo = df.attr("row.names"); 23 | sampleHeaders = foo; 24 | 25 | dataMatrix.resize( 0 ); 26 | 27 | //cout << "nf = " << featureHeaders.size() << endl; 28 | //cout << "ns = " << sampleHeaders.size() << endl; 29 | 30 | // Read one column of information, which in this case is assumed to be one sample 31 | for ( size_t i = 0; i < featureHeaders.size(); ++i ) { 32 | Rcpp::List vec = df[i]; 33 | assert(vec.length() == sampleHeaders.size() ); 34 | //cout << " " << foo[0] << flush; 35 | //cout << " df[" << i << "].length() = " << vec.length() << endl; 36 | if ( featureHeaders[i].substr(0,2) != "N:" ) { 37 | vector sVec(sampleHeaders.size()); 38 
| for ( size_t j = 0; j < sampleHeaders.size(); ++j ) { 39 | //cout << Rcpp::as(vec[j]) << endl; 40 | sVec[j] = Rcpp::as(vec[j]); 41 | } 42 | if ( featureHeaders[i].substr(0,2) == "T:" ) { 43 | bool doHash = true; 44 | dataMatrix.push_back( Feature(sVec,featureHeaders[i],doHash) ); 45 | } else { 46 | dataMatrix.push_back( Feature(sVec,featureHeaders[i]) ); 47 | } 48 | } else { 49 | vector sVec(sampleHeaders.size()); 50 | for ( size_t j = 0; j < sampleHeaders.size(); ++j ) { 51 | sVec[j] = Rcpp::as(vec[j]); 52 | } 53 | dataMatrix.push_back( Feature(sVec,featureHeaders[i]) ); 54 | } 55 | 56 | // cout << "df[" << j << "," << i << "] = " << Rcpp::as(vec[j]) << endl; 57 | // } 58 | } 59 | 60 | assert( dataMatrix.size() == featureHeaders.size() ); 61 | 62 | } 63 | 64 | RcppExport void rfaceSave(SEXP rfaceObj, SEXP fileName) { 65 | 66 | Rcpp::XPtr rface(rfaceObj); 67 | 68 | rface->save(Rcpp::as(fileName)); 69 | 70 | } 71 | 72 | RcppExport SEXP rfaceLoad(SEXP rfaceFile) { 73 | 74 | 75 | Rcpp::XPtr rface( new RFACE, true); 76 | 77 | rface->load(Rcpp::as(rfaceFile)); 78 | 79 | return(rface); 80 | 81 | } 82 | 83 | RcppExport SEXP rfaceTrain(SEXP trainDataFrameObj, 84 | SEXP targetStrR, 85 | SEXP featureWeightsR, 86 | SEXP forestTypeR, 87 | SEXP nTreesR, 88 | SEXP mTryR, 89 | SEXP nodeSizeR, 90 | SEXP nMaxLeavesR, 91 | SEXP shrinkageR, 92 | SEXP noNABranchingR, 93 | SEXP nThreadsR) { 94 | 95 | 96 | ForestOptions forestOptions( forest_t::QRF ); 97 | 98 | string targetStr = Rcpp::as(targetStrR); 99 | forestOptions.nTrees = Rcpp::as(nTreesR); 100 | forestOptions.mTry = Rcpp::as(mTryR); 101 | forestOptions.nodeSize = Rcpp::as(nodeSizeR); 102 | forestOptions.nMaxLeaves = Rcpp::as(nMaxLeavesR); 103 | forestOptions.shrinkage = Rcpp::as(shrinkageR); 104 | forestOptions.noNABranching = Rcpp::as(noNABranchingR); 105 | size_t nThreads = Rcpp::as(nThreadsR); 106 | 107 | vector dataMatrix; 108 | vector sampleHeaders; 109 | 110 | parseDataFrame(trainDataFrameObj,dataMatrix,sampleHeaders); 
111 | 112 | bool useContrasts = false; 113 | Treedata trainData(dataMatrix,useContrasts,sampleHeaders); 114 | 115 | if ( forestOptions.nMaxLeaves == 0 ) { 116 | forestOptions.nMaxLeaves = datadefs::MAX_IDX; 117 | } 118 | 119 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 120 | 121 | if ( targetIdx == trainData.end() ) { 122 | int integer; 123 | if ( datadefs::isInteger(targetStr,integer) && integer >= 0 && integer < static_cast(trainData.nFeatures()) ) { 124 | targetIdx = static_cast(integer); 125 | } else { 126 | cerr << "Invalid target: " << targetStr << endl; 127 | exit(1); 128 | } 129 | } 130 | 131 | Rcpp::XPtr rface( new RFACE(nThreads), true); 132 | 133 | vector featureWeights = Rcpp::as >(featureWeightsR); 134 | 135 | if ( featureWeights.size() == 0 ) { 136 | featureWeights = trainData.getFeatureWeights(); 137 | } 138 | 139 | featureWeights[targetIdx] = 0.0; 140 | 141 | rface->train(&trainData,targetIdx,featureWeights,&forestOptions); 142 | 143 | return(rface); 144 | 145 | } 146 | 147 | RcppExport SEXP rfacePredict(SEXP rfaceObj, SEXP testDataFrameObj, SEXP quantilesR, SEXP nSamplesForQuantilesR, SEXP distributionsR) { 148 | 149 | Rcpp::XPtr rface(rfaceObj); 150 | 151 | ForestOptions forestOptions(forest_t::QRF); 152 | 153 | { 154 | vector quantiles = Rcpp::as >(quantilesR); 155 | if ( quantiles.size() > 0 ) { 156 | forestOptions.quantiles = quantiles; 157 | } 158 | } 159 | 160 | forestOptions.nSamplesForQuantiles = Rcpp::as(nSamplesForQuantilesR); 161 | forestOptions.distributions = Rcpp::as(distributionsR); 162 | 163 | vector testDataMatrix; 164 | vector sampleHeaders; 165 | 166 | parseDataFrame(testDataFrameObj,testDataMatrix,sampleHeaders); 167 | 168 | bool useContrasts = false; 169 | 170 | Treedata testData(testDataMatrix,useContrasts,sampleHeaders); 171 | 172 | RFACE::QRFPredictionOutput qPredOut = rface->predictQRF(&testData,forestOptions); 173 | 174 | if ( qPredOut.isTargetNumerical ) { 175 | 176 | vector > numPredictionsTrans = 
utils::transpose(qPredOut.numPredictions); 177 | 178 | if ( forestOptions.distributions ) { 179 | 180 | return( Rcpp::List::create(Rcpp::Named("targetName")=qPredOut.targetName, 181 | Rcpp::Named("sampleNames")=qPredOut.sampleNames, 182 | Rcpp::Named("trueData")=qPredOut.trueNumData, 183 | Rcpp::Named("predictions")=numPredictionsTrans, 184 | Rcpp::Named("quantiles")=qPredOut.quantiles, 185 | Rcpp::Named("distributions")=qPredOut.numDistributions)); 186 | 187 | } else { 188 | 189 | return( Rcpp::List::create(Rcpp::Named("targetName")=qPredOut.targetName, 190 | Rcpp::Named("sampleNames")=qPredOut.sampleNames, 191 | Rcpp::Named("trueData")=qPredOut.trueNumData, 192 | Rcpp::Named("predictions")=numPredictionsTrans, 193 | Rcpp::Named("quantiles")=qPredOut.quantiles)); 194 | 195 | } 196 | 197 | } else { 198 | 199 | vector > catPredictionsTrans = utils::transpose(qPredOut.catPredictions); 200 | 201 | return( Rcpp::List::create(Rcpp::Named("targetName")=qPredOut.targetName, 202 | Rcpp::Named("sampleNames")=qPredOut.sampleNames, 203 | Rcpp::Named("trueData")=qPredOut.trueCatData, 204 | Rcpp::Named("predictions")=catPredictionsTrans, 205 | Rcpp::Named("categories")=qPredOut.categories)); 206 | } 207 | 208 | } 209 | 210 | RcppExport SEXP rfaceFilter(SEXP filterDataFrameObj, SEXP targetStrR, SEXP featureWeightsR, SEXP nTreesR, SEXP mTryR, SEXP nodeSizeR, SEXP nMaxLeavesR, SEXP nThreadsR) { 211 | 212 | string targetStr = Rcpp::as(targetStrR); 213 | 214 | ForestOptions forestOptions(forest_t::RF); 215 | forestOptions.nTrees = Rcpp::as(nTreesR); 216 | forestOptions.mTry = Rcpp::as(mTryR); 217 | forestOptions.nodeSize = Rcpp::as(nodeSizeR); 218 | forestOptions.nMaxLeaves = Rcpp::as(nMaxLeavesR); 219 | 220 | size_t nThreads = Rcpp::as(nThreadsR); 221 | 222 | FilterOptions filterOptions; 223 | 224 | vector dataMatrix; 225 | vector sampleHeaders; 226 | 227 | parseDataFrame(filterDataFrameObj,dataMatrix,sampleHeaders); 228 | 229 | bool useContrasts = true; 230 | 231 | Treedata 
filterData(dataMatrix,useContrasts,sampleHeaders); 232 | 233 | size_t targetIdx = filterData.getFeatureIdx(targetStr); 234 | 235 | if ( targetIdx == filterData.end() ) { 236 | int integer; 237 | if ( datadefs::isInteger(targetStr,integer) && integer >= 0 && integer < static_cast(filterData.nFeatures()) ) { 238 | targetIdx = static_cast(integer); 239 | } else { 240 | cerr << "Invalid target: " << targetStr << endl; 241 | exit(1); 242 | } 243 | } 244 | 245 | vector featureWeights = Rcpp::as >(featureWeightsR); 246 | if ( featureWeights.size() == 0 ) { 247 | featureWeights = filterData.getFeatureWeights(); 248 | } 249 | featureWeights[targetIdx] = 0.0; 250 | 251 | RFACE rface(nThreads); 252 | 253 | RFACE::FilterOutput filterOutput = rface.filter(&filterData,targetIdx,featureWeights,&forestOptions,&filterOptions); 254 | 255 | Rcpp::List filterOutputR = Rcpp::List::create(Rcpp::Named("featureNames")=filterOutput.featureNames, 256 | Rcpp::Named("pValues")=filterOutput.pValues, 257 | Rcpp::Named("importances")=filterOutput.importances, 258 | Rcpp::Named("correlations")=filterOutput.correlations, 259 | Rcpp::Named("sampleCounts")=filterOutput.sampleCounts); 260 | 261 | 262 | return(filterOutputR); 263 | 264 | } 265 | 266 | -------------------------------------------------------------------------------- /src/rootnode.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ROOTNODE_HPP 2 | #define ROOTNODE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "node.hpp" 11 | #include "treedata.hpp" 12 | #include "options.hpp" 13 | #include "distributions.hpp" 14 | #include "datadefs.hpp" 15 | 16 | using datadefs::num_t; 17 | using datadefs::cat_t; 18 | 19 | class RootNode : public Node { 20 | public: 21 | 22 | // Empty tree 23 | RootNode(); 24 | 25 | // Learn Tree from data 26 | RootNode(TreeData* trainData, const size_t targetIdx, const distributions::PMF* pmf, const ForestOptions* 
forestOptions, distributions::Random* random); 27 | 28 | // Load tree from file 29 | RootNode(ifstream& treeStream); 30 | 31 | ~RootNode(); 32 | 33 | void reset(const size_t nNodes); 34 | 35 | void loadTree(ifstream& treeStream); 36 | 37 | void writeTree(ofstream& toFile); 38 | 39 | void growTree(TreeData* trainData, const size_t targetIdx, const distributions::PMF* pmf, const ForestOptions* forestOptions, distributions::Random* random); 40 | 41 | Node& childRef(const size_t childIdx); 42 | 43 | size_t nNodes() const; 44 | 45 | size_t nLeaves() const; 46 | 47 | const Prediction& getPrediction(TreeData* treeData, const size_t sampleIdx); 48 | 49 | vector getChildLeafNumTrainData(TreeData* treeData, const size_t sampleIdx); 50 | vector getChildLeafCatTrainData(TreeData* treeData, const size_t sampleIdx); 51 | 52 | vector getOobIcs(); 53 | 54 | size_t nOobSamples(); 55 | 56 | set getFeaturesInTree() { return( featuresInTree_ ); } 57 | 58 | string getTargetName() const { return( targetName_ ); } 59 | bool isTargetNumerical() const { return( isTargetNumerical_ ); } 60 | 61 | unordered_map getDI(); 62 | 63 | void verifyIntegrity() const; 64 | 65 | #ifndef TEST__ 66 | private: 67 | #endif 68 | 69 | size_t getTreeSizeEstimate(const size_t nSamples, const size_t nMaxLeaves, const size_t nodeSize) const; 70 | 71 | forest_t forestType_; 72 | string targetName_; 73 | bool isTargetNumerical_; 74 | 75 | // Parameters that are generated only when a tree is grown 76 | vector children_; 77 | 78 | size_t nLeaves_; 79 | 80 | vector bootstrapIcs_; 81 | vector oobIcs_; 82 | 83 | set featuresInTree_; 84 | 85 | vector minDistToRoot_; 86 | 87 | SplitCache splitCache_; 88 | 89 | }; 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /src/statistics.cpp: -------------------------------------------------------------------------------- 1 | #include "statistics.hpp" 2 | #include "utils.hpp" 3 | #include "math.hpp" 4 | 5 | 
statistics::RF_statistics::RF_statistics() { 6 | 7 | } 8 | 9 | statistics::RF_statistics::RF_statistics(vector > importanceMat, 10 | vector > contrastImportanceMat, 11 | vector > nodeMat, 12 | num_t executionTime): 13 | 14 | importanceMat_(importanceMat), 15 | contrastImportanceMat_(contrastImportanceMat), 16 | nodeMat_(nodeMat), 17 | executionTime_(executionTime) { 18 | 19 | 20 | 21 | } 22 | 23 | void statistics::RF_statistics::printContrastImportance(ofstream& toFile) { 24 | 25 | size_t nFeatures = contrastImportanceMat_[0].size(); 26 | size_t nPerms = contrastImportanceMat_.size(); 27 | 28 | for ( size_t featureIdx = 0; featureIdx < nFeatures; ++featureIdx ) { 29 | 30 | vector fSample( nPerms ); 31 | 32 | for( size_t permIdx = 0; permIdx < nPerms; ++permIdx ) { 33 | fSample[permIdx] = contrastImportanceMat_[permIdx][featureIdx]; 34 | } 35 | 36 | num_t mu = math::mean( utils::removeNANs(fSample) ); 37 | 38 | toFile << mu << endl; 39 | 40 | } 41 | 42 | 43 | } 44 | 45 | void statistics::RF_statistics::print(ofstream& toFile) { 46 | 47 | assert( nodeMat_.size() > 0 ); 48 | 49 | size_t nPerms = importanceMat_.size(); 50 | size_t nTrees = nodeMat_[0].size(); 51 | 52 | assert( nPerms == contrastImportanceMat_.size() ); 53 | assert( nPerms == nodeMat_.size() ); 54 | 55 | vector importanceVec( nPerms ); 56 | vector contrastImportanceVec( nPerms ); 57 | 58 | size_t nNodes = 0; 59 | 60 | for ( size_t permIdx = 0; permIdx < nPerms; ++permIdx ) { 61 | importanceVec[permIdx] = math::mean( utils::removeNANs(importanceMat_[permIdx]) ); 62 | contrastImportanceVec[permIdx] = math::mean( utils::removeNANs(contrastImportanceMat_[permIdx]) ); 63 | for ( size_t treeIdx = 0; treeIdx < nodeMat_[permIdx].size(); ++treeIdx ) { 64 | nNodes += nodeMat_[permIdx][treeIdx]; 65 | } 66 | } 67 | 68 | num_t meanNodesPerTree = 1.0 * nNodes / ( nPerms * nTrees ); 69 | 70 | num_t meanNodesPerSecond = 1.0 * nNodes / executionTime_; 71 | 72 | importanceVec = utils::removeNANs(importanceVec); 73 | 
contrastImportanceVec = utils::removeNANs(contrastImportanceVec); 74 | 75 | num_t meanImportance = math::mean(importanceVec); 76 | num_t meanContrastImportance = math::mean(contrastImportanceVec); 77 | 78 | num_t stdImportance = sqrtf( math::var(importanceVec) ); 79 | num_t stdContrastImportance = sqrtf( math::var(contrastImportanceVec) ); 80 | 81 | toFile << "Random Forest statistics" << endl 82 | << "------------------------" << endl 83 | << "-- NUMBER OF TREES PER FOREST = " << nTrees << endl 84 | << "-- NUMBER OF FORESTS = " << nPerms << endl 85 | << "-- MEAN IMPORTANCE = " << meanImportance << endl 86 | << "-- STD IMPORTANCE = " << stdImportance << endl 87 | << "-- MEAN CONTRAST IMPORTANCE = " << meanContrastImportance << endl 88 | << "-- STD CONTRAST IMPORTANCE = " << stdContrastImportance << endl 89 | << "-- MEAN NODES PER TREE = " << meanNodesPerTree << endl 90 | << "-- MEAN NODES PER SECOND = " << meanNodesPerSecond << endl; 91 | 92 | 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/statistics.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STATISTICS_HPP 2 | #define STATISTICS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "datadefs.hpp" 11 | 12 | using namespace std; 13 | using datadefs::num_t; 14 | using datadefs::NUM_NAN; 15 | 16 | namespace statistics { 17 | 18 | class RF_statistics { 19 | 20 | public: 21 | 22 | RF_statistics(); 23 | RF_statistics(vector > importanceMat, vector > contrastImportanceMat, vector > nodeMat, num_t executionTime); 24 | 25 | void printContrastImportance(ofstream& toFile); 26 | 27 | void print(ofstream& toFile); 28 | 29 | private: 30 | 31 | vector > importanceMat_; 32 | vector > contrastImportanceMat_; 33 | 34 | vector > nodeMat_; 35 | 36 | num_t executionTime_; 37 | 38 | }; 39 | } 40 | 41 | 42 | #endif 43 | 
-------------------------------------------------------------------------------- /src/stochasticforest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STOCHASTICFOREST_HPP 2 | #define STOCHASTICFOREST_HPP 3 | 4 | #include 5 | #include 6 | #include "rootnode.hpp" 7 | #include "treedata.hpp" 8 | #include "options.hpp" 9 | #include "distributions.hpp" 10 | 11 | using namespace std; 12 | 13 | class StochasticForest { 14 | public: 15 | 16 | StochasticForest(); 17 | 18 | ~StochasticForest(); 19 | 20 | void learnRF(TreeData* trainData, const size_t targetIdx, const ForestOptions* forestOptions, const vector& featureWeights, vector& randoms); 21 | void learnGBT(TreeData* trainData, const size_t targetIdx, const ForestOptions* forestOptions, const vector& featureWeights, vector& randoms); 22 | 23 | void loadForest(const string& fileName); 24 | 25 | 26 | void trainForestAndPredictQuantiles(TreeData* trainData, 27 | const size_t targetIdx, 28 | TreeData* testData, 29 | distributions::PMF* pmf, 30 | ForestOptions* forestOptions, 31 | distributions::Random* random, 32 | vector >& predictions); 33 | 34 | //num_t getError() { return(0.0); } 35 | //num_t getOobError(); 36 | 37 | //void getImportanceValues(TreeData* trainData, vector& importanceValues, vector& contrastImportanceValues); 38 | void getMDI(TreeData* trainData, vector& impurityValues, vector& contrastImpurityValues); 39 | 40 | void predict(TreeData* testData, vector& predictions, vector& confidence, size_t nThreads = 1); 41 | void predict(TreeData* testData, vector& predictions, vector& confidence, size_t nThreads = 1); 42 | 43 | //bool useQuantiles() const; 44 | 45 | void getNumDistributions(TreeData* testData, vector >& distributions, distributions::Random* random, const size_t nSamplesPerTree); 46 | void getCatDistributions(TreeData* testData, vector >& distributions, distributions::Random* random, const size_t nSamplesPerTree); 47 | 48 | //vector 
#ifndef TIMER_HPP
#define TIMER_HPP

#include <cstdlib>
#include <cmath>
#include <ctime>
#include <string>
#include <vector>
#include <map>
#include <iostream>

// NOTE: the previous #include "datadefs.hpp" was removed — nothing in this
// header uses datadefs, and dropping it makes timer.hpp self-contained.

using namespace std;

// Lightweight wall-clock + CPU-clock timer for named code sections.
// Usage: tic("name"); ...work...; toc("name"); ... print();
// "boost" reported by print() is CPU seconds / wall seconds, i.e. the
// apparent parallelization factor.
class Timer {

public:

  Timer() {}
  ~Timer() {}

  // Start (or restart) timing the section identified by objName.
  // FIX: the old code used map::insert, which silently keeps the FIRST
  // index on a repeated tic(); the stale object stayed "running" forever
  // and a later toc() measured from the original start. A restart now
  // reuses the existing slot.
  void tic(const string& objName) {
    map<string,size_t>::iterator it = name2idx_.find(objName);
    if ( it != name2idx_.end() ) {
      timedObjects_[it->second] = TimedObject(objName);
    } else {
      name2idx_[objName] = timedObjects_.size();
      timedObjects_.push_back( TimedObject(objName) );
    }
  }

  // Stop timing objName; records elapsed wall seconds, CPU clocks, and the
  // boost factor. Exits the process if objName was never started.
  void toc(const string& objName) {

    map<string,size_t>::const_iterator it( name2idx_.find(objName) );

    if ( it == name2idx_.end() ) {
      cerr << "Cannot stop timing '" << objName << "', since it was never started!" << endl;
      exit(1);
    }

    size_t idx = it->second; // reuse the lookup; the old code indexed the map a second time
    timedObjects_[idx].timeDiff = time(0) - timedObjects_[idx].startTime;
    timedObjects_[idx].clockDiff = clock() - timedObjects_[idx].startClocks;
    // Guard against division by zero for sections shorter than one second
    if ( timedObjects_[idx].timeDiff > 0 ) {
      timedObjects_[idx].boost = static_cast<clock_t>(round(1.0 * timedObjects_[idx].clockDiff / ( CLOCKS_PER_SEC * timedObjects_[idx].timeDiff )));
    }
    timedObjects_[idx].isRunning = false;
  }

  // Print one summary line per timed section.
  void print() {
    cout << "Execution time breakdown:" << endl;
    for ( size_t i = 0; i < timedObjects_.size(); ++i ) {
      timedObjects_[i].print();
    }
    cout << endl;
  }

private:

  struct TimedObject {
    string name;
    time_t startTime;    // wall-clock start; was clock_t, the wrong type for time(0)
    time_t timeDiff;     // elapsed wall seconds, valid once isRunning == false
    clock_t startClocks; // CPU clocks at tic()
    clock_t clockDiff;   // CPU clocks consumed between tic() and toc()
    clock_t boost;       // ~CPU seconds per wall second (parallelization factor)
    bool isRunning;
    // FIX: timeDiff/clockDiff are now zero-initialized so that printing a
    // section that was never toc'd is well-defined.
    TimedObject(const string& newName): name(newName),startTime(time(0)),timeDiff(0),startClocks(clock()),clockDiff(0),boost(1),isRunning(true) {}
    void print() {
      if ( !isRunning ) {
        cout << name << " " << timeDiff << " seconds (" << boost << "x)" << endl;
      } else {
        cout << name << " is still running!" << endl;
      }
    }
  };

  // Maps a section name to its slot in timedObjects_
  map<string,size_t> name2idx_;

  vector<TimedObject> timedObjects_;

};


#endif
| virtual num_t textualFeatureSplit(const size_t targetIdx, 63 | const size_t featureIdx, 64 | const uint32_t hashIdx, 65 | const size_t minSamples, 66 | vector& sampleIcs_left, 67 | vector& sampleIcs_right) = 0; 68 | 69 | // Generates a bootstrap sample from the real samples of featureIdx. Samples not in the bootstrap sample will be stored in oob_ics, 70 | // and the number of oob samples is stored in noob. 71 | virtual void bootstrapFromRealSamples(distributions::Random* random, 72 | const bool withReplacement, 73 | const num_t sampleSize, 74 | const size_t featureIdx, 75 | vector& ics, 76 | vector& oobIcs) = 0; 77 | 78 | virtual void createContrasts() = 0; 79 | virtual void permuteContrasts(distributions::Random* random) = 0; 80 | 81 | }; 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_HPP 2 | #define UTILS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "datadefs.hpp" 15 | #include "distributions.hpp" 16 | 17 | using namespace std; 18 | using datadefs::num_t; 19 | using datadefs::cat_t; 20 | 21 | class Treedata; 22 | 23 | namespace utils { 24 | 25 | string tolower(const string& str); 26 | 27 | string suffix(const string& str); 28 | 29 | // Removes missing values from the provided data vector 30 | //vector removeNANs(vector x); 31 | 32 | // Chomps a string, i.e. 
removes all the trailing end-of-line characters 33 | string chomp(const string& str, const string& eof = "\r\n"); 34 | 35 | // Removes leading and trailing whitespace characters 36 | string trim(const string& str, const string& wh = " "); 37 | 38 | unordered_set keys(const string& str, const char delimiter); 39 | 40 | // A sophisticated parser that extracts a key-value pairs from a string 41 | map parse(const string& str, 42 | const char delimiter, 43 | const char separator, 44 | const char comment); 45 | 46 | map parse(istream& streamObj, 47 | const char delimiter, 48 | const char separator, 49 | const char comment); 50 | 51 | unordered_set hashText(const string& text); 52 | 53 | // Splits a delimited string 54 | vector split(const string& str, const char delimiter, const string& wh = " "); 55 | 56 | // Splits a delimited stream 57 | vector split(istream& streamObj, const char delimiter, const string& wh = " "); 58 | 59 | // Reads a list of items from a file 60 | vector readListFromFile(const string& fileName, const char delimiter); 61 | 62 | template 63 | vector removeNANs(vector data) { 64 | 65 | data.erase( remove_if(data.begin(),data.end(),&datadefs::isNAN), data.end() ); 66 | 67 | return(data); 68 | 69 | } 70 | 71 | template 72 | inline void write(ostream& os, StartIterator startIt, StopIterator stopIt, const char delimiter = ' ') { 73 | 74 | if ( startIt != stopIt ) { 75 | os << *startIt; 76 | ++startIt; 77 | } 78 | 79 | while ( startIt != stopIt ) { 80 | os << delimiter << *startIt; 81 | ++startIt; 82 | } 83 | } 84 | 85 | void filterSort(const bool isIncreasingOrder, 86 | vector& data, 87 | vector& refIcs); 88 | 89 | string num2str(const num_t x); 90 | 91 | void strv2numv(const vector& strvec, 92 | vector& numvec); 93 | 94 | /* 95 | void strv2catv(const vector& strvec, 96 | vector& catvec, 97 | map& mapping, 98 | map& backMapping); 99 | */ 100 | 101 | void sortDataAndMakeRef(const bool isIncreasingOrder, 102 | vector& data, 103 | vector& refIcs); 104 | 105 
| /** 106 | * Sorts a given input data vector of type T based on a given reference 107 | * ordering of type vector. 108 | !! Correctness: this will fail if any of the contents of refIcs fall outside 109 | of the normal scope of vector& data. 110 | */ 111 | template void sortFromRef(vector& data, 112 | vector const& refIcs 113 | ) { 114 | assert(data.size() == refIcs.size()); 115 | vector foo = data; 116 | int n = data.size(); 117 | for (int i = 0; i < n; ++i) { 118 | data[i] = foo[refIcs[i]]; 119 | } 120 | } 121 | 122 | template 123 | T str2(const string& str) { 124 | 125 | if( datadefs::isNAN_STR(str) ) { 126 | return( static_cast(datadefs::NUM_NAN) ); 127 | } 128 | 129 | stringstream ss( chomp(str) ); 130 | T ret; 131 | ss >> ret; 132 | 133 | if ( ss.fail() || ss.bad() || !ss.eof() ) { 134 | cerr << "utils::convert::str2() -- input '" << str 135 | << "' incorrectly formatted for conversion to type T" << endl; 136 | exit(1); 137 | } 138 | 139 | return( ret ); 140 | } 141 | 142 | template 143 | vector > transpose(const vector >& data) { 144 | 145 | size_t nRows = data.size(); 146 | size_t nCols = data[0].size(); 147 | 148 | vector > dataTransposed(nCols,vector(nRows,datadefs::NUM_NAN)); 149 | 150 | for ( size_t i = 0; i < nRows; ++i ) { 151 | for ( size_t j = 0; j < nCols; ++j ) { 152 | dataTransposed[j][i] = data[i][j]; 153 | } 154 | } 155 | 156 | return(dataTransposed); 157 | 158 | } 159 | 160 | vector range(const size_t n); 161 | 162 | istream& safeGetline(istream& is, string& t); 163 | 164 | vector > splitRange(const size_t nElements, const size_t nSplits); 165 | 166 | template 167 | void permute(vector& data, distributions::Random* random) { 168 | 169 | // Permute indices 170 | for (size_t i = 0; i < data.size(); ++i) { 171 | size_t j = random->integer() % (i + 1); 172 | T temp = data[i]; 173 | data[i] = data[j]; 174 | data[j] = temp; 175 | } 176 | 177 | } 178 | 179 | num_t numericalFeatureSplitsNumericalTarget(const vector& tv, 180 | const vector& fv, 181 | 
const size_t minSamples, 182 | size_t& splitIdx); 183 | 184 | num_t numericalFeatureSplitsCategoricalTarget(const vector& tv, 185 | const vector& fv, 186 | const size_t minSamples, 187 | size_t& splitIdx); 188 | 189 | num_t categoricalFeatureSplitsNumericalTarget(const vector& tv, 190 | const vector& fv, 191 | const size_t minSamples, 192 | const vector& catOrder, 193 | unordered_map >& fmap_left, 194 | unordered_map >& fmap_right); 195 | 196 | num_t categoricalFeatureSplitsCategoricalTarget(const vector& tv, 197 | const vector& fv, 198 | const size_t minSamples, 199 | const vector& catOrder, 200 | unordered_map >& fmap_left, 201 | unordered_map >& fmap_right); 202 | 203 | 204 | } 205 | 206 | #endif 207 | -------------------------------------------------------------------------------- /test/GBT_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "argparse.hpp" 4 | #include "stochasticforest.hpp" 5 | #include "treedata.hpp" 6 | 7 | using namespace std; 8 | 9 | 10 | const size_t DEFAULT_TARGETIDX = 0; 11 | const size_t DEFAULT_NTREES = 500; 12 | const size_t DEFAULT_NODESIZE = 5; 13 | const num_t DEFAULT_SHRINKAGE = 0.2; 14 | const num_t DEFAULT_SUBSAMPLE = 0.5; 15 | 16 | int main(const int argc, char* const argv[]) { 17 | 18 | //------------------------------------------------------------------------ 19 | // 0: parameters 20 | if(argc == 1 || argc == 2) { 21 | if(argc == 2) { 22 | string helphandle(argv[1]); 23 | if (helphandle != "-h" && helphandle != "--help") { 24 | cerr << "use -h or --help to get started" << endl; 25 | return EXIT_FAILURE; 26 | } 27 | } 28 | 29 | cout << endl; 30 | cout << "REQUIRED ARGUMENTS:" << endl; 31 | cout << "-I / --input input feature matrix" << endl; 32 | cout << "-O / --output output association file" << endl; 33 | cout << endl; 34 | cout << "OPTIONAL ARGUMENTS:" << endl; 35 | cout << "-i / --targetidx target index, ref. 
to feature matrix (default " << DEFAULT_TARGETIDX << ")" << endl; 36 | cout << "-n / --ntrees number of trees per GBT forest (default " << DEFAULT_NTREES << ")" << endl; 37 | cout << "-s / --nodesize minimum number of train samples per node, affects tree depth (default " << DEFAULT_NODESIZE << ")" << endl; 38 | cout << "-z / --shrinkage shrinkage (default " << DEFAULT_SHRINKAGE << ")" << endl; 39 | cout << "-u / --subsample subsample size (default " << DEFAULT_SUBSAMPLE << ")" << endl; 40 | cout << endl; 41 | return EXIT_SUCCESS; 42 | } 43 | 44 | cout << endl; 45 | cout << " ----------------------------------" << endl; 46 | cout << " --- GBT_benchmark version 0.0.2 ---" << endl; 47 | cout << " ----------------------------------" << endl; 48 | 49 | //using namespace GetOpt; 50 | string input = ""; 51 | size_t targetIdx = DEFAULT_TARGETIDX; 52 | size_t ntrees = DEFAULT_NTREES; 53 | size_t nodesize = DEFAULT_NODESIZE; 54 | num_t shrinkage = DEFAULT_SHRINKAGE; 55 | num_t subSampleSize = DEFAULT_SUBSAMPLE; 56 | string output = ""; 57 | 58 | ArgParse parser(argc,argv); 59 | parser.getArgument("I","input",input); 60 | parser.getArgument("i","target",targetIdx); 61 | parser.getArgument("n","ntrees",ntrees); 62 | parser.getArgument("O","output",output); 63 | parser.getArgument("z","shrinkage",shrinkage); 64 | parser.getArgument("u","subsample",subSampleSize); 65 | 66 | 67 | if(input == "") { 68 | cerr << "input file not specified" << endl; 69 | return EXIT_FAILURE; 70 | } 71 | 72 | if(output == "") { 73 | cerr << "output file not specified" << endl; 74 | return EXIT_FAILURE; 75 | } 76 | 77 | 78 | //------------------------------------------------------------------------ 79 | // 1: read data into Treedata class (features are rows) 80 | cout < 0 ) { 14 | for ( i in 1:length(classes) ) { 15 | start <- (classes[i]-1)*300 + 1 16 | fakeClasses[i] <- sample(start:(start+offset),1,replace=T) 17 | } 18 | } 19 | 20 | return(fakeClasses) 21 | } 22 | 23 | 24 | 25 | 26 | makeData <- 
function(nSamples,std,offset,pMissing) { 27 | 28 | nWordsMin <- 4 29 | nWordsMax <- 8 30 | 31 | bags <- list( 32 | list("buckler","shield","sword","helmet","gloves","horse","medieval","castle","joust","clown","extra","words","that","mix"), 33 | list("swan","duck","duckling","bird","fly","pond","wings","feather","beak","legs","words","that","dont","distinguish"), 34 | list("baby","diaper","toy","poo","pee","smile","cry","toddler","infant","play","text","that","dont","distinguish")) 35 | 36 | classes <- sample(1:3,nSamples,replace=T) 37 | fakeClasses <- sampleFakeClasses(classes,offset) 38 | 39 | nWordsPerSample <- sample(nWordsMin:nWordsMax,nSamples,replace=TRUE) 40 | 41 | text <- vector() 42 | 43 | v <- seq(0,4*pi,length.out=nSamples) 44 | x1 <- sin(v) + rnorm(nSamples,0,std) 45 | x2 <- v + rnorm(nSamples,0,std) 46 | y <- x1 + x2 + rnorm(nSamples,0,std) 47 | 48 | nNoisyVars <- 4 49 | 50 | for ( i in 1:nSamples ) { 51 | c <- classes[i] 52 | nWords <- nWordsPerSample[i] 53 | # nWords <- 10 54 | text[i] <- paste(sample(bags[[c]],nWords,replace=F),collapse=', ') 55 | y[i] <- y[i] + 4 * pi * c 56 | } 57 | 58 | n1 <- rnorm(nSamples) 59 | n1[runif(nSamples) < pMissing & classes == 1] <- NA 60 | n2 <- rnorm(nSamples) 61 | n2[runif(nSamples) < pMissing & classes == 1] <- NA 62 | n3 <- rnorm(nSamples) 63 | n3[runif(nSamples) < pMissing & classes == 1] <- NA 64 | n4 <- rnorm(nSamples) 65 | n4[runif(nSamples) < pMissing & classes == 1] <- NA 66 | x1[runif(nSamples) < pMissing & classes == 1] <- NA 67 | x2[runif(nSamples) < pMissing & classes == 1] <- NA 68 | fakeClasses[runif(nSamples) < pMissing & classes == 1] <- NA 69 | 70 | # Populating the data frame with the training data 71 | data <- data.frame(y,x1,x2,text,as.character(fakeClasses),n1,n2,n3,n4,stringsAsFactors=FALSE) 72 | colnames(data) <- c("N:output","N:input1","N:input2","T:random","C:class","N:noise1","N:noise2","N:noise3","N:noise4") 73 | 74 | # Populating sample names 75 | rownames(data) <- 
paste(c(rep("s",nSamples)),(1:nSamples),sep='') 76 | 77 | return(data) 78 | 79 | } 80 | 81 | getRFACEOutput <- function(trainData,testData,forestType,noNABranching,quantiles=vector(length(0))) { 82 | 83 | rface <- rface.train(trainData,"N:output",nTrees=50,mTry=3,nodeSize=3,forestType=forestType,noNABranching=noNABranching) 84 | rfaceOut <- rface.predict(rface,testData,quantiles=as.vector(c(0.5))) 85 | rfaceOut$predData <- rfaceOut$predictions[[1]] 86 | return(rfaceOut) 87 | 88 | } 89 | 90 | getQuantileVector <- function(predictions,idx) { 91 | 92 | out <- vector(length=length(predictions)) 93 | 94 | for ( i in 1:length(predictions) ) { 95 | out[i] <- predictions[[i]][idx] 96 | } 97 | return(out) 98 | } 99 | 100 | testCalibration <- function(rfaceOut) { 101 | 102 | nQuantiles <- length(rfaceOut$quantiles) 103 | nSamples <- length(rfaceOut$trueData) 104 | cal <- 1*vector(length=nQuantiles) 105 | 106 | for ( q in 1:nQuantiles ) { 107 | cal[q] <- sum( rfaceOut$trueData < rfaceOut$predictions[[q]] )/nSamples 108 | } 109 | 110 | return(cal) 111 | } 112 | 113 | benchmarkMissingValues <- function(pMissing) { 114 | 115 | offset <- 0 116 | nSamples <- 1000 117 | std <- 0.3 118 | 119 | trainData <- makeData(nSamples,std,offset,pMissing) 120 | testData <- makeData(nSamples,std,offset,pMissing) 121 | 122 | icsNum <- as.vector(c(1,2,3,6,7,8,9)) 123 | icsNumTxt <- as.vector(c(1,2,3,4,6,7,8,9)) 124 | icsNumCat <- as.vector(c(1,2,3,5,6,7,8,9)) 125 | 126 | outA <- getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",TRUE) 127 | outB <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",TRUE) 128 | outC <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",TRUE) 129 | outD <- getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",FALSE) 130 | outE <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",FALSE) 131 | outF <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",FALSE) 132 | 133 | #outG <- 
getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",TRUE,quantiles=vector(c(0.5))) 134 | #outH <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",TRUE,quantiles=vector(c(0.5))) 135 | #outI <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",TRUE,quantiles=vector(c(0.5))) 136 | #outJ <- getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",FALSE,quantiles=vector(c(0.5))) 137 | #outK <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",FALSE,quantiles=vector(c(0.5))) 138 | #outL <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",FALSE,quantiles=vector(c(0.5))) 139 | 140 | trainData$"C:class" <- as.factor(trainData$"C:class") 141 | testData$"C:class" <- as.factor(testData$"C:class") 142 | 143 | imputedTrainData <- na.roughfix(trainData[c(1,2,3,5,6,7,8,9)]) 144 | imputedTestData <- na.roughfix(testData[ c(1,2,3,5,6,7,8,9)]) 145 | 146 | rfOut1 <- randomForest(imputedTrainData[c(2,3,5,6,7,8)],y=imputedTrainData[[1]],xtest=imputedTestData[c(2,3,5,6,7,8)],ytest=imputedTestData[[1]],ntree=50,mtry=3) 147 | rfOut2 <- randomForest(imputedTrainData[c(2,3,4,5,6,7,8)],y=imputedTrainData[[1]],xtest=imputedTestData[c(2,3,4,5,6,7,8)],ytest=imputedTestData[[1]],ntree=50,mtry=3) 148 | 149 | outRef1 <- list() 150 | outRef1$trueData <- outA$trueData 151 | outRef1$predData <- rfOut1$test$predicted 152 | 153 | outRef2 <- list() 154 | outRef2$trueData <- outA$trueData 155 | outRef2$predData <- rfOut2$test$predicted 156 | 157 | colors <- testData$"C:class" 158 | 159 | # dev.new() 160 | pdf("scattermatrix.pdf") 161 | pairs(testData[c(1,2,3,7)],col=colors) 162 | dev.off() 163 | 164 | errors <- list() 165 | errors$num <- c(rmse(outRef1),rmse(outA),rmse(outD)) 166 | names(errors$num) <- c("RF\nImputed","RF-ACE\nBinary","RF-ACE\nTernary") 167 | errors$txt <- c(rmse(outRef1),rmse(outB),rmse(outE)) 168 | names(errors$txt) <- c("RF\nImputed","RF-ACE\nBinary","RF-ACE\nTernary") 169 | errors$cat <- c(rmse(outRef2),rmse(outC),rmse(outF)) 170 | 
# Compares the training speed (and prediction RMSE) of rface.train against
# randomForest on a dataset whose only predictor is a categorical feature.
# `offset` controls the cardinality of that feature via makeData's fake-class
# generator, so larger offsets stress the categorical splitter.
# Returns a list with per-package cumulative timings and RMSEs.
# NOTE(review): makeData() and rmse() are assumed to come from this file /
# test/R/utils.R — confirm they are sourced before calling this.
benchmarkCatSplitterSpeed <- function(offset) {

  nSamples <- 1000
  std <- 0.3
  pMissing <- 0.0

  trainData <- makeData(nSamples,std,offset,pMissing)
  testData <- makeData(nSamples,std,offset,pMissing)
  # Keep only the numeric target (col 1) and the categorical feature (col 5)
  trainData <- trainData[c(1,5)]
  testData <- testData[c(1,5)]

  speed <- list()

  # Accumulate wall time over 10 training runs of rface
  speed$rface <- 0
  for ( i in 1:10 ) {
    diff <- proc.time()
    rface <- rface.train(trainData,"N:output",nTrees=50,mTry=1,nodeSize=3,forestType="RF",noNABranching=FALSE)
    diff <- proc.time() - diff
    # [1] picks the "user.self" component of proc.time()
    speed$rface <- as.matrix(speed$rface + diff)[1]
  }

  # RMSE of the last trained rface forest (median prediction, q = 0.5)
  RMSE <- list()
  rfaceOut <- rface.predict(rface,testData,quantiles=as.vector(c(0.5)))
  rfaceOut$predData <- rfaceOut$predictions[[1]]
  RMSE$rface <- rmse(rfaceOut)

  # randomForest needs the categorical column as a factor
  trainData$"C:class" <- as.factor(trainData$"C:class")

  # randomForest is only benchmarked for small offsets — presumably because
  # it cannot handle high-cardinality factors; TODO confirm the threshold.
  speed$rf <- NA
  if (offset < 10) {
    speed$rf <- 0
    for ( i in 1:10 ) {
      diff <- proc.time()
      rf <- randomForest(trainData[2],y=trainData[[1]],ntree=50,mtry=1)
      diff <- proc.time() - diff
      speed$rf <- as.matrix(speed$rf + diff)[1]
    }
  }

  RMSE$rf <- NA
  if (offset < 10) {
    # Separate run with a test set to obtain out-of-sample predictions
    rf <- randomForest(trainData[2],y=trainData[[1]],xtest=testData[2],ytest=testData[[1]],ntree=50,mtry=1)

    rfOut <- list()
    rfOut$trueData <- rfaceOut$trueData
    rfOut$predData <- rf$test$predicted
    RMSE$rf <- rmse(rfOut)
  }

  return(list(rfSpeed=speed$rf,rfaceSpeed=speed$rface,data=trainData,rfRMSE=RMSE$rf,rfaceRMSE=RMSE$rface))
}
#!/bin/bash
#
# Measures how forest tree sizes depend on the fraction of missing values.
# Trains rf-ace on the original matrix and on 10/20/30 %-amputed copies,
# once with -N and once without (the *b vs *t forests; -N presumably
# disables NA branching — confirm against rf-ace --help), then collects the
# per-tree NNODES counts into tmp/treesizes.tsv, one row per forest.
#
# FIX: fail fast on any error instead of silently producing a partial
# treesizes.tsv, and replace eight copy-pasted grep pipelines with a loop.
set -euo pipefail

file="test_103by300_mixed_matrix.afm"

# Create copies of the data with 10/20/30 % of values amputed (set to NA).
for p in 1 2 3; do
    python test/python/ampute.py "$file" "0.$p" "tmp/foo_${p}.afm"
done

# Train the forests. Seeds (-S) are distinct so runs are reproducible but
# independent; *b forests use -N, *t forests do not.
bin/rf-ace --trainData "$file"       -i 0 -S 1 -n 100 -m 30 -V tmp/forest_0b.sf -N
bin/rf-ace --trainData tmp/foo_1.afm -i 0 -S 2 -n 100 -m 30 -V tmp/forest_1b.sf -N
bin/rf-ace --trainData tmp/foo_2.afm -i 0 -S 3 -n 100 -m 30 -V tmp/forest_2b.sf -N
bin/rf-ace --trainData tmp/foo_3.afm -i 0 -S 4 -n 100 -m 30 -V tmp/forest_3b.sf
bin/rf-ace --trainData "$file"       -i 0 -S 5 -n 100 -m 30 -V tmp/forest_0t.sf
bin/rf-ace --trainData tmp/foo_1.afm -i 0 -S 6 -n 100 -m 30 -V tmp/forest_1t.sf
bin/rf-ace --trainData tmp/foo_2.afm -i 0 -S 7 -n 100 -m 30 -V tmp/forest_2t.sf
bin/rf-ace --trainData tmp/foo_3.afm -i 0 -S 8 -n 100 -m 30 -V tmp/forest_3t.sf

# Extract NNODES= values: one tab-separated row per forest, same order as
# the original script (0b..3b then 0t..3t). The awk ORS quirk (row ends in
# newline + tab) is preserved from the original on purpose.
: > tmp/treesizes.tsv
for forest in 0b 1b 2b 3b 0t 1t 2t 3t; do
    grep "NNODES=" "tmp/forest_${forest}.sf" | cut -d',' -f2 | cut -d'=' -f2 \
        | awk 'BEGIN{ORS="\t"}1;END{print "\n"}' >> tmp/treesizes.tsv
done
@ATTRIBUTE X18 {0,1,2} 22 | @ATTRIBUTE X19 {0,1,2} 23 | @ATTRIBUTE Class {a,b} 24 | 25 | @DATA 26 | 0,1,1,2,2,1,1,1,2,1,2,2,0,1,1,1,1,1,1,1,b 27 | 0,1,1,0,2,1,0,0,2,1,2,1,0,1,1,1,1,2,1,0,a 28 | 0,1,1,2,2,0,0,0,2,1,2,1,0,1,1,1,1,1,1,0,a 29 | 1,1,1,2,2,1,1,1,2,0,2,2,0,1,1,2,1,0,2,1,b 30 | 0,1,0,2,2,0,0,1,2,1,0,1,0,2,1,1,1,1,1,0,a 31 | 0,1,1,1,2,1,0,0,2,1,2,1,0,1,1,1,1,1,1,0,a 32 | 0,1,1,2,2,1,0,1,2,1,1,2,0,1,1,0,1,1,1,1,b 33 | 0,1,1,2,0,1,0,0,2,1,2,1,0,1,0,2,1,2,1,0,a 34 | 0,1,1,1,2,1,0,0,0,2,1,1,0,1,1,1,1,1,1,0,a 35 | 0,1,1,1,0,1,1,1,0,1,2,2,1,0,1,1,2,1,1,1,b 36 | 0,1,1,2,2,1,0,0,2,1,2,1,0,1,1,0,1,1,1,0,a 37 | 0,1,1,2,2,1,0,0,2,1,2,1,0,1,1,1,1,1,1,0,a 38 | -------------------------------------------------------------------------------- /test/data/3by8_mixed_NA_matrix.afm: -------------------------------------------------------------------------------- 1 | N:var0 C:var1 N:var2 N:var3 N:var4 N:var5 N:var6 T:var7 2 | s0 NA foo 2.2 3.3 4.4 5.5 6.6 Ah, be so good. Yes, no? 3 | s1 0.00 NA 2.22 3.33 4.44 5.55 NA NA 4 | s2 0.000 bar 2.222 3.333 4.444 5.555 6.666 Some more text, but not much. -------------------------------------------------------------------------------- /test/data/3by8_mixed_NA_transposed_matrix.afm: -------------------------------------------------------------------------------- 1 | s0 s1 s2 2 | N:var0 NA 0.00 0.000 3 | C:var1 foo NA bar 4 | N:var2 2.2 2.22 2.222 5 | N:var3 3.3 3.33 3.333 6 | N:var4 4.4 4.44 4.444 7 | N:var5 5.5 5.55 5.555 8 | N:var6 6.6 NA 6.666 9 | T:var7 Ah, be so good. Yes, no? NA Some more text, but not much. 
-------------------------------------------------------------------------------- /test/data/5by10_numeric_matrix.arff: -------------------------------------------------------------------------------- 1 | @relation po 2 | @attribute x1 numeric 3 | @attribute x2 numeric 4 | @attribute x3 numeric 5 | @attribute x4 numeric 6 | @attribute y numeric 7 | @data 8 | 0.8147,1.0000,0.0596,0.9160,6.0000 9 | 0.9058,2.0000,0.6820,0.0012,14.0000 10 | 0.1270,3.0000,0.0424,0.4624,24.0000 11 | 0.9134,4.0000,0.0714,0.4243,36.0000 12 | 0.6324,5.0000,?,0.4609,50.0000 13 | 0.0975,6.0000,0.0967,0.7702,66.0000 14 | 0.2785,7.0000,0.8181,0.3225,84.0000 15 | 0.5469,?,0.8175,0.7847,104.0000 16 | 0.9575,9.0000,0.7224,0.4714,126.0000 17 | 0.9649,10.0000,0.1499,0.0358,150.0000 18 | -------------------------------------------------------------------------------- /test/distributions_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DISTRIBUTIONS_NEWTEST_HPP 2 | #define DISTRIBUTIONS_NEWTEST_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "utils.hpp" 9 | #include "newtest.hpp" 10 | #include "distributions.hpp" 11 | 12 | using namespace std; 13 | 14 | void distributions_newtest_random_integer(); 15 | void distributions_newtest_random_uniform(); 16 | void distributions_newtest_PMF(); 17 | 18 | void distributions_newtest() { 19 | 20 | newtest("integer()", &distributions_newtest_random_integer); 21 | newtest("uniform()", &distributions_newtest_random_uniform); 22 | newtest("pmf()", &distributions_newtest_PMF); 23 | 24 | } 25 | 26 | void distributions_newtest_random_integer() { 27 | 28 | // Make two identical random integer generators 29 | distributions::Random randGen1(0); 30 | distributions::Random randGen2(0); 31 | 32 | bool stayInSync = true; 33 | 34 | // Test that rand1 and rand2 stay in sync 35 | for ( size_t i = 0; i < 1000; ++i ) { 36 | 37 | size_t r1 = randGen1.integer(); 38 | size_t r2 = randGen2.integer(); 39 | 40 | if ( 
r1 != r2 ) { 41 | stayInSync = false; 42 | break; 43 | } 44 | 45 | } 46 | 47 | newassert( stayInSync ); 48 | 49 | unordered_map hist; 50 | 51 | size_t maxIdx = 1000; 52 | 53 | for ( size_t i = 0; i < maxIdx; ++i ) { 54 | hist[i] = 0; 55 | } 56 | 57 | for ( size_t i = 0; i < 100000; ++i ) { 58 | //size_t r = rand1() % maxIdx; 59 | ++hist[ randGen1.integer() % maxIdx ]; 60 | } 61 | 62 | size_t nZeroCounts = 0; 63 | 64 | for ( size_t i = 0; i < maxIdx; ++i ) { 65 | if ( hist[i] == 0 ) ++nZeroCounts; 66 | } 67 | 68 | // We allow there to be at most two indices that never got sampled during 69 | // 100k random number generation rounds 70 | newassert( nZeroCounts <= 2 ); 71 | 72 | } 73 | 74 | void distributions_newtest_random_uniform() { 75 | 76 | num_t r_min = datadefs::NUM_INF; 77 | num_t r_max = 0.0; 78 | 79 | distributions::Random random(0); 80 | 81 | bool stayWithinBounds = true; 82 | 83 | for ( size_t i = 0; i < 100000; ++i ) { 84 | num_t r = random.uniform(); 85 | 86 | if ( ! (0.0 <= r && r <= 1.0) ) { 87 | stayWithinBounds = false; 88 | break; 89 | } 90 | 91 | if ( r_min > r ) r_min = r; 92 | if ( r_max < r ) r_max = r; 93 | 94 | } 95 | 96 | newassert( stayWithinBounds ); 97 | newassert( r_max > r_min ); 98 | newassert( fabs( 1 - r_max - r_min ) < 0.0001 ); 99 | 100 | } 101 | 102 | void distributions_newtest_PMF() { 103 | 104 | distributions::Random random(0); 105 | 106 | vector weights = {1,2,3,5,3,1,0,1e-5}; 107 | 108 | num_t sum = math::mean(weights) * weights.size(); 109 | 110 | distributions::PMF pmf(weights); 111 | 112 | vector PMFest(8,0.0); 113 | 114 | size_t maxIter = 1e7; 115 | num_t incr = 1.0/maxIter; 116 | 117 | for ( size_t i = 0; i < maxIter; ++i ) { 118 | PMFest[ pmf.sample(&random) ] += incr; 119 | } 120 | 121 | for ( size_t i = 0; i < 8; ++i ) { 122 | newassert( fabs( PMFest[i] - weights[i] / sum ) < 0.01 ); 123 | } 124 | 125 | } 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- 
function [DI,splitValues_left,splitValues_right,ics_left,ics_right] = categoricalFeatureSplit(tv,fv,minSamples,isTargetNumerical)
% Greedy best-first partition of the categories of feature fv into a left
% and a right branch, maximizing the impurity decrease of target tv.
% Categories are moved one at a time from right to left while the gain
% improves and both branches keep at least minSamples samples.
%
% NOTE(review): the returned ics_left/ics_right index into the NaN-filtered
% vectors, NOT the original inputs (unlike numericalFeatureSplit, which maps
% back through ics) — confirm this asymmetry is intended.

% Eliminate samples where either target or feature is NaN
ics = find(~isnan(tv) & ~isnan(fv));
tv = tv(ics);
fv = fv(ics);

fVals = unique(fv);
fVals(isnan(fVals)) = [];

n = length(tv);
assert(n == length(fv));

% Logical branch membership masks over the filtered samples
ics_left = false(1,n);
ics_right = true(1,n);

splitValues_left = [];
splitValues_right = fVals;

DI_best = 0;

while true

    splitVal = -1;

    % Try moving each remaining right-branch category to the left branch
    for i = 1:length(splitValues_right)

        fVal = splitValues_right(i);

        ics_left_test = ics_left | fv == fVal;
        ics_right_test = ics_right & fv ~= fVal;

        DI = deltaImpurity(tv(ics_left_test),tv(ics_right_test),isTargetNumerical);

        % BUGFIX: the old condition used sum(~isnan(mask)), but the masks are
        % logical arrays that can never contain NaN, so the sum always equaled
        % n and the minSamples constraint was never enforced. Count the
        % samples actually assigned to each branch instead. Both branches now
        % use >= (the old code inconsistently used > on the right branch).
        if DI > DI_best && sum(ics_left_test) >= minSamples && sum(ics_right_test) >= minSamples
            DI_best = DI;
            splitVal = fVal;
        end

    end

    % No category improved the gain under the constraints: stop
    if splitVal == -1
        break;
    end

    % Commit the best move of this round
    splitValues_left = unique([splitValues_left,splitVal]);
    splitValues_right = setdiff(splitValues_right,splitVal);

    ics_left = ics_left | fv == splitVal;
    ics_right = ics_right & fv ~= splitVal;

end

% Convert membership masks to index lists (into the filtered vectors)
ics_left = find(ics_left);
ics_right = find(ics_right);

DI = DI_best;
deltaImpurity(x,idx) 3 | % 4 | %Returns decrease in impurity when data x is split into two 5 | %halves, "x_left" and "x_right". Type of data is indicated by 6 | % isNumerical flag 7 | 8 | assert(~any(isnan(x_left))); 9 | assert(~any(isnan(x_right))); 10 | 11 | % Calculate the decrease using the variance formula (slow+unstable) 12 | if isNumerical 13 | 14 | DI = deltaImpurity_var_regr(x_left,x_right); 15 | 16 | %Calculate the decrease using the mean formulat (fast+stable) 17 | DI_test = deltaImpurity_mean_regr(x_left,x_right); 18 | 19 | else 20 | 21 | DI = deltaImpurity_gi_class(x_left,x_right); 22 | 23 | DI_test = deltaImpurity_sf_class(SF([x_left(:);x_right(:)]),length(x_left) + length(x_right),SF(x_left),length(x_left),SF(x_right),length(x_right)); 24 | 25 | end 26 | 27 | %Make sure the two measures agree 28 | if any(isnan([DI,DI_test])) 29 | assert(isnan(DI) && isnan(DI_test), 'error: only the other impurity function yields NaN'); 30 | else 31 | assert( abs(DI - DI_test ) < 1e-3, 'error: impurity functions disagree in value'); 32 | end 33 | 34 | 35 | function DI = deltaImpurity_mean_regr(x_left,x_right) 36 | 37 | x = [x_left(:);x_right(:)]; 38 | 39 | mu = mean(x); 40 | n = length(x); 41 | muL = mean(x_left); 42 | nL = length(x_left); 43 | muR = mean(x_right); 44 | nR = length(x_right); 45 | 46 | DI = -mu^2 + nL/n*muL^2 + nR/n*muR^2; 47 | 48 | 49 | function DI = deltaImpurity_var_regr(x_left,x_right) 50 | 51 | x = [x_left(:);x_right(:)]; 52 | n = length(x); 53 | nL = length(x_left); 54 | nR = length(x_right); 55 | 56 | DI = var(x,1) - nL/n*var(x_left,1) - nR/n*var(x_right,1); 57 | 58 | 59 | function DI = deltaImpurity_gi_class(x_left,x_right) 60 | 61 | x = [x_left(:);x_right(:)]; 62 | n = length(x); 63 | nL = length(x_left); 64 | nR = length(x_right); 65 | 66 | DI = giniIndex(x) - nL/n*giniIndex(x_left) - nR/n*giniIndex(x_right); 67 | 68 | function DI = deltaImpurity_sf_class(sf_tot,n_tot,sf_left,n_left,sf_right,n_right) 69 | 70 | DI = -sf_tot/(n_tot*n_tot) + 
sf_left/(n_tot*n_left) + sf_right / (n_tot*n_right); 71 | 72 | function sf = SF(x) 73 | x = x+1; 74 | sf = sum(hist(x,unique(x)).^2); 75 | 76 | function GI = giniIndex(x) 77 | 78 | GI = hist(x,unique(x))/length(x); 79 | if ~isempty(GI) 80 | GI = 1 - sum(GI.^2); 81 | else 82 | GI = 0; 83 | end 84 | 85 | -------------------------------------------------------------------------------- /test/matlab/numericalFeatureSplit.m: -------------------------------------------------------------------------------- 1 | function [DI,splitValue,ics_left,ics_right] = numericalFeatureSplit(tv,fv,minSplit,isTargetNumerical) 2 | 3 | %eliminate NaNs 4 | ics = find(~isnan(tv) & ~isnan(fv)); 5 | tv = tv(ics); 6 | fv = fv(ics); 7 | 8 | [fv,T] = sort(fv,'ascend'); 9 | 10 | tv = tv(T); 11 | ics = ics(T); 12 | 13 | n = length(tv); 14 | assert(n == length(fv)); 15 | 16 | DIvec = zeros(1,n); 17 | 18 | for i = minSplit:(n-minSplit) 19 | if fv(i) == fv(i+1), continue, end; 20 | DIvec(i) = deltaImpurity(tv(1:i),tv(i+1:end),isTargetNumerical); 21 | end 22 | 23 | [DI,idx] = max(DIvec); 24 | 25 | splitValue = fv(idx); 26 | 27 | ics_left = ics(1:idx); 28 | ics_right = ics(idx+1:end); -------------------------------------------------------------------------------- /test/matlab/readAFM.m: -------------------------------------------------------------------------------- 1 | function [X,rowHeaders,colHeaders] = readAFM(afmFile) 2 | 3 | S = importdata(afmFile); 4 | 5 | X = S.data; 6 | rowHeaders = S.textdata(2:end,1); 7 | colHeaders = S.textdata(1,2:end); 8 | 9 | [nRows,nCols] = size(X); 10 | 11 | fprintf('%i rows and %i columns read\n',nRows,nCols); 12 | 13 | assert(numel(rowHeaders) == nRows, 'error: row count mismatch\n'); 14 | assert(numel(colHeaders) == nCols, 'error: columns count mismatch\n'); 15 | 16 | -------------------------------------------------------------------------------- /test/matlab/writeAFM.m: -------------------------------------------------------------------------------- 1 | function 
writeAFM(X,featureHeaders,sampleHeaders,fileName) 2 | 3 | fid = fopen(fileName,'w'); 4 | 5 | [f,n] = size(X); 6 | 7 | if f > 0 8 | assert( length(featureHeaders) == f ); 9 | else 10 | f = length(featureHeaders); 11 | end 12 | 13 | if isempty(sampleHeaders) 14 | for i = 1:n 15 | fprintf(fid,'\t%s',['S',num2str(i)]); 16 | end 17 | else 18 | assert( length(sampleHeaders) == n ); 19 | for i = 1:n 20 | fprintf(fid,'\t%s',sampleHeaders{i}); 21 | end 22 | end 23 | 24 | fprintf(fid,'\n'); 25 | 26 | for i = 1:f 27 | fprintf(fid,'%s',featureHeaders{i}); 28 | 29 | if n > 0 30 | if strcmp(featureHeaders{i}(1:2),'N:') 31 | fmt = repmat('\t%6.3f',[1,n]); 32 | else 33 | fmt = repmat('\t%i',[1,n]); 34 | end 35 | 36 | fprintf(fid,fmt,X(i,:)); 37 | 38 | end 39 | 40 | fprintf(fid,'\n'); 41 | end 42 | 43 | fclose(fid); -------------------------------------------------------------------------------- /test/newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NEWTEST_HPP 2 | #define NEWTEST_HPP 3 | 4 | #define TEST__ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | std::stringstream ERRLOG; 12 | 13 | size_t N_SUCCESS = 0; 14 | size_t N_FAIL = 0; 15 | 16 | #define newassert(condition) { if(!(condition)){ ERRLOG << " => FAIL: " << #condition << " @ " << __FILE__ << " (" << __LINE__ << ")" << std::endl; N_FAIL++; } else { N_SUCCESS++; } } 17 | 18 | void printERRLOG() { 19 | std::string errLine; 20 | while( std::getline(ERRLOG,errLine) ) { 21 | std::cerr << errLine << std::endl; 22 | } 23 | ERRLOG.clear(); 24 | } 25 | 26 | void rewindERRLOG() { 27 | ERRLOG.seekg(std::ios_base::beg); 28 | ERRLOG.clear(); 29 | } 30 | 31 | void newtestinit() { 32 | 33 | // Reset counters 34 | N_SUCCESS = 0; 35 | N_FAIL = 0; 36 | 37 | // Clear error stream 38 | ERRLOG.clear(); 39 | ERRLOG.str(""); 40 | 41 | std::cout << std::endl << "UNIT TESTING STARTED" << std::endl; 42 | 43 | } 44 | 45 | void newtest(const std::string& info, void 
(*testFunc)(void) ) { 46 | 47 | size_t nOldSuccess = N_SUCCESS; 48 | size_t nOldFail = N_FAIL; 49 | size_t nOldTests = N_SUCCESS + N_FAIL; 50 | 51 | std::cout << " TEST: " << info << "..." << std::flush; 52 | testFunc(); 53 | std::cout << " " << N_SUCCESS - nOldSuccess << "/" << N_SUCCESS + N_FAIL - nOldTests << " OK " << std::flush; 54 | 55 | if ( N_FAIL > nOldFail ) { 56 | std::cout << " !! " << N_FAIL - nOldFail << " FAILURES !! " << std::flush; 57 | } 58 | 59 | std::cout << std::endl; 60 | 61 | printERRLOG(); 62 | 63 | } 64 | 65 | void newtestdone() { 66 | 67 | std::cout << std::endl << "ALL DONE! " << N_SUCCESS + N_FAIL << " tests run: " << N_SUCCESS << " successes and " << N_FAIL << " failures" << std::endl << std::endl; 68 | rewindERRLOG(); 69 | printERRLOG(); 70 | 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /test/node_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NODE_NEWTEST_HPP 2 | #define NODE_NEWTEST_HPP 3 | 4 | #include 5 | 6 | #include "newtest.hpp" 7 | #include "node.hpp" 8 | #include "datadefs.hpp" 9 | 10 | using namespace std; 11 | using datadefs::num_t; 12 | 13 | void node_newtest_getChildLeaves(); 14 | void node_newtest_setSplitter(); 15 | void node_newtest_percolateData(); 16 | void node_newtest_getLeafTrainPrediction(); 17 | void node_newtest_hasChildren(); 18 | void node_newtest_recursiveNodeSplit(); 19 | void node_newtest_cleanPairVectorFromNANs(); 20 | void node_newtest_recursiveNDescendantNodes(); 21 | void node_newtest_regularSplitterSeek(); 22 | 23 | void node_newtest() { 24 | 25 | newtest( "getChildLeaves(x)", &node_newtest_getChildLeaves ); 26 | newtest( "setSplitter(x)", &node_newtest_setSplitter ); 27 | newtest( "percolateData(x)", &node_newtest_percolateData ); 28 | newtest( "getLeafTrainPrediction(x)", &node_newtest_getLeafTrainPrediction ); 29 | newtest( "hasChildren(x)", &node_newtest_hasChildren ); 30 | 
newtest( "recursiveNodeSplit(x)", &node_newtest_recursiveNodeSplit ); 31 | newtest( "cleanPairVectorFromNANs(x)", &node_newtest_cleanPairVectorFromNANs ); 32 | newtest( "recursiveNDescendantNodes(x)", &node_newtest_recursiveNDescendantNodes ); 33 | newtest( "regularSplitterSeek(x)", &node_newtest_regularSplitterSeek ); 34 | 35 | 36 | } 37 | 38 | void node_newtest_getChildLeaves() { 39 | 40 | Node node,nodeL,nodeR,nodeM,nodeLL,nodeLR; 41 | 42 | node.setSplitter(0.0,"foo",static_cast(5.0),nodeL,nodeR); 43 | nodeL.setSplitter(0.0,"bar",static_cast(6.0),nodeLL,nodeLR); 44 | node.missingChild_ = &nodeM; 45 | 46 | nodeR.setNumTrainPrediction(1.3); 47 | nodeM.setNumTrainPrediction(2.2); 48 | 49 | nodeLL.setNumTrainPrediction(1.1); 50 | nodeLR.setNumTrainPrediction(1.2); 51 | 52 | nodeLL.setNumTrainData({1,2,3}); 53 | nodeLR.setNumTrainData({4,5}); 54 | nodeR.setNumTrainData({6}); 55 | nodeM.setNumTrainData({7}); 56 | 57 | vector childLeaves = node.getSubTreeLeaves(); 58 | 59 | set childLeavesSet(childLeaves.begin(),childLeaves.end()); 60 | 61 | newassert( childLeaves.size() == 4 ); 62 | newassert( childLeavesSet.size() == 4 ); 63 | newassert( childLeavesSet.find(&nodeLL) != childLeavesSet.end() ); 64 | newassert( childLeavesSet.find(&nodeLR) != childLeavesSet.end() ); 65 | newassert( childLeavesSet.find(&nodeR) != childLeavesSet.end() ); 66 | newassert( childLeavesSet.find(&nodeM) != childLeavesSet.end() ); 67 | 68 | childLeaves = nodeR.getSubTreeLeaves(); 69 | newassert( childLeaves.size() == 1 ); 70 | newassert( childLeaves[0] == &nodeR ); 71 | 72 | childLeaves = nodeM.getSubTreeLeaves(); 73 | newassert( childLeaves.size() == 1 ); 74 | newassert( childLeaves[0] == &nodeM ); 75 | 76 | childLeaves = nodeL.getSubTreeLeaves(); 77 | newassert( childLeaves.size() == 2 ); 78 | childLeavesSet = set(childLeaves.begin(),childLeaves.end()); 79 | newassert( childLeavesSet.find(&nodeLL) != childLeavesSet.end() ); 80 | newassert( childLeavesSet.find(&nodeLR) != childLeavesSet.end() 
); 81 | 82 | vector trainData = nodeLL.getPrediction().numTrainData; 83 | set trainDataSet(trainData.begin(),trainData.end()); 84 | 85 | newassert( trainDataSet.find(1) != trainDataSet.end() ); 86 | newassert( trainDataSet.find(2) != trainDataSet.end() ); 87 | newassert( trainDataSet.find(3) != trainDataSet.end() ); 88 | 89 | } 90 | 91 | 92 | void node_newtest_setSplitter() { 93 | 94 | //size_t splitterIdx = 3; 95 | datadefs::num_t splitLeftLeqValue = 0.5; 96 | //datadefs::num_t leftFraction = 0.5; 97 | 98 | //Splitter::Splitter splitter(0.5); 99 | 100 | Node node,leftChild,rightChild; 101 | 102 | node.setSplitter(0.0,"foo",splitLeftLeqValue,leftChild,rightChild); 103 | 104 | //newassert( node.splitterIdx() == splitterIdx ); 105 | newassert( node.splitter_.type == Feature::Type::NUM ); 106 | newassert( fabs(node.splitter_.leftLeqValue - splitLeftLeqValue) < datadefs::EPS ); 107 | //newassert( fabs(node.splitter_.leftFraction - leftFraction) < datadefs::EPS ); 108 | 109 | } 110 | 111 | void node_newtest_percolateData() { 112 | 113 | DenseTreeData treeData("test_2by10_text_matrix.afm",'\t',':'); 114 | 115 | uint32_t h; 116 | 117 | MurmurHash3_x86_32("c",1,0,&h); 118 | 119 | Node node,leftChild,rightChild; 120 | 121 | node.setSplitter(0.0,"T:in",h,leftChild,rightChild); 122 | 123 | newassert( &leftChild == node.leftChild() ); 124 | newassert( &rightChild == node.rightChild() ); 125 | 126 | newassert( NULL == node.missingChild() ); 127 | 128 | newassert( node.percolate(&treeData,0,1) == &rightChild ); 129 | newassert( node.percolate(&treeData,1,1) == &rightChild ); 130 | newassert( node.percolate(&treeData,2,1) == &rightChild ); 131 | newassert( node.percolate(&treeData,3,1) == &rightChild ); 132 | newassert( node.percolate(&treeData,4,1) == &rightChild ); 133 | newassert( node.percolate(&treeData,5,1) == &leftChild ); 134 | newassert( node.percolate(&treeData,6,1) == &leftChild ); 135 | newassert( node.percolate(&treeData,7,1) == &leftChild ); 136 | newassert( 
node.percolate(&treeData,8,1) == &leftChild ); 137 | newassert( node.percolate(&treeData,9,1) == &leftChild ); 138 | newassert( node.percolate(&treeData,10,1) == &leftChild ); 139 | newassert( node.percolate(&treeData,11,1) == &leftChild ); 140 | newassert( node.percolate(&treeData,12,1) == &leftChild ); 141 | newassert( node.percolate(&treeData,13,1) == &leftChild ); 142 | newassert( node.percolate(&treeData,14,1) == &leftChild ); 143 | newassert( node.percolate(&treeData,15,1) == &rightChild ); 144 | newassert( node.percolate(&treeData,16,1) == &rightChild ); 145 | newassert( node.percolate(&treeData,17,1) == &rightChild ); 146 | newassert( node.percolate(&treeData,18,1) == &rightChild ); 147 | newassert( node.percolate(&treeData,19,1) == &rightChild ); 148 | 149 | 150 | } 151 | 152 | void node_newtest_regularSplitterSeek() { 153 | 154 | } 155 | 156 | void node_newtest_getLeafTrainPrediction() { 157 | } 158 | 159 | void node_newtest_hasChildren() { 160 | } 161 | 162 | void node_newtest_recursiveNodeSplit() { 163 | } 164 | 165 | void node_newtest_cleanPairVectorFromNANs() { 166 | 167 | } 168 | 169 | void node_newtest_recursiveNDescendantNodes() { 170 | 171 | } 172 | 173 | #endif 174 | -------------------------------------------------------------------------------- /test/python/ampute.py: -------------------------------------------------------------------------------- 1 | import csv,sys,random 2 | 3 | afmFileIn = sys.argv[1] 4 | afmFileOut = sys.argv[3] 5 | pMissing = float(sys.argv[2]) 6 | 7 | assert afmFileIn != afmFileOut 8 | assert 0 < pMissing < 1 9 | 10 | afmReader = csv.reader(open(afmFileIn,'r'),delimiter='\t') 11 | afmWriter = csv.writer(open(afmFileOut,'w'),delimiter='\t') 12 | 13 | afmWriter.writerow(afmReader.next()) 14 | 15 | for inputLine in afmReader: 16 | 17 | afmWriter.writerow( [inputLine[0]] + [ "NA" if random.uniform(0,1) < pMissing else x for x in inputLine[1:] ] ) 18 | 
-------------------------------------------------------------------------------- /test/python/deltaImpurity.py: -------------------------------------------------------------------------------- 1 | """ 2 | % Caculate the decrease using the variance formula (slow+unstable) 3 | if isNumerical 4 | DI = deltaImpurity_var_regr(x_left,x_right); 5 | 6 | %Calculate the decrease using the mean formulat (fast+stable) 7 | DI_test = deltaImpurity_mean_regr(x_left,x_right); 8 | 9 | %Make sure the two measures agree 10 | assert( abs(DI - DI_test ) < 1e-5, 'error: impurity functions disagree'); 11 | else 12 | 13 | DI = deltaImpurity_class(x_left,x_right); 14 | 15 | end 16 | 17 | function DI = deltaImpurity_mean_regr(x_left,x_right) 18 | 19 | x = [x_left(:);x_right(:)]; 20 | 21 | mu = mean(x); 22 | n = length(x); 23 | muL = mean(x_left); 24 | nL = length(x_left); 25 | muR = mean(x_right); 26 | nR = length(x_right); 27 | 28 | DI = -mu^2 + nL/n*muL^2 + nR/n*muR^2; 29 | 30 | function DI = deltaImpurity_var_regr(x_left,x_right) 31 | 32 | x = [x_left(:);x_right(:)]; 33 | n = length(x); 34 | nL = length(x_left); 35 | nR = length(x_right); 36 | DI = var(x,1) - nL/n*var(x_left,1) - nR/n*var(x_right,1); 37 | 38 | function DI = deltaImpurity_class(x_left,x_right) 39 | x = [x_left(:);x_right(:)]; 40 | n = length(x); 41 | nL = length(x_left); 42 | nR = length(x_right); 43 | DI = giniIndex(x) - nL/n*giniIndex(x_left) - nR/n*giniIndex(x_right); 44 | 45 | function GI = giniIndex(x) 46 | GI = hist(x,unique(x))/length(x); 47 | if ~isempty(GI) 48 | GI = 1 - sum(GI.^2); 49 | else 50 | GI = 0; 51 | end 52 | """ 53 | 54 | import sys 55 | import getopt 56 | import numpy 57 | 58 | def myHist(list): 59 | dic = {} 60 | for l in list: 61 | print l 62 | if (dic.get(l)): 63 | dic[l] = dic[l] + 1 64 | else: 65 | dic[l] = 1 66 | return dic 67 | 68 | def giniIndex(x): 69 | print "Begin giniIndex" 70 | print x 71 | L = len(x) 72 | sorted_x = sorted(x) 73 | hist = myHist(sorted_x) 74 | """ 75 | numeric_sx = [] 76 
| for v in sorted_x: 77 | numeric_sx.append(float(v)) 78 | print numeric_sx 79 | myset = set(x) 80 | y = numpy.cumsum(numeric_sx) 81 | B = sum(y) / (y[-1] * L) 82 | return 1 + 1./L - 2*B 83 | """ 84 | print hist.keys() 85 | GI = 0.0 86 | for v in hist.values(): 87 | GI = GI + pow(float(v)/float(L), 2) 88 | return 1 - GI; 89 | 90 | """ 91 | function DI = deltaImpurity_mean_regr(x_left,x_right) 92 | 93 | x = [x_left(:);x_right(:)]; 94 | 95 | mu = mean(x); 96 | n = length(x); 97 | muL = mean(x_left); 98 | nL = length(x_left); 99 | muR = mean(x_right); 100 | nR = length(x_right); 101 | 102 | DI = -mu^2 + nL/n*muL^2 + nR/n*muR^2; 103 | """ 104 | 105 | def diIndex(x,y): 106 | x = [float(v) for v in x] 107 | y = [float(v) for v in y] 108 | w = x + y 109 | 110 | L = len(w) 111 | xL = len(x) 112 | yL = len(y) 113 | mw = numpy.mean(w) 114 | mx = numpy.mean(x) 115 | my = numpy.mean(y) 116 | return (-1.0)*pow(mw,2) + float(xL)/L*(pow(mx,2)) + float(yL)/L*(pow(my,2)) 117 | 118 | def main(): 119 | try: 120 | opts, args = getopt.getopt(sys.argv[1:], "h", ["help"]) 121 | except getopt.error, msg: 122 | print msg 123 | print "for help use --help" 124 | sys.exit(2) 125 | for o, a in opts: 126 | if o in ("-h", "--help"): 127 | print __doc__ 128 | sys.exit(0) 129 | #for arg in args: 130 | #process(arg) # process() is defined elsewhere 131 | l = args[0].split(",") 132 | r = args[1].split(",") 133 | w = l + r 134 | left = giniIndex(l) 135 | right = giniIndex(r) 136 | print left 137 | print right 138 | wgi = giniIndex(w) - (float(len(l))/float(len(w))*left + float(len(r))/float(len(w))*right ) 139 | print wgi 140 | 141 | print diIndex(l,r) 142 | 143 | if __name__ == "__main__": 144 | main() 145 | 146 | -------------------------------------------------------------------------------- /test/reader_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef READER_NEWTEST_HPP 2 | #define READER_NEWTEST_HPP 3 | 4 | #include "newtest.hpp" 5 | 
#include "reader.hpp" 6 | #include "datadefs.hpp" 7 | #include "treedata.hpp" 8 | 9 | using namespace std; 10 | using datadefs::num_t; 11 | 12 | void reader_newtest_readAFM(); 13 | 14 | void reader_newtest() { 15 | 16 | newtest( "Testing Reader class with AFM data", &reader_newtest_readAFM ); 17 | 18 | } 19 | 20 | void reader_newtest_readAFM() { 21 | 22 | Reader reader("test/data/3by8_mixed_NA_matrix.afm",'\t'); 23 | 24 | newassert( reader.nLines() == 4 ); 25 | 26 | size_t nSamples = reader.nLines() - 1; 27 | 28 | vector features; 29 | 30 | // Removing top-left corner from table having column and row headers 31 | reader.nextLine(); 32 | reader.skipField(); 33 | 34 | size_t nVars = 0; 35 | 36 | // Check that all variable names are valid 37 | for ( ; ! reader.endOfLine(); ++nVars ) { 38 | string varName; reader >> varName; 39 | if ( varName.substr(0,2) == "N:" ) { 40 | features.push_back( Feature(Feature::Type::NUM,varName,nSamples) ); 41 | } else if ( varName.substr(0,2) == "C:" ) { 42 | features.push_back( Feature(Feature::Type::CAT,varName,nSamples) ); 43 | } else if ( varName.substr(0,2) == "T:" ) { 44 | features.push_back( Feature(Feature::Type::TXT,varName,nSamples) ); 45 | } else { 46 | newassert( false ); 47 | } 48 | } 49 | 50 | newassert( nVars == 8 ); 51 | newassert( features.size() == 8 ); 52 | 53 | // We should have reached end of the first line 54 | newassert( reader.endOfLine() ); 55 | 56 | // Get the next line and start reading... 57 | reader.nextLine(); 58 | string field; 59 | reader >> field; newassert( field == "s0" ); 60 | reader >> field; newassert( field == "NA" ); 61 | reader >> field; newassert( field == "foo" ); 62 | reader >> field; newassert( field == "2.2" ); 63 | reader >> field; newassert( field == "3.3" ); 64 | reader >> field; newassert( field == "4.4" ); 65 | reader >> field; newassert( field == "5.5" ); 66 | reader >> field; newassert( field == "6.6" ); 67 | reader >> field; newassert( field == "Ah, be so good. Yes, no?" 
); 68 | 69 | // Make sure that we reached end of line again 70 | newassert( reader.endOfLine() ); 71 | 72 | // Go to the start of file and get first line 73 | reader.rewind(); 74 | reader.nextLine(); 75 | 76 | vector sampleNames(nSamples); 77 | 78 | // Go through lines 2,3,... 79 | size_t i; 80 | for ( i = 0; reader.nextLine(); ++i ) { 81 | //reader.nextLine(); 82 | // Sample name is the first field of the line 83 | reader >> sampleNames[i]; 84 | for ( size_t j = 0; j < nVars; ++j ) { 85 | if ( features[j].isNumerical() ) { 86 | num_t val; reader >> val; 87 | features[j].setNumSampleValue(i,val); 88 | } else if ( features[j].isCategorical() ) { 89 | string str; reader >> str; 90 | features[j].setCatSampleValue(i,str); 91 | } else if ( features[j].isTextual() ) { 92 | string str; reader >> str; 93 | features[j].setTxtSampleValue(i,str); 94 | } 95 | } 96 | // By now, we should have reached end of line 97 | newassert( reader.endOfLine() ); 98 | } 99 | 100 | newassert( i == nSamples ); 101 | 102 | // Did we recover the correct sample names from the file 103 | newassert( sampleNames[0] == "s0" ); 104 | newassert( sampleNames[1] == "s1" ); 105 | newassert( sampleNames[2] == "s2" ); 106 | 107 | // Rewind again to the start, and start reading from line 2 108 | reader.rewind(); 109 | reader.nextLine(); 110 | reader.nextLine(); 111 | 112 | // Variables for storing all data on line 2 113 | string s0; 114 | num_t v1,v3,v4,v5,v6,v7; 115 | cat_t v2,v8; 116 | 117 | // Read the 2nd line in one pass 118 | reader >> s0 >> v1 >> v2 >> v3 >> v4 >> v5 >> v6 >> v7 >> v8; 119 | 120 | // Again, end of line should have been reached 121 | newassert( reader.endOfLine() ); 122 | 123 | // Make sure the content of the 2nd line is as expected 124 | newassert( s0 == "s0" ); 125 | newassert( datadefs::isNAN(v1) ); 126 | newassert( v2 == "foo" ); 127 | newassert( fabs( v3 - 2.2 ) < 1e-5 ); 128 | newassert( fabs( v4 - 3.3 ) < 1e-5 ); 129 | newassert( fabs( v5 - 4.4 ) < 1e-5 ); 130 | newassert( fabs( 
v6 - 5.5 ) < 1e-5 ); 131 | newassert( fabs( v7 - 6.6 ) < 1e-5 ); 132 | newassert( v8 == "Ah, be so good. Yes, no?" ); 133 | 134 | // Go back to the beginning 135 | reader.rewind(); 136 | 137 | // While reading the whole file line by line till the end, we should 138 | // not reach end of line nor end of file, since we have the last line 139 | // stored in the linefeed... 140 | for ( size_t i = 0; i < reader.nLines(); ++i ) { 141 | reader.nextLine(); 142 | newassert( ! reader.endOfLine() ); 143 | //newassert( ! reader.endOfFile() ); 144 | } 145 | 146 | // ... that means that we can then read the last line, field by field, 147 | // into string variables 148 | for ( size_t i = 0; i < nVars + 1; ++i ) { 149 | newassert( ! reader.endOfLine() ); 150 | string field; reader >> field; 151 | } 152 | 153 | // After we are done reading the last line, we should have reached end of line 154 | // and end of file, meaning that we can't extract the next line since there is no 155 | // next line 156 | newassert( reader.endOfLine() ); 157 | newassert( ! 
reader.nextLine() ); 158 | 159 | } 160 | 161 | #endif 162 | -------------------------------------------------------------------------------- /test/rface_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RFACE_NEWTEST_HPP 2 | #define RFACE_NEWTEST_HPP 3 | 4 | #include 5 | #include 6 | #include "options.hpp" 7 | #include "densetreedata.hpp" 8 | #include "rf_ace.hpp" 9 | #include "newtest.hpp" 10 | 11 | using namespace std; 12 | using datadefs::num_t; 13 | 14 | void rface_newtest_RF_train_test_classification(); 15 | void rface_newtest_RF_train_test_regression(); 16 | void rface_newtest_QRF_train_test_regression(); 17 | void rface_newtest_GBT_train_test_classification(); 18 | void rface_newtest_GBT_train_test_regression(); 19 | void rface_newtest_RF_save_load_classification(); 20 | void rface_newtest_RF_save_load_regression(); 21 | void rface_newtest_QRF_save_load_regression(); 22 | void rface_newtest_GBT_save_load_classification(); 23 | void rface_newtest_GBT_save_load_regression(); 24 | 25 | void rface_newtest() { 26 | 27 | newtest( "RF for classification", &rface_newtest_RF_train_test_classification ); 28 | newtest( "RF for regression", &rface_newtest_RF_train_test_regression ); 29 | newtest( "QRF for regression", &rface_newtest_QRF_train_test_regression ); 30 | //newtest( "Testing GBT for classification", &rface_newtest_GBT_train_test_classification ); 31 | //newtest( "Testing GBT for regression", &rface_newtest_GBT_train_test_regression ); 32 | newtest( "save/load RF for classification", &rface_newtest_RF_save_load_classification ); 33 | newtest( "save/load RF for regression", &rface_newtest_RF_save_load_regression ); 34 | newtest( "save/load QRF for regression", &rface_newtest_QRF_save_load_regression ); 35 | //newtest( "Testing save/load GBT for classification", &rface_newtest_GBT_save_load_classification ); 36 | //newtest( "Testing save/load GBT for regression", &rface_newtest_GBT_save_load_regression ); 
37 | 38 | } 39 | 40 | RFACE::TestOutput make_predictions(ForestOptions& forestOptions, const string& targetStr) { 41 | 42 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 43 | DenseTreeData trainData(fileName,'\t',':',false); 44 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 45 | vector weights = trainData.getFeatureWeights(); 46 | weights[targetIdx] = 0; 47 | 48 | RFACE rface; 49 | 50 | rface.train(&trainData,targetIdx,weights,&forestOptions); 51 | 52 | return( rface.test(&trainData) ); 53 | 54 | } 55 | 56 | RFACE::QRFPredictionOutput make_quantile_predictions(ForestOptions& forestOptions, const string& targetStr) { 57 | 58 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 59 | DenseTreeData trainData(fileName,'\t',':',false); 60 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 61 | vector weights = trainData.getFeatureWeights(); 62 | weights[targetIdx] = 0; 63 | 64 | RFACE rface; 65 | 66 | rface.train(&trainData,targetIdx,weights,&forestOptions); 67 | 68 | return( rface.predictQRF(&trainData,forestOptions) ); 69 | 70 | } 71 | 72 | 73 | 74 | RFACE::TestOutput make_save_load_predictions(ForestOptions& forestOptions, const string& targetStr) { 75 | 76 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 77 | DenseTreeData trainData(fileName,'\t',':',false); 78 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 79 | vector weights = trainData.getFeatureWeights(); 80 | weights[targetIdx] = 0; 81 | 82 | RFACE rface; 83 | 84 | rface.train(&trainData,targetIdx,weights,&forestOptions); 85 | 86 | rface.save("foo.sf"); 87 | 88 | RFACE rface2; 89 | 90 | rface2.load("foo.sf"); 91 | 92 | return( rface2.test(&trainData) ); 93 | 94 | } 95 | 96 | RFACE::QRFPredictionOutput make_save_load_quantile_predictions(ForestOptions& forestOptions, const string& targetStr) { 97 | 98 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 99 | DenseTreeData trainData(fileName,'\t',':',false); 100 | size_t targetIdx = 
trainData.getFeatureIdx(targetStr); 101 | vector weights = trainData.getFeatureWeights(); 102 | weights[targetIdx] = 0; 103 | 104 | RFACE rface; 105 | 106 | rface.train(&trainData,targetIdx,weights,&forestOptions); 107 | 108 | rface.save("foo.sf"); 109 | 110 | RFACE rface2; 111 | 112 | rface2.load("foo.sf"); 113 | 114 | return( rface2.predictQRF(&trainData,forestOptions) ); 115 | 116 | } 117 | 118 | 119 | num_t classification_error(const RFACE::TestOutput& predictions) { 120 | 121 | num_t pError = 0.0; 122 | num_t n = static_cast(predictions.catPredictions.size()); 123 | for ( size_t i = 0; i < predictions.catPredictions.size(); ++i ) { 124 | pError += (predictions.catPredictions[i] != predictions.catTrueData[i]) / n; 125 | } 126 | 127 | return(pError); 128 | 129 | } 130 | 131 | num_t regression_error(const RFACE::TestOutput& predictions) { 132 | 133 | num_t RMSE = 0.0; 134 | num_t n = static_cast(predictions.numPredictions.size()); 135 | for ( size_t i = 0; i < predictions.numPredictions.size(); ++i ) { 136 | num_t e = predictions.numPredictions[i] - predictions.numTrueData[i]; 137 | RMSE += powf(e,2)/n; 138 | } 139 | 140 | return( sqrt(RMSE) ); 141 | 142 | } 143 | 144 | vector quantile_regression_error(const RFACE::QRFPredictionOutput& qPredOut) { 145 | 146 | vector QDEV(qPredOut.quantiles.size(),0.0); 147 | num_t n = static_cast(qPredOut.numPredictions.size()); 148 | 149 | for ( size_t q = 0; q < qPredOut.quantiles.size(); ++q ) { 150 | for ( size_t i = 0; i < qPredOut.numPredictions.size(); ++i ) { 151 | bool b = qPredOut.trueNumData[i] < qPredOut.numPredictions[i][q]; 152 | QDEV[q] += b/n; 153 | } 154 | QDEV[q] = fabs(QDEV[q] - qPredOut.quantiles[q]); 155 | } 156 | 157 | return(QDEV); 158 | 159 | } 160 | 161 | void rface_newtest_RF_train_test_classification() { 162 | 163 | ForestOptions forestOptions(forest_t::QRF); 164 | forestOptions.mTry = 30; 165 | 166 | num_t pError = classification_error( make_predictions(forestOptions,"C:class") ); 167 | 168 | 
newassert(pError < 0.2); 169 | 170 | } 171 | 172 | void rface_newtest_RF_train_test_regression() { 173 | 174 | ForestOptions forestOptions(forest_t::QRF); 175 | forestOptions.mTry = 30; 176 | 177 | num_t RMSE = regression_error( make_predictions(forestOptions,"N:output") ); 178 | 179 | newassert(RMSE < 1.0); 180 | 181 | } 182 | 183 | void rface_newtest_QRF_train_test_regression() { 184 | 185 | ForestOptions forestOptions(forest_t::QRF); 186 | forestOptions.mTry = 30; 187 | forestOptions.quantiles = {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9}; 188 | forestOptions.nSamplesForQuantiles = 100; 189 | 190 | vector QDEV = quantile_regression_error( make_quantile_predictions(forestOptions,"N:output") ); 191 | 192 | //utils::write(cout,QDEV.begin(),QDEV.end()); 193 | 194 | newassert( math::mean(QDEV) < 0.20 ); 195 | 196 | } 197 | 198 | void rface_newtest_GBT_train_test_classification() { 199 | 200 | ForestOptions forestOptions(forest_t::GBT); 201 | 202 | num_t pError = classification_error( make_predictions(forestOptions,"C:class") ); 203 | 204 | newassert( pError < 0.2 ); 205 | 206 | } 207 | 208 | void rface_newtest_GBT_train_test_regression() { 209 | 210 | ForestOptions forestOptions(forest_t::GBT); 211 | 212 | num_t RMSE = regression_error( make_predictions(forestOptions,"N:output") ); 213 | 214 | newassert(RMSE < 1.0); 215 | 216 | } 217 | 218 | void rface_newtest_RF_save_load_classification() { 219 | 220 | ForestOptions forestOptions(forest_t::QRF); 221 | forestOptions.mTry = 30; 222 | 223 | num_t pError1 = classification_error( make_predictions(forestOptions,"C:class") ); 224 | num_t pError2 = classification_error( make_save_load_predictions(forestOptions,"C:class") ); 225 | 226 | newassert( fabs(pError1 - pError2) < 1e-1 ); 227 | 228 | } 229 | 230 | void rface_newtest_RF_save_load_regression() { 231 | 232 | ForestOptions forestOptions(forest_t::QRF); 233 | forestOptions.mTry = 30; 234 | 235 | num_t RMSE1 = classification_error( make_predictions(forestOptions,"N:output") ); 
236 | num_t RMSE2 = classification_error( make_save_load_predictions(forestOptions,"N:output") ); 237 | 238 | newassert( fabs(RMSE1 - RMSE2) < 1e-1 ); 239 | 240 | } 241 | 242 | void rface_newtest_QRF_save_load_regression() { 243 | 244 | ForestOptions forestOptions(forest_t::QRF); 245 | forestOptions.mTry = 30; 246 | forestOptions.quantiles = {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9}; 247 | forestOptions.nSamplesForQuantiles = 100; 248 | 249 | vector QRMSE1 = quantile_regression_error( make_quantile_predictions(forestOptions,"N:output") ); 250 | vector QRMSE2 = quantile_regression_error( make_save_load_quantile_predictions(forestOptions,"N:output") ); 251 | 252 | newassert( fabs(QRMSE1[2] - QRMSE2[2]) < 1e-1 ); 253 | 254 | 255 | } 256 | 257 | void rface_newtest_GBT_save_load_classification() { 258 | 259 | ForestOptions forestOptions(forest_t::GBT); 260 | 261 | num_t pError1 = classification_error( make_predictions(forestOptions,"C:class") ); 262 | num_t pError2 = classification_error( make_save_load_predictions(forestOptions,"C:class") ); 263 | 264 | newassert( fabs(pError1 - pError2) < 1e-1 ); 265 | 266 | } 267 | 268 | void rface_newtest_GBT_save_load_regression() { 269 | 270 | ForestOptions forestOptions(forest_t::GBT); 271 | 272 | num_t RMSE1 = classification_error( make_predictions(forestOptions,"N:output") ); 273 | num_t RMSE2 = classification_error( make_save_load_predictions(forestOptions,"N:output") ); 274 | 275 | newassert( fabs(RMSE1 - RMSE2) < 1e-1 ); 276 | 277 | } 278 | 279 | #endif 280 | -------------------------------------------------------------------------------- /test/rootnode_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ROOTNODE_NEWTEST_HPP 2 | #define ROOTNODE_NEWTEST_HPP 3 | 4 | #include 5 | #include 6 | #incluce 7 | 8 | #include "newtest.hpp" 9 | #include "rootnode.hpp" 10 | #include "datadefs.hpp" 11 | 12 | using namespace std; 13 | 14 | void rootnode_newtest_getChildLeafTrainData(); 15 
| 16 | void rootnode_newtest() { 17 | 18 | newtest( "Testing extraction of train samples from child leaf nodes", &rootnode_newtest_getChildLeafTrainData ); 19 | 20 | } 21 | 22 | void rootnode_newtest_getChildLeafTrainData() { 23 | 24 | Rootnode rootNode; 25 | Node nodeL,nodeR,nodeM; 26 | 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /test/run_newtests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "newtest.hpp" 5 | #include "reader_newtest.hpp" 6 | #include "treedata_newtest.hpp" 7 | #include "rface_newtest.hpp" 8 | #include "distributions_newtest.hpp" 9 | #include "utils_newtest.hpp" 10 | #include "datadefs_newtest.hpp" 11 | #include "node_newtest.hpp" 12 | #include "math_newtest.hpp" 13 | 14 | using namespace std; 15 | 16 | int main() { 17 | 18 | newtestinit(); 19 | 20 | cout << endl << "Testing Reader class:" << endl; 21 | reader_newtest(); 22 | 23 | cout << endl << "Testing Treedata class:" << endl; 24 | treedata_newtest(); 25 | 26 | cout << endl << "Testing RFACE class:" << endl; 27 | rface_newtest(); 28 | 29 | cout << endl << "Testing Distributions namespace:" << endl; 30 | distributions_newtest(); 31 | 32 | cout << endl << "Testing Utils namespace:" << endl; 33 | utils_newtest(); 34 | 35 | cout << endl << "Testing Datadefs namespace:" << endl; 36 | datadefs_newtest(); 37 | 38 | cout << endl << "Testing Node class:" << endl; 39 | node_newtest(); 40 | 41 | cout << endl << "Testing math namespace:" << endl; 42 | math_newtest(); 43 | 44 | newtestdone(); 45 | 46 | return( EXIT_SUCCESS ); 47 | 48 | } 49 | -------------------------------------------------------------------------------- /test_2by10_text_matrix.afm: -------------------------------------------------------------------------------- 1 | s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s20 2 | C:out 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 | 
T:in a.b a.b a.b a.b a.b a.b.c a.b.c a.b.c a.b.c a.b.c a.b.c a.b.c c c b.c a.b b a b a 4 | N:no1 8 7 6 8 8 5 78 5 4 7 5 4 4 8 9 6 4 4 7 8 5 | N:no2 9 8 6 4 5 4 6 8 9 7 5 4 3 7 9 0 7 5 4 7 6 | -------------------------------------------------------------------------------- /test_2by8_numerical_matrix.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 S5 S6 S7 S8 2 | N:F1 3 2 nan nan 5 4 2.9 3.1 3 | N:F2 1 3 2 nan 4 5 3.6 2.8 4 | -------------------------------------------------------------------------------- /test_3by10_categorical_matrix.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | C:F 0 1 0 1 4 5 6 7 8 9 3 | C:T1 1 1 1 1 1 0 0 0 0 0 4 | C:T2 1 2 1 2 0 0 0 0 0 0 -------------------------------------------------------------------------------- /test_6by10_mixed_matrix.tsv: -------------------------------------------------------------------------------- 1 | foo S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F1 nA 8.5 3.4 7.2 5 6 7 11 9 NA 3 | N:F2 2 3 4 5 6 NA NA 9 nan 10 4 | C:F3 NA nA naN NaN 1 1 1 2 2 2 5 | N:F4 10 9.9 8 7 6 5 4 3 2.4 1 6 | C:F5 3 3 3 4 4 5 3 2 2 2 7 | N:F6 9 8 7 9 8 7 3 2 1.0 99.23 -------------------------------------------------------------------------------- /test_fullSplitterSweep.txt: -------------------------------------------------------------------------------- 1 | N:output C:class 1.121686e+00 86 192 2 | N:output N:input 1.251324e+00 113 162 3 | N:output N:noise_1 9.328740e-02 21 253 4 | N:output N:noise_2 1.078159e-01 148 128 5 | N:output N:noise_3 9.386892e-02 246 29 6 | N:output N:noise_4 6.245053e-02 63 211 7 | N:output N:noise_5 4.026299e-02 13 260 8 | N:output C:noise_6 2.118358e-03 78 191 9 | N:output N:noise_7 1.003136e-01 43 236 10 | N:output N:noise_8 4.813590e-02 48 224 11 | N:output N:noise_9 1.011168e-01 13 269 12 | N:output N:noise_10 4.593473e-02 12 264 13 | N:output N:noise_11 4.945915e-02 24 247 14 | 
N:output N:noise_12 7.033682e-02 23 243 15 | N:output N:noise_13 6.036680e-02 26 246 16 | N:output N:noise_14 1.019840e-01 3 272 17 | N:output N:noise_15 5.772823e-02 146 128 18 | N:output N:noise_16 9.041976e-02 261 11 19 | N:output N:noise_17 6.628259e-02 204 69 20 | N:output C:noise_18 1.611157e-01 103 168 21 | N:output N:noise_19 4.208906e-02 126 147 22 | N:output N:noise_20 5.109677e-02 13 261 23 | N:output N:noise_21 7.293652e-02 88 178 24 | N:output N:noise_22 1.346184e-01 15 259 25 | N:output C:noise_23 9.415048e-03 76 196 26 | N:output N:noise_24 4.618499e-02 10 258 27 | N:output N:noise_25 1.230457e-01 192 78 28 | N:output N:noise_26 1.463906e-01 5 270 29 | N:output N:noise_27 4.840161e-02 13 256 30 | N:output N:noise_28 8.308741e-02 247 28 31 | N:output N:noise_29 7.497228e-02 256 16 32 | N:output N:noise_30 1.471867e-01 211 62 33 | N:output N:noise_31 6.474735e-02 265 10 34 | N:output N:noise_32 8.340131e-02 21 250 35 | N:output N:noise_33 4.267850e-02 259 19 36 | N:output N:noise_34 1.228116e-01 45 225 37 | N:output N:noise_35 6.714979e-02 15 260 38 | N:output N:noise_36 3.932013e-02 28 243 39 | N:output N:noise_37 9.741292e-02 4 263 40 | N:output N:noise_38 5.143446e-02 4 267 41 | N:output N:noise_39 6.923821e-02 260 9 42 | N:output N:noise_40 5.260796e-02 85 190 43 | N:output N:noise_41 9.401821e-02 38 233 44 | N:output N:noise_42 1.959699e-01 5 269 45 | N:output N:noise_43 3.535644e-02 152 120 46 | N:output N:noise_44 5.551332e-02 4 270 47 | N:output N:noise_45 6.478165e-02 49 226 48 | N:output N:noise_46 6.094157e-02 242 32 49 | N:output N:noise_47 1.149479e-01 104 166 50 | N:output N:noise_48 8.086051e-02 32 236 51 | N:output N:noise_49 7.817317e-02 26 246 52 | N:output N:noise_50 8.304132e-02 87 182 53 | N:output N:noise_51 4.322512e-02 9 256 54 | N:output N:noise_52 4.241347e-02 230 42 55 | N:output N:noise_53 1.284148e-01 67 199 56 | N:output N:noise_54 5.553333e-02 20 254 57 | N:output N:noise_55 1.378761e-01 59 215 58 | N:output N:noise_56 
8.478826e-02 113 165 59 | N:output N:noise_57 5.143181e-02 213 58 60 | N:output N:noise_58 1.677901e-01 78 200 61 | N:output N:noise_59 4.516454e-02 86 178 62 | N:output N:noise_60 4.662042e-02 120 155 63 | N:output N:noise_61 4.883542e-02 255 20 64 | N:output C:noise_62 1.811719e-02 81 188 65 | N:output N:noise_63 4.851824e-02 238 38 66 | N:output N:noise_64 5.891705e-02 170 104 67 | N:output N:noise_65 9.432740e-02 265 10 68 | N:output C:noise_66 2.228690e-02 84 185 69 | N:output N:noise_67 9.516234e-02 204 65 70 | N:output N:noise_68 9.043474e-02 240 35 71 | N:output C:noise_69 3.016761e-02 94 184 72 | N:output N:noise_70 1.484555e-01 260 10 73 | N:output N:noise_71 4.317891e-02 271 3 74 | N:output N:noise_72 8.184142e-02 122 148 75 | N:output N:noise_73 4.699886e-02 60 209 76 | N:output C:noise_74 5.743994e-02 104 164 77 | N:output C:noise_75 1.078566e-02 80 187 78 | N:output N:noise_76 5.749446e-02 268 3 79 | N:output N:noise_77 8.603062e-02 122 148 80 | N:output N:noise_78 1.196307e-01 120 156 81 | N:output N:noise_79 5.445619e-02 256 19 82 | N:output N:noise_80 7.984189e-02 186 88 83 | N:output N:noise_81 7.058734e-02 19 251 84 | N:output N:noise_82 4.363979e-02 262 7 85 | N:output N:noise_83 7.936767e-02 142 135 86 | N:output N:noise_84 7.261903e-02 9 267 87 | N:output N:noise_85 8.337065e-02 267 8 88 | N:output C:noise_86 3.541747e-02 88 186 89 | N:output N:noise_87 1.050412e-01 37 235 90 | N:output N:noise_88 2.744471e-02 267 10 91 | N:output N:noise_89 6.768391e-02 27 247 92 | N:output N:noise_90 1.147344e-01 241 22 93 | N:output N:noise_91 1.392028e-01 186 90 94 | N:output N:noise_92 5.769372e-02 238 35 95 | N:output N:noise_93 1.057857e-01 141 134 96 | N:output N:noise_94 1.015035e-01 5 265 97 | N:output C:noise_95 6.726926e-03 76 201 98 | N:output N:noise_96 4.923999e-02 268 5 99 | N:output N:noise_97 8.986233e-02 209 66 100 | N:output N:noise_98 9.489314e-02 5 269 101 | N:output N:noise_99 1.116795e-01 273 4 102 | N:output N:noise_100 3.103523e-02 
262 8 103 | -------------------------------------------------------------------------------- /test_fullSplitterSweep_class.txt: -------------------------------------------------------------------------------- 1 | C:class N:output 8.277247e-02 132 146 2 | C:class N:input 1.080478e-02 273 4 3 | C:class N:noise_1 1.186730e-02 145 131 4 | C:class N:noise_2 9.047567e-03 149 131 5 | C:class N:noise_3 5.632269e-03 37 240 6 | C:class N:noise_4 2.012800e-02 101 176 7 | C:class N:noise_5 4.715754e-03 144 132 8 | C:class C:noise_6 1.915083e-03 93 178 9 | C:class N:noise_7 8.512985e-03 263 19 10 | C:class N:noise_8 4.338452e-03 17 260 11 | C:class N:noise_9 7.568886e-03 157 128 12 | C:class N:noise_10 5.742623e-03 212 67 13 | C:class N:noise_11 8.089438e-03 267 7 14 | C:class N:noise_12 6.164748e-03 162 107 15 | C:class N:noise_13 5.167658e-03 144 132 16 | C:class N:noise_14 7.158856e-03 265 13 17 | C:class N:noise_15 8.629035e-03 31 245 18 | C:class N:noise_16 1.294779e-02 270 6 19 | C:class N:noise_17 3.829856e-03 43 233 20 | C:class C:noise_18 1.939052e-02 89 183 21 | C:class N:noise_19 7.300695e-03 13 262 22 | C:class N:noise_20 1.300216e-02 88 188 23 | C:class N:noise_21 7.683229e-03 227 43 24 | C:class N:noise_22 1.820842e-02 8 268 25 | C:class C:noise_23 4.497780e-03 77 195 26 | C:class N:noise_24 1.553144e-02 31 240 27 | C:class N:noise_25 7.985333e-03 28 244 28 | C:class N:noise_26 1.021783e-02 213 64 29 | C:class N:noise_27 5.208401e-03 32 239 30 | C:class N:noise_28 8.072889e-03 250 28 31 | C:class N:noise_29 7.635241e-03 10 265 32 | C:class N:noise_30 1.439265e-02 225 50 33 | C:class N:noise_31 9.134501e-03 269 10 34 | C:class N:noise_32 1.537890e-02 185 89 35 | C:class N:noise_33 7.579281e-03 3 278 36 | C:class N:noise_34 1.615210e-02 41 231 37 | C:class N:noise_35 9.442723e-03 52 227 38 | C:class N:noise_36 9.529369e-03 205 70 39 | C:class N:noise_37 8.874968e-03 182 90 40 | C:class N:noise_38 6.306706e-03 83 191 41 | C:class N:noise_39 6.690488e-03 13 257 42 | 
C:class N:noise_40 6.732478e-03 156 123 43 | C:class N:noise_41 6.800112e-03 258 16 44 | C:class N:noise_42 1.330721e-02 5 271 45 | C:class N:noise_43 6.459754e-03 251 22 46 | C:class N:noise_44 1.463259e-02 86 190 47 | C:class N:noise_45 1.074883e-02 57 221 48 | C:class N:noise_46 4.948310e-03 249 27 49 | C:class N:noise_47 1.185683e-02 74 201 50 | C:class N:noise_48 5.228776e-03 203 68 51 | C:class N:noise_49 8.015075e-03 270 3 52 | C:class N:noise_50 3.827547e-03 245 26 53 | C:class N:noise_51 5.623361e-03 8 260 54 | C:class N:noise_52 1.132672e-02 264 11 55 | C:class N:noise_53 1.474747e-02 198 72 56 | C:class N:noise_54 1.464262e-02 184 93 57 | C:class N:noise_55 1.061787e-02 57 218 58 | C:class N:noise_56 9.102942e-03 275 7 59 | C:class N:noise_57 8.660064e-03 4 270 60 | C:class N:noise_58 1.468407e-02 7 273 61 | C:class N:noise_59 4.682519e-03 254 14 62 | C:class N:noise_60 8.742796e-03 87 189 63 | C:class N:noise_61 6.415251e-03 276 3 64 | C:class C:noise_62 3.877685e-03 91 182 65 | C:class N:noise_63 9.286570e-03 252 26 66 | C:class N:noise_64 7.245304e-03 6 272 67 | C:class N:noise_65 8.578843e-03 274 4 68 | C:class C:noise_66 3.314331e-03 100 172 69 | C:class N:noise_67 8.746950e-03 268 4 70 | C:class N:noise_68 8.593788e-03 63 215 71 | C:class C:noise_69 4.556471e-03 83 198 72 | C:class N:noise_70 1.385411e-02 269 5 73 | C:class N:noise_71 7.371832e-03 45 234 74 | C:class N:noise_72 5.581014e-03 266 7 75 | C:class N:noise_73 5.526464e-03 222 50 76 | C:class C:noise_74 1.235055e-03 86 186 77 | C:class C:noise_75 2.357321e-03 81 190 78 | C:class N:noise_76 5.492676e-03 231 42 79 | C:class N:noise_77 1.658936e-02 207 66 80 | C:class N:noise_78 9.358209e-03 38 239 81 | C:class N:noise_79 5.969206e-03 185 95 82 | C:class N:noise_80 1.487118e-02 17 260 83 | C:class N:noise_81 5.329469e-03 268 5 84 | C:class N:noise_82 7.679314e-03 246 26 85 | C:class N:noise_83 9.127313e-03 7 274 86 | C:class N:noise_84 4.509829e-03 137 143 87 | C:class N:noise_85 
4.530825e-03 110 169 88 | C:class C:noise_86 1.859930e-03 78 199 89 | C:class N:noise_87 1.187371e-02 30 244 90 | C:class N:noise_88 5.670918e-03 200 80 91 | C:class N:noise_89 7.086722e-03 21 256 92 | C:class N:noise_90 1.036187e-02 207 59 93 | C:class N:noise_91 1.046930e-02 4 274 94 | C:class N:noise_92 1.057821e-02 239 38 95 | C:class N:noise_93 7.872152e-03 45 233 96 | C:class N:noise_94 8.085322e-03 23 250 97 | C:class C:noise_95 1.761905e-03 100 180 98 | C:class N:noise_96 9.914971e-03 144 131 99 | C:class N:noise_97 1.001201e-02 17 261 100 | C:class N:noise_98 7.998670e-03 274 3 101 | C:class N:noise_99 6.701586e-03 221 60 102 | C:class N:noise_100 1.181143e-02 191 82 103 | -------------------------------------------------------------------------------- /test_predictor.sf: -------------------------------------------------------------------------------- 1 | FOREST=RF,NTREES=1,TARGET="N:T",CATEGORIES=,SHRINKAGE=1 2 | TREE=0,NNODES=13 3 | NODE=*,PRED=5.0,SPLITTER="N:f0",SPLITTERTYPE=NUMERICAL,LFRACTION=1.0,LVALUES="1.1",RVALUES="1.1",M=M 4 | NODE=*L,PRED=4.0,SPLITTER="C:f1",SPLITTERTYPE=CATEGORICAL,LFRACTION=1.0,LVALUES="0:2",RVALUES="1" 5 | NODE=*LL,PRED=3.9 6 | NODE=*LR,PRED=4.2,SPLITTER="N:f3",SPLITTERTYPE=NUMERICAL,LFRACTION=1.0,LVALUES="-1.5",RVALUES="-1.5" 7 | NODE=*LRL,PRED=3.99 8 | NODE=*LRR,PRED=4.3 9 | NODE=*R,PRED=6.0,SPLITTER="N:f2",SPLITTERTYPE=NUMERICAL,LFRACTION=1.0,LVALUES="3.0",RVALUES="3.0" 10 | NODE=*RL,PRED=5.1 11 | NODE=*RR,PRED=6.6,SPLITTER="C:f4",SPLITTERTYPE=CATEGORICAL,LFRACTION=1.0,LVALUES="2:3",RVALUES="0:1",M=M 12 | NODE=*RRL,PRED=6.5 13 | NODE=*RRR,PRED=7.1 14 | NODE=*RRM,PRED=9.0 15 | NODE=*M,PRED=8.0 -------------------------------------------------------------------------------- /test_rfacer.R: -------------------------------------------------------------------------------- 1 | library(Rcpp) 2 | library(rfacer) 3 | 4 | trainData <- read.afm("test_103by300_mixed_nan_matrix.afm") 5 | 6 | predictorObj <- 
rface.train(trainData,"C:class",mTry = 30, nTrees = 1000) 7 | predictions <- rface.predict(predictorObj,trainData) 8 | -------------------------------------------------------------------------------- /testdata.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 2 | N:f0 0.0 0.5 NA 6.0 3 | C:f1 0 1 0 2 4 | N:f2 4.0 3.0 2.5 3.1 5 | N:f3 -1.6 2.0 0.5 1.5 6 | C:f4 0 1 2 NA 7 | N:T 0.0 1.0 2.0 3.0 -------------------------------------------------------------------------------- /tmp/feature.cpp: -------------------------------------------------------------------------------- 1 | #include "feature.hpp" 2 | -------------------------------------------------------------------------------- /tmp/feature.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FEATURE_HPP 2 | #define FEATURE_HPP 3 | 4 | class Feature { 5 | 6 | public: 7 | 8 | Feature(); 9 | ~Feature(); 10 | 11 | bool isNumerical() const; 12 | bool isCategorical() const; 13 | bool isTextual() const; 14 | 15 | protected: 16 | 17 | enum Type { NUM, CAT, TXT, UNKNOWN }; 18 | 19 | virtual initialize() = 0; 20 | 21 | private: 22 | 23 | Type type_; 24 | 25 | }; 26 | 27 | class NumFeature : public Feature { 28 | 29 | public: 30 | 31 | NumFeature(); 32 | ~NumFeature(); 33 | 34 | protected: 35 | 36 | virtual initialize(); 37 | 38 | private: 39 | 40 | 41 | 42 | }; 43 | 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /tmp/treesizes.tsv: -------------------------------------------------------------------------------- 1 | 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 
149 149 149 149 149 149 149 149 149 2 | 71 69 73 75 71 71 71 77 93 61 69 89 71 65 69 73 73 83 73 73 57 79 63 71 69 71 83 67 65 83 69 73 73 67 77 73 77 65 63 59 73 73 65 69 63 69 69 73 63 75 59 65 69 73 67 65 73 77 69 63 69 69 81 75 79 83 73 73 81 71 75 67 63 69 77 83 75 85 57 65 71 79 71 77 71 81 71 79 79 71 71 75 69 69 71 69 73 79 71 79 3 | 41 45 39 31 41 39 37 43 37 45 37 43 41 41 27 41 37 37 45 39 37 39 31 37 35 47 39 51 29 43 41 39 41 41 39 39 41 37 35 33 37 33 43 41 39 39 41 39 43 41 43 51 45 33 43 43 35 37 43 35 43 31 35 53 35 47 43 43 37 37 39 39 43 43 43 39 37 45 43 49 35 41 39 41 43 39 39 35 43 51 43 41 45 39 43 49 43 39 49 41 4 | 31 21 25 21 33 23 27 27 33 27 27 31 31 25 25 29 27 31 31 23 29 25 25 31 29 33 27 27 31 25 25 27 31 25 31 27 29 25 27 27 33 21 31 27 27 25 33 25 27 27 35 27 31 27 29 29 29 27 35 31 27 29 31 27 31 25 29 31 23 29 31 25 27 25 27 23 25 37 29 27 27 31 27 27 29 27 25 21 29 31 33 29 29 25 29 29 25 27 31 25 5 | 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 6 | 130 132 132 128 132 128 125 132 132 128 126 132 132 131 127 129 132 129 128 132 131 127 129 129 129 122 131 127 132 129 128 131 130 131 132 126 129 118 131 129 132 126 132 127 127 131 130 130 131 132 124 132 131 130 132 126 132 129 124 131 131 129 132 131 127 127 131 132 131 132 132 132 131 132 130 131 129 126 132 132 127 131 132 132 131 127 132 129 132 128 131 132 122 127 132 131 131 132 131 131 7 | 106 109 106 113 112 108 109 114 108 110 105 107 113 104 111 109 109 100 109 114 112 112 114 108 108 113 110 114 107 112 100 111 110 108 113 113 105 104 111 109 110 108 105 111 110 113 114 112 106 112 112 109 110 105 111 108 112 113 112 110 
113 114 110 108 109 110 109 109 111 114 108 106 104 113 105 112 108 111 113 111 111 107 108 107 108 105 110 114 111 104 106 106 113 108 113 104 108 105 113 112 8 | 105 101 104 102 106 102 102 101 101 104 97 99 104 104 107 109 98 97 102 100 106 94 106 103 102 100 107 100 106 102 106 104 107 105 98 105 98 105 103 106 100 104 103 101 102 107 105 105 106 108 107 103 107 102 99 98 109 104 109 99 96 106 100 96 104 97 102 98 103 98 100 106 106 103 103 103 99 100 99 100 103 105 96 106 108 104 106 99 105 103 102 97 101 100 96 107 107 108 103 104 9 | --------------------------------------------------------------------------------