├── DESCRIPTION ├── Makefile ├── NAMESPACE ├── R ├── is.installed.R ├── read.afm.R ├── rface.filter.R ├── rface.load.R ├── rface.predict.R ├── rface.save.R └── rface.train.R ├── README ├── data ├── 6_num_features_X_10_cases.tsv ├── dataGenerator.m ├── scriptTestDataGenerator.m ├── test_4by10_feature1_splits_target0.tsv └── test_6by10_featurerows_matrix.tsv ├── doxy.cfg ├── install_R.sh ├── make_package.sh ├── make_win32.bat ├── make_win64.bat ├── man ├── is.installed.Rd ├── rface.filter.Rd ├── rface.predict.Rd ├── rface.train.Rd └── rfacer.Rd ├── matlab └── rface_filter.cpp ├── rf-ace-launcher.sh ├── rf_ace_batch.py ├── src ├── Makevars ├── argparse.hpp ├── datadefs.cpp ├── datadefs.hpp ├── densetreedata.cpp ├── densetreedata.hpp ├── distributions.cpp ├── distributions.hpp ├── errno.hpp ├── exceptions.hpp ├── feature.cpp ├── feature.hpp ├── math.cpp ├── math.hpp ├── mtrand.h ├── murmurhash3.cpp ├── murmurhash3.hpp ├── node.cpp ├── node.hpp ├── options.hpp ├── progress.cpp ├── progress.hpp ├── reader.cpp ├── reader.hpp ├── rf_ace.cpp ├── rf_ace.hpp ├── rf_ace_R.cpp ├── rootnode.cpp ├── rootnode.hpp ├── statistics.cpp ├── statistics.hpp ├── stochasticforest.cpp ├── stochasticforest.hpp ├── timer.hpp ├── treedata.hpp ├── utils.cpp └── utils.hpp ├── test ├── GBT_benchmark.cpp ├── R │ ├── benchmark.R │ ├── run_tests.R │ └── utils.R ├── bash │ └── treesize_vs_pmissing.sh ├── data │ ├── 12by21_categorical_matrix.arff │ ├── 3by8_mixed_NA_matrix.afm │ ├── 3by8_mixed_NA_transposed_matrix.afm │ └── 5by10_numeric_matrix.arff ├── datadefs_newtest.hpp ├── distributions_newtest.hpp ├── math_newtest.hpp ├── matlab │ ├── categoricalFeatureSplit.m │ ├── deltaImpurity.m │ ├── numericalFeatureSplit.m │ ├── readAFM.m │ └── writeAFM.m ├── newtest.hpp ├── node_newtest.hpp ├── python │ ├── ampute.py │ └── deltaImpurity.py ├── reader_newtest.hpp ├── rface_newtest.hpp ├── rootnode_newtest.hpp ├── run_newtests.cpp ├── treedata_newtest.hpp └── utils_newtest.hpp ├── 
test_103by300_mixed_matrix.afm ├── test_103by300_mixed_nan_matrix.afm ├── test_2by10_text_matrix.afm ├── test_2by8_numerical_matrix.tsv ├── test_3by10_categorical_matrix.tsv ├── test_6by10_mixed_matrix.tsv ├── test_fullSplitterSweep.txt ├── test_fullSplitterSweep_class.txt ├── test_predictor.sf ├── test_rfacer.R ├── testdata.tsv └── tmp ├── feature.cpp ├── feature.hpp └── treesizes.tsv /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rfacer 2 | Type: Package 3 | Title: Random Forests with Artificial Contrast Ensembles 4 | Version: 1.0.8 5 | Date: 2012-10-01 6 | Author: Timo Erkkila 7 | Maintainer: Timo Erkkila 8 | Description: Random Forests with Artificial Contrast Ensembles 9 | License: Apache 2.0 10 | Depends: Rcpp -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMPILER = g++ 2 | CFLAGS = -O3 -std=c++0x -Wall -Wextra -pedantic -Isrc/ -lz 3 | TFLAGS = -pthread 4 | SOURCEFILES = src/densetreedata.cpp src/murmurhash3.cpp src/datadefs.cpp src/progress.cpp src/statistics.cpp src/math.cpp src/stochasticforest.cpp src/rootnode.cpp src/node.cpp src/utils.cpp src/distributions.cpp src/reader.cpp src/feature.cpp 5 | STATICFLAGS = -static-libgcc -static 6 | TESTFILES = test/rface_test.hpp test/distributions_test.hpp test/argparse_test.hpp test/datadefs_test.hpp test/stochasticforest_test.hpp test/utils_test.hpp test/math_test.hpp test/rootnode_test.hpp test/node_test.hpp test/densetreedata_test.hpp 7 | TESTFLAGS = -std=c++0x -L${HOME}/lib/ -L/usr/local/lib -lcppunit -ldl -pedantic -I${HOME}/include/ -I/usr/local/include -Itest/ -Isrc/ 8 | .PHONY: all test clean # Squash directory checks for the usual suspects 9 | 10 | all: rf-ace 11 | 12 | rf-ace: $(SOURCEFILES) 13 | $(COMPILER) $(CFLAGS) src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace 14 | 15 | rf-ace-i386: $(SOURCEFILES) 16 | 
$(COMPILER) $(CFLAGS) -m32 src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace-i386 17 | 18 | rf-ace-amd64: $(SOURCEFILES) 19 | $(COMPILER) $(CFLAGS) -m64 src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace-amd64 20 | 21 | no-threads: $(SOURCEFILES) 22 | $(COMPILER) $(CFLAGS) -DNOTHREADS $(SOURCEFILES) src/rf_ace.cpp -o bin/rf-ace 23 | 24 | debug: $(SOURCEFILES) 25 | $(COMPILER) $(CFLAGS) src/rf_ace.cpp $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace -g -ggdb -pg 26 | 27 | static: $(SOURCEFILES) 28 | $(COMPILER) $(CFLAGS) src/rf_ace.cpp $(STATICFLAGS) $(SOURCEFILES) $(TFLAGS) -o bin/rf-ace 29 | 30 | static-no-threads: $(SOURCEFILES) 31 | $(COMPILER) $(CFLAGS) -DNOTHREADS src/rf_ace.cpp $(STATICFLAGS) $(SOURCEFILES) -o bin/rf-ace 32 | 33 | GBT_benchmark: test/GBT_benchmark.cpp $(SOURCEFILES) 34 | $(COMPILER) $(CFLAGS) test/GBT_benchmark.cpp $(SOURCEFILES) $(TFLAGS) -o bin/GBT_benchmark 35 | 36 | test: $(SOURCEFILES) 37 | rm -f bin/newtest; $(COMPILER) $(CFLAGS) test/run_newtests.cpp $(SOURCEFILES) $(TFLAGS) -o bin/newtest -ggdb; ./bin/newtest 38 | 39 | test-no-threads: $(SOURCEFILES) 40 | rm -f bin/newtest; $(COMPILER) $(CFLAGS) -DNOTHREADS test/run_newtests.cpp $(SOURCEFILES) -o bin/newtest -ggdb; ./bin/newtest 41 | 42 | clean: 43 | rm -rf bin/rf-ace bin/benchmark bin/GBT_benchmark bin/test bin/*.dSYM/ src/*.o 44 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | import("Rcpp") 3 | useDynLib("rfacer") 4 | -------------------------------------------------------------------------------- /R/is.installed.R: -------------------------------------------------------------------------------- 1 | is.installed <- 2 | function(mypkg) is.element(mypkg, installed.packages()[,1]) 3 | -------------------------------------------------------------------------------- /R/read.afm.R: 
-------------------------------------------------------------------------------- 1 | read.afm <- function(file) 2 | { 3 | 4 | trainData <- read.table(file,head=TRUE,sep="\t",row.names=1) 5 | 6 | trainData <- as.data.frame(t(trainData)) 7 | 8 | featureNames <- names(trainData) 9 | 10 | for( i in 1:length(featureNames) ) { 11 | if ( substr(featureNames[i],1,2) != "N:" ) { 12 | for( j in 1:length(row.names(trainData)) ) { 13 | trainData[j,i] <- as.character(trainData[j,i]) 14 | } 15 | } 16 | } 17 | return(trainData) 18 | } 19 | -------------------------------------------------------------------------------- /R/rface.filter.R: -------------------------------------------------------------------------------- 1 | rface.filter <- 2 | function(filterData, target, featureWeights = vector(length=0), nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 1000, nThreads = 1) { 3 | filterOutput <- .Call("rfaceFilter", filterData, as.character(target), featureWeights, nTrees, mTry, nodeSize, nMaxLeaves, nThreads) 4 | return(filterOutput) 5 | } 6 | -------------------------------------------------------------------------------- /R/rface.load.R: -------------------------------------------------------------------------------- 1 | rface.load <- 2 | function(predictorFile, nThreads = 1) { 3 | predictorObj <- .Call("rfaceLoad",predictorFile,nThreads) 4 | return(predictorObj) 5 | } -------------------------------------------------------------------------------- /R/rface.predict.R: -------------------------------------------------------------------------------- 1 | rface.predict <- 2 | function(predictorObj,testData,quantiles=vector(length=0),nSamplesForQuantiles=10,distributions=FALSE) { 3 | predictions <- .Call("rfacePredict",predictorObj,testData,quantiles,nSamplesForQuantiles,distributions); 4 | return(predictions) 5 | } 6 | -------------------------------------------------------------------------------- /R/rface.save.R: 
-------------------------------------------------------------------------------- 1 | rface.save <- 2 | function(predictorObj,fileName) { 3 | .Call("rfaceSave",predictorObj,fileName) 4 | } -------------------------------------------------------------------------------- /R/rface.train.R: -------------------------------------------------------------------------------- 1 | rface.train <- 2 | function(trainData, target, featureWeights = vector(length=0), forestType = "RF", nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 0, shrinkage = 0.01, noNABranching = FALSE, nThreads = 1) { 3 | predictorObj <- .Call("rfaceTrain", trainData, as.character(target), featureWeights, as.character(forestType), nTrees, mTry, nodeSize, nMaxLeaves, shrinkage, noNABranching, nThreads) 4 | return(predictorObj) 5 | } 6 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | #summary Manual pages. 2 | 3 | *The manual pages have been written on the basis of RF-ACE verson 0.5.5* 4 | 5 | = Description = 6 | 7 | RF-ACE is an efficient C++ implementation of a robust machine learning algorithm for uncovering multivariate associations from large and diverse data sets. RF-ACE natively handles numerical and categorical data with missing values, and potentially large quantities of noninformative features are handled gracefully utilizing artificial contrast features, bootstrapping, and p-value estimation. 8 | 9 | = Installation = 10 | 11 | Download the latest stable release from the [http://code.google.com/p/rf-ace/downloads/list download page], or checkout the latest development version (to directory rf-ace/) by typing 12 | {{{ 13 | svn checkout http://rf-ace.googlecode.com/svn/trunk/ rf-ace 14 | }}} 15 | 16 | Compiler makefiles for Linux (`Makefile`) and Visual Studio for Windows (`make.bat`) are provided in the package. 
In Linux, you can compile the program by typing 17 | {{{ 18 | make 19 | }}} 20 | or 21 | {{{ 22 | make rf_ace 23 | }}} 24 | 25 | In Windows and using Visual Studio, first open up the Visual Studio terminal and execute `make.bat` by typing 26 | {{{ 27 | make 28 | }}} 29 | Simple as that! If you feel lucky, check for compiled binaries at the [http://code.google.com/p/rf-ace/downloads/list download page]. 30 | 31 | = Supported data formats = 32 | RF-ACE currently supports two file formats, Annotated Feature Matrix (AFM) and Attribute-Relation File Format (ARFF). 33 | 34 | == Annotated Feature Matrix (AFM) == 35 | 36 | Annotated Feature Matrix represents the data as a tab-delimited table, where both columns and rows contain headers describing the samples and features. Based on the headers, the AFM reader is able to discern the right orientation (features as rows or columns in the matrix) of the matrix. Namely AFM feature headers must encode whether the feature is (`N`)umerical, (`C`)ategorical, (`O`)rdinal, or (`B`)inary, followed by colon and the actual name of the feature as follows: 37 | 38 | * `B:is_alive` 39 | * `N:age` 40 | * `C:tumor_grage` 41 | * `O:anatomic_organ_subdivision` 42 | 43 | In fact any string, even including colons, spaces, and other special characters, encodes a valid feature name as long as it starts with the preamble `N:`/`C:`/`O:`/`B:`. Thus, the following is a valid feature header: 44 | 45 | * `N:GEXP:TP53:chr17:123:456` 46 | 47 | Sample headers are not constrained, except that they must not contain preambles `N:`/`C:`/`O:`/`B:`, being reserved for the feature headers. 48 | 49 | == Attribute-Relation File Format (ARFF) == 50 | 51 | [http://www.cs.waikato.ac.nz/~ml/weka/arff.html ARFF specification]. 52 | 53 | = Usage = 54 | The following examples follow Linux syntax. 
Type 55 | {{{ 56 | bin/rf_ace --help 57 | }}} 58 | or 59 | {{{ 60 | bin/rf_ace -h 61 | }}} 62 | to bring up help: 63 | {{{ 64 | REQUIRED ARGUMENTS: 65 | -I / --input input feature file (AFM or ARFF) 66 | -i / --target target, specified as integer or string that is to be matched with the content of input 67 | -O / --output output association file 68 | 69 | OPTIONAL ARGUMENTS: 70 | -n / --ntrees number of trees per RF (default nsamples/nrealsamples) 71 | -m / --mtry number of randomly drawn features per node split (default sqrt(nfeatures)) 72 | -s / --nodesize minimum number of train samples per node, affects tree depth (default max{5,nsamples/20}) 73 | -p / --nperms number of Random Forests (default 50) 74 | -t / --pthreshold p-value threshold below which associations are listed (default 0.1) 75 | -g / --gbt Enable (1 == YES) Gradient Boosting Trees, a subsequent filtering procedure (default 0 == NO) 76 | }}} 77 | 78 | So all that is required is an input file (`-I/--input`), either of type `.arff` or `.afm`, and a target (`-i/--target`) to build the RF-ACE model upon. Target in this case corresponds to a feature in the input file, and it can be identified with an index corresponding to it's order of appearance in the file, or with it's name. 
Thus, if the target is `N:age` (we would be looking for features associated with age) existing on row `123` (0-base and omitting the header row), one execute RF-ACE by typing 79 | {{{ 80 | bin/rf_ace --input featurematrix.afm --target 123 --output associations.tsv 81 | }}} 82 | or with the short-hand notation equivalently as 83 | {{{ 84 | bin/rf_ace -I featurematrix.afm -i 123 -O associations.tsv 85 | }}} 86 | or by using the header "N:age" instead of the index by typing 87 | {{{ 88 | bin/rf_ace -I featurematrix.afm -i N:age -O associations.tsv 89 | }}} 90 | In case a provided (sub)string identifies multiple target candidates, RF-ACE will be executed serially for all target candidates, results catenated in the specified output file. 91 | 92 | The above will execute RF-ACE with the default parameters; as the help documentation points out, most of the parameters are estimated dynamically based on the data dimensions and content, so running RF-ACE with no information about the algorithm itself is possible. 93 | 94 | = Output = 95 | The following call (assuming now the substring `age` uniquely identifies just one feature, `N:age`) 96 | {{{ 97 | bin/rf_ace -I featurematrix.afm -i age -O associations.tsv 98 | }}} 99 | produces the output 100 | {{{ 101 | 102 | 103 | --------------------------------------------------------------- 104 | | RF-ACE -- efficient feature selection with heterogeneous data | 105 | | | 106 | | Version: RF-ACE v0.5.5, July 4th, 2011 | 107 | | Project page: http://code.google.com/p/rf-ace | 108 | | Contact: timo.p.erkkila@tut.fi | 109 | | kari.torkkola@gmail.com | 110 | | | 111 | | DEVELOPMENT VERSION, BUGS EXIST! 
| 112 | --------------------------------------------------------------- 113 | 114 | Reading file 'featurematrix.afm' 115 | File type is unknown -- defaulting to Annotated Feature Matrix (AFM) 116 | AFM orientation: features as rows 117 | 118 | RF-ACE parameter configuration: 119 | --input = featurematrix.afm 120 | --nsamples = 223 / 282 (20.922% missing) 121 | --nfeatures = 48912 122 | --targetidx = 123, header 'N:age' 123 | --ntrees = 356 124 | --mtry = 221 125 | --nodesize = 12 126 | --nperms = 50 127 | --pthresold = 0.1 128 | --output = associations.tsv 129 | 130 | Growing 50 Random Forests (RFs), please wait... 131 | RF 1: 4880 nodes (avg. 13.7079 nodes / tree) 132 | RF 2: 4810 nodes (avg. 13.5112 nodes / tree) 133 | RF 3: 4856 nodes (avg. 13.6404 nodes / tree) 134 | RF 4: 4994 nodes (avg. 14.0281 nodes / tree) 135 | RF 5: 5036 nodes (avg. 14.1461 nodes / tree) 136 | RF 6: 5016 nodes (avg. 14.0899 nodes / tree) 137 | RF 7: 5132 nodes (avg. 14.4157 nodes / tree) 138 | ... 139 | RF 47: 4736 nodes (avg. 13.3034 nodes / tree) 140 | RF 48: 5234 nodes (avg. 14.7022 nodes / tree) 141 | RF 49: 4582 nodes (avg. 12.8708 nodes / tree) 142 | RF 50: 5210 nodes (avg. 14.6348 nodes / tree) 143 | 50 RFs, 17800 trees, and 247516 nodes generated in 102.91 seconds (2405.17 nodes per second) 144 | Gradient Boosting Trees *DISABLED* 145 | 146 | Association file created. Format: 147 | TARGET PREDICTOR P-VALUE IMPORTANCE CORRELATION 148 | 149 | Done. 150 | }}} 151 | 152 | If there are no associations found, the program would end as follows: 153 | {{{ 154 | No significant associations found, quitting... 
155 | }}} 156 | 157 | = RF-ACE configuration = 158 | 159 | Information will be added in the future -------------------------------------------------------------------------------- /data/6_num_features_X_10_cases.tsv: -------------------------------------------------------------------------------- 1 | foo S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F1 1 8.5 3.4 7.2 5 6 7 11 9 1 3 | N:F2 2 3 4 5 6 1 1 9 1 10 4 | N:F3 1 1 1 1 1 1 1 2 2 2 5 | N:F4 10 9.9 8 7 6 5 4 3 2.4 1 6 | N:F5 3 3 3 4 4 5 3 2 2 2 7 | N:F6 9 8 7 9 8 7 3 2 1 1 8 | -------------------------------------------------------------------------------- /data/scriptTestDataGenerator.m: -------------------------------------------------------------------------------- 1 | 2 | % Script to test dataGenerator 3 | 4 | clear par; 5 | 6 | % basic parameters 7 | %par.dependency = 'linear'; % or 'nonlinear' 8 | par.dependency = 'nonlinear'; 9 | par.O = 1; % number of target variables 10 | par.N = 200; % number of samples generated 11 | par.n = 5; % number of relevant variables from which target is generated 12 | %par.seed = 1; % random number generator seed, if not given, generated from time 13 | par.sets = 10; % how many data sets to generate 14 | par.testFraction = 0; % fraction of each set written to test file 15 | 16 | % used by nonlinear dependency generation 17 | par.L = 5; % number of functions added together to construct the target 18 | 19 | % used for linear dependency generation, if not specified, will generate randomly 20 | par.P = 1:(-0.1):0.1; 21 | %par.P = [1 0.5 0.25 0.125 0.0625]; 22 | 23 | % post dependency generation 24 | par.Kn = 100; % number of additional noise variables concatenated to data 25 | par.maxClasses = 0; % discretize target, 0=regression 26 | par.mixedType = 0.0; % discretize this fraction of the input variables 27 | par.maxLevels = 6; % max num discrete levels 28 | par.randomizeTarget=0.02; % add noise to target with var 'randomizeTarget' 29 | par.missing = 0.1; % fraction of missing values 30 
| 31 | % uncomment one output option 32 | % par.fileFormat='R'; % samples as rows, tsv, cat levels are strings, (this is slow) 33 | par.fileFormat='x'; % features as rows, tsv, cat levels are numbers 34 | % par.fileFormat='arff'; % arff file 35 | % par.fileFormat='none'; % return a cell array of sets 36 | par.fileFormat={'none','R','x','arff'}; 37 | par.sampleHeader = 1; % generate a header for samples 38 | 39 | [traindata, testdata] = dataGenerator( par ); 40 | 41 | -------------------------------------------------------------------------------- /data/test_4by10_feature1_splits_target0.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F0 na 5.1 nA 3.8 4.2 1.4 8.2 9.1 4.5 na 3 | N:F1 4.2 5.2 7.2 3.9 4.3 1.5 8.3 9.2 4.6 NAN 4 | N:F2 4.2 nan nan 6.1 1.4 7.3 7.3 0.2 na naN 5 | N:F3 nan nan nan nan nan nan nan nan nan nan -------------------------------------------------------------------------------- /data/test_6by10_featurerows_matrix.tsv: -------------------------------------------------------------------------------- 1 | foo S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F1 nA 8.5 3.4 7.2 5 6 7 11 9 NA 3 | N:F2 2 3 4 5 6 NA NA 9 nan 10 4 | C:F3 NA nA naN NaN 1 1 1 2 2 2 5 | N:F4 10 9.9 8 7 6 5 4 3 2.4 1 6 | C:F5 3 3 3 4 4 5 3 2 2 2 7 | N:F6 9 8 7 9 8 7 3 2 1.0 99.23 -------------------------------------------------------------------------------- /install_R.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Prepare C++ compiler flags 4 | export PKG_CPPFLAGS="`Rscript -e 'Rcpp:::CxxFlags()'` -std=c++0x -Wall -Wextra -pedantic" 5 | 6 | ## Prepare library flags 7 | export PKG_LIBS=`Rscript -e "Rcpp:::LdFlags()"` 8 | 9 | ## Make shared library 10 | R CMD SHLIB -o lib/rf_ace_R.so src/rf_ace_R.cpp src/progress.cpp src/statistics.cpp src/math.cpp src/stochasticforest.cpp src/rootnode.cpp src/node.cpp src/treedata.cpp src/datadefs.cpp src/utils.cpp 
src/distributions.cpp -------------------------------------------------------------------------------- /make_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/sh 2 | 3 | package=$1 4 | 5 | tar -czf $package src/*.*pp bin/rf-ace* test/ test_*.* testdata.tsv Makefile make_win32.bat make_win64.bat doxy.cfg rf_ace_batch.py rf-ace-launcher.sh 6 | -------------------------------------------------------------------------------- /make_win32.bat: -------------------------------------------------------------------------------- 1 | mkdir bin 2 | 3 | SetEnv.cmd /x86 /Release 4 | 5 | cl /EHsc /O2 /analyze /DNOTHREADS /Febin\rf-ace-win32.exe src\murmurhash3.cpp src\rf_ace.cpp src\statistics.cpp src\distributions.cpp src\progress.cpp src\stochasticforest.cpp src\rootnode.cpp src\node.cpp src\treedata.cpp src\datadefs.cpp src\math.cpp src\utils.cpp src\reader.cpp src\feature.cpp 6 | 7 | del *.obj 8 | 9 | -------------------------------------------------------------------------------- /make_win64.bat: -------------------------------------------------------------------------------- 1 | mkdir bin 2 | 3 | SetEnv.cmd /x64 /Release 4 | 5 | cl /EHsc /O2 /analyze /DNOTHREADS /Febin\rf-ace-win64.exe src\murmurhash3.cpp src\rf_ace.cpp src\statistics.cpp src\distributions.cpp src\progress.cpp src\stochasticforest.cpp src\rootnode.cpp src\node.cpp src\treedata.cpp src\datadefs.cpp src\math.cpp src\utils.cpp src\reader.cpp src\feature.cpp 6 | 7 | del *.obj 8 | 9 | -------------------------------------------------------------------------------- /man/is.installed.Rd: -------------------------------------------------------------------------------- 1 | \name{is.installed} 2 | \alias{is.installed} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | foo bar 6 | } 7 | \description{ 8 | %% ~~ A concise (1-5 lines) description of what the function does. 
~~ 9 | } 10 | \usage{ 11 | is.installed(mypkg) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{mypkg}{ 16 | %% ~~Describe \code{mypkg} here~~ 17 | } 18 | } 19 | \details{ 20 | %% ~~ If necessary, more details than the description above ~~ 21 | } 22 | \value{ 23 | %% ~Describe the value returned 24 | %% If it is a LIST, use 25 | %% \item{comp1 }{Description of 'comp1'} 26 | %% \item{comp2 }{Description of 'comp2'} 27 | %% ... 28 | } 29 | \references{ 30 | %% ~put references to the literature/web site here ~ 31 | } 32 | \author{ 33 | %% ~~who you are~~ 34 | } 35 | \note{ 36 | %% ~~further notes~~ 37 | } 38 | 39 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 40 | 41 | \seealso{ 42 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 43 | } 44 | \examples{ 45 | ##---- Should be DIRECTLY executable !! ---- 46 | ##-- ==> Define data, use random, 47 | ##-- or do help(data=index) for the standard data sets. 48 | 49 | ## The function is currently defined as 50 | function (mypkg) 51 | is.element(mypkg, installed.packages()[, 1]) 52 | } 53 | % Add one or more standard keywords, see file 'KEYWORDS' in the 54 | % R documentation directory. 55 | \keyword{ ~kwd1 } 56 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 57 | -------------------------------------------------------------------------------- /man/rface.filter.Rd: -------------------------------------------------------------------------------- 1 | \name{rface.filter} 2 | \alias{rface.filter} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Apply feature selection with RF-ACE. 6 | } 7 | \description{ 8 | Apply feature selection with RF-ACE. 9 | } 10 | \usage{ 11 | associations <- rface.filter(filterData, target, featureWeights = vector(length=0), nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 1000, nThreads = 1) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 
14 | \arguments{ 15 | \item{filterData}{ 16 | A data.frame storing the data for feature selection. filterData[featureName] stores a vector of strings (categorical feature) or floats (numerical feature). 17 | } 18 | \item{target}{ 19 | An integer or string, pointing to a feature in filterData, i.e. filterData[target]. 20 | } 21 | \item{featureWeights}{ 22 | A vector of nonnegative weights for the features; affects the sampling distribution. By default all features get weight 1 corresponding uniform sampling. 23 | } 24 | \item{nTrees}{ 25 | Number of trees in the forest. Default 100. 26 | } 27 | \item{mTry}{ 28 | Number of randomly sampled candidate features per split. Default 10. 29 | } 30 | \item{nodeSize}{ 31 | Minimum number of train samples per node. Default 3. 32 | } 33 | \item{nMaxLeaves}{ 34 | Maximum number of leaves per tree. Default 1000. 35 | } 36 | \item{nThreads}{ 37 | Number of CPU threads to train the model with. Default 1. 38 | } 39 | } 40 | \details{ 41 | 42 | } 43 | \value{ 44 | %% ~Describe the value returned 45 | %% If it is a LIST, use 46 | %% \item{comp1 }{Description of 'comp1'} 47 | %% \item{comp2 }{Description of 'comp2'} 48 | %% ... 49 | } 50 | \references{ 51 | http://code.google.com/p/rf-ace 52 | } 53 | \author{ 54 | Timo Erkkila 55 | } 56 | \note{ 57 | %% ~~further notes~~ 58 | } 59 | 60 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 61 | 62 | \seealso{ 63 | \code{ \link{read.afm}, \link{rface.train}, \link{rface.predict}, \link{rface.save}, \link{rface.load} } 64 | } 65 | \examples{ 66 | 67 | afmFile <- "test_103by300_mixed_nan_matrix.afm"; 68 | target <- "N:output"; 69 | 70 | nTrees <- 100; 71 | mTry <- 30; 72 | 73 | nThreads <- 4; 74 | 75 | filterData <- read.afm(afmFile); 76 | 77 | associations <- rface.filter(filterData, target, nTrees = nTrees, mTry = mTry, nThreads = nThreads); 78 | 79 | } 80 | % Add one or more standard keywords, see file 'KEYWORDS' in the 81 | % R documentation directory. 
82 | \keyword{ read.afm } 83 | \keyword{ rface.filter } 84 | -------------------------------------------------------------------------------- /man/rface.predict.Rd: -------------------------------------------------------------------------------- 1 | \name{rface.predict} 2 | \alias{rface.predict} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | foo bar 6 | } 7 | \description{ 8 | %% ~~ A concise (1-5 lines) description of what the function does. ~~ 9 | } 10 | \usage{ 11 | rface.predict(predictor, testData) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{predictor}{ 16 | %% ~~Describe \code{predictor} here~~ 17 | } 18 | \item{testData}{ 19 | %% ~~Describe \code{testData} here~~ 20 | } 21 | } 22 | \details{ 23 | %% ~~ If necessary, more details than the description above ~~ 24 | } 25 | \value{ 26 | %% ~Describe the value returned 27 | %% If it is a LIST, use 28 | %% \item{comp1 }{Description of 'comp1'} 29 | %% \item{comp2 }{Description of 'comp2'} 30 | %% ... 31 | } 32 | \references{ 33 | %% ~put references to the literature/web site here ~ 34 | } 35 | \author{ 36 | %% ~~who you are~~ 37 | } 38 | \note{ 39 | %% ~~further notes~~ 40 | } 41 | 42 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 43 | 44 | \seealso{ 45 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 46 | } 47 | \examples{ 48 | ##---- Should be DIRECTLY executable !! ---- 49 | ##-- ==> Define data, use random, 50 | ##-- or do help(data=index) for the standard data sets. 51 | 52 | ## The function is currently defined as 53 | function (predictor, testData) 54 | { 55 | .Call("rfacePredict", predictor, testData) 56 | } 57 | } 58 | % Add one or more standard keywords, see file 'KEYWORDS' in the 59 | % R documentation directory. 
60 | \keyword{ ~kwd1 } 61 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 62 | -------------------------------------------------------------------------------- /man/rface.train.Rd: -------------------------------------------------------------------------------- 1 | \name{rface.train} 2 | \alias{rface.train} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Builds an RF-ACE predictor object. 6 | } 7 | \description{ 8 | Builds an RF-ACE predictor object. 9 | } 10 | \usage{ 11 | predictorObj <- rface.train(trainData, target, featureWeights = vector(length=0), nTrees = 100, mTry = 10, nodeSize = 3, nMaxLeaves = 1000, quantiles = vector(length=0), nThreads = 1) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{trainData}{ 16 | A data.frame storing the training data. trainData[featureName] stores a vector of strings (categorical feature) or floats (numerical feature). 17 | } 18 | \item{target}{ 19 | An integer or string, pointing to a feature in trainData, i.e. trainData[target]. 20 | } 21 | \item{featureWeights}{ 22 | A vector of nonnegative weights for the features; affects the sampling distribution. By default all features get weight 1 corresponding uniform sampling. 23 | } 24 | \item{nTrees}{ 25 | Number of trees in the forest. Default 100. 26 | } 27 | \item{mTry}{ 28 | Number of randomly sampled candidate features per split. Default 10. 29 | } 30 | \item{nodeSize}{ 31 | Minimum number of train samples per node. Default 3. 32 | } 33 | \item{nMaxLeaves}{ 34 | Maximum number of leaves per tree. Default 1000. 35 | } 36 | \item{quantiles}{ 37 | A vector of quantile points to provide predictions for. If empty, mean prediction will be calculated. Quantiles are only applicable in regression. 38 | } 39 | \item{nThreads}{ 40 | Number of CPU threads to train the model with. Default 1. 
41 | } 42 | } 43 | \details{ 44 | 45 | } 46 | \value{ 47 | %% ~Describe the value returned 48 | %% If it is a LIST, use 49 | %% \item{comp1 }{Description of 'comp1'} 50 | %% \item{comp2 }{Description of 'comp2'} 51 | %% ... 52 | } 53 | \references{ 54 | http://code.google.com/p/rf-ace 55 | } 56 | \author{ 57 | Timo Erkkila 58 | } 59 | \note{ 60 | %% ~~further notes~~ 61 | } 62 | 63 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 64 | 65 | \seealso{ 66 | \code{ \link{read.afm}, \link{rface.filter}, \link{rface.predict}, \link{rface.save}, \link{rface.load} } 67 | } 68 | \examples{ 69 | 70 | afmFile <- "test_103by300_mixed_nan_matrix.afm"; 71 | target <- "N:output"; 72 | 73 | nTrees <- 100; 74 | mTry <- 30; 75 | 76 | nThreads <- 4; 77 | 78 | trainData <- read.afm(afmFile); 79 | 80 | predictorObj <- rface.train(trainData, target, nTrees = nTrees, mTry = mTry, nThreads = nThreads); 81 | 82 | } 83 | % Add one or more standard keywords, see file 'KEYWORDS' in the 84 | % R documentation directory. 
85 | \keyword{ read.afm } 86 | \keyword{ rface.predict } 87 | -------------------------------------------------------------------------------- /man/rfacer.Rd: -------------------------------------------------------------------------------- 1 | \name{skeleton-package} 2 | \alias{skeleton-package} 3 | \alias{skeleton} 4 | \docType{package} 5 | \title{ 6 | What the package does (short line) 7 | ~~ package title ~~ 8 | } 9 | \description{ 10 | More about what it does (maybe more than one line) 11 | ~~ A concise (1-5 lines) description of the package ~~ 12 | } 13 | \details{ 14 | \tabular{ll}{ 15 | Package: \tab skeleton\cr 16 | Type: \tab Package\cr 17 | Version: \tab 1.0\cr 18 | Date: \tab 2012-10-01\cr 19 | License: \tab What license is it under?\cr 20 | } 21 | ~~ An overview of how to use the package, including the most important ~~ 22 | ~~ functions ~~ 23 | } 24 | \author{ 25 | Who wrote it 26 | 27 | Maintainer: Who to complain to 28 | ~~ The author and/or maintainer of the package ~~ 29 | } 30 | \references{ 31 | ~~ Literature or other references for background information ~~ 32 | } 33 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ 34 | ~~ the R documentation directory ~~ 35 | \keyword{ package } 36 | \seealso{ 37 | ~~ Optional links to other man pages, e.g. 
~~ 38 | ~~ \code{\link[:-package]{}} ~~ 39 | } 40 | \examples{ 41 | ~~ simple examples of the most important functions ~~ 42 | } 43 | -------------------------------------------------------------------------------- /matlab/rface_filter.cpp: -------------------------------------------------------------------------------- 1 | #include "mex.h" 2 | 3 | void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { 4 | 5 | 6 | 7 | } 8 | -------------------------------------------------------------------------------- /rf-ace-launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Generic launcher for RF-ACE 3 | # 4 | # This script is used primarily with GenePattern, allowing us to specify 5 | # all options explicitly, with defaults provided by GenePattern's web 6 | # interface. In effect, this makes all parameters positional. 7 | # 8 | # The GenePattern command line for this is: 9 | # sh rf-ace-launcher.sh \ 10 | # \ 11 | # \ 12 | # "" 13 | 14 | export PATH=$1:$PATH 15 | 16 | chmod a+x $1/rf_ace 17 | echo "Running: \ 18 | rf_ace --input=$2 --target=$3 --output=$4 --RF_ntrees=$5 \ 19 | --RF_mtry=$6 --RF_nodesize=$7 --RF_nperms=$8 --RF_pthreshold=$9 \ 20 | --GBT_ntrees=${10} --GBT_maxleaves=${11} --GBT_shrinkage=${12} \ 21 | --GBT_samplesize=${13} ${14}" 22 | 23 | rf_ace --input=$2 --target=$3 --output=$4 --RF_ntrees=$5 \ 24 | --RF_mtry=$6 --RF_nodesize=$7 --RF_nperms=$8 --RF_pthreshold=$9 \ 25 | --GBT_ntrees=${10} --GBT_maxleaves=${11} --GBT_shrinkage=${12} \ 26 | --GBT_samplesize=${13} ${14} 27 | -------------------------------------------------------------------------------- /rf_ace_batch.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | assert sys.argv[1] != sys.argv[4] 4 | assert int(sys.argv[2]) <= int(sys.argv[3]) 5 | 6 | xlist = xrange(int(sys.argv[2]),int(sys.argv[3])+1) 7 | 8 | for targetidx in xlist: 9 | os.system('bin/rf_ace 
--traindata '+sys.argv[1]+' --target '+str(targetidx)+' --associations '+sys.argv[4]+'_'+str(targetidx)) 10 | 11 | os.system('cat '+sys.argv[4]+'_* > '+sys.argv[4]) 12 | os.system('rm '+sys.argv[4]+'_*') 13 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | ## Prepare C++ compiler flags 2 | PKG_CPPFLAGS=$(shell ${R_HOME}/bin/Rscript -e 'Rcpp:::CxxFlags()') -std=c++0x -Wall -Wextra -pedantic -DNOTHREADS 3 | 4 | ## Prepare library flags 5 | PKG_LIBS=$(shell ${R_HOME}/bin/Rscript -e 'Rcpp:::LdFlags()') 6 | 7 | ## Make shared library 8 | ## R CMD SHLIB -o lib/rf_ace_R.so src/rf_ace_R.cpp src/progress.cpp src/statistics.cpp src/math.cpp src/stochasticforest.cpp src/rootnode.cpp src/node.cpp src/treedata.cpp src/datadefs.cpp src/utils.cpp src/distributions.cpp -------------------------------------------------------------------------------- /src/argparse.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ARGPARSE_HPP 2 | #define ARGPARSE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "errno.hpp" 11 | //#include "exceptions.hpp" 12 | 13 | using namespace std; 14 | 15 | /** 16 | * Generic argument parser that allows options to be checked 17 | * speculatively. Currently uses a map representation of the argument tree. 18 | */ 19 | class ArgParse { 20 | 21 | public: 22 | 23 | ArgParse() {} 24 | 25 | ArgParse(const int argc, char* const argv[]) { 26 | if (argc < 1) { 27 | throw ERRNO::INVALID_ARGUMENT; 28 | //throw EXCEPTION_INVALID_ARGUMENT; 29 | } 30 | 31 | string currArg = ""; 32 | for (int i = 0; i < argc; ++i) { 33 | 34 | try { 35 | // !! Correctness Note: this strategy may attempt to dereference 36 | // !! corrupt memory. Thus, these runtime checks, while better than an 37 | // !! 
outright crash, may imply security vulnerabilities in dependent 38 | // !! code. Beware! 39 | if (argv[i] == NULL || argv[i][0] == 0x0) { 40 | throw ERRNO::INVALID_ARGUMENT; 41 | } else { 42 | if (!currArg.empty()) { 43 | mappedArgs[currArg] = string(argv[i]); 44 | currArg = string(""); 45 | //continue; 46 | } 47 | switch(argv[i][0]) { 48 | case '-': 49 | case '+': 50 | if (argv[i][1] == '\0') { break; } 51 | if (argv[i][1] == '-' || 52 | argv[i][1] == '+') { 53 | 54 | stringstream argSS; 55 | bool containsEquals = false; 56 | size_t idx; 57 | size_t len = strlen(argv[i]); 58 | for (idx = 2; idx < len; ++idx) { 59 | if (argv[i][idx] == '=') { 60 | containsEquals = true; 61 | break; 62 | } 63 | argSS << (char)argv[i][idx]; 64 | } 65 | 66 | currArg = argSS.str(); 67 | if (containsEquals) { 68 | stringstream valSS; 69 | for (++idx; idx < len; ++idx) { 70 | valSS << (char)argv[i][idx]; 71 | } 72 | mappedArgs[currArg] = valSS.str(); 73 | currArg = string(""); 74 | } 75 | 76 | 77 | } else { 78 | size_t len = strlen(argv[i]); 79 | for (size_t idx = 1; idx < len; ++idx) { 80 | char arg[] = {argv[i][idx], '\0'}; 81 | mappedArgs[string(arg)] = string(""); 82 | currArg = string(arg); 83 | } 84 | } 85 | 86 | break; 87 | default: 88 | extraArgs.push_back(string(argv[i])); 89 | break; 90 | } 91 | } 92 | } catch (...) { 93 | //assert(false); // Check if argv was corrupt or overstepped. Implies a 94 | // major FIXME if this is hit. (Disabled in lieu of 95 | // runtime checks during testing) 96 | 97 | throw ERRNO::ILLEGAL_MEMORY_ACCESS; // Perform a safer runtime check 98 | // that should never be hit by 99 | // correct code. 
100 | } 101 | } 102 | if (!currArg.empty()) { 103 | mappedArgs[currArg] = string(""); 104 | currArg = string(""); 105 | } 106 | 107 | /* 108 | cout << "Mapped args:" << endl; 109 | for (map::iterator it = mappedArgs.begin(); it != mappedArgs.end(); ++it) { 110 | cout << (*it).first << "->" << (*it).second << endl; 111 | } 112 | 113 | cout << "Extra args:" << endl; 114 | for (int i = 0; i < extraArgs.size(); ++i) { 115 | cout << extraArgs[i] << endl; 116 | }*/ 117 | } 118 | ~ArgParse() {} 119 | 120 | /** 121 | * Queries the backend map for the current argument-value pair. Extra 122 | * arguments passed positionally are not yet supported. 123 | * 124 | * Contractual guarantees: 125 | * 126 | * + The behavior of the extraction operator (>>) will be used with input 127 | * type T. This may cause unexpected results if your type explicitly 128 | * specifies an append instead of an overwrite for this 129 | * operation. Declare the contents of returnVal carefully or redefine 130 | * your type for these cases. 131 | * 132 | * + Certain types specifiable for T may cause memory access violations that 133 | * are difficult to debug. For example, specifying char* may throw a 134 | * memory access violation at 'SOptarg >> returnVal'. It is expected and 135 | * indeed required that your types be well-defined before passing them 136 | * to this method. 137 | * 138 | * + Attempting to pass a NULL pointer for any input value will not 139 | * work. Don't do it. 140 | * 141 | * + Duplicate arguments will prefer the last long specification over the 142 | * last short specification of that same argument. 143 | * 144 | * + Arguments are case-sensitive. 145 | * 146 | * Sets returnVal and returns true if an argument was found; false 147 | * otherwise. !! 
TODO return a unified status code instead 148 | */ 149 | template bool getArgument(const char* shortName, const char* longName, T& returnVal) { 150 | 151 | assert(shortName != NULL); 152 | assert(longName != NULL); 153 | assert(strlen(shortName) == 1); 154 | assert(*longName != 0); 155 | 156 | map::iterator it = mappedArgs.find(longName); 157 | if (it == mappedArgs.end()) { 158 | it = mappedArgs.find(shortName); 159 | } 160 | 161 | if (it != mappedArgs.end()) { 162 | string found = (*it).second; 163 | if (found.empty()) { 164 | throw ERRNO::INVALID_VALUE; 165 | } 166 | stringstream ss(found); 167 | ss >> returnVal; 168 | 169 | if (ss.fail() || !ss.eof()) { 170 | throw ERRNO::INVALID_VALUE; 171 | } 172 | return true; 173 | } 174 | 175 | return false; 176 | } 177 | 178 | template bool getArgument(const string& shortName, const string& longName, T& returnVal) { 179 | return getArgument(shortName.c_str(), longName.c_str(), returnVal); 180 | } 181 | 182 | /** 183 | * Queries the backend map for the presence of a flag. This can also be used 184 | * to check for the presence of an argument-value pair or non-presence of a 185 | * value, but abusing this functionality is not recommended. 186 | * 187 | * Contractual guarantees: 188 | * 189 | * + Attempting to pass a NULL pointer for any input value will not 190 | * work. Don't do it. 191 | * 192 | * + Duplicate flags are assumed to be one instance of the set 193 | * flag. Conflicting, non-duplicate flags are your problem. 194 | * 195 | * + Flags are case-sensitive. 
196 | * 197 | */ 198 | bool getFlag(const char* shortName, const char* longName, bool& returnVal) { 199 | 200 | assert(shortName != NULL); 201 | assert(longName != NULL); 202 | assert(strlen(shortName) == 1); 203 | assert(*longName != 0); 204 | 205 | map::iterator it = mappedArgs.find(longName); 206 | if (it == mappedArgs.end()) { 207 | it = mappedArgs.find(shortName); 208 | } 209 | 210 | if (it != mappedArgs.end()) { 211 | returnVal = true; 212 | return true; 213 | } 214 | 215 | return false; 216 | } 217 | 218 | bool getFlag(const string& shortName, const string& longName, bool& returnVal) { 219 | return getFlag(shortName.c_str(), longName.c_str(), returnVal); 220 | } 221 | 222 | 223 | private: 224 | map mappedArgs; 225 | vector extraArgs; 226 | }; 227 | 228 | #endif 229 | -------------------------------------------------------------------------------- /src/datadefs.cpp: -------------------------------------------------------------------------------- 1 | #include "datadefs.hpp" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef NOTHREADS 10 | #include 11 | #endif 12 | 13 | 14 | using namespace std; 15 | //using datadefs::ForestType; 16 | 17 | //////////////////////////////////////////////////////////// 18 | // CONSTANTS 19 | //////////////////////////////////////////////////////////// 20 | const datadefs::num_t datadefs::NUM_NAN = numeric_limits::quiet_NaN();//numeric_limits::infinity(); 21 | //const datadefs::cat_t datadefs::CAT_NAN = "NA"; 22 | const string datadefs::STR_NAN = "NA"; 23 | const datadefs::num_t datadefs::NUM_INF = numeric_limits::infinity(); 24 | const size_t datadefs::MAX_IDX = numeric_limits::max() - 1; 25 | const datadefs::num_t datadefs::EPS = 1e-18; //1e-12; 26 | const datadefs::num_t datadefs::NUM_PI = 3.1415926535; 27 | const datadefs::num_t datadefs::A = 0.140012; 28 | const datadefs::num_t datadefs::LOG_OF_MAX_NUM = 70.0; /** !! Potentially 29 | * spurious. 
Do you mean 30 | * the log of the 31 | * maximum number 32 | * expressible as a 33 | * num_t? */ 34 | 35 | const char datadefs::tokenDelimiters[] = " \t,.;:?!@'\"-\n\0"; 36 | 37 | // List of NaN's adapted from http://en.wikipedia.org/wiki/NaN#Display 38 | const set datadefs::NANs = {"NA","NAN","NULL","?"}; 39 | 40 | const string datadefs::CONTRAST = "CONTRAST"; 41 | 42 | #ifndef NOTHREADS 43 | const size_t datadefs::MAX_THREADS = thread::hardware_concurrency(); 44 | #endif 45 | 46 | #ifdef NOTHREADS 47 | const size_t datadefs::MAX_THREADS = 1; 48 | #endif 49 | 50 | //enum ForestType {RF, GBT, CART, UNKNOWN}; 51 | 52 | const map datadefs::forestTypeAssign = { {"RF",datadefs::forest_t::RF}, {"GBT",datadefs::forest_t::GBT}, {"QRF",datadefs::forest_t::QRF} }; 53 | 54 | const bool datadefs::SF_DEFAULT_NO_NA_BRANCHING = false; 55 | const vector datadefs::SF_DEFAULT_QUANTILES = {}; 56 | 57 | // Random Forest default configuration 58 | const size_t datadefs::RF_DEFAULT_N_TREES = 100; 59 | const size_t datadefs::RF_DEFAULT_M_TRY = 0; 60 | const size_t datadefs::RF_DEFAULT_N_MAX_LEAVES = datadefs::MAX_IDX; 61 | const size_t datadefs::RF_DEFAULT_NODE_SIZE = 3; 62 | const datadefs::num_t datadefs::RF_DEFAULT_IN_BOX_FRACTION = 1.0; 63 | const datadefs::num_t datadefs::RF_DEFAULT_SAMPLE_WITH_REPLACEMENT = true; 64 | const bool datadefs::RF_DEFAULT_USE_CONTRASTS = false; 65 | const datadefs::num_t datadefs::RF_DEFAULT_CONTRAST_FRACTION = 0.5; 66 | const bool datadefs::RF_DEFAULT_IS_RANDOM_SPLIT = true; 67 | const datadefs::num_t datadefs::RF_DEFAULT_SHRINKAGE = 0.0; 68 | const vector datadefs::RF_DEFAULT_QUANTILES = {}; 69 | const size_t datadefs::RF_DEFAULT_N_SAMPLES_FOR_QUANTILES = 0; 70 | 71 | // Random Forest default configuration 72 | const size_t datadefs::QRF_DEFAULT_N_TREES = 100; 73 | const size_t datadefs::QRF_DEFAULT_M_TRY = 0; 74 | const size_t datadefs::QRF_DEFAULT_N_MAX_LEAVES = datadefs::MAX_IDX; 75 | const size_t datadefs::QRF_DEFAULT_NODE_SIZE = 3; 76 | const 
datadefs::num_t datadefs::QRF_DEFAULT_IN_BOX_FRACTION = 1.0; 77 | const datadefs::num_t datadefs::QRF_DEFAULT_SAMPLE_WITH_REPLACEMENT = true; 78 | const bool datadefs::QRF_DEFAULT_USE_CONTRASTS = false; 79 | const datadefs::num_t datadefs::QRF_DEFAULT_CONTRAST_FRACTION = 0.5; 80 | const bool datadefs::QRF_DEFAULT_IS_RANDOM_SPLIT = true; 81 | const datadefs::num_t datadefs::QRF_DEFAULT_SHRINKAGE = 0.0; 82 | const vector datadefs::QRF_DEFAULT_QUANTILES = {0.25,0.5,0.75}; 83 | const size_t datadefs::QRF_DEFAULT_N_SAMPLES_FOR_QUANTILES = 10; 84 | 85 | // Gradient Boosting Trees default configuration 86 | const size_t datadefs::GBT_DEFAULT_N_TREES = 100; 87 | const size_t datadefs::GBT_DEFAULT_M_TRY = 0; 88 | const size_t datadefs::GBT_DEFAULT_N_MAX_LEAVES = 6; 89 | const size_t datadefs::GBT_DEFAULT_NODE_SIZE = 3; 90 | const datadefs::num_t datadefs::GBT_DEFAULT_IN_BOX_FRACTION = 0.5; 91 | const datadefs::num_t datadefs::GBT_DEFAULT_SAMPLE_WITH_REPLACEMENT = false; 92 | const bool datadefs::GBT_DEFAULT_USE_CONTRASTS = false; 93 | const datadefs::num_t datadefs::GBT_DEFAULT_CONTRAST_FRACTION = 0.5; 94 | const bool datadefs::GBT_DEFAULT_IS_RANDOM_SPLIT = false; 95 | const datadefs::num_t datadefs::GBT_DEFAULT_SHRINKAGE = 0.1; 96 | const vector datadefs::GBT_DEFAULT_QUANTILES = {}; 97 | const size_t datadefs::GBT_DEFAULT_N_SAMPLES_FOR_QUANTILES = 0; 98 | 99 | // Statistical test default configuration 100 | const size_t datadefs::FILTER_DEFAULT_N_PERMS = 20; 101 | const datadefs::num_t datadefs::FILTER_DEFAULT_P_VALUE_THRESHOLD = 0.05; 102 | const bool datadefs::FILTER_DEFAULT_IS_ADJUSTED_P_VALUE = false; 103 | const datadefs::num_t datadefs::FILTER_DEFAULT_IMPORTANCE_THRESHOLD = 10; 104 | const bool datadefs::FILTER_NORMALIZE_IMPORTANCE_VALUES = false; 105 | const bool datadefs::FILTER_DEFAULT_REPORT_NONEXISTENT_FEATURES = false; 106 | 107 | // Default general configuration 108 | const bool datadefs::GENERAL_DEFAULT_PRINT_HELP = false; 109 | const char 
datadefs::GENERAL_DEFAULT_DATA_DELIMITER = '\t'; 110 | const char datadefs::GENERAL_DEFAULT_HEADER_DELIMITER = ':'; 111 | const size_t datadefs::GENERAL_DEFAULT_MIN_SAMPLES = 5; 112 | const int datadefs::GENERAL_DEFAULT_SEED = -1; 113 | const size_t datadefs::GENERAL_DEFAULT_N_THREADS = 1; 114 | const bool datadefs::GENERAL_DEFAULT_IS_MAX_THREADS = false; 115 | const datadefs::num_t datadefs::GENERAL_DEFAULT_FEATURE_WEIGHT = 0; 116 | 117 | //////////////////////////////////////////////////////////// 118 | // HELPER FUNCTIONS 119 | //////////////////////////////////////////////////////////// 120 | 121 | /** 122 | * Promote each character in a sequence to uppercase. Effectively, a wrapper 123 | * around std::transform. 124 | */ 125 | 126 | string datadefs::toUpperCase(const string& str) { 127 | int (*pf)(int) = toupper; 128 | string strcopy(str); 129 | transform(strcopy.begin(), strcopy.end(), strcopy.begin(), pf); 130 | return(strcopy); 131 | } 132 | 133 | bool datadefs::isInteger(const string& str, int& integer) { 134 | stringstream ss(str); 135 | if(ss >> integer && ss.eof()) { 136 | return(true); 137 | } else { 138 | return(false); 139 | } 140 | } 141 | 142 | 143 | /** 144 | * Count all values that aren't transfinite 145 | !! Correctness: what about representations of infinity? And to be entirely 146 | pedantic: signaling NaN, post-trap? These should have specific non-guarantees. 
147 | */ 148 | void datadefs::countRealValues(vector const& data, size_t& nRealValues) { 149 | nRealValues = 0; 150 | for(size_t i = 0; i < data.size(); ++i) { 151 | if(!datadefs::isNAN(data[i])) { 152 | ++nRealValues; 153 | } 154 | } 155 | } 156 | 157 | -------------------------------------------------------------------------------- /src/densetreedata.hpp: -------------------------------------------------------------------------------- 1 | //densetreedata.hpp 2 | // 3 | // 4 | 5 | #ifndef DENSETREEDATA_HPP 6 | #define DENSETREEDATA_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "datadefs.hpp" 15 | #include "distributions.hpp" 16 | #include "options.hpp" 17 | #include "feature.hpp" 18 | #include "reader.hpp" 19 | #include "treedata.hpp" 20 | 21 | using namespace std; 22 | using datadefs::num_t; 23 | 24 | class DenseTreeData : public TreeData { 25 | public: 26 | 27 | // Initializes the object 28 | DenseTreeData(const vector& features, bool useContrasts = false, const vector& sampleHeaders = vector(0)); 29 | 30 | // Initializes the object and reads in a data matrix 31 | DenseTreeData(string fileName, const char dataDelimiter, const char headerDelimiter, const bool useContrasts = false); 32 | 33 | ~DenseTreeData(); 34 | 35 | // Reveals the Feature class interface to the user 36 | const Feature* feature(const size_t featureIdx) const { 37 | return( &features_[featureIdx] ); 38 | } 39 | 40 | // Returns the number of features 41 | size_t nFeatures() const; 42 | 43 | // Returns feature index, given the name 44 | size_t getFeatureIdx(const string& featureName) const; 45 | 46 | // A value denoting the "one-over-last" feature in matrix 47 | size_t end() const { return( datadefs::MAX_IDX ); } 48 | 49 | // Returns sample name, given sample index 50 | string getSampleName(const size_t sampleIdx); 51 | 52 | // Returns the number of samples 53 | size_t nSamples() const; 54 | 55 | vector getFeatureWeights() const; 56 | 57 | void 
separateMissingSamples(const size_t featureIdx, 58 | vector& sampleIcs, 59 | vector& missingIcs); 60 | 61 | num_t numericalFeatureSplit(const size_t targetIdx, 62 | const size_t featureIdx, 63 | const size_t minSamples, 64 | vector& sampleIcs_left, 65 | vector& sampleIcs_right, 66 | num_t& splitValue); 67 | 68 | num_t categoricalFeatureSplit(const size_t targetIdx, 69 | const size_t featureIdx, 70 | const vector& catOrder, 71 | const size_t minSamples, 72 | vector& sampleIcs_left, 73 | vector& sampleIcs_right, 74 | unordered_set& splitValues_left); 75 | 76 | num_t textualFeatureSplit(const size_t targetIdx, 77 | const size_t featureIdx, 78 | const uint32_t hashIdx, 79 | const size_t minSamples, 80 | vector& sampleIcs_left, 81 | vector& sampleIcs_right); 82 | 83 | //string getRawFeatureData(const size_t featureIdx, const size_t sampleIdx); 84 | //string getRawFeatureData(const size_t featureIdx, const num_t data); 85 | //vector getRawFeatureData(const size_t featureIdx); 86 | 87 | // Generates a bootstrap sample from the real samples of featureIdx. Samples not in the bootstrap sample will be stored in oob_ics, 88 | // and the number of oob samples is stored in noob. 
89 | void bootstrapFromRealSamples(distributions::Random* random, 90 | const bool withReplacement, 91 | const num_t sampleSize, 92 | const size_t featureIdx, 93 | vector& ics, 94 | vector& oobIcs); 95 | 96 | void createContrasts(); 97 | void permuteContrasts(distributions::Random* random); 98 | 99 | void replaceFeatureData(const size_t featureIdx, const vector& featureData); 100 | void replaceFeatureData(const size_t featureIdx, const vector& rawFeatureData); 101 | 102 | 103 | #ifndef TEST__ 104 | private: 105 | #endif 106 | 107 | enum FileType {UNKNOWN, AFM, ARFF}; 108 | 109 | FileType getFileType(const string& fileName); 110 | 111 | bool isRowsAsSamplesInAFM(Reader& reader, const char headerDelimiter); 112 | 113 | void readAFM(const string& fileName, const char dataDelimiter, const char headerDelimiter); 114 | //void readARFF(const string& fileName); 115 | 116 | //void parseARFFattribute(const string& str, string& attributeName, bool& isFeatureNumerical); 117 | 118 | bool isValidNumericalHeader(const string& str, const char headerDelimiter); 119 | bool isValidCategoricalHeader(const string& str, const char headerDelimiter); 120 | bool isValidTextHeader(const string& str, const char headerDelimiter); 121 | bool isValidFeatureHeader(const string& str, const char headerDelimiter); 122 | 123 | //template void transpose(vector >& mat); 124 | 125 | bool useContrasts_; 126 | 127 | vector features_; 128 | vector sampleHeaders_; 129 | 130 | unordered_map name2idx_; 131 | 132 | }; 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /src/distributions.cpp: -------------------------------------------------------------------------------- 1 | #include "distributions.hpp" 2 | 3 | #include "utils.hpp" 4 | 5 | using datadefs::num_t; 6 | 7 | distributions::Random::Random(): 8 | rand_(0,datadefs::MAX_IDX) { 9 | this->seed( distributions::generateSeed() ); 10 | } 11 | 12 | distributions::Random::Random(size_t seed): 13 | 
rand_(0,datadefs::MAX_IDX) { 14 | 15 | this->seed(seed); 16 | 17 | } 18 | 19 | distributions::Random::~Random() { 20 | 21 | } 22 | 23 | void distributions::Random::seed(size_t seed) { 24 | eng_.seed(seed); 25 | } 26 | 27 | size_t distributions::Random::integer() { 28 | return( rand_(eng_) ); 29 | } 30 | 31 | num_t distributions::Random::uniform() { 32 | 33 | return( 1.0 * rand_(eng_) / ( datadefs::MAX_IDX + 1 ) ); 34 | 35 | } 36 | 37 | distributions::PMF::PMF(const vector& weights) { 38 | 39 | size_t n = weights.size(); 40 | 41 | num_t sum = 0.0; 42 | 43 | for ( size_t i = 0; i < n; ++i ) { 44 | assert( weights[i] >= 0.0 ); 45 | sum += weights[i]; 46 | } 47 | 48 | prob_.resize(n); 49 | alias_.resize(n); 50 | 51 | vector HL(n); 52 | vector::iterator H(HL.begin()-1); 53 | vector::iterator L(HL.end()); 54 | 55 | for ( size_t i = 0; i < n; ++i ) { 56 | prob_[i] = weights[i] / sum * n; 57 | if ( prob_[i] < 1.0 ) { 58 | *(++H) = i; 59 | } else { 60 | *(--L) = i; 61 | } 62 | } 63 | 64 | for ( size_t k = 0; k < n-1; k++ ) { 65 | size_t i = HL[k]; 66 | size_t j = *L; 67 | alias_[i] = j; 68 | prob_[j] += prob_[i] - 1.0; 69 | if ( prob_[j] < 1.0 ) { 70 | L++; 71 | } 72 | if ( L >= HL.end() ) { 73 | break; 74 | } 75 | } 76 | 77 | } 78 | 79 | distributions::PMF::~PMF() { } 80 | 81 | size_t distributions::PMF::sample(distributions::Random* random) const { 82 | 83 | size_t i = random->integer() % prob_.size(); 84 | 85 | return( random->uniform() < prob_[i] ? 
i : alias_[i] ); 86 | 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/distributions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DISTRIBUTIONS_HPP 2 | #define DISTRIBUTIONS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "datadefs.hpp" 9 | 10 | using namespace std; 11 | 12 | namespace distributions { 13 | 14 | typedef tr1::mt19937 Engine; 15 | // typedef std::ranlux_base_01 Engine; 16 | 17 | inline unsigned int generateSeed() { return( clock() + time(0) ); } 18 | 19 | class Random { 20 | public: 21 | 22 | // Initialize the generator 23 | Random(); 24 | Random(size_t seed); 25 | 26 | // Destructor 27 | ~Random(); 28 | 29 | void seed(size_t seed); 30 | 31 | // Return random int 32 | size_t integer(); 33 | 34 | // Generate and normalize random int 35 | datadefs::num_t uniform(); 36 | 37 | size_t minIdx() { return( 0 ); } 38 | size_t maxIdx() { return( datadefs::MAX_IDX ); } 39 | 40 | private: 41 | 42 | Engine eng_; 43 | tr1::uniform_int rand_; 44 | 45 | }; 46 | 47 | class PMF { 48 | public: 49 | 50 | PMF(const vector& weights); 51 | ~PMF(); 52 | 53 | size_t sample(Random* random) const; 54 | 55 | #ifndef TEST__ 56 | private: 57 | #endif 58 | 59 | vector prob_; 60 | vector alias_; 61 | 62 | }; 63 | 64 | } 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /src/errno.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ERRNO_HPP 2 | #define ERRNO_HPP 3 | 4 | enum ERRNO { SUCCESS, 5 | UNDEFINED, 6 | INVALID_ARGUMENT, 7 | INVALID_VALUE, 8 | NULL_POINTER_DEREFENCE, 9 | ILLEGAL_MEMORY_ACCESS, 10 | NUMERIC_OVERFLOW, 11 | NUMERIC_UNDERFLOW, 12 | PARTIAL_READ, 13 | INVALID_READ, 14 | NAN_GIVEN_FOR_SORTING }; 15 | 16 | #endif // ERRNO_HPP 17 | -------------------------------------------------------------------------------- /src/exceptions.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef EXCEPTIONS_HPP 2 | #define EXCEPTIONS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "errno.hpp" 9 | 10 | class RFACE_EXCEPTION : public exception { 11 | 12 | public: 13 | 14 | RFACE_EXCEPTION(const ERRNO& errno, const std::string& note = "") throw():errno_(errno) { 15 | stringstream ss; 16 | ss << "ERRNO (" << errno_ << "): "; 17 | switch ( errno_ ) { 18 | case ERRNO::INVALID_ARGUMENT: 19 | ss << "invalid command-line argument."; 20 | break; 21 | case ERRNO::INVALID_VALUE: 22 | ss << "invalid command-line value."; 23 | break; 24 | case ERRNO::ILLEGAL_MEMORY_ACCESS: 25 | ss << "illegal memory access."; 26 | break; 27 | default: 28 | ss << "unknown exception!"; 29 | break; 30 | } 31 | ss << " " << note; 32 | msg_ = ss.str(); 33 | } 34 | 35 | ~RFACE_EXCEPTION() throw() {} 36 | 37 | virtual const char* what() const throw() { 38 | return msg_.c_str(); 39 | } 40 | 41 | private: 42 | std::string msg_; 43 | ERRNO errno_; 44 | 45 | }; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/feature.cpp: -------------------------------------------------------------------------------- 1 | #include "feature.hpp" 2 | 3 | #include 4 | 5 | #include "utils.hpp" 6 | 7 | 8 | Feature::Feature(): 9 | type_(Feature::Type::UNKNOWN) { 10 | } 11 | 12 | Feature::Feature(Feature::Type newType, const string& newName, const size_t nSamples): 13 | type_(newType), 14 | name_(newName) { 15 | 16 | if ( type_ == Feature::Type::NUM ) { 17 | numData.resize(nSamples); 18 | catData.clear(); 19 | txtData.clear(); 20 | } else if ( type_ == Feature::Type::CAT ) { 21 | numData.clear(); 22 | catData.resize(nSamples); 23 | txtData.clear(); 24 | } else { 25 | numData.clear(); 26 | catData.clear(); 27 | txtData.resize(nSamples); 28 | } 29 | 30 | } 31 | 32 | void Feature::setNumSampleValue(const size_t sampleIdx, const num_t val) { 33 | assert( type_ 
== Feature::Type::NUM ); 34 | numData[sampleIdx] = val; 35 | } 36 | 37 | void Feature::setCatSampleValue(const size_t sampleIdx, const cat_t& val) { 38 | assert( type_ == Feature::Type::CAT ); 39 | catData[sampleIdx] = val; 40 | } 41 | 42 | void Feature::setTxtSampleValue(const size_t sampleIdx, const string& str) { 43 | assert( type_ == Feature::Type::TXT ); 44 | if ( datadefs::isNAN(str) ) { 45 | txtData[sampleIdx] = utils::hashText(""); 46 | } else { 47 | txtData[sampleIdx] = utils::hashText(str); 48 | } 49 | } 50 | 51 | cat_t Feature::getCatData(const size_t sampleIdx) const { 52 | assert(type_ == Feature::Type::CAT); 53 | return(catData[sampleIdx]); 54 | } 55 | 56 | vector Feature::getCatData() const { 57 | assert(type_ == Feature::Type::CAT); 58 | return(catData); 59 | } 60 | 61 | vector Feature::getCatData(const vector& sampleIcs) const { 62 | assert(type_ == Feature::Type::CAT); 63 | vector data(sampleIcs.size()); 64 | for ( size_t i = 0; i < sampleIcs.size(); ++i ) { 65 | data[i] = catData[sampleIcs[i]]; 66 | } 67 | return(data); 68 | } 69 | 70 | num_t Feature::getNumData(const size_t sampleIdx) const { 71 | assert(type_ == Feature::Type::NUM); 72 | return(numData[sampleIdx]); 73 | } 74 | 75 | vector Feature::getNumData() const { 76 | assert(type_ == Feature::Type::NUM); 77 | return(numData); 78 | } 79 | 80 | vector Feature::getNumData(const vector& sampleIcs) const { 81 | assert(type_ == Feature::Type::NUM); 82 | vector data(sampleIcs.size()); 83 | for ( size_t i = 0; i < sampleIcs.size(); ++i ) { 84 | data[i] = numData[sampleIcs[i]]; 85 | } 86 | return(data); 87 | } 88 | 89 | unordered_set Feature::getTxtData(const size_t sampleIdx) const { 90 | assert(type_ == Feature::Type::TXT); 91 | return(txtData[sampleIdx]); 92 | } 93 | 94 | Feature::Feature(const vector& newNumData, const string& newName): 95 | type_(Feature::Type::NUM), 96 | name_(newName) { 97 | numData = newNumData; 98 | } 99 | 100 | Feature::Feature(const vector& newCatData, const string& 
newName): 101 | type_(Feature::Type::CAT), 102 | name_(newName) { 103 | catData = newCatData; 104 | } 105 | 106 | Feature::Feature(const vector& newTxtData, const string& newName, const bool doHash): 107 | type_(Feature::Type::TXT), 108 | name_(newName) { 109 | 110 | assert(doHash); 111 | 112 | size_t nSamples = newTxtData.size(); 113 | 114 | txtData.resize(nSamples); 115 | 116 | for ( size_t i = 0; i < nSamples; ++i ) { 117 | if ( datadefs::isNAN(newTxtData[i]) ) { 118 | txtData[i] = utils::hashText(""); 119 | } else { 120 | txtData[i] = utils::hashText(newTxtData[i]); 121 | } 122 | } 123 | 124 | } 125 | 126 | Feature::~Feature() { } 127 | 128 | bool Feature::isNumerical() const { 129 | return( type_ == Feature::Type::NUM ? true : false ); 130 | } 131 | 132 | bool Feature::isCategorical() const { 133 | return( type_ == Feature::Type::CAT ? true : false ); 134 | } 135 | 136 | bool Feature::isTextual() const { 137 | return( type_ == Feature::Type::TXT ? true : false ); 138 | } 139 | 140 | bool Feature::isMissing(const size_t sampleIdx) const { 141 | switch (type_) { 142 | case NUM: 143 | return( datadefs::isNAN(numData[sampleIdx]) ); 144 | case CAT: 145 | return( datadefs::isNAN(catData[sampleIdx]) ); 146 | case TXT: 147 | return( txtData[sampleIdx].size() == 0 ); 148 | case UNKNOWN: 149 | break; 150 | } 151 | 152 | cerr << "Feature::isMissing() -- tried to use with unset feature object!" << endl; 153 | exit(1); 154 | } 155 | 156 | size_t Feature::nSamples() const { 157 | switch ( type_ ) { 158 | case NUM: 159 | return( numData.size() ); 160 | case CAT: 161 | return( catData.size() ); 162 | case TXT: 163 | return( txtData.size() ); 164 | case UNKNOWN: 165 | break; 166 | } 167 | 168 | cerr << "Feature::nSamples() -- tried to use with unset feature object!" 
<< endl; 169 | exit(1); 170 | } 171 | 172 | size_t Feature::nRealSamples() const { 173 | 174 | size_t n = 0; 175 | 176 | for ( size_t i = 0; i < this->nSamples(); ++i ) { 177 | if ( !this->isMissing(i) ) { 178 | ++n; 179 | } 180 | } 181 | 182 | return(n); 183 | 184 | } 185 | 186 | string Feature::name() const { 187 | return( name_ ); 188 | } 189 | 190 | void Feature::setName(const string& newName) { 191 | assert( newName.length() > 0 ); 192 | name_ = newName; 193 | } 194 | 195 | 196 | vector Feature::categories() const { 197 | 198 | vector categories; 199 | 200 | if( this->isNumerical() || this->isTextual() ) { 201 | return( categories ); 202 | } 203 | 204 | unordered_set categoriesSet; 205 | 206 | for ( size_t i = 0; i < catData.size(); ++i ) { 207 | if ( !this->isMissing(i) ) { 208 | categoriesSet.insert(catData[i]); 209 | } 210 | } 211 | 212 | categories.resize(categoriesSet.size()); 213 | 214 | copy(categoriesSet.begin(),categoriesSet.end(),categories.begin()); 215 | 216 | return( categories ); 217 | 218 | } 219 | 220 | 221 | uint32_t Feature::getHash(const size_t sampleIdx, const size_t integer) const { 222 | 223 | assert( type_ == Feature::Type::TXT ); 224 | 225 | size_t pos = integer % this->txtData[sampleIdx].size(); 226 | 227 | unordered_set::const_iterator it(this->txtData[sampleIdx].begin()); 228 | for ( size_t i = 0; i < pos; ++i ) { 229 | it++; 230 | } 231 | 232 | return(*it); 233 | 234 | } 235 | 236 | bool Feature::hasHash(const size_t sampleIdx, const uint32_t hashIdx) const { 237 | 238 | return( this->txtData[sampleIdx].find(hashIdx) != this->txtData[sampleIdx].end() ); 239 | 240 | } 241 | 242 | unordered_map Feature::getHashKeyFrequency() const { 243 | 244 | size_t nSamples = txtData.size(); 245 | 246 | unordered_map visitedKeys; 247 | 248 | for ( size_t i = 0; i < nSamples; ++i ) { 249 | for ( unordered_set::const_iterator it(txtData[i].begin()); it != txtData[i].end(); ++it ) { 250 | visitedKeys[*it]++; 251 | } 252 | } 253 | 254 | 
return(visitedKeys); 255 | 256 | } 257 | 258 | num_t Feature::entropy() const { 259 | 260 | size_t nSamples = txtData.size(); 261 | 262 | unordered_map visitedKeys = getHashKeyFrequency(); 263 | 264 | unordered_map::const_iterator it(visitedKeys.begin()); 265 | 266 | num_t entropy = 0.0; 267 | 268 | for ( ; it != visitedKeys.end(); ++it ) { 269 | num_t f = static_cast(it->second) / static_cast(nSamples); 270 | if ( fabs(f) > 1e-5 && fabs(1-f) > 1e-5 ) { 271 | entropy -= (f * log(f) + (1-f)*log(1-f))/log(2); 272 | } 273 | } 274 | 275 | return(entropy); 276 | 277 | } 278 | 279 | void Feature::removeFrequentHashKeys(num_t fThreshold) { 280 | 281 | size_t nSamples = txtData.size(); 282 | 283 | const unordered_map visitedKeys = this->getHashKeyFrequency(); 284 | 285 | unordered_map::const_iterator it(visitedKeys.begin()); 286 | 287 | for ( ; it != visitedKeys.end(); ++it ) { 288 | num_t f = static_cast(it->second) / static_cast(nSamples); 289 | if ( f > fThreshold ) { 290 | for ( size_t sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { 291 | uint32_t hashKey = it->first; 292 | if ( txtData[sampleIdx].find(hashKey) != txtData[sampleIdx].end() ) { 293 | txtData[sampleIdx].erase(it->first); 294 | } 295 | } 296 | } 297 | } 298 | 299 | } 300 | -------------------------------------------------------------------------------- /src/feature.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FEATURE_HPP 2 | #define FEATURE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "datadefs.hpp" 11 | 12 | using namespace std; 13 | using datadefs::num_t; 14 | using datadefs::cat_t; 15 | 16 | class Feature { 17 | public: 18 | 19 | enum Type { NUM, CAT, TXT, UNKNOWN }; 20 | 21 | vector numData; 22 | vector catData; 23 | vector > txtData; 24 | 25 | Feature(); 26 | Feature(Type newType, const string& newName, const size_t nSamples); 27 | Feature(const vector& newNumData, const string& newName); 28 | 
Feature(const vector& newCatData, const string& newName); 29 | Feature(const vector& newTxtData, const string& newName, const bool doHash); 30 | ~Feature(); 31 | 32 | void setNumSampleValue(const size_t sampleIdx, const num_t val); 33 | void setCatSampleValue(const size_t sampleIdx, const cat_t& val); 34 | void setTxtSampleValue(const size_t sampleIdx, const string& str); 35 | 36 | num_t getNumData(const size_t sampleIdx) const; 37 | vector getNumData() const; 38 | vector getNumData(const vector& sampleIcs) const; 39 | 40 | cat_t getCatData(const size_t sampleIdx) const; 41 | vector getCatData() const; 42 | vector getCatData(const vector& sampleIcs) const; 43 | 44 | unordered_set getTxtData(const size_t sampleIdx) const; 45 | 46 | bool isNumerical() const; 47 | bool isCategorical() const; 48 | bool isTextual() const; 49 | 50 | bool isMissing(const size_t sampleIdx) const; 51 | 52 | size_t nSamples() const; 53 | size_t nRealSamples() const; 54 | 55 | string name() const; 56 | void setName(const string& newName); 57 | 58 | vector categories() const; 59 | 60 | uint32_t getHash(const size_t sampleIdx, const size_t integer) const; 61 | bool hasHash(const size_t sampleIdx, const uint32_t hashIdx) const; 62 | 63 | num_t entropy() const; 64 | 65 | unordered_map getHashKeyFrequency() const; 66 | 67 | void removeFrequentHashKeys(const num_t fThreshold); 68 | 69 | #ifndef TEST__ 70 | private: 71 | #endif 72 | 73 | Type type_; 74 | string name_; 75 | 76 | }; 77 | 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /src/math.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | //#include "gamma.hpp" 4 | #include "math.hpp" 5 | 6 | void math::transformLogistic(size_t nCategories, 7 | vector& prediction, 8 | vector& probability) { 9 | 10 | //size_t nCategories = trainData_->nCategories(); 11 | 12 | // Multiclass logistic transform of class probabilities from current 
probability estimates. 13 | assert(nCategories == prediction.size()); 14 | vector& expPrediction = probability; // just using the space by a different name 15 | 16 | // find maximum prediction 17 | vector::iterator maxPrediction = max_element(prediction.begin(),prediction.end()); 18 | // scale by maximum to prevent numerical errors 19 | 20 | num_t expSum = 0.0; 21 | size_t k; 22 | for (k = 0; k < nCategories; ++k) { 23 | expPrediction[k] = exp(prediction[k] - *maxPrediction); // scale by maximum 24 | expSum += expPrediction[k]; 25 | } 26 | for (k = 0; k < nCategories; ++k) { 27 | probability[k] = expPrediction[k] / expSum; 28 | } 29 | } 30 | 31 | 32 | void math::adjustPValues(vector& pValues, const size_t nTests) { 33 | 34 | num_t previousPValue = 0.0; 35 | 36 | for ( size_t i = 0; i < pValues.size(); ++i ) { 37 | 38 | pValues[i] *= nTests / ( i + 1 ); 39 | 40 | if ( pValues[i] > 1.0 ) { 41 | pValues[i] = 1.0; 42 | } 43 | 44 | if ( pValues[i] < previousPValue ) { 45 | pValues[i] = previousPValue; 46 | } else { 47 | previousPValue = pValues[i]; 48 | } 49 | 50 | } 51 | 52 | } 53 | 54 | 55 | /** 56 | Two-sample t-test 57 | */ 58 | num_t math::ttest(const vector& x, 59 | const vector& y, 60 | const bool WS) { 61 | 62 | // Sample mean and variance of x 63 | num_t mean_x = math::mean(x); 64 | num_t var_x = math::var(x,mean_x); 65 | size_t n_x = x.size(); 66 | 67 | // If sample size is too small, we exit 68 | if ( n_x < 2 ) { 69 | return( datadefs::NUM_NAN ); 70 | } 71 | 72 | // Sample mean and variance of y 73 | num_t mean_y = math::mean(y); 74 | num_t var_y = math::var(y,mean_y); 75 | size_t n_y = y.size(); 76 | 77 | // If sample size is too small, we exit 78 | if ( n_y < 2 ) { 79 | return( datadefs::NUM_NAN ); 80 | } 81 | 82 | // Degrees of freedom 83 | num_t v; 84 | 85 | // Standard deviation 86 | num_t s; 87 | 88 | if ( !WS ) { 89 | v = static_cast( n_x + n_y - 2 ); 90 | num_t sp = sqrt(((n_x-1) * var_x + (n_y-1) * var_y) / v); 91 | s = sp * sqrt(1.0 / n_x + 1.0 / 
// NOTE(review): comments only; template/cast arguments reconstructed where
// extraction stripped them (e.g. static_cast<num_t>, vector<num_t>).
                n_y);
  } else {
    // Welch's t: unpooled variances, Welch-Satterthwaite degrees of freedom.
    num_t h1 = pow(var_x / n_x + var_y / n_y,2);
    num_t h2 = pow( var_x / n_x, 2) / (n_x - 1) + pow(var_y/n_y,2)/(n_y-1);
    v = h1 / h2 ;
    s = sqrt( var_x / n_x + var_y / n_y );
  }

  // If pooled standard deviation is zero...
  if ( fabs(s) < datadefs::EPS ) {
    if ( mean_x > mean_y ) {
      return( datadefs::EPS ); // ... and x larger than y => p = EPS
    } else if ( fabs( mean_x - mean_y ) < datadefs::EPS ) {
      return( 0.5 ); // ... and x and y almost equal => p = 0.5
    } else {
      return( 1.0 ); // ... and x smaller than y => p = 1.0
    }
  }

  // T-test statistic
  num_t tvalue = (mean_x - mean_y) / s;

  // Transformed t-test statistic
  num_t ttrans = v / ( pow(tvalue,2) + v );

  // This variable will store the integral of the tail of the t-distribution
  num_t integral;

  // When ttrans > 0.9, we need to recast the integration in order to retain
  // accuracy. In other words we make use of the following identity:
  //
  // I(x,a,b) = 1 - I(1-x,b,a)
  if ( ttrans > 0.9 ) {

    // Calculate I(x,a,b) as 1 - I(1-x,b,a)
    integral = 1 - math::regularizedIncompleteBeta(1 - ttrans, 0.5, v/2);

  } else {

    // Calculate I(x,a,b) directly
    integral = math::regularizedIncompleteBeta(ttrans, v/2, 0.5);
  }

  // We need to be careful about which way to calculate the integral so that it represents
  // the tail of the t-distribution. The sign of the tvalue hints which way to integrate
  if ( tvalue > 0.0 ) {
    return( integral / 2 );
  } else {
    return( 1 - integral / 2 );
  }

}

/**
   Odd factors for the infinite continued fraction representation of the
   regularized incomplete beta function
*/
num_t dO(const num_t m,
	 const num_t x,
	 const num_t a,
	 const num_t b) {
  return( -1.0*(a+m)*(a+b+m)*x / ( (a+2*m)*(a+2*m+1) ) );
}

/**
   Even factors for the infinite continued fraction representation of the
   regularized incomplete beta function
*/
num_t dE(const num_t m,
	 const num_t x,
	 const num_t a,
	 const num_t b) {
  return( m*(b-m)*x / ((a+2*m-1)*(a+2*m)) );
}

/**
   Beta function, implemented as function of log-gamma functions implemented
   in "gamma.hpp"
*/
num_t beta(const num_t a, const num_t b) {
  return( exp( lgamma(a) + lgamma(b) - lgamma(a+b) ) );
  // return( exp( LogGamma(a) + LogGamma(b) - LogGamma(a+b) ) );
}

// http://en.wikipedia.org/wiki/Beta_function
// http://en.wikipedia.org/wiki/Student's_t-distribution
// http://www.boost.org/doc/libs/1_38_0/libs/math/doc/sf_and_dist/html/math_toolkit/special/sf_beta/ibeta_function.html
// http://www.mpi-hd.mpg.de/astrophysik/HEA/internal/Numerical_Recipes/f6-4.pdf
num_t math::regularizedIncompleteBeta(const num_t x,
				      const num_t a,
				      const num_t b) {

  // Number of factors in the infinite continued fraction representation
  size_t i = 50;

  num_t continuedFraction = 1;

  // Accumulate the continued fraction bottom-up (backward recurrence) --
  // the numerically stable direction for continued fractions.
  while ( i >= 1 ) {
    num_t m = static_cast<num_t>(i);
    continuedFraction = 1 + dE(m,x,a,b) / ( 1 + dO(m,x,a,b) / continuedFraction );
    --i;
  }

  return( pow(x,a)*pow(1-x,b) / ( a * beta(a,b) * ( 1 + dO(0,x,a,b) / continuedFraction ) ) );

}

// Closed-form approximation of the error function; datadefs::A is the fitted
// constant of the approximation.
num_t math::erf(const num_t x) {

  num_t x2 = x*x;

  // erf is odd: remember the sign, work with |x|^2.
  num_t sgn;
  if(x < 0.0) {
    sgn = -1.0;
  } else {
    sgn = 1.0;
  }

  return( sgn*sqrt(1.0 - exp(-x2*(4.0/datadefs::NUM_PI + datadefs::A*x2) / (1+datadefs::A*x2))) );

}

// Sample Pearson correlation coefficient; NUM_NAN for empty input.
// NOTE(review): divides by zero if either vector has zero variance -- confirm
// callers guard against constant inputs.
num_t math::pearsonCorrelation(const vector<num_t>& x,
			       const vector<num_t>& y) {

  assert( x.size() == y.size() );

  size_t n = x.size();

  if ( n == 0 ) {
    return( datadefs::NUM_NAN );
  }

  num_t corr = 0.0;

  // var() is the unbiased estimate, so var*(n-1) recovers the sum of squared
  // deviations needed for the normalization.
  num_t mu_x = math::mean(x);
  num_t se_x = math::var(x,mu_x) * (n - 1);
  num_t mu_y = math::mean(y);
  num_t se_y = math::var(y,mu_y) * (n - 1);

  for(size_t i = 0; i < n; ++i) {
    corr += ( x[i] - mu_x ) * ( y[i] - mu_y );
  }

  return( corr / sqrt(se_x*se_y) );

}

// "Gamma" line-search multiplier over x for a target with nCategories
// classes; saturates when the denominator vanishes.
num_t math::gamma(const vector<num_t>& x, const size_t nCategories) {

  size_t n = x.size();
  assert( n > 0 );
  assert( nCategories > 0 );

  num_t numerator = 0.0;
  num_t denominator = 0.0;

  for (size_t i = 0; i < n; ++i) {
    num_t abs_data_i = fabs( x[i] );
    denominator += abs_data_i * (1.0 - abs_data_i);
    numerator += x[i];
  }

  // Vanishing denominator: return a saturated value carrying the sign of the
  // numerator instead of dividing by ~0.
  if ( fabs(denominator) <= datadefs::EPS ) {
    return( datadefs::LOG_OF_MAX_NUM * numerator );
  } else {
    return( (numerator*(nCategories - 1)) / (denominator*nCategories) );
  }

}


// Root-mean-square error between two equal-length vectors; NUM_NAN for empty
// input.
num_t math::numericalError(const vector<num_t>& x, const vector<num_t>& y) {

  assert( x.size() == y.size() );

  size_t n = x.size();

  if ( n == 0 ) {
    return(datadefs::NUM_NAN);
  }

  num_t ret = 0.0;

  for ( size_t i = 0; i < n; ++i ) {
    ret += pow( x[i] - y[i], 2 ) / n;
  }

  return( sqrt(ret) );

}


// Unbiased sample variance; delegates to the two-argument overload.
num_t math::var(const vector<num_t>& x) {

  num_t mu =
math::mean(x); 288 | 289 | return( math::var(x,mu) ); 290 | 291 | } 292 | 293 | num_t math::var(const vector& x, const num_t& mu) { 294 | 295 | if ( x.size() < 2 ) { 296 | return( datadefs::NUM_NAN ); 297 | } 298 | 299 | size_t n = x.size(); 300 | 301 | num_t ret = 0.0; 302 | 303 | for(size_t i = 0; i < n; ++i) { 304 | ret += pow(x[i] - mu,2) / ( n - 1 ); 305 | } 306 | 307 | return( ret ); 308 | 309 | } 310 | 311 | -------------------------------------------------------------------------------- /src/math.hpp: -------------------------------------------------------------------------------- 1 | #ifndef MATH_HPP 2 | #define MATH_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "datadefs.hpp" 10 | #include "errno.hpp" 11 | 12 | 13 | using namespace std; 14 | using datadefs::num_t; 15 | 16 | 17 | namespace math { 18 | 19 | /** 20 | Returns the p'th percentile of the data vector x 21 | */ 22 | template 23 | T percentile(vector x, const num_t p) { 24 | 25 | // If the data vector has length 0, return 26 | if ( x.size() == 0 ) { 27 | cerr << "math::percentile() -- cannot compute with vector of length 0!" 
<< endl; 28 | exit(1); 29 | } 30 | 31 | T prc; 32 | 33 | // Sort data to increasing order 34 | sort(x.begin(),x.end()); 35 | 36 | // Exact index without rounding 37 | T k = ( x.size() - 1 ) * p; 38 | 39 | // Lower bound of the index 40 | T f = floor(k); 41 | 42 | // Upper bound of the index 43 | T c = ceil(k); 44 | 45 | // If the upper and lower bounds are equal, 46 | // we can calculate the percentile directly 47 | // by the index k 48 | if(fabs(f - c) < datadefs::EPS) { 49 | prc = x[static_cast(k)]; 50 | } else { 51 | 52 | // Otherwise we will interpolate linearly based on the 53 | // distances from the intermediate point (k) to both 54 | // bounds: ceil->k and k->floor 55 | T d0 = x[static_cast(f)] * (c - k); 56 | T d1 = x[static_cast(c)] * (k - f); 57 | 58 | // This operation equals to the weighted average, 59 | // which in other words is the interpolated percentile 60 | // we were after 61 | prc = d0 + d1; 62 | } 63 | 64 | // Finally return the calculated percentile 65 | return( prc ); 66 | 67 | } 68 | 69 | void transformLogistic(size_t nCategories, vector& prediction, vector& probability); 70 | 71 | /** 72 | Error function 73 | NOTE: see http://en.wikipedia.org/wiki/Error_function 74 | */ 75 | num_t erf(num_t x); 76 | 77 | void adjustPValues(vector& pValues, const size_t nTests); 78 | 79 | /** 80 | Two-sample t-test 81 | NOTE: see http://en.wikipedia.org/wiki/Student's_t-test 82 | */ 83 | num_t ttest(const vector& x, 84 | const vector& y, 85 | const bool WS = false); 86 | 87 | /** 88 | Regularized incomplete Beta function 89 | NOTE: see http://en.wikipedia.org/wiki/Beta_function 90 | */ 91 | num_t regularizedIncompleteBeta(const num_t x, 92 | const num_t a, 93 | const num_t b); 94 | 95 | 96 | 97 | num_t pearsonCorrelation(const vector& x, 98 | const vector& y); 99 | 100 | inline num_t mean(const vector& x) { 101 | 102 | if ( x.size() == 0 ) { 103 | return( datadefs::NUM_NAN ); 104 | } 105 | 106 | num_t mu = 0.0; 107 | 108 | for(size_t i = 0; i < x.size(); 
++i) { 109 | mu += x[i]; 110 | } 111 | 112 | return( mu / x.size() ); 113 | 114 | } 115 | 116 | template 117 | unordered_map frequency(const vector& x) { 118 | unordered_map freq; 119 | for(size_t i = 0; i < x.size(); ++i) { 120 | if( freq.find(x[i]) == freq.end() ) { 121 | freq[ x[i] ] = 1; 122 | } else { 123 | ++freq[ x[i] ]; 124 | } 125 | } 126 | return( freq ); 127 | } 128 | 129 | template 130 | T mode(const vector& x) { 131 | unordered_map freq = frequency(x); 132 | typename unordered_map::const_iterator maxElement( freq.begin() ); 133 | for ( typename unordered_map::const_iterator it(freq.begin()); it != freq.end(); ++it ) { 134 | if ( it->second > maxElement->second ) { 135 | maxElement = it; 136 | } 137 | } 138 | return( maxElement->first ); 139 | } 140 | 141 | template 142 | size_t nMismatches(const vector& x, const T& y) { 143 | size_t count = 0; 144 | for ( size_t i = 0; i < x.size(); ++i ) { 145 | if ( x[i] != y ) { 146 | ++count; 147 | } 148 | } 149 | return( count ); 150 | } 151 | 152 | template 153 | map > confusionMap(const vector& x, const vector& y) { 154 | 155 | assert(x.size() == y.size()); 156 | 157 | map > cMap; 158 | 159 | set allClasses; 160 | 161 | for ( size_t i = 0; i < x.size(); ++i ) { 162 | T a = x[i]; 163 | T b = y[i]; 164 | allClasses.insert(a); 165 | allClasses.insert(b); 166 | if ( cMap[a].find(b) == cMap[a].end() ) { 167 | cMap[a][b] = 1; 168 | } else { 169 | ++cMap[a][b]; 170 | } 171 | } 172 | 173 | } 174 | 175 | template 176 | num_t categoricalError(const vector& x, const vector& y) { 177 | 178 | assert( x.size() == y.size() ); 179 | 180 | size_t n = x.size(); 181 | 182 | if ( n == 0 ) { 183 | return(datadefs::NUM_NAN); 184 | } 185 | 186 | num_t ret = 0.0; 187 | 188 | for ( size_t i = 0; i < n; ++i ) { 189 | ret += static_cast( x[i] != y[i] ) / n; 190 | } 191 | 192 | return( ret ); 193 | 194 | } 195 | 196 | num_t numericalError(const vector& x, const vector& y); 197 | 198 | 199 | num_t gamma(const vector& x, const size_t 
nCategories); 200 | 201 | //num_t squaredError(const vector& x); 202 | 203 | //num_t squaredError(const vector& x, const num_t mu); 204 | 205 | // Unbiased variance estimate: 1/(n-1)*sum(y-y_i)^2 206 | num_t var(const vector& x); 207 | 208 | num_t var(const vector& x, const num_t& mu); 209 | 210 | /** 211 | Updates the squared frequency by ADDING x_n to the set 212 | NOTE: NANs will corrupt the data 213 | */ 214 | template 215 | inline void incrementSquaredFrequency(const T& x_n, 216 | unordered_map& freq, 217 | size_t& sqFreq) { 218 | 219 | 220 | // Check if the value already exists in the frequency map 221 | typename unordered_map::iterator it(freq.find(x_n)); 222 | if(it == freq.end()) { 223 | 224 | // If not, squared frequency becomes updated by 1 225 | sqFreq += 1; 226 | freq[x_n] = 1; 227 | 228 | } else { 229 | 230 | // Otherwise the squared frequency becomes updated by 231 | // 2*freq + 1 232 | sqFreq += 2*freq[x_n] + 1; 233 | it->second++; //freq[x_n]; 234 | 235 | } 236 | } 237 | 238 | /** 239 | Updates the squared frequency by REMOVING x_n 240 | from the set 241 | NOTE: NANs will corrupt the data 242 | */ 243 | template 244 | inline void decrementSquaredFrequency(const T& x_n, 245 | unordered_map& freq, 246 | size_t& sqFreq) { 247 | 248 | assert( freq.find(x_n) != freq.end() ); 249 | assert( freq[x_n] > 0); 250 | 251 | sqFreq -= 2*freq[x_n] - 1; 252 | --freq[x_n]; 253 | 254 | if(freq[x_n] == 0) { 255 | freq.erase(x_n); 256 | } 257 | } 258 | 259 | // Calculates decrease in impurity for a numerical target 260 | inline num_t deltaImpurity_regr(const num_t mu_tot, 261 | const size_t n_tot, 262 | const num_t mu_left, 263 | const size_t n_left, 264 | const num_t mu_right, 265 | const size_t n_right) { 266 | 267 | return( - mu_tot * mu_tot 268 | + mu_left * mu_left * n_left / n_tot 269 | + mu_right * mu_right * n_right / n_tot ); 270 | 271 | } 272 | 273 | inline num_t deltaImpurity_class(const size_t sf_tot, 274 | const size_t n_tot, 275 | const size_t sf_left, 
276 | const size_t n_left, 277 | const size_t sf_right, 278 | const size_t n_right) { 279 | 280 | //cout << - 1.0 * sf_tot / ( 1.0 * n_tot * n_tot ) << " + " << 1.0 * sf_left / ( 1.0 * n_tot * n_left ) << " + " << 1.0 * sf_right / ( 1.0 * n_tot * n_right ) << endl; 281 | 282 | return( - 1.0 * sf_tot / ( n_tot * n_tot ) 283 | + 1.0 * sf_left / ( n_tot * n_left ) 284 | + 1.0 * sf_right / ( n_tot * n_right ) ); 285 | 286 | } 287 | 288 | template 289 | inline void setUnion(set& baseSet, const set& newSet) { 290 | 291 | for ( typename set::const_iterator it( newSet.begin() ); it != newSet.end(); ++it ) { 292 | baseSet.insert(*it); 293 | } 294 | 295 | } 296 | 297 | 298 | } 299 | 300 | #endif 301 | -------------------------------------------------------------------------------- /src/mtrand.h: -------------------------------------------------------------------------------- 1 | // mtrand.h 2 | // C++ include file for MT19937, with initialization improved 2002/1/26. 3 | // Coded by Takuji Nishimura and Makoto Matsumoto. 4 | // Ported to C++ by Jasper Bedaux 2003/1/1 (see http://www.bedaux.net/mtrand/). 5 | // The generators returning floating point numbers are based on 6 | // a version by Isaku Wada, 2002/01/09 7 | // 8 | // Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | // All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions 13 | // are met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. 
The names of its contributors may not be used to endorse or promote 23 | // products derived from this software without specific prior written 24 | // permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Any feedback is very welcome. 39 | // http://www.math.keio.ac.jp/matumoto/emt.html 40 | // email: matumoto@math.keio.ac.jp 41 | // 42 | // Feedback about the C++ port should be sent to Jasper Bedaux, 43 | // see http://www.bedaux.net/mtrand/ for e-mail address and info. 
// NOTE(review): comments only; the <double> template arguments of the
// static_casts below were lost in extraction and are reconstructed.

#ifndef MTRAND_H
#define MTRAND_H

// Mersenne Twister (MT19937) base generator producing 32-bit integers.
// The state is *static*, so every instance in the process shares one stream;
// copying and assignment are deliberately disabled.
class MTRand_int32 { // Mersenne Twister random number generator
public:
  // default constructor: uses default seed only if this is the first instance
  MTRand_int32() { if (!init) seed(5489UL); init = true; }
  // constructor with 32 bit int as seed
  MTRand_int32(unsigned long s) { seed(s); init = true; }
  // constructor with array of size 32 bit ints as seed
  MTRand_int32(const unsigned long* array, int size) { seed(array, size); init = true; }
  // the two seed functions
  void seed(unsigned long); // seed with 32 bit integer
  void seed(const unsigned long*, int size); // seed with array
  // overload operator() to make this a generator (functor)
  unsigned long operator()() { return rand_int32(); }
  // 2007-02-11: made the destructor virtual; thanks "double more" for pointing this out
  virtual ~MTRand_int32() {} // destructor
protected: // used by derived classes, otherwise not accessible; use the ()-operator
  unsigned long rand_int32(); // generate 32 bit random integer
private:
  static const int n = 624, m = 397; // compile time constants
  // the variables below are static (no duplicates can exist)
  static unsigned long state[n]; // state vector array
  static int p; // position in state array
  static bool init; // true if init function is called
  // private functions used to generate the pseudo random numbers
  unsigned long twiddle(unsigned long, unsigned long); // used by gen_state()
  void gen_state(); // generate new state
  // make copy constructor and assignment operator unavailable, they don't make sense
  MTRand_int32(const MTRand_int32&); // copy constructor not defined
  void operator=(const MTRand_int32&); // assignment operator not defined
};

// inline for speed, must therefore reside in header file
inline unsigned long MTRand_int32::twiddle(unsigned long u, unsigned long v) {
  return (((u & 0x80000000UL) | (v & 0x7FFFFFFFUL)) >> 1)
    ^ ((v & 1UL) ? 0x9908B0DFUL : 0x0UL);
}

inline unsigned long MTRand_int32::rand_int32() { // generate 32 bit random int
  if (p == n) gen_state(); // new state vector needed
  // gen_state() is split off to be non-inline, because it is only called once
  // in every 624 calls and otherwise irand() would become too big to get inlined
  unsigned long x = state[p++];
  // MT19937 tempering transform.
  x ^= (x >> 11);
  x ^= (x << 7) & 0x9D2C5680UL;
  x ^= (x << 15) & 0xEFC60000UL;
  return x ^ (x >> 18);
}

// generates double floating point numbers in the half-open interval [0, 1)
class MTRand : public MTRand_int32 {
public:
  MTRand() : MTRand_int32() {}
  MTRand(unsigned long seed) : MTRand_int32(seed) {}
  MTRand(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand() {}
  double operator()() {
    return static_cast<double>(rand_int32()) * (1. / 4294967296.); } // divided by 2^32
private:
  MTRand(const MTRand&); // copy constructor not defined
  void operator=(const MTRand&); // assignment operator not defined
};

// generates double floating point numbers in the closed interval [0, 1]
class MTRand_closed : public MTRand_int32 {
public:
  MTRand_closed() : MTRand_int32() {}
  MTRand_closed(unsigned long seed) : MTRand_int32(seed) {}
  MTRand_closed(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand_closed() {}
  double operator()() {
    return static_cast<double>(rand_int32()) * (1. / 4294967295.); } // divided by 2^32 - 1
private:
  MTRand_closed(const MTRand_closed&); // copy constructor not defined
  void operator=(const MTRand_closed&); // assignment operator not defined
};

// generates double floating point numbers in the open interval (0, 1)
class MTRand_open : public MTRand_int32 {
public:
  MTRand_open() : MTRand_int32() {}
  MTRand_open(unsigned long seed) : MTRand_int32(seed) {}
  MTRand_open(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand_open() {}
  double operator()() {
    // +.5 shifts the lattice so 0 is never produced.
    return (static_cast<double>(rand_int32()) + .5) * (1. / 4294967296.); } // divided by 2^32
private:
  MTRand_open(const MTRand_open&); // copy constructor not defined
  void operator=(const MTRand_open&); // assignment operator not defined
};

// generates 53 bit resolution doubles in the half-open interval [0, 1)
class MTRand53 : public MTRand_int32 {
public:
  MTRand53() : MTRand_int32() {}
  MTRand53(unsigned long seed) : MTRand_int32(seed) {}
  MTRand53(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
  ~MTRand53() {}
  double operator()() {
    // Combine the top 27 and 26 bits of two draws into a 53-bit mantissa.
    return (static_cast<double>(rand_int32() >> 5) * 67108864. +
	    static_cast<double>(rand_int32() >> 6)) * (1. / 9007199254740992.); }
private:
  MTRand53(const MTRand53&); // copy constructor not defined
  void operator=(const MTRand53&); // assignment operator not defined
};

#endif // MTRAND_H

// ---- src/murmurhash3.cpp ----
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// NOTE(review): comments only; the header name in the MSVC branch below was
// lost in extraction and <stdlib.h> (which provides _rotl/_rotl64) is
// reconstructed from the reference implementation.

// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.

#include "murmurhash3.hpp"

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

#define FORCE_INLINE __forceinline

#include <stdlib.h>

#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else // defined(_MSC_VER)

#define FORCE_INLINE inline __attribute__((always_inline))

inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << r) | (x >> (32 - r));
}

inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  return (x << r) | (x >> (64 - r));
}

#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
  return p[i];
}

FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
  return p[i];
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

FORCE_INLINE uint32_t fmix ( uint32_t h )
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

//----------

FORCE_INLINE uint64_t fmix ( uint64_t k )
{
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  k ^= k >> 33;

  return k;
}

//-----------------------------------------------------------------------------

// 32-bit hash of 'len' bytes at 'key'; result written through 'out'.
void MurmurHash3_x86_32 ( const void * key, int len,
			  uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 4;

  uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body: process 4-byte blocks, iterating backwards from the block tail

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i);

    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail: the remaining 0-3 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16;    // fall through
  case 2: k1 ^= tail[1] << 8;     // fall through
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len;

  h1 = fmix(h1);

  *(uint32_t*)out = h1;
}

//-----------------------------------------------------------------------------

// 128-bit hash optimized for 32-bit platforms; writes four uint32s to 'out'.
void MurmurHash3_x86_128 ( const void * key, const int len,
			   uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  const uint32_t c1 = 0x239b961b;
  const uint32_t c2 = 0xab0e9789;
  const uint32_t c3 = 0x38b34ae5;
  const uint32_t c4 = 0xa1e38b93;

  //----------
  // body: four interleaved 32-bit lanes per 16-byte block

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i*4+0);
    uint32_t k2 = getblock(blocks,i*4+1);
    uint32_t k3 = getblock(blocks,i*4+2);
    uint32_t k4 = getblock(blocks,i*4+3);

    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;

    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }

  //----------
  // tail: remaining 0-15 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;

  switch(len & 15)
  {
  case 15: k4 ^= tail[14] << 16;  // fall through
  case 14: k4 ^= tail[13] << 8;   // fall through
  case 13: k4 ^= tail[12] << 0;
           k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
	   // fall through
  case 12: k3 ^= tail[11] << 24;  // fall through
  case 11: k3 ^= tail[10] << 16;  // fall through
  case 10: k3 ^= tail[ 9] << 8;   // fall through
  case  9: k3 ^= tail[ 8] << 0;
           k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
	   // fall through
  case  8: k2 ^= tail[ 7] << 24;  // fall through
  case  7: k2 ^= tail[ 6] << 16;  // fall through
  case  6: k2 ^= tail[ 5] << 8;   // fall through
  case  5: k2 ^= tail[ 4] << 0;
           k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
	   // fall through
  case  4: k1 ^= tail[ 3] << 24;  // fall through
  case  3: k1 ^= tail[ 2] << 16;  // fall through
  case  2: k1 ^= tail[ 1] << 8;   // fall through
  case  1: k1 ^= tail[ 0] << 0;
           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  h1 = fmix(h1);
  h2 = fmix(h2);
  h3 = fmix(h3);
  h4 = fmix(h4);

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  ((uint32_t*)out)[0] = h1;
  ((uint32_t*)out)[1] = h2;
  ((uint32_t*)out)[2] = h3;
  ((uint32_t*)out)[3] = h4;
}

//-----------------------------------------------------------------------------

// 128-bit hash optimized for 64-bit platforms (two 64-bit lanes).
void MurmurHash3_x64_128 ( const void * key, const int len,
			   const uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint64_t h1 = seed;
  uint64_t h2 = seed;

  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

  //----------
  // body

  const uint64_t * blocks = (const uint64_t *)(data);

  for(int i = 0; i < nblocks; i++)
  {
    uint64_t k1 = getblock(blocks,i*2+0);
    uint64_t k2 = getblock(blocks,i*2+1);

    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;

    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
  }

  //----------
  // tail: remaining 0-15 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  switch(len & 15)
  {
  case 15: k2 ^= uint64_t(tail[14]) << 48;  // fall through
  case 14: k2 ^= uint64_t(tail[13]) << 40;  // fall through
  case 13: k2 ^= uint64_t(tail[12]) << 32;  // fall through
  case 12: k2 ^= uint64_t(tail[11]) << 24;  // fall through
  case 11: k2 ^= uint64_t(tail[10]) << 16;  // fall through
  case 10: k2 ^= uint64_t(tail[ 9]) << 8;   // fall through
  case  9: k2 ^= uint64_t(tail[ 8]) << 0;
           k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
304 | 305 | case 8: k1 ^= uint64_t(tail[ 7]) << 56; 306 | case 7: k1 ^= uint64_t(tail[ 6]) << 48; 307 | case 6: k1 ^= uint64_t(tail[ 5]) << 40; 308 | case 5: k1 ^= uint64_t(tail[ 4]) << 32; 309 | case 4: k1 ^= uint64_t(tail[ 3]) << 24; 310 | case 3: k1 ^= uint64_t(tail[ 2]) << 16; 311 | case 2: k1 ^= uint64_t(tail[ 1]) << 8; 312 | case 1: k1 ^= uint64_t(tail[ 0]) << 0; 313 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 314 | }; 315 | 316 | //---------- 317 | // finalization 318 | 319 | h1 ^= len; h2 ^= len; 320 | 321 | h1 += h2; 322 | h2 += h1; 323 | 324 | h1 = fmix(h1); 325 | h2 = fmix(h2); 326 | 327 | h1 += h2; 328 | h2 += h1; 329 | 330 | ((uint64_t*)out)[0] = h1; 331 | ((uint64_t*)out)[1] = h2; 332 | } 333 | 334 | //----------------------------------------------------------------------------- 335 | 336 | -------------------------------------------------------------------------------- /src/murmurhash3.hpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 
4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned long uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | 29 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 30 | 31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 32 | 33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | //----------------------------------------------------------------------------- 36 | 37 | #endif // _MURMURHASH3_H_ 38 | -------------------------------------------------------------------------------- /src/node.hpp: -------------------------------------------------------------------------------- 1 | //node.hpp 2 | // 3 | //A node class for CARTs 4 | 5 | #ifndef NODE_HPP 6 | #define NODE_HPP 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "datadefs.hpp" 15 | #include "treedata.hpp" 16 | #include "options.hpp" 17 | #include "utils.hpp" 18 | #include "distributions.hpp" 19 | 20 | using namespace std; 21 | using datadefs::num_t; 22 | 23 | class Node { 24 | public: 25 | 26 | struct Prediction { 27 | 28 | Feature::Type type; 29 | num_t numTrainPrediction; 30 | cat_t catTrainPrediction; 31 | vector numTrainData; 32 | vector catTrainData; 33 | }; 34 | 35 | struct Splitter { 36 | 37 | num_t fitness; 38 | string name; 39 | Feature::Type type; 40 | uint32_t hashValue; 41 | num_t leftLeqValue; 42 | unordered_set leftValues; 43 | 44 | Splitter(): fitness(0.0), 
name(""), type(Feature::Type::UNKNOWN) {} 45 | 46 | }; 47 | 48 | //Initializes node. 49 | Node(); 50 | ~Node(); 51 | 52 | //Gets the splitter for the node 53 | const string& splitterName() const { return( splitter_.name ); } 54 | 55 | //Sets a splitter feature for the node. 56 | //NOTE: splitter can be assigned only once! Subsequent setter calls will raise an assertion failure. 57 | void setSplitter(const num_t splitFitness, 58 | const string& splitterName, 59 | const num_t splitLeftLeqValue, 60 | Node& leftChild, 61 | Node& rightChild); 62 | 63 | void setSplitter(const num_t splitFitness, 64 | const string& splitterName, 65 | const unordered_set& leftSplitValues, 66 | Node& leftChild, 67 | Node& rightChild); 68 | 69 | void setSplitter(const num_t splitFitness, 70 | const string& splitterName, 71 | const uint32_t hashIdx, 72 | Node& leftChild, 73 | Node& rightChild); 74 | 75 | void setMissingChild(Node& missingChild); 76 | 77 | //Given a value, descends to either one of the child nodes, if existing, otherwise returns a pointer to the current node 78 | Node* percolate(TreeData* testData, const size_t sampleIdx, const size_t scrambleFeatureIdx = datadefs::MAX_IDX); 79 | 80 | void setNumTrainPrediction(const num_t& numTrainPrediction); 81 | void setCatTrainPrediction(const cat_t& catTrainPrediction); 82 | 83 | //Logic test whether the node has children or not 84 | inline bool hasChildren() const { return( this->leftChild() || this->rightChild() ); } 85 | 86 | Node* leftChild() const; 87 | Node* rightChild() const; 88 | Node* missingChild() const; 89 | 90 | vector getSubTreeLeaves(); 91 | 92 | void setNumTrainData(const vector& numTrainData); 93 | void setCatTrainData(const vector& catTrainData); 94 | 95 | const Prediction& getPrediction(); 96 | 97 | const Splitter& getSplitter(); 98 | 99 | void recursiveWriteTree(string& traversal, ofstream& toFile); 100 | 101 | enum PredictionFunctionType { MEAN, MODE, GAMMA }; 102 | 103 | #ifndef TEST__ 104 | protected: 105 | #endif 
106 | 107 | struct SplitCache { 108 | 109 | size_t nSamples; 110 | vector featureSampleIcs; 111 | 112 | vector sampleIcs_left; 113 | vector sampleIcs_right; 114 | vector sampleIcs_missing; 115 | uint32_t hashIdx; 116 | size_t splitFeatureIdx; 117 | num_t splitValue; 118 | unordered_set splitValues_left; 119 | num_t splitFitness; 120 | 121 | vector newSampleIcs_left; 122 | vector newSampleIcs_right; 123 | vector newSampleIcs_missing; 124 | uint32_t newHashIdx; 125 | size_t newSplitFeatureIdx; 126 | num_t newSplitValue; 127 | unordered_set newSplitValues_left; 128 | num_t newSplitFitness; 129 | 130 | }; 131 | 132 | void recursiveNodeSplit(TreeData* treeData, 133 | const size_t targetIdx, 134 | const ForestOptions* forestOptions, 135 | distributions::Random* random, 136 | const PredictionFunctionType& predictionFunctionType, 137 | const distributions::PMF* pmf, 138 | const vector& sampleIcs, 139 | size_t* nLeaves, 140 | size_t& childIdx, 141 | vector& children, 142 | SplitCache& splitCache); 143 | 144 | bool regularSplitterSeek(TreeData* treeData, 145 | const size_t targetIdx, 146 | const ForestOptions* forestOptions, 147 | distributions::Random* random, 148 | const vector& sampleIcs, 149 | size_t& childIdx, 150 | vector& children, 151 | SplitCache& splitCache); 152 | 153 | 154 | void recursiveGetSubTreeLeaves(vector& leaves); 155 | 156 | #ifndef TEST__ 157 | private: 158 | #endif 159 | 160 | Splitter splitter_; 161 | 162 | Prediction prediction_; 163 | 164 | Node* leftChild_; 165 | Node* rightChild_; 166 | Node* missingChild_; 167 | 168 | }; 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /src/progress.cpp: -------------------------------------------------------------------------------- 1 | #include "progress.hpp" 2 | 3 | 4 | Progress::Progress(): 5 | width_(3) { 6 | cout << setw(width_) << "0" << "%" << flush; 7 | } 8 | 9 | Progress::~Progress() { 10 | reset(); 11 | } 12 | 13 | void Progress::update(const 
num_t fraction) { 14 | 15 | reset(); 16 | 17 | cout << setw(width_) << static_cast(fraction*100) << "%" << flush; 18 | 19 | } 20 | 21 | void Progress::reset() { 22 | 23 | for(size_t i = 0; i <= width_; ++i) { 24 | cout << "\b"; 25 | } 26 | 27 | } 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/progress.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "datadefs.hpp" 6 | 7 | using namespace std; 8 | using datadefs::num_t; 9 | 10 | class Progress { 11 | public: 12 | Progress(); 13 | ~Progress(); 14 | 15 | void update(const num_t fraction); 16 | 17 | private: 18 | 19 | void reset(); 20 | 21 | size_t width_; 22 | 23 | }; 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/reader.cpp: -------------------------------------------------------------------------------- 1 | #include "reader.hpp" 2 | #include 3 | 4 | using namespace std; 5 | 6 | Reader::Reader(const string& fileName, const char delimiter): 7 | delimiter_(delimiter) { 8 | 9 | this->init(fileName); 10 | 11 | } 12 | 13 | Reader::~Reader() { 14 | 15 | if ( inStream_.is_open() ) { 16 | inStream_.close(); 17 | } 18 | 19 | } 20 | 21 | void Reader::init(const string& fileName) { 22 | 23 | inStream_.open(fileName.c_str()); 24 | 25 | if ( !inStream_.good() ) { 26 | cerr << "ERROR: failed to open file '" << fileName << "' for reading. Make sure the file exists. Quitting..." 
<< endl; 27 | exit(1); 28 | } 29 | 30 | this->setLineFeed(""); 31 | 32 | nLines_ = 0; 33 | 34 | string line; 35 | 36 | for ( nLines_ = 0; getline(inStream_,line); ++nLines_ ) { } 37 | 38 | this->rewind(); 39 | 40 | } 41 | 42 | bool Reader::endOfLine() const { 43 | return( lineFeed_.rdbuf()->in_avail() == 0 ); 44 | } 45 | 46 | bool Reader::nextLine() { 47 | 48 | string line; 49 | 50 | if ( getline(inStream_,line) ) { 51 | this->setLineFeed(line); 52 | return(true); 53 | } else { 54 | this->setLineFeed(line); 55 | return(false); 56 | } 57 | 58 | } 59 | 60 | bool Reader::skipField() { 61 | 62 | string field; 63 | 64 | if ( getline(lineFeed_,field,delimiter_) ) { 65 | return(true); 66 | } else { 67 | return(false); 68 | } 69 | 70 | } 71 | 72 | void Reader::rewind() { 73 | 74 | inStream_.clear(); 75 | inStream_.seekg(ios_base::beg); 76 | 77 | this->setLineFeed(""); 78 | 79 | } 80 | 81 | void Reader::checkLineFeed() const { 82 | 83 | if ( this->endOfLine() ) { 84 | cerr << "READ ERROR: tried to read from an empty linefeed. Did you forget Reader::nextLine()?" 
<< endl; 85 | exit(1); 86 | } 87 | 88 | } 89 | 90 | void Reader::setLineFeed(const string& str) { 91 | lineFeed_.clear(); 92 | lineFeed_.str(str); 93 | } 94 | -------------------------------------------------------------------------------- /src/reader.hpp: -------------------------------------------------------------------------------- 1 | #ifndef READER_HPP 2 | #define READER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.hpp" 11 | #include "datadefs.hpp" 12 | 13 | class Reader { 14 | public: 15 | 16 | Reader(const std::string& fileName, const char delimiter = '\t'); 17 | ~Reader(); 18 | 19 | template inline friend Reader& operator>>(Reader& reader, T& val) { 20 | reader.checkLineFeed(); 21 | std::string field; 22 | std::getline(reader.lineFeed_,field,reader.delimiter_); 23 | std::stringstream ss( utils::chomp(field) ); 24 | ss >> val; 25 | return(reader); 26 | } 27 | 28 | bool nextLine(); 29 | 30 | bool skipField(); 31 | 32 | void rewind(); 33 | 34 | bool endOfLine() const; 35 | 36 | size_t nLines() const { return( nLines_ ); } 37 | 38 | void setDelimiter(const char delimiter) { delimiter_ = delimiter; } 39 | 40 | #ifndef TEST__ 41 | private: 42 | #endif 43 | 44 | void init(const std::string& fileName); 45 | 46 | void checkLineFeed() const; 47 | 48 | void setLineFeed(const string& str); 49 | 50 | std::ifstream inStream_; 51 | 52 | char delimiter_; 53 | 54 | size_t nLines_; 55 | 56 | stringstream lineFeed_; 57 | 58 | }; 59 | 60 | template<> inline Reader& operator>>(Reader& reader, datadefs::num_t& val) { 61 | reader.checkLineFeed(); 62 | std::string field; 63 | std::getline(reader.lineFeed_,field,reader.delimiter_); 64 | field = utils::chomp(field); 65 | if ( datadefs::isNAN_STR(field) ) { 66 | val = datadefs::NUM_NAN; 67 | } else { 68 | std::stringstream ss( utils::chomp(field) ); 69 | ss >> val; 70 | } 71 | return(reader); 72 | } 73 | 74 | /* 75 | template<> inline Reader& operator>>(Reader& reader, 
datadefs::cat_t& val) { 76 | reader.checkLineFeed(); 77 | std::string field; 78 | std::getline(reader.lineFeed_,field,reader.delimiter_); 79 | field = utils::chomp(field); 80 | if ( datadefs::isNAN_STR(field) ) { 81 | val = datadefs::CAT_NAN; 82 | } else { 83 | std::stringstream ss( utils::chomp(field) ); 84 | ss >> val; 85 | } 86 | return(reader); 87 | } 88 | */ 89 | 90 | template<> inline Reader& operator>>(Reader& reader, string& str) { 91 | reader.checkLineFeed(); 92 | std::getline(reader.lineFeed_,str,reader.delimiter_); 93 | str = utils::chomp(str); 94 | return(reader); 95 | } 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /src/rf_ace_R.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "rf_ace.hpp" 6 | #include "treedata.hpp" 7 | #include "datadefs.hpp" 8 | #include "options.hpp" 9 | #include "utils.hpp" 10 | 11 | using namespace std; 12 | using datadefs::num_t; 13 | 14 | void parseDataFrame(SEXP dataFrameObj, vector& dataMatrix, vector& sampleHeaders) { 15 | 16 | Rcpp::DataFrame df(dataFrameObj); 17 | 18 | //Rcpp::CharacterVector colNames = df.attr("names"); 19 | //Rcpp::CharacterVector rowNames = df.attr("row.names"); 20 | 21 | vector featureHeaders = df.attr("names"); 22 | vector foo = df.attr("row.names"); 23 | sampleHeaders = foo; 24 | 25 | dataMatrix.resize( 0 ); 26 | 27 | //cout << "nf = " << featureHeaders.size() << endl; 28 | //cout << "ns = " << sampleHeaders.size() << endl; 29 | 30 | // Read one column of information, which in this case is assumed to be one sample 31 | for ( size_t i = 0; i < featureHeaders.size(); ++i ) { 32 | Rcpp::List vec = df[i]; 33 | assert(vec.length() == sampleHeaders.size() ); 34 | //cout << " " << foo[0] << flush; 35 | //cout << " df[" << i << "].length() = " << vec.length() << endl; 36 | if ( featureHeaders[i].substr(0,2) != "N:" ) { 37 | vector sVec(sampleHeaders.size()); 38 
| for ( size_t j = 0; j < sampleHeaders.size(); ++j ) { 39 | //cout << Rcpp::as(vec[j]) << endl; 40 | sVec[j] = Rcpp::as(vec[j]); 41 | } 42 | if ( featureHeaders[i].substr(0,2) == "T:" ) { 43 | bool doHash = true; 44 | dataMatrix.push_back( Feature(sVec,featureHeaders[i],doHash) ); 45 | } else { 46 | dataMatrix.push_back( Feature(sVec,featureHeaders[i]) ); 47 | } 48 | } else { 49 | vector sVec(sampleHeaders.size()); 50 | for ( size_t j = 0; j < sampleHeaders.size(); ++j ) { 51 | sVec[j] = Rcpp::as(vec[j]); 52 | } 53 | dataMatrix.push_back( Feature(sVec,featureHeaders[i]) ); 54 | } 55 | 56 | // cout << "df[" << j << "," << i << "] = " << Rcpp::as(vec[j]) << endl; 57 | // } 58 | } 59 | 60 | assert( dataMatrix.size() == featureHeaders.size() ); 61 | 62 | } 63 | 64 | RcppExport void rfaceSave(SEXP rfaceObj, SEXP fileName) { 65 | 66 | Rcpp::XPtr rface(rfaceObj); 67 | 68 | rface->save(Rcpp::as(fileName)); 69 | 70 | } 71 | 72 | RcppExport SEXP rfaceLoad(SEXP rfaceFile) { 73 | 74 | 75 | Rcpp::XPtr rface( new RFACE, true); 76 | 77 | rface->load(Rcpp::as(rfaceFile)); 78 | 79 | return(rface); 80 | 81 | } 82 | 83 | RcppExport SEXP rfaceTrain(SEXP trainDataFrameObj, 84 | SEXP targetStrR, 85 | SEXP featureWeightsR, 86 | SEXP forestTypeR, 87 | SEXP nTreesR, 88 | SEXP mTryR, 89 | SEXP nodeSizeR, 90 | SEXP nMaxLeavesR, 91 | SEXP shrinkageR, 92 | SEXP noNABranchingR, 93 | SEXP nThreadsR) { 94 | 95 | 96 | ForestOptions forestOptions( forest_t::QRF ); 97 | 98 | string targetStr = Rcpp::as(targetStrR); 99 | forestOptions.nTrees = Rcpp::as(nTreesR); 100 | forestOptions.mTry = Rcpp::as(mTryR); 101 | forestOptions.nodeSize = Rcpp::as(nodeSizeR); 102 | forestOptions.nMaxLeaves = Rcpp::as(nMaxLeavesR); 103 | forestOptions.shrinkage = Rcpp::as(shrinkageR); 104 | forestOptions.noNABranching = Rcpp::as(noNABranchingR); 105 | size_t nThreads = Rcpp::as(nThreadsR); 106 | 107 | vector dataMatrix; 108 | vector sampleHeaders; 109 | 110 | parseDataFrame(trainDataFrameObj,dataMatrix,sampleHeaders); 
111 | 112 | bool useContrasts = false; 113 | Treedata trainData(dataMatrix,useContrasts,sampleHeaders); 114 | 115 | if ( forestOptions.nMaxLeaves == 0 ) { 116 | forestOptions.nMaxLeaves = datadefs::MAX_IDX; 117 | } 118 | 119 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 120 | 121 | if ( targetIdx == trainData.end() ) { 122 | int integer; 123 | if ( datadefs::isInteger(targetStr,integer) && integer >= 0 && integer < static_cast(trainData.nFeatures()) ) { 124 | targetIdx = static_cast(integer); 125 | } else { 126 | cerr << "Invalid target: " << targetStr << endl; 127 | exit(1); 128 | } 129 | } 130 | 131 | Rcpp::XPtr rface( new RFACE(nThreads), true); 132 | 133 | vector featureWeights = Rcpp::as >(featureWeightsR); 134 | 135 | if ( featureWeights.size() == 0 ) { 136 | featureWeights = trainData.getFeatureWeights(); 137 | } 138 | 139 | featureWeights[targetIdx] = 0.0; 140 | 141 | rface->train(&trainData,targetIdx,featureWeights,&forestOptions); 142 | 143 | return(rface); 144 | 145 | } 146 | 147 | RcppExport SEXP rfacePredict(SEXP rfaceObj, SEXP testDataFrameObj, SEXP quantilesR, SEXP nSamplesForQuantilesR, SEXP distributionsR) { 148 | 149 | Rcpp::XPtr rface(rfaceObj); 150 | 151 | ForestOptions forestOptions(forest_t::QRF); 152 | 153 | { 154 | vector quantiles = Rcpp::as >(quantilesR); 155 | if ( quantiles.size() > 0 ) { 156 | forestOptions.quantiles = quantiles; 157 | } 158 | } 159 | 160 | forestOptions.nSamplesForQuantiles = Rcpp::as(nSamplesForQuantilesR); 161 | forestOptions.distributions = Rcpp::as(distributionsR); 162 | 163 | vector testDataMatrix; 164 | vector sampleHeaders; 165 | 166 | parseDataFrame(testDataFrameObj,testDataMatrix,sampleHeaders); 167 | 168 | bool useContrasts = false; 169 | 170 | Treedata testData(testDataMatrix,useContrasts,sampleHeaders); 171 | 172 | RFACE::QRFPredictionOutput qPredOut = rface->predictQRF(&testData,forestOptions); 173 | 174 | if ( qPredOut.isTargetNumerical ) { 175 | 176 | vector > numPredictionsTrans = 
utils::transpose(qPredOut.numPredictions); 177 | 178 | if ( forestOptions.distributions ) { 179 | 180 | return( Rcpp::List::create(Rcpp::Named("targetName")=qPredOut.targetName, 181 | Rcpp::Named("sampleNames")=qPredOut.sampleNames, 182 | Rcpp::Named("trueData")=qPredOut.trueNumData, 183 | Rcpp::Named("predictions")=numPredictionsTrans, 184 | Rcpp::Named("quantiles")=qPredOut.quantiles, 185 | Rcpp::Named("distributions")=qPredOut.numDistributions)); 186 | 187 | } else { 188 | 189 | return( Rcpp::List::create(Rcpp::Named("targetName")=qPredOut.targetName, 190 | Rcpp::Named("sampleNames")=qPredOut.sampleNames, 191 | Rcpp::Named("trueData")=qPredOut.trueNumData, 192 | Rcpp::Named("predictions")=numPredictionsTrans, 193 | Rcpp::Named("quantiles")=qPredOut.quantiles)); 194 | 195 | } 196 | 197 | } else { 198 | 199 | vector > catPredictionsTrans = utils::transpose(qPredOut.catPredictions); 200 | 201 | return( Rcpp::List::create(Rcpp::Named("targetName")=qPredOut.targetName, 202 | Rcpp::Named("sampleNames")=qPredOut.sampleNames, 203 | Rcpp::Named("trueData")=qPredOut.trueCatData, 204 | Rcpp::Named("predictions")=catPredictionsTrans, 205 | Rcpp::Named("categories")=qPredOut.categories)); 206 | } 207 | 208 | } 209 | 210 | RcppExport SEXP rfaceFilter(SEXP filterDataFrameObj, SEXP targetStrR, SEXP featureWeightsR, SEXP nTreesR, SEXP mTryR, SEXP nodeSizeR, SEXP nMaxLeavesR, SEXP nThreadsR) { 211 | 212 | string targetStr = Rcpp::as(targetStrR); 213 | 214 | ForestOptions forestOptions(forest_t::RF); 215 | forestOptions.nTrees = Rcpp::as(nTreesR); 216 | forestOptions.mTry = Rcpp::as(mTryR); 217 | forestOptions.nodeSize = Rcpp::as(nodeSizeR); 218 | forestOptions.nMaxLeaves = Rcpp::as(nMaxLeavesR); 219 | 220 | size_t nThreads = Rcpp::as(nThreadsR); 221 | 222 | FilterOptions filterOptions; 223 | 224 | vector dataMatrix; 225 | vector sampleHeaders; 226 | 227 | parseDataFrame(filterDataFrameObj,dataMatrix,sampleHeaders); 228 | 229 | bool useContrasts = true; 230 | 231 | Treedata 
filterData(dataMatrix,useContrasts,sampleHeaders); 232 | 233 | size_t targetIdx = filterData.getFeatureIdx(targetStr); 234 | 235 | if ( targetIdx == filterData.end() ) { 236 | int integer; 237 | if ( datadefs::isInteger(targetStr,integer) && integer >= 0 && integer < static_cast(filterData.nFeatures()) ) { 238 | targetIdx = static_cast(integer); 239 | } else { 240 | cerr << "Invalid target: " << targetStr << endl; 241 | exit(1); 242 | } 243 | } 244 | 245 | vector featureWeights = Rcpp::as >(featureWeightsR); 246 | if ( featureWeights.size() == 0 ) { 247 | featureWeights = filterData.getFeatureWeights(); 248 | } 249 | featureWeights[targetIdx] = 0.0; 250 | 251 | RFACE rface(nThreads); 252 | 253 | RFACE::FilterOutput filterOutput = rface.filter(&filterData,targetIdx,featureWeights,&forestOptions,&filterOptions); 254 | 255 | Rcpp::List filterOutputR = Rcpp::List::create(Rcpp::Named("featureNames")=filterOutput.featureNames, 256 | Rcpp::Named("pValues")=filterOutput.pValues, 257 | Rcpp::Named("importances")=filterOutput.importances, 258 | Rcpp::Named("correlations")=filterOutput.correlations, 259 | Rcpp::Named("sampleCounts")=filterOutput.sampleCounts); 260 | 261 | 262 | return(filterOutputR); 263 | 264 | } 265 | 266 | -------------------------------------------------------------------------------- /src/rootnode.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ROOTNODE_HPP 2 | #define ROOTNODE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "node.hpp" 11 | #include "treedata.hpp" 12 | #include "options.hpp" 13 | #include "distributions.hpp" 14 | #include "datadefs.hpp" 15 | 16 | using datadefs::num_t; 17 | using datadefs::cat_t; 18 | 19 | class RootNode : public Node { 20 | public: 21 | 22 | // Empty tree 23 | RootNode(); 24 | 25 | // Learn Tree from data 26 | RootNode(TreeData* trainData, const size_t targetIdx, const distributions::PMF* pmf, const ForestOptions* 
forestOptions, distributions::Random* random); 27 | 28 | // Load tree from file 29 | RootNode(ifstream& treeStream); 30 | 31 | ~RootNode(); 32 | 33 | void reset(const size_t nNodes); 34 | 35 | void loadTree(ifstream& treeStream); 36 | 37 | void writeTree(ofstream& toFile); 38 | 39 | void growTree(TreeData* trainData, const size_t targetIdx, const distributions::PMF* pmf, const ForestOptions* forestOptions, distributions::Random* random); 40 | 41 | Node& childRef(const size_t childIdx); 42 | 43 | size_t nNodes() const; 44 | 45 | size_t nLeaves() const; 46 | 47 | const Prediction& getPrediction(TreeData* treeData, const size_t sampleIdx); 48 | 49 | vector getChildLeafNumTrainData(TreeData* treeData, const size_t sampleIdx); 50 | vector getChildLeafCatTrainData(TreeData* treeData, const size_t sampleIdx); 51 | 52 | vector getOobIcs(); 53 | 54 | size_t nOobSamples(); 55 | 56 | set getFeaturesInTree() { return( featuresInTree_ ); } 57 | 58 | string getTargetName() const { return( targetName_ ); } 59 | bool isTargetNumerical() const { return( isTargetNumerical_ ); } 60 | 61 | unordered_map getDI(); 62 | 63 | void verifyIntegrity() const; 64 | 65 | #ifndef TEST__ 66 | private: 67 | #endif 68 | 69 | size_t getTreeSizeEstimate(const size_t nSamples, const size_t nMaxLeaves, const size_t nodeSize) const; 70 | 71 | forest_t forestType_; 72 | string targetName_; 73 | bool isTargetNumerical_; 74 | 75 | // Parameters that are generated only when a tree is grown 76 | vector children_; 77 | 78 | size_t nLeaves_; 79 | 80 | vector bootstrapIcs_; 81 | vector oobIcs_; 82 | 83 | set featuresInTree_; 84 | 85 | vector minDistToRoot_; 86 | 87 | SplitCache splitCache_; 88 | 89 | }; 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /src/statistics.cpp: -------------------------------------------------------------------------------- 1 | #include "statistics.hpp" 2 | #include "utils.hpp" 3 | #include "math.hpp" 4 | 5 | 
statistics::RF_statistics::RF_statistics() { 6 | 7 | } 8 | 9 | statistics::RF_statistics::RF_statistics(vector > importanceMat, 10 | vector > contrastImportanceMat, 11 | vector > nodeMat, 12 | num_t executionTime): 13 | 14 | importanceMat_(importanceMat), 15 | contrastImportanceMat_(contrastImportanceMat), 16 | nodeMat_(nodeMat), 17 | executionTime_(executionTime) { 18 | 19 | 20 | 21 | } 22 | 23 | void statistics::RF_statistics::printContrastImportance(ofstream& toFile) { 24 | 25 | size_t nFeatures = contrastImportanceMat_[0].size(); 26 | size_t nPerms = contrastImportanceMat_.size(); 27 | 28 | for ( size_t featureIdx = 0; featureIdx < nFeatures; ++featureIdx ) { 29 | 30 | vector fSample( nPerms ); 31 | 32 | for( size_t permIdx = 0; permIdx < nPerms; ++permIdx ) { 33 | fSample[permIdx] = contrastImportanceMat_[permIdx][featureIdx]; 34 | } 35 | 36 | num_t mu = math::mean( utils::removeNANs(fSample) ); 37 | 38 | toFile << mu << endl; 39 | 40 | } 41 | 42 | 43 | } 44 | 45 | void statistics::RF_statistics::print(ofstream& toFile) { 46 | 47 | assert( nodeMat_.size() > 0 ); 48 | 49 | size_t nPerms = importanceMat_.size(); 50 | size_t nTrees = nodeMat_[0].size(); 51 | 52 | assert( nPerms == contrastImportanceMat_.size() ); 53 | assert( nPerms == nodeMat_.size() ); 54 | 55 | vector importanceVec( nPerms ); 56 | vector contrastImportanceVec( nPerms ); 57 | 58 | size_t nNodes = 0; 59 | 60 | for ( size_t permIdx = 0; permIdx < nPerms; ++permIdx ) { 61 | importanceVec[permIdx] = math::mean( utils::removeNANs(importanceMat_[permIdx]) ); 62 | contrastImportanceVec[permIdx] = math::mean( utils::removeNANs(contrastImportanceMat_[permIdx]) ); 63 | for ( size_t treeIdx = 0; treeIdx < nodeMat_[permIdx].size(); ++treeIdx ) { 64 | nNodes += nodeMat_[permIdx][treeIdx]; 65 | } 66 | } 67 | 68 | num_t meanNodesPerTree = 1.0 * nNodes / ( nPerms * nTrees ); 69 | 70 | num_t meanNodesPerSecond = 1.0 * nNodes / executionTime_; 71 | 72 | importanceVec = utils::removeNANs(importanceVec); 73 | 
contrastImportanceVec = utils::removeNANs(contrastImportanceVec); 74 | 75 | num_t meanImportance = math::mean(importanceVec); 76 | num_t meanContrastImportance = math::mean(contrastImportanceVec); 77 | 78 | num_t stdImportance = sqrtf( math::var(importanceVec) ); 79 | num_t stdContrastImportance = sqrtf( math::var(contrastImportanceVec) ); 80 | 81 | toFile << "Random Forest statistics" << endl 82 | << "------------------------" << endl 83 | << "-- NUMBER OF TREES PER FOREST = " << nTrees << endl 84 | << "-- NUMBER OF FORESTS = " << nPerms << endl 85 | << "-- MEAN IMPORTANCE = " << meanImportance << endl 86 | << "-- STD IMPORTANCE = " << stdImportance << endl 87 | << "-- MEAN CONTRAST IMPORTANCE = " << meanContrastImportance << endl 88 | << "-- STD CONTRAST IMPORTANCE = " << stdContrastImportance << endl 89 | << "-- MEAN NODES PER TREE = " << meanNodesPerTree << endl 90 | << "-- MEAN NODES PER SECOND = " << meanNodesPerSecond << endl; 91 | 92 | 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/statistics.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STATISTICS_HPP 2 | #define STATISTICS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "datadefs.hpp" 11 | 12 | using namespace std; 13 | using datadefs::num_t; 14 | using datadefs::NUM_NAN; 15 | 16 | namespace statistics { 17 | 18 | class RF_statistics { 19 | 20 | public: 21 | 22 | RF_statistics(); 23 | RF_statistics(vector > importanceMat, vector > contrastImportanceMat, vector > nodeMat, num_t executionTime); 24 | 25 | void printContrastImportance(ofstream& toFile); 26 | 27 | void print(ofstream& toFile); 28 | 29 | private: 30 | 31 | vector > importanceMat_; 32 | vector > contrastImportanceMat_; 33 | 34 | vector > nodeMat_; 35 | 36 | num_t executionTime_; 37 | 38 | }; 39 | } 40 | 41 | 42 | #endif 43 | 
-------------------------------------------------------------------------------- /src/stochasticforest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STOCHASTICFOREST_HPP 2 | #define STOCHASTICFOREST_HPP 3 | 4 | #include 5 | #include 6 | #include "rootnode.hpp" 7 | #include "treedata.hpp" 8 | #include "options.hpp" 9 | #include "distributions.hpp" 10 | 11 | using namespace std; 12 | 13 | class StochasticForest { 14 | public: 15 | 16 | StochasticForest(); 17 | 18 | ~StochasticForest(); 19 | 20 | void learnRF(TreeData* trainData, const size_t targetIdx, const ForestOptions* forestOptions, const vector& featureWeights, vector& randoms); 21 | void learnGBT(TreeData* trainData, const size_t targetIdx, const ForestOptions* forestOptions, const vector& featureWeights, vector& randoms); 22 | 23 | void loadForest(const string& fileName); 24 | 25 | 26 | void trainForestAndPredictQuantiles(TreeData* trainData, 27 | const size_t targetIdx, 28 | TreeData* testData, 29 | distributions::PMF* pmf, 30 | ForestOptions* forestOptions, 31 | distributions::Random* random, 32 | vector >& predictions); 33 | 34 | //num_t getError() { return(0.0); } 35 | //num_t getOobError(); 36 | 37 | //void getImportanceValues(TreeData* trainData, vector& importanceValues, vector& contrastImportanceValues); 38 | void getMDI(TreeData* trainData, vector& impurityValues, vector& contrastImpurityValues); 39 | 40 | void predict(TreeData* testData, vector& predictions, vector& confidence, size_t nThreads = 1); 41 | void predict(TreeData* testData, vector& predictions, vector& confidence, size_t nThreads = 1); 42 | 43 | //bool useQuantiles() const; 44 | 45 | void getNumDistributions(TreeData* testData, vector >& distributions, distributions::Random* random, const size_t nSamplesPerTree); 46 | void getCatDistributions(TreeData* testData, vector >& distributions, distributions::Random* random, const size_t nSamplesPerTree); 47 | 48 | //vector 
#ifndef TIMER_HPP
#define TIMER_HPP

#include <cstdlib>
#include <cmath>
#include <ctime>
#include <string>
#include <vector>
#include <map>
#include <iostream>

// NOTE: the previous #include "datadefs.hpp" was removed — nothing in this
// header uses datadefs, and dropping it makes timer.hpp self-contained.

using namespace std;

// Lightweight wall-clock + CPU-clock timer for named code sections.
// Usage: tic("name"); ...work...; toc("name"); ... print();
// "boost" reported by print() is CPU seconds / wall seconds, i.e. the
// apparent parallelization factor.
class Timer {

public:

  Timer() {}
  ~Timer() {}

  // Start (or restart) timing the section identified by objName.
  // FIX: the old code used map::insert, which silently keeps the FIRST
  // index on a repeated tic(); the stale object stayed "running" forever
  // and a later toc() measured from the original start. A restart now
  // reuses the existing slot.
  void tic(const string& objName) {
    map<string,size_t>::iterator it = name2idx_.find(objName);
    if ( it != name2idx_.end() ) {
      timedObjects_[it->second] = TimedObject(objName);
    } else {
      name2idx_[objName] = timedObjects_.size();
      timedObjects_.push_back( TimedObject(objName) );
    }
  }

  // Stop timing objName; records elapsed wall seconds, CPU clocks, and the
  // boost factor. Exits the process if objName was never started.
  void toc(const string& objName) {

    map<string,size_t>::const_iterator it( name2idx_.find(objName) );

    if ( it == name2idx_.end() ) {
      cerr << "Cannot stop timing '" << objName << "', since it was never started!" << endl;
      exit(1);
    }

    size_t idx = it->second; // reuse the lookup; the old code indexed the map a second time
    timedObjects_[idx].timeDiff = time(0) - timedObjects_[idx].startTime;
    timedObjects_[idx].clockDiff = clock() - timedObjects_[idx].startClocks;
    // Guard against division by zero for sections shorter than one second
    if ( timedObjects_[idx].timeDiff > 0 ) {
      timedObjects_[idx].boost = static_cast<clock_t>(round(1.0 * timedObjects_[idx].clockDiff / ( CLOCKS_PER_SEC * timedObjects_[idx].timeDiff )));
    }
    timedObjects_[idx].isRunning = false;
  }

  // Print one summary line per timed section.
  void print() {
    cout << "Execution time breakdown:" << endl;
    for ( size_t i = 0; i < timedObjects_.size(); ++i ) {
      timedObjects_[i].print();
    }
    cout << endl;
  }

private:

  struct TimedObject {
    string name;
    time_t startTime;    // wall-clock start; was clock_t, the wrong type for time(0)
    time_t timeDiff;     // elapsed wall seconds, valid once isRunning == false
    clock_t startClocks; // CPU clocks at tic()
    clock_t clockDiff;   // CPU clocks consumed between tic() and toc()
    clock_t boost;       // ~CPU seconds per wall second (parallelization factor)
    bool isRunning;
    // FIX: timeDiff/clockDiff are now zero-initialized so that printing a
    // section that was never toc'd is well-defined.
    TimedObject(const string& newName): name(newName),startTime(time(0)),timeDiff(0),startClocks(clock()),clockDiff(0),boost(1),isRunning(true) {}
    void print() {
      if ( !isRunning ) {
        cout << name << " " << timeDiff << " seconds (" << boost << "x)" << endl;
      } else {
        cout << name << " is still running!" << endl;
      }
    }
  };

  // Maps a section name to its slot in timedObjects_
  map<string,size_t> name2idx_;

  vector<TimedObject> timedObjects_;

};


#endif
| virtual num_t textualFeatureSplit(const size_t targetIdx, 63 | const size_t featureIdx, 64 | const uint32_t hashIdx, 65 | const size_t minSamples, 66 | vector& sampleIcs_left, 67 | vector& sampleIcs_right) = 0; 68 | 69 | // Generates a bootstrap sample from the real samples of featureIdx. Samples not in the bootstrap sample will be stored in oob_ics, 70 | // and the number of oob samples is stored in noob. 71 | virtual void bootstrapFromRealSamples(distributions::Random* random, 72 | const bool withReplacement, 73 | const num_t sampleSize, 74 | const size_t featureIdx, 75 | vector& ics, 76 | vector& oobIcs) = 0; 77 | 78 | virtual void createContrasts() = 0; 79 | virtual void permuteContrasts(distributions::Random* random) = 0; 80 | 81 | }; 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_HPP 2 | #define UTILS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "datadefs.hpp" 15 | #include "distributions.hpp" 16 | 17 | using namespace std; 18 | using datadefs::num_t; 19 | using datadefs::cat_t; 20 | 21 | class Treedata; 22 | 23 | namespace utils { 24 | 25 | string tolower(const string& str); 26 | 27 | string suffix(const string& str); 28 | 29 | // Removes missing values from the provided data vector 30 | //vector removeNANs(vector x); 31 | 32 | // Chomps a string, i.e. 
removes all the trailing end-of-line characters 33 | string chomp(const string& str, const string& eof = "\r\n"); 34 | 35 | // Removes leading and trailing whitespace characters 36 | string trim(const string& str, const string& wh = " "); 37 | 38 | unordered_set keys(const string& str, const char delimiter); 39 | 40 | // A sophisticated parser that extracts a key-value pairs from a string 41 | map parse(const string& str, 42 | const char delimiter, 43 | const char separator, 44 | const char comment); 45 | 46 | map parse(istream& streamObj, 47 | const char delimiter, 48 | const char separator, 49 | const char comment); 50 | 51 | unordered_set hashText(const string& text); 52 | 53 | // Splits a delimited string 54 | vector split(const string& str, const char delimiter, const string& wh = " "); 55 | 56 | // Splits a delimited stream 57 | vector split(istream& streamObj, const char delimiter, const string& wh = " "); 58 | 59 | // Reads a list of items from a file 60 | vector readListFromFile(const string& fileName, const char delimiter); 61 | 62 | template 63 | vector removeNANs(vector data) { 64 | 65 | data.erase( remove_if(data.begin(),data.end(),&datadefs::isNAN), data.end() ); 66 | 67 | return(data); 68 | 69 | } 70 | 71 | template 72 | inline void write(ostream& os, StartIterator startIt, StopIterator stopIt, const char delimiter = ' ') { 73 | 74 | if ( startIt != stopIt ) { 75 | os << *startIt; 76 | ++startIt; 77 | } 78 | 79 | while ( startIt != stopIt ) { 80 | os << delimiter << *startIt; 81 | ++startIt; 82 | } 83 | } 84 | 85 | void filterSort(const bool isIncreasingOrder, 86 | vector& data, 87 | vector& refIcs); 88 | 89 | string num2str(const num_t x); 90 | 91 | void strv2numv(const vector& strvec, 92 | vector& numvec); 93 | 94 | /* 95 | void strv2catv(const vector& strvec, 96 | vector& catvec, 97 | map& mapping, 98 | map& backMapping); 99 | */ 100 | 101 | void sortDataAndMakeRef(const bool isIncreasingOrder, 102 | vector& data, 103 | vector& refIcs); 104 | 105 
| /** 106 | * Sorts a given input data vector of type T based on a given reference 107 | * ordering of type vector. 108 | !! Correctness: this will fail if any of the contents of refIcs fall outside 109 | of the normal scope of vector& data. 110 | */ 111 | template void sortFromRef(vector& data, 112 | vector const& refIcs 113 | ) { 114 | assert(data.size() == refIcs.size()); 115 | vector foo = data; 116 | int n = data.size(); 117 | for (int i = 0; i < n; ++i) { 118 | data[i] = foo[refIcs[i]]; 119 | } 120 | } 121 | 122 | template 123 | T str2(const string& str) { 124 | 125 | if( datadefs::isNAN_STR(str) ) { 126 | return( static_cast(datadefs::NUM_NAN) ); 127 | } 128 | 129 | stringstream ss( chomp(str) ); 130 | T ret; 131 | ss >> ret; 132 | 133 | if ( ss.fail() || ss.bad() || !ss.eof() ) { 134 | cerr << "utils::convert::str2() -- input '" << str 135 | << "' incorrectly formatted for conversion to type T" << endl; 136 | exit(1); 137 | } 138 | 139 | return( ret ); 140 | } 141 | 142 | template 143 | vector > transpose(const vector >& data) { 144 | 145 | size_t nRows = data.size(); 146 | size_t nCols = data[0].size(); 147 | 148 | vector > dataTransposed(nCols,vector(nRows,datadefs::NUM_NAN)); 149 | 150 | for ( size_t i = 0; i < nRows; ++i ) { 151 | for ( size_t j = 0; j < nCols; ++j ) { 152 | dataTransposed[j][i] = data[i][j]; 153 | } 154 | } 155 | 156 | return(dataTransposed); 157 | 158 | } 159 | 160 | vector range(const size_t n); 161 | 162 | istream& safeGetline(istream& is, string& t); 163 | 164 | vector > splitRange(const size_t nElements, const size_t nSplits); 165 | 166 | template 167 | void permute(vector& data, distributions::Random* random) { 168 | 169 | // Permute indices 170 | for (size_t i = 0; i < data.size(); ++i) { 171 | size_t j = random->integer() % (i + 1); 172 | T temp = data[i]; 173 | data[i] = data[j]; 174 | data[j] = temp; 175 | } 176 | 177 | } 178 | 179 | num_t numericalFeatureSplitsNumericalTarget(const vector& tv, 180 | const vector& fv, 181 | 
const size_t minSamples, 182 | size_t& splitIdx); 183 | 184 | num_t numericalFeatureSplitsCategoricalTarget(const vector& tv, 185 | const vector& fv, 186 | const size_t minSamples, 187 | size_t& splitIdx); 188 | 189 | num_t categoricalFeatureSplitsNumericalTarget(const vector& tv, 190 | const vector& fv, 191 | const size_t minSamples, 192 | const vector& catOrder, 193 | unordered_map >& fmap_left, 194 | unordered_map >& fmap_right); 195 | 196 | num_t categoricalFeatureSplitsCategoricalTarget(const vector& tv, 197 | const vector& fv, 198 | const size_t minSamples, 199 | const vector& catOrder, 200 | unordered_map >& fmap_left, 201 | unordered_map >& fmap_right); 202 | 203 | 204 | } 205 | 206 | #endif 207 | -------------------------------------------------------------------------------- /test/GBT_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "argparse.hpp" 4 | #include "stochasticforest.hpp" 5 | #include "treedata.hpp" 6 | 7 | using namespace std; 8 | 9 | 10 | const size_t DEFAULT_TARGETIDX = 0; 11 | const size_t DEFAULT_NTREES = 500; 12 | const size_t DEFAULT_NODESIZE = 5; 13 | const num_t DEFAULT_SHRINKAGE = 0.2; 14 | const num_t DEFAULT_SUBSAMPLE = 0.5; 15 | 16 | int main(const int argc, char* const argv[]) { 17 | 18 | //------------------------------------------------------------------------ 19 | // 0: parameters 20 | if(argc == 1 || argc == 2) { 21 | if(argc == 2) { 22 | string helphandle(argv[1]); 23 | if (helphandle != "-h" && helphandle != "--help") { 24 | cerr << "use -h or --help to get started" << endl; 25 | return EXIT_FAILURE; 26 | } 27 | } 28 | 29 | cout << endl; 30 | cout << "REQUIRED ARGUMENTS:" << endl; 31 | cout << "-I / --input input feature matrix" << endl; 32 | cout << "-O / --output output association file" << endl; 33 | cout << endl; 34 | cout << "OPTIONAL ARGUMENTS:" << endl; 35 | cout << "-i / --targetidx target index, ref. 
to feature matrix (default " << DEFAULT_TARGETIDX << ")" << endl; 36 | cout << "-n / --ntrees number of trees per GBT forest (default " << DEFAULT_NTREES << ")" << endl; 37 | cout << "-s / --nodesize minimum number of train samples per node, affects tree depth (default " << DEFAULT_NODESIZE << ")" << endl; 38 | cout << "-z / --shrinkage shrinkage (default " << DEFAULT_SHRINKAGE << ")" << endl; 39 | cout << "-u / --subsample subsample size (default " << DEFAULT_SUBSAMPLE << ")" << endl; 40 | cout << endl; 41 | return EXIT_SUCCESS; 42 | } 43 | 44 | cout << endl; 45 | cout << " ----------------------------------" << endl; 46 | cout << " --- GBT_benchmark version 0.0.2 ---" << endl; 47 | cout << " ----------------------------------" << endl; 48 | 49 | //using namespace GetOpt; 50 | string input = ""; 51 | size_t targetIdx = DEFAULT_TARGETIDX; 52 | size_t ntrees = DEFAULT_NTREES; 53 | size_t nodesize = DEFAULT_NODESIZE; 54 | num_t shrinkage = DEFAULT_SHRINKAGE; 55 | num_t subSampleSize = DEFAULT_SUBSAMPLE; 56 | string output = ""; 57 | 58 | ArgParse parser(argc,argv); 59 | parser.getArgument("I","input",input); 60 | parser.getArgument("i","target",targetIdx); 61 | parser.getArgument("n","ntrees",ntrees); 62 | parser.getArgument("O","output",output); 63 | parser.getArgument("z","shrinkage",shrinkage); 64 | parser.getArgument("u","subsample",subSampleSize); 65 | 66 | 67 | if(input == "") { 68 | cerr << "input file not specified" << endl; 69 | return EXIT_FAILURE; 70 | } 71 | 72 | if(output == "") { 73 | cerr << "output file not specified" << endl; 74 | return EXIT_FAILURE; 75 | } 76 | 77 | 78 | //------------------------------------------------------------------------ 79 | // 1: read data into Treedata class (features are rows) 80 | cout < 0 ) { 14 | for ( i in 1:length(classes) ) { 15 | start <- (classes[i]-1)*300 + 1 16 | fakeClasses[i] <- sample(start:(start+offset),1,replace=T) 17 | } 18 | } 19 | 20 | return(fakeClasses) 21 | } 22 | 23 | 24 | 25 | 26 | makeData <- 
function(nSamples,std,offset,pMissing) { 27 | 28 | nWordsMin <- 4 29 | nWordsMax <- 8 30 | 31 | bags <- list( 32 | list("buckler","shield","sword","helmet","gloves","horse","medieval","castle","joust","clown","extra","words","that","mix"), 33 | list("swan","duck","duckling","bird","fly","pond","wings","feather","beak","legs","words","that","dont","distinguish"), 34 | list("baby","diaper","toy","poo","pee","smile","cry","toddler","infant","play","text","that","dont","distinguish")) 35 | 36 | classes <- sample(1:3,nSamples,replace=T) 37 | fakeClasses <- sampleFakeClasses(classes,offset) 38 | 39 | nWordsPerSample <- sample(nWordsMin:nWordsMax,nSamples,replace=TRUE) 40 | 41 | text <- vector() 42 | 43 | v <- seq(0,4*pi,length.out=nSamples) 44 | x1 <- sin(v) + rnorm(nSamples,0,std) 45 | x2 <- v + rnorm(nSamples,0,std) 46 | y <- x1 + x2 + rnorm(nSamples,0,std) 47 | 48 | nNoisyVars <- 4 49 | 50 | for ( i in 1:nSamples ) { 51 | c <- classes[i] 52 | nWords <- nWordsPerSample[i] 53 | # nWords <- 10 54 | text[i] <- paste(sample(bags[[c]],nWords,replace=F),collapse=', ') 55 | y[i] <- y[i] + 4 * pi * c 56 | } 57 | 58 | n1 <- rnorm(nSamples) 59 | n1[runif(nSamples) < pMissing & classes == 1] <- NA 60 | n2 <- rnorm(nSamples) 61 | n2[runif(nSamples) < pMissing & classes == 1] <- NA 62 | n3 <- rnorm(nSamples) 63 | n3[runif(nSamples) < pMissing & classes == 1] <- NA 64 | n4 <- rnorm(nSamples) 65 | n4[runif(nSamples) < pMissing & classes == 1] <- NA 66 | x1[runif(nSamples) < pMissing & classes == 1] <- NA 67 | x2[runif(nSamples) < pMissing & classes == 1] <- NA 68 | fakeClasses[runif(nSamples) < pMissing & classes == 1] <- NA 69 | 70 | # Populating the data frame with the training data 71 | data <- data.frame(y,x1,x2,text,as.character(fakeClasses),n1,n2,n3,n4,stringsAsFactors=FALSE) 72 | colnames(data) <- c("N:output","N:input1","N:input2","T:random","C:class","N:noise1","N:noise2","N:noise3","N:noise4") 73 | 74 | # Populating sample names 75 | rownames(data) <- 
paste(c(rep("s",nSamples)),(1:nSamples),sep='') 76 | 77 | return(data) 78 | 79 | } 80 | 81 | getRFACEOutput <- function(trainData,testData,forestType,noNABranching,quantiles=vector(length(0))) { 82 | 83 | rface <- rface.train(trainData,"N:output",nTrees=50,mTry=3,nodeSize=3,forestType=forestType,noNABranching=noNABranching) 84 | rfaceOut <- rface.predict(rface,testData,quantiles=as.vector(c(0.5))) 85 | rfaceOut$predData <- rfaceOut$predictions[[1]] 86 | return(rfaceOut) 87 | 88 | } 89 | 90 | getQuantileVector <- function(predictions,idx) { 91 | 92 | out <- vector(length=length(predictions)) 93 | 94 | for ( i in 1:length(predictions) ) { 95 | out[i] <- predictions[[i]][idx] 96 | } 97 | return(out) 98 | } 99 | 100 | testCalibration <- function(rfaceOut) { 101 | 102 | nQuantiles <- length(rfaceOut$quantiles) 103 | nSamples <- length(rfaceOut$trueData) 104 | cal <- 1*vector(length=nQuantiles) 105 | 106 | for ( q in 1:nQuantiles ) { 107 | cal[q] <- sum( rfaceOut$trueData < rfaceOut$predictions[[q]] )/nSamples 108 | } 109 | 110 | return(cal) 111 | } 112 | 113 | benchmarkMissingValues <- function(pMissing) { 114 | 115 | offset <- 0 116 | nSamples <- 1000 117 | std <- 0.3 118 | 119 | trainData <- makeData(nSamples,std,offset,pMissing) 120 | testData <- makeData(nSamples,std,offset,pMissing) 121 | 122 | icsNum <- as.vector(c(1,2,3,6,7,8,9)) 123 | icsNumTxt <- as.vector(c(1,2,3,4,6,7,8,9)) 124 | icsNumCat <- as.vector(c(1,2,3,5,6,7,8,9)) 125 | 126 | outA <- getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",TRUE) 127 | outB <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",TRUE) 128 | outC <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",TRUE) 129 | outD <- getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",FALSE) 130 | outE <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",FALSE) 131 | outF <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",FALSE) 132 | 133 | #outG <- 
getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",TRUE,quantiles=vector(c(0.5))) 134 | #outH <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",TRUE,quantiles=vector(c(0.5))) 135 | #outI <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",TRUE,quantiles=vector(c(0.5))) 136 | #outJ <- getRFACEOutput(trainData[icsNum],testData[icsNum],"RF",FALSE,quantiles=vector(c(0.5))) 137 | #outK <- getRFACEOutput(trainData[icsNumTxt],testData[icsNumTxt],"RF",FALSE,quantiles=vector(c(0.5))) 138 | #outL <- getRFACEOutput(trainData[icsNumCat],testData[icsNumCat],"RF",FALSE,quantiles=vector(c(0.5))) 139 | 140 | trainData$"C:class" <- as.factor(trainData$"C:class") 141 | testData$"C:class" <- as.factor(testData$"C:class") 142 | 143 | imputedTrainData <- na.roughfix(trainData[c(1,2,3,5,6,7,8,9)]) 144 | imputedTestData <- na.roughfix(testData[ c(1,2,3,5,6,7,8,9)]) 145 | 146 | rfOut1 <- randomForest(imputedTrainData[c(2,3,5,6,7,8)],y=imputedTrainData[[1]],xtest=imputedTestData[c(2,3,5,6,7,8)],ytest=imputedTestData[[1]],ntree=50,mtry=3) 147 | rfOut2 <- randomForest(imputedTrainData[c(2,3,4,5,6,7,8)],y=imputedTrainData[[1]],xtest=imputedTestData[c(2,3,4,5,6,7,8)],ytest=imputedTestData[[1]],ntree=50,mtry=3) 148 | 149 | outRef1 <- list() 150 | outRef1$trueData <- outA$trueData 151 | outRef1$predData <- rfOut1$test$predicted 152 | 153 | outRef2 <- list() 154 | outRef2$trueData <- outA$trueData 155 | outRef2$predData <- rfOut2$test$predicted 156 | 157 | colors <- testData$"C:class" 158 | 159 | # dev.new() 160 | pdf("scattermatrix.pdf") 161 | pairs(testData[c(1,2,3,7)],col=colors) 162 | dev.off() 163 | 164 | errors <- list() 165 | errors$num <- c(rmse(outRef1),rmse(outA),rmse(outD)) 166 | names(errors$num) <- c("RF\nImputed","RF-ACE\nBinary","RF-ACE\nTernary") 167 | errors$txt <- c(rmse(outRef1),rmse(outB),rmse(outE)) 168 | names(errors$txt) <- c("RF\nImputed","RF-ACE\nBinary","RF-ACE\nTernary") 169 | errors$cat <- c(rmse(outRef2),rmse(outC),rmse(outF)) 170 | 
# Compares the training speed (and prediction RMSE) of rface.train against
# randomForest on a dataset whose only predictor is a categorical feature.
# `offset` controls the cardinality of that feature via makeData's fake-class
# generator, so larger offsets stress the categorical splitter.
# Returns a list with per-package cumulative timings and RMSEs.
# NOTE(review): makeData() and rmse() are assumed to come from this file /
# test/R/utils.R — confirm they are sourced before calling this.
benchmarkCatSplitterSpeed <- function(offset) {

  nSamples <- 1000
  std <- 0.3
  pMissing <- 0.0

  trainData <- makeData(nSamples,std,offset,pMissing)
  testData <- makeData(nSamples,std,offset,pMissing)
  # Keep only the numeric target (col 1) and the categorical feature (col 5)
  trainData <- trainData[c(1,5)]
  testData <- testData[c(1,5)]

  speed <- list()

  # Accumulate wall time over 10 training runs of rface
  speed$rface <- 0
  for ( i in 1:10 ) {
    diff <- proc.time()
    rface <- rface.train(trainData,"N:output",nTrees=50,mTry=1,nodeSize=3,forestType="RF",noNABranching=FALSE)
    diff <- proc.time() - diff
    # [1] picks the "user.self" component of proc.time()
    speed$rface <- as.matrix(speed$rface + diff)[1]
  }

  # RMSE of the last trained rface forest (median prediction, q = 0.5)
  RMSE <- list()
  rfaceOut <- rface.predict(rface,testData,quantiles=as.vector(c(0.5)))
  rfaceOut$predData <- rfaceOut$predictions[[1]]
  RMSE$rface <- rmse(rfaceOut)

  # randomForest needs the categorical column as a factor
  trainData$"C:class" <- as.factor(trainData$"C:class")

  # randomForest is only benchmarked for small offsets — presumably because
  # it cannot handle high-cardinality factors; TODO confirm the threshold.
  speed$rf <- NA
  if (offset < 10) {
    speed$rf <- 0
    for ( i in 1:10 ) {
      diff <- proc.time()
      rf <- randomForest(trainData[2],y=trainData[[1]],ntree=50,mtry=1)
      diff <- proc.time() - diff
      speed$rf <- as.matrix(speed$rf + diff)[1]
    }
  }

  RMSE$rf <- NA
  if (offset < 10) {
    # Separate run with a test set to obtain out-of-sample predictions
    rf <- randomForest(trainData[2],y=trainData[[1]],xtest=testData[2],ytest=testData[[1]],ntree=50,mtry=1)

    rfOut <- list()
    rfOut$trueData <- rfaceOut$trueData
    rfOut$predData <- rf$test$predicted
    RMSE$rf <- rmse(rfOut)
  }

  return(list(rfSpeed=speed$rf,rfaceSpeed=speed$rface,data=trainData,rfRMSE=RMSE$rf,rfaceRMSE=RMSE$rface))
}
#!/bin/bash
#
# Measures how forest tree sizes depend on the fraction of missing values.
# Trains rf-ace on the original matrix and on 10/20/30 %-amputed copies,
# once with -N and once without (the *b vs *t forests; -N presumably
# disables NA branching — confirm against rf-ace --help), then collects the
# per-tree NNODES counts into tmp/treesizes.tsv, one row per forest.
#
# FIX: fail fast on any error instead of silently producing a partial
# treesizes.tsv, and replace eight copy-pasted grep pipelines with a loop.
set -euo pipefail

file="test_103by300_mixed_matrix.afm"

# Create copies of the data with 10/20/30 % of values amputed (set to NA).
for p in 1 2 3; do
    python test/python/ampute.py "$file" "0.$p" "tmp/foo_${p}.afm"
done

# Train the forests. Seeds (-S) are distinct so runs are reproducible but
# independent; *b forests use -N, *t forests do not.
bin/rf-ace --trainData "$file"       -i 0 -S 1 -n 100 -m 30 -V tmp/forest_0b.sf -N
bin/rf-ace --trainData tmp/foo_1.afm -i 0 -S 2 -n 100 -m 30 -V tmp/forest_1b.sf -N
bin/rf-ace --trainData tmp/foo_2.afm -i 0 -S 3 -n 100 -m 30 -V tmp/forest_2b.sf -N
bin/rf-ace --trainData tmp/foo_3.afm -i 0 -S 4 -n 100 -m 30 -V tmp/forest_3b.sf
bin/rf-ace --trainData "$file"       -i 0 -S 5 -n 100 -m 30 -V tmp/forest_0t.sf
bin/rf-ace --trainData tmp/foo_1.afm -i 0 -S 6 -n 100 -m 30 -V tmp/forest_1t.sf
bin/rf-ace --trainData tmp/foo_2.afm -i 0 -S 7 -n 100 -m 30 -V tmp/forest_2t.sf
bin/rf-ace --trainData tmp/foo_3.afm -i 0 -S 8 -n 100 -m 30 -V tmp/forest_3t.sf

# Extract NNODES= values: one tab-separated row per forest, same order as
# the original script (0b..3b then 0t..3t). The awk ORS quirk (row ends in
# newline + tab) is preserved from the original on purpose.
: > tmp/treesizes.tsv
for forest in 0b 1b 2b 3b 0t 1t 2t 3t; do
    grep "NNODES=" "tmp/forest_${forest}.sf" | cut -d',' -f2 | cut -d'=' -f2 \
        | awk 'BEGIN{ORS="\t"}1;END{print "\n"}' >> tmp/treesizes.tsv
done
@ATTRIBUTE X18 {0,1,2} 22 | @ATTRIBUTE X19 {0,1,2} 23 | @ATTRIBUTE Class {a,b} 24 | 25 | @DATA 26 | 0,1,1,2,2,1,1,1,2,1,2,2,0,1,1,1,1,1,1,1,b 27 | 0,1,1,0,2,1,0,0,2,1,2,1,0,1,1,1,1,2,1,0,a 28 | 0,1,1,2,2,0,0,0,2,1,2,1,0,1,1,1,1,1,1,0,a 29 | 1,1,1,2,2,1,1,1,2,0,2,2,0,1,1,2,1,0,2,1,b 30 | 0,1,0,2,2,0,0,1,2,1,0,1,0,2,1,1,1,1,1,0,a 31 | 0,1,1,1,2,1,0,0,2,1,2,1,0,1,1,1,1,1,1,0,a 32 | 0,1,1,2,2,1,0,1,2,1,1,2,0,1,1,0,1,1,1,1,b 33 | 0,1,1,2,0,1,0,0,2,1,2,1,0,1,0,2,1,2,1,0,a 34 | 0,1,1,1,2,1,0,0,0,2,1,1,0,1,1,1,1,1,1,0,a 35 | 0,1,1,1,0,1,1,1,0,1,2,2,1,0,1,1,2,1,1,1,b 36 | 0,1,1,2,2,1,0,0,2,1,2,1,0,1,1,0,1,1,1,0,a 37 | 0,1,1,2,2,1,0,0,2,1,2,1,0,1,1,1,1,1,1,0,a 38 | -------------------------------------------------------------------------------- /test/data/3by8_mixed_NA_matrix.afm: -------------------------------------------------------------------------------- 1 | N:var0 C:var1 N:var2 N:var3 N:var4 N:var5 N:var6 T:var7 2 | s0 NA foo 2.2 3.3 4.4 5.5 6.6 Ah, be so good. Yes, no? 3 | s1 0.00 NA 2.22 3.33 4.44 5.55 NA NA 4 | s2 0.000 bar 2.222 3.333 4.444 5.555 6.666 Some more text, but not much. -------------------------------------------------------------------------------- /test/data/3by8_mixed_NA_transposed_matrix.afm: -------------------------------------------------------------------------------- 1 | s0 s1 s2 2 | N:var0 NA 0.00 0.000 3 | C:var1 foo NA bar 4 | N:var2 2.2 2.22 2.222 5 | N:var3 3.3 3.33 3.333 6 | N:var4 4.4 4.44 4.444 7 | N:var5 5.5 5.55 5.555 8 | N:var6 6.6 NA 6.666 9 | T:var7 Ah, be so good. Yes, no? NA Some more text, but not much. 
-------------------------------------------------------------------------------- /test/data/5by10_numeric_matrix.arff: -------------------------------------------------------------------------------- 1 | @relation po 2 | @attribute x1 numeric 3 | @attribute x2 numeric 4 | @attribute x3 numeric 5 | @attribute x4 numeric 6 | @attribute y numeric 7 | @data 8 | 0.8147,1.0000,0.0596,0.9160,6.0000 9 | 0.9058,2.0000,0.6820,0.0012,14.0000 10 | 0.1270,3.0000,0.0424,0.4624,24.0000 11 | 0.9134,4.0000,0.0714,0.4243,36.0000 12 | 0.6324,5.0000,?,0.4609,50.0000 13 | 0.0975,6.0000,0.0967,0.7702,66.0000 14 | 0.2785,7.0000,0.8181,0.3225,84.0000 15 | 0.5469,?,0.8175,0.7847,104.0000 16 | 0.9575,9.0000,0.7224,0.4714,126.0000 17 | 0.9649,10.0000,0.1499,0.0358,150.0000 18 | -------------------------------------------------------------------------------- /test/distributions_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DISTRIBUTIONS_NEWTEST_HPP 2 | #define DISTRIBUTIONS_NEWTEST_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "utils.hpp" 9 | #include "newtest.hpp" 10 | #include "distributions.hpp" 11 | 12 | using namespace std; 13 | 14 | void distributions_newtest_random_integer(); 15 | void distributions_newtest_random_uniform(); 16 | void distributions_newtest_PMF(); 17 | 18 | void distributions_newtest() { 19 | 20 | newtest("integer()", &distributions_newtest_random_integer); 21 | newtest("uniform()", &distributions_newtest_random_uniform); 22 | newtest("pmf()", &distributions_newtest_PMF); 23 | 24 | } 25 | 26 | void distributions_newtest_random_integer() { 27 | 28 | // Make two identical random integer generators 29 | distributions::Random randGen1(0); 30 | distributions::Random randGen2(0); 31 | 32 | bool stayInSync = true; 33 | 34 | // Test that rand1 and rand2 stay in sync 35 | for ( size_t i = 0; i < 1000; ++i ) { 36 | 37 | size_t r1 = randGen1.integer(); 38 | size_t r2 = randGen2.integer(); 39 | 40 | if ( 
r1 != r2 ) { 41 | stayInSync = false; 42 | break; 43 | } 44 | 45 | } 46 | 47 | newassert( stayInSync ); 48 | 49 | unordered_map hist; 50 | 51 | size_t maxIdx = 1000; 52 | 53 | for ( size_t i = 0; i < maxIdx; ++i ) { 54 | hist[i] = 0; 55 | } 56 | 57 | for ( size_t i = 0; i < 100000; ++i ) { 58 | //size_t r = rand1() % maxIdx; 59 | ++hist[ randGen1.integer() % maxIdx ]; 60 | } 61 | 62 | size_t nZeroCounts = 0; 63 | 64 | for ( size_t i = 0; i < maxIdx; ++i ) { 65 | if ( hist[i] == 0 ) ++nZeroCounts; 66 | } 67 | 68 | // We allow there to be at most two indices that never got sampled during 69 | // 100k random number generation rounds 70 | newassert( nZeroCounts <= 2 ); 71 | 72 | } 73 | 74 | void distributions_newtest_random_uniform() { 75 | 76 | num_t r_min = datadefs::NUM_INF; 77 | num_t r_max = 0.0; 78 | 79 | distributions::Random random(0); 80 | 81 | bool stayWithinBounds = true; 82 | 83 | for ( size_t i = 0; i < 100000; ++i ) { 84 | num_t r = random.uniform(); 85 | 86 | if ( ! (0.0 <= r && r <= 1.0) ) { 87 | stayWithinBounds = false; 88 | break; 89 | } 90 | 91 | if ( r_min > r ) r_min = r; 92 | if ( r_max < r ) r_max = r; 93 | 94 | } 95 | 96 | newassert( stayWithinBounds ); 97 | newassert( r_max > r_min ); 98 | newassert( fabs( 1 - r_max - r_min ) < 0.0001 ); 99 | 100 | } 101 | 102 | void distributions_newtest_PMF() { 103 | 104 | distributions::Random random(0); 105 | 106 | vector weights = {1,2,3,5,3,1,0,1e-5}; 107 | 108 | num_t sum = math::mean(weights) * weights.size(); 109 | 110 | distributions::PMF pmf(weights); 111 | 112 | vector PMFest(8,0.0); 113 | 114 | size_t maxIter = 1e7; 115 | num_t incr = 1.0/maxIter; 116 | 117 | for ( size_t i = 0; i < maxIter; ++i ) { 118 | PMFest[ pmf.sample(&random) ] += incr; 119 | } 120 | 121 | for ( size_t i = 0; i < 8; ++i ) { 122 | newassert( fabs( PMFest[i] - weights[i] / sum ) < 0.01 ); 123 | } 124 | 125 | } 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- 
function [DI,splitValues_left,splitValues_right,ics_left,ics_right] = categoricalFeatureSplit(tv,fv,minSamples,isTargetNumerical)
% Greedy best-first partition of the categories of feature fv into a left
% and a right branch, maximizing the impurity decrease of target tv.
% Categories are moved one at a time from right to left while the gain
% improves and both branches keep at least minSamples samples.
%
% NOTE(review): the returned ics_left/ics_right index into the NaN-filtered
% vectors, NOT the original inputs (unlike numericalFeatureSplit, which maps
% back through ics) — confirm this asymmetry is intended.

% Eliminate samples where either target or feature is NaN
ics = find(~isnan(tv) & ~isnan(fv));
tv = tv(ics);
fv = fv(ics);

fVals = unique(fv);
fVals(isnan(fVals)) = [];

n = length(tv);
assert(n == length(fv));

% Logical branch membership masks over the filtered samples
ics_left = false(1,n);
ics_right = true(1,n);

splitValues_left = [];
splitValues_right = fVals;

DI_best = 0;

while true

    splitVal = -1;

    % Try moving each remaining right-branch category to the left branch
    for i = 1:length(splitValues_right)

        fVal = splitValues_right(i);

        ics_left_test = ics_left | fv == fVal;
        ics_right_test = ics_right & fv ~= fVal;

        DI = deltaImpurity(tv(ics_left_test),tv(ics_right_test),isTargetNumerical);

        % BUGFIX: the old condition used sum(~isnan(mask)), but the masks are
        % logical arrays that can never contain NaN, so the sum always equaled
        % n and the minSamples constraint was never enforced. Count the
        % samples actually assigned to each branch instead. Both branches now
        % use >= (the old code inconsistently used > on the right branch).
        if DI > DI_best && sum(ics_left_test) >= minSamples && sum(ics_right_test) >= minSamples
            DI_best = DI;
            splitVal = fVal;
        end

    end

    % No category improved the gain under the constraints: stop
    if splitVal == -1
        break;
    end

    % Commit the best move of this round
    splitValues_left = unique([splitValues_left,splitVal]);
    splitValues_right = setdiff(splitValues_right,splitVal);

    ics_left = ics_left | fv == splitVal;
    ics_right = ics_right & fv ~= splitVal;

end

% Convert membership masks to index lists (into the filtered vectors)
ics_left = find(ics_left);
ics_right = find(ics_right);

DI = DI_best;
deltaImpurity(x,idx) 3 | % 4 | %Returns decrease in impurity when data x is split into two 5 | %halves, "x_left" and "x_right". Type of data is indicated by 6 | % isNumerical flag 7 | 8 | assert(~any(isnan(x_left))); 9 | assert(~any(isnan(x_right))); 10 | 11 | % Calculate the decrease using the variance formula (slow+unstable) 12 | if isNumerical 13 | 14 | DI = deltaImpurity_var_regr(x_left,x_right); 15 | 16 | %Calculate the decrease using the mean formulat (fast+stable) 17 | DI_test = deltaImpurity_mean_regr(x_left,x_right); 18 | 19 | else 20 | 21 | DI = deltaImpurity_gi_class(x_left,x_right); 22 | 23 | DI_test = deltaImpurity_sf_class(SF([x_left(:);x_right(:)]),length(x_left) + length(x_right),SF(x_left),length(x_left),SF(x_right),length(x_right)); 24 | 25 | end 26 | 27 | %Make sure the two measures agree 28 | if any(isnan([DI,DI_test])) 29 | assert(isnan(DI) && isnan(DI_test), 'error: only the other impurity function yields NaN'); 30 | else 31 | assert( abs(DI - DI_test ) < 1e-3, 'error: impurity functions disagree in value'); 32 | end 33 | 34 | 35 | function DI = deltaImpurity_mean_regr(x_left,x_right) 36 | 37 | x = [x_left(:);x_right(:)]; 38 | 39 | mu = mean(x); 40 | n = length(x); 41 | muL = mean(x_left); 42 | nL = length(x_left); 43 | muR = mean(x_right); 44 | nR = length(x_right); 45 | 46 | DI = -mu^2 + nL/n*muL^2 + nR/n*muR^2; 47 | 48 | 49 | function DI = deltaImpurity_var_regr(x_left,x_right) 50 | 51 | x = [x_left(:);x_right(:)]; 52 | n = length(x); 53 | nL = length(x_left); 54 | nR = length(x_right); 55 | 56 | DI = var(x,1) - nL/n*var(x_left,1) - nR/n*var(x_right,1); 57 | 58 | 59 | function DI = deltaImpurity_gi_class(x_left,x_right) 60 | 61 | x = [x_left(:);x_right(:)]; 62 | n = length(x); 63 | nL = length(x_left); 64 | nR = length(x_right); 65 | 66 | DI = giniIndex(x) - nL/n*giniIndex(x_left) - nR/n*giniIndex(x_right); 67 | 68 | function DI = deltaImpurity_sf_class(sf_tot,n_tot,sf_left,n_left,sf_right,n_right) 69 | 70 | DI = -sf_tot/(n_tot*n_tot) + 
sf_left/(n_tot*n_left) + sf_right / (n_tot*n_right); 71 | 72 | function sf = SF(x) 73 | x = x+1; 74 | sf = sum(hist(x,unique(x)).^2); 75 | 76 | function GI = giniIndex(x) 77 | 78 | GI = hist(x,unique(x))/length(x); 79 | if ~isempty(GI) 80 | GI = 1 - sum(GI.^2); 81 | else 82 | GI = 0; 83 | end 84 | 85 | -------------------------------------------------------------------------------- /test/matlab/numericalFeatureSplit.m: -------------------------------------------------------------------------------- 1 | function [DI,splitValue,ics_left,ics_right] = numericalFeatureSplit(tv,fv,minSplit,isTargetNumerical) 2 | 3 | %eliminate NaNs 4 | ics = find(~isnan(tv) & ~isnan(fv)); 5 | tv = tv(ics); 6 | fv = fv(ics); 7 | 8 | [fv,T] = sort(fv,'ascend'); 9 | 10 | tv = tv(T); 11 | ics = ics(T); 12 | 13 | n = length(tv); 14 | assert(n == length(fv)); 15 | 16 | DIvec = zeros(1,n); 17 | 18 | for i = minSplit:(n-minSplit) 19 | if fv(i) == fv(i+1), continue, end; 20 | DIvec(i) = deltaImpurity(tv(1:i),tv(i+1:end),isTargetNumerical); 21 | end 22 | 23 | [DI,idx] = max(DIvec); 24 | 25 | splitValue = fv(idx); 26 | 27 | ics_left = ics(1:idx); 28 | ics_right = ics(idx+1:end); -------------------------------------------------------------------------------- /test/matlab/readAFM.m: -------------------------------------------------------------------------------- 1 | function [X,rowHeaders,colHeaders] = readAFM(afmFile) 2 | 3 | S = importdata(afmFile); 4 | 5 | X = S.data; 6 | rowHeaders = S.textdata(2:end,1); 7 | colHeaders = S.textdata(1,2:end); 8 | 9 | [nRows,nCols] = size(X); 10 | 11 | fprintf('%i rows and %i columns read\n',nRows,nCols); 12 | 13 | assert(numel(rowHeaders) == nRows, 'error: row count mismatch\n'); 14 | assert(numel(colHeaders) == nCols, 'error: columns count mismatch\n'); 15 | 16 | -------------------------------------------------------------------------------- /test/matlab/writeAFM.m: -------------------------------------------------------------------------------- 1 | function 
writeAFM(X,featureHeaders,sampleHeaders,fileName) 2 | 3 | fid = fopen(fileName,'w'); 4 | 5 | [f,n] = size(X); 6 | 7 | if f > 0 8 | assert( length(featureHeaders) == f ); 9 | else 10 | f = length(featureHeaders); 11 | end 12 | 13 | if isempty(sampleHeaders) 14 | for i = 1:n 15 | fprintf(fid,'\t%s',['S',num2str(i)]); 16 | end 17 | else 18 | assert( length(sampleHeaders) == n ); 19 | for i = 1:n 20 | fprintf(fid,'\t%s',sampleHeaders{i}); 21 | end 22 | end 23 | 24 | fprintf(fid,'\n'); 25 | 26 | for i = 1:f 27 | fprintf(fid,'%s',featureHeaders{i}); 28 | 29 | if n > 0 30 | if strcmp(featureHeaders{i}(1:2),'N:') 31 | fmt = repmat('\t%6.3f',[1,n]); 32 | else 33 | fmt = repmat('\t%i',[1,n]); 34 | end 35 | 36 | fprintf(fid,fmt,X(i,:)); 37 | 38 | end 39 | 40 | fprintf(fid,'\n'); 41 | end 42 | 43 | fclose(fid); -------------------------------------------------------------------------------- /test/newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NEWTEST_HPP 2 | #define NEWTEST_HPP 3 | 4 | #define TEST__ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | std::stringstream ERRLOG; 12 | 13 | size_t N_SUCCESS = 0; 14 | size_t N_FAIL = 0; 15 | 16 | #define newassert(condition) { if(!(condition)){ ERRLOG << " => FAIL: " << #condition << " @ " << __FILE__ << " (" << __LINE__ << ")" << std::endl; N_FAIL++; } else { N_SUCCESS++; } } 17 | 18 | void printERRLOG() { 19 | std::string errLine; 20 | while( std::getline(ERRLOG,errLine) ) { 21 | std::cerr << errLine << std::endl; 22 | } 23 | ERRLOG.clear(); 24 | } 25 | 26 | void rewindERRLOG() { 27 | ERRLOG.seekg(std::ios_base::beg); 28 | ERRLOG.clear(); 29 | } 30 | 31 | void newtestinit() { 32 | 33 | // Reset counters 34 | N_SUCCESS = 0; 35 | N_FAIL = 0; 36 | 37 | // Clear error stream 38 | ERRLOG.clear(); 39 | ERRLOG.str(""); 40 | 41 | std::cout << std::endl << "UNIT TESTING STARTED" << std::endl; 42 | 43 | } 44 | 45 | void newtest(const std::string& info, void 
(*testFunc)(void) ) { 46 | 47 | size_t nOldSuccess = N_SUCCESS; 48 | size_t nOldFail = N_FAIL; 49 | size_t nOldTests = N_SUCCESS + N_FAIL; 50 | 51 | std::cout << " TEST: " << info << "..." << std::flush; 52 | testFunc(); 53 | std::cout << " " << N_SUCCESS - nOldSuccess << "/" << N_SUCCESS + N_FAIL - nOldTests << " OK " << std::flush; 54 | 55 | if ( N_FAIL > nOldFail ) { 56 | std::cout << " !! " << N_FAIL - nOldFail << " FAILURES !! " << std::flush; 57 | } 58 | 59 | std::cout << std::endl; 60 | 61 | printERRLOG(); 62 | 63 | } 64 | 65 | void newtestdone() { 66 | 67 | std::cout << std::endl << "ALL DONE! " << N_SUCCESS + N_FAIL << " tests run: " << N_SUCCESS << " successes and " << N_FAIL << " failures" << std::endl << std::endl; 68 | rewindERRLOG(); 69 | printERRLOG(); 70 | 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /test/node_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NODE_NEWTEST_HPP 2 | #define NODE_NEWTEST_HPP 3 | 4 | #include 5 | 6 | #include "newtest.hpp" 7 | #include "node.hpp" 8 | #include "datadefs.hpp" 9 | 10 | using namespace std; 11 | using datadefs::num_t; 12 | 13 | void node_newtest_getChildLeaves(); 14 | void node_newtest_setSplitter(); 15 | void node_newtest_percolateData(); 16 | void node_newtest_getLeafTrainPrediction(); 17 | void node_newtest_hasChildren(); 18 | void node_newtest_recursiveNodeSplit(); 19 | void node_newtest_cleanPairVectorFromNANs(); 20 | void node_newtest_recursiveNDescendantNodes(); 21 | void node_newtest_regularSplitterSeek(); 22 | 23 | void node_newtest() { 24 | 25 | newtest( "getChildLeaves(x)", &node_newtest_getChildLeaves ); 26 | newtest( "setSplitter(x)", &node_newtest_setSplitter ); 27 | newtest( "percolateData(x)", &node_newtest_percolateData ); 28 | newtest( "getLeafTrainPrediction(x)", &node_newtest_getLeafTrainPrediction ); 29 | newtest( "hasChildren(x)", &node_newtest_hasChildren ); 30 | 
newtest( "recursiveNodeSplit(x)", &node_newtest_recursiveNodeSplit ); 31 | newtest( "cleanPairVectorFromNANs(x)", &node_newtest_cleanPairVectorFromNANs ); 32 | newtest( "recursiveNDescendantNodes(x)", &node_newtest_recursiveNDescendantNodes ); 33 | newtest( "regularSplitterSeek(x)", &node_newtest_regularSplitterSeek ); 34 | 35 | 36 | } 37 | 38 | void node_newtest_getChildLeaves() { 39 | 40 | Node node,nodeL,nodeR,nodeM,nodeLL,nodeLR; 41 | 42 | node.setSplitter(0.0,"foo",static_cast(5.0),nodeL,nodeR); 43 | nodeL.setSplitter(0.0,"bar",static_cast(6.0),nodeLL,nodeLR); 44 | node.missingChild_ = &nodeM; 45 | 46 | nodeR.setNumTrainPrediction(1.3); 47 | nodeM.setNumTrainPrediction(2.2); 48 | 49 | nodeLL.setNumTrainPrediction(1.1); 50 | nodeLR.setNumTrainPrediction(1.2); 51 | 52 | nodeLL.setNumTrainData({1,2,3}); 53 | nodeLR.setNumTrainData({4,5}); 54 | nodeR.setNumTrainData({6}); 55 | nodeM.setNumTrainData({7}); 56 | 57 | vector childLeaves = node.getSubTreeLeaves(); 58 | 59 | set childLeavesSet(childLeaves.begin(),childLeaves.end()); 60 | 61 | newassert( childLeaves.size() == 4 ); 62 | newassert( childLeavesSet.size() == 4 ); 63 | newassert( childLeavesSet.find(&nodeLL) != childLeavesSet.end() ); 64 | newassert( childLeavesSet.find(&nodeLR) != childLeavesSet.end() ); 65 | newassert( childLeavesSet.find(&nodeR) != childLeavesSet.end() ); 66 | newassert( childLeavesSet.find(&nodeM) != childLeavesSet.end() ); 67 | 68 | childLeaves = nodeR.getSubTreeLeaves(); 69 | newassert( childLeaves.size() == 1 ); 70 | newassert( childLeaves[0] == &nodeR ); 71 | 72 | childLeaves = nodeM.getSubTreeLeaves(); 73 | newassert( childLeaves.size() == 1 ); 74 | newassert( childLeaves[0] == &nodeM ); 75 | 76 | childLeaves = nodeL.getSubTreeLeaves(); 77 | newassert( childLeaves.size() == 2 ); 78 | childLeavesSet = set(childLeaves.begin(),childLeaves.end()); 79 | newassert( childLeavesSet.find(&nodeLL) != childLeavesSet.end() ); 80 | newassert( childLeavesSet.find(&nodeLR) != childLeavesSet.end() 
); 81 | 82 | vector trainData = nodeLL.getPrediction().numTrainData; 83 | set trainDataSet(trainData.begin(),trainData.end()); 84 | 85 | newassert( trainDataSet.find(1) != trainDataSet.end() ); 86 | newassert( trainDataSet.find(2) != trainDataSet.end() ); 87 | newassert( trainDataSet.find(3) != trainDataSet.end() ); 88 | 89 | } 90 | 91 | 92 | void node_newtest_setSplitter() { 93 | 94 | //size_t splitterIdx = 3; 95 | datadefs::num_t splitLeftLeqValue = 0.5; 96 | //datadefs::num_t leftFraction = 0.5; 97 | 98 | //Splitter::Splitter splitter(0.5); 99 | 100 | Node node,leftChild,rightChild; 101 | 102 | node.setSplitter(0.0,"foo",splitLeftLeqValue,leftChild,rightChild); 103 | 104 | //newassert( node.splitterIdx() == splitterIdx ); 105 | newassert( node.splitter_.type == Feature::Type::NUM ); 106 | newassert( fabs(node.splitter_.leftLeqValue - splitLeftLeqValue) < datadefs::EPS ); 107 | //newassert( fabs(node.splitter_.leftFraction - leftFraction) < datadefs::EPS ); 108 | 109 | } 110 | 111 | void node_newtest_percolateData() { 112 | 113 | DenseTreeData treeData("test_2by10_text_matrix.afm",'\t',':'); 114 | 115 | uint32_t h; 116 | 117 | MurmurHash3_x86_32("c",1,0,&h); 118 | 119 | Node node,leftChild,rightChild; 120 | 121 | node.setSplitter(0.0,"T:in",h,leftChild,rightChild); 122 | 123 | newassert( &leftChild == node.leftChild() ); 124 | newassert( &rightChild == node.rightChild() ); 125 | 126 | newassert( NULL == node.missingChild() ); 127 | 128 | newassert( node.percolate(&treeData,0,1) == &rightChild ); 129 | newassert( node.percolate(&treeData,1,1) == &rightChild ); 130 | newassert( node.percolate(&treeData,2,1) == &rightChild ); 131 | newassert( node.percolate(&treeData,3,1) == &rightChild ); 132 | newassert( node.percolate(&treeData,4,1) == &rightChild ); 133 | newassert( node.percolate(&treeData,5,1) == &leftChild ); 134 | newassert( node.percolate(&treeData,6,1) == &leftChild ); 135 | newassert( node.percolate(&treeData,7,1) == &leftChild ); 136 | newassert( 
node.percolate(&treeData,8,1) == &leftChild ); 137 | newassert( node.percolate(&treeData,9,1) == &leftChild ); 138 | newassert( node.percolate(&treeData,10,1) == &leftChild ); 139 | newassert( node.percolate(&treeData,11,1) == &leftChild ); 140 | newassert( node.percolate(&treeData,12,1) == &leftChild ); 141 | newassert( node.percolate(&treeData,13,1) == &leftChild ); 142 | newassert( node.percolate(&treeData,14,1) == &leftChild ); 143 | newassert( node.percolate(&treeData,15,1) == &rightChild ); 144 | newassert( node.percolate(&treeData,16,1) == &rightChild ); 145 | newassert( node.percolate(&treeData,17,1) == &rightChild ); 146 | newassert( node.percolate(&treeData,18,1) == &rightChild ); 147 | newassert( node.percolate(&treeData,19,1) == &rightChild ); 148 | 149 | 150 | } 151 | 152 | void node_newtest_regularSplitterSeek() { 153 | 154 | } 155 | 156 | void node_newtest_getLeafTrainPrediction() { 157 | } 158 | 159 | void node_newtest_hasChildren() { 160 | } 161 | 162 | void node_newtest_recursiveNodeSplit() { 163 | } 164 | 165 | void node_newtest_cleanPairVectorFromNANs() { 166 | 167 | } 168 | 169 | void node_newtest_recursiveNDescendantNodes() { 170 | 171 | } 172 | 173 | #endif 174 | -------------------------------------------------------------------------------- /test/python/ampute.py: -------------------------------------------------------------------------------- 1 | import csv,sys,random 2 | 3 | afmFileIn = sys.argv[1] 4 | afmFileOut = sys.argv[3] 5 | pMissing = float(sys.argv[2]) 6 | 7 | assert afmFileIn != afmFileOut 8 | assert 0 < pMissing < 1 9 | 10 | afmReader = csv.reader(open(afmFileIn,'r'),delimiter='\t') 11 | afmWriter = csv.writer(open(afmFileOut,'w'),delimiter='\t') 12 | 13 | afmWriter.writerow(afmReader.next()) 14 | 15 | for inputLine in afmReader: 16 | 17 | afmWriter.writerow( [inputLine[0]] + [ "NA" if random.uniform(0,1) < pMissing else x for x in inputLine[1:] ] ) 18 | 
-------------------------------------------------------------------------------- /test/python/deltaImpurity.py: -------------------------------------------------------------------------------- 1 | """ 2 | % Caculate the decrease using the variance formula (slow+unstable) 3 | if isNumerical 4 | DI = deltaImpurity_var_regr(x_left,x_right); 5 | 6 | %Calculate the decrease using the mean formulat (fast+stable) 7 | DI_test = deltaImpurity_mean_regr(x_left,x_right); 8 | 9 | %Make sure the two measures agree 10 | assert( abs(DI - DI_test ) < 1e-5, 'error: impurity functions disagree'); 11 | else 12 | 13 | DI = deltaImpurity_class(x_left,x_right); 14 | 15 | end 16 | 17 | function DI = deltaImpurity_mean_regr(x_left,x_right) 18 | 19 | x = [x_left(:);x_right(:)]; 20 | 21 | mu = mean(x); 22 | n = length(x); 23 | muL = mean(x_left); 24 | nL = length(x_left); 25 | muR = mean(x_right); 26 | nR = length(x_right); 27 | 28 | DI = -mu^2 + nL/n*muL^2 + nR/n*muR^2; 29 | 30 | function DI = deltaImpurity_var_regr(x_left,x_right) 31 | 32 | x = [x_left(:);x_right(:)]; 33 | n = length(x); 34 | nL = length(x_left); 35 | nR = length(x_right); 36 | DI = var(x,1) - nL/n*var(x_left,1) - nR/n*var(x_right,1); 37 | 38 | function DI = deltaImpurity_class(x_left,x_right) 39 | x = [x_left(:);x_right(:)]; 40 | n = length(x); 41 | nL = length(x_left); 42 | nR = length(x_right); 43 | DI = giniIndex(x) - nL/n*giniIndex(x_left) - nR/n*giniIndex(x_right); 44 | 45 | function GI = giniIndex(x) 46 | GI = hist(x,unique(x))/length(x); 47 | if ~isempty(GI) 48 | GI = 1 - sum(GI.^2); 49 | else 50 | GI = 0; 51 | end 52 | """ 53 | 54 | import sys 55 | import getopt 56 | import numpy 57 | 58 | def myHist(list): 59 | dic = {} 60 | for l in list: 61 | print l 62 | if (dic.get(l)): 63 | dic[l] = dic[l] + 1 64 | else: 65 | dic[l] = 1 66 | return dic 67 | 68 | def giniIndex(x): 69 | print "Begin giniIndex" 70 | print x 71 | L = len(x) 72 | sorted_x = sorted(x) 73 | hist = myHist(sorted_x) 74 | """ 75 | numeric_sx = [] 76 
| for v in sorted_x: 77 | numeric_sx.append(float(v)) 78 | print numeric_sx 79 | myset = set(x) 80 | y = numpy.cumsum(numeric_sx) 81 | B = sum(y) / (y[-1] * L) 82 | return 1 + 1./L - 2*B 83 | """ 84 | print hist.keys() 85 | GI = 0.0 86 | for v in hist.values(): 87 | GI = GI + pow(float(v)/float(L), 2) 88 | return 1 - GI; 89 | 90 | """ 91 | function DI = deltaImpurity_mean_regr(x_left,x_right) 92 | 93 | x = [x_left(:);x_right(:)]; 94 | 95 | mu = mean(x); 96 | n = length(x); 97 | muL = mean(x_left); 98 | nL = length(x_left); 99 | muR = mean(x_right); 100 | nR = length(x_right); 101 | 102 | DI = -mu^2 + nL/n*muL^2 + nR/n*muR^2; 103 | """ 104 | 105 | def diIndex(x,y): 106 | x = [float(v) for v in x] 107 | y = [float(v) for v in y] 108 | w = x + y 109 | 110 | L = len(w) 111 | xL = len(x) 112 | yL = len(y) 113 | mw = numpy.mean(w) 114 | mx = numpy.mean(x) 115 | my = numpy.mean(y) 116 | return (-1.0)*pow(mw,2) + float(xL)/L*(pow(mx,2)) + float(yL)/L*(pow(my,2)) 117 | 118 | def main(): 119 | try: 120 | opts, args = getopt.getopt(sys.argv[1:], "h", ["help"]) 121 | except getopt.error, msg: 122 | print msg 123 | print "for help use --help" 124 | sys.exit(2) 125 | for o, a in opts: 126 | if o in ("-h", "--help"): 127 | print __doc__ 128 | sys.exit(0) 129 | #for arg in args: 130 | #process(arg) # process() is defined elsewhere 131 | l = args[0].split(",") 132 | r = args[1].split(",") 133 | w = l + r 134 | left = giniIndex(l) 135 | right = giniIndex(r) 136 | print left 137 | print right 138 | wgi = giniIndex(w) - (float(len(l))/float(len(w))*left + float(len(r))/float(len(w))*right ) 139 | print wgi 140 | 141 | print diIndex(l,r) 142 | 143 | if __name__ == "__main__": 144 | main() 145 | 146 | -------------------------------------------------------------------------------- /test/reader_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef READER_NEWTEST_HPP 2 | #define READER_NEWTEST_HPP 3 | 4 | #include "newtest.hpp" 5 | 
#include "reader.hpp" 6 | #include "datadefs.hpp" 7 | #include "treedata.hpp" 8 | 9 | using namespace std; 10 | using datadefs::num_t; 11 | 12 | void reader_newtest_readAFM(); 13 | 14 | void reader_newtest() { 15 | 16 | newtest( "Testing Reader class with AFM data", &reader_newtest_readAFM ); 17 | 18 | } 19 | 20 | void reader_newtest_readAFM() { 21 | 22 | Reader reader("test/data/3by8_mixed_NA_matrix.afm",'\t'); 23 | 24 | newassert( reader.nLines() == 4 ); 25 | 26 | size_t nSamples = reader.nLines() - 1; 27 | 28 | vector features; 29 | 30 | // Removing top-left corner from table having column and row headers 31 | reader.nextLine(); 32 | reader.skipField(); 33 | 34 | size_t nVars = 0; 35 | 36 | // Check that all variable names are valid 37 | for ( ; ! reader.endOfLine(); ++nVars ) { 38 | string varName; reader >> varName; 39 | if ( varName.substr(0,2) == "N:" ) { 40 | features.push_back( Feature(Feature::Type::NUM,varName,nSamples) ); 41 | } else if ( varName.substr(0,2) == "C:" ) { 42 | features.push_back( Feature(Feature::Type::CAT,varName,nSamples) ); 43 | } else if ( varName.substr(0,2) == "T:" ) { 44 | features.push_back( Feature(Feature::Type::TXT,varName,nSamples) ); 45 | } else { 46 | newassert( false ); 47 | } 48 | } 49 | 50 | newassert( nVars == 8 ); 51 | newassert( features.size() == 8 ); 52 | 53 | // We should have reached end of the first line 54 | newassert( reader.endOfLine() ); 55 | 56 | // Get the next line and start reading... 57 | reader.nextLine(); 58 | string field; 59 | reader >> field; newassert( field == "s0" ); 60 | reader >> field; newassert( field == "NA" ); 61 | reader >> field; newassert( field == "foo" ); 62 | reader >> field; newassert( field == "2.2" ); 63 | reader >> field; newassert( field == "3.3" ); 64 | reader >> field; newassert( field == "4.4" ); 65 | reader >> field; newassert( field == "5.5" ); 66 | reader >> field; newassert( field == "6.6" ); 67 | reader >> field; newassert( field == "Ah, be so good. Yes, no?" 
); 68 | 69 | // Make sure that we reached end of line again 70 | newassert( reader.endOfLine() ); 71 | 72 | // Go to the start of file and get first line 73 | reader.rewind(); 74 | reader.nextLine(); 75 | 76 | vector sampleNames(nSamples); 77 | 78 | // Go through lines 2,3,... 79 | size_t i; 80 | for ( i = 0; reader.nextLine(); ++i ) { 81 | //reader.nextLine(); 82 | // Sample name is the first field of the line 83 | reader >> sampleNames[i]; 84 | for ( size_t j = 0; j < nVars; ++j ) { 85 | if ( features[j].isNumerical() ) { 86 | num_t val; reader >> val; 87 | features[j].setNumSampleValue(i,val); 88 | } else if ( features[j].isCategorical() ) { 89 | string str; reader >> str; 90 | features[j].setCatSampleValue(i,str); 91 | } else if ( features[j].isTextual() ) { 92 | string str; reader >> str; 93 | features[j].setTxtSampleValue(i,str); 94 | } 95 | } 96 | // By now, we should have reached end of line 97 | newassert( reader.endOfLine() ); 98 | } 99 | 100 | newassert( i == nSamples ); 101 | 102 | // Did we recover the correct sample names from the file 103 | newassert( sampleNames[0] == "s0" ); 104 | newassert( sampleNames[1] == "s1" ); 105 | newassert( sampleNames[2] == "s2" ); 106 | 107 | // Rewind again to the start, and start reading from line 2 108 | reader.rewind(); 109 | reader.nextLine(); 110 | reader.nextLine(); 111 | 112 | // Variables for storing all data on line 2 113 | string s0; 114 | num_t v1,v3,v4,v5,v6,v7; 115 | cat_t v2,v8; 116 | 117 | // Read the 2nd line in one pass 118 | reader >> s0 >> v1 >> v2 >> v3 >> v4 >> v5 >> v6 >> v7 >> v8; 119 | 120 | // Again, end of line should have been reached 121 | newassert( reader.endOfLine() ); 122 | 123 | // Make sure the content of the 2nd line is as expected 124 | newassert( s0 == "s0" ); 125 | newassert( datadefs::isNAN(v1) ); 126 | newassert( v2 == "foo" ); 127 | newassert( fabs( v3 - 2.2 ) < 1e-5 ); 128 | newassert( fabs( v4 - 3.3 ) < 1e-5 ); 129 | newassert( fabs( v5 - 4.4 ) < 1e-5 ); 130 | newassert( fabs( 
v6 - 5.5 ) < 1e-5 ); 131 | newassert( fabs( v7 - 6.6 ) < 1e-5 ); 132 | newassert( v8 == "Ah, be so good. Yes, no?" ); 133 | 134 | // Go back to the beginning 135 | reader.rewind(); 136 | 137 | // While reading the whole file line by line till the end, we should 138 | // not reach end of line nor end of file, since we have the last line 139 | // stored in the linefeed... 140 | for ( size_t i = 0; i < reader.nLines(); ++i ) { 141 | reader.nextLine(); 142 | newassert( ! reader.endOfLine() ); 143 | //newassert( ! reader.endOfFile() ); 144 | } 145 | 146 | // ... that means that we can then read the last line, field by field, 147 | // into string variables 148 | for ( size_t i = 0; i < nVars + 1; ++i ) { 149 | newassert( ! reader.endOfLine() ); 150 | string field; reader >> field; 151 | } 152 | 153 | // After we are done reading the last line, we should have reached end of line 154 | // and end of file, meaning that we can't extract the next line since there is no 155 | // next line 156 | newassert( reader.endOfLine() ); 157 | newassert( ! 
reader.nextLine() ); 158 | 159 | } 160 | 161 | #endif 162 | -------------------------------------------------------------------------------- /test/rface_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RFACE_NEWTEST_HPP 2 | #define RFACE_NEWTEST_HPP 3 | 4 | #include 5 | #include 6 | #include "options.hpp" 7 | #include "densetreedata.hpp" 8 | #include "rf_ace.hpp" 9 | #include "newtest.hpp" 10 | 11 | using namespace std; 12 | using datadefs::num_t; 13 | 14 | void rface_newtest_RF_train_test_classification(); 15 | void rface_newtest_RF_train_test_regression(); 16 | void rface_newtest_QRF_train_test_regression(); 17 | void rface_newtest_GBT_train_test_classification(); 18 | void rface_newtest_GBT_train_test_regression(); 19 | void rface_newtest_RF_save_load_classification(); 20 | void rface_newtest_RF_save_load_regression(); 21 | void rface_newtest_QRF_save_load_regression(); 22 | void rface_newtest_GBT_save_load_classification(); 23 | void rface_newtest_GBT_save_load_regression(); 24 | 25 | void rface_newtest() { 26 | 27 | newtest( "RF for classification", &rface_newtest_RF_train_test_classification ); 28 | newtest( "RF for regression", &rface_newtest_RF_train_test_regression ); 29 | newtest( "QRF for regression", &rface_newtest_QRF_train_test_regression ); 30 | //newtest( "Testing GBT for classification", &rface_newtest_GBT_train_test_classification ); 31 | //newtest( "Testing GBT for regression", &rface_newtest_GBT_train_test_regression ); 32 | newtest( "save/load RF for classification", &rface_newtest_RF_save_load_classification ); 33 | newtest( "save/load RF for regression", &rface_newtest_RF_save_load_regression ); 34 | newtest( "save/load QRF for regression", &rface_newtest_QRF_save_load_regression ); 35 | //newtest( "Testing save/load GBT for classification", &rface_newtest_GBT_save_load_classification ); 36 | //newtest( "Testing save/load GBT for regression", &rface_newtest_GBT_save_load_regression ); 
37 | 38 | } 39 | 40 | RFACE::TestOutput make_predictions(ForestOptions& forestOptions, const string& targetStr) { 41 | 42 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 43 | DenseTreeData trainData(fileName,'\t',':',false); 44 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 45 | vector weights = trainData.getFeatureWeights(); 46 | weights[targetIdx] = 0; 47 | 48 | RFACE rface; 49 | 50 | rface.train(&trainData,targetIdx,weights,&forestOptions); 51 | 52 | return( rface.test(&trainData) ); 53 | 54 | } 55 | 56 | RFACE::QRFPredictionOutput make_quantile_predictions(ForestOptions& forestOptions, const string& targetStr) { 57 | 58 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 59 | DenseTreeData trainData(fileName,'\t',':',false); 60 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 61 | vector weights = trainData.getFeatureWeights(); 62 | weights[targetIdx] = 0; 63 | 64 | RFACE rface; 65 | 66 | rface.train(&trainData,targetIdx,weights,&forestOptions); 67 | 68 | return( rface.predictQRF(&trainData,forestOptions) ); 69 | 70 | } 71 | 72 | 73 | 74 | RFACE::TestOutput make_save_load_predictions(ForestOptions& forestOptions, const string& targetStr) { 75 | 76 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 77 | DenseTreeData trainData(fileName,'\t',':',false); 78 | size_t targetIdx = trainData.getFeatureIdx(targetStr); 79 | vector weights = trainData.getFeatureWeights(); 80 | weights[targetIdx] = 0; 81 | 82 | RFACE rface; 83 | 84 | rface.train(&trainData,targetIdx,weights,&forestOptions); 85 | 86 | rface.save("foo.sf"); 87 | 88 | RFACE rface2; 89 | 90 | rface2.load("foo.sf"); 91 | 92 | return( rface2.test(&trainData) ); 93 | 94 | } 95 | 96 | RFACE::QRFPredictionOutput make_save_load_quantile_predictions(ForestOptions& forestOptions, const string& targetStr) { 97 | 98 | string fileName = "test_103by300_mixed_nan_matrix.afm"; 99 | DenseTreeData trainData(fileName,'\t',':',false); 100 | size_t targetIdx = 
trainData.getFeatureIdx(targetStr); 101 | vector weights = trainData.getFeatureWeights(); 102 | weights[targetIdx] = 0; 103 | 104 | RFACE rface; 105 | 106 | rface.train(&trainData,targetIdx,weights,&forestOptions); 107 | 108 | rface.save("foo.sf"); 109 | 110 | RFACE rface2; 111 | 112 | rface2.load("foo.sf"); 113 | 114 | return( rface2.predictQRF(&trainData,forestOptions) ); 115 | 116 | } 117 | 118 | 119 | num_t classification_error(const RFACE::TestOutput& predictions) { 120 | 121 | num_t pError = 0.0; 122 | num_t n = static_cast(predictions.catPredictions.size()); 123 | for ( size_t i = 0; i < predictions.catPredictions.size(); ++i ) { 124 | pError += (predictions.catPredictions[i] != predictions.catTrueData[i]) / n; 125 | } 126 | 127 | return(pError); 128 | 129 | } 130 | 131 | num_t regression_error(const RFACE::TestOutput& predictions) { 132 | 133 | num_t RMSE = 0.0; 134 | num_t n = static_cast(predictions.numPredictions.size()); 135 | for ( size_t i = 0; i < predictions.numPredictions.size(); ++i ) { 136 | num_t e = predictions.numPredictions[i] - predictions.numTrueData[i]; 137 | RMSE += powf(e,2)/n; 138 | } 139 | 140 | return( sqrt(RMSE) ); 141 | 142 | } 143 | 144 | vector quantile_regression_error(const RFACE::QRFPredictionOutput& qPredOut) { 145 | 146 | vector QDEV(qPredOut.quantiles.size(),0.0); 147 | num_t n = static_cast(qPredOut.numPredictions.size()); 148 | 149 | for ( size_t q = 0; q < qPredOut.quantiles.size(); ++q ) { 150 | for ( size_t i = 0; i < qPredOut.numPredictions.size(); ++i ) { 151 | bool b = qPredOut.trueNumData[i] < qPredOut.numPredictions[i][q]; 152 | QDEV[q] += b/n; 153 | } 154 | QDEV[q] = fabs(QDEV[q] - qPredOut.quantiles[q]); 155 | } 156 | 157 | return(QDEV); 158 | 159 | } 160 | 161 | void rface_newtest_RF_train_test_classification() { 162 | 163 | ForestOptions forestOptions(forest_t::QRF); 164 | forestOptions.mTry = 30; 165 | 166 | num_t pError = classification_error( make_predictions(forestOptions,"C:class") ); 167 | 168 | 
newassert(pError < 0.2); 169 | 170 | } 171 | 172 | void rface_newtest_RF_train_test_regression() { 173 | 174 | ForestOptions forestOptions(forest_t::QRF); 175 | forestOptions.mTry = 30; 176 | 177 | num_t RMSE = regression_error( make_predictions(forestOptions,"N:output") ); 178 | 179 | newassert(RMSE < 1.0); 180 | 181 | } 182 | 183 | void rface_newtest_QRF_train_test_regression() { 184 | 185 | ForestOptions forestOptions(forest_t::QRF); 186 | forestOptions.mTry = 30; 187 | forestOptions.quantiles = {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9}; 188 | forestOptions.nSamplesForQuantiles = 100; 189 | 190 | vector QDEV = quantile_regression_error( make_quantile_predictions(forestOptions,"N:output") ); 191 | 192 | //utils::write(cout,QDEV.begin(),QDEV.end()); 193 | 194 | newassert( math::mean(QDEV) < 0.20 ); 195 | 196 | } 197 | 198 | void rface_newtest_GBT_train_test_classification() { 199 | 200 | ForestOptions forestOptions(forest_t::GBT); 201 | 202 | num_t pError = classification_error( make_predictions(forestOptions,"C:class") ); 203 | 204 | newassert( pError < 0.2 ); 205 | 206 | } 207 | 208 | void rface_newtest_GBT_train_test_regression() { 209 | 210 | ForestOptions forestOptions(forest_t::GBT); 211 | 212 | num_t RMSE = regression_error( make_predictions(forestOptions,"N:output") ); 213 | 214 | newassert(RMSE < 1.0); 215 | 216 | } 217 | 218 | void rface_newtest_RF_save_load_classification() { 219 | 220 | ForestOptions forestOptions(forest_t::QRF); 221 | forestOptions.mTry = 30; 222 | 223 | num_t pError1 = classification_error( make_predictions(forestOptions,"C:class") ); 224 | num_t pError2 = classification_error( make_save_load_predictions(forestOptions,"C:class") ); 225 | 226 | newassert( fabs(pError1 - pError2) < 1e-1 ); 227 | 228 | } 229 | 230 | void rface_newtest_RF_save_load_regression() { 231 | 232 | ForestOptions forestOptions(forest_t::QRF); 233 | forestOptions.mTry = 30; 234 | 235 | num_t RMSE1 = classification_error( make_predictions(forestOptions,"N:output") ); 
236 | num_t RMSE2 = classification_error( make_save_load_predictions(forestOptions,"N:output") ); 237 | 238 | newassert( fabs(RMSE1 - RMSE2) < 1e-1 ); 239 | 240 | } 241 | 242 | void rface_newtest_QRF_save_load_regression() { 243 | 244 | ForestOptions forestOptions(forest_t::QRF); 245 | forestOptions.mTry = 30; 246 | forestOptions.quantiles = {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9}; 247 | forestOptions.nSamplesForQuantiles = 100; 248 | 249 | vector QRMSE1 = quantile_regression_error( make_quantile_predictions(forestOptions,"N:output") ); 250 | vector QRMSE2 = quantile_regression_error( make_save_load_quantile_predictions(forestOptions,"N:output") ); 251 | 252 | newassert( fabs(QRMSE1[2] - QRMSE2[2]) < 1e-1 ); 253 | 254 | 255 | } 256 | 257 | void rface_newtest_GBT_save_load_classification() { 258 | 259 | ForestOptions forestOptions(forest_t::GBT); 260 | 261 | num_t pError1 = classification_error( make_predictions(forestOptions,"C:class") ); 262 | num_t pError2 = classification_error( make_save_load_predictions(forestOptions,"C:class") ); 263 | 264 | newassert( fabs(pError1 - pError2) < 1e-1 ); 265 | 266 | } 267 | 268 | void rface_newtest_GBT_save_load_regression() { 269 | 270 | ForestOptions forestOptions(forest_t::GBT); 271 | 272 | num_t RMSE1 = classification_error( make_predictions(forestOptions,"N:output") ); 273 | num_t RMSE2 = classification_error( make_save_load_predictions(forestOptions,"N:output") ); 274 | 275 | newassert( fabs(RMSE1 - RMSE2) < 1e-1 ); 276 | 277 | } 278 | 279 | #endif 280 | -------------------------------------------------------------------------------- /test/rootnode_newtest.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ROOTNODE_NEWTEST_HPP 2 | #define ROOTNODE_NEWTEST_HPP 3 | 4 | #include 5 | #include 6 | #incluce 7 | 8 | #include "newtest.hpp" 9 | #include "rootnode.hpp" 10 | #include "datadefs.hpp" 11 | 12 | using namespace std; 13 | 14 | void rootnode_newtest_getChildLeafTrainData(); 15 
| 16 | void rootnode_newtest() { 17 | 18 | newtest( "Testing extraction of train samples from child leaf nodes", &rootnode_newtest_getChildLeafTrainData ); 19 | 20 | } 21 | 22 | void rootnode_newtest_getChildLeafTrainData() { 23 | 24 | Rootnode rootNode; 25 | Node nodeL,nodeR,nodeM; 26 | 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /test/run_newtests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "newtest.hpp" 5 | #include "reader_newtest.hpp" 6 | #include "treedata_newtest.hpp" 7 | #include "rface_newtest.hpp" 8 | #include "distributions_newtest.hpp" 9 | #include "utils_newtest.hpp" 10 | #include "datadefs_newtest.hpp" 11 | #include "node_newtest.hpp" 12 | #include "math_newtest.hpp" 13 | 14 | using namespace std; 15 | 16 | int main() { 17 | 18 | newtestinit(); 19 | 20 | cout << endl << "Testing Reader class:" << endl; 21 | reader_newtest(); 22 | 23 | cout << endl << "Testing Treedata class:" << endl; 24 | treedata_newtest(); 25 | 26 | cout << endl << "Testing RFACE class:" << endl; 27 | rface_newtest(); 28 | 29 | cout << endl << "Testing Distributions namespace:" << endl; 30 | distributions_newtest(); 31 | 32 | cout << endl << "Testing Utils namespace:" << endl; 33 | utils_newtest(); 34 | 35 | cout << endl << "Testing Datadefs namespace:" << endl; 36 | datadefs_newtest(); 37 | 38 | cout << endl << "Testing Node class:" << endl; 39 | node_newtest(); 40 | 41 | cout << endl << "Testing math namespace:" << endl; 42 | math_newtest(); 43 | 44 | newtestdone(); 45 | 46 | return( EXIT_SUCCESS ); 47 | 48 | } 49 | -------------------------------------------------------------------------------- /test_2by10_text_matrix.afm: -------------------------------------------------------------------------------- 1 | s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s20 2 | C:out 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 | 
T:in a.b a.b a.b a.b a.b a.b.c a.b.c a.b.c a.b.c a.b.c a.b.c a.b.c c c b.c a.b b a b a 4 | N:no1 8 7 6 8 8 5 78 5 4 7 5 4 4 8 9 6 4 4 7 8 5 | N:no2 9 8 6 4 5 4 6 8 9 7 5 4 3 7 9 0 7 5 4 7 6 | -------------------------------------------------------------------------------- /test_2by8_numerical_matrix.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 S5 S6 S7 S8 2 | N:F1 3 2 nan nan 5 4 2.9 3.1 3 | N:F2 1 3 2 nan 4 5 3.6 2.8 4 | -------------------------------------------------------------------------------- /test_3by10_categorical_matrix.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | C:F 0 1 0 1 4 5 6 7 8 9 3 | C:T1 1 1 1 1 1 0 0 0 0 0 4 | C:T2 1 2 1 2 0 0 0 0 0 0 -------------------------------------------------------------------------------- /test_6by10_mixed_matrix.tsv: -------------------------------------------------------------------------------- 1 | foo S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 2 | N:F1 nA 8.5 3.4 7.2 5 6 7 11 9 NA 3 | N:F2 2 3 4 5 6 NA NA 9 nan 10 4 | C:F3 NA nA naN NaN 1 1 1 2 2 2 5 | N:F4 10 9.9 8 7 6 5 4 3 2.4 1 6 | C:F5 3 3 3 4 4 5 3 2 2 2 7 | N:F6 9 8 7 9 8 7 3 2 1.0 99.23 -------------------------------------------------------------------------------- /test_fullSplitterSweep.txt: -------------------------------------------------------------------------------- 1 | N:output C:class 1.121686e+00 86 192 2 | N:output N:input 1.251324e+00 113 162 3 | N:output N:noise_1 9.328740e-02 21 253 4 | N:output N:noise_2 1.078159e-01 148 128 5 | N:output N:noise_3 9.386892e-02 246 29 6 | N:output N:noise_4 6.245053e-02 63 211 7 | N:output N:noise_5 4.026299e-02 13 260 8 | N:output C:noise_6 2.118358e-03 78 191 9 | N:output N:noise_7 1.003136e-01 43 236 10 | N:output N:noise_8 4.813590e-02 48 224 11 | N:output N:noise_9 1.011168e-01 13 269 12 | N:output N:noise_10 4.593473e-02 12 264 13 | N:output N:noise_11 4.945915e-02 24 247 14 | 
N:output N:noise_12 7.033682e-02 23 243 15 | N:output N:noise_13 6.036680e-02 26 246 16 | N:output N:noise_14 1.019840e-01 3 272 17 | N:output N:noise_15 5.772823e-02 146 128 18 | N:output N:noise_16 9.041976e-02 261 11 19 | N:output N:noise_17 6.628259e-02 204 69 20 | N:output C:noise_18 1.611157e-01 103 168 21 | N:output N:noise_19 4.208906e-02 126 147 22 | N:output N:noise_20 5.109677e-02 13 261 23 | N:output N:noise_21 7.293652e-02 88 178 24 | N:output N:noise_22 1.346184e-01 15 259 25 | N:output C:noise_23 9.415048e-03 76 196 26 | N:output N:noise_24 4.618499e-02 10 258 27 | N:output N:noise_25 1.230457e-01 192 78 28 | N:output N:noise_26 1.463906e-01 5 270 29 | N:output N:noise_27 4.840161e-02 13 256 30 | N:output N:noise_28 8.308741e-02 247 28 31 | N:output N:noise_29 7.497228e-02 256 16 32 | N:output N:noise_30 1.471867e-01 211 62 33 | N:output N:noise_31 6.474735e-02 265 10 34 | N:output N:noise_32 8.340131e-02 21 250 35 | N:output N:noise_33 4.267850e-02 259 19 36 | N:output N:noise_34 1.228116e-01 45 225 37 | N:output N:noise_35 6.714979e-02 15 260 38 | N:output N:noise_36 3.932013e-02 28 243 39 | N:output N:noise_37 9.741292e-02 4 263 40 | N:output N:noise_38 5.143446e-02 4 267 41 | N:output N:noise_39 6.923821e-02 260 9 42 | N:output N:noise_40 5.260796e-02 85 190 43 | N:output N:noise_41 9.401821e-02 38 233 44 | N:output N:noise_42 1.959699e-01 5 269 45 | N:output N:noise_43 3.535644e-02 152 120 46 | N:output N:noise_44 5.551332e-02 4 270 47 | N:output N:noise_45 6.478165e-02 49 226 48 | N:output N:noise_46 6.094157e-02 242 32 49 | N:output N:noise_47 1.149479e-01 104 166 50 | N:output N:noise_48 8.086051e-02 32 236 51 | N:output N:noise_49 7.817317e-02 26 246 52 | N:output N:noise_50 8.304132e-02 87 182 53 | N:output N:noise_51 4.322512e-02 9 256 54 | N:output N:noise_52 4.241347e-02 230 42 55 | N:output N:noise_53 1.284148e-01 67 199 56 | N:output N:noise_54 5.553333e-02 20 254 57 | N:output N:noise_55 1.378761e-01 59 215 58 | N:output N:noise_56 
8.478826e-02 113 165 59 | N:output N:noise_57 5.143181e-02 213 58 60 | N:output N:noise_58 1.677901e-01 78 200 61 | N:output N:noise_59 4.516454e-02 86 178 62 | N:output N:noise_60 4.662042e-02 120 155 63 | N:output N:noise_61 4.883542e-02 255 20 64 | N:output C:noise_62 1.811719e-02 81 188 65 | N:output N:noise_63 4.851824e-02 238 38 66 | N:output N:noise_64 5.891705e-02 170 104 67 | N:output N:noise_65 9.432740e-02 265 10 68 | N:output C:noise_66 2.228690e-02 84 185 69 | N:output N:noise_67 9.516234e-02 204 65 70 | N:output N:noise_68 9.043474e-02 240 35 71 | N:output C:noise_69 3.016761e-02 94 184 72 | N:output N:noise_70 1.484555e-01 260 10 73 | N:output N:noise_71 4.317891e-02 271 3 74 | N:output N:noise_72 8.184142e-02 122 148 75 | N:output N:noise_73 4.699886e-02 60 209 76 | N:output C:noise_74 5.743994e-02 104 164 77 | N:output C:noise_75 1.078566e-02 80 187 78 | N:output N:noise_76 5.749446e-02 268 3 79 | N:output N:noise_77 8.603062e-02 122 148 80 | N:output N:noise_78 1.196307e-01 120 156 81 | N:output N:noise_79 5.445619e-02 256 19 82 | N:output N:noise_80 7.984189e-02 186 88 83 | N:output N:noise_81 7.058734e-02 19 251 84 | N:output N:noise_82 4.363979e-02 262 7 85 | N:output N:noise_83 7.936767e-02 142 135 86 | N:output N:noise_84 7.261903e-02 9 267 87 | N:output N:noise_85 8.337065e-02 267 8 88 | N:output C:noise_86 3.541747e-02 88 186 89 | N:output N:noise_87 1.050412e-01 37 235 90 | N:output N:noise_88 2.744471e-02 267 10 91 | N:output N:noise_89 6.768391e-02 27 247 92 | N:output N:noise_90 1.147344e-01 241 22 93 | N:output N:noise_91 1.392028e-01 186 90 94 | N:output N:noise_92 5.769372e-02 238 35 95 | N:output N:noise_93 1.057857e-01 141 134 96 | N:output N:noise_94 1.015035e-01 5 265 97 | N:output C:noise_95 6.726926e-03 76 201 98 | N:output N:noise_96 4.923999e-02 268 5 99 | N:output N:noise_97 8.986233e-02 209 66 100 | N:output N:noise_98 9.489314e-02 5 269 101 | N:output N:noise_99 1.116795e-01 273 4 102 | N:output N:noise_100 3.103523e-02 
262 8 103 | -------------------------------------------------------------------------------- /test_fullSplitterSweep_class.txt: -------------------------------------------------------------------------------- 1 | C:class N:output 8.277247e-02 132 146 2 | C:class N:input 1.080478e-02 273 4 3 | C:class N:noise_1 1.186730e-02 145 131 4 | C:class N:noise_2 9.047567e-03 149 131 5 | C:class N:noise_3 5.632269e-03 37 240 6 | C:class N:noise_4 2.012800e-02 101 176 7 | C:class N:noise_5 4.715754e-03 144 132 8 | C:class C:noise_6 1.915083e-03 93 178 9 | C:class N:noise_7 8.512985e-03 263 19 10 | C:class N:noise_8 4.338452e-03 17 260 11 | C:class N:noise_9 7.568886e-03 157 128 12 | C:class N:noise_10 5.742623e-03 212 67 13 | C:class N:noise_11 8.089438e-03 267 7 14 | C:class N:noise_12 6.164748e-03 162 107 15 | C:class N:noise_13 5.167658e-03 144 132 16 | C:class N:noise_14 7.158856e-03 265 13 17 | C:class N:noise_15 8.629035e-03 31 245 18 | C:class N:noise_16 1.294779e-02 270 6 19 | C:class N:noise_17 3.829856e-03 43 233 20 | C:class C:noise_18 1.939052e-02 89 183 21 | C:class N:noise_19 7.300695e-03 13 262 22 | C:class N:noise_20 1.300216e-02 88 188 23 | C:class N:noise_21 7.683229e-03 227 43 24 | C:class N:noise_22 1.820842e-02 8 268 25 | C:class C:noise_23 4.497780e-03 77 195 26 | C:class N:noise_24 1.553144e-02 31 240 27 | C:class N:noise_25 7.985333e-03 28 244 28 | C:class N:noise_26 1.021783e-02 213 64 29 | C:class N:noise_27 5.208401e-03 32 239 30 | C:class N:noise_28 8.072889e-03 250 28 31 | C:class N:noise_29 7.635241e-03 10 265 32 | C:class N:noise_30 1.439265e-02 225 50 33 | C:class N:noise_31 9.134501e-03 269 10 34 | C:class N:noise_32 1.537890e-02 185 89 35 | C:class N:noise_33 7.579281e-03 3 278 36 | C:class N:noise_34 1.615210e-02 41 231 37 | C:class N:noise_35 9.442723e-03 52 227 38 | C:class N:noise_36 9.529369e-03 205 70 39 | C:class N:noise_37 8.874968e-03 182 90 40 | C:class N:noise_38 6.306706e-03 83 191 41 | C:class N:noise_39 6.690488e-03 13 257 42 | 
C:class N:noise_40 6.732478e-03 156 123 43 | C:class N:noise_41 6.800112e-03 258 16 44 | C:class N:noise_42 1.330721e-02 5 271 45 | C:class N:noise_43 6.459754e-03 251 22 46 | C:class N:noise_44 1.463259e-02 86 190 47 | C:class N:noise_45 1.074883e-02 57 221 48 | C:class N:noise_46 4.948310e-03 249 27 49 | C:class N:noise_47 1.185683e-02 74 201 50 | C:class N:noise_48 5.228776e-03 203 68 51 | C:class N:noise_49 8.015075e-03 270 3 52 | C:class N:noise_50 3.827547e-03 245 26 53 | C:class N:noise_51 5.623361e-03 8 260 54 | C:class N:noise_52 1.132672e-02 264 11 55 | C:class N:noise_53 1.474747e-02 198 72 56 | C:class N:noise_54 1.464262e-02 184 93 57 | C:class N:noise_55 1.061787e-02 57 218 58 | C:class N:noise_56 9.102942e-03 275 7 59 | C:class N:noise_57 8.660064e-03 4 270 60 | C:class N:noise_58 1.468407e-02 7 273 61 | C:class N:noise_59 4.682519e-03 254 14 62 | C:class N:noise_60 8.742796e-03 87 189 63 | C:class N:noise_61 6.415251e-03 276 3 64 | C:class C:noise_62 3.877685e-03 91 182 65 | C:class N:noise_63 9.286570e-03 252 26 66 | C:class N:noise_64 7.245304e-03 6 272 67 | C:class N:noise_65 8.578843e-03 274 4 68 | C:class C:noise_66 3.314331e-03 100 172 69 | C:class N:noise_67 8.746950e-03 268 4 70 | C:class N:noise_68 8.593788e-03 63 215 71 | C:class C:noise_69 4.556471e-03 83 198 72 | C:class N:noise_70 1.385411e-02 269 5 73 | C:class N:noise_71 7.371832e-03 45 234 74 | C:class N:noise_72 5.581014e-03 266 7 75 | C:class N:noise_73 5.526464e-03 222 50 76 | C:class C:noise_74 1.235055e-03 86 186 77 | C:class C:noise_75 2.357321e-03 81 190 78 | C:class N:noise_76 5.492676e-03 231 42 79 | C:class N:noise_77 1.658936e-02 207 66 80 | C:class N:noise_78 9.358209e-03 38 239 81 | C:class N:noise_79 5.969206e-03 185 95 82 | C:class N:noise_80 1.487118e-02 17 260 83 | C:class N:noise_81 5.329469e-03 268 5 84 | C:class N:noise_82 7.679314e-03 246 26 85 | C:class N:noise_83 9.127313e-03 7 274 86 | C:class N:noise_84 4.509829e-03 137 143 87 | C:class N:noise_85 
4.530825e-03 110 169 88 | C:class C:noise_86 1.859930e-03 78 199 89 | C:class N:noise_87 1.187371e-02 30 244 90 | C:class N:noise_88 5.670918e-03 200 80 91 | C:class N:noise_89 7.086722e-03 21 256 92 | C:class N:noise_90 1.036187e-02 207 59 93 | C:class N:noise_91 1.046930e-02 4 274 94 | C:class N:noise_92 1.057821e-02 239 38 95 | C:class N:noise_93 7.872152e-03 45 233 96 | C:class N:noise_94 8.085322e-03 23 250 97 | C:class C:noise_95 1.761905e-03 100 180 98 | C:class N:noise_96 9.914971e-03 144 131 99 | C:class N:noise_97 1.001201e-02 17 261 100 | C:class N:noise_98 7.998670e-03 274 3 101 | C:class N:noise_99 6.701586e-03 221 60 102 | C:class N:noise_100 1.181143e-02 191 82 103 | -------------------------------------------------------------------------------- /test_predictor.sf: -------------------------------------------------------------------------------- 1 | FOREST=RF,NTREES=1,TARGET="N:T",CATEGORIES=,SHRINKAGE=1 2 | TREE=0,NNODES=13 3 | NODE=*,PRED=5.0,SPLITTER="N:f0",SPLITTERTYPE=NUMERICAL,LFRACTION=1.0,LVALUES="1.1",RVALUES="1.1",M=M 4 | NODE=*L,PRED=4.0,SPLITTER="C:f1",SPLITTERTYPE=CATEGORICAL,LFRACTION=1.0,LVALUES="0:2",RVALUES="1" 5 | NODE=*LL,PRED=3.9 6 | NODE=*LR,PRED=4.2,SPLITTER="N:f3",SPLITTERTYPE=NUMERICAL,LFRACTION=1.0,LVALUES="-1.5",RVALUES="-1.5" 7 | NODE=*LRL,PRED=3.99 8 | NODE=*LRR,PRED=4.3 9 | NODE=*R,PRED=6.0,SPLITTER="N:f2",SPLITTERTYPE=NUMERICAL,LFRACTION=1.0,LVALUES="3.0",RVALUES="3.0" 10 | NODE=*RL,PRED=5.1 11 | NODE=*RR,PRED=6.6,SPLITTER="C:f4",SPLITTERTYPE=CATEGORICAL,LFRACTION=1.0,LVALUES="2:3",RVALUES="0:1",M=M 12 | NODE=*RRL,PRED=6.5 13 | NODE=*RRR,PRED=7.1 14 | NODE=*RRM,PRED=9.0 15 | NODE=*M,PRED=8.0 -------------------------------------------------------------------------------- /test_rfacer.R: -------------------------------------------------------------------------------- 1 | library(Rcpp) 2 | library(rfacer) 3 | 4 | trainData <- read.afm("test_103by300_mixed_nan_matrix.afm") 5 | 6 | predictorObj <- 
rface.train(trainData,"C:class",mTry = 30, nTrees = 1000) 7 | predictions <- rface.predict(predictorObj,trainData) 8 | -------------------------------------------------------------------------------- /testdata.tsv: -------------------------------------------------------------------------------- 1 | S1 S2 S3 S4 2 | N:f0 0.0 0.5 NA 6.0 3 | C:f1 0 1 0 2 4 | N:f2 4.0 3.0 2.5 3.1 5 | N:f3 -1.6 2.0 0.5 1.5 6 | C:f4 0 1 2 NA 7 | N:T 0.0 1.0 2.0 3.0 -------------------------------------------------------------------------------- /tmp/feature.cpp: -------------------------------------------------------------------------------- 1 | #include "feature.hpp" 2 | -------------------------------------------------------------------------------- /tmp/feature.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FEATURE_HPP 2 | #define FEATURE_HPP 3 | 4 | class Feature { 5 | 6 | public: 7 | 8 | Feature(); 9 | ~Feature(); 10 | 11 | bool isNumerical() const; 12 | bool isCategorical() const; 13 | bool isTextual() const; 14 | 15 | protected: 16 | 17 | enum Type { NUM, CAT, TXT, UNKNOWN }; 18 | 19 | virtual initialize() = 0; 20 | 21 | private: 22 | 23 | Type type_; 24 | 25 | }; 26 | 27 | class NumFeature : public Feature { 28 | 29 | public: 30 | 31 | NumFeature(); 32 | ~NumFeature(); 33 | 34 | protected: 35 | 36 | virtual initialize(); 37 | 38 | private: 39 | 40 | 41 | 42 | }; 43 | 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /tmp/treesizes.tsv: -------------------------------------------------------------------------------- 1 | 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 
149 149 149 149 149 149 149 149 149 2 | 71 69 73 75 71 71 71 77 93 61 69 89 71 65 69 73 73 83 73 73 57 79 63 71 69 71 83 67 65 83 69 73 73 67 77 73 77 65 63 59 73 73 65 69 63 69 69 73 63 75 59 65 69 73 67 65 73 77 69 63 69 69 81 75 79 83 73 73 81 71 75 67 63 69 77 83 75 85 57 65 71 79 71 77 71 81 71 79 79 71 71 75 69 69 71 69 73 79 71 79 3 | 41 45 39 31 41 39 37 43 37 45 37 43 41 41 27 41 37 37 45 39 37 39 31 37 35 47 39 51 29 43 41 39 41 41 39 39 41 37 35 33 37 33 43 41 39 39 41 39 43 41 43 51 45 33 43 43 35 37 43 35 43 31 35 53 35 47 43 43 37 37 39 39 43 43 43 39 37 45 43 49 35 41 39 41 43 39 39 35 43 51 43 41 45 39 43 49 43 39 49 41 4 | 31 21 25 21 33 23 27 27 33 27 27 31 31 25 25 29 27 31 31 23 29 25 25 31 29 33 27 27 31 25 25 27 31 25 31 27 29 25 27 27 33 21 31 27 27 25 33 25 27 27 35 27 31 27 29 29 29 27 35 31 27 29 31 27 31 25 29 31 23 29 31 25 27 25 27 23 25 37 29 27 27 31 27 27 29 27 25 21 29 31 33 29 29 25 29 29 25 27 31 25 5 | 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 149 6 | 130 132 132 128 132 128 125 132 132 128 126 132 132 131 127 129 132 129 128 132 131 127 129 129 129 122 131 127 132 129 128 131 130 131 132 126 129 118 131 129 132 126 132 127 127 131 130 130 131 132 124 132 131 130 132 126 132 129 124 131 131 129 132 131 127 127 131 132 131 132 132 132 131 132 130 131 129 126 132 132 127 131 132 132 131 127 132 129 132 128 131 132 122 127 132 131 131 132 131 131 7 | 106 109 106 113 112 108 109 114 108 110 105 107 113 104 111 109 109 100 109 114 112 112 114 108 108 113 110 114 107 112 100 111 110 108 113 113 105 104 111 109 110 108 105 111 110 113 114 112 106 112 112 109 110 105 111 108 112 113 112 110 
113 114 110 108 109 110 109 109 111 114 108 106 104 113 105 112 108 111 113 111 111 107 108 107 108 105 110 114 111 104 106 106 113 108 113 104 108 105 113 112 8 | 105 101 104 102 106 102 102 101 101 104 97 99 104 104 107 109 98 97 102 100 106 94 106 103 102 100 107 100 106 102 106 104 107 105 98 105 98 105 103 106 100 104 103 101 102 107 105 105 106 108 107 103 107 102 99 98 109 104 109 99 96 106 100 96 104 97 102 98 103 98 100 106 106 103 103 103 99 100 99 100 103 105 96 106 108 104 106 99 105 103 102 97 101 100 96 107 107 108 103 104 9 | --------------------------------------------------------------------------------