├── .gitattributes ├── .gitignore ├── AdditiveGroves ├── AdditiveGroves.sln ├── Grove.cpp ├── Grove.h ├── Makefile ├── TrainInfo.h ├── ag_definitions.h ├── ag_expand.cpp ├── ag_fs.cpp ├── ag_functions.cpp ├── ag_functions.h ├── ag_interactions.cpp ├── ag_layered.cpp ├── ag_layered.h ├── ag_layeredjob.cpp ├── ag_layeredjob.h ├── ag_merge.cpp ├── ag_mergepreds.cpp ├── ag_nway.cpp ├── ag_predict.cpp ├── ag_save.cpp ├── ag_savemerge.cpp └── ag_train.cpp ├── BaggedTrees ├── BaggedTrees.sln ├── Makefile ├── TrainInfo.h ├── Tree.cpp ├── Tree.h ├── bt_definitions.h ├── bt_functions.cpp ├── bt_functions.h ├── bt_predict.cpp ├── bt_train.cpp └── gbt_train.cpp ├── OctavePlots ├── fix_title.m ├── make_effect_plot.m ├── make_interaction_plot.m ├── plot_effects.m ├── plot_interactions.m └── rotateticklabel.m ├── README.md ├── ThreadPool ├── License.txt ├── thread_pool.cpp └── thread_pool.h ├── Visualization ├── Makefile ├── Visualization.sln ├── vis_correlations.cpp ├── vis_definitions.h ├── vis_effect.cpp └── vis_iplot.cpp ├── docs ├── AG_interactions.htm ├── AG_manual.htm ├── AG_model.htm ├── AG_quickstart.htm ├── AdditiveGroves.ppt ├── BT_manual.htm ├── CMU_2010.ppt ├── Data_Format.htm ├── Interactions.ppt ├── TreeExtra.css ├── TreeExtra.htm ├── background.gif ├── cookie.html ├── data.attr ├── data.test ├── data.train ├── data.valid ├── index.html ├── license.txt ├── papers.htm ├── papers │ ├── BirdMining.pdf │ ├── ChenDubrawskiSorokina.doc │ ├── Interactions.pdf │ ├── PlagiarismDetection_full.pdf │ ├── ScalableGBFS.pdf │ ├── Similarity64.zip │ ├── Wildlife.pdf │ ├── a9ext_sigir16.pdf │ ├── brain.pdf │ ├── chapter-featureeval.pdf │ ├── fslr.pdf │ ├── groves.pdf │ ├── kddcup09.pdf │ ├── ranking_AG.pdf │ ├── rmbo_full.pdf │ └── thesis.pdf ├── ranking_AG.pdf ├── styles.css └── visualize.htm ├── license.txt ├── make.sh ├── shared ├── ErrLogStream.h ├── INDdata.cpp ├── INDdata.h ├── INDsample.cpp ├── INDsample.h ├── ItemInfo.h ├── LogStream.cpp ├── LogStream.h ├── SplitInfo.cpp 
├── SplitInfo.h ├── TreeNode.cpp ├── TreeNode.h ├── definitions.h ├── functions.cpp ├── functions.h └── gtest-internal.h └── win ├── ag_addbag └── ag_addbag.vcxproj ├── ag_expand └── ag_expand.vcxproj ├── ag_fs └── ag_fs.vcxproj ├── ag_interactions └── ag_interactions.vcxproj ├── ag_merge └── ag_merge.vcxproj ├── ag_mergepreds └── ag_mergepreds.vcxproj ├── ag_nway └── ag_nway.vcxproj ├── ag_predict └── ag_predict.vcxproj ├── ag_save └── ag_save.vcxproj ├── ag_train └── ag_train.vcxproj ├── bt_predict └── bt_predict.vcxproj ├── bt_train ├── bt_train.vcxproj └── log.txt ├── gbt_train ├── gbt_train.vcxproj └── gbt_train.vcxproj.filters ├── vis_correlations ├── vis_correlations.vcxproj └── vis_correlations.vcxproj.filters ├── vis_effect └── vis_effect.vcxproj └── vis_iplot └── vis_iplot.vcxproj /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is needed for earlier builds of msysgit that do not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by the command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). 
Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## EasyClangComplete 3 | ################# 4 | 5 | # Include paths needed for finding custom libraries 6 | .clang_complete 7 | 8 | 9 | ################# 10 | ## Eclipse 11 | ################# 12 | 13 | *.pydevproject 14 | .project 15 | .metadata 16 | bin/ 17 | tmp/ 18 | *.tmp 19 | *.bak 20 | *.swp 21 | *~.nib 22 | local.properties 23 | .classpath 24 | .settings/ 25 | .loadpath 26 | 27 | # External tool builders 28 | .externalToolBuilders/ 29 | 30 | # Locally stored "Eclipse launch configurations" 31 | *.launch 32 | 33 | # CDT-specific 34 | .cproject 35 | 36 | # PDT-specific 37 | .buildpath 38 | 39 | 40 | ################# 41 | ## Visual Studio 42 | ################# 43 | 44 | ## Ignore Visual Studio temporary files, build results, and 45 | ## files generated by popular Visual Studio add-ons. 
46 | 47 | *VC.db* 48 | *VC.opendb* 49 | 50 | # User-specific files 51 | *.suo 52 | *.user 53 | *.sln.docstates 54 | 55 | # Build results 56 | 57 | [Dd]ebug/ 58 | [Rr]elease/ 59 | x64/ 60 | build/ 61 | [Bb]in/ 62 | [Oo]bj/ 63 | 64 | # MSTest test Results 65 | [Tt]est[Rr]esult*/ 66 | [Bb]uild[Ll]og.* 67 | 68 | *_i.c 69 | *_p.c 70 | *.ilk 71 | *.meta 72 | *.obj 73 | *.pch 74 | *.pdb 75 | *.pgc 76 | *.pgd 77 | *.rsp 78 | *.sbr 79 | *.tlb 80 | *.tli 81 | *.tlh 82 | *.tmp 83 | *.tmp_proj 84 | *.log 85 | *.vspscc 86 | *.vssscc 87 | .builds 88 | *.pidb 89 | *.log 90 | *.scc 91 | 92 | # Visual C++ cache files 93 | ipch/ 94 | *.aps 95 | *.ncb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | 100 | # Visual Studio profiler 101 | *.psess 102 | *.vsp 103 | *.vspx 104 | 105 | # Guidance Automation Toolkit 106 | *.gpState 107 | 108 | # ReSharper is a .NET coding add-in 109 | _ReSharper*/ 110 | *.[Rr]e[Ss]harper 111 | 112 | # TeamCity is a build add-in 113 | _TeamCity* 114 | 115 | # DotCover is a Code Coverage Tool 116 | *.dotCover 117 | 118 | # NCrunch 119 | *.ncrunch* 120 | .*crunch*.local.xml 121 | 122 | # Installshield output folder 123 | [Ee]xpress/ 124 | 125 | # DocProject is a documentation generator add-in 126 | DocProject/buildhelp/ 127 | DocProject/Help/*.HxT 128 | DocProject/Help/*.HxC 129 | DocProject/Help/*.hhc 130 | DocProject/Help/*.hhk 131 | DocProject/Help/*.hhp 132 | DocProject/Help/Html2 133 | DocProject/Help/html 134 | 135 | # Click-Once directory 136 | publish/ 137 | 138 | # Publish Web Output 139 | *.Publish.xml 140 | *.pubxml 141 | *.publishproj 142 | 143 | # NuGet Packages Directory 144 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 145 | #packages/ 146 | 147 | # Windows Azure Build Output 148 | csx 149 | *.build.csdef 150 | 151 | # Windows Store app package directory 152 | AppPackages/ 153 | 154 | # Others 155 | sql/ 156 | *.Cache 157 | ClientBin/ 158 | [Ss]tyle[Cc]op.* 159 | ~$* 160 | *~ 161 | *.dbmdl 162 | *.[Pp]ublish.xml 
163 | *.pfx 164 | *.publishsettings 165 | *.o 166 | Bin/ 167 | 168 | # RIA/Silverlight projects 169 | Generated_Code/ 170 | 171 | # Backup & report files from converting an old project file to a newer 172 | # Visual Studio version. Backup files are not needed, because we have git ;-) 173 | _UpgradeReport_Files/ 174 | Backup*/ 175 | UpgradeLog*.XML 176 | UpgradeLog*.htm 177 | 178 | # SQL Server files 179 | App_Data/*.mdf 180 | App_Data/*.ldf 181 | 182 | ############# 183 | ## Windows detritus 184 | ############# 185 | 186 | # Windows image file caches 187 | Thumbs.db 188 | ehthumbs.db 189 | 190 | # Folder config file 191 | Desktop.ini 192 | 193 | # Recycle Bin used on file shares 194 | $RECYCLE.BIN/ 195 | 196 | # Mac crap 197 | .DS_Store 198 | 199 | 200 | ############# 201 | ## Python 202 | ############# 203 | 204 | *.py[cod] 205 | 206 | # Packages 207 | *.egg 208 | *.egg-info 209 | dist/ 210 | build/ 211 | eggs/ 212 | parts/ 213 | var/ 214 | sdist/ 215 | develop-eggs/ 216 | .installed.cfg 217 | 218 | # Installer logs 219 | pip-log.txt 220 | 221 | # Unit test / coverage reports 222 | .coverage 223 | .tox 224 | 225 | #Translations 226 | *.mo 227 | 228 | #Mr Developer 229 | .mr.developer.cfg 230 | -------------------------------------------------------------------------------- /AdditiveGroves/AdditiveGroves.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.27130.2024 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_expand", "..\win\ag_expand\ag_expand.vcxproj", "{2F07B668-F45A-4BA9-8832-D1F44BCDDD8D}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_predict", "..\win\ag_predict\ag_predict.vcxproj", "{083EF393-CD75-4860-91A9-D0DACCF12334}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_save", 
"..\win\ag_save\ag_save.vcxproj", "{0F73F967-80AA-4B09-900D-B3F29E3343B2}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_merge", "..\win\ag_merge\ag_merge.vcxproj", "{19083E48-3D97-461C-A182-B19105BB31EF}" 13 | EndProject 14 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_fs", "..\win\ag_fs\ag_fs.vcxproj", "{A659907A-4258-4307-BC0A-D409C6D83562}" 15 | EndProject 16 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_train", "..\win\ag_train\ag_train.vcxproj", "{6BCCC77B-68DC-4932-A59C-2673C1AFA640}" 17 | EndProject 18 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_interactions", "..\win\ag_interactions\ag_interactions.vcxproj", "{2B18A9BC-6D33-4D98-8D34-C06A293B1DAC}" 19 | EndProject 20 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_nway", "..\win\ag_nway\ag_nway.vcxproj", "{44CF4107-68B2-4F00-BDB0-ECDBB8478367}" 21 | EndProject 22 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ag_mergepreds", "..\win\ag_mergepreds\ag_mergepreds.vcxproj", "{F3F00B74-59C8-4872-BA48-53FC9EF7C56C}" 23 | EndProject 24 | Global 25 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 26 | Debug|Win32 = Debug|Win32 27 | Release|Win32 = Release|Win32 28 | EndGlobalSection 29 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 30 | {2F07B668-F45A-4BA9-8832-D1F44BCDDD8D}.Debug|Win32.ActiveCfg = Debug|Win32 31 | {2F07B668-F45A-4BA9-8832-D1F44BCDDD8D}.Debug|Win32.Build.0 = Debug|Win32 32 | {2F07B668-F45A-4BA9-8832-D1F44BCDDD8D}.Release|Win32.ActiveCfg = Release|Win32 33 | {2F07B668-F45A-4BA9-8832-D1F44BCDDD8D}.Release|Win32.Build.0 = Release|Win32 34 | {083EF393-CD75-4860-91A9-D0DACCF12334}.Debug|Win32.ActiveCfg = Debug|Win32 35 | {083EF393-CD75-4860-91A9-D0DACCF12334}.Debug|Win32.Build.0 = Debug|Win32 36 | {083EF393-CD75-4860-91A9-D0DACCF12334}.Release|Win32.ActiveCfg = Release|Win32 37 | {083EF393-CD75-4860-91A9-D0DACCF12334}.Release|Win32.Build.0 = Release|Win32 38 | 
{0F73F967-80AA-4B09-900D-B3F29E3343B2}.Debug|Win32.ActiveCfg = Debug|Win32 39 | {0F73F967-80AA-4B09-900D-B3F29E3343B2}.Debug|Win32.Build.0 = Debug|Win32 40 | {0F73F967-80AA-4B09-900D-B3F29E3343B2}.Release|Win32.ActiveCfg = Release|Win32 41 | {0F73F967-80AA-4B09-900D-B3F29E3343B2}.Release|Win32.Build.0 = Release|Win32 42 | {19083E48-3D97-461C-A182-B19105BB31EF}.Debug|Win32.ActiveCfg = Debug|Win32 43 | {19083E48-3D97-461C-A182-B19105BB31EF}.Debug|Win32.Build.0 = Debug|Win32 44 | {19083E48-3D97-461C-A182-B19105BB31EF}.Release|Win32.ActiveCfg = Release|Win32 45 | {19083E48-3D97-461C-A182-B19105BB31EF}.Release|Win32.Build.0 = Release|Win32 46 | {A659907A-4258-4307-BC0A-D409C6D83562}.Debug|Win32.ActiveCfg = Debug|Win32 47 | {A659907A-4258-4307-BC0A-D409C6D83562}.Debug|Win32.Build.0 = Debug|Win32 48 | {A659907A-4258-4307-BC0A-D409C6D83562}.Release|Win32.ActiveCfg = Release|Win32 49 | {A659907A-4258-4307-BC0A-D409C6D83562}.Release|Win32.Build.0 = Release|Win32 50 | {6BCCC77B-68DC-4932-A59C-2673C1AFA640}.Debug|Win32.ActiveCfg = Debug|Win32 51 | {6BCCC77B-68DC-4932-A59C-2673C1AFA640}.Debug|Win32.Build.0 = Debug|Win32 52 | {6BCCC77B-68DC-4932-A59C-2673C1AFA640}.Release|Win32.ActiveCfg = Release|Win32 53 | {6BCCC77B-68DC-4932-A59C-2673C1AFA640}.Release|Win32.Build.0 = Release|Win32 54 | {2B18A9BC-6D33-4D98-8D34-C06A293B1DAC}.Debug|Win32.ActiveCfg = Debug|Win32 55 | {2B18A9BC-6D33-4D98-8D34-C06A293B1DAC}.Debug|Win32.Build.0 = Debug|Win32 56 | {2B18A9BC-6D33-4D98-8D34-C06A293B1DAC}.Release|Win32.ActiveCfg = Release|Win32 57 | {2B18A9BC-6D33-4D98-8D34-C06A293B1DAC}.Release|Win32.Build.0 = Release|Win32 58 | {44CF4107-68B2-4F00-BDB0-ECDBB8478367}.Debug|Win32.ActiveCfg = Debug|Win32 59 | {44CF4107-68B2-4F00-BDB0-ECDBB8478367}.Debug|Win32.Build.0 = Debug|Win32 60 | {44CF4107-68B2-4F00-BDB0-ECDBB8478367}.Release|Win32.ActiveCfg = Release|Win32 61 | {44CF4107-68B2-4F00-BDB0-ECDBB8478367}.Release|Win32.Build.0 = Release|Win32 62 | 
{F3F00B74-59C8-4872-BA48-53FC9EF7C56C}.Debug|Win32.ActiveCfg = Debug|Win32 63 | {F3F00B74-59C8-4872-BA48-53FC9EF7C56C}.Debug|Win32.Build.0 = Debug|Win32 64 | {F3F00B74-59C8-4872-BA48-53FC9EF7C56C}.Release|Win32.ActiveCfg = Release|Win32 65 | {F3F00B74-59C8-4872-BA48-53FC9EF7C56C}.Release|Win32.Build.0 = Release|Win32 66 | EndGlobalSection 67 | GlobalSection(SolutionProperties) = preSolution 68 | HideSolutionNode = FALSE 69 | EndGlobalSection 70 | GlobalSection(ExtensibilityGlobals) = postSolution 71 | SolutionGuid = {62E44650-175F-480C-8526-CBF7B83B464F} 72 | EndGlobalSection 73 | EndGlobal 74 | -------------------------------------------------------------------------------- /AdditiveGroves/Grove.h: -------------------------------------------------------------------------------- 1 | // Additive Groves / Grove.h: interface of class CGrove 2 | 3 | #pragma once 4 | #include "TreeNode.h" 5 | 6 | #ifndef _WIN32 7 | #include "thread_pool.h" 8 | #endif 9 | 10 | //Grove model: additive ensemble of several trees 11 | class CGrove 12 | { 13 | public: 14 | //set function for static data pointer (the pointer is shared by all groves) 15 | static void setData(INDdata& data){pData = &data;} 16 | //set function for static thread pool pointer (compiled only when _WIN32 is not defined) 17 | #ifndef _WIN32 18 | static void setPool(TThreadPool& pool){pPool = &pool;} 19 | #endif 20 | 21 | //constructor: grove of tigN trees with tree size controlled by alpha 22 | CGrove(double alpha, int tigN); 23 | 24 | //constructor: same, plus an interaction that the model is restricted on 25 | CGrove(double alpha, int tigN, intv& interaction); 26 | 27 | //rebuilds grove until convergence with predictions of other grove as starting point 28 | ddpair converge(doublevv& sinpreds, doublev& jointpreds, INDsample& sample); 29 | 30 | //trains the grove using "layered" version of the algorithm (fixed #trees, increase alpha on every step) 31 | void trainLayered(INDsample& sample); 32 | 33 | //saves the grove into the binary file 34 | void save(const char* fileName); 35 | 36 | //loads the grove from the binary file 37 | void load(fstream& fload); 38 | 39 | //calculates prediction of the whole grove for item itemNo of data set dset 40 | double predict(int itemNo, 
DATA_SET dset); 41 | 42 | //returns predictions of single trees and the whole model for all data points in the train set 43 | void batchPredict(doublevv& sinpreds, doublev& jointpreds); 44 | 45 | //outputs code for tree number treeNo of the grove into fcode 46 | void treeCode(int treeNo, fstream& fcode); 47 | 48 | private: 49 | //trains a single tree as part of training a grove 50 | void genTreeInGrove(doublev& sinpredsx, doublev& jointpreds, int treeNo, INDsample& sample); 51 | 52 | //grows a tree 53 | void growTree(CTreeNode& root, INDsample& sample); 54 | 55 | //trains several restricted trees, chooses the best 56 | void chooseTree(CTreeNode& root, doublev& othpreds, INDsample& sample); 57 | 58 | //calculates prediction of a single tree for a single item 59 | double localPredict(CTreeNode& root, int itemNo, DATA_SET dset); 60 | 61 | private: //data members 62 | static INDdata* pData; //data access pointer 63 | 64 | #ifndef _WIN32 65 | static TThreadPool* pPool; //thread pool pointer 66 | TCondition nodesCond; //condition, used for multithreading control 67 | #endif 68 | 69 | CTreeNodev roots; //roots of trees in the grove 70 | double alpha; //one of two key parameters: controls size of tree 71 | int tigN; //one of two key parameters: number of trees in the grove 72 | 73 | intv interaction; //a higher-order interaction between all these attributes 74 | //should not be allowed in the model (model is restricted on interaction) 75 | 76 | }; 77 | 78 | 79 | #ifndef _WIN32 80 | //Information required for a single node splitting job to run. 
Used for multithreading 81 | struct JobData 82 | { 83 | JobData(nodeip in_curNH, nodehstack* in_pNodes, TCondition* in_pNodesCond, int* in_pToDoN, 84 | double in_b, double in_H): 85 | curNH(in_curNH), pNodes(in_pNodes), pNodesCond(in_pNodesCond), pToDoN(in_pToDoN), b(in_b), H(in_H){} 86 | 87 | nodeip curNH; 88 | nodehstack* pNodes; 89 | TCondition* pNodesCond; 90 | int* pToDoN; 91 | double b; 92 | double H; 93 | }; 94 | #endif 95 | -------------------------------------------------------------------------------- /AdditiveGroves/Makefile: -------------------------------------------------------------------------------- 1 | SHAREDDIR=../shared 2 | LIBDIR=../ThreadPool 3 | CXXFLAGS = -I$(SHAREDDIR) -I$(LIBDIR) 4 | LIBS = -lpthread 5 | 6 | 7 | OBJS = Grove.o ag_functions.o $(LIBDIR)/thread_pool.o \ 8 | $(SHAREDDIR)/SplitInfo.o $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o \ 9 | $(SHAREDDIR)/TreeNode.o $(SHAREDDIR)/functions.o $(SHAREDDIR)/LogStream.o 10 | ADDOBJS = $(OBJS) ag_layeredjob.o 11 | PGMS = ag_predict ag_train ag_save ag_expand ag_merge ag_fs \ 12 | ag_interactions ag_nway ag_mergepreds ag_savemerge 13 | PGMOBJS = ag_predict.o ag_train.o ag_save.o ag_expand.o ag_merge.o ag_fs.o \ 14 | ag_interactions.o ag_nway.o ag_mergepreds.o ag_savemerge.o ag_layeredjob.o 15 | # XW. List of programs where parallel bagging is needed 16 | # PGMS = ag_train ag_expand ag_fs ag_interactions ag_nway 17 | # PGMOBJS = ag_train.o ag_expand.o ag_fs.o ag_interactions.o ag_nway.o 18 | .PHONY: all directories clean 19 | all: directories $(PGMS) 20 | 21 | directories: 22 | mkdir -p ../Bin 23 | 24 | clean: 25 | # XW. Should not abort if the files do not exist; removes the objects and every program listed in PGMS 26 | rm -f $(OBJS) 27 | rm -f $(PGMOBJS) 28 | rm -rf ../Bin/ag_predict ../Bin/ag_train ../Bin/ag_save ../Bin/ag_expand ../Bin/ag_merge ../Bin/ag_fs \ 29 | ../Bin/ag_interactions ../Bin/ag_nway ../Bin/ag_mergepreds ../Bin/ag_savemerge 30 | 31 | .cpp.o: 32 | g++ -O3 $(CXXFLAGS) -c $< -o $@ 33 | 34 | # XW. 
Put thread-specific data into a separate class 35 | $(SHAREDDIR)/INDdata.o: $(SHAREDDIR)/definitions.h 36 | $(SHAREDDIR)/INDsample.o: $(SHAREDDIR)/definitions.h 37 | 38 | $(SHAREDDIR)/functions.o: $(SHAREDDIR)/definitions.h 39 | $(SHAREDDIR)/SplitInfo.o: $(SHAREDDIR)/definitions.h 40 | $(SHAREDDIR)/LogStream.o: $(SHAREDDIR)/definitions.h 41 | $(SHAREDDIR)/TreeNode.o: $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o # XW 42 | 43 | Grove.o: $(SHAREDDIR)/TreeNode.o 44 | ag_functions.o: $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o TrainInfo.h # XW 45 | ag_layeredjob.o: $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o TrainInfo.h # XW 46 | 47 | ag_predict.o: $(OBJS) 48 | 49 | ag_predict: ag_predict.o $(OBJS) 50 | g++ -O3 -o ../Bin/ag_predict ag_predict.o $(OBJS) $(LIBS) 51 | 52 | ag_train.o: $(OBJS) 53 | 54 | ag_train: ag_train.o $(OBJS) 55 | g++ -O3 -o ../Bin/ag_train ag_train.o $(OBJS) $(LIBS) 56 | 57 | ag_save.o: $(OBJS) 58 | 59 | ag_save: ag_save.o $(OBJS) 60 | g++ -O3 -o ../Bin/ag_save ag_save.o $(OBJS) $(LIBS) 61 | 62 | ag_expand.o: $(OBJS) 63 | 64 | ag_expand: ag_expand.o $(OBJS) 65 | g++ -O3 -o ../Bin/ag_expand ag_expand.o $(OBJS) $(LIBS) 66 | 67 | ag_merge.o: $(OBJS) 68 | 69 | ag_merge: ag_merge.o $(OBJS) 70 | g++ -O3 -o ../Bin/ag_merge ag_merge.o $(OBJS) $(LIBS) 71 | 72 | ag_fs.o: $(ADDOBJS) 73 | 74 | ag_fs: ag_fs.o $(ADDOBJS) 75 | g++ -O3 -o ../Bin/ag_fs ag_fs.o $(ADDOBJS) $(LIBS) 76 | 77 | ag_interactions.o: $(ADDOBJS) 78 | 79 | ag_interactions: ag_interactions.o $(ADDOBJS) 80 | g++ -O3 -o ../Bin/ag_interactions ag_interactions.o $(ADDOBJS) $(LIBS) 81 | 82 | ag_nway.o: $(ADDOBJS) 83 | 84 | ag_nway: ag_nway.o $(ADDOBJS) 85 | g++ -O3 -o ../Bin/ag_nway ag_nway.o $(ADDOBJS) $(LIBS) 86 | 87 | ag_mergepreds.o: $(OBJS) 88 | 89 | ag_mergepreds: ag_mergepreds.o $(OBJS) 90 | g++ -O3 -o ../Bin/ag_mergepreds ag_mergepreds.o $(OBJS) $(LIBS) 91 | 92 | ag_savemerge.o: $(OBJS) 93 | 94 | ag_savemerge: ag_savemerge.o $(OBJS) 95 | g++ -O3 -o ../Bin/ag_savemerge ag_savemerge.o 
$(OBJS) $(LIBS) 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /AdditiveGroves/TrainInfo.h: -------------------------------------------------------------------------------- 1 | // Additive Groves / TrainInfo.h: definition of the TrainInfo structure 2 | // This structure contains all parameters relevant to training an Additive Groves model 3 | 4 | #pragma once 5 | #include "ag_definitions.h" 6 | #include "definitions.h" 7 | 8 | struct TrainInfo 9 | { 10 | public: 11 | double minAlpha; //min proportion of train set in the leaf (controls size of tree) 12 | int maxTiGN; //number of trees in a grove 13 | int bagN; //number of bagging iterations 14 | AG_TRAIN_MODE mode; //mode of training Groves (fast/slow/layered) 15 | bool rms; //performance metric: true - rms, false - roc 16 | int seed; //random number initializer 17 | 18 | bool iSet; //true if the -i argument is set (-i is used to set seed) 19 | 20 | //file names 21 | string trainFName; //train set 22 | string validFName; //validation set 23 | string testFName; //test set 24 | string attrFName; //attributes description 25 | 26 | intv interaction; //a higher-order interaction between all these attributes 27 | //should not be allowed in the model (model is restricted on interaction) 28 | //defaults: minAlpha 0.01, 8 trees, 60 bagging iterations, FAST mode, rms metric, seed 1, -i not set 29 | TrainInfo(): minAlpha(0.01), maxTiGN(8), bagN(60), mode(FAST), rms(true), seed(1), iSet(false){}; 30 | }; 31 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_definitions.h: -------------------------------------------------------------------------------- 1 | // Additive Groves / ag_definitions.h: constants, enumerators, typedefs and macros 2 | 3 | #pragma once 4 | #pragma warning(disable : 4996) 5 | 6 | enum AG_ERROR 7 | { 8 | INPUT_ERR = 101, 9 | WIN_ERR = 102, 10 | ALPHA_ERR = 103, 11 | TIGN_ERR = 104, 12 | BAGN_ERR = 105, 13 | TEMP_ERR = 106, 14 | OPEN_NWAY_ERR = 107, 15 | DIR_ERR = 108, 16 | MERGE_MISMATCH_ERR = 
109, 17 | SAME_SEED_ERR = 110, 18 | TRAIN_EQ_VALID_ERR = 111 19 | }; 20 | 21 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_functions.h: -------------------------------------------------------------------------------- 1 | // Additive Groves / ag_functions.h: declarations of Additive Groves global functions 2 | 3 | #pragma once 4 | #include "TrainInfo.h" 5 | #include "INDdata.h" 6 | 7 | //saves a vector into a binary file 8 | fstream& operator << (fstream& fbin, doublev& vec); 9 | 10 | //saves a vector of vectors into a binary file 11 | fstream& operator << (fstream& fbin, doublevv& mx); 12 | 13 | //saves a vector of vectors of vectors into a binary file 14 | fstream& operator << (fstream& fbin, doublevvv& trivec); 15 | 16 | //reads a vector from a binary file 17 | fstream& operator >> (fstream& fbin, doublev& vec); 18 | 19 | //reads a vector of vectors from a binary file 20 | fstream& operator >> (fstream& fbin, doublevv& mx); 21 | 22 | //reads a vector of vectors of vectors from a binary file 23 | fstream& operator >> (fstream& fbin, doublevvv& trivec); 24 | 25 | //generates output files for train and expand commands 26 | void trainOut(TrainInfo& ti, doublevv& dir, doublevvv& rmsV, doublevvv& surfaceV, doublevvv& predsumsV, 27 | double trainN, doublevv& dirStat, double validStD = -1.0, int startAlphaNo = 0, int startTiGNNo = 0); 28 | 29 | //converts the number of a valid alpha value into the actual value 30 | double alphaVal(int alphaNo); 31 | 32 | //converts the number of a valid TiG value into the actual value 33 | int tigVal(int tigNNo); 34 | 35 | //rounds tigN down to the closest appropriate value 36 | int adjustTiGN(int tigN); 37 | 38 | //converts min alpha value into the number of alpha values 39 | int getAlphaN(double minAlphaVal, double trainV); 40 | 41 | //converts max tigN value into the number of tigN values 42 | int getTiGNN(int tigN); 43 | 44 | //implementation for erase for reverse iterator 45 | 
void rerase(intv& vec, intv::reverse_iterator& iter); 46 | 47 | //calculate and output effect of an attribute in a model 48 | void outEffects(INDdata& data, intv attrIds, int quantN, string modelFName, string suffix = ""); 49 | 50 | //calculate and output joint effects for pairs of attributes in a model 51 | void outIPlots(INDdata& data, iipairv interactions, int quantN1, int quantN2, string modelFName, 52 | string suffix="", string fixedFName="" 53 | /*last two parameters are valid only for a list consisting of a single interaction*/); 54 | 55 | //calculate the best place on the performance grid for the interaction detection 56 | bool bestForID(doublevvv& surfaceV, bool rms, int& bestTiGNNo, int& bestAlphaNo); 57 | 58 | //adds bagNo to prefix of file names to be used in a multi-threaded setting 59 | string getPrefix(int bagNo, double alpha, int tigN); 60 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_layered.cpp: -------------------------------------------------------------------------------- 1 | // Additive Groves / ag_layered.cpp: implementations of functions for training layered Additive Groves models 2 | 3 | #include "ag_layered.h" 4 | #include "Grove.h" 5 | #include "functions.h" 6 | 7 | #include 8 | #include 9 | 10 | //trains a Layered Groves ensemble (Additive Groves trained in layered style) 11 | //if modelFName is not empty, saves the model 12 | //returns performance on validation set 13 | double layeredGroves(INDdata& data, TrainInfo& ti, string modelFName) 14 | { 15 | doublev validTar, validWt; //true response values on validation set 16 | int validN = data.getTargets(validTar, validWt, VALID); 17 | doublev predsumsV(validN, 0); //sums of predictions for each data point 18 | 19 | if(!modelFName.empty()) 20 | {//save the model's header 21 | fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out); 22 | fmodel.write((char*) &ti.mode, sizeof(enum AG_TRAIN_MODE)); 23 | fmodel.write((char*) 
&ti.maxTiGN, sizeof(int)); 24 | fmodel.write((char*) &ti.minAlpha, sizeof(double)); 25 | fmodel.close(); 26 | } 27 | 28 | //build bagged models, calculate sums of predictions 29 | for(int bagNo = 0; bagNo < ti.bagN; bagNo++) 30 | { 31 | cout << "\t\tIteration " << bagNo + 1 << " out of " << ti.bagN << endl; 32 | CGrove grove(ti.minAlpha, ti.maxTiGN, ti.interaction); 33 | INDsample sample(data); 34 | grove.trainLayered(sample); 35 | for(int itemNo = 0; itemNo < validN; itemNo++) 36 | predsumsV[itemNo] += grove.predict(itemNo, VALID); 37 | 38 | if(!modelFName.empty()) 39 | grove.save(modelFName.c_str()); 40 | } 41 | 42 | //calculate predictions of the whole ensemble on the validation set 43 | doublev predictions(validN); 44 | for(int itemNo = 0; itemNo < validN; itemNo++) 45 | predictions[itemNo] = predsumsV[itemNo] / ti.bagN; 46 | 47 | if(ti.rms) 48 | return rmse(predictions, validTar, validWt); 49 | else 50 | return roc(predictions, validTar, validWt); 51 | } 52 | 53 | //runs Layered Groves repeatN times, returns average performance and standard deviation 54 | //saves the model from the last run 55 | double meanLG(INDdata& data, TrainInfo ti, int repeatN, double& resStd, string modelFName) 56 | { 57 | doublev resVals(repeatN); 58 | int repeatNo; 59 | cout << endl << "Estimating distribution of model performance" << endl; 60 | for(repeatNo = 0; repeatNo < repeatN; repeatNo++) 61 | { 62 | cout << "\tTraining model " << repeatNo + 1 << " out of " << repeatN << endl; 63 | if(repeatNo == repeatN - 1) 64 | resVals[repeatNo] = layeredGroves(data, ti, modelFName); //save the last model 65 | else 66 | resVals[repeatNo] = layeredGroves(data, ti, string("")); 67 | } 68 | 69 | //calculate mean 70 | double resMean = 0; 71 | for(repeatNo = 0; repeatNo < repeatN; repeatNo++) 72 | resMean += resVals[repeatNo]; 73 | resMean /= repeatN; 74 | 75 | //calculate standard deviation 76 | resStd = 0; 77 | for(repeatNo = 0; repeatNo < repeatN; repeatNo++) 78 | resStd += (resMean - 
resVals[repeatNo])*(resMean - resVals[repeatNo]); 79 | resStd /= repeatN; 80 | resStd = sqrt(resStd); 81 | 82 | return resMean; 83 | } -------------------------------------------------------------------------------- /AdditiveGroves/ag_layered.h: -------------------------------------------------------------------------------- 1 | // Additive Groves / ag_layered.h: declarations of functions for training layered Additive Groves models 2 | 3 | #pragma once 4 | 5 | #include "TrainInfo.h" 6 | #include "INDdata.h" 7 | 8 | //trains and saves a Layered Groves ensemble (Additive Groves trained in layered style) 9 | double layeredGroves(INDdata& data, TrainInfo& ti, string modelFName); 10 | 11 | //runs Layered Groves repeatN times, returns average performance and standard deviation 12 | //saves the model from the last run 13 | double meanLG(INDdata& data, TrainInfo ti, int repeatN, double& resStd, string modelFName); 14 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_layeredjob.cpp: -------------------------------------------------------------------------------- 1 | //ag_layeredjob.cpp: 2 | // 3 | // (c) Xiaojie Wang 4 | 5 | #include "ag_layeredjob.h" 6 | #include "ErrLogStream.h" 7 | 8 | // Too many arguments are needed to pass to jobs 9 | struct LayeredArg 10 | { 11 | LayeredArg( 12 | int bagNo, 13 | INDdata& data, 14 | TrainInfo& ti, 15 | int validN, 16 | string modelFName, 17 | doublevv& _predsumsV 18 | ): 19 | bagNo(bagNo), 20 | data(data), 21 | ti(ti), 22 | validN(validN), 23 | modelFName(modelFName), 24 | _predsumsV(_predsumsV) 25 | {} 26 | int bagNo; 27 | INDdata& data; 28 | TrainInfo& ti; 29 | int validN; 30 | string modelFName; 31 | doublevv& _predsumsV; 32 | }; 33 | 34 | TMutex StdOutMutex; // Make sure only one thread is using the standard output 35 | TMutex DirMutex; // Probably not needed as only the first thread writes to dir 36 | TMutex ReturnMutex; // Write to the variables computed and returned by threads 
37 | 38 | // Can be used in both a single-threaded setting and a multi-threaded setting 39 | void doLayered(LayeredArg* ptr) 40 | { 41 | try 42 | { 43 | int bagNo = ptr->bagNo; 44 | INDdata& data = ptr->data; 45 | TrainInfo& ti = ptr->ti; 46 | int validN = ptr->validN; 47 | string modelFName = ptr->modelFName; 48 | 49 | INDsample sample(data); 50 | doublev __predsumsV(validN, 0); 51 | 52 | 53 | StdOutMutex.Lock(); 54 | cout << "\t\tIteration " << bagNo + 1 << " out of " << ti.bagN << " (begin)" << endl; 55 | StdOutMutex.Unlock(); 56 | 57 | CGrove grove(ti.minAlpha, ti.maxTiGN, ti.interaction); 58 | grove.trainLayered(sample); 59 | for (int itemNo = 0; itemNo < validN; itemNo ++) 60 | __predsumsV[itemNo] = grove.predict(itemNo, VALID); 61 | 62 | // Multiple threads write to different temp files 63 | if (! modelFName.empty()) 64 | { 65 | string _modelFName = getModelFName(modelFName, bagNo); 66 | // Clear previous temp files as the save function appends to the files 67 | system(("rm -f " + _modelFName).c_str()); 68 | grove.save(_modelFName.c_str()); 69 | } 70 | 71 | // Only use mutex once here and not everywhere 72 | ReturnMutex.Lock(); 73 | 74 | // Mutex is not needed because threads access different slices (memory addresses) 75 | ptr->_predsumsV[bagNo] = __predsumsV; 76 | 77 | ReturnMutex.Unlock(); 78 | // Adding mutex here doesn't reduce training time but improves reproducibility 79 | 80 | StdOutMutex.Lock(); 81 | cout << "\t\tIteration " << bagNo + 1 << " out of " << ti.bagN << " (end)" << endl; 82 | StdOutMutex.Unlock(); 83 | }catch(TE_ERROR err){ 84 | StdOutMutex.Lock(); 85 | ErrLogStream errlog; 86 | switch(err) 87 | { 88 | default: 89 | te_errMsg((TE_ERROR)err); 90 | } 91 | exit(1); 92 | StdOutMutex.Unlock(); 93 | } 94 | return; 95 | } 96 | 97 | // Wrap the doLayered function by TJob to be submitted to a thread pool 98 | class LayeredJob: public TThreadPool::TJob 99 | { 100 | public: 101 | void Run(void* ptr) 102 | { 103 | doLayered((LayeredArg*) ptr); 104 | 
} 105 | }; 106 | 107 | //trains a Layered Groves ensemble (Additive Groves trained in layered style) 108 | //if modelFName is not empty, saves the model 109 | //returns performance on validation set 110 | double layeredGroves( 111 | INDdata& data, 112 | TrainInfo& ti, 113 | string modelFName, 114 | TThreadPool& pool 115 | ) 116 | { 117 | doublev validTar, validWt; //true response values on validation set 118 | int validN = data.getTargets(validTar, validWt, VALID); 119 | doublev predsumsV(validN, 0); //sums of predictions for each data point 120 | 121 | if(!modelFName.empty()) 122 | {//save the model's header 123 | fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out); 124 | fmodel.write((char*) &ti.mode, sizeof(enum AG_TRAIN_MODE)); 125 | fmodel.write((char*) &ti.maxTiGN, sizeof(int)); 126 | fmodel.write((char*) &ti.minAlpha, sizeof(double)); 127 | fmodel.close(); 128 | } 129 | 130 | // Build bagged models, calculate sums of predictions 131 | doublevv _predsumsV(ti.bagN, doublev((validN, 0))); 132 | 133 | for(int bagNo = 0; bagNo < ti.bagN; bagNo ++) 134 | { 135 | LayeredArg* ptr = new LayeredArg( 136 | bagNo, 137 | data, 138 | ti, 139 | validN, 140 | modelFName, 141 | _predsumsV 142 | ); 143 | pool.Run(new LayeredJob, ptr); 144 | } 145 | pool.SyncAll(); 146 | 147 | for (int bagNo = 0; bagNo < ti.bagN; bagNo ++) 148 | { 149 | if (!modelFName.empty()) 150 | { 151 | CGrove grove(ti.minAlpha, ti.maxTiGN, ti.interaction); 152 | string _modelFName = getModelFName(modelFName, bagNo); 153 | fstream fload(_modelFName.c_str(), ios_base::binary | ios_base::in); 154 | grove.load(fload); 155 | grove.save(modelFName.c_str()); 156 | fload.close(); 157 | system(("rm -f " + _modelFName).c_str()); 158 | } 159 | 160 | for (int itemNo = 0; itemNo < validN; itemNo ++) 161 | predsumsV[itemNo] += _predsumsV[bagNo][itemNo]; 162 | } 163 | 164 | //calculate predictions of the whole ensemble on the validation set 165 | doublev predictions(validN); 166 | for(int itemNo = 0; 
itemNo < validN; itemNo++) 167 | predictions[itemNo] = predsumsV[itemNo] / ti.bagN; 168 | 169 | if(ti.rms) 170 | return rmse(predictions, validTar, validWt); 171 | else 172 | return roc(predictions, validTar, validWt); 173 | } 174 | 175 | //runs Layered Groves repeatN times, returns average performance and standard deviation 176 | //saves the model from the last run 177 | double meanLG( 178 | INDdata& data, 179 | TrainInfo ti, 180 | int repeatN, 181 | double& resStd, 182 | string modelFName, 183 | TThreadPool& pool 184 | ) 185 | { 186 | doublev resVals(repeatN); 187 | int repeatNo; 188 | cout << endl << "Estimating distribution of model performance" << endl; 189 | for(repeatNo = 0; repeatNo < repeatN; repeatNo++) 190 | { 191 | cout << "\tTraining model " << repeatNo + 1 << " out of " << repeatN << endl; 192 | if(repeatNo == repeatN - 1) 193 | { 194 | //save the last model 195 | resVals[repeatNo] = layeredGroves(data, ti, modelFName, pool); 196 | } 197 | else 198 | resVals[repeatNo] = layeredGroves(data, ti, string(""), pool); 199 | } 200 | 201 | //calculate mean 202 | double resMean = 0; 203 | for(repeatNo = 0; repeatNo < repeatN; repeatNo++) 204 | resMean += resVals[repeatNo]; 205 | resMean /= repeatN; 206 | 207 | //calculate standard deviation 208 | resStd = 0; 209 | for(repeatNo = 0; repeatNo < repeatN; repeatNo++) 210 | resStd += (resMean - resVals[repeatNo])*(resMean - resVals[repeatNo]); 211 | resStd /= repeatN; 212 | resStd = sqrt(resStd); 213 | 214 | return resMean; 215 | } 216 | 217 | string getModelFName(string modelFName, int bagNo) 218 | { 219 | string _modelFName = string("./AGTemp/") 220 | + insertSuffix(modelFName, "b." 
+ itoa(bagNo, 10)); 221 | return _modelFName; 222 | } 223 | 224 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_layeredjob.h: -------------------------------------------------------------------------------- 1 | //ag_layeredjob.h: 2 | // 3 | // (c) Xiaojie Wang 4 | 5 | #include "TrainInfo.h" 6 | #include "INDdata.h" 7 | #include "Grove.h" 8 | 9 | #include "functions.h" 10 | 11 | #include 12 | #include 13 | #include "thread_pool.h" 14 | 15 | struct LayeredArg; 16 | void doLayered(LayeredArg* ptr); 17 | class LayeredJob; 18 | 19 | //trains and saves a Layered Groves ensemble (Additive Groves trained in layered style) 20 | double layeredGroves( 21 | INDdata& data, 22 | TrainInfo& ti, 23 | string modelFName, 24 | TThreadPool& pool 25 | ); // XW 26 | 27 | //runs Layered Groves repeatN times, returns average performance and standard deviation, saves the last model 28 | double meanLG( 29 | INDdata& db, 30 | TrainInfo ti, 31 | int repeatN, 32 | double& resStd, 33 | string modelFName, 34 | TThreadPool& pool 35 | ); // XW 36 | 37 | string getModelFName(string modelFName, int bagNo); // XW 38 | 39 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_predict.cpp: -------------------------------------------------------------------------------- 1 | //Additive Groves / ag_predict.cpp: main function of executable ag_predict 2 | 3 | #include "Grove.h" 4 | #include "TrainInfo.h" 5 | #include "LogStream.h" 6 | #include "ErrLogStream.h" 7 | #include "functions.h" 8 | #include "ag_definitions.h" 9 | 10 | #include 11 | 12 | //ag_predict -p _test_set_ -r _attr_file_ [-m _model_file_name_] [-o _output_file_name_] [-c rms|roc] | -version 13 | int main(int argc, char* argv[]) 14 | { 15 | try{ 16 | //0. 
Set log file 17 | LogStream telog; 18 | telog << "\n-----\nag_predict "; 19 | for(int argNo = 1; argNo < argc; argNo++) 20 | telog << argv[argNo] << " "; 21 | telog << "\n\n"; 22 | 23 | if((argc > 1) && !string(argv[1]).compare("-version")) 24 | { 25 | telog << "TreeExtra version " << VERSION << "\n"; 26 | return 0; 27 | } 28 | 29 | //1. Set default values of parameters 30 | string modelFName = "model.bin"; //name of the input file for the model 31 | string predFName = "preds.txt"; //name of the output file for predictions 32 | 33 | TrainInfo ti; 34 | 35 | //2. Set parameters from command line 36 | //check that the number of arguments is even (flags + value pairs) 37 | if(argc % 2 == 0) 38 | throw INPUT_ERR; 39 | //convert input parameters to string from char* 40 | stringv args(argc); 41 | for(int argNo = 0; argNo < argc; argNo++) 42 | args[argNo] = string(argv[argNo]); 43 | 44 | //parse and save input parameters 45 | //indicators of presence of required flags in the input 46 | bool hasTest = false; 47 | bool hasAttr = false; 48 | 49 | for(int argNo = 1; argNo < argc; argNo += 2) 50 | { 51 | if(!args[argNo].compare("-m")) 52 | modelFName = args[argNo + 1]; 53 | else if(!args[argNo].compare("-o")) 54 | predFName = args[argNo + 1]; 55 | else if(!args[argNo].compare("-p")) 56 | { 57 | ti.testFName = args[argNo + 1]; 58 | hasTest = true; 59 | } 60 | else if(!args[argNo].compare("-r")) 61 | { 62 | ti.attrFName = args[argNo + 1]; 63 | hasAttr = true; 64 | } 65 | else if(!args[argNo].compare("-c")) 66 | { 67 | if(!args[argNo + 1].compare("roc")) 68 | ti.rms = false; 69 | else if(!args[argNo + 1].compare("rms")) 70 | ti.rms = true; 71 | else 72 | throw INPUT_ERR; 73 | } 74 | else 75 | throw INPUT_ERR; 76 | } 77 | 78 | if(!(hasTest && hasAttr)) 79 | throw INPUT_ERR; 80 | 81 | //2. 
Load data 82 | INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 83 | ti.attrFName.c_str()); 84 | CGrove::setData(data); 85 | CTreeNode::setData(data); 86 | 87 | //3. Open model file, read its header 88 | fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::in); 89 | fmodel.read((char*) &ti.mode, sizeof(enum AG_TRAIN_MODE)); 90 | if(ti.mode == FAST) 91 | {//skip information about fast training - it is not used in this command 92 | int dirN = 0; 93 | fmodel.read((char*) &dirN, sizeof(int)); 94 | bool dirStub = false; 95 | for(int dirNo = 0; dirNo < dirN; dirNo++) 96 | fmodel.read((char*) &dirStub, sizeof(bool)); 97 | } 98 | fmodel.read((char*) &ti.maxTiGN, sizeof(int)); 99 | fmodel.read((char*) &ti.minAlpha, sizeof(double)); 100 | if(fmodel.fail() || (ti.maxTiGN < 1)) 101 | throw MODEL_ERR; 102 | 103 | //4. Load models, get predictions 104 | doublev testTar, testWt; 105 | int testN = data.getTargets(testTar, testWt, TEST); 106 | doublev preds(testN, 0); 107 | 108 | ti.bagN = 0; 109 | cout << "Calculating predictions " << endl; 110 | while(fmodel.peek() != char_traits::eof()) 111 | {//load next Grove in the ensemble 112 | ti.bagN++; 113 | cout << "Iteration " << ti.bagN << endl; 114 | CGrove grove(ti.minAlpha, ti.maxTiGN); 115 | grove.load(fmodel); 116 | 117 | //get predictions, add them to predictions of previous models 118 | for(int itemNo = 0; itemNo < testN; itemNo++) 119 | preds[itemNo] += grove.predict(itemNo, TEST); 120 | } 121 | 122 | //get bagged predictions of the ensemble 123 | for(int itemNo = 0; itemNo < testN; itemNo++) 124 | preds[itemNo] /= ti.bagN; 125 | 126 | //5. 
Output predictions into the output file and performance value (if available) to std output 127 | fstream fpreds; 128 | fpreds.open(predFName.c_str(), ios_base::out); 129 | for(int itemNo = 0; itemNo < testN; itemNo++) 130 | fpreds << preds[itemNo] << endl; 131 | fpreds.close(); 132 | 133 | if(data.hasTrueTest()) 134 | { 135 | double performance; 136 | if(ti.rms) 137 | { 138 | performance = rmse(preds, testTar, testWt); 139 | telog << "\nRMSE: " << performance << "\n"; 140 | } 141 | else 142 | { 143 | performance = roc(preds, testTar, testWt); 144 | telog << "\nROC: " << performance << "\n"; 145 | } 146 | } 147 | 148 | }catch(TE_ERROR err){ 149 | te_errMsg((TE_ERROR)err); 150 | return 1; 151 | }catch(AG_ERROR err){ 152 | ErrLogStream errlog; 153 | switch(err) 154 | { 155 | case INPUT_ERR: 156 | errlog << "Usage: ag_predict -p _test_set_ -r _attr_file_name_ " 157 | << "[-m _model_file_name_] [-o _output_file_name_] [-c rms|roc] | -version\n"; 158 | break; 159 | default: 160 | throw err; 161 | } 162 | return 1; 163 | }catch(exception &e){ 164 | ErrLogStream errlog; 165 | string errstr(e.what()); 166 | exception_errMsg(errstr); 167 | errlog << "Error: " << errstr << "\n"; 168 | return 1; 169 | }catch(...){ 170 | string errstr = strerror(errno); 171 | ErrLogStream errlog; 172 | errlog << "Error: " << errstr << "\n"; 173 | return 1; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /AdditiveGroves/ag_save.cpp: -------------------------------------------------------------------------------- 1 | //Additive Groves / ag_save.cpp: main function of executable ag_save 2 | 3 | #include "ag_functions.h" 4 | #include "functions.h" 5 | #include "Grove.h" 6 | #include "LogStream.h" 7 | #include "ErrLogStream.h" 8 | #include "ag_definitions.h" 9 | 10 | #include 11 | 12 | //ag_save [-m _model_file_name_] [-a _alpha_value] [-n _N_value_] [-b _bagging_iterations_] | -version 13 | int main(int argc, char* argv[]) 14 | { 15 | try{ 16 | 
//0. Set log file 17 | LogStream telog; 18 | telog << "\n-----\nag_save "; 19 | for(int argNo = 1; argNo < argc; argNo++) 20 | telog << argv[argNo] << " "; 21 | telog << "\n\n"; 22 | 23 | if((argc > 1) && !string(argv[1]).compare("-version")) 24 | { 25 | telog << "TreeExtra version " << VERSION << "\n"; 26 | return 0; 27 | } 28 | 29 | //1. Read values of parameters from files 30 | string modelFName = "model.bin"; //name of the output file for the model 31 | TrainInfo ti; 32 | //parameters of the model to be saved 33 | int saveTiGN, saveBagN, iStub; 34 | double saveAlpha, trainN, dStub; 35 | string sStub; 36 | 37 | //read values of Groves parameters that produced best results 38 | fstream fbest; 39 | fbest.open("./AGTemp/best.txt", ios_base::in); 40 | fbest >> dStub >> saveTiGN >> saveAlpha >> saveBagN >> trainN; 41 | if(fbest.fail()) 42 | throw TEMP_ERR; 43 | fbest.close(); 44 | 45 | //read values of parameters for which models are trained 46 | fstream fparam; 47 | fparam.open("./AGTemp/params.txt", ios_base::in); 48 | string modeStr, attrFName; 49 | fparam >> iStub >> sStub >> sStub >> ti.attrFName >> ti.minAlpha >> ti.maxTiGN 50 | >> ti.bagN >> modeStr; 51 | //modeStr should be "fast" or "slow" or "layered" 52 | if(modeStr.compare("fast") == 0) 53 | ti.mode = FAST; 54 | else if(modeStr.compare("slow") == 0) 55 | ti.mode = SLOW; 56 | else if(modeStr.compare("layered") == 0) 57 | ti.mode = LAYERED; 58 | else 59 | throw TEMP_ERR; 60 | if(fparam.fail()) 61 | throw TEMP_ERR; 62 | fparam.close(); 63 | 64 | //2. 
Set parameters from command line 65 | 66 | //check that the number of arguments is even (flags + value pairs) 67 | if(argc % 2 == 0) 68 | throw INPUT_ERR; 69 | //convert input parameters to string from char* 70 | stringv args(argc); 71 | for(int argNo = 0; argNo < argc; argNo++) 72 | args[argNo] = string(argv[argNo]); 73 | 74 | //parse and save input parameters 75 | for(int argNo = 1; argNo < argc; argNo += 2) 76 | { 77 | if(!args[argNo].compare("-a")) 78 | saveAlpha = atofExt(argv[argNo + 1]); 79 | else if(!args[argNo].compare("-n")) 80 | saveTiGN = atoiExt(argv[argNo + 1]); 81 | else if(!args[argNo].compare("-b")) 82 | saveBagN = atoiExt(argv[argNo + 1]); 83 | else if(!args[argNo].compare("-m")) 84 | { 85 | modelFName = args[argNo + 1]; 86 | if(modelFName.empty()) 87 | throw EMPTY_MODEL_NAME_ERR; 88 | } 89 | else 90 | throw INPUT_ERR; 91 | }//end for(int argNo = 1; argNo < argc; argNo += 2) 92 | 93 | if((saveAlpha < ti.minAlpha) || (saveAlpha > 1)) 94 | throw ALPHA_ERR; 95 | if(saveTiGN > ti.maxTiGN) 96 | throw TIGN_ERR; 97 | if(saveBagN > ti.bagN) 98 | throw BAGN_ERR; 99 | 100 | //adjust alpha, if needed 101 | double newAlpha = adjustAlpha(saveAlpha, trainN); 102 | if(saveAlpha != newAlpha) 103 | { 104 | telog << "Warning: alpha value was rounded to the closest valid value " << newAlpha << ".\n\n"; 105 | saveAlpha = newAlpha; 106 | } 107 | //adjust saveTiGN, if needed 108 | int newTiGN = adjustTiGN(saveTiGN); 109 | if(saveTiGN != newTiGN) 110 | { 111 | telog << "Warning: N value was rounded to the closest smaller valid value " << newTiGN << ".\n\n"; 112 | saveTiGN = newTiGN; 113 | } 114 | 115 | int alphaN = getAlphaN(ti.minAlpha, trainN); 116 | int tigNN = getTiGNN(ti.maxTiGN); 117 | int saveAlphaNo = getAlphaN(saveAlpha, trainN) - 1; 118 | int saveTiGNNo = getTiGNN(saveTiGN) - 1; 119 | boolv dir; //path on the parameter grid 120 | 121 | telog << "Alpha = " << saveAlpha << "\nN = " << saveTiGN << "\n" 122 | << saveBagN << " bagging iterations" << "\n\n"; 123 | 
124 | //3. Load info about attributes 125 | INDdata data("", "", "", ti.attrFName.c_str()); 126 | CGrove::setData(data); 127 | CTreeNode::setData(data); 128 | 129 | //4. For fast models, figure out the directions path. 130 | if(ti.mode == FAST) 131 | {//read the directions table from file 132 | doublevv dirMx(tigNN, doublev(alphaN, 0)); 133 | //outer array: column (by TiGN) 134 | //middle array: row (by alpha) 135 | fstream fdir; 136 | fdir.open("./AGTemp/dir.txt", ios_base::in); 137 | for(int tigNNo = 0; tigNNo < tigNN; tigNNo++) 138 | for(int alphaNo = 0; alphaNo < alphaN; alphaNo++) 139 | fdir >> dirMx[tigNNo][alphaNo]; 140 | if(fdir.fail()) 141 | throw TEMP_ERR; 142 | fdir.close(); 143 | 144 | //set dir - path from (0,0) to (bestAlphaNo, bestTigNNo) 145 | int tigNNo = saveTiGNNo; 146 | int alphaNo = saveAlphaNo; 147 | for(int dirNo = 0; dirNo < saveTiGNNo + saveAlphaNo; dirNo++) 148 | if(dirMx[tigNNo][alphaNo] == 1) //UP 149 | { 150 | dir.insert(dir.begin(),true); 151 | tigNNo--; 152 | } 153 | else 154 | { 155 | dir.insert(dir.begin(),false); 156 | alphaNo--; 157 | } 158 | } 159 | 160 | //5. 
Save the model 161 | fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::out); 162 | //save ti.mode, dir (if ti.mode==FAST) and saveTiGN 163 | fmodel.write((char*) &ti.mode, sizeof(enum AG_TRAIN_MODE)); 164 | if(ti.mode == FAST) 165 | { 166 | int dirN = (int)dir.size(); 167 | fmodel.write((char*) &dirN, sizeof(int)); 168 | for(int dirNo = 0; dirNo < saveTiGNNo + saveAlphaNo; dirNo++) 169 | { 170 | bool d = dir[dirNo]; 171 | fmodel.write((char*) &d, sizeof(bool)); 172 | } //can't write directly from dir[dirNo] because vector is not a usual stl type 173 | } 174 | fmodel.write((char*) &saveTiGN, sizeof(int)); 175 | fmodel.write((char*) &saveAlpha, sizeof(double)); 176 | fmodel.close(); 177 | 178 | //generate the title of the file with the trees 179 | const int buflen = 1024; 180 | char buf[buflen]; 181 | string treesFName = string("./AGTemp/ag.a.") 182 | + alphaToStr(saveAlpha) 183 | + ".n." 184 | + itoa(saveTiGN, 10) 185 | + ".tmp"; 186 | //read saveBagN groves and save them to the output file 187 | fstream ftrees(treesFName.c_str(), ios_base::binary | ios_base::in); 188 | for(int groveNo = 0; groveNo < saveBagN; groveNo++) 189 | { 190 | CGrove grove(saveAlpha, saveTiGN); 191 | grove.load(ftrees); 192 | grove.save(modelFName.c_str()); 193 | } 194 | ftrees.close(); 195 | 196 | }catch(TE_ERROR err){ 197 | te_errMsg((TE_ERROR)err); 198 | return 1; 199 | }catch(AG_ERROR err){ 200 | ErrLogStream errlog; 201 | switch(err) 202 | { 203 | case INPUT_ERR: 204 | errlog << "Usage: ag_save [-m _output_file_name_] [-a _alpha_value_] [-n _N_value_] " 205 | << "[-b _bagging_iterations_] | -version\n"; 206 | break; 207 | case TEMP_ERR: 208 | errlog << "Error: temporary files from previous runs of train/expand " 209 | << "are missing or corrupted.\n"; 210 | break; 211 | case ALPHA_ERR: 212 | errlog << "Error: alpha value is out of [0;1] range " 213 | << "or less than in the last run of train/expand.\n"; 214 | break; 215 | case TIGN_ERR: 216 | errlog << "Input error: N 
value is greater than in the last run of train/expand.\n"; 217 | break; 218 | case BAGN_ERR: 219 | errlog << "Input error: number of bagging iterations is greater than " 220 | << "in the last run of train/expand.\n"; 221 | break; 222 | default: 223 | throw err; 224 | } 225 | return 1; 226 | }catch(exception &e){ 227 | ErrLogStream errlog; 228 | string errstr(e.what()); 229 | exception_errMsg(errstr); 230 | errlog << "Error: " << errstr << "\n"; 231 | return 1; 232 | }catch(...){ 233 | string errstr = strerror(errno); 234 | ErrLogStream errlog; 235 | errlog << "Error: " << errstr << "\n"; 236 | return 1; 237 | } 238 | return 0; 239 | } 240 | -------------------------------------------------------------------------------- /BaggedTrees/BaggedTrees.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.21005.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bt_train", "..\win\bt_train\bt_train.vcxproj", "{5146957E-98AD-46DB-AA17-EFD5F21A963A}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bt_predict", "..\win\bt_predict\bt_predict.vcxproj", "{2FB0A616-9772-42F9-8356-242C04DB2464}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gbt_train", "..\win\gbt_train\gbt_train.vcxproj", "{00D8D791-4CD2-424C-B92C-65BCF0167D0A}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Win32 = Debug|Win32 15 | Release|Win32 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {5146957E-98AD-46DB-AA17-EFD5F21A963A}.Debug|Win32.ActiveCfg = Debug|Win32 19 | {5146957E-98AD-46DB-AA17-EFD5F21A963A}.Debug|Win32.Build.0 = Debug|Win32 20 | {5146957E-98AD-46DB-AA17-EFD5F21A963A}.Release|Win32.ActiveCfg = Release|Win32 21 | 
{5146957E-98AD-46DB-AA17-EFD5F21A963A}.Release|Win32.Build.0 = Release|Win32 22 | {2FB0A616-9772-42F9-8356-242C04DB2464}.Debug|Win32.ActiveCfg = Debug|Win32 23 | {2FB0A616-9772-42F9-8356-242C04DB2464}.Debug|Win32.Build.0 = Debug|Win32 24 | {2FB0A616-9772-42F9-8356-242C04DB2464}.Release|Win32.ActiveCfg = Release|Win32 25 | {2FB0A616-9772-42F9-8356-242C04DB2464}.Release|Win32.Build.0 = Release|Win32 26 | {00D8D791-4CD2-424C-B92C-65BCF0167D0A}.Debug|Win32.ActiveCfg = Debug|Win32 27 | {00D8D791-4CD2-424C-B92C-65BCF0167D0A}.Debug|Win32.Build.0 = Debug|Win32 28 | {00D8D791-4CD2-424C-B92C-65BCF0167D0A}.Release|Win32.ActiveCfg = Release|Win32 29 | {00D8D791-4CD2-424C-B92C-65BCF0167D0A}.Release|Win32.Build.0 = Release|Win32 30 | EndGlobalSection 31 | GlobalSection(SolutionProperties) = preSolution 32 | HideSolutionNode = FALSE 33 | EndGlobalSection 34 | EndGlobal 35 | -------------------------------------------------------------------------------- /BaggedTrees/Makefile: -------------------------------------------------------------------------------- 1 | SHAREDDIR=../shared 2 | LIBDIR=../ThreadPool 3 | CXXFLAGS = -I$(SHAREDDIR) -I$(LIBDIR) 4 | OBJS = Tree.o bt_functions.o $(LIBDIR)/thread_pool.o \ 5 | $(SHAREDDIR)/SplitInfo.o $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o \ 6 | $(SHAREDDIR)/TreeNode.o $(SHAREDDIR)/functions.o $(SHAREDDIR)/LogStream.o # XW 7 | PGMS = bt_predict bt_train gbt_train 8 | PGMOBJS = bt_predict.o bt_train.o gbt_train.o 9 | LIBS = -lpthread 10 | 11 | all: directories $(PGMS) 12 | 13 | directories: 14 | mkdir -p ../Bin 15 | 16 | clean: 17 | # XW. 
// Bagged Trees: all parameters relevant to training Bagged Trees.
struct TrainInfo
{
public:
    int bagN;       //number of bagging iterations
    int seed;       //random number initializer
    double alpha;   //min proportion of train set in the leaf (controls size of tree)
    bool rms;       //rms/roc performance metric

    bool iSet;      // Whether -i argument is set and -i is used to set seed

    //file names
    string trainFName;  //train set
    string validFName;  //validation set
    string testFName;   //test set
    string attrFName;   //attributes description

    //Defaults: 60 bagging iterations, seed 1, alpha 0, rms metric, -i not set.
    //iSet used to be left uninitialized, so reading it before an explicit
    //assignment was undefined behaviour.
    TrainInfo(): bagN(60), seed(1), alpha(0), rms(true), iSet(false) {}
};
seed(1), alpha(0), rms(true) {}; 23 | }; 24 | -------------------------------------------------------------------------------- /BaggedTrees/Tree.h: -------------------------------------------------------------------------------- 1 | // Bagged Trees / Tree.h: interface of class CTree 2 | 3 | #pragma once 4 | #include "TreeNode.h" 5 | 6 | #ifndef _WIN32 7 | #include "thread_pool.h" 8 | #endif 9 | 10 | //Regression Tree model 11 | class CTree 12 | { 13 | public: 14 | //set function for static data pointer 15 | static void setData(INDdata& data){pData = &data;} 16 | 17 | #ifndef _WIN32 18 | static void setPool(TThreadPool& pool){pPool = &pool;} 19 | #endif 20 | 21 | //constructor 22 | CTree(double alpha = 0); 23 | 24 | //grows a tree, increases attribute counts 25 | // Variable attrCounts is shared and modified by different bagging threads 26 | void growGBT(bool doFS, doublev& attrCounts, INDsample& sample); 27 | // Variable curAttrCounts is local and specific to a certain bagging thread 28 | void growBT(bool doFS, doublev& curAttrCounts, INDsample& sample); 29 | 30 | //saves the tree into the binary file 31 | void save(const char* fileName); 32 | 33 | //loads the tree from the binary file 34 | void load(fstream& fload); 35 | 36 | //calculates prediction of the model for a single item 37 | double predict(int itemNo, DATA_SET dset); 38 | 39 | //loads data into the root 40 | void setRoot(INDsample& sample); 41 | 42 | //input: predictions for train set data points produced by the rest of the model (not by this tree) 43 | //Changes ground truth to residuals in the root train set 44 | void resetRoot(doublev& othpreds); 45 | 46 | private: 47 | static INDdata* pData; //data access pointer 48 | 49 | #ifndef _WIN32 50 | static TThreadPool* pPool; //thread pool pointer 51 | TCondition nodesCond; //condition, used for multithreading control 52 | #endif 53 | 54 | CTreeNode root; //root of the tree 55 | double alpha; //training parameter: controls size of the tree 56 | }; 57 | 58 | 
#ifndef _WIN32 59 | //Information required for a single node splitting job to run. Used for multithreading 60 | struct JobData 61 | { 62 | JobData( 63 | nodeip in_curNH, 64 | nodehstack* in_pNodes, 65 | TCondition* in_pNodesCond, 66 | int* in_pToDoN, 67 | doublev* in_pAttrCounts, 68 | double in_b, 69 | double in_H, 70 | INDsample& sample 71 | ): 72 | curNH(in_curNH), 73 | pNodes(in_pNodes), 74 | pNodesCond(in_pNodesCond), 75 | pToDoN(in_pToDoN), alpha(0), //alpha has no constructor argument; zero it so the member never holds an indeterminate value 76 | pAttrCounts(in_pAttrCounts), 77 | b(in_b), 78 | H(in_H), 79 | sample(sample) 80 | {} 81 | 82 | nodeip curNH; 83 | nodehstack* pNodes; 84 | TCondition* pNodesCond; 85 | int* pToDoN; 86 | double alpha; //not supplied through the constructor, initialized to 0 87 | doublev* pAttrCounts; 88 | double b; 89 | double H; 90 | INDsample& sample; 91 | }; 92 | #endif 93 | -------------------------------------------------------------------------------- /BaggedTrees/bt_definitions.h: -------------------------------------------------------------------------------- 1 | // Bagged Trees / bt_definitions.h: constants, enumerators, typedefs and macros 2 | 3 | #pragma once 4 | #pragma warning(disable : 4996) 5 | 6 | enum BT_ERROR 7 | { 8 | INPUT_ERR = 101, 9 | WIN_ERR = 102, 10 | ALPHA_ERR = 103 11 | }; 12 | 13 | -------------------------------------------------------------------------------- /BaggedTrees/bt_functions.cpp: -------------------------------------------------------------------------------- 1 | // Bagged Trees / bt_functions.cpp: definitions of global functions for Bagged Trees 2 | 3 | #include "bt_functions.h" 4 | 5 | #include "functions.h" 6 | 7 | //comparison by the second element 8 | bool idGreater(idpair id1, idpair id2) 9 | { 10 | return id1.second > id2.second; 11 | } 12 | 13 | string getModelFName(string modelFName, int bagNo) 14 | { 15 | string _modelFName = string("./BTTemp/") 16 | + insertSuffix(modelFName, "b."
+ itoa(bagNo, 10)); 17 | return _modelFName; 18 | } 19 | -------------------------------------------------------------------------------- /BaggedTrees/bt_functions.h: -------------------------------------------------------------------------------- 1 | // Bagging/bt_functions.h: declarations of global functions for Bagged Trees 2 | 3 | #include "definitions.h" 4 | 5 | //comparison by the second element 6 | bool idGreater(idpair id1, idpair id2); 7 | 8 | string getModelFName(string modelFName, int bagNo); 9 | -------------------------------------------------------------------------------- /BaggedTrees/bt_predict.cpp: -------------------------------------------------------------------------------- 1 | //Bagged Trees / bt_predict.cpp: main function of executable bt_predict 2 | 3 | #include 4 | 5 | #include "LogStream.h" 6 | #include "ErrLogStream.h" 7 | #include "Tree.h" 8 | #include "bt_definitions.h" 9 | #include "TrainInfo.h" 10 | #include "functions.h" 11 | 12 | 13 | //bt_predict -p _test_set_ -r _attr_file_ [-m _model_file_name_] [-o _output_file_name_] [-c rms|roc] 14 | //[-l log|nolog] | -version 15 | int main(int argc, char* argv[]) 16 | { 17 | try{ 18 | //0. -version mode 19 | if((argc > 1) && !string(argv[1]).compare("-version")) 20 | { 21 | LogStream telog; 22 | telog << "\n-----\nbt_predict "; 23 | for(int argNo = 1; argNo < argc; argNo++) 24 | telog << argv[argNo] << " "; 25 | telog << "\n\n"; 26 | 27 | telog << "TreeExtra version " << VERSION << "\n"; 28 | return 0; 29 | } 30 | 31 | //1. 
Analyze input parameters 32 | string modelFName = "model.bin"; //name of the input file for the model 33 | string predFName = "preds.txt"; //name of the output file for predictions 34 | bool doOut = true; //whether to output log information to stdout 35 | 36 | TrainInfo ti; 37 | 38 | //check that the number of arguments is even (flags + value pairs) 39 | if(argc % 2 == 0) 40 | throw INPUT_ERR; 41 | //convert input parameters to string from char* 42 | stringv args(argc); 43 | for(int argNo = 0; argNo < argc; argNo++) 44 | args[argNo] = string(argv[argNo]); 45 | 46 | //parse and save input parameters 47 | //indicators of presence of required flags in the input 48 | bool hasTest = false; 49 | bool hasAttr = false; 50 | 51 | for(int argNo = 1; argNo < argc; argNo += 2) 52 | { 53 | if(!args[argNo].compare("-m")) 54 | modelFName = args[argNo + 1]; 55 | else if(!args[argNo].compare("-o")) 56 | predFName = args[argNo + 1]; 57 | else if(!args[argNo].compare("-p")) 58 | { 59 | ti.testFName = args[argNo + 1]; 60 | hasTest = true; 61 | } 62 | else if(!args[argNo].compare("-r")) 63 | { 64 | ti.attrFName = args[argNo + 1]; 65 | hasAttr = true; 66 | } 67 | else if(!args[argNo].compare("-c")) 68 | { 69 | if(!args[argNo + 1].compare("roc")) 70 | ti.rms = false; 71 | else if(!args[argNo + 1].compare("rms")) 72 | ti.rms = true; 73 | else 74 | throw INPUT_ERR; 75 | } 76 | else if(!args[argNo].compare("-l")) 77 | { 78 | if(!args[argNo + 1].compare("log")) 79 | doOut = true; 80 | else if(!args[argNo + 1].compare("nolog")) 81 | doOut = false; 82 | else 83 | throw INPUT_ERR; 84 | } 85 | else 86 | throw INPUT_ERR; 87 | } 88 | 89 | if(!(hasTest && hasAttr)) 90 | throw INPUT_ERR; 91 | 92 | //1a. Set log file 93 | LogStream telog; 94 | LogStream::doOut = doOut; 95 | telog << "\n-----\nbt_predict "; 96 | for(int argNo = 1; argNo < argc; argNo++) 97 | telog << argv[argNo] << " "; 98 | telog << "\n\n"; 99 | 100 | //2. 
Load data 101 | INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 102 | ti.attrFName.c_str(), doOut); 103 | CTree::setData(data); 104 | CTreeNode::setData(data); 105 | 106 | //3. Open model file 107 | fstream fmodel(modelFName.c_str(), ios_base::binary | ios_base::in); 108 | //read AG header 109 | AG_TRAIN_MODE mode; 110 | int tigN; 111 | double alpha; 112 | fmodel.read((char*) &mode, sizeof(enum AG_TRAIN_MODE)); 113 | fmodel.read((char*) &tigN, sizeof(int)); 114 | fmodel.read((char*) &alpha, sizeof(double)); 115 | 116 | if(fmodel.fail() || (mode != SLOW) || (tigN != 1)) 117 | throw MODEL_ERR; 118 | 119 | //4. Load models, get predictions 120 | doublev testTar, testWt; 121 | int testN = data.getTargets(testTar, testWt, TEST); 122 | doublev preds(testN, 0); 123 | 124 | ti.bagN = 0; 125 | while(fmodel.peek() != char_traits::eof()) 126 | {//load next tree in the ensemble 127 | ti.bagN++; 128 | if(doOut) 129 | cout << "Iteration " << ti.bagN << endl; 130 | CTree tree; 131 | tree.load(fmodel); 132 | 133 | //get predictions, add them to predictions of previous models 134 | for(int itemNo = 0; itemNo < testN; itemNo++) 135 | preds[itemNo] += tree.predict(itemNo, TEST); 136 | } 137 | if(ti.bagN == 0) throw MODEL_ERR; //model file holds a header but no trees: averaging would divide by zero and write NaN predictions 138 | //get bagged predictions of the ensemble 139 | for(int itemNo = 0; itemNo < testN; itemNo++) 140 | preds[itemNo] /= ti.bagN; 141 | 142 | //5.
Output predictions into the output file and performance on test set (if available) to std output 143 | fstream fpreds; 144 | fpreds.open(predFName.c_str(), ios_base::out); 145 | for(int itemNo = 0; itemNo < testN; itemNo++) 146 | fpreds << preds[itemNo] << endl; 147 | fpreds.close(); 148 | 149 | if(data.hasTrueTest()) 150 | { 151 | double performance = -1; 152 | if(ti.rms) 153 | { 154 | performance = rmse(preds, testTar, testWt); 155 | telog << "RMSE: " << performance << "\n"; 156 | } 157 | else 158 | { 159 | performance = roc(preds, testTar, testWt); 160 | telog << "ROC: " << performance << "\n"; 161 | } 162 | if(!doOut) 163 | cout << performance << endl; 164 | } 165 | 166 | 167 | }catch(TE_ERROR err){ 168 | te_errMsg((TE_ERROR)err); 169 | return 1; 170 | }catch(BT_ERROR err){ 171 | ErrLogStream errlog; 172 | switch(err) 173 | { 174 | case INPUT_ERR: 175 | errlog << "Usage: bt_predict -p _test_set_ -r _attr_file_name_ " 176 | << "[-m _model_file_name_] [-o _output_file_name_] [-c rms|roc] [-l log|nolog] | -version\n"; 177 | break; 178 | default: 179 | throw err; 180 | } 181 | return 1; 182 | }catch(exception &e){ 183 | ErrLogStream errlog; 184 | string errstr(e.what()); 185 | exception_errMsg(errstr); 186 | errlog << "Error: " << errstr << "\n"; 187 | return 1; 188 | }catch(...){ 189 | string errstr = strerror(errno); 190 | ErrLogStream errlog; 191 | errlog << "Error: " << errstr << "\n"; 192 | return 1; 193 | } 194 | return 0; 195 | } 196 | -------------------------------------------------------------------------------- /OctavePlots/fix_title.m: -------------------------------------------------------------------------------- 1 | function fixed=fix_title(s) 2 | 3 | fixed=strrep(s,'_','\_'); 4 | -------------------------------------------------------------------------------- /OctavePlots/make_effect_plot.m: -------------------------------------------------------------------------------- 1 | % (C) Alexander Sorokin, Daria Sorokina, 2009 2 | % License: New BSD. 
3 | 4 | function make_effect_plot(xvalues, xcounts, values, plot_title) 5 | 6 | clf 7 | axes('position',[0.1 0.2,0.8 0.7]) 8 | hold on 9 | 10 | %lc = [198 198 198]/255; 11 | %lw = 3; 12 | 13 | n = numel(xvalues); 14 | hasMV = (xvalues(n) == 0); 15 | if(hasMV) 16 | h = plot(xvalues(1:n-1), values(1:n-1)); 17 | else 18 | h = plot(xvalues, values); 19 | end 20 | set(h,'LineWidth', 5); 21 | set(h,'Color', [127 127 127]/255) 22 | if(hasMV) 23 | mv_off = 1; 24 | if n > 2 25 | mv_off = (xvalues(n-1) - xvalues(1)) / 10; 26 | end 27 | xvalues(n) = xvalues(n-1) + mv_off; 28 | mv = plot(xvalues(n), values(n), "*"); 29 | set(mv,'LineWidth', 5); 30 | set(mv,'Color', [127 127 127]/255) 31 | end 32 | 33 | xLegend=cell(1, numel(xvalues)); 34 | m = n; 35 | if(hasMV) 36 | m=n-1; 37 | end 38 | for iX=1:m 39 | if xcounts(iX) == 1 40 | s = num2str(xvalues(iX)); 41 | else 42 | s = [num2str(xvalues(iX)) ' (x ' num2str(xcounts(iX)) ')']; 43 | end 44 | xLegend{iX} = s; 45 | end 46 | if(hasMV) 47 | if xcounts(n) == 1 48 | s = '?'; 49 | else 50 | s = ['? (x ' num2str(xcounts(n)) ')']; 51 | end 52 | xLegend{n} = s; 53 | end 54 | set(gca, 'XTick', xvalues, 'XTickLabel', xLegend) 55 | rotateticklabel(gca,45); 56 | title(fix_title(plot_title)); -------------------------------------------------------------------------------- /OctavePlots/make_interaction_plot.m: -------------------------------------------------------------------------------- 1 | % (C) Alexander Sorokin, Daria Sorokina 2 | % License: New BSD. 
3 | 4 | function make_interaction_plot(xvalues, xcounts, yvalues, ycounts, values, density, xlabelstr, plot_title) 5 | 6 | clf 7 | hold on 8 | lw_scale = 6/numel(yvalues); 9 | lw_offset = 3; 10 | lc_scale = [198 198 198]/255/numel(yvalues); 11 | lc_offset = [0 0 0]; 12 | 13 | celln = sum(xcounts)*sum(ycounts); 14 | top = (1/celln) * 10; 15 | bottom = (1/celln) / 10; 16 | 17 | n=numel(xvalues); 18 | m=n; 19 | hasMV=(xvalues(n)==0); 20 | if hasMV 21 | m=n-1; 22 | mv_off = 1; 23 | if n > 2 24 | mv_off = (xvalues(n-1) - xvalues(1)) / 10; 25 | end 26 | xvalues(n) = xvalues(n-1) + mv_off; 27 | end 28 | for iY = 1:numel(yvalues) 29 | lw = iY*lw_scale+lw_offset; 30 | lc = iY*lc_scale+lc_offset; 31 | handles(iY) = plot(xvalues(1:m), values(iY,1:m),'LineWidth',lw,'Color',lc); 32 | if hasMV 33 | plot(xvalues(n), values(iY,n), "*", 'LineWidth',lw,'Color',lc); 34 | end 35 | %plot high/low density markers 36 | for iX = 1:numel(xvalues) 37 | if density(iY, iX) > top 38 | plot(xvalues(iX), values(iY, iX), 'og', 'MarkerSize', 12, 'LineWidth', 3); 39 | end 40 | if density(iY, iX) < bottom 41 | plot(xvalues(iX), values(iY, iX), 'or', 'MarkerSize', 12, 'LineWidth', 3); 42 | end 43 | end 44 | end 45 | hold off 46 | 47 | xLegend = cell(1, numel(xvalues)); 48 | for iX = 1:m 49 | s = num2str(xvalues(iX)); 50 | if xcounts(iX) > 1 51 | s = [s ' (x' num2str(xcounts(iX)) ')']; 52 | end 53 | xLegend{iX} = s; 54 | end 55 | if(hasMV) 56 | xLegend{n} = '?'; 57 | if xcounts(n) > 1 58 | xLegend{n} = ['? 
(x' num2str(xcounts(n)) ')' ]; 59 | end 60 | end 61 | set(gca,'XTick', xvalues, 'XTickLabel', xLegend ); 62 | xlh = xlabel(xlabelstr); 63 | set(xlh, 'FontSize', 20); 64 | rotateticklabel(gca,45); 65 | 66 | yLegend = cell(1, numel(yvalues)); 67 | for iY = 1:numel(yvalues) 68 | s = num2str(yvalues(iY)); 69 | if ycounts(iY) > 1 70 | s = [s ' (x' num2str(ycounts(iY)) ')']; 71 | end 72 | yLegend{iY} = s; 73 | end 74 | yn = numel(yvalues); 75 | if yvalues(yn) == 0 76 | yLegend{yn} = '?'; 77 | if ycounts(yn) > 1 78 | yLegend{yn} = ['? (x' num2str(ycounts(yn)) ')' ]; 79 | end 80 | end 81 | 82 | legend(handles, yLegend,'Location','NorthEastOutside'); 83 | 84 | title(fix_title(plot_title)); -------------------------------------------------------------------------------- /OctavePlots/plot_effects.m: -------------------------------------------------------------------------------- 1 | folder='./'; 2 | 3 | files=dir([folder '*effect.txt']); 4 | for iF = 1:numel(files) 5 | 6 | fn = files(iF).name; 7 | 8 | data = dlmread([folder fn],'\t',2,0); 9 | 10 | xcounts = data(1:end, 1); 11 | xvalues = data(1:end, 2); 12 | values = data(1:end, 3); 13 | 14 | make_effect_plot(xvalues, xcounts, values, fn) 15 | 16 | print(gcf,'-depsc',[folder fn '.eps']); 17 | end -------------------------------------------------------------------------------- /OctavePlots/plot_interactions.m: -------------------------------------------------------------------------------- 1 | % (C) Alexander Sorokin, Daria Sorokina 2 | % License: New BSD. 3 | 4 | folder='./'; 5 | 6 | files=dir([folder '*iplot.txt']); 7 | for iF=1:numel(files) 8 | 9 | fn=files(iF).name; %fn='x1.x3.iplot.txt'; 10 | fn2=strrep(fn,'iplot.txt','iplot.dens.txt'); 11 | if ~exist(fullfile(folder, fn2),'file') 12 | warning(['Missing file ' fn2 ' for file ' fn '. 
Skipping ' fn '.']); 13 | continue 14 | end 15 | 16 | f=fopen(fullfile(folder, fn), 'r'); 17 | s=fgets(f); 18 | s=fgets(f); 19 | var1 = strtrim(s(8:end)); 20 | s=fgets(f); 21 | var2 = strtrim(s(11:end)); 22 | fclose(f); 23 | [data]=dlmread(fullfile(folder, fn),'\t',5,0); 24 | 25 | xvalues = data(2, 3:end)'; 26 | xcounts = data(1, 3:end)'; 27 | yvalues = data(3:end, 2); 28 | ycounts = data(3:end, 1); 29 | values = data(3:end, 3:end); 30 | if xcounts(1) == 0 31 | error(['Incorrect data for quantile counts. If your data comes from LRTree, you should be using plot_interactions_lrtree script instead']) 32 | continue 33 | end 34 | [density]=dlmread(fullfile(folder, fn2),'\t',4,0); 35 | 36 | make_interaction_plot(xvalues, xcounts, yvalues, ycounts, values, density, var2, fn); 37 | 38 | print(gcf,'-depsc',[folder fn '.eps']); 39 | %print(gcf,'-djpeg90',[folder fn '.jpg']); 40 | 41 | make_interaction_plot(yvalues, ycounts, xvalues, xcounts, values', density', var1, ['Flipped ' fn]); 42 | 43 | print(gcf,'-depsc',[folder fn '.flipped.eps']); 44 | %print(gcf,'-djpeg90',[folder fn '.flipped.jpg']); 45 | 46 | end -------------------------------------------------------------------------------- /OctavePlots/rotateticklabel.m: -------------------------------------------------------------------------------- 1 | function th=rotateticklabel(h,rot,demo) 2 | %ROTATETICKLABEL rotates tick labels 3 | % TH=ROTATETICKLABEL(H,ROT) is the calling form where H is a handle to 4 | % the axis that contains the XTickLabels that are to be rotated. ROT is 5 | % an optional parameter that specifies the angle of rotation. The default 6 | % angle is 90. TH is a handle to the text objects created. For long 7 | % strings such as those produced by datetick, you may have to adjust the 8 | % position of the axes so the labels don't get cut off. 9 | % 10 | % Of course, GCA can be substituted for H if desired. 11 | % 12 | % TH=ROTATETICKLABEL([],[],'demo') shows a demo figure. 
13 | % 14 | % Known deficiencies: if tick labels are raised to a power, the power 15 | % will be lost after rotation. 16 | % 17 | % See also datetick. 18 | 19 | % Written Oct 14, 2005 by Andy Bliss 20 | % Copyright 2005 by Andy Bliss 21 | 22 | %DEMO: 23 | if nargin==3 24 | x=[now-.7 now-.3 now]; 25 | y=[20 35 15]; 26 | figure 27 | plot(x,y,'.-') 28 | datetick('x',0,'keepticks') 29 | h=gca; 30 | set(h,'position',[0.13 0.35 0.775 0.55]) 31 | rot=90; 32 | end 33 | 34 | %set the default rotation if user doesn't specify 35 | if nargin==1 36 | rot=90; 37 | end 38 | %make sure the rotation is in the range 0:360 (brute force method) 39 | while rot>360 40 | rot=rot-360; 41 | end 42 | while rot<0 43 | rot=rot+360; 44 | end 45 | %get current tick labels 46 | a=get(h,'XTickLabel'); 47 | %erase current tick labels from figure 48 | set(h,'XTickLabel',[]); 49 | %get tick label positions 50 | b=get(h,'XTick'); 51 | c=get(h,'YTick'); 52 | %make new tick labels 53 | if rot<180 54 | th=text(b,repmat(c(1)-.1*(c(2)-c(1)),length(b),1),a,'HorizontalAlignment','right','rotation',rot); 55 | else 56 | th=text(b,repmat(c(1)-.1*(c(2)-c(1)),length(b),1),a,'HorizontalAlignment','left','rotation',rot); 57 | end 58 | 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TreeExtra 2 | Additive Groves, Bagged Trees with Feature Evaluation, Interaction Detection, Visualization of Feature Effects 3 | 4 | This is a development version of the TreeExtra package. You can download the latest release directly from this repository (https://github.com/dariasor/TreeExtra/releases). 5 | You can find detailed manuals, early releases, related research papers and talks on http://dariasor.github.io/TreeExtra/. 6 | 7 | TreeExtra is maintained for Linux, macOS and Windows platforms. 
8 | -------------------------------------------------------------------------------- /Visualization/Makefile: -------------------------------------------------------------------------------- 1 | SHAREDDIR = ../shared 2 | AGDIR = ../AdditiveGroves 3 | LIBDIR=../ThreadPool 4 | CXXFLAGS = -I$(SHAREDDIR) -I$(AGDIR) -I$(LIBDIR) 5 | OBJS = $(AGDIR)/ag_functions.o $(AGDIR)/Grove.o $(LIBDIR)/thread_pool.o \ 6 | $(SHAREDDIR)/SplitInfo.o $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o \ 7 | $(SHAREDDIR)/TreeNode.o $(SHAREDDIR)/functions.o $(SHAREDDIR)/LogStream.o # XW 8 | PGMS = vis_iplot vis_effect vis_correlations 9 | PGMOBJS = vis_iplot.o vis_effect.o vis_correlations.o 10 | LIBS = -lpthread 11 | 12 | all: directories $(PGMS) 13 | 14 | directories: 15 | mkdir -p ../Bin 16 | 17 | clean: 18 | rm -f $(OBJS) 19 | rm -f $(PGMOBJS) 20 | rm -f $(PGMS) 21 | 22 | .cpp.o: 23 | g++ -O3 $(CXXFLAGS) -c $< -o $@ 24 | 25 | $(SHAREDDIR)/INDdata.o: $(SHAREDDIR)/definitions.h 26 | $(SHAREDDIR)/INDsample.o: $(SHAREDDIR)/definitions.h # XW 27 | $(SHAREDDIR)/functions.o: $(SHAREDDIR)/definitions.h 28 | $(SHAREDDIR)/SplitInfo.o: $(SHAREDDIR)/definitions.h 29 | $(SHAREDDIR)/LogStream.o: $(SHAREDDIR)/definitions.h 30 | $(SHAREDDIR)/TreeNode.o: $(SHAREDDIR)/INDdata.o $(SHAREDDIR)/INDsample.o # XW 31 | 32 | 33 | vis_iplot.o: $(OBJS) 34 | 35 | vis_effect.o: $(OBJS) 36 | 37 | vis_correlations.o: $(OBJS) 38 | 39 | vis_iplot: vis_iplot.o $(OBJS) 40 | g++ -O3 -o ../Bin/vis_iplot vis_iplot.o $(OBJS) $(LIBS) 41 | 42 | vis_effect: vis_effect.o $(OBJS) 43 | g++ -O3 -o ../Bin/vis_effect vis_effect.o $(OBJS) $(LIBS) 44 | 45 | vis_correlations: vis_correlations.o $(OBJS) 46 | g++ -O3 -o ../Bin/vis_correlations vis_correlations.o $(OBJS) $(LIBS) 47 | -------------------------------------------------------------------------------- /Visualization/Visualization.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 
3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.27130.2024 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vis_effect", "..\win\vis_effect\vis_effect.vcxproj", "{FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vis_iplot", "..\win\vis_iplot\vis_iplot.vcxproj", "{F2EF2C55-F16C-4D7E-A9FB-16599193F316}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vis_correlations", "..\win\vis_correlations\vis_correlations.vcxproj", "{C0F27E3C-2533-456D-897D-DDDE9A68A679}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Win32 = Debug|Win32 15 | Debug|x64 = Debug|x64 16 | Release|Win32 = Release|Win32 17 | Release|x64 = Release|x64 18 | EndGlobalSection 19 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 20 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}.Debug|Win32.ActiveCfg = Debug|Win32 21 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}.Debug|Win32.Build.0 = Debug|Win32 22 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}.Debug|x64.ActiveCfg = Debug|Win32 23 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}.Release|Win32.ActiveCfg = Release|Win32 24 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}.Release|Win32.Build.0 = Release|Win32 25 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA}.Release|x64.ActiveCfg = Release|Win32 26 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316}.Debug|Win32.ActiveCfg = Debug|Win32 27 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316}.Debug|Win32.Build.0 = Debug|Win32 28 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316}.Debug|x64.ActiveCfg = Debug|Win32 29 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316}.Release|Win32.ActiveCfg = Release|Win32 30 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316}.Release|Win32.Build.0 = Release|Win32 31 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316}.Release|x64.ActiveCfg = Release|Win32 32 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Debug|Win32.ActiveCfg = Debug|Win32 33 | 
{C0F27E3C-2533-456D-897D-DDDE9A68A679}.Debug|Win32.Build.0 = Debug|Win32 34 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Debug|x64.ActiveCfg = Debug|x64 35 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Debug|x64.Build.0 = Debug|x64 36 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Release|Win32.ActiveCfg = Release|Win32 37 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Release|Win32.Build.0 = Release|Win32 38 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Release|x64.ActiveCfg = Release|x64 39 | {C0F27E3C-2533-456D-897D-DDDE9A68A679}.Release|x64.Build.0 = Release|x64 40 | EndGlobalSection 41 | GlobalSection(SolutionProperties) = preSolution 42 | HideSolutionNode = FALSE 43 | EndGlobalSection 44 | GlobalSection(ExtensibilityGlobals) = postSolution 45 | SolutionGuid = {57E8CE6C-81C8-452F-9851-A6E798854C9A} 46 | EndGlobalSection 47 | EndGlobal 48 | -------------------------------------------------------------------------------- /Visualization/vis_correlations.cpp: -------------------------------------------------------------------------------- 1 | //Visualization / vis_correlations.cpp: main function of the executable vis_correlations 2 | 3 | #include "LogStream.h" 4 | #include "ErrLogStream.h" 5 | #include "functions.h" 6 | #include "vis_definitions.h" 7 | #include "INDdata.h" 8 | #include "INDsample.h" 9 | 10 | #include 11 | 12 | //vis_correlations -t _training_set_ -r _attr_file_ | -version 13 | int main(int argc, char* argv[]) 14 | { 15 | try{ 16 | //0. Set log file 17 | LogStream telog; 18 | telog << "\n-----\nvis_correlations "; 19 | for(int argNo = 1; argNo < argc; argNo++) 20 | telog << argv[argNo] << " "; 21 | telog << "\n\n"; 22 | 23 | if((argc > 1) && !string(argv[1]).compare("-version")) 24 | { 25 | telog << "TreeExtra version " << VERSION << "\n"; 26 | return 0; 27 | } 28 | 29 | //1. Set default values of parameters 30 | string trainFName; 31 | string attrFName; 32 | 33 | //2. 
Set parameters from command line 34 | //check that the number of arguments is even (flags + value pairs) 35 | if(argc % 2 == 0) 36 | throw VIS_INPUT_ERR; 37 | //convert input parameters to string from char* 38 | stringv args(argc); 39 | for(int argNo = 0; argNo < argc; argNo++) 40 | args[argNo] = string(argv[argNo]); 41 | 42 | //parse and save input parameters 43 | //indicators of presence of required flags in the input 44 | bool hasTrain = false; 45 | bool hasAttr = false; 46 | 47 | for(int argNo = 1; argNo < argc; argNo += 2) 48 | { 49 | if(!args[argNo].compare("-t")) 50 | { 51 | trainFName = args[argNo + 1]; 52 | hasTrain = true; 53 | } 54 | else if(!args[argNo].compare("-r")) 55 | { 56 | attrFName = args[argNo + 1]; 57 | hasAttr = true; 58 | } 59 | else 60 | throw VIS_INPUT_ERR; 61 | } 62 | 63 | if(!(hasTrain && hasAttr)) 64 | throw VIS_INPUT_ERR; 65 | 66 | //2. Load data 67 | INDdata data(trainFName.c_str(), "", "", attrFName.c_str()); 68 | 69 | //3. Calculate and output correlations 70 | INDsample sample(data); 71 | sample.newBag(); 72 | sample.correlations(trainFName); 73 | 74 | 75 | }catch(TE_ERROR err){ 76 | te_errMsg((TE_ERROR)err); 77 | return 1; 78 | }catch(VIS_ERROR err){ 79 | ErrLogStream errlog; 80 | switch(err) 81 | { 82 | case VIS_INPUT_ERR: 83 | errlog << "Usage: vis_correlations -t _training_set_ -r _attr_file_ | -version\n "; 84 | break; 85 | default: 86 | throw err; 87 | } 88 | return 1; 89 | }catch(exception &e){ 90 | ErrLogStream errlog; 91 | string errstr(e.what()); 92 | exception_errMsg(errstr); 93 | errlog << "Error: " << errstr << "\n"; 94 | return 1; 95 | }catch(...){ 96 | string errstr = strerror(errno); 97 | ErrLogStream errlog; 98 | errlog << "Error: " << errstr << "\n"; 99 | return 1; 100 | } 101 | return 0; 102 | } 103 | -------------------------------------------------------------------------------- /Visualization/vis_definitions.h: -------------------------------------------------------------------------------- 1 | // 
Visualization / vis_definitions.h: constants, enumerators, typedefs and macros 2 | 3 | #pragma once 4 | #pragma warning(disable : 4996) 5 | 6 | enum VIS_ERROR 7 | { 8 | VIS_INPUT_ERR = 101 9 | }; 10 | 11 | -------------------------------------------------------------------------------- /Visualization/vis_effect.cpp: -------------------------------------------------------------------------------- 1 | //Visualization / vis_effect.cpp: main function of executable vis_effect 2 | 3 | #include "Grove.h" 4 | #include "TrainInfo.h" 5 | #include "LogStream.h" 6 | #include "ErrLogStream.h" 7 | #include "functions.h" 8 | #include "vis_definitions.h" 9 | #include "ag_functions.h" 10 | 11 | #include 12 | 13 | //vis_effect -v _validation_set_ -r _attr_file_ -f _feature_ [-m _model_file_name_] [-o _output_suffix_] 14 | //[-q _#quantile_values_] | -version 15 | int main(int argc, char* argv[]) 16 | { 17 | try{ 18 | //0. Set log file 19 | LogStream telog; 20 | telog << "\n-----\nvis_effect "; 21 | for(int argNo = 1; argNo < argc; argNo++) 22 | telog << argv[argNo] << " "; 23 | telog << "\n\n"; 24 | 25 | if((argc > 1) && !string(argv[1]).compare("-version")) 26 | { 27 | telog << "TreeExtra version " << VERSION << "\n"; 28 | return 0; 29 | } 30 | 31 | //1. Set default values of parameters 32 | string modelFName = "model.bin"; //name of the input file for the model 33 | string suffix; //suffix of the output file 34 | int quantN = 10; //number of quantile point values to plot 35 | 36 | TrainInfo ti; 37 | string attrName; // partial dependence attribute name 38 | 39 | //2. 
Set parameters from command line 40 | //check that the number of arguments is even (flags + value pairs) 41 | if(argc % 2 == 0) 42 | throw VIS_INPUT_ERR; 43 | //convert input parameters to string from char* 44 | stringv args(argc); 45 | for(int argNo = 0; argNo < argc; argNo++) 46 | args[argNo] = string(argv[argNo]); 47 | 48 | //parse and save input parameters 49 | //indicators of presence of required flags in the input 50 | bool hasVal = false; 51 | bool hasAttr = false; 52 | bool hasFeature = false; 53 | 54 | for(int argNo = 1; argNo < argc; argNo += 2) 55 | { 56 | if(!args[argNo].compare("-m")) 57 | modelFName = args[argNo + 1]; 58 | else if(!args[argNo].compare("-o")) 59 | suffix = args[argNo + 1]; 60 | else if(!args[argNo].compare("-v")) 61 | { 62 | ti.validFName = args[argNo + 1]; 63 | hasVal = true; 64 | } 65 | else if(!args[argNo].compare("-r")) 66 | { 67 | ti.attrFName = args[argNo + 1]; 68 | hasAttr = true; 69 | } 70 | else if(!args[argNo].compare("-f")) 71 | { 72 | attrName = args[argNo + 1]; 73 | hasFeature = true; 74 | } 75 | else if(!args[argNo].compare("-q")) 76 | quantN = atoi(argv[argNo + 1]); 77 | else 78 | throw VIS_INPUT_ERR; 79 | } 80 | 81 | if(!(hasVal && hasAttr && hasFeature)) 82 | throw VIS_INPUT_ERR; 83 | 84 | //2. Load data 85 | INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 86 | ti.attrFName.c_str()); 87 | CGrove::setData(data); 88 | CTreeNode::setData(data); 89 | 90 | int attrId = data.getAttrId(attrName); 91 | if(!data.isActive(attrId)) 92 | throw ATTR_NAME_ERR; 93 | 94 | //3. Calculate and output data for feature effect plot 95 | outEffects(data, intv(1,attrId), quantN, modelFName, suffix); 96 | 97 | string in_suffix; 98 | if(suffix.size()) 99 | in_suffix = "." 
+ suffix; 100 | string outFName = attrName + in_suffix + ".effect.txt"; 101 | telog << "Partial dependence function values are saved into the file " << outFName << ".\n"; 102 | 103 | }catch(TE_ERROR err){ 104 | te_errMsg((TE_ERROR)err); 105 | return 1; 106 | }catch(VIS_ERROR err){ 107 | ErrLogStream errlog; 108 | switch(err) 109 | { 110 | case VIS_INPUT_ERR: 111 | errlog << "Usage: -v _validation_set_ -r _attr_file_ -f _feature_ [-m _model_file_name_] " 112 | << "[-o _output_file_name_] [-q _#quantile_values_] | -version\n"; 113 | break; 114 | default: 115 | throw err; 116 | } 117 | return 1; 118 | }catch(exception &e){ 119 | ErrLogStream errlog; 120 | string errstr(e.what()); 121 | exception_errMsg(errstr); 122 | errlog << "Error: " << errstr << "\n"; 123 | return 1; 124 | }catch(...){ 125 | string errstr = strerror(errno); 126 | ErrLogStream errlog; 127 | errlog << "Error: " << errstr << "\n"; 128 | return 1; 129 | } 130 | return 0; 131 | } 132 | -------------------------------------------------------------------------------- /Visualization/vis_iplot.cpp: -------------------------------------------------------------------------------- 1 | //Visualization / vis_iplot.cpp: main function of executable vis_iplot 2 | 3 | #include "Grove.h" 4 | #include "TrainInfo.h" 5 | #include "LogStream.h" 6 | #include "ErrLogStream.h" 7 | #include "functions.h" 8 | #include "vis_definitions.h" 9 | #include "ag_functions.h" 10 | 11 | #include 12 | 13 | 14 | 15 | //vis_iplot -v _validation_set_ -r _attr_file_ -f1 _feature1_ -f2 _feature2_ [-q1 _#quantile_values1_] 16 | //[-q2 _#quantile_values2_] [-m _model_file_name_] [-o _output_file_suffix_] [-x _fixed_values_file_] | -version 17 | int main(int argc, char* argv[]) 18 | { 19 | try{ 20 | //0. 
Set log file 21 | LogStream telog; 22 | telog << "\n-----\nvis_iplot "; 23 | for(int argNo = 1; argNo < argc; argNo++) 24 | telog << argv[argNo] << " "; 25 | telog << "\n\n"; 26 | 27 | if((argc > 1) && !string(argv[1]).compare("-version")) 28 | { 29 | telog << "TreeExtra version " << VERSION << "\n"; 30 | return 0; 31 | } 32 | 33 | //1. Set default values of parameters 34 | string modelFName = "model.bin"; //name of the input file for the model 35 | string suffix; //suffix for the output files 36 | string fixedFName; //name of the input file for fixed attributes and their values 37 | int quantN1 = 10; //number of quantile point values to plot for feature 1 38 | int quantN2 = 10; //number of quantile point values to plot for feature 2 39 | 40 | TrainInfo ti; 41 | string attrName1, attrName2; //interacting attribute names 42 | 43 | //2. Set parameters from command line 44 | //check that the number of arguments is even (flags + value pairs) 45 | if(argc % 2 == 0) 46 | throw VIS_INPUT_ERR; 47 | //convert input parameters to string from char* 48 | stringv args(argc); 49 | for(int argNo = 0; argNo < argc; argNo++) 50 | args[argNo] = string(argv[argNo]); 51 | 52 | //parse and save input parameters 53 | //indicators of presence of required flags in the input 54 | bool hasVal = false; 55 | bool hasAttr = false; 56 | bool hasF1 = false; 57 | bool hasF2 = false; 58 | 59 | for(int argNo = 1; argNo < argc; argNo += 2) 60 | { 61 | if(!args[argNo].compare("-m")) 62 | modelFName = args[argNo + 1]; 63 | else if(!args[argNo].compare("-o")) 64 | suffix = args[argNo + 1]; 65 | else if(!args[argNo].compare("-x")) 66 | fixedFName = args[argNo + 1]; 67 | else if(!args[argNo].compare("-v")) 68 | { 69 | ti.validFName = args[argNo + 1]; 70 | hasVal = true; 71 | } 72 | else if(!args[argNo].compare("-r")) 73 | { 74 | ti.attrFName = args[argNo + 1]; 75 | hasAttr = true; 76 | } 77 | else if(!args[argNo].compare("-f1")) 78 | { 79 | attrName1 = args[argNo + 1]; 80 | hasF1 = true; 81 | } 82 | else 
if(!args[argNo].compare("-f2")) 83 | { 84 | attrName2 = args[argNo + 1]; 85 | hasF2 = true; 86 | } 87 | else if(!args[argNo].compare("-q1")) 88 | quantN1 = atoi(argv[argNo + 1]); 89 | else if(!args[argNo].compare("-q2")) 90 | quantN2 = atoi(argv[argNo + 1]); 91 | else 92 | throw VIS_INPUT_ERR; 93 | } 94 | 95 | if(!(hasVal && hasAttr && hasF1 && hasF2)) 96 | throw VIS_INPUT_ERR; 97 | 98 | //2. Load data 99 | INDdata data(ti.trainFName.c_str(), ti.validFName.c_str(), ti.testFName.c_str(), 100 | ti.attrFName.c_str()); 101 | CGrove::setData(data); 102 | CTreeNode::setData(data); 103 | 104 | //3. Calculate and output data for the interaction plot 105 | int attrId1 = data.getAttrId(attrName1); 106 | int attrId2 = data.getAttrId(attrName2); 107 | if(!data.isActive(attrId1) || !data.isActive(attrId2)) 108 | throw ATTR_NAME_ERR; 109 | 110 | outIPlots(data, iipairv(1, iipair(attrId1, attrId2)), quantN1, quantN2, modelFName, 111 | suffix, fixedFName); 112 | 113 | string in_suffix; 114 | if(suffix.size()) 115 | in_suffix = "." + suffix; 116 | string outFName = attrName1 + "." 
+ attrName2 + in_suffix + ".iplot.txt"; 117 | 118 | string denFName = insertSuffix(outFName, "dens"); 119 | 120 | telog << "Joint effect values are saved into file " << outFName << ".\n"; 121 | telog << "Density table is saved into file " << denFName << ".\n"; 122 | 123 | }catch(TE_ERROR err){ 124 | te_errMsg((TE_ERROR)err); 125 | return 1; 126 | }catch(VIS_ERROR err){ 127 | ErrLogStream errlog; 128 | switch(err) 129 | { 130 | case VIS_INPUT_ERR: 131 | errlog << "Usage: -v _validation_set_ -r _attr_file_ -f1 _feature1_ -f2 _feature2_ " 132 | << "[-q1 _#quantile_values1_] [-q2 _#quantile_values2_] [-m _model_file_name_] " 133 | << "[-o _output_file_suffix_] [-x _fixed_values_file_] | -version\n"; 134 | break; 135 | default: 136 | throw err; 137 | } 138 | return 1; 139 | }catch(exception &e){ 140 | ErrLogStream errlog; 141 | string errstr(e.what()); 142 | exception_errMsg(errstr); 143 | errlog << "Error: " << errstr << "\n"; 144 | return 1; 145 | }catch(...){ 146 | string errstr = strerror(errno); 147 | ErrLogStream errlog; 148 | errlog << "Error: " << errstr << "\n"; 149 | return 1; 150 | } 151 | return 0; 152 | } 153 | -------------------------------------------------------------------------------- /docs/AG_manual.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/AG_manual.htm -------------------------------------------------------------------------------- /docs/AG_model.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/AG_model.htm -------------------------------------------------------------------------------- /docs/AG_quickstart.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Additive Groves - quick start 5 | 6 | 7 | 8 | Back to TreeExtra 
package web page
9 | Back to Additive Groves manual web page 10 |

11 | Additive Groves - quick start 12 |

13 |
  • 14 |

    Download executable binaries from the latest TreeExtra release. You can also download sources and compile them, but normally you don't have to. TreeExtra binaries are standalone tools that run from the command line. (On Windows, you need to use something like Command Prompt (cmd) to run command-line tools.)

    15 |
  • 16 |

    Prepare your data set following the instructions on input data format. 17 | You will need train, validation, test and attribute files. Here is a sample synthetic data set: 18 | data.train, data.valid, 19 | data.test, data.attr.

    20 |
  • 21 |

    Create a new folder where you want to run this experiment and cd there. Output and temporary files 22 | will be placed in this folder.

    23 |
  • 24 |

    Run ag_train (if needed, modify the file names in the following command 25 | line):

    26 | > ag_train -t data.train -v data.valid -r data.attr

    27 |
  • 28 |

    The log output will end with a recommendation on which command to run next. Most likely the 29 | recommendation will be to run ag_expand. Keep following 30 | the recommendations (it often takes about 6 runs of ag_expand) until you 31 | run ag_save.

    32 | ... recommendation: ag_expand -b 90
    33 | > ag_expand -b 90
    34 | ... recommendation: ag_expand -b 140
    35 | > ag_expand -b 140 36 | ...
    37 | ... recommendation: ag_save -a 0.02 -n 6
    38 | > ag_save -a 0.02 -n 6

    39 | 40 | The best model is saved in the file model.bin.
    41 |

  • 42 |

    Run ag_predict on the test data:

    43 | > ag_predict -p data.test -r data.attr
    44 | ... RMSE: 0.574717

    45 | 46 | That's it. The predictions on the test set are saved in preds.txt.

    47 |
  • 48 |

    If you can afford to increase the running time of the program, I recommend that you repeat the same 49 | experiment in slow mode. This will create a model with better predictive performance. To do so, 50 | run ag_train with an additional flag -s slow. The 51 | rest of the process is the same.
    52 | > ag_train -t data.train -v data.valid -r data.attr -s slow
    53 | > ag_expand -n 16 -b 90
    54 | > ag_expand -b 140 55 | ...
    56 | > ag_save -a 0.05 -n 4
    57 | > ag_predict -p data.test -r data.attr
    58 | ... RMSE: 0.565393

    59 | 60 |
  • 61 |

    Check out the rest of the Additive Groves manual for other options like 62 | parallelization, evaluation by ROC, "superfast" training with fixed parameters, etc.

    63 |
64 | 65 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /docs/AdditiveGroves.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/AdditiveGroves.ppt -------------------------------------------------------------------------------- /docs/BT_manual.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/BT_manual.htm -------------------------------------------------------------------------------- /docs/CMU_2010.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/CMU_2010.ppt -------------------------------------------------------------------------------- /docs/Data_Format.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Data Format 5 | 6 | 7 | 8 | Back to TreeExtra package web page 9 |

Data format

10 |

Data files

11 |

12 | Train, validation and test data sets for all tools in TreeExtra should be provided in separate tab-delimited 13 | text files without any headers. Only continuous and Boolean features are supported. Nominal features are allowed in the data file, but they should be explicitly 14 | marked as unused in the attribute file (see next section). 15 | Missing values should be encoded with question marks. All data sets should have 16 | the same number and order of columns.  If your test data does not have labels, 17 | you can put missing values instead, but the column should still be present. Binary classification problems should use 0 and 1 for the response 18 | values.

19 |

Attribute files

20 |

21 | A separate attribute file describing the data is required. I reused the idea of an attribute 22 | file from the IND package, so the format of this file should be compatible with IND 23 | to some extent.

24 |

25 | Each line in the first part of the attribute file corresponds to a single attribute. 26 | The order of attributes should be the same as in the data file.

27 |

28 | The structure of the attribute description is the following:

29 |

30 | _attr_name_: _type_ [(class)|(weight)].

31 |

32 | _type_ should be either cont for continuous features, 0,1 for boolean, or 33 | nom for nominals. (class) 34 | marks the label; there should be exactly one attribute marked with (class) per attribute 35 | file. (weight) marks the column with weights.
36 | The first and the second parts of the attribute file should be separated by a line 37 | "contexts:".
38 | The second part lists attributes that should not be used for training. Each line 39 | contains one attribute in the format:

40 |

41 | _attr_name_ never

42 |

43 | Here is an example of a valid attribute file:

44 |

45 | 46 | latitude: cont. 47 | longtitude: cont. 48 | x: cont. 49 | y: cont. 50 | name: nom. 51 | label: 0,1 (class). 52 | region_Pacific: 0,1. 53 | region_Mountain: 0,1. 54 | region_NA: 0,1. 55 | coefficient: cont (weight). 56 | contexts: 57 | y never 58 | x never 59 | name never 60 | 61 |

62 | 63 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /docs/Interactions.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/Interactions.ppt -------------------------------------------------------------------------------- /docs/TreeExtra.css: -------------------------------------------------------------------------------- 1 |  body { font-family:Times New Roman;font-size:12pt} 2 | .code { white-space: pre ; font-size:10pt;font-family: Courier New;font-weight: bolder;} 3 | .snippet { white-space: pre ; font-size:10pt;font-family: Courier New;font-face:bold;} 4 | .codeblue { white-space: pre ; font-size:10pt; font-weight: bolder;font-family: Courier New;font-face:bold; color: #000099;} 5 | .output { white-space: pre ;font-size:10pt; font-weight: bolder;font-family: Courier New;font-face:bold; color: #009900;} 6 | .folder {font-family: Courier New;} 7 | -------------------------------------------------------------------------------- /docs/background.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/background.gif -------------------------------------------------------------------------------- /docs/cookie.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 |

Visits from this machine will not show in Google Analytics for this website any more.

20 | 21 | -------------------------------------------------------------------------------- /docs/data.attr: -------------------------------------------------------------------------------- 1 | class: 0,1 (class). 2 | x1: cont. 3 | x2: cont. 4 | x3: cont. 5 | x4: cont. 6 | x5: cont. 7 | x6: cont. 8 | x7: cont. 9 | x8: cont. 10 | x9: cont. 11 | x10: cont. 12 | 13 | -------------------------------------------------------------------------------- /docs/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2012, Daria Sorokina 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the organizations nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /docs/papers/BirdMining.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/BirdMining.pdf -------------------------------------------------------------------------------- /docs/papers/ChenDubrawskiSorokina.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/ChenDubrawskiSorokina.doc -------------------------------------------------------------------------------- /docs/papers/Interactions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/Interactions.pdf -------------------------------------------------------------------------------- /docs/papers/PlagiarismDetection_full.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/PlagiarismDetection_full.pdf -------------------------------------------------------------------------------- /docs/papers/ScalableGBFS.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/ScalableGBFS.pdf -------------------------------------------------------------------------------- /docs/papers/Similarity64.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/Similarity64.zip -------------------------------------------------------------------------------- /docs/papers/Wildlife.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/Wildlife.pdf -------------------------------------------------------------------------------- /docs/papers/a9ext_sigir16.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/a9ext_sigir16.pdf -------------------------------------------------------------------------------- /docs/papers/brain.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/brain.pdf -------------------------------------------------------------------------------- /docs/papers/chapter-featureeval.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/chapter-featureeval.pdf -------------------------------------------------------------------------------- /docs/papers/fslr.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/fslr.pdf -------------------------------------------------------------------------------- /docs/papers/groves.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/groves.pdf -------------------------------------------------------------------------------- /docs/papers/kddcup09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/kddcup09.pdf -------------------------------------------------------------------------------- /docs/papers/ranking_AG.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/ranking_AG.pdf -------------------------------------------------------------------------------- /docs/papers/rmbo_full.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/rmbo_full.pdf -------------------------------------------------------------------------------- /docs/papers/thesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/papers/thesis.pdf -------------------------------------------------------------------------------- /docs/ranking_AG.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariasor/TreeExtra/007c68c748f17e583a5665a01ff325afa321668e/docs/ranking_AG.pdf 
-------------------------------------------------------------------------------- /docs/styles.css: -------------------------------------------------------------------------------- 1 | 2 | BODY 3 | { 4 | COLOR: #999999; 5 | } 6 | 7 | 8 | IMG 9 | { 10 | border : 0 11 | } 12 | 13 | 14 | .BigText 15 | { 16 | FONT-SIZE: 48pt; 17 | FONT-FAMILY:Garamond,'Times New Roman'; 18 | font-style :italic; 19 | text-decoration : underline; 20 | font-weight :normal; 21 | color:#999999; 22 | } 23 | 24 | .BigTextNU 25 | { 26 | FONT-SIZE: 48pt; 27 | FONT-FAMILY:Garamond,'Times New Roman'; 28 | font-style :italic; 29 | text-decoration : none; 30 | font-weight :normal; 31 | color:#999999; 32 | } 33 | 34 | .TinyLink 35 | { 36 | text-decoration : none; 37 | FONT-FAMILY :Courier New; 38 | font-style :normal; 39 | font-weight :bold; 40 | font-size :12pt; 41 | color:Black; 42 | } 43 | 44 | A 45 | { 46 | text-decoration : underline; 47 | color: #707070; 48 | } 49 | 50 | .WhiteCell 51 | { 52 | border : solid 1px gray; 53 | background-color : White; 54 | padding : 10; 55 | font-family :Times New Roman; 56 | font-size :12pt; 57 | color: #606060; 58 | font-style:italic; 59 | text-align:justify; 60 | } 61 | .WhiteCell .contact 62 | { 63 | font-style:normal; 64 | } 65 | 66 | 67 | .ResumeWhiteCell 68 | { 69 | border : solid 1px gray; 70 | background-color : White; 71 | padding : 10; 72 | font-family :Times New Roman; 73 | font-size :12pt; 74 | color: #404040; 75 | text-align:justify; 76 | 77 | } 78 | 79 | .ResumeWhiteCell TABLE 80 | { 81 | background-color : White; 82 | padding : 10; 83 | font-family :Times New Roman; 84 | font-size :12pt; 85 | color: #404040; 86 | text-align:justify; 87 | } 88 | 89 | .DatesCell 90 | { 91 | font-weight: bold; 92 | 93 | 94 | } 95 | .DatesCell2 96 | { 97 | font-weight: bold; 98 | } 99 | 100 | .ResumeHeader2 101 | { 102 | font-family:Garamond, 'Times New Roman'; 103 | text-align : center; 104 | font-size :16pt; 105 | font-weight:bold; 106 | } 107 | .ResumeHeader3 108 | 
{ 109 | font-family:Garamond, 'Times New Roman'; 110 | text-align : center; 111 | font-size:20pt; 112 | font-weight:bold; 113 | } 114 | .ResumeTable 115 | { 116 | } 117 | .ResumeTable TD 118 | { 119 | PADDING-BOTTOM: 10px; 120 | } 121 | .Name 122 | { 123 | font-family:Garamond, 'Times New Roman'; 124 | font-size:18pt; 125 | font-style:italic; 126 | } 127 | .NameHeader 128 | { 129 | font-style:normal; 130 | font-family:Garamond, 'Times New Roman'; 131 | font-size:16pt; 132 | font-weight:bold; 133 | } 134 | .ButtonWhiteCell 135 | { 136 | background-color : White; 137 | font-family :Times New Roman; 138 | font-size :12pt; 139 | color: #505050; 140 | text-align:center; 141 | width:120; 142 | height:28; 143 | padding :0; 144 | border : solid 1px gray; 145 | } 146 | 147 | .ButtonWhiteCell2 148 | { 149 | background-color : White; 150 | font-family :Times New Roman; 151 | font-size :12pt; 152 | color: #505050; 153 | text-align:left; 154 | height:28; 155 | padding-left :10; 156 | padding-right :10; 157 | border : solid 1px gray; 158 | } 159 | .PhotoWhiteCell 160 | { 161 | background-color : White; 162 | font-family :Times New Roman; 163 | font-size :12pt; 164 | color: #505050; 165 | text-align:center; 166 | padding :15; 167 | border : solid 1px gray; 168 | } 169 | 170 | .BigPhotoWhiteCell 171 | { 172 | background-color : White; 173 | font-family :Times New Roman; 174 | font-style:italic; 175 | font-size :12pt; 176 | color: #707070; 177 | text-align:center; 178 | padding-left:40; 179 | padding-right:40; 180 | padding-top:40; 181 | border : solid 1px gray; 182 | } 183 | .MainText 184 | { 185 | padding-bottom :0; 186 | padding-top :0; 187 | line-height : 10px; 188 | text-decoration:none; 189 | } 190 | .SmallFont 191 | { 192 | font-size :8pt; 193 | color :Gray 194 | } 195 | .Date 196 | { 197 | font-weight:normal; 198 | FONT-FAMILY :Times New Roman; 199 | font-size :12pt; 200 | } 201 | .Button 202 | { 203 | width:100; 204 | height:30; 205 | text-align:center; 206 | 
background-color:#A0A0A0; 207 | } 208 | .Button A 209 | { 210 | color:White; 211 | text-decoration:none; 212 | font-size:16pt; 213 | font-family:Courier; 214 | font-weight:600; 215 | } 216 | .BigPushedButton 217 | { 218 | width:116; 219 | height:24; 220 | text-align:center; 221 | background-color:White; 222 | color:Gray; 223 | font-size:15pt; 224 | font-family:Courier; 225 | font-weight:600; 226 | } 227 | .BigUnderButton 228 | { 229 | width:120; 230 | height:30; 231 | text-align:center; 232 | background-color:Gray; 233 | } 234 | .Big2PushedButton 235 | { 236 | width:136; 237 | height:24; 238 | text-align:center; 239 | background-color:White; 240 | color:Gray; 241 | font-size:15pt; 242 | font-family:Courier; 243 | font-weight:600; 244 | } 245 | .Big2UnderButton 246 | { 247 | width:140; 248 | height:30; 249 | text-align:center; 250 | background-color:Gray; 251 | } 252 | .PushedButton 253 | { 254 | width:96; 255 | height:24; 256 | text-align:center; 257 | background-color:White; 258 | color:Gray; 259 | font-size:15pt; 260 | font-family:Courier; 261 | font-weight:600; 262 | } 263 | .UnderButton 264 | { 265 | width:100; 266 | height:30; 267 | text-align:center; 268 | background-color:Gray; 269 | } 270 | 271 | .SmallLettersButton 272 | { 273 | width:100; 274 | height:30; 275 | text-align:center; 276 | background-color:#A0A0A0; 277 | } 278 | .SmallLettersButton A 279 | { 280 | color:White; 281 | text-decoration:none; 282 | font-size:14pt; 283 | font-family:Courier; 284 | font-weight:600; 285 | } 286 | .BigButton 287 | { 288 | width:120; 289 | height:30; 290 | text-align:center; 291 | background-color:#A0A0A0; 292 | } 293 | .BigButton A 294 | { 295 | color:White; 296 | text-decoration:none; 297 | font-size:16pt; 298 | font-family:Courier; 299 | font-weight:600; 300 | } 301 | .Big2Button 302 | { 303 | width:140; 304 | height:30; 305 | text-align:center; 306 | background-color:#A0A0A0; 307 | } 308 | .Big2Button A 309 | { 310 | color:White; 311 | text-decoration:none; 312 | 
font-size:16pt; 313 | font-family:Courier; 314 | font-weight:600; 315 | } 316 | .ShortButton 317 | { 318 | width:60; 319 | height:30; 320 | text-align:center; 321 | background-color:#A0A0A0; 322 | } 323 | .ShortButton A 324 | { 325 | color:White; 326 | text-decoration:none; 327 | font-size:16pt; 328 | font-family:Courier; 329 | font-weight:600; 330 | } 331 | .ShortPushedButton 332 | { 333 | width:56; 334 | height:24; 335 | text-align:center; 336 | background-color:White; 337 | color:Gray; 338 | font-size:15pt; 339 | font-family:Courier; 340 | font-weight:600; 341 | } 342 | .ShortUnderButton 343 | { 344 | width:60; 345 | height:30; 346 | text-align:center; 347 | background-color:Gray; 348 | } 349 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | This BSD license refers to the whole TreeExtra package including standalone code for ag_predict_stream with the 2 | exception of the ThreadPool library, which is covered by LGPLv2.1 license. 3 | 4 | Copyright (c) Daria Sorokina, Xiaojie Wang, Yichen Zhou 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | * Neither the name of the organizations nor the 15 | names of its contributors may be used to endorse or promote products 16 | derived from this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY 19 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 22 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | rm -f shared/*.o 2 | rm -f AdditiveGroves/*.o 3 | rm -f BaggedTrees/*.o 4 | rm -f Visualization/*.o 5 | cd AdditiveGroves 6 | make --makefile Makefile 7 | cd ../BaggedTrees 8 | make --makefile Makefile 9 | cd ../Visualization 10 | make --makefile Makefile 11 | -------------------------------------------------------------------------------- /shared/ErrLogStream.h: -------------------------------------------------------------------------------- 1 | // ErrLogStream.h: implementation of ErrLogStream class and << operator 2 | // Redirects output to both cerr (console error output) and file log.txt 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | class ErrLogStream 10 | { 11 | }; 12 | 13 | template 14 | ErrLogStream& operator << (ErrLogStream& errlogout, T& data) 15 | { 16 | cerr << data; 17 | cerr.flush(); 18 | 19 | fstream fout; 20 | fout.open("log.txt", ios_base::out | ios_base::app); 21 | fout << data; 22 | fout.close(); 23 | return errlogout; 24 | } 25 | -------------------------------------------------------------------------------- 
/shared/INDdata.h: -------------------------------------------------------------------------------- 1 | // INDdata.h: interface for the INDdata class 2 | 3 | #pragma once 4 | #include "ItemInfo.h" 5 | 6 | class INDdata 7 | { 8 | public: 9 | //loads data into memory 10 | INDdata(const char* trainFName, const char* valFName, const char* testFName, 11 | const char* attrFName, bool doOut = true); 12 | 13 | //get functions for private members 14 | int getAttrN(){return attrN;} 15 | int getTrainN(){return trainN;} 16 | int getTarColNo(){return tarColNo;} 17 | int getTargets(doublev& targets, doublev& weights, DATA_SET dset); 18 | bool getHasWeights(){return weightColNo != -1;} 19 | bool getHasActiveMV(){return hasActiveMV;} 20 | bool useCoef(){return hasActiveMV || (weightColNo != -1);} 21 | int getColNo(int attrId){return aIdToColNo[attrId];} 22 | //Return data references to avoid copying large sized data 23 | doublev& getTrainTar() { return trainTar; } 24 | doublev& getTrainWt() { return trainWt; } 25 | floatvv& getTrain() { return train; } 26 | 27 | //untrivial get functions 28 | 29 | //gets attrID by its name 30 | int getAttrId(string attrName); 31 | 32 | //return name of the column (attribute or target) 33 | string colToName(int column); 34 | 35 | //gets a list of active attributes 36 | void getActiveAttrs(intv& attrs); 37 | 38 | //gets a value of a given attribute for a given case in a given data set 39 | double getValue(int itemNo, int attrId, DATA_SET dset); 40 | 41 | //returns the name of the attribute by its number 42 | string getAttrName(int attrId); 43 | 44 | //returns counts and quantile values 45 | int getQuantiles(int attrId, int& quantN, dipairv& valCounts); 46 | 47 | //returns std of response 48 | double getTarStD(DATA_SET ds); 49 | 50 | //"question" functions 51 | 52 | //checks if attribute is boolean 53 | bool boolAttr(int attrId); 54 | 55 | //checks if all target values in test set are valid 56 | bool hasTrueTest(); 57 | 58 | //checks if the attr 
number is valid and active 59 | bool isActive(int attrId); 60 | 61 | //action functions 62 | 63 | //deactivates the attribute 64 | void ignoreAttr(int attrId); 65 | 66 | //activates the attribute 67 | void useAttr(int attrId); 68 | 69 | //inserts a new data point into the data set 70 | int addTestItem(idpairv& values); 71 | 72 | //outputs a version of attribute file where only a predefined set of features is active 73 | void outAttr(string attrFName); 74 | 75 | //gets all values of two specific attributes in the validation data 76 | void getValues(int attr1No, int attr2No, ddpairv& values); 77 | 78 | //gets all values of a specific attribute in the validation set 79 | void getValues(int attrId, doublev& values); 80 | 81 | intset& getSplitAttrs() { return splitAttrs; } 82 | 83 | private: 84 | //gets a line of text, returns a vector with data points 85 | void readData(char* buf, streamsize buflen, floatv& retv, int retvlen); 86 | 87 | private: 88 | int attrN; //number of attributes 89 | int colN; //number of columns in the data file 90 | 91 | intv aIdToColNo; //attribute ids to column numbers 92 | intset boolAttrs; //boolean attributes 93 | intset nomAttrs; //nominal attributes 94 | //boolv rawNom; //boolean vector with the original number of columns, marks columns with nominal attributes 95 | intset ignoreAttrs; //attributes that should be ignored 96 | intset splitAttrs; //Attributes that are allowed to split nodes of FirTree (usually query-level features) 97 | 98 | boolv rawIgnore; //boolean vector with the original number of columns, marks columns with the attributes that should not be used 99 | stringv attrNames; //names of attributes 100 | string tarName; //name of the response attribute 101 | int tarColNo; //response column number 102 | int weightColNo; //weights column number 103 | 104 | int trainN; //number of data points in the train set 105 | floatvv train; //train set data w/o response 106 | doublev trainTar; //train set response 107 | doublev trainWt; 
//train set weights 108 | 109 | int validN; //number of data points in the validation set 110 | floatvv valid; //validation set data w/o response 111 | doublev validTar; //validation set response 112 | doublev validWt; //validation set weights 113 | 114 | int testN; //number of data points in the test set 115 | floatvv test; //test set data w/o response 116 | doublev testTar; //test set response 117 | doublev testWt; //test set weights 118 | 119 | bool hasMV; //data has missing values 120 | bool hasActiveMV; //data has missing values in active attributes 121 | 122 | }; 123 | -------------------------------------------------------------------------------- /shared/INDsample.h: -------------------------------------------------------------------------------- 1 | //INDsample.h: Consists of a state for the rand_r function and a bag of data 2 | // 3 | // (c) Xiaojie Wang 4 | 5 | #pragma once 6 | #include "INDdata.h" 7 | 8 | class INDsample 9 | { 10 | public: 11 | INDsample(INDdata& data); 12 | 13 | //get functions for private members 14 | double getBagV() { return bagV; } 15 | int getBagDataN() { return bootstrap.size(); } 16 | int getOutOfBag(intv& oobData, doublev& oobTar, doublev& oobWt); 17 | 18 | //nontrivial get functions 19 | //gets current bag of training data 20 | void getCurBag(ItemInfov& itemSet); 21 | int getCurBag(intv& bagData, doublev& bagTar, doublev& bagWt); 22 | 23 | //gets sorted indexes of current training data 24 | void getSortedData(fipairvv& sorted); 25 | 26 | //action functions 27 | //replaces bootstrap in the bag 28 | void newBag(void); 29 | 30 | //subsampling without replacement 31 | void newSample(int sampleN); 32 | 33 | //calculates and outputs correlation scores between active attributes based on the training set 34 | void correlations(string trainFName); 35 | 36 | private: 37 | //create versions of bootstrap data sorted by active continuous attributes 38 | void sortItems(); 39 | 40 | private: 41 | unsigned int state; // Avoid data sampling of a 
thread affecting that of other threads 42 | // The same state yields the same sequence of sampled bags 43 | INDdata& data; // Data access reference 44 | 45 | // The following five variables come from the INDdata class 46 | intv bootstrap; //indexes of data points currently in the bag, can be repeating 47 | int oobN; //number of out-of-bag data points 48 | intv oobData; //indexes of out-of-bag data points 49 | 50 | double bagV; //sum of weights 51 | 52 | fipairvv sortedItems; //several copies of sorted data points in the bag 53 | //separate vector for sorting by each attribute 54 | //each data point represented as (id, attrvalue) pair 55 | }; 56 | -------------------------------------------------------------------------------- /shared/ItemInfo.h: -------------------------------------------------------------------------------- 1 | //ItemInfo.h: ItemInfo structure 2 | 3 | #pragma once 4 | #include "definitions.h" 5 | 6 | //Information about a case in a tree node trainset subset or prediction of the leaf 7 | struct ItemInfo 8 | { 9 | ItemInfo(){key=0;coef=1;response=0;} 10 | 11 | int key; //case id 12 | double coef; //case belongs to the node with coefficient coef. 
0 ItemInfov; 17 | -------------------------------------------------------------------------------- /shared/LogStream.cpp: -------------------------------------------------------------------------------- 1 | // LogStream.cpp: implementation of LogStream class and << operator 2 | // Redirects output to both console and file log.txt 3 | 4 | #include "LogStream.h" 5 | #include "definitions.h" 6 | 7 | //static variable showing if log messages are sent into the standard output in addition to log files 8 | bool LogStream::doOut = true; 9 | 10 | //static initialization, needs to be called once in the whole program 11 | void LogStream::init(bool doOut_in) 12 | { 13 | doOut = doOut_in; 14 | 15 | //if log.txt exists, append its content to log.archive.txt 16 | fstream foldlog; 17 | foldlog.open("log.txt", ios_base::in); 18 | if(foldlog) 19 | { 20 | fstream farchive; 21 | farchive.open("log.archive.txt", ios_base::out | ios_base::app); 22 | farchive << foldlog.rdbuf(); 23 | } 24 | 25 | //open clean log.txt 26 | fstream fout; 27 | fout.open("log.txt", ios_base::out); 28 | if(!fout) 29 | cout << "\nWARNING: failed to open log file log.txt\n"; 30 | fout.close(); 31 | } 32 | 33 | 34 | -------------------------------------------------------------------------------- /shared/LogStream.h: -------------------------------------------------------------------------------- 1 | // LogStream.h: declaration of LogStream class and implementation of LogStream << operator 2 | // Redirects output to both console and file log.txt 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include "definitions.h" 10 | 11 | class LogStream 12 | { 13 | public: 14 | //Clears or creates log.txt file. Should be called once in the whole program. 
15 | static void init(bool doOut_in); 16 | static bool doOut; //turns on/off console output 17 | }; 18 | 19 | template 20 | LogStream& operator << (LogStream& logcout, T data) 21 | { 22 | if(LogStream::doOut) 23 | { 24 | cout << data; 25 | cout.flush(); 26 | } 27 | 28 | fstream fout; 29 | fout.open("log.txt", ios_base::out | ios_base::app); 30 | 31 | fout << data; 32 | fout.close(); 33 | return logcout; 34 | } 35 | -------------------------------------------------------------------------------- /shared/SplitInfo.cpp: -------------------------------------------------------------------------------- 1 | //SplitInfo.cpp: implementation of SplitInfo structure 2 | 3 | #include "SplitInfo.h" 4 | #include "math.h" 5 | 6 | //default constructor 7 | SplitInfo::SplitInfo():divAttr(-1) 8 | { 9 | missingL = QNAN; 10 | border = QNAN; 11 | } 12 | 13 | //init constructor 14 | //missingL sometimes is set later 15 | SplitInfo::SplitInfo(int attr, double point, double miss): 16 | divAttr(attr), border(point), missingL(miss) 17 | { 18 | 19 | } 20 | 21 | 22 | //in: value of the split attribute for the input case 23 | //out: value of the coefficient for the case regarding going down the left branch 24 | double SplitInfo::leftCoef(double value) 25 | { 26 | //left coefficient for missing values is defined by missingL 27 | if(isnan(value)) 28 | return missingL; 29 | 30 | //absence of border value indicates special split - non-missing vs missing. 
31 | //Missing values are treated in the previous if, so this can only be a non-missing value that should go left 32 | if(isnan(border)) 33 | return 1; 34 | 35 | //the rest is standard crisp split 36 | if(value <= border) 37 | return 1; 38 | else //if(value > border) 39 | return 0; 40 | } 41 | 42 | -------------------------------------------------------------------------------- /shared/SplitInfo.h: -------------------------------------------------------------------------------- 1 | //SplitInfo.h: SplitInfo structure interface 2 | 3 | #pragma once 4 | #include "definitions.h" 5 | 6 | //data structure describing how the node is split into two subnodes 7 | struct SplitInfo 8 | { 9 | public: 10 | //default constructor 11 | SplitInfo(); 12 | 13 | //init constructor 14 | SplitInfo(int attr, double point, double miss = 0.5); 15 | 16 | //returns left branch coefficient given the value of the split attribute 17 | double leftCoef(double value); 18 | 19 | public: 20 | int divAttr; //split attribute id 21 | double missingL; //proportion of missing values going to the left 22 | double border; //split point 23 | }; 24 | 25 | typedef vector SplitInfov; 26 | -------------------------------------------------------------------------------- /shared/TreeNode.h: -------------------------------------------------------------------------------- 1 | // TreeNode.h: interface for the CTreeNode class. 
2 | // Represents a node of a regression (decision) tree 3 | // Contains functions used during training and prediction stages 4 | // The performance metric used by the tree is Root Mean Squared Error (RMSE) 5 | 6 | #pragma once 7 | 8 | #include "INDdata.h" 9 | #include "INDsample.h" 10 | #include "SplitInfo.h" 11 | 12 | //Node of a regression tree 13 | class CTreeNode 14 | { 15 | private: 16 | static INDdata* pData; 17 | public: 18 | //initialize static data pointer 19 | static void setData(INDdata& data){pData = &data;} 20 | 21 | public: 22 | //constructor 23 | CTreeNode(); 24 | 25 | //copy constructor 26 | CTreeNode(const CTreeNode& original); 27 | 28 | //destructor 29 | virtual ~CTreeNode(); 30 | 31 | //assignment operator 32 | CTreeNode& operator=(const CTreeNode& rhs); 33 | 34 | //get functions 35 | int getDivAttr() {return splitting.divAttr;} 36 | double getThresh() {return splitting.border;} 37 | double getResp() {return (*pItemSet)[0].response;} //should be applied to leaves only 38 | double getNodeV(); 39 | 40 | double getEntropy(int attrNo); //get entropy of this feature in this node 41 | 42 | //initializes fresh root 43 | void setRoot(INDsample& sample); 44 | 45 | //changes train set responses to residuals 46 | void resetRoot(doublev& othpreds); 47 | 48 | //deletes an attribute from the internal structures 49 | void delAttr(int attrNo); 50 | 51 | //checks if a node is a leaf 52 | bool isLeaf() {return left == NULL;} 53 | 54 | //sends a test case down the tree (used in generating prediction for the test case) 55 | void traverse(int itemNo, double coef, double& ltCoef, double& rtCoef, DATA_SET dset); 56 | 57 | //splits the node; grows two offspring 58 | bool split(double alpha, INDsample& sample, double* pEntropy = NULL); 59 | 60 | //saves the node into a binary file 61 | void save(fstream& fsave); 62 | 63 | //loads the node from a binary file 64 | bool load(fstream& fload); 65 | 66 | 67 | private: 68 | //delete a subtree 69 | void del(); 70 | 71 | 
//returns several summaries of the prediction values set in this node 72 | bool getStats(double& nodeV, double& nodeSum); 73 | 74 | //cleans training data out of a leaf 75 | void makeLeaf(double nodeMean); 76 | 77 | //finds and sets a splitting info with the best MSE 78 | bool setSplit(double nodeV, double nodeSum); 79 | 80 | //finds and sets a splitting info with the best MSE when weights are present in the data 81 | bool setSplitW(double nodeV, double nodeSum); 82 | 83 | //finds and sets a splitting info with the best MSE when missing values and possibly weights are present in the data 84 | bool setSplitMV(double nodeV, double nodeSum); 85 | 86 | //evaluates boolean split 87 | double evalBool(SplitInfo& canSplit, double nodeV, double nodeSum); 88 | 89 | //evaluates boolean split with weights present 90 | double evalBoolW(SplitInfo& canSplit, double nodeV, double nodeSum); 91 | 92 | //evaluates boolean split when missing values present in the data 93 | double evalBoolMV(SplitInfo& canSplit, double nodeV, double nodeSum, double missV, double missSum); 94 | 95 | public: 96 | CTreeNode* left; //pointer to the left child 97 | CTreeNode* right; //pointer to the right child 98 | 99 | private: 100 | ItemInfov* pItemSet; //subset of the training set that belongs to the node during training 101 | fipairvv* pSorted; //current itemset indexes sorted by value of attribute 102 | intv* pAttrs; //set of valid attributes in the node 103 | SplitInfo splitting; //split (attribute, split point, proportion for missing values) 104 | 105 | }; 106 | 107 | typedef vector CTreeNodev; 108 | typedef pair nodecoefp; 109 | typedef pair nodeip; 110 | typedef stack nodehstack; 111 | -------------------------------------------------------------------------------- /shared/definitions.h: -------------------------------------------------------------------------------- 1 | // definitions.h: constants, enumerators, typedefs and macros 2 | 3 | #pragma once 4 | #pragma warning(disable : 4996) 5 | 6 | 7 
| #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #ifdef __APPLE__ 15 | #include 16 | #endif 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | using namespace std; 23 | 24 | typedef set intset; 25 | typedef map idmap; 26 | typedef map ddmap; 27 | 28 | typedef vector intv; 29 | typedef vector intvv; 30 | typedef vector intvvv; 31 | typedef vector doublev; 32 | typedef vector doublevv; 33 | typedef vector doublevvv; 34 | typedef vector doublevvvv; 35 | typedef vector floatv; 36 | typedef vector floatvv; 37 | typedef vector stringv; 38 | typedef vector boolv; 39 | typedef vector boolvv; 40 | 41 | typedef pair dipair; 42 | typedef pair fipair; 43 | typedef pair idpair; 44 | typedef pair ddpair; 45 | typedef pair iipair; 46 | typedef pair bbpair; 47 | typedef pair bipair; 48 | typedef pair sspair; 49 | typedef pair ssdtriple; 50 | typedef pair dddtriple; 51 | 52 | typedef vector iipairv; 53 | typedef vector dipairv; 54 | typedef vector fipairv; 55 | typedef vector idpairv; 56 | typedef vector ddpairv; 57 | typedef vector dipairvv; 58 | typedef vector fipairvv; 59 | typedef vector bbpairv; 60 | typedef vector ssdtriplev; 61 | typedef vector dddtriplev; 62 | 63 | typedef numeric_limits flim; 64 | 65 | enum DATA_SET 66 | { 67 | TRAIN = 1, 68 | VALID = 2, 69 | TEST = 3 70 | }; 71 | 72 | enum TE_ERROR 73 | { 74 | TREE_LOAD_ERR = 1, 75 | OPEN_ATTR_ERR = 2, 76 | MULT_CLASS_ERR = 3, 77 | NO_CLASS_ERR = 4, 78 | OPEN_TRAIN_ERR = 5, 79 | OPEN_VALID_ERR = 6, 80 | OPEN_TEST_ERR = 7, 81 | ATTR_ID_ERR = 8, 82 | LONG_LINE_ERR = 9, 83 | VALID_EMPTY_ERR = 10, 84 | ROC_ERR = 11, 85 | ATTR_DATA_MISMATCH_L_ERR = 12, 86 | MODEL_ATTR_MISMATCH_ERR = 13, 87 | ATTR_NAME_ERR = 14, 88 | NO_EFFECT_ERR = 15, 89 | MODEL_ERR = 16, 90 | EMPTY_MODEL_NAME_ERR = 17, 91 | ATTR_TYPE_ERR = 18, 92 | ATTR_NOT_BOOL_ERR = 19, 93 | TREE_WRITE_ERR = 20, 94 | TRAIN_EMPTY_ERR = 21, 95 | ATTR_NAME_DEF_ERR = 22, 96 | NOM_ACTIVE_ERR = 23, 97 | ATTR_DATA_MISMATCH_G_ERR = 24, 
98 | NUMERIC_ARG_ERR = 25, 99 | ROC_FLAT_ERR = 26, 100 | OPEN_OUT_ERR = 27, 101 | MV_CLASS_TRAIN_ERR = 28, 102 | MV_CLASS_VALID_ERR = 29, 103 | NON_NUMERIC_VALUE_ERR = 30, 104 | CORR_MV_ERR = 31, 105 | DUPLICATE_ATTRIBUTES_ERR = 32 106 | }; 107 | 108 | //this enum has to be in the general definition file, because it is a part of a model file, and all model 109 | //files should be compatible 110 | enum AG_TRAIN_MODE 111 | { 112 | FAST = 201, 113 | SLOW = 202, 114 | LAYERED = 203 115 | }; 116 | 117 | 118 | 119 | #define VERSION "2.7.0" //release version 120 | #define LINE_LEN 500000 //maximum length of line in the input file 121 | #define QNAN flim::quiet_NaN() 122 | 123 | #if defined(_MSC_VER) && (_MSC_VER <= 1600) 124 | #define isnan(a) _isnan(a) //for old versions of Visual Studio that did not have isnan 125 | #endif 126 | -------------------------------------------------------------------------------- /shared/functions.h: -------------------------------------------------------------------------------- 1 | // functions.h: declarations of global functions 2 | 3 | #pragma once 4 | #include "definitions.h" 5 | 6 | //deletes spaces from the beginning and from the end of the string 7 | string trimSpace(string& str); 8 | 9 | //calculates root mean squared error 10 | double rmse(doublev& predicts, doublev& realvals, doublev& weights); 11 | 12 | //removes an element from a vector 13 | int erasev(intv* pVec, int value); 14 | 15 | //erase first occurrence of item from vector, return its number in variable no 16 | intv::iterator erasev(intv* pVec, int item, int& no); 17 | 18 | //returns a place of the first significant digit after zero 19 | int sDigit(double number); 20 | 21 | //rounds a positive integer to the order of two important digits 22 | int roundInt(int number); 23 | 24 | //rounds up alpha to the closest appropriate value 25 | double adjustAlpha(double alpha, double trainV); 26 | 27 | //converts valid values of alpha to string 28 | string alphaToStr(double alpha); 29 
| 30 | //checks if more bagging will benefit the performance 31 | bool moreBag(doublev bagPerf); 32 | 33 | //extends fstream::getline with check on exceeding the buffer size 34 | std::streamsize getLineExt(fstream& fin, char* buf); 35 | 36 | //outputs error messages for shared TreeExtra errors 37 | void te_errMsg(TE_ERROR err); 38 | 39 | //Expands error messages for std::exception 40 | void exception_errMsg(string& errstr); 41 | 42 | //calculates probabilistic roc - response can be any probability between 0 and 1 43 | //when response values are 0/1, behaves like a standard roc 44 | double roc(doublev& preds, doublev& tars, doublev& weights); 45 | 46 | //inserts suffix into a string (usually file name) 47 | string insertSuffix(string fileName, string suffix); 48 | 49 | //returns the part of the string (usually a file name) before the last dot 50 | string beforeLastDot(string fileName); 51 | 52 | //checks if the first set is a subset of the second set 53 | bipair isSubset(intset& set1, intset& set2); 54 | 55 | //checks if the element is in the array 56 | bool isIn(stringv& v, string str); 57 | 58 | //converts string to int, throws error if the string is unconvertible 59 | int atoiExt(char* str); 60 | 61 | //converts string to double, throws error if the string is unconvertible 62 | double atofExt(char* str); 63 | 64 | //returns difference, unless it is due to the rounding error 65 | double diff10d(double d1, double d2); 66 | 67 | //returns random double between 0 and 1 68 | double rand_coef(); 69 | 70 | //Make the rand_coef function thread-safe 71 | double rand_coef(unsigned int& state); 72 | 73 | //less function with NaN greater than numbers 74 | bool lessNaN(double i, double j); 75 | 76 | //less function with NaN greater than numbers for pairs 77 | bool lessNaNP(ddpair p1, ddpair p2); 78 | 79 | //equals function taking into account NaN 80 | bool equalsNaN(double i, double j); 81 | 82 | //converts double to string, NaN to question mark 83 | string ftoaExt(double d); 
84 | 85 | //equals function for doubles, takes round-off errors into account 86 | bool eqDouble(double i, double j); 87 | 88 | //equals or less function for doubles, takes round-off errors into account 89 | bool leDouble(double i, double j); 90 | 91 | //less function for doubles, takes round-off errors into account 92 | bool ltDouble(double i, double j); 93 | 94 | //greater function comparing by the second item 95 | bool gtSecond(idpair p1, idpair p2); 96 | 97 | //less function comparing by the second item 98 | bool ltSecond(fipair p1, fipair p2); 99 | 100 | //greater function comparing by the absolute value of the third item 101 | bool gtAbsThird(ssdtriple t1, ssdtriple t2); 102 | 103 | //converts number to string 104 | std::string itoa(int value, int base); 105 | -------------------------------------------------------------------------------- /win/ag_addbag/ag_addbag.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {1C583D69-00D7-4BD3-B877-0F83E6531F33} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | Console 52 | 53 | 54 | 55 | 56 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /win/ag_expand/ag_expand.vcxproj: 
-------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {2F07B668-F45A-4BA9-8832-D1F44BCDDD8D} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | 52 | 53 | 54 | 55 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 56 | 57 | 58 | Console 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /win/ag_fs/ag_fs.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {A659907A-4258-4307-BC0A-D409C6D83562} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | Console 52 | 53 | 54 | 55 | 56 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 57 | 58 | 59 | Console 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 
87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /win/ag_interactions/ag_interactions.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {2B18A9BC-6D33-4D98-8D34-C06A293B1DAC} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | 52 | 53 | 54 | 55 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /win/ag_merge/ag_merge.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {19083E48-3D97-461C-A182-B19105BB31EF} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | 52 | 53 | 54 | 55 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 
56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /win/ag_mergepreds/ag_mergepreds.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {F3F00B74-59C8-4872-BA48-53FC9EF7C56C} 15 | ag_mergepreds 16 | 10.0.16299.0 17 | 18 | 19 | 20 | Application 21 | true 22 | MultiByte 23 | v141 24 | 25 | 26 | Application 27 | false 28 | true 29 | MultiByte 30 | v141 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(SolutionDir)/../win/$(Configuration)\ 44 | 45 | 46 | $(SolutionDir)/../win/$(Configuration)\ 47 | 48 | 49 | 50 | Level3 51 | Disabled 52 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 53 | 54 | 55 | true 56 | 57 | 58 | 59 | 60 | Level3 61 | MaxSpeed 62 | true 63 | true 64 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 65 | 66 | 67 | false 68 | true 69 | true 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /win/ag_nway/ag_nway.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {44CF4107-68B2-4F00-BDB0-ECDBB8478367} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | 
../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | 52 | 53 | 54 | 55 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /win/ag_predict/ag_predict.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {083EF393-CD75-4860-91A9-D0DACCF12334} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | false 25 | v141 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | <_ProjectFileVersion>10.0.30319.1 39 | $(SolutionDir)/../win/$(Configuration)\ 40 | $(Configuration)\ 41 | $(SolutionDir)/../win/$(Configuration)\ 42 | $(Configuration)\ 43 | 44 | 45 | 46 | Disabled 47 | false 48 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 49 | EditAndContinue 50 | 51 | 52 | true 53 | 54 | 55 | 56 | 57 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /win/ag_save/ag_save.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {0F73F967-80AA-4B09-900D-B3F29E3343B2} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | 
$(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | 52 | 53 | 54 | 55 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /win/ag_train/ag_train.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {6BCCC77B-68DC-4932-A59C-2673C1AFA640} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | NotSet 25 | v141 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | <_ProjectFileVersion>10.0.30319.1 39 | $(SolutionDir)/../win/$(Configuration)\ 40 | $(Configuration)\ 41 | $(SolutionDir)/../win/$(Configuration)\ 42 | $(Configuration)\ 43 | 44 | 45 | 46 | Disabled 47 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 48 | EditAndContinue 49 | 50 | 51 | true 52 | Console 53 | 54 | 55 | 56 | 57 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 58 | 59 | 60 | Console 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /win/bt_predict/bt_predict.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {2FB0A616-9772-42F9-8356-242C04DB2464} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 
25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | Disabled 46 | ../../shared;../../BaggedTrees;%(AdditionalIncludeDirectories) 47 | EditAndContinue 48 | 49 | 50 | true 51 | Console 52 | 53 | 54 | 55 | 56 | ../../shared;../../BaggedTrees;%(AdditionalIncludeDirectories) 57 | 58 | 59 | Console 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /win/bt_train/bt_train.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {5146957E-98AD-46DB-AA17-EFD5F21A963A} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | MaxSpeed 46 | ../../shared;../../BaggedTrees;%(AdditionalIncludeDirectories) 47 | ProgramDatabase 48 | 49 | 50 | true 51 | Console 52 | 53 | 54 | 55 | 56 | ../../shared;../../BaggedTrees;%(AdditionalIncludeDirectories) 57 | EditAndContinue 58 | Disabled 59 | 60 | 61 | Console 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /win/bt_train/log.txt: -------------------------------------------------------------------------------- 1 | Usage: bt_train -t _train_set_ -v 
_validation_set_ -r _attr_file_ [-a _alpha_value_] [-b _bagging_iterations_] [-i _init_random_] [-m _model_file_name_] [-k _attributes_to_leave_] [-c rms|roc] [-l log|nolog] | -version 2 | Usage: bt_train -t _train_set_ -v _validation_set_ -r _attr_file_ [-a _alpha_value_] [-b _bagging_iterations_] [-i _init_random_] [-m _model_file_name_] [-k _attributes_to_leave_] [-c rms|roc] [-l log|nolog] | -version 3 | Usage: bt_train -t _train_set_ -v _validation_set_ -r _attr_file_ [-a _alpha_value_] [-b _bagging_iterations_] [-i _init_random_] [-m _model_file_name_] [-k _attributes_to_leave_] [-c rms|roc] [-l log|nolog] | -version 4 | Usage: bt_train -t _train_set_ -v _validation_set_ -r _attr_file_ [-a _alpha_value_] [-b _bagging_iterations_] [-i _init_random_] [-m _model_file_name_] [-k _attributes_to_leave_] [-c rms|roc] [-l log|nolog] | -version 5 | Usage: bt_train -t _train_set_ -v _validation_set_ -r _attr_file_ [-a _alpha_value_] [-b _bagging_iterations_] [-i _init_random_] [-m _model_file_name_] [-k _attributes_to_leave_] [-c rms|roc] [-l log|nolog] | -version 6 | -------------------------------------------------------------------------------- /win/gbt_train/gbt_train.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | {00D8D791-4CD2-424C-B92C-65BCF0167D0A} 41 | gbt_train 42 | 10.0.16299.0 43 | 44 | 45 | 46 | Application 47 | true 48 | v141 49 | MultiByte 50 | 51 | 52 | Application 53 | false 54 | v141 55 | true 56 | MultiByte 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | Level3 72 | Disabled 73 | true 74 | ../../shared;../../BaggedTrees;%(AdditionalIncludeDirectories) 75 | 76 | 77 | true 78 | Console 79 | 80 | 81 | 82 | 83 | Level3 84 | MaxSpeed 85 | true 86 | true 87 | true 88 | 
../../shared;../../BaggedTrees;%(AdditionalIncludeDirectories) 89 | true 90 | 91 | 92 | true 93 | true 94 | true 95 | Console 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /win/gbt_train/gbt_train.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files 29 | 30 | 31 | Source Files 32 | 33 | 34 | Source Files 35 | 36 | 37 | Source Files 38 | 39 | 40 | Source Files 41 | 42 | 43 | Source Files 44 | 45 | 46 | 47 | 48 | Header Files 49 | 50 | 51 | Header Files 52 | 53 | 54 | Header Files 55 | 56 | 57 | Header Files 58 | 59 | 60 | Header Files 61 | 62 | 63 | Header Files 64 | 65 | 66 | Header Files 67 | 68 | 69 | Header Files 70 | 71 | 72 | Header Files 73 | 74 | 75 | Header Files 76 | 77 | 78 | Header Files 79 | 80 | 81 | Header Files 82 | 83 | 84 | Header Files 85 | 86 | 87 | -------------------------------------------------------------------------------- /win/vis_correlations/vis_correlations.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | {C0F27E3C-2533-456D-897D-DDDE9A68A679} 24 | viscorrelations 25 | 10.0.16299.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | MultiByte 33 | 34 | 35 | Application 36 | false 37 | v141 38 | true 39 | MultiByte 40 | 41 | 42 | Application 43 | true 44 | v141 45 | MultiByte 
46 | 47 | 48 | Application 49 | false 50 | v141 51 | true 52 | MultiByte 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | $(SolutionDir)/../win/$(Configuration)\ 74 | $(VC_IncludePath);$(WindowsSDK_IncludePath); 75 | 76 | 77 | $(SolutionDir)/../win/$(Configuration)\ 78 | $(VC_IncludePath);$(WindowsSDK_IncludePath); 79 | 80 | 81 | 82 | Level3 83 | Disabled 84 | true 85 | true 86 | ../../shared; 87 | 88 | 89 | Console 90 | 91 | 92 | 93 | 94 | Level3 95 | Disabled 96 | true 97 | true 98 | 99 | 100 | 101 | 102 | Level3 103 | MaxSpeed 104 | true 105 | true 106 | true 107 | true 108 | ../../shared; 109 | 110 | 111 | true 112 | true 113 | Console 114 | 115 | 116 | 117 | 118 | Level3 119 | MaxSpeed 120 | true 121 | true 122 | true 123 | true 124 | 125 | 126 | true 127 | true 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /win/vis_correlations/vis_correlations.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files 29 | 30 | 31 | Source Files 32 | 33 | 34 | 35 | 36 | Header Files 37 | 38 | 39 | Header Files 40 | 41 | 42 | Header Files 43 | 44 | 45 | Header Files 46 | 47 | 48 | Header Files 49 | 50 | 51 | Header Files 52 | 53 | 54 | -------------------------------------------------------------------------------- /win/vis_effect/vis_effect.vcxproj: 
-------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {FAC1BA2B-DE36-47BD-9CBB-EDB34962D8BA} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 46 | EditAndContinue 47 | Disabled 48 | 49 | 50 | Console 51 | true 52 | 53 | 54 | 55 | 56 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 57 | 58 | 59 | Console 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /win/vis_iplot/vis_iplot.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {F2EF2C55-F16C-4D7E-A9FB-16599193F316} 15 | 10.0.16299.0 16 | 17 | 18 | 19 | Application 20 | v141 21 | 22 | 23 | Application 24 | v141 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | <_ProjectFileVersion>10.0.30319.1 38 | $(SolutionDir)/../win/$(Configuration)\ 39 | $(Configuration)\ 40 | $(SolutionDir)/../win/$(Configuration)\ 41 | $(Configuration)\ 42 | 43 | 44 | 45 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 46 | Disabled 47 | 48 | 49 | Console 50 | true 51 | 52 | 53 | 54 | 55 | ../../AdditiveGroves;../../shared;%(AdditionalIncludeDirectories) 56 | 57 | 58 | Console 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 
86 | 87 | 88 | 89 | --------------------------------------------------------------------------------