├── .gitattributes ├── .gitignore ├── readme.md └── rgf.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Regularized Greedy Forest Wrappers 2 | 3 | First version for a toy Scikit/learn API compatible wrapper for Regularized Greedy Forests [Johnson & Zhang, 2014] 4 | 5 | ## 
Usage 6 | 7 | ### Classification 8 | 9 | **RegularizedGreedyForestClassifier(verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model")** 10 | 11 | Parameter|Description 12 | ---|--- 13 | verbose|Int. Verbosity of the classifier. *Default=0* 14 | max_leaf|Int. Max number of leaves to create before halting. *Default=500* 15 | test_interval|Int. Save a model each time roughly this many leaves have been added. *Default=100* 16 | algorithm|String. Any of `RGF` (RGF with L2 regularization), `RGF_Opt` (RGF with min-penalty regularization), `RGF_Sib` (RGF with min-penalty regularization with sum-to-zero sibling constraints). *Default=RGF* 17 | loss|String. Any of `LS` (Least squares), `Expo` (Exponential), `Log` (Logistic). *Default=LS* 18 | l2|Float. Amount of L2 regularization. `1.0`, `0.1` and `0.01` are sane values. *Default=1.0* 19 | 20 | ### Regression 21 | 22 | **RegularizedGreedyForestRegressor(verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model")** 23 | 24 | Parameter|Description 25 | ---|--- 26 | verbose|Int. Verbosity of the regressor. *Default=0* 27 | max_leaf|Int. Max number of leaves to create before halting. *Default=500* 28 | test_interval|Int. Save a model each time roughly this many leaves have been added. *Default=100* 29 | algorithm|String. Any of `RGF` (RGF with L2 regularization), `RGF_Opt` (RGF with min-penalty regularization), `RGF_Sib` (RGF with min-penalty regularization with sum-to-zero sibling constraints). *Default=RGF* 30 | loss|String. Any of `LS` (Least squares), `Expo` (Exponential), `Log` (Logistic). *Default=LS* 31 | l2|Float. Amount of L2 regularization. `1.0`, `0.1` and `0.01` are sane values. 
"""
__Author__: Triskelion

A small toy Python wrapper for Regularized Greedy Forests

Limitation of Liability. In no event shall Author be liable to you or any
party related to you for any indirect, incidental, consequential, special,
exemplary, or punitive damages or lost profits, even if Author has been advised
of the possibility of such damages. In any event, Author's total aggregate
liability to you for all damages of every kind and type (regardless of whether
based in contract or tort) shall not exceed the purchase price of the product.
"""

## Dependencies ###############################################
import os
import subprocess
from glob import glob
import numpy as np
import pandas as pd

## Edit this ##################################################

#Location of the RGF executable
loc_exec = "c:\\python64\\rgf\\bin\\rgf.exe"

#Location of a temporary directory (has to exist)
loc_temp = "rgf\\temp3"


def _sigmoid(scores):
    """Map raw scores to the (0, 1) range with the logistic function."""
    return 1.0 / (1.0 + np.exp(-np.asarray(scores, dtype=float)))


def _is_positive_label(value):
    """Return True when a target value denotes the positive class (equals 1).

    The comparison is numeric so that 1, 1.0, "1", "+1" and True all count as
    positive; anything that is not a number is treated as negative.
    """
    try:
        return float(value) == 1.0
    except (TypeError, ValueError):
        return False


class _RGFBase(object):
    """Shared plumbing for the RGF wrappers.

    Holds the hyper-parameters, writes the data files into the temporary
    directory, launches the RGF executable and removes the files afterwards.
    """

    #Names of the files exchanged with the executable inside loc_temp
    _TRAIN_X = "train.data.x"
    _TRAIN_Y = "train.data.y"
    _TEST_X = "test.data.x"
    _PREDICTIONS = "predictions.txt"

    #Bare flags placed in the train command right after the optional "Verbose"
    _extra_train_flags = ()

    def __init__(self, verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model"):
        self.verbose = verbose
        self.max_leaf = max_leaf
        self.algorithm = algorithm
        self.loss = loss
        self.test_interval = test_interval
        self.prefix = prefix
        self.l2 = l2
        #Always store both locations: leaving them unset when the path is
        #missing only postponed the failure to an AttributeError later on.
        self.loc_exec = loc_exec
        self.loc_temp = loc_temp
        if not os.path.exists(loc_exec):
            print("Warning: Location to RGF executable not found or not correctly set:\n\t%s\n"%loc_exec)
        if not os.path.exists(loc_temp):
            print("Warning: Location to a temporary directory does not exist:\n\t%s\n"%loc_temp)

    def get_params(self, deep=True):
        """Return the hyper-parameters as a dict.

        `deep` is accepted for scikit-learn compatibility and has no effect.
        """
        return {
            "verbose": self.verbose,
            "max_leaf": self.max_leaf,
            "algorithm": self.algorithm,
            "loss": self.loss,
            "test_interval": self.test_interval,
            "prefix": self.prefix,
            "l2": self.l2,
        }

    def _temp_path(self, name):
        """Return the path of `name` inside this instance's temp directory."""
        return os.path.join(self.loc_temp, name)

    def _run(self, action, params):
        """Run `<loc_exec> <action> <comma-joined params>` and print its output.

        The command is passed as an argument list without a shell, so paths
        containing spaces survive; stderr is merged into the captured stdout.
        Raises OSError if the executable cannot be launched.
        """
        argv = [self.loc_exec, action, ",".join(params)]
        proc = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        captured = proc.communicate()[0]
        if captured:
            print(captured.decode("utf-8", "replace"))
        if proc.returncode != 0:
            print("Warning: RGF %s exited with status %s" % (action, proc.returncode))

    def _train(self, X):
        """Write the features and run the train action.

        The target file must already have been written by the caller.
        """
        #Store the train set into RGF format
        np.savetxt(self._temp_path(self._TRAIN_X), X, delimiter=' ', fmt="%s")

        #format train command
        params = []
        if self.verbose > 0:
            params.append("Verbose")
        params.extend(self._extra_train_flags)
        params.append("train_x_fn=%s"%self._temp_path(self._TRAIN_X))
        params.append("train_y_fn=%s"%self._temp_path(self._TRAIN_Y))
        params.append("algorithm=%s"%self.algorithm)
        params.append("loss=%s"%self.loss)
        params.append("max_leaf_forest=%s"%self.max_leaf)
        params.append("test_interval=%s"%self.test_interval)
        params.append("reg_L2=%s"%self.l2)
        params.append("model_fn_prefix=%s"%self._temp_path(self.prefix))

        #train
        self._run("train", params)

    def _model_rank(self, path):
        """Sort key for model files: numeric suffix first, then the path."""
        suffix = os.path.basename(path)[len(self.prefix) + 1:]
        return (int(suffix) if suffix.isdigit() else -1, path)

    def _latest_model(self):
        """Return the path of the most recent model file.

        Only "<prefix>-*" is matched, so the data files are never mistaken
        for a model. Raises IndexError (the type the previous implementation
        raised) when no model file exists.
        """
        pattern = self._temp_path(self.prefix + "-*")
        candidates = glob(pattern)
        if not candidates:
            raise IndexError("No trained RGF model matching %r found; call fit() first" % pattern)
        return max(candidates, key=self._model_rank)

    def _clean(self):
        """Remove the data, prediction and model files from the temp directory."""
        scratch = (self._TRAIN_X, self._TRAIN_Y, self._TEST_X, self._PREDICTIONS)
        for path in glob(self._temp_path("*")):
            #Match on the file name only, never on the directory part
            name = os.path.basename(path)
            if os.path.isfile(path) and (name in scratch or name.startswith(self.prefix + "-")):
                os.remove(path)

    def _predict_raw(self, X, clean):
        """Run the predict action on X and return the loaded prediction file."""
        #Find latest model location (fails before anything is written)
        latest_model_loc = self._latest_model()

        #Store the test set into RGF format
        np.savetxt(self._temp_path(self._TEST_X), X, delimiter=' ', fmt="%s")

        #Format test command
        params = []
        params.append("test_x_fn=%s"%self._temp_path(self._TEST_X))
        params.append("prediction_fn=%s"%self._temp_path(self._PREDICTIONS))
        params.append("model_fn=%s"%latest_model_loc)
        self._run("predict", params)

        y_pred = np.loadtxt(self._temp_path(self._PREDICTIONS))

        #Clean temp directory
        if clean:
            self._clean()
        return y_pred


class RegularizedGreedyForestClassifier(_RGFBase):
    """Binary classifier backed by the RGF command-line executable."""

    #Fitting/training the model to target variables
    def fit(self,X,y):
        """Train on features X and binary targets y.

        Targets equal to 1 are written as "+1", everything else as "-1".
        Returns self. Raises OSError if the executable cannot be launched.
        """
        #Store the targets into RGF format
        labels = ["+1" if _is_positive_label(v) else "-1" for v in np.ravel(y)]
        with open(self._temp_path(self._TRAIN_Y), "w") as handle:
            handle.writelines("%s\n" % label for label in labels)
        self._train(X)
        return self

    def predict_proba(self,X, clean=True):
        """Return an (n_samples, 2) array of [P(negative), P(positive)].

        The raw scores written by the executable are passed through the
        logistic function. With clean=True the temp files, including the
        model, are removed afterwards.
        """
        #atleast_1d: a single-row prediction file loads as a 0-d array
        positive = _sigmoid(np.atleast_1d(self._predict_raw(X, clean)))
        return np.column_stack((1.0 - positive, positive))


class RegularizedGreedyForestRegressor(_RGFBase):
    """Regressor backed by the RGF command-line executable."""

    _extra_train_flags = ("NormalizeTarget",)

    #Fitting/training the model to target variables
    def fit(self,X,y):
        """Train on features X and numeric targets y.

        Returns self. Raises OSError if the executable cannot be launched.
        """
        #Store the targets into RGF format
        np.savetxt(self._temp_path(self._TRAIN_Y), y, delimiter=' ', fmt="%s")
        self._train(X)
        return self

    def predict(self,X, clean=True):
        """Return the predictions the executable wrote for X.

        With clean=True the temp files, including the model, are removed
        afterwards.
        """
        y_pred = self._predict_raw(X, clean)
        if self.verbose > 0:
            print(np.shape(X))
        return y_pred