├── .gitattributes
├── .gitignore
├── readme.md
└── rgf.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # Regularized Greedy Forest Wrappers
 2 | 
 3 | First version of a toy scikit-learn API-compatible wrapper for Regularized Greedy Forests [Johnson & Zhang, 2014].
 4 | 
 5 | ## Usage
 6 | 
 7 | ### Classification
 8 | 
 9 | **RegularizedGreedyForestClassifier(verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model")**
10 | 
11 | Parameter|Description
12 | ---|---
13 | verbose|Int. Verbosity of the classifier. *Default=0*
14 | max_leaf|Int. Max number of leaves to create before halting. *Default=500*
15 | test_interval|Int. Save models during intervals. *Default=100*
16 | algorithm|String. Any of `RGF` (RGF with L2 regularization), `RGF_Opt` (RGF with min-penalty regularization), `RGF_Sib` (RGF with min-penalty regularization with sum-to-zero sibling constraints). *Default=RGF*
17 | loss|String. Any of `LS` (Least squares), `Expo` (Exponential), `Log` (Logistic). *Default=LS*
18 | l2|Float. Amount of L2 regularization. `1.0`, `0.1` and `0.01` are sane values. *Default=1.0*
19 | 
20 | ### Regression
21 | 
22 | **RegularizedGreedyForestRegressor(verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model")**
23 | 
24 | Parameter|Description
25 | ---|---
26 | verbose|Int. Verbosity of the regressor. *Default=0*
27 | max_leaf|Int. Max number of leaves to create before halting. *Default=500*
28 | test_interval|Int. Save models during intervals. *Default=100*
29 | algorithm|String. Any of `RGF` (RGF with L2 regularization), `RGF_Opt` (RGF with min-penalty regularization), `RGF_Sib` (RGF with min-penalty regularization with sum-to-zero sibling constraints). *Default=RGF*
30 | loss|String. Any of `LS` (Least squares), `Expo` (Exponential), `Log` (Logistic). *Default=LS*
31 | l2|Float. Amount of L2 regularization. `1.0`, `0.1` and `0.01` are sane values. *Default=1.0*


--------------------------------------------------------------------------------
/rgf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | __Author__: Triskelion <info@mlwave.com>
  3 | 
  4 | A small toy Python wrapper for Regularized Greedy Forests
  5 | 
  6 | Limitation of Liability. In no event shall Author be liable to you or any 
  7 | party related to you for any indirect, incidental, consequential, special, 
  8 | exemplary, or punitive damages or lost profits, even if Author has been advised 
  9 | of the possibility of such damages. In any event, Author's total aggregate 
 10 | liability to you for all damages of every kind and type (regardless of whether 
 11 | based in contract or tort) shall not exceed the purchase price of the product.
 12 | """
 13 | 
 14 | ## Dependencies ###############################################
 15 | import os
 16 | import subprocess
 17 | from glob import glob
 18 | import numpy as np
 19 | import pandas as pd
 20 | 
 21 | ## Edit this ##################################################
 22 | 
 23 | #Location of the RGF executable
 24 | loc_exec = "c:\\python64\\rgf\\bin\\rgf.exe"
 25 | 
 26 | #Location of a temporary directory (has to exist)
 27 | loc_temp = "rgf\\temp3"
 28 | 
 29 | class RegularizedGreedyForestClassifier:
 30 | 	def __init__(self, verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model"):
 31 | 		self.verbose = verbose
 32 | 		self.max_leaf = max_leaf
 33 | 		self.algorithm = algorithm
 34 | 		self.loss = loss
 35 | 		self.test_interval = test_interval
 36 | 		self.prefix = prefix
 37 | 		self.l2 = l2
 38 | 		if os.path.exists(loc_exec):
 39 | 			self.loc_exec = loc_exec
 40 | 		else:
 41 | 			print("Warning: Location to RGF executable not found or not correctly set:\n\t%s\n"%loc_exec)
 42 | 		if os.path.exists(loc_temp):
 43 | 			self.loc_temp = loc_temp
 44 | 		else:
 45 | 			print("Warning: Location to a temporary directory does not exist:\n\t%s\n"%loc_temp)
 46 | 	
 47 | 	#Fitting/training the model to target variables
 48 | 	def fit(self,X,y):
 49 | 		#Store the train set into RGF format
 50 | 		np.savetxt(os.path.join(loc_temp, "train.data.x"), X, delimiter=' ', fmt="%s")
 51 | 		y = ["+1" if f == "1" else "-1" for f in map(str, list(y))]
 52 | 		#Store the targets into RGF format
 53 | 		np.savetxt(os.path.join(loc_temp, "train.data.y"), y, delimiter=' ', fmt="%s")
 54 | 		
 55 | 		#format train command
 56 | 		params = []
 57 | 		if self.verbose > 0:
 58 | 			params.append("Verbose")
 59 | 		params.append("train_x_fn=%s"%os.path.join(loc_temp, "train.data.x"))
 60 | 		params.append("train_y_fn=%s"%os.path.join(loc_temp, "train.data.y"))
 61 | 		params.append("algorithm=%s"%self.algorithm)
 62 | 		params.append("loss=%s"%self.loss)
 63 | 		params.append("max_leaf_forest=%s"%self.max_leaf)
 64 | 		params.append("test_interval=%s"%self.test_interval)
 65 | 		params.append("reg_L2=%s"%self.l2)
 66 | 		params.append("model_fn_prefix=%s"%os.path.join(loc_temp, self.prefix))
 67 | 		
 68 | 		cmd = "%s train %s 2>&1"%(self.loc_exec,",".join(params))
 69 | 		
 70 | 		#train
 71 | 		output = subprocess.Popen(cmd.split(),stdout=subprocess.PIPE,shell=True).communicate()
 72 | 		
 73 | 		for k in output:
 74 | 			print k
 75 | 			
 76 | 	def predict_proba(self,X, clean=True):
 77 | 		#Store the test set into RGF format
 78 | 		np.savetxt(os.path.join(loc_temp, "test.data.x"), X, delimiter=' ', fmt="%s")
 79 | 	
 80 | 		#Find latest model location
 81 | 		model_glob = self.loc_temp + os.sep + self.prefix + "*"
 82 | 		latest_model_loc = sorted(glob(model_glob),reverse=True)[0]
 83 | 		
 84 | 		#Format test command
 85 | 		params = []
 86 | 		params.append("test_x_fn=%s"%os.path.join(loc_temp, "test.data.x"))
 87 | 		params.append("prediction_fn=%s"%os.path.join(loc_temp, "predictions.txt"))
 88 | 		params.append("model_fn=%s"%latest_model_loc)
 89 | 		cmd = "%s predict %s 2>&1"%(self.loc_exec,",".join(params))
 90 | 		
 91 | 		output = subprocess.Popen(cmd.split(),stdout=subprocess.PIPE,shell=True).communicate()
 92 | 		
 93 | 		for k in output:
 94 | 			print k
 95 | 		
 96 | 		y_pred = np.array([sigmoid(x) for x in np.loadtxt(os.path.join(loc_temp, "predictions.txt"))])
 97 | 		y_pred = np.array([[1-x, x] for x in y_pred])
 98 | 		#Clean temp directory
 99 | 		if clean:
100 | 			model_glob = self.loc_temp + os.sep + "*"
101 | 			
102 | 			for fn in glob(model_glob):
103 | 				if "predictions.txt" in fn or "model-" in fn or "train.data." in fn or "test.data." in fn:
104 | 					os.remove(fn)
105 | 			
106 | 		return y_pred
107 | 		
108 | 	def get_params(self):
109 | 		params = {}
110 | 		params["verbose"] = self.verbose
111 | 		params["max_leaf"] = self.max_leaf
112 | 		params["algorithm"] = self.algorithm
113 | 		params["loss"] = self.loss
114 | 		params["test_interval"] = self.test_interval
115 | 		params["prefix"] = self.prefix
116 | 		params["l2"] = self.l2
117 | 		return params
118 | 		
119 | class RegularizedGreedyForestRegressor:
120 | 	def __init__(self, verbose=0, max_leaf=500, test_interval=100, loc_exec=loc_exec, loc_temp=loc_temp, algorithm="RGF", loss="LS", l2="1", prefix="model"):
121 | 		self.verbose = verbose
122 | 		self.max_leaf = max_leaf
123 | 		self.algorithm = algorithm
124 | 		self.loss = loss
125 | 		self.test_interval = test_interval
126 | 		self.prefix = prefix
127 | 		self.l2 = l2
128 | 		if os.path.exists(loc_exec):
129 | 			self.loc_exec = loc_exec
130 | 		else:
131 | 			print("Warning: Location to RGF executable not found or not correctly set:\n\t%s\n"%loc_exec)
132 | 		if os.path.exists(loc_temp):
133 | 			self.loc_temp = loc_temp
134 | 		else:
135 | 			print("Warning: Location to a temporary directory does not exist:\n\t%s\n"%loc_temp)
136 | 	
137 | 	#Fitting/training the model to target variables
138 | 	def fit(self,X,y):
139 | 		#Store the train set into RGF format
140 | 		np.savetxt(os.path.join(loc_temp, "train.data.x"), X, delimiter=' ', fmt="%s")
141 | 		#Store the targets into RGF format
142 | 		np.savetxt(os.path.join(loc_temp, "train.data.y"), y, delimiter=' ', fmt="%s")
143 | 		
144 | 		#format train command
145 | 		params = []
146 | 		if self.verbose > 0:
147 | 			params.append("Verbose")
148 | 		params.append("NormalizeTarget")
149 | 		params.append("train_x_fn=%s"%os.path.join(loc_temp, "train.data.x"))
150 | 		params.append("train_y_fn=%s"%os.path.join(loc_temp, "train.data.y"))
151 | 		params.append("algorithm=%s"%self.algorithm)
152 | 		params.append("loss=%s"%self.loss)
153 | 		params.append("max_leaf_forest=%s"%self.max_leaf)
154 | 		params.append("test_interval=%s"%self.test_interval)
155 | 		params.append("reg_L2=%s"%self.l2)
156 | 		params.append("model_fn_prefix=%s"%os.path.join(loc_temp, self.prefix))
157 | 		
158 | 		cmd = "%s train %s 2>&1"%(self.loc_exec,",".join(params))
159 | 		
160 | 		#train
161 | 		output = subprocess.Popen(cmd.split(),stdout=subprocess.PIPE,shell=True).communicate()
162 | 		
163 | 		for k in output:
164 | 			print k
165 | 			
166 | 	def predict(self,X, clean=True):
167 | 		#Store the test set into RGF format
168 | 		np.savetxt(os.path.join(loc_temp, "test.data.x"), X, delimiter=' ', fmt="%s")
169 | 	
170 | 		#Find latest model location
171 | 		model_glob = self.loc_temp + os.sep + self.prefix + "*"
172 | 		latest_model_loc = sorted(glob(model_glob),reverse=True)[0]
173 | 		
174 | 		#Format test command
175 | 		params = []
176 | 		params.append("test_x_fn=%s"%os.path.join(loc_temp, "test.data.x"))
177 | 		params.append("prediction_fn=%s"%os.path.join(loc_temp, "predictions.txt"))
178 | 		params.append("model_fn=%s"%latest_model_loc)
179 | 		cmd = "%s predict %s"%(self.loc_exec,",".join(params)) # 2>&1
180 | 		
181 | 		output = subprocess.Popen(cmd.split(),stdout=subprocess.PIPE,shell=True).communicate()
182 | 		
183 | 		for k in output:
184 | 			print k
185 | 		
186 | 		y_pred = np.loadtxt(os.path.join(loc_temp, "predictions.txt"))
187 | 		
188 | 		#Clean temp directory
189 | 		if clean:
190 | 			model_glob = self.loc_temp + os.sep + "*"
191 | 			
192 | 			for fn in glob(model_glob):
193 | 				if "predictions.txt" in fn or "model-" in fn or "train.data." in fn or "test.data." in fn:
194 | 					os.remove(fn)
195 | 		print X.shape
196 | 		return y_pred
197 | 


--------------------------------------------------------------------------------