├── .gitattributes ├── .gitignore ├── examples ├── bitstringcompression.py ├── digits.py └── dimensionality_reduction.py ├── genetic_perceptrons ├── ogc.py └── readme.md ├── rbr_experiments ├── rbro.py └── readme.md ├── readme.md └── rftrl.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /examples/bitstringcompression.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Triskelion, HJ van Veen, info@mlwave.com 3 | 4 | Description: 5 | 6 | Here will be the experiments with compression of the bitstring. 7 | 8 | We compress the bitstring by chopping it up into n equally sized bit chunks. 9 | 10 | We then take the sum of the bit chunk, and look if that is over half the size of the bit chunk. 11 | 12 | Example: 13 | 14 | original bitstring of size 10: 15 | 16 | "1110101000" 17 | 18 | chop into 2 chunks of length 5. 19 | 20 | 11101 = 4 = larger than 2.5 = 1 21 | 01000 = 1 = smaller than 2.5 = 0 22 | 23 | output: 24 | 25 | "10" 26 | 27 | Reference/Inspiration: 28 | 29 | Similarity Estimation Techniques From Rounding Algorithms, Moses Charikar 30 | http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarEstim.pdf 31 | 32 | Todo: 33 | 34 | Clean up code. 35 | Make much faster. 36 | Acquire probabilistic pseudo-random supercomputer 37 | """ -------------------------------------------------------------------------------- /examples/digits.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Triskelion, HJ van Veen, info@mlwave.com 3 | 4 | Description: 5 | 6 | Creates 3 classifiers. 7 | Experiments with ensembling their predictions, and studies variance. 8 | Uses digits dataset (the "0"'s and "1"'s) 9 | 10 | Seems that: 11 | 12 | One 'overfitted' classifier can improve the ensemble. 13 | Random_state change shows more difference than Random Forest. 14 | Averaging 3 classifiers with different random state increases robustness. 15 | We can approach SVM accuracy. 
16 | Weighing the predictions by the 3 classifier's progressive validation loss can be better than unweighted average. 17 | Very similar to Vowpal Wabbit's -q and --cubic. 18 | 19 | """ 20 | from sklearn.datasets import load_digits 21 | import rftrl 22 | 23 | def logloss(act,pred): 24 | predicted = max(min(pred, 1. - 10e-15), 10e-15) 25 | return -log(predicted) if act == 1. else -log(1. - predicted) 26 | 27 | if __name__ == "__main__": 28 | X_train, y = load_digits().data, load_digits().target 29 | 30 | clf = rftrl.RandomLeaderClassifier(nr_projections=500, random_state=36, l2=1., size_projections=1, verbose=1) 31 | clf2 = rftrl.RandomLeaderClassifier(nr_projections=100000, random_state=37, l2=1., size_projections=3, verbose=1) 32 | clf3 = rftrl.RandomLeaderClassifier(nr_projections=1000, random_state=38, l2=1., size_projections=2, verbose=1) 33 | 34 | clf.project(X_train) 35 | clf2.project(X_train) 36 | clf3.project(X_train) 37 | 38 | loss = 0 39 | loss2 = 0 40 | loss3 = 0 41 | loss_ensemble = 0 42 | loss_ensemble_ranked = 0 43 | count = 0 44 | for e, (x,y) in enumerate(zip(X_train,y)): 45 | if y == 0 or y == 1: # make a binary problem 46 | count += 1. 47 | 48 | clf.fit(x,e,y) 49 | pred = clf.predict() 50 | loss += clf.logloss() 51 | clf.update(pred) 52 | 53 | clf2.fit(x,e,y) 54 | pred2 = clf2.predict() 55 | loss2 += clf2.logloss() 56 | clf2.update(pred2) 57 | 58 | clf3.fit(x,e,y) 59 | pred3 = clf3.predict() 60 | loss3 += clf3.logloss() 61 | clf3.update(pred3) 62 | 63 | leaders = sorted([(loss/count,pred), (loss2/count,pred2), (loss3/count,pred3)]) 64 | loss_ensemble_ranked += logloss(y,((leaders[0][1]*3)+(leaders[1][1]*2)+(leaders[2][1]*1))/6.) 65 | loss_ensemble += logloss(y,(pred+pred2+pred3)/3.) 66 | 67 | print("%f\t%s\t%f\t%f\t%f\t\t%f\t%f"%(pred, y, loss/count, loss2/count, loss3/count, loss_ensemble/count, loss_ensemble_ranked/count)) -------------------------------------------------------------------------------- /examples/dimensionality_reduction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Triskelion, HJ van Veen, info@mlwave.com 3 | 4 | Description: 5 | 6 | Dimensionality Reduction 7 | 8 | RBR can be used for dimensionality reduction. 9 | 10 | Either by trial or error, or by saving only the best (most informative/highest weighted) bits, 11 | you can create short bitstrings that reduce dimensionality. 12 | 13 | Todo: 14 | Clean code 15 | """ -------------------------------------------------------------------------------- /genetic_perceptrons/ogc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Online Genetic Classifier 3 | 4 | Totally experimental code/proof-of-concept. 
5 | """ 6 | from __future__ import division 7 | from collections import defaultdict 8 | import sys 9 | import random 10 | from math import exp, log 11 | 12 | class GeneticClassifier(object): 13 | def __init__(self, verbose=2, loc_python="python", get_train_data_function="", get_test_data_function="", loss_function="log_loss", custom_loss_function="", random_state=42): 14 | self.loc_python = loc_python 15 | self.verbose = verbose 16 | self.get_train_data_function = get_train_data_function 17 | self.get_test_data_function = get_test_data_function 18 | random.seed(random_state) 19 | if len(custom_loss_function) > 0: 20 | self.loss = custom_loss_function 21 | else: 22 | if loss_function == "log_loss": 23 | self.loss = self.log_loss 24 | elif loss_function == "mse": 25 | self.loss = self.mse 26 | else: 27 | sys.exit("invalid loss function specified. Pick any of ['log_loss', 'mse']") 28 | self.minmax = defaultdict(lambda: defaultdict(float)) 29 | 30 | def __repr__(self): 31 | return "GeneticClassifier()" 32 | 33 | def log_loss(self,y,p): 34 | p = max(min(p, 1. - 10e-15), 10e-15) 35 | return -log(p) if y == 1. else -log(1. - p) 36 | 37 | def mse(self,y_real,y_pred): 38 | print "ddd" 39 | 40 | def random_perceptron(self, size=3): 41 | perceptron = [] 42 | for feature_index in random.sample(self.minmax.keys(),size): 43 | perceptron.append((random.uniform(-1,1), feature_index)) 44 | return perceptron 45 | 46 | def calculate_perceptron(self, x, perceptron): 47 | #print sum([random_weight*x[feature_index] for random_weight, feature_index in perceptron[:-1]]), perceptron[-1], "drek" 48 | return sum([random_weight*x[feature_index][1] for random_weight, feature_index in perceptron[:-1]]) 49 | 50 | def data_gen(self, data_generator): 51 | return data_generator 52 | 53 | def bounded_sigmoid(self, wTx): 54 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 55 | 56 | def fit(self, data_generator,data_generator2,data_generator3): 57 | # Calculate min and max for every column 58 | k = self.data_gen(data_generator) 59 | 60 | if self.verbose > 0: 61 | print("calculating min and max for every feature_index") 62 | for i, (x, y) in enumerate(k): 63 | for feature_index, feature_val in x: 64 | if i == 0: 65 | self.minmax[feature_index]["min"] = feature_val 66 | self.minmax[feature_index]["max"] = feature_val 67 | else: 68 | if feature_val < self.minmax[feature_index]["min"]: 69 | self.minmax[feature_index]["min"] = feature_val 70 | if feature_val > self.minmax[feature_index]["max"]: 71 | self.minmax[feature_index]["max"] = feature_val 72 | if self.verbose > 0: 73 | if i % 1000 == 0: 74 | print(i) 75 | #print self.minmax 76 | 77 | # generate n random perceptrons with random threshold between min,max. 
78 | perceptrons = [] 79 | for i in range(5000): 80 | perceptrons.append(self.random_perceptron()) 81 | 82 | #k = data_generator 83 | # calculate fitness of generation 84 | fitness = defaultdict(list) 85 | fitness = defaultdict(lambda: defaultdict(int)) 86 | fitness = defaultdict(float) 87 | #k = self.data_gen(data_generator2) 88 | for i, (x, y) in enumerate(data_generator2): 89 | #print "kek" 90 | for perceptron_id, perceptron in enumerate(perceptrons): 91 | #print perceptron 92 | #print self.calculate_perceptron(x, perceptron) 93 | #fitness[perceptron_id].append(self.calculate_perceptron(x, perceptron)) 94 | #print self.bounded_sigmoid(self.calculate_perceptron(x, perceptron)), perceptron 95 | fitness[perceptron_id] += self.log_loss( y, self.bounded_sigmoid(self.calculate_perceptron(x, perceptron)) ) 96 | #print fitness 97 | """ 98 | fitness_keys = fitness.keys() 99 | for k in fitness_keys: 100 | #print k, fitness[k], perceptrons[k] 101 | total = sum(fitness[k].values()) 102 | for label in fitness[k]: 103 | fitness[k][label] = fitness[k][label] / total 104 | for k in fitness_keys: 105 | print k, fitness[k] 106 | """ 107 | fittest = [] 108 | for f in sorted(fitness, key=fitness.get)[:3]: 109 | #print f, fitness[f], fitness[f] / i, perceptrons[f] 110 | fittest.append(perceptrons[f]) 111 | kk = [] 112 | for i, (x, y) in enumerate(data_generator3): 113 | pred = [] 114 | for perceptron_id, perceptron in enumerate(fittest): 115 | pred.append(self.bounded_sigmoid(self.calculate_perceptron(x, perceptron))) 116 | #print y, sum(pred) / len(pred) 117 | kk.append((sum(pred) / len(pred), y)) 118 | from sklearn.metrics import roc_auc_score 119 | preds = [] 120 | y_real = [] 121 | for k in sorted(kk): 122 | preds.append(k[0]) 123 | y_real.append(k[1]) 124 | print k 125 | print roc_auc_score(y_real, preds) 126 | print fittest 127 | clf = GeneticClassifier() 128 | 129 | from sklearn.datasets import load_boston, load_digits 130 | X, ys = load_boston().data, load_boston().target 131 | X, ys = load_digits().data, load_digits().target 132 | 133 | X_bin = [] 134 | y_bin = [] 135 | for x, y in zip(X,ys): 136 | if y == 1 or y == 0: 137 | X_bin.append(list(x)) 138 | y_bin.append(int(y)) 139 | 140 | def get_data(X,ys): 141 | for x, y in zip(X, ys): 142 | yield [(e,f) for e,f in enumerate(x)], y 143 | 144 | clf.fit(get_data(X_bin[:200],y_bin[:200]),get_data(X_bin[:200],y_bin[:200]),get_data(X_bin[200:],y_bin[200:])) -------------------------------------------------------------------------------- /genetic_perceptrons/readme.md: -------------------------------------------------------------------------------- 1 | # Genetic Classifier Experiment 2 | 3 | Out-of-memory Genetic Programming experiments. 4 | 5 | ## Data 6 | 7 | Digit dataset turned into binary classification problem (all "1"'s and "0"'s). 8 | 9 | We use 200 samples for training, rest for testing. 10 | 11 | ## Epoch 1 (generate population) 12 | 13 | Creates 5000 random perceptrons. Weights are random.uniform(-1,1). Variables are a random subset of size 3. 14 | 15 | ## Epoch 2 (selecting) 16 | 17 | Do a bounded sigmoid on the perceptron dotproduct. Compute logistic loss for every perceptron. 18 | 19 | ## Epoch 3 (validation) 20 | 21 | Pick top n (in our case just top 3) perceptrons with lowest loss. Calculate AUC score on their average. 
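Taken together, the three epochs reduce to a short script. The sketch below re-implements them directly on the binary digits problem, independently of the `GeneticClassifier` class in `ogc.py`; the helper names (`random_perceptron`, `score`) and the fixed seed are illustrative, not part of the repository.

```python
import random
from math import exp, log

from sklearn.datasets import load_digits
from sklearn.metrics import roc_auc_score

def log_loss(y, p):
    p = max(min(p, 1. - 1e-15), 1e-15)
    return -log(p) if y == 1 else -log(1. - p)

def bounded_sigmoid(z):
    return 1. / (1. + exp(-max(min(z, 35.), -35.)))

def random_perceptron(n_features, size=3):
    # Epoch 1: a random subset of features with weights drawn from uniform(-1, 1)
    return [(random.uniform(-1, 1), i) for i in random.sample(range(n_features), size)]

def score(x, perceptron):
    # Bounded sigmoid on the perceptron dot product
    return bounded_sigmoid(sum(w * x[i] for w, i in perceptron))

random.seed(42)

# Digits turned into a binary problem: keep only the "0"s and "1"s
digits = load_digits()
data = [(list(x), int(t)) for x, t in zip(digits.data, digits.target) if t in (0, 1)]
train, test = data[:200], data[200:]

# Epoch 1: generate the population
population = [random_perceptron(len(train[0][0])) for _ in range(5000)]

# Epoch 2: fitness = summed logistic loss over the training samples
fitness = [sum(log_loss(t, score(x, p)) for x, t in train) for p in population]

# Epoch 3: keep the 3 fittest perceptrons, average their predictions, report AUC
fittest = [p for _, p in sorted(zip(fitness, population), key=lambda fp: fp[0])[:3]]
preds = [sum(score(x, p) for p in fittest) / len(fittest) for x, _ in test]
print(roc_auc_score([t for _, t in test], preds))
```

Sorting by summed loss and averaging the survivors is the same selection rule `ogc.py` applies; mutation and cross-breeding from the Todo list below are left out.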
22 | 23 | ## Todo 24 | 25 | Mutation (perturb the weights) 26 | Cross-breeding 27 | Stacked Generalization 28 | Refactor code 29 | 30 | ## Console 31 | ``` 32 | (0.014806130485203578, 0) 33 | (0.015559958197744028, 0) 34 | (0.01587381161380923, 0) 35 | (0.0169735033705027, 0) 36 | (0.01717849773878812, 0) 37 | (0.019452516605602196, 0) 38 | (0.019935589575795432, 0) 39 | (0.021776800144591064, 0) 40 | (0.022564160497505852, 0) 41 | (0.024220038565533175, 0) 42 | (0.025746555210662218, 0) 43 | (0.02605650144187907, 0) 44 | (0.026810217179488924, 0) 45 | (0.02713099408186445, 0) 46 | (0.02723356433475863, 0) 47 | (0.02936591408356302, 0) 48 | (0.029978667260524033, 0) 49 | (0.030700550772331376, 0) 50 | (0.030844433504879143, 0) 51 | (0.031053159900509414, 0) 52 | (0.03123725078013455, 0) 53 | (0.03297678325368353, 0) 54 | (0.03332912169404179, 0) 55 | (0.033473004426589556, 0) 56 | (0.033473004426589556, 0) 57 | (0.03433782823002379, 0) 58 | (0.03542307913509152, 0) 59 | (0.036705456537757065, 0) 60 | (0.037034271782880074, 0) 61 | (0.037623925905289714, 0) 62 | (0.038775641856037484, 0) 63 | (0.039960926109952756, 0) 64 | (0.04044573790490704, 0) 65 | (0.04130375894356001, 0) 66 | (0.043281184380554755, 0) 67 | (0.04328179360110455, 0) 68 | (0.04538484473375132, 0) 69 | (0.046783753657153086, 0) 70 | (0.04844416778234802, 0) 71 | (0.048755056454485746, 0) 72 | (0.050116700140464994, 0) 73 | (0.05239087672961318, 0) 74 | (0.05316126249745066, 0) 75 | (0.053315231941341364, 0) 76 | (0.0546916896005725, 0) 77 | (0.05472269649917016, 0) 78 | (0.05554640940178249, 0) 79 | (0.055913721143210526, 0) 80 | (0.055913721143210526, 0) 81 | (0.055913721143210526, 0) 82 | (0.05679474073321927, 0) 83 | (0.05841107658258162, 0) 84 | (0.05912213847818546, 0) 85 | (0.06219733610199696, 0) 86 | (0.06301575070520711, 0) 87 | (0.06301575070520711, 0) 88 | (0.06518628703287722, 0) 89 | (0.0662417195310765, 0) 90 | (0.06791626882271846, 0) 91 | (0.06834706228267, 0) 92 | (0.06872344287821336, 0) 93 | (0.07074219992113444, 0) 94 | (0.07185115914143808, 0) 95 | (0.07330218790504078, 0) 96 | (0.07385290196880523, 0) 97 | (0.07565544253872046, 0) 98 | (0.07618661179570192, 0) 99 | (0.08020173209230932, 0) 100 | (0.08020173209230932, 0) 101 | (0.09218162661712481, 0) 102 | (0.09480643413693064, 0) 103 | (0.10213356194781036, 0) 104 | (0.1060934551977365, 0) 105 | (0.1132913629361838, 0) 106 | (0.11741878733773632, 0) 107 | (0.16301499791048507, 0) 108 | (0.17538128558051155, 0) 109 | (0.20671005402596973, 0) 110 | (0.2516665407893916, 1) 111 | (0.3661389534525756, 1) 112 | (0.4252099449008057, 0) 113 | (0.46393919484645285, 1) 114 | (0.46393919484645285, 1) 115 | (0.5010890570451947, 1) 116 | (0.5612109193335089, 1) 117 | (0.6331774086989662, 1) 118 | (0.7429332849557856, 1) 119 | (0.7628192573041437, 1) 120 | (0.7685918165525977, 1) 121 | (0.7685918165525977, 1) 122 | (0.8570344990295519, 1) 123 | (0.9417282789353449, 1) 124 | (0.9604228201148005, 1) 125 | (0.9676786026688059, 1) 126 | (0.9772060070555658, 1) 127 | (0.9791451785550085, 1) 128 | (0.9842293187007075, 1) 129 | (0.9901488715048538, 1) 130 | (0.990436153644518, 1) 131 | (0.9919238054978897, 1) 132 | (0.9928078540976585, 1) 133 | (0.9933478035260386, 1) 134 | (0.993918811471151, 1) 135 | (0.9949463438751233, 1) 136 | (0.9950517422780601, 1) 137 | (0.9953454365707204, 1) 138 | (0.9957547029560953, 1) 139 | (0.9966372191059513, 1) 140 | (0.9968856510709517, 1) 141 | (0.9968927400719431, 1) 142 | (0.9970405821756699, 1) 143 | (0.9974510717889961, 1) 144 | 
(0.9976801858304766, 1) 145 | (0.9979552393408241, 1) 146 | (0.9982154865259442, 1) 147 | (0.9982203594383284, 1) 148 | (0.9982840036989512, 1) 149 | (0.998419906033074, 1) 150 | (0.9984286897033879, 1) 151 | (0.9984939983442622, 1) 152 | (0.9985027501741893, 1) 153 | (0.998549118578187, 1) 154 | (0.9986863173788039, 1) 155 | (0.9986995495829957, 1) 156 | (0.9988363408173697, 1) 157 | (0.998854496324887, 1) 158 | (0.998955527610253, 1) 159 | (0.9991916934357409, 1) 160 | (0.99919397823552, 1) 161 | (0.99919397823552, 1) 162 | (0.9992342622785184, 1) 163 | (0.9992840756930583, 1) 164 | (0.9993487207606337, 1) 165 | (0.9993487207606337, 1) 166 | (0.9993665026371147, 1) 167 | (0.9993913467408749, 1) 168 | (0.999412694902663, 1) 169 | (0.9994539795827881, 1) 170 | (0.9994588524951725, 1) 171 | (0.9994663855013514, 1) 172 | (0.9995178058181322, 1) 173 | (0.9995572280200719, 1) 174 | (0.999596894990182, 1) 175 | (0.9996193026734028, 1) 176 | (0.9996382625515509, 1) 177 | (0.9996787062495565, 1) 178 | (0.999718214845872, 1) 179 | (0.9997201285040723, 1) 180 | (0.999781483433603, 1) 181 | (0.999781483433603, 1) 182 | (0.999783492129653, 1) 183 | (0.9997936397792117, 1) 184 | (0.9998102570392101, 1) 185 | (0.999817704188042, 1) 186 | (0.999817704188042, 1) 187 | (0.9998257655652, 1) 188 | (0.9998278518376006, 1) 189 | (0.9998278518376006, 1) 190 | (0.9998278518376006, 1) 191 | (0.9998301101066053, 1) 192 | 193 | ROC_AUC_SCORE: 0.999687451164 194 | 195 | Top 3 Perceptrons [(random_weight, feature_index)] 196 | [ 197 | [(0.6149939955332868, 36), (-0.6191801712762446, 30), (-0.8061383715423533, 50)], 198 | [(-0.42417643001229877, 29), (0.7619881468846776, 36), (-0.08738675196793921, 47)], 199 | [(0.4814303936739788, 36), (-0.2050123438980298, 50), (0.36938855845436214, 44)] 200 | ] 201 | ``` -------------------------------------------------------------------------------- /rbr_experiments/rbro.py: -------------------------------------------------------------------------------- 1 | # Coding up the algorithm from http://arxiv.org/abs/1501.02990 2 | # "Random Bits Regression: a Strong General Predictor for Big Data" 3 | 4 | import random 5 | from datetime import datetime 6 | 7 | def create_var_subset(x,size=3): 8 | # (1) Randomly select a small subset of variables, e.g. x1, x3, x6. 9 | return random.sample([i for i in range(len(x))],min(size,len(x))) 10 | 11 | def assign_weights(var_subset): 12 | # (2) Randomly assign weights to each selected variables. The weights 13 | # are sampled from standard normal distribution, for example, 14 | # w1, w3, w6~N(0,1) 15 | return [(random.random(),i) for i in var_subset] 16 | 17 | def obtain_weighted_sum(x, weighted_var_subset): 18 | # (3) Obtain the weighted sum for each sample, for example 19 | # (w1*x1) + (w3*x3) + (w6*x6) = zi for the ith sample. 20 | weighted_sum = 0 21 | for w, i in weighted_var_subset: 22 | weighted_sum += w * x[i] 23 | return weighted_sum 24 | 25 | def pick_random_threshold(weighted_sums): 26 | # (4) Randomly pick one zi from the n generated as the threshold T. 27 | return random.choice(weighted_sums) 28 | 29 | def assign_bit(weighted_sum, threshold): 30 | # (5) Assign bits values to fk according to the threshold T 31 | # If zi >= T then 1 else 0 32 | if weighted_sum >= threshold: 33 | return 1 34 | else: 35 | return 0 36 | 37 | def process(data, K=100, size=3): 38 | # The process is repeated K times. 
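    # Note: assign_weights() above draws weights with random.random(), i.e. uniform on
    # [0, 1), whereas the paper (and the comment in that function) describes standard
    # normal weights; random.gauss(0, 1) would match the paper more closely.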
39 | start = datetime.now() 40 | data_bits = [] 41 | for k in range(K): 42 | var_subset = create_var_subset(data[0],size=size) # 1 43 | weighted_var_subset = assign_weights(var_subset) #2 44 | weighted_sums = [] 45 | for x in data: 46 | weighted_sums.append(obtain_weighted_sum(x, weighted_var_subset)) # 3 47 | # The first feature is fixed to 1 to act as the interceptor. 48 | if k == 0: 49 | data_bits.append([1]) 50 | 51 | random_threshold = pick_random_threshold(weighted_sums) # 4 (Try picking multiple thresholds or entropy) 52 | 53 | for i, (x, data_bit) in enumerate(zip(data, data_bits)): 54 | data_bit.append( assign_bit(obtain_weighted_sum(x, weighted_var_subset),random_threshold) ) # 5 55 | 56 | if k % 1000 == 0: 57 | print k, datetime.now() - start 58 | return data_bits 59 | 60 | random.seed(100) 61 | 62 | from sklearn import datasets 63 | data, y = datasets.load_digits().data, datasets.load_digits().target 64 | data = [list(x) for x in data] 65 | 66 | data_bits = process(data, 10000, 3) # We generate ~10^4-10^6 random binary intermediate features for each sample. 67 | 68 | from sklearn import linear_model, ensemble, svm, neighbors, cross_validation 69 | import numpy as np 70 | 71 | # Select predictive intermediate features by regularized linear/logistic regression. 72 | 73 | # KNN Classifier without intermediate features 74 | start = datetime.now() 75 | clf = neighbors.KNeighborsClassifier() 76 | scores = cross_validation.cross_val_score(clf, data, y,cv=20) 77 | print clf, np.array(data).shape 78 | print scores 79 | print scores.mean() 80 | print datetime.now() - start 81 | print 82 | 83 | # KNN Classifier with intermediate features 84 | start = datetime.now() 85 | scores = cross_validation.cross_val_score(clf, data_bits, y, cv=20) 86 | print clf, np.array(data_bits).shape 87 | print scores 88 | print scores.mean() 89 | print datetime.now() - start 90 | print 91 | 92 | # SGD Classifier without intermediate features 93 | start = datetime.now() 94 | clf = linear_model.SGDClassifier(loss="log", penalty="l2", n_iter=20, random_state=1, n_jobs=-1) 95 | scores = cross_validation.cross_val_score(clf, data, y,cv=20) 96 | print clf, np.array(data).shape 97 | print scores 98 | print scores.mean() 99 | print datetime.now() - start 100 | print 101 | 102 | # SGD Classifier with intermediate features 103 | start = datetime.now() 104 | scores = cross_validation.cross_val_score(clf, data_bits, y, cv=20) 105 | print clf, np.array(data_bits).shape 106 | print scores 107 | print scores.mean() 108 | print datetime.now() - start 109 | print 110 | 111 | # Logistic Regression without intermediate features 112 | start = datetime.now() 113 | clf = linear_model.LogisticRegression() 114 | scores = cross_validation.cross_val_score(clf, data, y,cv=20) 115 | print clf, np.array(data).shape 116 | print scores 117 | print scores.mean() 118 | print datetime.now() - start 119 | print 120 | 121 | # Logistic Regression with intermediate features 122 | start = datetime.now() 123 | scores = cross_validation.cross_val_score(clf, data_bits, y, cv=20) 124 | print clf, np.array(data_bits).shape 125 | print scores 126 | print scores.mean() 127 | print datetime.now() - start 128 | print 129 | 130 | # Standard RF without features 131 | start = datetime.now() 132 | clf = ensemble.ExtraTreesClassifier(n_estimators=500,random_state=1,n_jobs=-1) 133 | scores = cross_validation.cross_val_score(clf, data, y, cv=20) 134 | print clf, np.array(data).shape 135 | print scores 136 | print scores.mean() 137 | print datetime.now() - start 
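# The remaining blocks repeat the same comparison for ExtraTreesClassifier on the
# intermediate bit features, then RandomForestClassifier and a linear-kernel SVC,
# each run on the raw features and on the (1797, 10001) bit features.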
138 | print 139 | 140 | start = datetime.now() 141 | clf = ensemble.ExtraTreesClassifier(n_estimators=500,random_state=1,n_jobs=-1) 142 | scores = cross_validation.cross_val_score(clf, data_bits, y, cv=20) 143 | print clf, np.array(data_bits).shape 144 | print scores 145 | print scores.mean() 146 | print datetime.now() - start 147 | print 148 | 149 | start = datetime.now() 150 | clf = ensemble.RandomForestClassifier(n_estimators=500,n_jobs=-1,random_state=1) 151 | scores = cross_validation.cross_val_score(clf, data, y, cv=20) 152 | print clf, np.array(data).shape 153 | print scores 154 | print scores.mean() 155 | print datetime.now() - start 156 | print 157 | 158 | start = datetime.now() 159 | clf = ensemble.RandomForestClassifier(n_estimators=500,n_jobs=-1,random_state=1) 160 | scores = cross_validation.cross_val_score(clf, data_bits, y, cv=20) 161 | print clf, np.array(data_bits).shape 162 | print scores 163 | print scores.mean() 164 | print datetime.now() - start 165 | print 166 | 167 | start = datetime.now() 168 | clf = svm.SVC(kernel="linear") 169 | scores = cross_validation.cross_val_score(clf, data, y, cv=20) 170 | print clf, np.array(data).shape 171 | print scores 172 | print scores.mean() 173 | print datetime.now() - start 174 | print 175 | 176 | start = datetime.now() 177 | clf = svm.SVC(kernel="linear") 178 | scores = cross_validation.cross_val_score(clf, data_bits, y, cv=20) 179 | print clf, np.array(data_bits).shape 180 | print scores 181 | print scores.mean() 182 | print datetime.now() - start 183 | print -------------------------------------------------------------------------------- /rbr_experiments/readme.md: -------------------------------------------------------------------------------- 1 | # Experiment offline RBR on digits 2 | We write the algorithm to be as close to the paper as possible. Then we use a toy dataset `digits` shaped (1797, 64) with 10 classes. We look at algorithm performance of using 10^4 intermediate features of subset size 3. 3 | 4 | ## Results 5 | 20-fold CV acc. | Vectors | Algo 6 | --- | --- | --- 7 | 0.981674170670 | **RBR** | **SVM** 8 | 0.981230593359 | RAW | KNN 9 | 0.978857040929 | RAW | ET 10 | 0.974951330371 | RBR | **LOGREG** 11 | 0.974470711080 | RBR | ET 12 | 0.972307524295 | RBR | KNN 13 | 0.971636906430 | RBR | **RF** 14 | 0.971165709003 | RAW | SVM 15 | 0.967125864925 | RAW | RF 16 | 0.965967668687 | RBR | **SGD** 17 | 0.946672851823 | RAW | LOGREG 18 | 0.916611431522 | RAW | SGD 19 | 20 | ## Prelim 21 | RBR SVM took `0:04:24.457000` vs. RAW KNN `0:00:00.714000`. RBR improved SVM, Logreg, RF and SGD over using the RAW original features. 22 | 23 | Logistic Regression took a long time with 10k RBR features. All-in-all RBR LOGREG could be a useful diverse addition to an ensemble. 24 | 25 | ## Console 26 | ``` 27 | 0 0:00:00.012000 28 | 1000 0:00:08.505000 29 | 2000 0:00:17.046000 30 | 3000 0:00:25.596000 31 | 4000 0:00:34.179000 32 | 5000 0:00:42.764000 33 | 6000 0:00:51.321000 34 | 7000 0:00:59.911000 35 | 8000 0:01:08.479000 36 | 9000 0:01:17.042000 37 | 38 | KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', 39 | metric_params=None, n_neighbors=5, p=2, weights='uniform') (1797L, 64L) 40 | [ 0.92631579 0.97849462 0.98901099 1. 0.98888889 0.97777778 41 | 0.98888889 0.96666667 0.98888889 0.94444444 1. 0.98888889 42 | 0.98888889 1. 0.98876404 0.98876404 1. 
0.95454545 43 | 0.97701149 0.98837209] 44 | 0.981230593359 45 | 0:00:00.714000 46 | 47 | KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', 48 | metric_params=None, n_neighbors=5, p=2, weights='uniform') (1797L, 10001L) 49 | [ 0.91578947 0.97849462 0.98901099 0.98888889 0.98888889 0.97777778 50 | 0.95555556 0.96666667 0.96666667 0.93333333 0.98888889 0.97777778 51 | 0.97777778 0.98888889 1. 0.98876404 1. 0.90909091 52 | 0.96551724 0.98837209] 53 | 0.972307524295 54 | 0:01:47.331000 55 | 56 | SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, 57 | eta0=0.0, fit_intercept=True, l1_ratio=0.15, 58 | learning_rate='optimal', loss='log', n_iter=20, n_jobs=-1, 59 | penalty='l2', power_t=0.5, random_state=1, shuffle=True, verbose=0, 60 | warm_start=False) (1797L, 64L) 61 | [ 0.89473684 0.90322581 0.87912088 0.94444444 0.97777778 0.9 62 | 0.88888889 0.91111111 0.87777778 0.9 0.97777778 0.98888889 63 | 0.98888889 0.87777778 0.94382022 0.93258427 0.85393258 0.80681818 64 | 0.91954023 0.96511628] 65 | 0.916611431522 66 | 0:00:03.182000 67 | 68 | SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, 69 | eta0=0.0, fit_intercept=True, l1_ratio=0.15, 70 | learning_rate='optimal', loss='log', n_iter=20, n_jobs=-1, 71 | penalty='l2', power_t=0.5, random_state=1, shuffle=True, verbose=0, 72 | warm_start=False) (1797L, 10001L) 73 | [ 0.94736842 0.96774194 1. 0.98888889 0.98888889 0.94444444 74 | 0.93333333 0.96666667 0.97777778 0.94444444 1. 0.97777778 75 | 0.97777778 0.97777778 0.98876404 0.97752809 0.97752809 0.875 76 | 0.94252874 0.96511628] 77 | 0.965967668687 78 | 0:01:13.966000 79 | 80 | LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, 81 | intercept_scaling=1, max_iter=100, multi_class='ovr', 82 | penalty='l2', random_state=None, solver='liblinear', tol=0.0001, 83 | verbose=0) (1797L, 64L) 84 | [ 0.89473684 0.93548387 0.95604396 0.96666667 0.95555556 0.94444444 85 | 0.92222222 0.93333333 0.92222222 0.97777778 0.98888889 0.98888889 86 | 0.98888889 0.95555556 0.97752809 0.96629213 0.91011236 0.81818182 87 | 0.96551724 0.96511628] 88 | 0.946672851823 89 | 0:00:06.033000 90 | 91 | LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, 92 | intercept_scaling=1, max_iter=100, multi_class='ovr', 93 | penalty='l2', random_state=None, solver='liblinear', tol=0.0001, 94 | verbose=0) (1797L, 10001L) 95 | [ 0.94736842 0.96774194 1. 0.98888889 0.98888889 0.97777778 96 | 0.94444444 0.97777778 0.97777778 0.96666667 0.98888889 0.98888889 97 | 0.97777778 0.98888889 0.98876404 0.98876404 0.98876404 0.93181818 98 | 0.95402299 0.96511628] 99 | 0.974951330371 100 | 0:09:41.141000 101 | 102 | ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', 103 | max_depth=None, max_features='auto', max_leaf_nodes=None, 104 | min_samples_leaf=1, min_samples_split=2, 105 | min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, 106 | oob_score=False, random_state=1, verbose=0, warm_start=False) (1797L, 64L) 107 | [ 0.93684211 0.97849462 1. 1. 0.98888889 0.97777778 108 | 0.97777778 0.96666667 0.97777778 0.97777778 0.98888889 0.98888889 109 | 0.97777778 1. 0.98876404 0.98876404 1. 
0.94318182 110 | 0.97701149 0.94186047] 111 | 0.978857040929 112 | 0:00:16.637000 113 | 114 | ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini', 115 | max_depth=None, max_features='auto', max_leaf_nodes=None, 116 | min_samples_leaf=1, min_samples_split=2, 117 | min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, 118 | oob_score=False, random_state=1, verbose=0, warm_start=False) (1797L, 10001L) 119 | [ 0.91578947 0.96774194 1. 0.98888889 0.98888889 0.97777778 120 | 0.96666667 0.96666667 0.96666667 0.98888889 1. 1. 121 | 0.98888889 0.97777778 0.97752809 0.98876404 0.96629213 0.94318182 122 | 0.96551724 0.95348837] 123 | 0.97447071108 124 | 0:02:19.255000 125 | 126 | RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', 127 | max_depth=None, max_features='auto', max_leaf_nodes=None, 128 | min_samples_leaf=1, min_samples_split=2, 129 | min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, 130 | oob_score=False, random_state=1, verbose=0, warm_start=False) (1797L, 64L) 131 | [ 0.93684211 0.96774194 0.98901099 0.97777778 0.98888889 0.97777778 132 | 0.95555556 0.95555556 0.96666667 0.96666667 0.98888889 0.98888889 133 | 0.96666667 0.96666667 0.97752809 0.98876404 0.96629213 0.92045455 134 | 0.95402299 0.94186047] 135 | 0.967125864925 136 | 0:00:18.448000 137 | 138 | RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', 139 | max_depth=None, max_features='auto', max_leaf_nodes=None, 140 | min_samples_leaf=1, min_samples_split=2, 141 | min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, 142 | oob_score=False, random_state=1, verbose=0, warm_start=False) (1797L, 10001L) 143 | [ 0.92631579 0.95698925 1. 0.98888889 0.97777778 0.97777778 144 | 0.96666667 0.96666667 0.97777778 0.98888889 0.98888889 0.98888889 145 | 0.98888889 0.97777778 0.97752809 0.98876404 0.95505618 0.93181818 146 | 0.96551724 0.94186047] 147 | 0.97163690643 148 | 0:01:42.010000 149 | 150 | SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, 151 | kernel='linear', max_iter=-1, probability=False, random_state=None, 152 | shrinking=True, tol=0.001, verbose=False) (1797L, 64L) 153 | [ 0.91578947 0.98924731 0.98901099 1. 0.98888889 0.96666667 154 | 0.95555556 0.96666667 0.95555556 0.96666667 1. 0.97777778 155 | 0.95555556 0.96666667 0.96629213 1. 0.98876404 0.92045455 156 | 0.97701149 0.97674419] 157 | 0.971165709003 158 | 0:00:01.275000 159 | 160 | SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, 161 | kernel='linear', max_iter=-1, probability=False, random_state=None, 162 | shrinking=True, tol=0.001, verbose=False) (1797L, 10001L) 163 | [ 0.93684211 0.97849462 1. 1. 0.97777778 0.97777778 164 | 0.97777778 1. 0.97777778 0.97777778 1. 1. 165 | 0.98888889 0.98888889 1. 0.98876404 1. 0.92045455 166 | 0.96551724 0.97674419] 167 | 0.98167417067 168 | 0:04:24.457000 169 | ``` -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Randomly Follow the Regularized Leader 2 | 3 | This is a class containing a binary classifier for online machine learning. 
It employs approaches based on Random Bits Regression and the FTRL-Proximal algorithm 4 | 5 | ##### rftrl.**RandomLeaderClassifier**(alpha=0.1, beta=1., l1=0., l2=1., nr_projections=10000, max_projections=0, subsample_projections=1., size_projections=3, random_state=0, verbose=0) 6 | 7 | ## Parameters 8 | 9 | Parameter | Description 10 | --- | --- 11 | alpha. | Float. Learning Rate. *Default = 0.1* 12 | beta. | Float. Smoothing parameter for adaptive learning rate. *Default = 1.* 13 | l1. | Float. L1 Regularization. *Default = 0.1* 14 | l2. | Float. L2 Regularization. *Default = 1.0* 15 | nr_projections. | Int. Number of random linear projections to create. *Default = 10000* 16 | max_projections. | Int. Not implemented. 17 | subsample_projections. | Float. Uses subsampling when making a first pass to create the random thresholds. This is more memory friendly for larger datasets. *Default = 1.* 18 | size_projections. | Int. Number of (feature_value * random_weight) to use in the random linear functions. *Default = 3* 19 | random_state. | Int. Seed for replication. *Default = 0* 20 | Verbose. | Int. Verbosity of classifier. *Default = 0* 21 | 22 | ## Usage 23 | 24 | ```python 25 | clf = rftrl.RandomLeaderClassifier(nr_projections=50000, random_state=1, size_projections=3) 26 | 27 | # Project data 28 | clf.project(X_train) 29 | 30 | # Train 31 | loss = 0 32 | for e, (x,y) in enumerate(zip(X_train,y)): 33 | clf.fit(x,e,y) 34 | pred = clf.predict() 35 | loss += clf.logloss() 36 | clf.update(pred) 37 | 38 | # Test 39 | y = 1 # Dummy label 40 | for e, x in enumerate(X_test): 41 | clf.fit(x,e,y) 42 | pred = clf.predict() 43 | print("%s,%s"%(e,pred)) 44 | ``` 45 | 46 | ## References 47 | 48 | > Random Bit Regression (RBR). 49 | > Random Bits Regression: a Strong General Predictor for Big Data 50 | > Yi Wang, Yi Li, Momiao Xiong, Li Jin 51 | 52 | http://arxiv.org/abs/1501.02990 53 | 54 | > Follow the Regularized Leader (FTRL) 55 | > Ad Click prediction: A view from the trenches. 56 | > H. Brendan McMahan, Gary Holt, D. Sculley, Michael Young, Dietmar Ebner, Julian Grady, Lan Nie, Todd Phillips, Eugene Davydov, Daniel Golovin, Sharat Chikkerur, Dan Liu, Martin Wattenberg, Arnar Mar Hrafnkelsson, Tom Boulos, Jeremy Kubica. 57 | 58 | https://research.google.com/pubs/archive/41159.pdf 59 | 60 | > Tinrtgu's Beat the Benchmark online FTRL proximal script's 61 | > Beat the benchmark with less then 200MB of memory. 62 | 63 | https://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/10322/beat-the-benchmark-with-less-then-200mb-of-memory/53737 64 | 65 | https://www.kaggle.com/c/tradeshift-text-classification/forums/t/10537/beat-the-benchmark-with-less-than-400mb-of-memory/ -------------------------------------------------------------------------------- /rftrl.py: -------------------------------------------------------------------------------- 1 | """ Author: Triskelion, HJ van Veen, info@mlwave.com 2 | 3 | This class implements a binary classifier for online learning, which is based on descriptions in the papers: 4 | 5 | Random Bit Regression (RBR). 6 | Random Bits Regression: a Strong General Predictor for Big Data 7 | Yi Wang, Yi Li, Momiao Xiong, Li Jin 8 | http://arxiv.org/abs/1501.02990 9 | 10 | Follow the Regularized Leader (FTRL) 11 | Ad Click prediction: A view from the trenches. 12 | H. Brendan McMahan, Gary Holt, D. 
Sculley, Michael Young, Dietmar Ebner, Julian Grady, Lan Nie, Todd Phillips, 13 | Eugene Davydov, Daniel Golovin, Sharat Chikkerur, Dan Liu, Martin Wattenberg, Arnar Mar Hrafnkelsson, Tom Boulos, 14 | Jeremy Kubica. 15 | https://research.google.com/pubs/archive/41159.pdf 16 | 17 | Random Bit Regression 18 | 19 | RBR works well on dense tall datasets. The algorithm is most succinctly described in the paper: 20 | 21 | 1. Randomly select a subset of variables, eg: f1, f2, f3 22 | 2. Assign random weights uniformly drawn from between 0 and 1 for each variable in the subset. eg: w1 = 0.4532134 23 | 3. Obtain the weighted sum (z). eg: z = (f1 * w1) + (f2 * w2) + (f3 * w3) 24 | 4. Randomly pick one threshold (t_random) generated from all z's (Z). eg: t_random = 15.34245 25 | 5. Vectorize samples with bits according to the formula: if z > t_random then 1 else 0. 26 | 27 | Basically we add the result of many random linear functions (perceptrons) as binarized features to a sample: Random Bit Vectorization. 28 | 29 | Follow the Regularized Leader 30 | 31 | We then use a logistic regression algorithm with L2 regularization to do conventional supervised learning on this bit representation. 32 | 33 | The online FTRL (oFTRL) code is credit to tinrtgu (https://www.kaggle.com/ggglhf) . This is a categorical classifier that was used for 34 | "ad click prediction"-competitions on Kaggle. It used the hashing trick to one-hot encode all the features and supported both L1 and 35 | L2 regularization. 36 | 37 | Modifications 38 | 39 | RBR 40 | 41 | We modify (relax) step 4. from the Random Bit Regression Algorithm. We don't want to generate all the thresholds for the 42 | entire dataset, simply to obtain a single random threshold. If we do all that, then we may as well pick thresholds so they 43 | better divide the classes. A single pass over a dataset or batch is still needed to get a random threshold for every random 44 | linear function. Heavy subsampling and a max Z-size ensures the generation of random thresholds without wasting too much time 45 | building the vectorizers. There are other paths to check out: completely random thresholds, prenormalizing or online normalization 46 | of features, and "Don't do linear functions, but Euclidean distance to first n noise-distorted samples". 47 | 48 | oFTRL 49 | 50 | oFTRL was originally a purely categorical classifier. Through bit vectorizing the features with random linear functions it can now 51 | handle features which were originally floats or numerical. Another benefit is the added boost for none-linearity in problems. 52 | 53 | As we always know the length of our binary representation (we are using this for dense tall datasets, not sparse datasets like text), 54 | we do not need the hashing trick for now. We can simply sparse encode: 55 | 56 | "11101" becomes "1:1 2:1 3:1 5:1". 57 | 58 | We call this modified algorithm "Randomly Follow the Regularized Leader" 59 | """ 60 | import numpy as np 61 | from math import sqrt, exp, log 62 | 63 | class RandomLeaderClassifier(object): 64 | def __init__(self, alpha=0.1, beta=1., l1=0., l2=1., nr_projections=10000, max_projections=0, 65 | subsample_projections=1., size_projections=3, random_state=0, 66 | verbose=0): 67 | self.z = [0.] * (nr_projections+1) 68 | self.n = [0.] 
* (nr_projections+1) 69 | self.nr_projections = nr_projections 70 | self.alpha = alpha 71 | self.beta = beta 72 | self.l1 = l1 73 | self.l2 = l2 74 | self.size_projections = size_projections 75 | self.subsample_projections = subsample_projections 76 | self.max_projections = max_projections 77 | self.random_state = random_state 78 | self.verbose = verbose 79 | self.w = {} 80 | self.X = [] 81 | self.y = 0. 82 | self.random_thresholds = [] 83 | self.random_indexes = [] 84 | self.random_weights = [] 85 | self.Prediction = 0. 86 | 87 | def sgn(self, x): 88 | if x < 0: 89 | return -1 90 | else: 91 | return 1 92 | 93 | def project(self, X_train): 94 | if self.verbose > 0: 95 | print("Creating %s random projections on train set shaped %s"%(self.nr_projections,str(X_train.shape))) 96 | print("Using random seed %s"%(self.random_state)) 97 | np.random.seed(self.random_state) 98 | self.random_indexes = np.random.randint(0, high=X_train.shape[1], size=(self.nr_projections, self.size_projections)) 99 | self.random_weights = np.random.rand(self.nr_projections,self.size_projections) 100 | for e, x in enumerate(X_train): 101 | if e == 0: 102 | thresholds = np.sum(x[self.random_indexes] * self.random_weights, axis=1).reshape((1,self.nr_projections)) 103 | else: 104 | if np.random.random() < self.subsample_projections: 105 | thresholds = np.r_[thresholds, np.sum(x[self.random_indexes] * self.random_weights, axis=1).reshape((1,self.nr_projections))] 106 | if self.max_projections > 0 and thresholds.shape[0] >= self.max_projections: 107 | if self.verbose > 0: 108 | print("Halting.") 109 | break 110 | 111 | random_thresholds = [] 112 | for column_id in range(self.nr_projections): 113 | random_thresholds.append(thresholds[np.random.randint(0,high=thresholds.shape[0])][column_id]) 114 | self.random_thresholds = np.array(random_thresholds) 115 | 116 | 117 | def fit(self,x,sample_id,label): 118 | self.ID = sample_id 119 | self.y = float(label) 120 | 121 | thresholds = np.sum(x[self.random_indexes] * self.random_weights, axis=1).reshape((1,self.nr_projections)) 122 | bools = thresholds > self.random_thresholds 123 | 124 | self.X = [e+1 for e, f in enumerate(list(bools.astype(int)[0])) if f == 1 ] # Sparse encoding the bitstring 125 | self.X = [0] + self.X # Prefix with a bias term 126 | 127 | def logloss(self): 128 | act = self.y 129 | pred = self.Prediction 130 | predicted = max(min(pred, 1. - 10e-15), 10e-15) 131 | return -log(predicted) if act == 1. else -log(1. - predicted) 132 | 133 | def predict(self): 134 | W_dot_x = 0. 135 | w = {} 136 | for i in self.X: 137 | if abs(self.z[i]) <= self.l1: 138 | w[i] = 0. 139 | else: 140 | w[i] = (self.sgn(self.z[i]) * self.l1 - self.z[i]) / (((self.beta + sqrt(self.n[i]))/self.alpha) + self.l2) 141 | W_dot_x += w[i] 142 | self.w = w 143 | self.Prediction = 1. / (1. + exp(-max(min(W_dot_x, 35.), -35.))) 144 | return self.Prediction 145 | 146 | def update(self, prediction): 147 | for i in self.X: 148 | g = (prediction - self.y) 149 | sigma = (1./self.alpha) * (sqrt(self.n[i] + g*g) - sqrt(self.n[i])) 150 | self.z[i] += g - sigma*self.w[i] 151 | self.n[i] += g*g --------------------------------------------------------------------------------
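For reference, the chunk-and-majority compression described in `examples/bitstringcompression.py` (which currently contains only the docstring above, no code) could be sketched as follows; the `compress_bitstring` name is illustrative and not part of the repository.

```python
def compress_bitstring(bitstring, n_chunks):
    """Compress a bitstring by majority-voting equally sized chunks.

    "1110101000" with n_chunks=2 -> chunks "11101" (sum 4 > 2.5 -> "1")
    and "01000" (sum 1 < 2.5 -> "0"), giving "10".
    """
    chunk_size = len(bitstring) // n_chunks
    out = []
    for i in range(n_chunks):
        chunk = bitstring[i * chunk_size:(i + 1) * chunk_size]
        # A chunk becomes "1" only if more than half of its bits are set
        out.append("1" if sum(int(b) for b in chunk) > chunk_size / 2. else "0")
    return "".join(out)

print(compress_bitstring("1110101000", 2))  # "10"
```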