├── .gitignore ├── rbm.png ├── vae.png ├── dc_gan.png ├── optimization.png ├── supervised_model.png ├── LICENSE ├── k_nearest_neighbor.py ├── self_organizing_map.py ├── adaboost.py ├── learning_vector_quantization.py ├── logistic_regression.py ├── random_forest.py ├── xgboost.py ├── gradient_boosting_decision_tree.py ├── deep_belief_network.py ├── bayesian_net.py ├── decision_boundary_visualization.py ├── linear_regression.py ├── ant_colony.py ├── markov_random_field.py ├── naive_bayes.py ├── hidden_markov_model.py ├── simple_mlp.py ├── variational_autoencoder.py ├── restricted_boltzmann_machine.py ├── evolutionary_algorithm.py ├── temporal_difference.py ├── factorization_machines.py ├── support_vector_machine.py ├── decision_tree.py ├── minimax.py ├── transfer_learning.py ├── optimization_visualization.py ├── convolutional_neural_network.py ├── simple_cnn_layers.py ├── README.md ├── monte_carlo_tree_search.py ├── generative_adversarial_network.py ├── recurrent_neural_network.py ├── multilayer_perceptron.py ├── deep_q_network.py ├── long_short_term_memory.py └── nn_layers.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc 3 | *.gz 4 | *.pkz 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /rbm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiecong/Simple-Implementation-of-ML-Algorithms/HEAD/rbm.png -------------------------------------------------------------------------------- /vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiecong/Simple-Implementation-of-ML-Algorithms/HEAD/vae.png -------------------------------------------------------------------------------- /dc_gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiecong/Simple-Implementation-of-ML-Algorithms/HEAD/dc_gan.png -------------------------------------------------------------------------------- /optimization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiecong/Simple-Implementation-of-ML-Algorithms/HEAD/optimization.png -------------------------------------------------------------------------------- /supervised_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiecong/Simple-Implementation-of-ML-Algorithms/HEAD/supervised_model.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Cong Xie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /k_nearest_neighbor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | 4 | 5 | class kNearestNeighbor(object): 6 | 7 | def __init__(self, k): 8 | self.k = k 9 | 10 | def fit(self, x, y): 11 | self.train_x = x 12 | self.train_y = y 13 | self.labels = np.unique(y) 14 | 15 | def _get_nn(self, x): 16 | nn_idx = np.argsort(np.square(self.train_x - x).sum(axis=1))[:self.k] 17 | nn_y, counts = np.unique(self.train_y[nn_idx], return_counts=True) 18 | y = np.zeros(len(self.labels)) 19 | y[nn_y] = counts 20 | return y / y.sum() 21 | 22 | def predict(self, x): 23 | return np.array([self._get_nn(xi) for xi in x]) 24 | 25 | 26 | def main(): 27 | data = load_digits() 28 | test_ratio = 0.2 29 | test_split = np.random.uniform(0, 1, len(data.data)) 30 | train_x, test_x = data.data[test_split >= 31 | test_ratio], data.data[test_split < test_ratio] 32 | train_y, test_y = data.target[ 33 | test_split >= test_ratio], data.target[test_split < test_ratio] 34 | 35 | knn = kNearestNeighbor(k=3) 36 | knn.fit(train_x, train_y) 37 | print(sum(np.argmax(knn.predict(train_x), axis=1) 38 | == train_y) / train_y.shape[0]) 39 | print(sum(np.argmax(knn.predict(test_x), axis=1) 40 | == test_y) / test_y.shape[0]) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /self_organizing_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class SOM(object): 6 | 7 | def __init__(self): 8 | self.sigma = 1 9 | self.lr = 0.1 10 | self.eps = 0.05 11 | self.n_size = 10 12 | self.iterations = 10 13 | self.neighbors_radius = [] 14 | radius = 4 15 | for i in range(-radius, radius+1): 16 | for j in range(-radius, radius+1): 17 | if i * i + j * j <= radius * radius: 18 | self.neighbors_radius.append((i, j)) 19 | self.w = None 20 | 21 | def get_bmu(self, w, x): 22 | dist = np.square(w - x).sum(axis=2) 23 | index = np.argmin(dist) 24 | return np.array([index // self.n_size, index % self.n_size]) 25 | 26 | def fit(self, x): 27 | fig, ax = plt.subplots(nrows=2, ncols=5, subplot_kw=dict(xticks=[], yticks=[])) 28 | 29 | self.w = np.random.randn(self.n_size, self.n_size, x.shape[1]) 30 | sigma_sq = self.sigma * self.sigma 31 | for step in range(self.iterations): 32 | for y in np.random.permutation(x): 33 | i, j = self.get_bmu(self.w, y) 34 | # update w 35 | for di, dj in self.neighbors_radius: 36 | if i + di >= 0 and i + di < self.n_size and j + di >= 0 and j + dj < self.n_size: 37 | self.w[i + di][j + dj] += self.lr * (y - self.w[i + di][j + dj]) * np.exp(-np.square([di, dj]).sum() / 2 / sigma_sq) 38 | self.lr *= np.exp(-step * self.eps) 39 | sigma_sq *= np.exp(-step * self.eps) 40 | ax[step//5][step%5].imshow(self.w.astype(int)) 41 | ax[step//5][step%5].title.set_text(step) 
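        # In words: for each training sample y, the best-matching unit (BMU) at grid
        # position (i, j) and every neighbour inside the fixed radius are pulled toward
        # the sample with a Gaussian falloff over grid distance,
        #     w[i+di][j+dj] += lr * exp(-(di^2 + dj^2) / (2 * sigma^2)) * (y - w[i+di][j+dj]),
        # and after each pass both the learning rate and the squared neighbourhood width
        # decay by exp(-step * eps), so early passes order the whole map while later
        # passes only make small local refinements.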
42 | plt.show() 43 | return self.w 44 | 45 | def main(): 46 | som = SOM() 47 | x = np.random.randint(0, 255, (3000, 3)) 48 | w = som.fit(x) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_breast_cancer 3 | from decision_tree import DecisionTree 4 | 5 | 6 | class AdaBoost(object): 7 | 8 | def __init__(self, esti_num=10): 9 | self.esti_num = esti_num 10 | self.estimators = [] 11 | self.alphas = [] 12 | 13 | def fit(self, x, y): 14 | n_data = x.shape[0] 15 | w = np.ones(x.shape[0]) / n_data 16 | eps = 1e-16 17 | prediction = np.zeros(n_data) 18 | for i in range(self.esti_num): 19 | self.estimators.append(DecisionTree( 20 | metric_type='Gini impurity', depth=2)) 21 | self.estimators[i].fit(x, y, w) 22 | pred_i = self.estimators[i].predict(x) 23 | error_i = (pred_i != y).dot(w.T) 24 | self.alphas.append(np.log((1.0 - error_i) / (error_i + eps)) / 2) 25 | w = w * np.exp(self.alphas[i] * (2 * (pred_i != y) - 1)) 26 | w = w / w.sum() 27 | 28 | prediction += pred_i * self.alphas[i] 29 | print("Tree {} constructed, acc {}".format( 30 | i, (np.sign(prediction) == y).sum() / n_data)) 31 | 32 | def predict(self, x): 33 | return sum(esti.predict(x) * alpha for esti, alpha in zip(self.estimators, self.alphas)) 34 | 35 | 36 | def main(): 37 | data = load_breast_cancer() 38 | y = data.target * 2 - 1 39 | test_ratio = 0.2 40 | test_split = np.random.uniform(0, 1, len(data.data)) 41 | train_x, test_x = data.data[test_split >= 42 | test_ratio], data.data[test_split < test_ratio] 43 | train_y, test_y = y[test_split >= test_ratio], y[test_split < test_ratio] 44 | 45 | adaboost = AdaBoost() 46 | adaboost.fit(train_x, train_y) 47 | print((np.sign(adaboost.predict(train_x)) 48 | == train_y).sum() / train_x.shape[0]) 49 | print((np.sign(adaboost.predict(test_x)) 50 | == test_y).sum() / test_x.shape[0]) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /learning_vector_quantization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_openml 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | class LVQ(object): 7 | 8 | def __init__(self): 9 | self.lr = 0.01 10 | self.iterations = 10 11 | self.eps = 0.05 12 | self.w = None 13 | self.c = None 14 | 15 | def fit(self, x, y): 16 | n_labels = 10 17 | n_repeat = 10 18 | self.w = np.repeat([x[y==i].mean(axis=0) for i in range(n_labels)], n_repeat, axis=0) 19 | print(self.w.shape) 20 | self.c = np.repeat(np.arange(n_labels), n_repeat) 21 | for step in range(self.iterations): 22 | print(f'iteration {step}') 23 | for i in np.random.permutation(np.arange(x.shape[0])): 24 | j = np.argmin(np.square(self.w - x[i]).sum(axis=1)) 25 | self.w[j] += (1.0 if self.c[j] == y[i] else -1.0) * self.lr * (x[i] - self.w[j]) 26 | self.lr *= np.exp(-step * self.eps) 27 | 28 | def predict(self, x): 29 | return self.c[ 30 | [np.argmin(np.square(self.w - xi).sum(axis=1)) for xi in x] 31 | ] 32 | 33 | 34 | def main(): 35 | x, y = fetch_openml('mnist_784', return_X_y=True, data_home="data", as_frame=False) 36 | test_ratio = 0.2 37 | test_split = np.random.uniform(0, 1, x.shape[0]) 38 | train_x, test_x = x[test_split >= test_ratio] / \ 39 | x.max(), x[test_split < test_ratio] / 
x.max() 40 | train_y, test_y = y.astype(np.int_)[test_split >= test_ratio], y.astype( 41 | np.int_)[test_split < test_ratio] 42 | 43 | lvq = LVQ() 44 | lvq.fit(train_x, train_y) 45 | print(sum(lvq.predict(train_x) == train_y) / train_y.shape[0]) 46 | print(sum(lvq.predict(test_x) == test_y) / test_y.shape[0]) 47 | 48 | for i in range(lvq.w.shape[0]): 49 | plt.subplot(10, 10, i+1) 50 | plt.imshow(lvq.w[i].reshape(28, 28), cmap='gray', vmin=np.min(lvq.w), vmax=np.max(lvq.w)) 51 | plt.title('lvq codebooks') 52 | print('visualizing codebooks') 53 | plt.show() 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /logistic_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | 4 | 5 | def sigmoid(x): 6 | return 1 / (1 + np.exp(-x)) 7 | 8 | 9 | class LogisticRegression(object): 10 | 11 | def __init__(self): 12 | self.learning_rate = 0.01 13 | self.gamma = 0.9 14 | self.decay = 1 - 1e-4 15 | 16 | def loss(self, x, y): # using cross entropy as loss function 17 | eps = 1e-20 18 | h = self.predict(x) 19 | return -(np.multiply(y, np.log(h + eps)) + np.multiply((1 - y), np.log(1 - h + eps))).mean() 20 | 21 | def fit(self, x, y): 22 | label_num = len(np.unique(y)) 23 | labels = np.zeros((x.shape[0], label_num)) 24 | labels[np.arange(x.shape[0]), y] = 1 25 | self.w = np.random.randn(x.shape[1], label_num) 26 | self.b = np.random.randn(1, label_num) 27 | self.mom_w = np.zeros_like(self.w) 28 | self.mom_b = np.zeros_like(self.b) 29 | 30 | train_num = x.shape[0] 31 | for i in range(5000): 32 | h = sigmoid(x.dot(self.w) + self.b) 33 | g_w = x.T.dot(h - labels) / train_num 34 | g_b = (h - labels).sum() / train_num 35 | self.mom_w = self.gamma * self.mom_w + self.learning_rate * g_w 36 | self.w = (self.w - self.mom_w) * self.decay 37 | self.mom_b = self.gamma * self.mom_b + self.learning_rate * g_b 38 | self.b = (self.b - self.mom_b) * self.decay 39 | if i % 100 == 0: 40 | print(self.loss(x, labels)) 41 | 42 | def predict(self, x): 43 | return sigmoid(x.dot(self.w) + self.b) 44 | 45 | 46 | def main(): 47 | data = load_digits() 48 | test_ratio = 0.2 49 | test_split = np.random.uniform(0, 1, len(data.data)) 50 | train_x, train_y = data.data[ 51 | test_split >= test_ratio], data.target[test_split >= test_ratio] 52 | test_x, test_y = data.data[test_split < test_ratio], data.target[ 53 | test_split < test_ratio] 54 | 55 | lr = LogisticRegression() 56 | lr.fit(train_x, train_y) 57 | print(sum(np.argmax(lr.predict(train_x), axis=1) 58 | == train_y) / train_y.shape[0]) 59 | print(sum(np.argmax(lr.predict(test_x), axis=1) 60 | == test_y) / test_y.shape[0]) 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | from decision_tree import DecisionTree 4 | 5 | 6 | class RandomForest(object): 7 | 8 | def __init__(self, tree_num=50, max_depth=5, regression=False): 9 | self.max_depth = max_depth 10 | self.tree_num = tree_num 11 | self.forest = [] 12 | self.regression = regression 13 | self.metric_type = 'Variance' if regression else 'Gini impurity' 14 | 15 | def fit(self, x, y): 16 | feat_num = x.shape[1] 17 | n_feat = int(np.ceil(np.sqrt(feat_num))) 18 | data_num = x.shape[0] 
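        # Each tree is decorrelated from the others by double subsampling: roughly
        # sqrt(n_features) candidate feature indices per tree, and a sample (drawn with
        # replacement below) of one fifth of the rows, before fitting a depth-limited
        # DecisionTree on that slice.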
19 | n_sample = data_num // 5 20 | self.labels = np.unique(y) 21 | 22 | for i in range(self.tree_num): 23 | f = np.random.randint(feat_num, size=n_feat) 24 | idx = np.random.randint(data_num, size=n_sample) 25 | dt = DecisionTree(metric_type=self.metric_type, 26 | depth=self.max_depth, regression=self.regression) 27 | dt.fit(x[idx], y[idx], feature_set=f) 28 | self.forest.append(dt) 29 | if self.regression: 30 | print("Tree #{} constructed, squared loss {}".format( 31 | i, np.square(self.predict(x) - y).sum())) 32 | else: 33 | print("Tree #{} constructed, acc {}".format( 34 | i, (np.argmax(self.predict(x), axis=1) == y).sum() / x.shape[0])) 35 | 36 | def predict(self, x): 37 | preds = np.array([tree.predict(x) for tree in self.forest]).T 38 | if self.regression: 39 | return preds.mean(axis=1) 40 | else: 41 | y = np.zeros((x.shape[0], len(self.labels))) 42 | for i, pred in enumerate(preds): 43 | value, counts = np.unique(pred, return_counts=True) 44 | y[i][value.astype(int)] = counts / counts.sum() 45 | return y 46 | 47 | 48 | def main(): 49 | data = load_digits() 50 | test_ratio = 0.2 51 | test_split = np.random.uniform(0, 1, len(data.data)) 52 | train_x = data.data[test_split >= test_ratio] 53 | test_x = data.data[test_split < test_ratio] 54 | train_y = data.target[test_split >= test_ratio] 55 | test_y = data.target[test_split < test_ratio] 56 | 57 | rf = RandomForest() 58 | rf.fit(train_x, train_y) 59 | print((np.argmax(rf.predict(train_x), axis=1) 60 | == train_y).sum() / train_x.shape[0]) 61 | print((np.argmax(rf.predict(test_x), axis=1) 62 | == test_y).sum() / test_x.shape[0]) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /xgboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_california_housing 3 | from decision_tree import DecisionTree 4 | # TODO classification 5 | 6 | 7 | def squared_loss(y, pred): 8 | return np.square(pred - y).mean() / 2 9 | 10 | 11 | def squared_loss_gradient(y, pred): 12 | return pred - y 13 | 14 | 15 | class XGBoostRegressionTree(DecisionTree): 16 | 17 | def __init__(self, max_depth): 18 | self.lambd = 0.01 19 | self.gamma = 0.1 20 | super(XGBoostRegressionTree, self).__init__( 21 | metric_type="Gini impurity", depth=max_depth, regression=True) 22 | self.metric = self.score 23 | 24 | def gen_leaf(self, y, w): 25 | return {'label': y.dot(w) / (sum(w) + self.lambd)} 26 | 27 | def score(self, y, w): 28 | return np.square(y.dot(w)) / (sum(w) + self.lambd) 29 | 30 | def split_gain(self, p_score, l_y, r_y, l_w, r_w): 31 | return (self.metric(l_y, l_w) + self.metric(r_y, r_w) - p_score) / 2 - self.gamma 32 | 33 | # importance for each feature 34 | 35 | 36 | class XGBoost(object): 37 | 38 | def __init__(self, regression=True, max_depth=4, tree_num=20): 39 | self.regression = regression 40 | self.max_depth = max_depth 41 | self.tree_num = tree_num 42 | self.forest = [] 43 | self.shrinkage = 0.5 44 | 45 | def get_importance(self): 46 | return sum(tree.get_importance() for tree in self.forest) / self.tree_num 47 | 48 | def fit(self, x, y): 49 | pred = 0 50 | for i in range(self.tree_num): 51 | grad = squared_loss_gradient(y, pred) 52 | self.forest.append(XGBoostRegressionTree(max_depth=self.max_depth)) 53 | self.forest[i].fit(x, grad) 54 | pred -= self.forest[i].predict(x) * self.shrinkage 55 | print("tree {} constructed, loss {}".format( 56 | i, squared_loss(y, pred))) 57 | 58 | 
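    # The regression tree above follows the second-order XGBoost objective for squared
    # loss, where each instance's gradient is g_i = pred_i - y_i and its hessian is 1:
    # with instance weights w, gen_leaf returns the optimal leaf value
    #     sum(g * w) / (sum(w) + lambda),
    # score measures node quality as (sum(g * w))^2 / (sum(w) + lambda), and split_gain
    # is the gain of a split, (score_left + score_right - score_parent) / 2, minus the
    # complexity penalty gamma.  The sign flip is handled in fit and predict, which
    # subtract the shrunken tree outputs from the running prediction.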
def predict(self, x): 59 | return -np.array([tree.predict(x) * self.shrinkage for tree in self.forest]).sum(axis=0) 60 | 61 | 62 | def main(): 63 | data = fetch_california_housing(data_home='data') 64 | test_ratio = 0.2 65 | test_split = np.random.uniform(0, 1, len(data.data)) 66 | train_x = data.data[test_split >= test_ratio] 67 | test_x = data.data[test_split < test_ratio] 68 | train_y = data.target[test_split >= test_ratio] 69 | test_y = data.target[test_split < test_ratio] 70 | 71 | xgboost = XGBoost() 72 | xgboost.fit(train_x, train_y) 73 | print(xgboost.get_importance()) 74 | print(squared_loss(train_y, xgboost.predict(train_x))) 75 | print(squared_loss(test_y, xgboost.predict(test_x))) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /gradient_boosting_decision_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_california_housing 3 | from decision_tree import DecisionTree 4 | # TODO classification 5 | 6 | 7 | def squared_loss(y, pred): 8 | return np.square(pred - y).mean() / 2 9 | 10 | 11 | def squared_loss_gradient(y, pred): 12 | return pred - y 13 | 14 | 15 | def absolute_loss_gradient(y, pred): 16 | return np.sign(pred - y) 17 | 18 | 19 | class GBDT(object): 20 | 21 | def __init__(self, regression=True, tree_num=20, max_depth=4): 22 | self.regression = regression 23 | self.max_depth = max_depth 24 | self.tree_num = tree_num 25 | self.forest = [] 26 | self.rhos = np.ones(self.tree_num) 27 | self.t0 = 0 28 | self.shrinkage = 0.5 29 | 30 | def get_importance(self): 31 | return sum(tree.get_importance() for tree in self.forest) / self.tree_num 32 | 33 | def _linear_search(self, y, pred, delta): 34 | step = 0.1 35 | rhos = np.arange(step, 10, step) 36 | losses = [squared_loss(y, pred - rho * delta) for rho in rhos] 37 | return rhos[np.argmin(losses)] 38 | 39 | def fit(self, x, y): 40 | self.t0 = y.mean() # t0, which is a constant 41 | pred = y.mean() 42 | for i in range(self.tree_num): 43 | grad = squared_loss_gradient(y, pred) 44 | self.forest.append(DecisionTree( 45 | metric_type="Variance", depth=self.max_depth, regression=True)) 46 | self.forest[i].fit(x, grad) 47 | delta = self.forest[i].predict(x) 48 | # find best learning rate 49 | self.rhos[i] = self._linear_search(y, pred, delta) 50 | pred -= self.shrinkage * delta * self.rhos[i] 51 | # for categorical dataset, use cross entropy loss 52 | print("tree {} constructed, rho {}, loss {}".format( 53 | i, self.rhos[i], squared_loss(y, pred))) 54 | 55 | def predict(self, x): 56 | return self.t0 - np.array([tree.predict(x) * rho * self.shrinkage for tree, rho in zip(self.forest, self.rhos)]).sum(axis=0) 57 | 58 | 59 | def main(): 60 | data = fetch_california_housing(data_home='data') 61 | test_ratio = 0.2 62 | test_split = np.random.uniform(0, 1, len(data.data)) 63 | train_x = data.data[test_split >= test_ratio] 64 | test_x = data.data[test_split < test_ratio] 65 | train_y = data.target[test_split >= test_ratio] 66 | test_y = data.target[test_split < test_ratio] 67 | 68 | gbdt = GBDT() 69 | gbdt.fit(train_x, train_y) 70 | print(gbdt.get_importance()) 71 | print(squared_loss(train_y, gbdt.predict(train_x))) 72 | print(squared_loss(test_y, gbdt.predict(test_x))) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /deep_belief_network.py: 
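A minimal usage sketch for the GBDT defined above, assuming this repository's modules are on the import path; the synthetic regression target here is purely illustrative and not part of the original code.

import numpy as np
from gradient_boosting_decision_tree import GBDT, squared_loss

rng = np.random.RandomState(0)
x = rng.rand(500, 2)                                   # toy features in [0, 1)
y = 3 * x[:, 0] - 2 * x[:, 1] + 0.05 * rng.randn(500)  # noisy linear target
gbdt = GBDT(tree_num=10, max_depth=3)                  # fewer, shallower trees for speed
gbdt.fit(x, y)                                         # prints per-tree rho and loss
print('toy train loss:', squared_loss(y, gbdt.predict(x)))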
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits, fetch_openml 3 | from multilayer_perceptron import MLP 4 | from restricted_boltzmann_machine import RBM 5 | 6 | 7 | def softmax(x): 8 | eps = 1e-8 9 | out = np.exp(x - np.max(x, axis=1).reshape(-1, 1)) 10 | return out / (np.sum(out, axis=1).reshape(-1, 1) + eps) 11 | 12 | 13 | # this implementation reused the training of MLP for back propagation 14 | class DBN(object): 15 | 16 | def __init__(self, layers, n_labels): 17 | self.rbms = [] 18 | self.n_labels = n_labels 19 | for n_v, n_h in zip(layers[:-1], layers[1:]): 20 | self.rbms.append(RBM(n_v, n_h, epochs=10, lr=0.1)) 21 | self.dense = None 22 | 23 | def pretrain(self, x): 24 | v = x 25 | for rbm in self.rbms: 26 | rbm.fit(v) 27 | v = rbm.marginal_h(v) 28 | 29 | def finetuning(self, x, labels): 30 | # assign weights 31 | layers = [x.shape[1]] + [rbm.b.shape[1] for rbm in self.rbms] + [self.n_labels] 32 | mlp = MLP(act_type='Sigmoid', opt_type='Adam', layers=layers, 33 | epochs=20, learning_rate=0.01, lmbda=1e-2) 34 | 35 | mlp.w = [rbm.w for rbm in self.rbms] + \ 36 | [np.random.randn(self.rbms[-1].w.shape[1], self.n_labels)] 37 | mlp.b = [rbm.b for rbm in self.rbms] + \ 38 | [np.random.randn(1, self.n_labels)] 39 | mlp.fit(x, labels) 40 | # give back the weights 41 | # add the last feed-forward layer 42 | for rbm, w, b in zip(self.rbms, mlp.w[:-1], mlp.b[:-1]): 43 | rbm.w = w 44 | rbm.b = b 45 | self.dense = {'w': mlp.w[-1], 'b': mlp.b[-1]} 46 | 47 | def fit(self, x, y): 48 | self.pretrain(x) 49 | self.finetuning(x, y) 50 | 51 | def predict(self, x): 52 | for rbm in self.rbms: 53 | x = rbm.marginal_h(x) 54 | return softmax(self.dense['b'] + x.dot(self.dense['w'])) 55 | 56 | 57 | def main(): 58 | # data = load_digits() 59 | # x, y = data.data, data.target 60 | x, y = fetch_openml('mnist_784', return_X_y=True, data_home="data", as_frame=False) 61 | test_ratio = 0.2 62 | test_split = np.random.uniform(0, 1, x.shape[0]) 63 | train_x, test_x = x[test_split >= test_ratio] / \ 64 | x.max(), x[test_split < test_ratio] / x.max() 65 | train_y, test_y = y.astype(np.int_)[test_split >= test_ratio], y.astype( 66 | np.int_)[test_split < test_ratio] 67 | 68 | print('dbn training') 69 | dbn = DBN([train_x.shape[1], 100, 100], 10) 70 | dbn.fit(train_x, train_y) 71 | print('dbn train accuracy', sum( 72 | np.argmax(dbn.predict(train_x), axis=1) == train_y) / train_y.shape[0]) 73 | print('dbn test accuracy', sum( 74 | np.argmax(dbn.predict(test_x), axis=1) == test_y) / test_y.shape[0]) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /bayesian_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pandas import read_csv 3 | # an example of asia bayesian net: 4 | # https://www.eecis.udel.edu/~shatkay/Course/papers/Lauritzen1988.pdf 5 | 6 | 7 | class BayesianNet(object): 8 | 9 | def __init__(self, names, edges, tables=None): 10 | self.n_nodes = len(names) 11 | if tables is None: 12 | tables = [[0]] * self.n_nodes 13 | self.nodes = [{'name': name, 'table': np.array( 14 | table)} for name, table in zip(names, tables)] 15 | self.name2idx = {k: v for v, k in enumerate(names)} 16 | self.graph = np.zeros((self.n_nodes, self.n_nodes)) 17 | for edge in edges: 18 | self.graph[self.name2idx[edge[1]], self.name2idx[edge[0]]] = 1 19 | self.binary = np.array( 20 | [1 << 
self.n_nodes - 1 - i for i in range(self.n_nodes)]) 21 | 22 | def fit(self, data): 23 | data_size = len(data) 24 | for i, node in enumerate(self.nodes): 25 | table = [] 26 | parents = self.graph[i] == 1 27 | marginal = data[:, parents] 28 | index = np.zeros(data.shape[0]) 29 | if marginal.shape[1] > 0: 30 | index = ( 31 | marginal * self.binary[-marginal.shape[1]:]).sum(axis=1) 32 | for j in range(2**parents.sum()): 33 | table.append(data[(index == j), i].sum() / (index == j).sum()) 34 | node['table'] = np.array(table) 35 | 36 | def joint_p(self, values): 37 | p = 1 38 | for i in range(self.n_nodes): 39 | index = 0 40 | parents = self.graph[i] == 1 41 | if parents.sum() > 0: 42 | index = np.dot(values[parents], self.binary[-parents.sum():]) 43 | p *= (1 - values[i]) + (2 * values[i] - 1) * \ 44 | self.nodes[i]['table'][int(index)] 45 | return p 46 | 47 | def marginal_p(self, condition): 48 | p = 0 49 | values = -np.ones(self.n_nodes) 50 | for v in condition: 51 | values[self.name2idx[v[1]]] = int(v[0] != '~') 52 | mask = np.arange(self.n_nodes)[(values == -1)] 53 | n_unkowns = self.n_nodes - len(condition) 54 | for i in range(2**n_unkowns): 55 | values[mask] = np.array( 56 | [int(x) for x in '{:0{size}b}'.format(i, size=n_unkowns)]) 57 | p += self.joint_p(values) 58 | return p 59 | 60 | def query(self, v, condition): 61 | p_pos = self.marginal_p([f'+{v}'] + condition) / self.marginal_p(condition) 62 | return [1 - p_pos, p_pos] 63 | 64 | 65 | def get_asia_data(url): 66 | return read_csv(url).apply(lambda x: x == 'yes').astype(int).values 67 | 68 | 69 | def main(): 70 | names = 'ATSLBEXD' 71 | edges = ['AT', 'SL', 'SB', 'TE', 'LE', 'BD', 'EX', 'ED'] 72 | #tables = [[0.01], [0.01, 0.05], [0.5], [0.01, 0.1], [0.3, 0.6], [0, 1, 1, 1], [0.05, 0.98], [0.1, 0.7, 0.8, 0.9]] 73 | # also can use predefined conditional tables 74 | bn = BayesianNet(list(names), edges) 75 | asia_url = 'http://www.ccd.pitt.edu/wiki/images/ASIA10k.csv' 76 | bn.fit(get_asia_data(asia_url)) 77 | print(bn.nodes) 78 | for condition in [[], ['+A', '~S'], ['+A', '~S', '~D', '+X']]: 79 | for c in ['T', 'L', 'B', 'E']: 80 | print('p({}|{})={}'.format(c, ','.join( 81 | condition), bn.query(c, condition))) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /decision_boundary_visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from multilayer_perceptron import MLP 4 | from gradient_boosting_decision_tree import GBDT 5 | from xgboost import XGBoost 6 | from random_forest import RandomForest 7 | from adaboost import AdaBoost 8 | from factorization_machines import FactorizationMachines 9 | from support_vector_machine import SVM 10 | from k_nearest_neighbor import kNearestNeighbor 11 | 12 | 13 | def gen_linear(train_num): 14 | x = 2 * np.random.random((train_num, 2)) - 1 15 | return x, (x.sum(axis=1) > 0) * 1 16 | 17 | 18 | def gen_circle(train_num): 19 | x = 2 * np.random.random((train_num, 2)) - 1 20 | return x, (np.square(x).sum(axis=1) > 0.6) * 1 21 | 22 | 23 | def gen_xor(train_num): 24 | x = 2 * np.random.random((train_num, 2)) - 1 25 | return x, np.array([(xi[0] * xi[1] > 0) for xi in x]) * 1 26 | 27 | 28 | def gen_spiral(train_num): 29 | r = 0.8 * np.arange(train_num) / train_num 30 | y = np.arange(train_num) % 2 31 | t = 1.75 * r * 2 * np.pi + y * np.pi 32 | x = np.c_[r * np.sin(t) + np.random.random(train_num) / 33 | 10, r * 
np.cos(t) + np.random.random(train_num) / 10] 34 | return x, y * 1 35 | 36 | 37 | def gen_moon(train_num): 38 | y = np.arange(train_num) % 2 39 | x0 = (y - 0.5) * (.5 - np.cos(np.linspace(0, np.pi, train_num))) + \ 40 | np.random.random(train_num) / 10 41 | x1 = (y - 0.5) * (.5 - 2 * np.sin(np.linspace(0, np.pi, train_num)) 42 | ) + np.random.random(train_num) / 10 43 | return np.c_[x0, x1], y 44 | 45 | # visualize decision boundary change 46 | 47 | 48 | def boundary_vis_plots(model, x, y, subplot=[1, 1, 1]): 49 | plt.subplot(subplot[0], subplot[1], subplot[2]) 50 | xx, yy = np.meshgrid(np.linspace(-1, 1, 50), np.linspace(-1, 1, 50)) 51 | pred = model.predict(np.c_[xx.ravel(), yy.ravel()]) 52 | zz = pred.reshape(xx.shape) if len(pred.shape) == 1 or pred.shape[ 53 | 1] == 1 else pred[:, 1].reshape(xx.shape) 54 | if subplot[2] <= subplot[1]: 55 | plt.title(type(model).__name__) 56 | plt.contourf(xx, yy, zz, levels=np.linspace( 57 | zz.min(), zz.max(), 40), cmap=plt.cm.RdBu) 58 | plt.contour(xx, yy, zz, levels=[0.5], colors='darkred') 59 | plt.scatter(x[:, 0], x[:, 1], c=np.array( 60 | ['red', 'blue'])[y], s=10, edgecolors='k') 61 | if subplot[2] == subplot[0] * subplot[1]: 62 | plt.show() 63 | 64 | 65 | def main(): 66 | data_loaders = [gen_linear, gen_circle, gen_xor, gen_spiral, gen_moon] 67 | models = [ 68 | (kNearestNeighbor, {'k': 5}), 69 | (FactorizationMachines, {'learning_rate': 1, 'embedding_dim': 1}), 70 | (SVM, {}), 71 | (AdaBoost, {'esti_num': 10}), 72 | (RandomForest, {'tree_num': 20, 'max_depth': 3}), 73 | (XGBoost, {'tree_num': 20, 'max_depth': 3}), 74 | (MLP, {'act_type': 'Tanh', 'opt_type': 'Adam', 'layers': [ 75 | 2, 8, 7, 2], 'epochs': 200, 'learning_rate': 0.5, 'lmbda': 1e-4}) 76 | ] 77 | for i, data_loader in enumerate(data_loaders): 78 | x, y = data_loader(256) 79 | for j, model in enumerate(models): 80 | clf = model[0](**model[1]) 81 | clf.fit(x, y if not j in [2, 3] else 2 * y - 1) 82 | boundary_vis_plots(clf, x, y, subplot=[len( 83 | data_loaders), len(models), len(models) * i + 1 + j]) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import inv 3 | from sklearn.datasets import load_boston 4 | 5 | 6 | def squared_loss(y, pred): 7 | return np.square(pred - y).mean() / 2 8 | 9 | 10 | def squared_loss_gradient(y, pred): 11 | return pred - y 12 | 13 | 14 | class LinearRegression(object): 15 | 16 | def __init__(self): 17 | self.learning_rate = 0.1 18 | self.embedding_dim = 1 19 | self.lmbda = 0.001 # regularization coefficient 20 | self.reg = 2 21 | self.eps = 1e-12 22 | self.optimization = False 23 | 24 | def fit(self, x, y): 25 | if self.optimization: 26 | self.optimize(x, y) 27 | else: 28 | self.matrix_solver(x, y) 29 | 30 | def optimize(self, x, y): 31 | n_dim = x.shape[1] 32 | self.b = 0 33 | self.w = np.random.randn(n_dim) 34 | self.mom_w, self.cache_w = np.zeros(n_dim), np.zeros(n_dim) 35 | self.mom_b, self.cache_b = 0, 0 36 | 37 | for i in range(5000): 38 | grad = squared_loss_gradient(y, self.predict(x)) 39 | self.adam(grad.dot(x), grad.sum(), i + 1) 40 | self.regularization() 41 | if i % 100 == 0: 42 | print('loss {}'.format(squared_loss(self.predict(x), y))) 43 | 44 | def matrix_solver(self, x, y): 45 | n_dim = x.shape[1] 46 | ext_x = np.c_[x, np.ones((x.shape[0], 1))] 47 | inv_matrix = inv(np.matmul(ext_x.T, ext_x) + 48 | 
self.lmbda * np.identity(n_dim + 1)) 49 | ext_w = np.matmul(np.matmul(inv_matrix, ext_x.T), y.reshape(-1, 1)) 50 | self.w = ext_w[:-1].flatten() 51 | self.b = ext_w[-1] 52 | 53 | def sgd(self, grad_w, grad_b): # use a very small learning rate for sgd, e.g., 1e-8 54 | self.w -= self.learning_rate * grad_w 55 | self.b -= self.learning_rate * grad_b 56 | 57 | def adam(self, grad_w, grad_b, i): 58 | beta1 = 0.9 59 | beta2 = 0.999 60 | alpha = self.learning_rate 61 | self.mom_w = beta1 * self.mom_w + (1 - beta1) * grad_w 62 | self.cache_w = beta2 * self.cache_w + (1 - beta2) * np.square(grad_w) 63 | self.w -= alpha * self.mom_w / \ 64 | (1 - beta1**i) / (np.sqrt(self.cache_w / (1 - beta2**i)) + self.eps) 65 | self.mom_b = beta1 * self.mom_b + (1 - beta1) * grad_b 66 | self.cache_b = beta2 * self.cache_b + (1 - beta2) * np.square(grad_b) 67 | self.b -= alpha * self.mom_b / \ 68 | (1 - beta1**i) / (np.sqrt(self.cache_b / (1 - beta2**i)) + self.eps) 69 | 70 | def regularization(self): 71 | if(self.reg == 1): 72 | self.w -= self.lmbda * np.sign(self.w) 73 | self.b -= self.lmbda * np.sign(self.b) 74 | elif(self.reg == 2): 75 | self.w -= self.lmbda * self.w 76 | self.b -= self.lmbda * self.b 77 | 78 | def predict(self, x): 79 | return self.b + x.dot(self.w) 80 | 81 | 82 | def main(): 83 | data = load_boston() 84 | test_ratio = 0.2 85 | test_split = np.random.uniform(0, 1, len(data.data)) 86 | train_x = data.data[test_split >= test_ratio] 87 | test_x = data.data[test_split < test_ratio] 88 | train_y = data.target[test_split >= test_ratio] 89 | test_y = data.target[test_split < test_ratio] 90 | 91 | rr = LinearRegression() 92 | rr.fit(train_x, train_y) 93 | print(squared_loss(rr.predict(train_x), train_y)) 94 | print(squared_loss(rr.predict(test_x), test_y)) 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /ant_colony.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | # ant colony for traveling salesman problem 4 | 5 | 6 | class ACO(object): 7 | 8 | def __init__(self, nodes): 9 | self.n_node = nodes.shape[0] 10 | self.pheromone = np.ones((self.n_node, self.n_node)) 11 | self.n_ants = 50 12 | self.rho = 0.1 13 | self.alpha = 1 14 | self.beta = 2 15 | self.q = 50 16 | self.distance = np.array([ 17 | np.sqrt(np.square(nodes[i] - nodes[j]).sum()) 18 | for j in range(self.n_node) for i in range(self.n_node) 19 | ]).reshape(self.n_node, -1) 20 | self.dist_inv_beta = np.array([ 21 | np.power(np.square(nodes[i] - nodes[j]).sum(), -self.beta / 2) 22 | if i != j else 0 23 | for j in range(self.n_node) for i in range(self.n_node) 24 | ]).reshape(self.n_node, -1) 25 | 26 | def generate_path(self): 27 | # pick random init node 28 | this_id, next_id = -1, np.random.choice(np.arange(self.n_node)) 29 | visited_ids = [next_id] 30 | this_distance = 0 31 | for _ in range(self.n_node - 1): 32 | this_id = next_id 33 | # available_nodes 34 | p = np.power(self.pheromone[this_id], 35 | self.alpha) * self.dist_inv_beta[this_id] 36 | p[visited_ids] = 0 37 | next_id = np.random.choice(self.n_node, 1, p=p / np.sum(p))[0] 38 | this_distance += self.distance[this_id, next_id] 39 | visited_ids.append(next_id) 40 | this_distance += self.distance[next_id, visited_ids[0]] 41 | visited_ids.append(visited_ids[0]) 42 | return visited_ids, this_distance 43 | 44 | def update_pheromone(self, paths, dists): 45 | self.pheromone *= (1 - self.rho) 46 | for 
path, dist in zip(paths, dists): 47 | for this_id, next_id in zip(path[1:], path[:-1]): 48 | self.pheromone[ 49 | this_id, next_id] += self.q * dist 50 | 51 | def optimize(self): 52 | best_path = [] 53 | min_distance = np.inf 54 | for _ in range(100): 55 | paths, dists = [], [] 56 | for _ in range(self.n_ants): 57 | this_path, this_distance = self.generate_path() 58 | paths.append(this_path) 59 | dists.append(this_distance) 60 | if min_distance > this_distance: 61 | best_path = this_path 62 | min_distance = this_distance 63 | self.update_pheromone(paths, dists) 64 | return best_path, min_distance 65 | 66 | 67 | def main(): 68 | nodes = np.array([[565.0, 575.0], [25.0, 185.0], [345.0, 750.0], [945.0, 685.0], [845.0, 655.0], [880.0, 660.0], [25.0, 230.0], [525.0, 1000.0], [580.0, 1175.0], [650.0, 1130.0], [1605.0, 620.0], [1220.0, 580.0], [1465.0, 200.0], [1530.0, 5.0], [845.0, 680.0], [725.0, 370.0], [145.0, 665.0], [415.0, 635.0], [510.0, 875.0], [560.0, 365.0], [300.0, 465.0], [520.0, 585.0], [480.0, 415.0], [835.0, 625.0], [975.0, 580.0], [1215.0, 245.0], [ 69 | 1320.0, 315.0], [1250.0, 400.0], [660.0, 180.0], [410.0, 250.0], [420.0, 555.0], [575.0, 665.0], [1150.0, 1160.0], [700.0, 580.0], [685.0, 595.0], [685.0, 610.0], [770.0, 610.0], [795.0, 645.0], [720.0, 635.0], [760.0, 650.0], [475.0, 960.0], [95.0, 260.0], [875.0, 920.0], [700.0, 500.0], [555.0, 815.0], [830.0, 485.0], [1170.0, 65.0], [830.0, 610.0], [605.0, 625.0], [595.0, 360.0], [1340.0, 725.0], [1740.0, 245.0]]) 70 | aco = ACO(nodes) 71 | best_path, min_distance = aco.optimize() 72 | print('best path:', best_path, 'length:', min_distance) 73 | plt.scatter(nodes[:, 0], nodes[:, 1]) 74 | 75 | pos = nodes[best_path] 76 | angles = pos[1:] - pos[:-1] 77 | plt.quiver(pos[:-1, 0], pos[:-1, 1], angles[:, 0], angles[:, 1], 78 | scale_units='xy', angles='xy', scale=1, width=0.004) 79 | plt.show() 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /markov_random_field.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skimage import data 3 | import matplotlib.pyplot as plt 4 | from skimage.transform import resize 5 | # Markov Random Field for image segmentation 6 | 7 | 8 | class MRF(object): 9 | 10 | def __init__(self, img): 11 | self.n_col, self.n_row = img.shape[1], img.shape[0] 12 | self.max_iter = self.n_col * self.n_row * 50 13 | self.beta = 2 14 | self.n_label = 3 15 | # init label by random 16 | self.labels = np.random.choice(self.n_label, self.n_col * self.n_row) 17 | self.label_means, self.label_vars = self.get_label_stats( 18 | img.flatten(), self.labels) 19 | 20 | def energy(self, img): 21 | energy = 0 22 | for idx in range(self.n_row * self.n_col): 23 | # sum p(x_s, y_s) 24 | mean, var = self.label_means[self.labels[ 25 | idx]], self.label_vars[self.labels[idx]] 26 | energy += np.log(np.sqrt(2 * np.pi * var)) + \ 27 | np.square(img[idx] - mean) / 2 / var 28 | # sum p(x_s, x_t) t in s neighbors 29 | for di, dj in [[0, -1], [-1, 0], [0, 1], [1, 0]]: 30 | if not(0 <= idx // self.n_col + di < self.n_row and 0 <= idx % self.n_col + dj < self.n_col): 31 | continue 32 | energy += -self.beta / 2 if self.labels[idx] == self.labels[ 33 | idx + di * self.n_col + dj] else self.beta / 2 34 | return energy 35 | 36 | def get_label_stats(self, img, labels): 37 | return [np.mean(img[labels == i]) for i in range(self.n_label)], [np.var(img[labels == i]) for i in range(self.n_label)] 38 | 
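    # The two methods below implement a Metropolis-style sweep over the pixels:
    # transition_prob computes the change in energy for relabelling one pixel (a
    # Gaussian data term per label plus the Potts smoothness term weighted by beta
    # over the 4-neighbourhood) and accepts the move with probability 1 if the energy
    # drops, otherwise exp(-delta_energy / t); optimize proposes a random new label
    # for each pixel in turn while annealing the temperature t toward zero.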
39 | def transition_prob(self, img, idx, new_label, t): 40 | new_labels = self.labels.copy() 41 | new_labels[idx] = new_label 42 | old_mean, old_var = self.label_means[ 43 | self.labels[idx]], self.label_vars[self.labels[idx]] 44 | # new_mean, new_var = np.mean(img[new_labels == new_label]), np.var(img[new_labels == new_label]) 45 | new_mean, new_var = self.label_means[ 46 | new_label], self.label_vars[new_label] 47 | delta_energy = np.log(np.sqrt(new_var / old_var)) + np.square( 48 | img[idx] - new_mean) / 2 / new_var - np.square(img[idx] - old_mean) / 2 / old_var 49 | for di, dj in [[0, -1], [-1, 0], [0, 1], [1, 0]]: # , [-1, -1], [-1, 1], [1, 1], [1, -1]]: 50 | if not(0 <= idx // self.n_col + di < self.n_row and 0 <= idx % self.n_col + dj < self.n_col): 51 | continue 52 | delta_energy += -self.beta if new_label == self.labels[ 53 | idx + di * self.n_col + dj] else self.beta 54 | delta_energy -= -self.beta if self.labels[idx] == self.labels[ 55 | idx + di * self.n_col + dj] else self.beta 56 | if delta_energy < 0: 57 | return 1 58 | else: 59 | return np.exp(-delta_energy / t) 60 | 61 | def optimize(self, img): 62 | for t in range(self.max_iter): 63 | idx = t % img.shape[0] # np.random.choice(img.shape[0]) 64 | lp = np.ones(self.n_label) 65 | lp[self.labels[idx]] = 0 66 | new_label = np.random.choice(self.n_label, p=lp / lp.sum()) 67 | prob = self.transition_prob( 68 | img=img, idx=idx, new_label=new_label, t=0.01 * (1 - t / self.max_iter)) 69 | if prob >= np.random.uniform(): 70 | self.labels[idx] = new_label 71 | self.label_means, self.label_vars = self.get_label_stats( 72 | img, self.labels) 73 | return self.labels 74 | 75 | 76 | def main(): 77 | img = data.camera() 78 | img = resize( 79 | img, (img.shape[0] // 4, img.shape[1] // 4), anti_aliasing=True) 80 | mrf = MRF(img) 81 | plt.subplot(1, 3, 1) 82 | plt.imshow(img, cmap='gray') 83 | plt.subplot(1, 3, 2) 84 | plt.imshow(mrf.labels.reshape((img.shape[0], -1)), cmap='gray') 85 | seg_img = mrf.optimize(img.flatten()) 86 | plt.subplot(1, 3, 3) 87 | plt.imshow(seg_img.reshape((img.shape[0], -1)), cmap='gray') 88 | plt.show() 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /naive_bayes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_20newsgroups 3 | import re 4 | 5 | 6 | def tokenize(documents, stop_words): 7 | text = [] 8 | for doc in documents: 9 | letters_only = re.sub("[^a-zA-Z]", " ", doc) 10 | words = letters_only.lower().split() 11 | text.append([w for w in words if not w in stop_words]) 12 | return np.array(text) 13 | 14 | 15 | class NaiveBayes(object): 16 | # multinominal NB model with laplace smoothing 17 | # guassian can be used for numerical 18 | 19 | def __init__(self): 20 | self.p_w = {} 21 | self.p_c = {} 22 | self.vocabulary = [] 23 | self.v_num = 0 24 | 25 | def fit(self, x, y): 26 | n_data = len(y) 27 | self.label, p_c = np.unique(y, return_counts=True) 28 | self.p_c = dict(zip(self.label, np.log(p_c / n_data))) 29 | indexes = np.c_[np.array(y), np.arange(n_data)] 30 | 31 | self.vocabulary = np.unique( 32 | [item for sublist in x for item in sublist]) 33 | self.v_num = len(self.vocabulary) 34 | print("vocabulary length {}".format(self.v_num)) 35 | self.v_idx = dict(zip(self.vocabulary, np.arange(self.v_num))) 36 | 37 | print("start fitting") 38 | for l in self.label: 39 | idxes = indexes[indexes[:, 0] == l][:, 1].astype(int) 40 | 
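            # Per-class word log-probabilities below use Laplace (add-one) smoothing:
            # log((count(word, class) + 1) / (total words in class + vocabulary size)),
            # so a word never seen with a class still gets a small non-zero probability.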
corpus = [x[idx] for idx in idxes] 41 | flatten = [item for sublist in corpus for item in sublist] 42 | self.p_w[l] = [ 43 | np.log(1 / (len(flatten) + self.v_num))] * self.v_num 44 | words, pwl = np.unique(flatten, return_counts=True) 45 | for w, p in zip(words, pwl): 46 | self.p_w[l][self.v_idx[w]] = np.log( 47 | (p + 1) / (len(flatten) + self.v_num)) 48 | 49 | def predict(self, x): 50 | return np.array([self.predict_sample(xi) for xi in x]) 51 | 52 | def predict_sample(self, x): 53 | eps = 1 / self.v_num 54 | p = [self.p_c[i] + sum(self.p_w[i][self.v_idx[w]] if w in self.v_idx.keys() 55 | else eps for w in x) for i in range(len(self.label))] 56 | return self.label[np.argmax(p)] 57 | 58 | 59 | def main(): 60 | stop_words = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 61 | "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", 62 | "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", 63 | "them", "their", "theirs", "themselves", "what", "which", "who", "whom", 64 | "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", 65 | "been", "being", "have", "has", "had", "having", "do", "does", "did", 66 | "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", 67 | "until", "while", "of", "at", "by", "for", "with", "about", "against", 68 | "between", "into", "through", "during", "before", "after", "above", "below", 69 | "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", 70 | "again", "further", "then", "once", "here", "there", "when", "where", "why", 71 | "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 72 | "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", 73 | "very", "s", "t", "can", "will", "just", "don", "should", "now"]) 74 | data = fetch_20newsgroups() 75 | x = tokenize(data.data, stop_words) 76 | y = data.target 77 | 78 | test_ratio = 0.2 79 | test_split = np.random.uniform(0, 1, len(x)) 80 | train_x = x[test_split >= test_ratio] 81 | test_x = x[test_split < test_ratio] 82 | train_y = y[test_split >= test_ratio] 83 | test_y = y[test_split < test_ratio] 84 | 85 | nb = NaiveBayes() 86 | nb.fit(train_x, train_y) 87 | print("predicting") 88 | print(sum(nb.predict(train_x) == train_y) / train_x.shape[0]) 89 | print(sum(nb.predict(test_x) == test_y) / test_y.shape[0]) 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /hidden_markov_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # todo: update examples 3 | 4 | 5 | class HMM(object): 6 | 7 | def __init__(self, o_num, s_num, pi=None, A=None, B=None): 8 | self.o_num = o_num 9 | self.s_num = s_num 10 | self.A = np.random.rand(s_num, s_num) if A is None else A 11 | self.A = self.A / self.A.sum(axis=1).reshape(-1, 1) 12 | self.B = np.random.rand(s_num, o_num) if B is None else B 13 | self.B = self.B / self.B.sum(axis=1).reshape(-1, 1) 14 | self.pi = np.ones(s_num) / s_num if pi is None else pi 15 | 16 | # Probability of an observed sequence 17 | def forward(self, obs): 18 | alpha = np.zeros((obs.shape[0], self.s_num)) 19 | alpha[0, :] = self.pi * self.B[:, obs[0]] 20 | for t in range(1, len(obs)): 21 | alpha[t, :] = alpha[t - 1, :].dot(self.A) * self.B[:, obs[t]] 22 | return alpha 23 | 24 | def backward(self, obs): 25 | beta = np.zeros((obs.shape[0], self.s_num)) 26 | beta[obs.shape[0] - 1, :] = np.ones((1, self.s_num)) 27 | for t in 
range(obs.shape[0] - 2, -1, -1): 28 | beta[t, :] = self.A.dot((beta[t + 1, :] * self.B[:, obs[t + 1]]).T) 29 | return beta 30 | 31 | def baum_welch(self, obs, epsilon=0.05, max_it=100): 32 | it = 0 33 | obs_indicator = np.zeros((len(obs), self.o_num)) 34 | obs_indicator[np.arange(len(obs)), obs] = 1 35 | error = epsilon + 1 36 | while(error > epsilon and it < 100): 37 | alpha = self.forward(obs) 38 | beta = self.backward(obs) 39 | 40 | # E step 41 | xi = np.zeros((self.s_num, self.s_num)) 42 | likelihood = (alpha * beta).T 43 | gamma = likelihood / likelihood.sum(axis=0).reshape(1, -1) 44 | for t in range(0, len(obs) - 1): 45 | xit = alpha[ 46 | t].reshape(-1, 1).dot((beta[t + 1] * self.B[:, obs[t + 1]]).reshape(1, -1)) * self.A 47 | xi += xit / xit.sum() 48 | 49 | # M step 50 | self.pi = gamma[:, 0] 51 | A = xi / gamma[:, :-1].sum(axis=1).reshape(-1, 1) 52 | B = gamma.dot(obs_indicator) / gamma.sum(axis=1).reshape(-1, 1) 53 | 54 | error = (np.abs(A - self.A)).max() + (np.abs(B - self.B)).max() 55 | it += 1 56 | self.A, self.B = A, B 57 | 58 | def viterbi(self, obs): 59 | v = self.pi * self.B[:, obs[0]] 60 | vpath = np.arange(self.s_num).reshape(-1, 1).tolist() 61 | for i in range(1, len(obs)): 62 | prev = np.array([np.argmax(v * self.A[:, n]) 63 | for n in range(self.s_num)]) 64 | v = v[prev] * self.A[prev, 65 | np.arange(self.s_num)] * self.B[:, obs[i]] 66 | vpath = [vpath[prev[s]] + [s] for s in range(self.s_num)] 67 | return vpath[np.argmax(v)] 68 | 69 | 70 | def seq_generator(): 71 | o_num, s_num = 2, 2 72 | A = np.array([[0.4, 0.6], [0.9, 0.1]]) 73 | B = np.array([[0.49, 0.51], [0.85, 0.15]]) 74 | pi = np.array([0.5, 0.5]) 75 | q = np.random.choice(s_num, 1, p=pi)[0] 76 | v = [] 77 | for i in range(100): 78 | v.append(np.random.choice(o_num, 1, p=B[q].flatten())[0]) 79 | q = np.random.choice(s_num, 1, p=A[q].flatten())[0] 80 | obs = np.array(v) 81 | return obs, A, B, pi 82 | 83 | 84 | def main(): 85 | hmm = HMM( 86 | o_num=3, s_num=2, 87 | pi=np.array([0.6, 0.4]), 88 | A=np.array([[0.7, 0.3], [0.4, 0.6]]), 89 | B=np.array([[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]]) 90 | ) 91 | # 0, 0, 1 see example in https://en.wikipedia.org/wiki/Viterbi_algorithm 92 | print('viterbi', hmm.viterbi([2, 1, 0])) 93 | 94 | # examples here https://iulg.sitehost.iu.edu/moss/hmmcalculations.pdf 95 | hmm = HMM( 96 | o_num=2, s_num=2, 97 | pi=np.array([0.85, 0.16]), 98 | A=np.array([[0.3, 0.7], [0.1, 0.9]]), 99 | B=np.array([[0.4, 0.6], [0.5, 0.5]]) 100 | ) 101 | obs = np.array([0, 1, 1, 0]) 102 | hmm.baum_welch(obs) 103 | print('initial probabilities', hmm.pi) 104 | print('transition matrix', hmm.A) 105 | print('emission matrix', hmm.B) 106 | 107 | 108 | if __name__ == "__main__": 109 | main() 110 | -------------------------------------------------------------------------------- /simple_mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | # a simpler implementation of multilayer perceptron with backpropagation training 4 | # 2 hidden layers, with 100 and 50 perceptrons 5 | # this one use sigmoid in hiddn layer and softmax in output 6 | # set batch size and epochs before start 7 | 8 | 9 | def sigmoid(x): 10 | return 1 / (1 + np.exp(-x)) 11 | 12 | 13 | def softmax(x): 14 | eps = 1e-8 15 | out = np.exp(x - np.max(x, axis=1).reshape(-1, 1)) 16 | return out / (np.sum(out, axis=1).reshape(-1, 1) + eps) 17 | 18 | 19 | class MLP(object): 20 | 21 | def __init__(self, n_features, n_labels): 22 | ''' 23 | D_in is input 
dimension; 24 | H is hidden dimension; 25 | D_out is output dimension. 26 | ''' 27 | self.D_in, self.H1, self.H2, self.D_out = n_features, 100, 50, n_labels 28 | self.epochs, self.batch_size = 200, 32 29 | self.learning_rate = 1e-2 30 | 31 | # Randomly initialize weights 32 | self.w1 = np.random.randn(self.D_in, self.H1) 33 | self.w2 = np.random.randn(self.H1, self.H2) 34 | self.w3 = np.random.randn(self.H2, self.D_out) 35 | 36 | self.b1 = np.random.randn(1, self.H1) 37 | self.b2 = np.random.randn(1, self.H2) 38 | self.b3 = np.random.randn(1, self.D_out) 39 | 40 | def loss(self, x, y): 41 | return -(np.multiply(y, np.log(self.predict(x)))).mean() 42 | 43 | def predict(self, x): 44 | eps = 1e-8 45 | a1 = sigmoid(x.dot(self.w1) + self.b1) 46 | a2 = sigmoid(a1.dot(self.w2) + self.b2) 47 | return softmax(a2.dot(self.w3) + self.b3) 48 | 49 | def fit(self, x_train, labels): 50 | train_num = x_train.shape[0] 51 | eps = 1e-8 52 | bvec = np.ones((1, self.batch_size)) 53 | 54 | y_train = np.zeros((train_num, self.D_out)) 55 | y_train[np.arange(train_num), labels] = 1 56 | 57 | for epoch in range(self.epochs): 58 | # mini batch 59 | permut = np.random.permutation( 60 | train_num // self.batch_size * self.batch_size).reshape(-1, self.batch_size) 61 | for b_idx in range(permut.shape[0]): 62 | x, y = x_train[permut[b_idx, :]], y_train[permut[b_idx, :]] 63 | 64 | # Forward pass: compute predicted y 65 | a1 = sigmoid(x.dot(self.w1) + self.b1) 66 | a2 = sigmoid(a1.dot(self.w2) + self.b2) 67 | out = softmax(a2.dot(self.w3) + self.b3) 68 | 69 | # Backprop to compute gradients of weights with respect to loss 70 | grad_out = out - y 71 | grad_w3 = a2.T.dot(grad_out) 72 | 73 | grad_a2 = grad_out.dot(self.w3.T) 74 | grad_a2 = np.multiply(grad_a2, (a2 - np.square(a2))) 75 | grad_w2 = a1.T.dot(grad_a2) 76 | 77 | grad_a1 = grad_a2.dot(self.w2.T) 78 | grad_a1 = np.multiply(grad_a1, (a1 - np.square(a1))) 79 | grad_w1 = x.T.dot(grad_a1) 80 | 81 | # Update weights 82 | self.w1 -= self.learning_rate * grad_w1 83 | self.b1 -= self.learning_rate * bvec.dot(grad_a1) 84 | self.w2 -= self.learning_rate * grad_w2 85 | self.b2 -= self.learning_rate * bvec.dot(grad_a2) 86 | self.w3 -= self.learning_rate * grad_w3 87 | self.b3 -= self.learning_rate * bvec.dot(grad_out) 88 | print('epoch {}, loss: {}'.format( 89 | epoch, self.loss(x_train, y_train))) 90 | 91 | 92 | def main(): 93 | data = load_digits() 94 | test_ratio = 0.2 95 | test_split = np.random.uniform(0, 1, len(data.data)) 96 | train_x = data.data[test_split >= test_ratio] / data.data.max() 97 | test_x = data.data[test_split < test_ratio] / data.data.max() 98 | train_y = data.target[test_split >= test_ratio] 99 | test_y = data.target[test_split < test_ratio] 100 | 101 | mlp = MLP(train_x.shape[1], len(np.unique(data.target))) 102 | mlp.fit(train_x, train_y) 103 | print(sum(np.argmax(mlp.predict(train_x), axis=1) 104 | == train_y) / train_y.shape[0]) 105 | print(sum(np.argmax(mlp.predict(test_x), axis=1) 106 | == test_y) / test_y.shape[0]) 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /variational_autoencoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits, fetch_openml 3 | from nn_layers import FullyConnect, Activation 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | class VAE(object): 8 | 9 | def __init__(self, dim_in, dim_hidden, dim_z): 10 | self.n_epochs, 
self.batch_size = 10, 32 11 | self.C = 1 # trade off of reconstruction and KL divergence 12 | 13 | # architecture is hard-coded 14 | self.encoder_hidden = FullyConnect([dim_in], [dim_hidden], lr=1e-2) 15 | self.encoder_act = Activation(act_type='ReLU') 16 | self.encoder_mu = FullyConnect([dim_hidden], [dim_z], lr=1e-2) 17 | self.encoder_log_sigma = FullyConnect([dim_hidden], [dim_z], lr=1e-2) 18 | 19 | self.decoder_hidden = FullyConnect([dim_z], [dim_hidden], lr=1e-2) 20 | self.decoder_act_hidden = Activation(act_type='ReLU') 21 | self.decoder_out = FullyConnect([dim_hidden], [dim_in], lr=1e-2) 22 | self.decoder_act_out = Activation(act_type='Sigmoid') 23 | 24 | def fit(self, x): 25 | for epoch in range(self.n_epochs): 26 | permut = np.random.permutation( 27 | x.shape[0] // self.batch_size * self.batch_size 28 | ).reshape([-1, self.batch_size]) 29 | for b_idx in range(permut.shape[0]): 30 | x_batch = x[permut[b_idx, :]] 31 | mu, log_sigma = self.encoder_forward(x_batch) 32 | z = self.sampling(mu, log_sigma) 33 | out = self.decoder_forward(z) 34 | 35 | recon_grad = self.C * (out - x_batch) 36 | grad_d_act_out = self.decoder_act_out.gradient(recon_grad) 37 | grad_d_out = self.decoder_out.gradient(grad_d_act_out) 38 | grad_d_act_hidden = self.decoder_act_hidden.gradient( 39 | grad_d_out) 40 | grad_z = self.decoder_hidden.gradient(grad_d_act_hidden) 41 | 42 | kl_mu_grad = mu 43 | kl_sigma_grad = np.exp(2 * log_sigma) - 1 44 | grad_mu = self.encoder_mu.gradient(grad_z + kl_mu_grad) 45 | grad_log_sigma = self.encoder_log_sigma.gradient( 46 | grad_z + kl_sigma_grad) 47 | grad_e_act = self.encoder_act.gradient( 48 | grad_mu + grad_log_sigma) 49 | grad_e_hidden = self.encoder_hidden.gradient(grad_e_act) 50 | 51 | self.backward() 52 | print('epoch: {}, log loss: {}, kl loss: {}'.format( 53 | epoch, self.log_loss(out, x_batch), self.kl_loss(mu, log_sigma) 54 | )) 55 | 56 | def encoder_forward(self, x): 57 | hidden = self.encoder_hidden.forward(x) 58 | hidden = self.encoder_act.forward(hidden) 59 | mu = self.encoder_mu.forward(hidden) 60 | log_sigma = self.encoder_log_sigma.forward(hidden) 61 | return mu, log_sigma 62 | 63 | def sampling(self, mu, log_sigma): 64 | noise = np.random.randn(mu.shape[0], mu.shape[1]) 65 | return mu + noise * np.exp(log_sigma) 66 | 67 | def decoder_forward(self, z): 68 | hidden = self.decoder_hidden.forward(z) 69 | hidden = self.decoder_act_hidden.forward(hidden) 70 | out = self.decoder_out.forward(hidden) 71 | out = self.decoder_act_out.forward(out) 72 | return out 73 | 74 | def backward(self): 75 | self.decoder_act_out.backward() 76 | self.decoder_out.backward() 77 | self.decoder_act_hidden.backward() 78 | self.decoder_hidden.backward() 79 | self.encoder_mu.backward() 80 | self.encoder_log_sigma.backward() 81 | self.encoder_act.backward() 82 | self.encoder_hidden.backward() 83 | 84 | def log_loss(self, pred, x): 85 | return 0.5 * self.C * np.square(pred - x).mean() 86 | 87 | def kl_loss(self, mu, log_sigma): 88 | return 0.5 * (-2 * log_sigma + np.exp(2 * log_sigma) + np.square(mu) - 1).mean() 89 | 90 | 91 | def main(): 92 | #data = load_digits() 93 | #x, y = data.data, data.target 94 | x, _ = fetch_openml('mnist_784', return_X_y=True, data_home="data", as_frame=False) 95 | vae = VAE(x.shape[1], 64, 2) 96 | vae.fit(x / x.max()) 97 | 98 | n_rows = 11 99 | for i in range(n_rows): 100 | for j in range(n_rows): 101 | plt.subplot(n_rows, n_rows, i * n_rows + j + 1) 102 | plt.imshow( 103 | vae.decoder_forward( 104 | np.array([[(i - n_rows // 2) / 2, (j - n_rows // 2) / 
2]])).reshape(28, 28), 105 | cmap='gray', vmin=0, vmax=1 106 | ) 107 | plt.show() 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /restricted_boltzmann_machine.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits, fetch_openml 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | class RBM(object): 11 | 12 | def __init__(self, n_v, n_h, epochs=50, lr=0.05): 13 | self.w = np.random.randn(n_v, n_h) 14 | self.a = np.random.randn(1, n_v) 15 | self.b = np.random.randn(1, n_h) 16 | 17 | self.mom_w, self.cache_w = np.zeros_like(self.w), np.zeros_like(self.w) 18 | self.mom_a, self.cache_a = np.zeros_like(self.a), np.zeros_like(self.a) 19 | self.mom_b, self.cache_b = np.zeros_like(self.b), np.zeros_like(self.b) 20 | 21 | self.lr = lr 22 | self.batch_size = 16 23 | self.max_epochs = epochs 24 | self.decay = 1 - 1e-4 25 | 26 | def fit(self, v): 27 | beta1 = 0.9 28 | beta2 = 0.999 29 | eps = 1e-20 30 | 31 | train_num = v.shape[0] 32 | for j in range(self.max_epochs): 33 | permut = np.random.permutation( 34 | train_num // self.batch_size * self.batch_size).reshape(-1, self.batch_size) 35 | for i in range(permut.shape[0]): 36 | v0 = v[permut[i], :] 37 | p_h0 = self.marginal_h(v0) 38 | h0 = 1 * (p_h0 >= np.random.uniform(0, 1, 39 | (self.batch_size, self.b.shape[1]))) 40 | v1 = self.marginal_v(h0) 41 | p_h1 = self.marginal_h(v1) 42 | h1 = 1 * (p_h1 >= np.random.uniform(0, 1, 43 | (self.batch_size, self.b.shape[1]))) 44 | 45 | grad_w = np.matmul(v1.T, p_h1) - np.matmul(v0.T, p_h0) 46 | grad_b = np.matmul(np.ones((1, self.batch_size)), p_h1 - p_h0) 47 | grad_a = np.matmul(np.ones((1, self.batch_size)), v1 - v0) 48 | 49 | alpha = self.lr / self.batch_size 50 | mom_scaler = 1 - beta1 ** (j + 1) 51 | cache_scaler = 1 - beta2 ** (j + 1) 52 | 53 | self.mom_w = beta1 * self.mom_w + (1 - beta1) * grad_w 54 | self.cache_w = beta2 * self.cache_w + \ 55 | (1 - beta2) * np.square(grad_w) 56 | self.w -= alpha * self.mom_w / mom_scaler / \ 57 | (np.sqrt(self.cache_w / cache_scaler) + eps) 58 | self.mom_b = beta1 * self.mom_b + (1 - beta1) * grad_b 59 | self.cache_b = beta2 * self.cache_b + \ 60 | (1 - beta2) * np.square(grad_b) 61 | self.b -= alpha * self.mom_b / mom_scaler / \ 62 | (np.sqrt(self.cache_b / cache_scaler) + eps) 63 | self.mom_a = beta1 * self.mom_a + (1 - beta1) * grad_a 64 | self.cache_a = beta2 * self.cache_a + \ 65 | (1 - beta2) * np.square(grad_a) 66 | self.a -= alpha * self.mom_a / mom_scaler / \ 67 | (np.sqrt(self.cache_a / cache_scaler) + eps) 68 | 69 | self.w *= self.decay 70 | self.a *= self.decay 71 | self.b *= self.decay 72 | if j % 10 == 9: 73 | print('squared loss', np.square( 74 | self.marginal_v(self.marginal_h(v)) - v).sum()) 75 | # print(np.around(self.marginal_v(self.marginal_h(v)), 3)) 76 | 77 | def marginal_v(self, h): 78 | return sigmoid(self.a + np.matmul(h, self.w.T)) 79 | 80 | def marginal_h(self, v): 81 | return sigmoid(self.b + np.matmul(v, self.w)) 82 | 83 | 84 | def main(): 85 | # data = load_digits() 86 | # x, y = data.data, data.target 87 | x, y = fetch_openml('mnist_784', return_X_y=True, data_home="data", as_frame=False) 88 | test_ratio = 0.2 89 | test_split = np.random.uniform(0, 1, x.shape[0]) 90 | train_x, test_x = x[test_split >= test_ratio] / \ 91 | x.max(), x[test_split < test_ratio] / x.max() 92 | train_y, test_y = 
y.astype(np.int_)[test_split >= test_ratio], y.astype( 93 | np.int_)[test_split < test_ratio] 94 | 95 | rbm = RBM(x.shape[1], 64) 96 | rbm.fit(train_x) 97 | print(np.square(rbm.marginal_v(rbm.marginal_h(train_x)) - train_x).sum()) 98 | print(np.square(rbm.marginal_v(rbm.marginal_h(test_x)) - test_x).sum()) 99 | 100 | for i in range(10): 101 | plt.subplot(2, 10, i + 1) 102 | plt.imshow(test_x[test_y == i].mean(axis=0).reshape( 103 | 28, 28), cmap='gray', vmin=0, vmax=1) 104 | plt.subplot(2, 10, i + 11) 105 | plt.imshow(rbm.marginal_v(rbm.marginal_h(test_x[test_y == i])).mean( 106 | axis=0).reshape(28, 28), cmap='gray', vmin=0, vmax=1) 107 | plt.show() 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /evolutionary_algorithm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | import matplotlib.pyplot as plt 4 | ''' 5 | this code uses an EA to find the weights of an MLP which learns the digits dataset 6 | the MLP contains a hidden layer of 32 neurons 7 | sigmoid is used as the hidden activation and softmax as the output activation. 8 | ''' 9 | 10 | 11 | def tanh(x): 12 | return np.tanh(x) 13 | 14 | 15 | def sigmoid(x): 16 | return 1 / (1 + np.exp(-x)) 17 | 18 | 19 | def softmax(x): 20 | eps = 1e-8 21 | out = np.exp(x - np.max(x, axis=1).reshape(-1, 1)) 22 | return out / (np.sum(out, axis=1).reshape(-1, 1) + eps) 23 | 24 | 25 | class NN(object): 26 | 27 | def __init__(self, in_dim=None, h_dim=None, out_dim=None, w1=None, b1=None, w2=None, b2=None): 28 | self.w1 = np.random.randn(in_dim, h_dim) if w1 is None else w1 29 | self.b1 = np.random.randn(1, h_dim) if b1 is None else b1 30 | self.w2 = np.random.randn(h_dim, out_dim) if w2 is None else w2 31 | self.b2 = np.random.randn(1, out_dim) if b2 is None else b2 32 | 33 | def loss(self, x, y): # using cross entropy as loss function 34 | eps = 1e-8 35 | return -(np.multiply(y, np.log(self.predict(x) + eps))).mean() 36 | 37 | def predict(self, x): 38 | o1 = sigmoid(x.dot(self.w1) + self.b1) 39 | return softmax(o1.dot(self.w2) + self.b2) 40 | 41 | 42 | class EvolutionaryAlgorithm(object): 43 | 44 | def __init__(self): 45 | self.pop_num = 50 46 | self.elitism_num = self.pop_num // 5 47 | self.gen_num = 600 48 | self.mutate_rate = 0.1 49 | 50 | def cross_over(self, w1, w2): 51 | # CROSSOVER 52 | mask = np.random.uniform(0, 1, w1.shape) 53 | return (mask > 0.5) * w1 + (mask <= 0.5) * w2 54 | 55 | def mutate(self, w): 56 | mask = np.random.uniform(0, 1, w.shape) 57 | mutate_multiplier = np.random.randn(w.shape[0], w.shape[1]) 58 | w += w * (mask <= self.mutate_rate) * mutate_multiplier 59 | 60 | mutate_range = 3 # clip to prevent weights from becoming too large or small 61 | w[w > mutate_range] = mutate_range 62 | w[w < -mutate_range] = -mutate_range 63 | 64 | def evolve(self, old_pop, x, y): 65 | eps = 1e-8 66 | fitness = np.array([1 / (p.loss(x, y) + eps) for p in old_pop]) 67 | fitness = fitness / fitness.sum() 68 | top = np.argsort(fitness)[-1:-self.elitism_num - 1:-1] 69 | new_pop = [old_pop[idx] for idx in top] 70 | for i in range(self.pop_num - self.elitism_num): 71 | # SELECTION by probabilities (fitness) 72 | idxes = np.random.choice(self.pop_num, 2, p=fitness) 73 | a = old_pop[idxes[0]] 74 | b = old_pop[idxes[1]] 75 | # CROSSOVER 76 | w1 = self.cross_over(a.w1, b.w1) 77 | b1 = self.cross_over(a.b1, b.b1) 78 | w2 = self.cross_over(a.w2, b.w2) 79 | b2 = self.cross_over(a.b2, b.b2) 80 | # MUTATION 81 | self.mutate(w1) 82 |
self.mutate(b1) 83 | self.mutate(w2) 84 | self.mutate(b2) 85 | new_pop.append(NN(w1=w1, b1=b1, w2=w2, b2=b2)) 86 | 87 | loss = [p.loss(x, y) for p in new_pop] 88 | return new_pop, loss 89 | 90 | def run(self, x, y): 91 | label_num = len(np.unique(y)) 92 | labels = np.zeros((x.shape[0], label_num)) 93 | labels[np.arange(x.shape[0]), y] = 1 94 | 95 | population = [NN(in_dim=x.shape[1], h_dim=32, out_dim=label_num) 96 | for _ in range(self.pop_num)] 97 | losslog = [] 98 | for i in range(self.gen_num): 99 | population, loss = self.evolve(population, x, labels) 100 | losslog.append([max(loss), np.mean(loss), min(loss)]) 101 | print("Gen {} max:{} min:{} mean:{}".format( 102 | i, max(loss), min(loss), np.mean(loss))) 103 | losslog = np.array(losslog) 104 | plt.plot(losslog[:, 0]) 105 | plt.plot(losslog[:, 1]) 106 | plt.plot(losslog[:, 2]) 107 | plt.legend(('max', 'mean', 'best'), loc='best') 108 | plt.title('loss over generation') 109 | plt.show() 110 | return population[np.argmin(loss)] 111 | 112 | 113 | def main(): 114 | data = load_digits() 115 | x = data.data 116 | y = data.target 117 | 118 | test_ratio = 0.2 119 | test_split = np.random.uniform(0, 1, len(x)) 120 | train_x = x[test_split >= test_ratio] 121 | test_x = x[test_split < test_ratio] 122 | train_y = y[test_split >= test_ratio] 123 | test_y = y[test_split < test_ratio] 124 | 125 | ea = EvolutionaryAlgorithm() 126 | model = ea.run(train_x, train_y) 127 | res = model.predict(test_x) 128 | print(sum(yi == np.argmax(y_hat) 129 | for y_hat, yi in zip(res, test_y)) / test_y.shape[0]) 130 | 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /temporal_difference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nn_layers import FullyConnect, Activation, Conv 3 | from minimax import MiniMax, RandomMove 4 | # Temporal difference Q learning for Tic Tac Toe / Gomoku 5 | 6 | 7 | n_size = 3 8 | n_connect = 3 9 | 10 | 11 | def is_done(board): 12 | for i in range(n_size * n_size): 13 | x, y = i % n_size, i // n_size 14 | x_end = x + n_connect 15 | x_rev_end = x - n_connect 16 | y_end = y + n_connect 17 | if ( # - 18 | x_end <= n_size and abs(board[y, x:x_end].sum()) == n_connect 19 | ) or ( # | 20 | y_end <= n_size and abs(board[y:y_end, x].sum()) == n_connect 21 | ) or ( # \ 22 | x_end <= n_size and y_end <= n_size and abs( 23 | board[range(y, y_end), range(x, x_end)].sum()) == n_connect 24 | ) or ( # / 25 | x_rev_end >= -1 and y_end <= n_size and abs( 26 | board[range(y, y_end), range(x, x_rev_end, -1)].sum()) == n_connect 27 | ): 28 | return board[y, x] 29 | return 0 30 | 31 | 32 | def transform_action(action): # generating more board by flipping and rotating 33 | y = action // n_size 34 | x = action % n_size 35 | pos = [ 36 | (y, x), (x, n_size - 1 - y), (n_size - 1 - 37 | y, n_size - 1 - x), (n_size - 1 - x, y), 38 | (y, n_size - 1 - x), (n_size - 1 - x, 39 | n_size - 1 - y), (n_size - 1 - y, x), (x, y) 40 | ] 41 | return np.array([y * n_size + x for y, x in pos]) 42 | 43 | 44 | class TD(object): 45 | 46 | def __init__(self): 47 | self.q = {} 48 | self.draw_reward = 0.6 49 | self.alpha = 0.9 50 | self.gamma = 0.95 51 | 52 | def hash(self, board): 53 | hash_str = ''.join([str(i) for i in board.tolist()]) 54 | if hash_str not in self.q: 55 | self.q[hash_str] = self.draw_reward * (1 - abs(board)) - abs(board) 56 | return hash_str 57 | 58 | def act(self, board, player=None): 59 | if 
board[np.argmax(self.q[self.hash(board)])] != 0: 60 | print('error') 61 | return np.argmax(self.q[self.hash(board)]) 62 | 63 | def fit(self): 64 | random = RandomMove() 65 | minimax = MiniMax(max_depth=9) 66 | agents = np.array([random, self]) 67 | state = np.zeros(n_size * n_size) 68 | for i in range(20001): 69 | np.random.shuffle(agents) 70 | extended_boards, extended_actions, rewards, unfinished_flags, _ = play( 71 | agents) 72 | for board_sequence, action_sequence in zip(extended_boards, extended_actions): 73 | for state, next_state, action, reward, unfinished in zip( 74 | board_sequence[ 75 | :-1], board_sequence[1:], action_sequence, rewards, unfinished_flags 76 | ): 77 | state_hash = self.hash(state) 78 | next_hash = self.hash(next_state) 79 | self.q[state_hash][action] += self.alpha * ( 80 | reward + self.gamma * unfinished * 81 | np.amax(self.q[next_hash]) - self.q[state_hash][action] 82 | ) 83 | if i % 1000 == 0: 84 | print(f'iteration {i}\t\t\twin/draw/lose') 85 | print('minimax vs. q learning', test([minimax, self])) 86 | print('q learning vs. minimax', test([self, minimax])) 87 | print('random vs. q learning', test([random, self])) 88 | print('q learning vs. random', test([self, random])) 89 | 90 | 91 | def play(agents): 92 | boards = np.zeros((8, n_size * n_size)).astype(int) 93 | winner = 0 94 | saved_actions = [] 95 | saved_states = [] 96 | for move in range(n_size * n_size): 97 | player = move % 2 * 2 - 1 98 | action_pos = agents[move % 2].act(boards[0], player) 99 | action_list = transform_action(action_pos) 100 | if isinstance(agents[move % 2], TD): 101 | saved_actions.append(action_list) 102 | saved_states.append(boards.copy()) 103 | boards[range(8), action_list] = player 104 | winner = is_done(boards[0].reshape((n_size, n_size))) 105 | if abs(winner) == 1: 106 | break 107 | saved_states.append(np.zeros((8, n_size * n_size)).astype(int)) 108 | rewards = np.zeros(len(saved_actions)) 109 | unfinished_flags = np.ones(len(saved_actions)) 110 | rewards[-1] = winner * (2 * isinstance(agents[1], TD) - 1) 111 | unfinished_flags[-1] = 0 112 | return np.transpose(saved_states, (1, 0, 2)), np.transpose(saved_actions), rewards, unfinished_flags, winner 113 | 114 | 115 | def test(agents): 116 | game_records = [0, 0, 0] 117 | for i in range(100): 118 | _, _, _, _, winner = play(agents) 119 | game_records[int(winner) + 1] += 1 120 | return game_records 121 | 122 | 123 | def main(): 124 | td = TD() 125 | td.fit() 126 | 127 | if __name__ == "__main__": 128 | main() 129 | -------------------------------------------------------------------------------- /factorization_machines.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_breast_cancer, load_boston 3 | 4 | 5 | def squared_loss(y, pred): 6 | return np.square(pred - y).mean() / 2 7 | 8 | 9 | def squared_loss_gradient(y, pred): 10 | return pred - y 11 | 12 | 13 | def sigmoid(x): 14 | return 1 / (1 + np.exp(-x)) 15 | 16 | 17 | def cross_entropy_loss(y, pred): 18 | eps = 1e-12 19 | y_hat = sigmoid(pred) 20 | return -(y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps)).mean() 21 | 22 | 23 | def cross_entropy_gradient(y, pred): 24 | return sigmoid(pred) - y 25 | 26 | 27 | class FactorizationMachines(object): 28 | 29 | def __init__(self, learning_rate=0.01, embedding_dim=5, regression=False): 30 | self.learning_rate = learning_rate 31 | self.embedding_dim = embedding_dim 32 | self.lmbda = 0.001 # regularization coefficient 33 | self.reg = 2 
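# self.reg selects the penalty applied in regularization(): 1 -> L1 (lasso-style, subtracts lmbda * sign of the weights), 2 -> L2 (ridge-style weight shrinkage)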
34 | self.eps = 1e-12 35 | if regression: 36 | self.grad_func = squared_loss_gradient 37 | self.loss_func = squared_loss 38 | else: 39 | self.grad_func = cross_entropy_gradient 40 | self.loss_func = cross_entropy_loss 41 | 42 | def fit(self, x, y): 43 | n_data = x.shape[0] 44 | n_dim = x.shape[1] 45 | self.w0 = 0 46 | self.w = np.random.randn(n_dim) 47 | self.v = np.random.randn(self.embedding_dim, n_dim) 48 | 49 | self.mom_w0, self.cache_w0 = 0, 0 50 | self.mom_w, self.cache_w = np.zeros(n_dim), np.zeros(n_dim) 51 | self.mom_v, self.cache_v = np.zeros( 52 | self.v.shape), np.zeros(self.v.shape) 53 | 54 | for i in range(5000): 55 | grad = self.grad_func(y, self.predict(x)) 56 | x_squares = np.repeat( 57 | np.square(x), self.embedding_dim, axis=0).reshape(n_data, -1, n_dim) 58 | vx = [np.matmul(vix.reshape(-1, 1), xi.reshape(1, -1)) 59 | for vix, xi in zip(x.dot(self.v.T), x)] 60 | dv = np.array(vx) - self.v * x_squares 61 | grad_v = grad.dot(dv.reshape(n_data, -1)).reshape(self.v.shape) 62 | self.adam(grad.sum(), grad.dot(x), grad_v, i + 1) 63 | self.regularization() 64 | if i % 100 == 0: 65 | print('loss {}'.format(self.loss_func(y, self.predict(x)))) 66 | 67 | def sgd(self, grad_w0, grad_w, grad_v): # use a very small learning rate for sgd, e.g., 1e-14 68 | self.w0 -= self.learning_rate * grad_w0 69 | self.w -= self.learning_rate * grad_w 70 | self.v -= self.learning_rate * grad_v 71 | 72 | def adam(self, grad_w0, grad_w, grad_v, i): 73 | beta1 = 0.9 74 | beta2 = 0.999 75 | alpha = self.learning_rate 76 | self.mom_w0 = beta1 * self.mom_w0 + (1 - beta1) * grad_w0 77 | self.cache_w0 = beta2 * self.cache_w0 + \ 78 | (1 - beta2) * np.square(grad_w0) 79 | self.w0 -= alpha * self.mom_w0 / \ 80 | (1 - beta1**i) / (np.sqrt(self.cache_w0 / (1 - beta2**i)) + self.eps) 81 | 82 | self.mom_w = beta1 * self.mom_w + (1 - beta1) * grad_w 83 | self.cache_w = beta2 * self.cache_w + (1 - beta2) * np.square(grad_w) 84 | self.w -= alpha * self.mom_w / \ 85 | (1 - beta1**i) / (np.sqrt(self.cache_w / (1 - beta2**i)) + self.eps) 86 | 87 | self.mom_v = beta1 * self.mom_v + (1 - beta1) * grad_v 88 | self.cache_v = beta2 * self.cache_v + (1 - beta2) * np.square(grad_v) 89 | self.v -= alpha * self.mom_v / \ 90 | (1 - beta1**i) / (np.sqrt(self.cache_v / (1 - beta2**i)) + self.eps) 91 | 92 | def regularization(self): 93 | if(self.reg == 1): 94 | self.w0 -= self.lmbda * np.sign(self.w0) 95 | self.w -= self.lmbda * np.sign(self.w) 96 | self.v -= self.lmbda * np.sign(self.v) 97 | elif(self.reg == 2): 98 | self.w0 -= self.lmbda * self.w0 99 | self.w -= self.lmbda * self.w 100 | self.v -= self.lmbda * self.v 101 | 102 | def predict(self, x): 103 | xvt = np.matmul(x, self.v.T) 104 | return self.w0 + x.dot(self.w) + (np.square(xvt).sum(axis=1) - np.square(x).dot(np.square(self.v).sum(axis=0))) / 2 105 | 106 | 107 | def main(): 108 | data = load_breast_cancer() # load_boston() for regression 109 | test_ratio = 0.2 110 | test_split = np.random.uniform(0, 1, len(data.data)) 111 | train_x = data.data[test_split >= test_ratio] 112 | test_x = data.data[test_split < test_ratio] 113 | train_y = data.target[test_split >= test_ratio] 114 | test_y = data.target[test_split < test_ratio] 115 | 116 | fm = FactorizationMachines(regression=False) # True for regression 117 | fm.fit(train_x, train_y) 118 | 119 | print(((fm.predict(train_x) >= 0) == train_y).sum() / train_y.shape[0]) 120 | print(((fm.predict(test_x) >= 0) == test_y).sum() / test_y.shape[0]) 121 | 122 | # for regression 123 | #print(squared_loss(fm.predict(train_x), train_y)) 
124 | #print(squared_loss(fm.predict(test_x), test_y)) 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | -------------------------------------------------------------------------------- /support_vector_machine.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_breast_cancer 3 | 4 | 5 | class SVM(object): 6 | 7 | def __init__(self): 8 | self.b = 0 9 | self.kernel = self.polynomial 10 | self.gamma = 1 11 | self.degree = 3 12 | self.C = 1 13 | 14 | def _ktt_violations(self, uy, alpha): 15 | violations = np.zeros(len(uy)) 16 | violations[uy >= 1] = self.C - alpha[uy >= 1] 17 | violations[uy <= 1] = alpha[uy <= 1] 18 | violations[uy == 1] = ( 19 | (alpha[uy == 1] >= self.C) + (alpha[uy == 1] <= 0)) * self.C / 2 20 | return violations 21 | 22 | def _select_pair_by_delta_e(self, u, y, alpha): 23 | violations = self._ktt_violations(u * y, alpha) > 0 24 | if violations.max() == 0: 25 | return -1, -1 26 | e = u - y 27 | repeat_e = np.repeat(e.reshape(1, -1), e.shape[0], axis=0) 28 | delta_e = (violations * abs((repeat_e - repeat_e.T))).flatten() 29 | idx = np.random.choice( 30 | len(delta_e), 1, p=delta_e / delta_e.sum()).sum() 31 | return idx % len(e), idx // len(e) 32 | 33 | def _select_pair_by_max_violations(self, u, y, alpha): 34 | n_data = len(y) 35 | violations = self._ktt_violations(u * y, alpha) 36 | if violations.max() == 0: 37 | return -1, -1 38 | idx1 = np.random.choice( 39 | n_data, 1, p=violations / violations.sum()).sum() 40 | delta_e = abs(u - y - u[idx1] + y[idx1]) 41 | idx2 = np.random.choice(n_data, 1, p=delta_e / delta_e.sum()).sum() 42 | return idx1, idx2 43 | 44 | def loss(self, alpha, x, y): 45 | w = np.matmul(self.supp_w.reshape(-1, 1), self.supp_w.reshape(1, -1)) 46 | return alpha.sum() - (w * self.kernel(self.supp_x, self.supp_x)).sum() / 2 47 | 48 | def fit(self, x, y): # SMO 49 | n_data = x.shape[0] 50 | self.supp_w = np.zeros(x.shape[0]) 51 | self.supp_x = x 52 | self.b = 1 53 | alpha = np.zeros(n_data) 54 | for i in range(1000): 55 | # select alpha1, alpha2 56 | u = np.sign(self.predict(x)) 57 | idx1, idx2 = self._select_pair_by_max_violations(u, y, alpha) 58 | if(idx1 == -1): 59 | break 60 | y1, y2 = y[idx1], y[idx2] 61 | 62 | # update alpha1, alpha2 63 | L = max(0, alpha[idx2] - alpha[idx1]) if y1 != y2 else max(0, 64 | alpha[idx1] + alpha[idx2] - self.C) 65 | H = min(self.C, self.C + alpha[idx2] - alpha[idx1] 66 | ) if y1 != y2 else min(self.C, alpha[idx1] + alpha[idx2]) 67 | e1, e2 = u[idx1] - y1, u[idx2] - y2 68 | k11 = self.kernel(x[[idx1]], x[[idx1]]).sum() 69 | k12 = self.kernel(x[[idx1]], x[[idx2]]).sum() 70 | k22 = self.kernel(x[[idx2]], x[[idx2]]).sum() 71 | alpha2 = min( 72 | H, max(L, alpha[idx2] + y2 * (e1 - e2) / (k11 + k22 - 2 * k12))) 73 | alpha1 = alpha[idx1] + y1 * y2 * (alpha[idx2] - alpha2) 74 | 75 | # update b 76 | b1 = self.b - e1 - y1 * \ 77 | (alpha1 - alpha[idx1]) * k11 - \ 78 | y2 * (alpha2 - alpha[idx2]) * k12 79 | b2 = self.b - e2 - y1 * \ 80 | (alpha1 - alpha[idx1]) * k12 - \ 81 | y2 * (alpha2 - alpha[idx2]) * k22 82 | if alpha1 > 0 and alpha1 < self.C: 83 | self.b = b1 84 | elif alpha2 > 0 and alpha2 < self.C: 85 | self.b = b2 86 | else: 87 | self.b = (b1 + b2) / 2 88 | 89 | # update model 90 | alpha[[idx1, idx2]] = [alpha1, alpha2] 91 | sv = (alpha != 0) 92 | self.supp_w = alpha[sv] * y[sv] 93 | self.supp_x = x[sv] 94 | if i % 100 == 0: 95 | print(self.loss(alpha, x, y)) 96 | print('support vectors:', self.supp_x) 97 | 98 | def 
predict(self, x): 99 | return self.supp_w.dot(self.kernel(self.supp_x, x)).flatten() + self.b 100 | 101 | def rbf(self, x1, x2): 102 | sub = np.array([[np.square(x1i - x2i).sum() 103 | for x2i in x2] for x1i in x1]) 104 | return np.exp(-self.gamma * sub) 105 | 106 | def polynomial(self, x1, x2): 107 | return (x1.dot(x2.T) + 1)**self.degree 108 | 109 | def linear(self, x1, x2): 110 | return x1.dot(x2.T) 111 | 112 | 113 | def main(): 114 | data = load_breast_cancer() 115 | target = data.target * 2 - 1 116 | test_ratio = 0.2 117 | test_split = np.random.uniform(0, 1, len(target)) 118 | train_x = data.data[test_split >= test_ratio] 119 | test_x = data.data[test_split < test_ratio] 120 | train_y = target[test_split >= test_ratio] 121 | test_y = target[test_split < test_ratio] 122 | svm = SVM() 123 | svm.fit(train_x, train_y) 124 | print(sum(np.sign(svm.predict(train_x)) == train_y) / train_x.shape[0]) 125 | print(sum(np.sign(svm.predict(test_x)) == test_y) / test_x.shape[0]) 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /decision_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris 3 | 4 | 5 | def weighted_histo(y, w): 6 | y_unique = np.unique(y) 7 | return [w[y == yi].sum() for yi in y_unique] / w.sum() 8 | 9 | 10 | def entropy(y, w): 11 | p = weighted_histo(y, w) 12 | return -np.sum(np.multiply(p, np.log2(p + 1e-8))) 13 | 14 | 15 | def impurity(y, w): 16 | return 1 - np.sum(np.square(weighted_histo(y, w))) 17 | 18 | 19 | def variance(y, w): 20 | mu = y.dot(w) / len(y) 21 | return np.square(y - mu).dot(w) / sum(w) 22 | 23 | 24 | class DecisionTree(object): 25 | 26 | def __init__(self, metric_type, depth, regression=False): 27 | metrics = {'Info gain': entropy, 28 | 'Gini impurity': impurity, 'Variance': variance} 29 | self.regression = regression 30 | self.metric = metrics[metric_type] 31 | self.tree = {} 32 | self.depth = depth 33 | self.gain_threshold = 1e-8 34 | 35 | def split_gain(self, p_score, l_y, r_y, l_w, r_w): 36 | total_w = sum(l_w) + sum(r_w) 37 | return p_score - (self.metric(l_y, l_w) * sum(l_w) + self.metric(r_y, r_w) * sum(r_w)) / total_w 38 | 39 | def print_tree(self, node=None, depth=0): 40 | if node is None: 41 | node = self.tree 42 | if 'f_id' in node: 43 | print('{}[X{} < {}]'.format(depth * ' ', 44 | (node['f_id'] + 1), node['value'])) 45 | self.print_tree(node['left'], depth + 1) 46 | self.print_tree(node['right'], depth + 1) 47 | else: 48 | print('{}{}'.format(depth * ' ', node)) 49 | 50 | def gen_leaf(self, y, w): 51 | if not self.regression: 52 | weighted_sum = [w[y == li].sum() for li in self.labels] 53 | node = dict(zip(self.labels, weighted_sum)) 54 | node['label'] = self.labels[np.argmax(weighted_sum)] 55 | else: 56 | node = {'label': y.dot(w) / sum(w)} 57 | return node 58 | 59 | def split(self, x, y, w, depth): 60 | if(depth >= self.depth): 61 | return self.gen_leaf(y, w) 62 | p_score = self.metric(y, w) 63 | max_gain, f_id, value = self.gain_threshold, -1, 0, 64 | splt_l_x, splt_r_x, splt_l_y, splt_r_y, splt_l_w, splt_r_w = None, None, None, None, None, None 65 | for f in self.feature_set: 66 | split_values = np.unique(x[:, f].round(decimals=4)) 67 | for split_value in split_values: 68 | l_idx, r_idx = x[:, f] < split_value, x[:, f] >= split_value 69 | l_x, l_y, l_w = x[l_idx], y[l_idx], w[l_idx] 70 | r_x, r_y, r_w = x[r_idx], y[r_idx], w[r_idx] 71 | if(len(l_x) * 
len(r_x) == 0): 72 | continue 73 | gain = self.split_gain(p_score, l_y, r_y, l_w, r_w) 74 | if gain > max_gain: 75 | max_gain, f_id, value = gain, f, split_value 76 | splt_l_x, splt_l_y, splt_l_w = l_x, l_y, l_w 77 | splt_r_x, splt_r_y, splt_r_w = r_x, r_y, r_w 78 | if f_id != -1: 79 | self.importance[f_id] += max_gain * sum(w) 80 | return { 81 | 'f_id': f_id, 82 | 'value': value, 83 | 'left': self.split(splt_l_x, splt_l_y, splt_l_w, depth + 1), 84 | 'right': self.split(splt_r_x, splt_r_y, splt_r_w, depth + 1) 85 | } 86 | else: 87 | return self.gen_leaf(y, w) 88 | 89 | def fit(self, x, y, w=None, feature_set=None): 90 | self.labels = np.unique(y) 91 | self.feature_set = np.arange( 92 | x.shape[1]) if feature_set is None else feature_set 93 | self.importance = np.zeros(x.shape[1]) 94 | self.tree = self.split(x, y, np.ones( 95 | x.shape[0]) if w is None else w, 0) 96 | self.importance /= len(x) 97 | 98 | def predict(self, x): 99 | return np.array([self.predict_sample(xi) for xi in x]) 100 | 101 | def predict_sample(self, sample, node=None): 102 | if node is None: 103 | node = self.tree 104 | if 'f_id' in node: 105 | child = node['left'] if(sample[node['f_id']] < node[ 106 | 'value']) else node['right'] 107 | return self.predict_sample(sample, child) 108 | else: 109 | return node['label'] 110 | 111 | def get_importance(self): 112 | return self.importance 113 | 114 | 115 | def main(): 116 | data = load_iris() 117 | test_ratio = 0.2 118 | test_split = np.random.uniform(0, 1, len(data.data)) 119 | train_x, test_x = data.data[test_split >= 120 | test_ratio], data.data[test_split < test_ratio] 121 | train_y, test_y = data.target[ 122 | test_split >= test_ratio], data.target[test_split < test_ratio] 123 | 124 | dt = DecisionTree(metric_type='Gini impurity', depth=4) 125 | dt.fit(train_x, train_y) 126 | dt.print_tree() 127 | print(dt.importance) 128 | print(sum(dt.predict(train_x) == train_y) / len(train_x)) 129 | print(sum(dt.predict(test_x) == test_y) / len(test_x)) 130 | 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /minimax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # implements Minimax for Tic Tac Toe / Gomoku 3 | 4 | 5 | n_size = 3 6 | n_connect = 3 7 | 8 | 9 | def is_done(board): 10 | for i in range(n_size * n_size): 11 | x, y = i % n_size, i // n_size 12 | x_end = x + n_connect 13 | x_rev_end = x - n_connect 14 | y_end = y + n_connect 15 | if ( # - 16 | x_end <= n_size and abs(board[y, x:x_end].sum()) == n_connect 17 | ) or ( # | 18 | y_end <= n_size and abs(board[y:y_end, x].sum()) == n_connect 19 | ) or ( # \ 20 | x_end <= n_size and y_end <= n_size and abs( 21 | board[range(y, y_end), range(x, x_end)].sum()) == n_connect 22 | ) or ( # / 23 | x_rev_end >= -1 and y_end <= n_size and abs( 24 | board[range(y, y_end), range(x, x_rev_end, -1)].sum()) == n_connect 25 | ): 26 | return board[y, x] 27 | return 0 28 | 29 | 30 | def play(agents): 31 | board = np.zeros(n_size * n_size).astype(int) 32 | record = np.zeros(n_size * n_size) 33 | winner = 0 34 | n_moves = 0 35 | 36 | for move in range(n_size * n_size): 37 | n_moves += 1 38 | player = move % 2 * 2 - 1 39 | action_pos = agents[move % 2].act(board, player) 40 | record[action_pos] = n_moves 41 | board[action_pos] = player 42 | winner = is_done(board.reshape((n_size, n_size))) 43 | if abs(winner) == 1: 44 | break 45 | return record.reshape((n_size, n_size)), winner 46 | 47 | 48 | def 
test(agents): 49 | game_records = [0, 0, 0] 50 | for i in range(100): 51 | idx = [0, 1] # np.random.permutation([0, 1]).astype(int) 52 | board, winner = play([agents[idx[0]], agents[idx[1]]]) 53 | game_records[-int(winner) * (2 * idx[0] - 1) + 1] += 1 54 | return game_records 55 | 56 | 57 | class RandomMove(object): 58 | 59 | def act(self, board, player): 60 | return np.random.choice(n_size * n_size, p=(1 - np.abs(board)) / (1 - abs(board)).sum()) 61 | 62 | 63 | class MiniMax(object): 64 | 65 | def __init__(self, max_depth=4): 66 | self.cache = {} 67 | self.max_depth = max_depth 68 | 69 | def heuristic(self, board, player): 70 | # a. in [1, -1], b. score(player1) = -score(player2) 71 | evals = [0, 0] # for player -1 1 72 | for i in range(n_size * n_size): 73 | if board[i] == 0: 74 | continue 75 | evals[(board[i] + 1 ) // 2] += \ 76 | (i % n_size < n_size - 1 and board[i] == board[i + 1]) + \ 77 | (i + n_size < board.shape[0] - 1 and board[i] == board[i + n_size]) + \ 78 | (i + n_size < board.shape[0] - 1 and i % n_size > 0 and board[i] == board[i + n_size - 1]) + \ 79 | (i + n_size < board.shape[0] - 1 and i % n_size < 80 | n_size - 1 and board[i] == board[i + n_size + 1]) 81 | return (-evals[0] * player + evals[1] * player) / (evals[0] + evals[1] + 1) 82 | 83 | def score(self, board, player, depth, alpha, beta): 84 | board_str = ''.join([str(i) for i in board]) 85 | if board_str in self.cache: # cached before 86 | return self.cache[board_str] 87 | winner = is_done(board.reshape(n_size, n_size)) 88 | if np.abs(board).sum() == board.shape[0] or winner != 0: # game end 89 | self.cache[board_str] = ([], winner * player) 90 | return [], winner * player 91 | if depth >= self.max_depth: 92 | return [], self.heuristic(board, player) 93 | # a value less than -1 so next step can pick a legal move 94 | board_scores = np.ones(board.shape[0]) * -2 95 | heuristics_used = np.zeros(board.shape[0]) 96 | for i in range(board.shape[0]): 97 | if board[i] != 0: 98 | continue 99 | board[i] = player 100 | board_scores[i] = -self.score(board, -player, depth + 1, alpha, beta)[1] 101 | heuristics_used[i] = ''.join([str(i) for i in board]) not in self.cache 102 | board[i] = 0 103 | if(player == -1): 104 | alpha = max(np.max(board_scores), alpha) 105 | else: 106 | beta = max(np.max(board_scores), beta) 107 | # alpha beta pruning will reduce the # returned choice of winning moves 108 | if alpha > -beta or (player == -1 and alpha == 1) or (player == 1 and beta == 1): 109 | break 110 | best_score = np.amax(board_scores) 111 | best_moves = [i for i in range(board.shape[0]) if board_scores[ 112 | i] == best_score] 113 | if heuristics_used.sum() == 0 or (best_score == 1 and heuristics_used[best_moves].sum() == 0): 114 | self.cache[board_str] = best_moves, best_score 115 | return best_moves, best_score 116 | 117 | def act(self, board, player): 118 | return np.random.choice(self.score(board, player, 0, -2, -2)[0]) 119 | 120 | 121 | def main(): 122 | minimax = MiniMax() 123 | random = RandomMove() 124 | print('\t\t\t\twin/draw/lose') 125 | print('minimax vs. minimax', test([minimax, minimax])) 126 | print('random vs. minimax', test([random, minimax])) 127 | print('minimax vs. 
random', test([minimax, random])) 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /transfer_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_openml 3 | from nn_layers import Conv, MaxPooling, FullyConnect, Activation, Softmax, BatchNormalization 4 | 5 | # learning on MNIST digit 0, 1, 2, 3, 4 and transfer to 5, 6, 7, 8, 9 6 | class NN(object): 7 | 8 | def __init__(self, layers): 9 | self.layers = layers 10 | self.batch_size = 32 11 | self.epochs = 3 12 | 13 | def predict(self, x): 14 | out = x 15 | for layer in self.layers: 16 | out = layer.predict_forward(out) if isinstance( 17 | layer, BatchNormalization) else layer.forward(out) 18 | return out 19 | 20 | def forward(self, x): 21 | out = x 22 | for layer in self.layers: 23 | out = layer.forward(out) 24 | return out 25 | 26 | def gradient(self, grad_loss): 27 | grad = grad_loss 28 | for layer in self.layers[::-1]: 29 | grad = layer.gradient(grad) 30 | return grad 31 | 32 | def backward(self): 33 | for layer in self.layers: 34 | layer.backward() 35 | 36 | def predict(self, x): 37 | out = x 38 | for layer in self.layers: 39 | out = layer.predict_forward(out) if isinstance( 40 | layer, BatchNormalization) else layer.forward(out) 41 | return out 42 | 43 | 44 | def fit(self, x, labels): 45 | train_num = x.shape[0] 46 | n_labels = 5 47 | y = np.zeros((train_num, n_labels)) 48 | y[np.arange(train_num), labels] = 1 49 | 50 | for epoch in range(self.epochs): 51 | permut = np.random.permutation( 52 | x.shape[0] // self.batch_size * self.batch_size).reshape([-1, self.batch_size]) 53 | total_loss = 0 54 | count = 0 55 | for batch_idx in permut: 56 | pred = self.forward(x[batch_idx]) 57 | loss = self.layers[-1].loss(pred, y[batch_idx]) 58 | total_loss += loss 59 | 60 | if count % 100 == 0: 61 | print("epoch {} batch {} loss: {}".format( 62 | epoch, count, loss)) 63 | count += 1 64 | 65 | # the last softmax layer calculates the pred - y 66 | self.gradient(y[batch_idx]) 67 | self.backward() 68 | print('avg batch loss', total_loss / permut.shape[0]) 69 | 70 | 71 | class TransferLearning(object): 72 | def __init__(self): 73 | self.lr = 0.001 74 | self.n_labels = 5 75 | 76 | def train(self, x, y): 77 | lr = self.lr 78 | conv1 = Conv(in_shape=x.shape[1:4], k_num=6, k_size=5, lr=lr) 79 | bn1 = BatchNormalization(in_shape=conv1.out_shape, lr=lr) 80 | relu1 = Activation(act_type="ReLU") 81 | pool1 = MaxPooling(in_shape=conv1.out_shape, k_size=2) 82 | conv2 = Conv(in_shape=pool1.out_shape, k_num=16, k_size=3, lr=lr) 83 | bn2 = BatchNormalization(in_shape=conv2.out_shape, lr=lr) 84 | relu2 = Activation(act_type="ReLU") 85 | pool2 = MaxPooling(in_shape=conv2.out_shape, k_size=2) 86 | fc = FullyConnect(pool2.out_shape, [self.n_labels], lr=lr) 87 | softmax = Softmax() 88 | 89 | nn = NN([ 90 | conv1, bn1, relu1, pool1, 91 | conv2, bn2, relu2, pool2, 92 | fc, softmax 93 | ]) 94 | nn.fit(x, y) 95 | return nn 96 | 97 | 98 | def transfer(self, x, y, nn): 99 | for layer in nn.layers[:-2]: 100 | x = layer.predict_forward(x) if isinstance( 101 | layer, BatchNormalization) else layer.forward(x) 102 | 103 | nn_top = NN([ 104 | FullyConnect(nn.layers[-3].out_shape, [self.n_labels], lr=self.lr), 105 | Softmax() 106 | ]) 107 | nn_top.fit(x, y) 108 | return NN(nn.layers[:-2] + nn_top.layers) 109 | 110 | 111 | def main(): 112 | x_all, y_all = fetch_openml('mnist_784', 
return_X_y=True, data_home="data", as_frame=False) 113 | x_all = x_all.reshape(-1, 1, 28, 28) 114 | test_ratio = 0.2 115 | tl = TransferLearning() 116 | 117 | for mode_type in ['original', 'transferred']: 118 | index = (y_all <= '4') if mode_type == 'original' else (y_all > '4') 119 | x = x_all[index] 120 | y = y_all[index] 121 | test_split = np.random.uniform(0, 1, x.shape[0]) 122 | train_x, train_y = x[test_split >= test_ratio] / \ 123 | x.max(), y.astype(np.int_)[test_split >= test_ratio] 124 | test_x, test_y = x[test_split < test_ratio] / \ 125 | x.max(), y.astype(np.int_)[test_split < test_ratio] 126 | if mode_type == 'original': 127 | print('train the first model') 128 | nn = tl.train(train_x, train_y) 129 | else: 130 | train_y = train_y - 5 # for one hot encoding purpose 131 | test_y = test_y - 5 # for one hot encoding purpose 132 | print('transfer to the second model') 133 | nn = tl.transfer(train_x, train_y, nn) 134 | print(nn.layers) 135 | print('model performance') 136 | print('train set accuracy', sum(np.argmax(nn.predict(train_x), axis=1) == train_y) / train_y.shape[0]) 137 | print('test set accuracy', sum(np.argmax(nn.predict(test_x), axis=1) == test_y) / test_y.shape[0]) 138 | 139 | 140 | if __name__ == "__main__": 141 | main() 142 | -------------------------------------------------------------------------------- /optimization_visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | # implements and visualize Gradient Descent, Momentum, Nesterov, AdaGrad, 4 | # RMSprop, Adam, and Simulated Annealing 5 | 6 | 7 | class Optimization(object): 8 | 9 | def __init__(self): 10 | self.optimizers = {'GD': self.gradient_descent, 'Momentum': self.momentum, 'Nesterov': self.nesterov, 11 | 'AdaGrad': self.adagrad, 'RMSprop': self.rmsprop, 'Adam': self.adam} 12 | self.gamma = 0.8 13 | self.eps = 1e-8 14 | self.reset() 15 | 16 | def reset(self, lr=0.001): 17 | self.pos = np.array([-10.0, -5.0]) 18 | self.mom = np.zeros_like(self.pos) 19 | self.cache = np.zeros_like(self.pos) 20 | self.adam_iter = 1 21 | self.learning_rate = lr 22 | 23 | def gradient_descent(self, grad): 24 | self.pos -= self.learning_rate * grad 25 | 26 | def momentum(self, grad): 27 | self.mom = self.gamma * self.mom + self.learning_rate * grad 28 | self.pos -= self.mom 29 | 30 | def nesterov(self, grad): 31 | mom_v_prev = self.mom 32 | self.mom = self.gamma * self.mom + self.learning_rate * grad 33 | self.pos -= ((1 + self.gamma) * self.mom - self.gamma * mom_v_prev) 34 | 35 | def adagrad(self, grad): 36 | self.cache += np.square(grad) 37 | self.pos -= self.learning_rate * grad / \ 38 | (np.sqrt(self.cache) + self.eps) 39 | 40 | def rmsprop(self, grad): 41 | self.cache = self.gamma * self.cache + \ 42 | (1 - self.gamma) * np.square(grad) 43 | self.pos -= self.learning_rate * grad / \ 44 | (np.sqrt(self.cache) + self.eps) 45 | 46 | def adam(self, grad): 47 | beta1 = 0.5 48 | beta2 = 0.8 49 | self.mom = beta1 * self.mom + (1 - beta1) * grad 50 | self.cache = beta2 * self.cache + (1 - beta2) * np.square(grad) 51 | self.pos -= self.learning_rate * self.mom / \ 52 | (1 - beta1**self.adam_iter) / \ 53 | (np.sqrt(self.cache / (1 - beta2**self.adam_iter)) + self.eps) 54 | self.adam_iter += 1 55 | 56 | def optimize(self, opt_algo, grad_func, x, y): 57 | trace = [self.pos.copy()] 58 | for i in range(30): 59 | grad = grad_func(self.pos, x, y) 60 | self.optimizers[opt_algo](grad) 61 | if np.sum(np.square(self.pos - np.array([3, 
5]))) < 1: 62 | break 63 | trace.append(self.pos.copy()) 64 | return np.array(trace) 65 | 66 | 67 | class Annealing(object): 68 | 69 | def __init__(self): 70 | self.learning_rate = 0.5 71 | self.pos = np.array([-10.0, -5.0]) 72 | self.iterations = 100 73 | 74 | def transfer_prob(self, e_old, e_new, t): 75 | if e_old > e_new: 76 | return 1 77 | else: 78 | return np.exp((e_old - e_new) / t) 79 | 80 | def annealing(self, x, y): 81 | trace = [self.pos.copy()] 82 | for i in range(self.iterations): 83 | t = 1 - i / self.iterations 84 | radius, theta = 5 * self.learning_rate * np.random.uniform(), np.random.uniform() * \ 85 | 2 * np.pi - np.pi 86 | pos_next = self.pos + radius * \ 87 | np.array([np.cos(theta), np.sin(theta)]) 88 | p = self.transfer_prob(loss(self.pos, x, y), 89 | loss(pos_next, x, y), t) 90 | if p >= np.random.uniform(): 91 | self.pos = pos_next 92 | trace.append(self.pos.copy()) 93 | if np.sum(np.square(self.pos - np.array([3, 5]))) < 1: 94 | break 95 | return np.array(trace) 96 | 97 | 98 | def loss(w, x, y): # w:1*2 99 | return np.mean(np.square(w.reshape((1, 2)).dot(x) - y)) / 2 100 | 101 | 102 | def grad(w, x, y): 103 | y_hat = w.dot(x) 104 | return (y_hat - y).reshape(1, -1).dot(x.T).flatten() 105 | 106 | 107 | def main(): 108 | dim = 400 109 | x = np.linspace(-1, 1, dim) 110 | y = 3 * x + 5 + np.random.randn(dim) 111 | x_expand = np.concatenate([x.reshape((1, dim)), np.ones((1, dim))], axis=0) 112 | w_mesh, b_mesh = np.meshgrid( 113 | np.linspace(-12, 15, 100), np.linspace(-5, 15, 100)) 114 | loss_grid = np.array([ 115 | loss(np.array([w, b]), x_expand, y) 116 | for w, b in zip(np.ravel(w_mesh), np.ravel(b_mesh)) 117 | ]) 118 | plt.contour(w_mesh, b_mesh, loss_grid.reshape( 119 | w_mesh.shape), 70, cmap='bwr_r', alpha=0.5) 120 | opt = Optimization() 121 | an = Annealing() 122 | for i, opt_algo, lr in zip(range(7), ['GD', 'Momentum', 'Nesterov', 'AdaGrad', 'RMSprop', 'Adam', 'Annealing'], [0.0035, 0.0005, 0.0006, 10, 2, 5, 0.5]): 123 | if opt_algo == 'Annealing': 124 | trace = an.annealing(x_expand, y) 125 | else: 126 | opt.reset(lr) 127 | trace = opt.optimize(opt_algo, grad, x_expand, y) 128 | print(f'{opt_algo} finished with {trace.shape[0]} steps') 129 | angles = trace[1:] - trace[:-1] 130 | q = plt.quiver(trace[:-1, 0], trace[:-1, 1], angles[:, 0], angles[:, 1], 131 | scale_units='xy', angles='xy', scale=1, color=plt.cm.get_cmap('Set1')(i), alpha=1, width=0.004) 132 | plt.quiverkey(q, X=1.06, Y=0.9 - i * 0.1, U=1, label=opt_algo) 133 | plt.show() 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /convolutional_neural_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_openml 3 | from nn_layers import Conv, MaxPooling, FullyConnect, Activation, Softmax, BatchNormalization 4 | # This implements Lenet-4, test on MNIST dataset 5 | # gradient check for all layers for input x, w, b 6 | 7 | 8 | class CNN(object): 9 | 10 | def __init__(self, x_shape, label_num): 11 | self.batch_size, lr = 32, 1e-3 12 | # Conv > Normalization > Activation > Dropout > Pooling 13 | conv1 = Conv(in_shape=x_shape, k_num=6, k_size=5, lr=lr) 14 | bn1 = BatchNormalization(in_shape=conv1.out_shape, lr=lr) 15 | relu1 = Activation(act_type="ReLU") 16 | pool1 = MaxPooling(in_shape=conv1.out_shape, k_size=2) 17 | conv2 = Conv(in_shape=pool1.out_shape, k_num=16, k_size=3, lr=lr) 18 | bn2 = 
BatchNormalization(in_shape=conv2.out_shape, lr=lr) 19 | relu2 = Activation(act_type="ReLU") 20 | pool2 = MaxPooling(in_shape=conv2.out_shape, k_size=2) 21 | fc1 = FullyConnect(pool2.out_shape, [120], lr=lr) 22 | bn3 = BatchNormalization(in_shape=[120], lr=lr) 23 | relu3 = Activation(act_type="ReLU") 24 | fc2 = FullyConnect([120], [label_num], lr=lr) 25 | softmax = Softmax() 26 | 27 | self.layers = [ 28 | conv1, bn1, relu1, pool1, 29 | conv2, bn2, relu2, pool2, 30 | fc1, bn3, relu3, 31 | fc2, softmax 32 | ] 33 | 34 | def fit(self, train_x, labels): 35 | n_data = train_x.shape[0] 36 | train_y = np.zeros((n_data, 10)) 37 | train_y[np.arange(n_data), labels] = 1 38 | for epoch in range(3): 39 | # mini batch 40 | permut = np.random.permutation( 41 | n_data // self.batch_size * self.batch_size).reshape([-1, self.batch_size]) 42 | total_loss = 0 43 | for b_idx in range(permut.shape[0]): 44 | x0 = train_x[permut[b_idx, :]] 45 | y = train_y[permut[b_idx, :]] 46 | 47 | out = x0 48 | for layer in self.layers: 49 | out = layer.forward(out) 50 | 51 | batch_loss = self.layers[-1].loss(out, y) 52 | if b_idx % 100 == 0: 53 | print("epoch {} batch {} loss: {}".format( 54 | epoch, b_idx, batch_loss)) 55 | grad = y # the last softmax layer calculates the pred - y 56 | for layer in self.layers[::-1]: 57 | grad = layer.gradient(grad) 58 | for layer in self.layers: 59 | layer.backward() 60 | total_loss += batch_loss 61 | print('acc', self.get_accuracy(train_x, labels), 62 | 'avg batch loss', total_loss / permut.shape[0]) 63 | 64 | def predict(self, x): 65 | out = x 66 | for layer in self.layers: 67 | out = layer.predict_forward(out) if isinstance( 68 | layer, BatchNormalization) else layer.forward(out) 69 | return out 70 | 71 | def get_accuracy(self, x, label): 72 | n_correct = 0 73 | for i in range(0, x.shape[0], self.batch_size): 74 | x_batch, label_batch = x[ 75 | i: i + self.batch_size], label[i: i + self.batch_size] 76 | n_correct += sum(np.argmax(self.predict(x_batch), 77 | axis=1) == label_batch) 78 | return n_correct / x.shape[0] 79 | 80 | 81 | def gradient_check(conv=True): 82 | if conv: 83 | layera = Conv(in_shape=[16, 32, 28], k_num=12, k_size=3) 84 | layerb = Conv(in_shape=[16, 32, 28], k_num=12, k_size=3) 85 | else: 86 | layera = FullyConnect(in_shape=[16, 32, 28], out_dim=12) 87 | layerb = FullyConnect(in_shape=[16, 32, 28], out_dim=12) 88 | act_layer = Activation(act_type='Tanh') 89 | layerb.w = layera.w.copy() 90 | layerb.b = layera.b.copy() 91 | eps = 1e-4 92 | x = np.random.randn(10, 16, 32, 28) * 10 93 | for i in range(100): 94 | idxes = tuple((np.random.uniform(0, 1, 4) * x.shape).astype(int)) 95 | x_a = x.copy() 96 | x_b = x.copy() 97 | x_a[idxes] += eps 98 | x_b[idxes] -= eps 99 | out = act_layer.forward(layera.forward(x)) 100 | gradient = layera.gradient(act_layer.gradient(np.ones(out.shape))) 101 | 102 | delta_out = (act_layer.forward(layera.forward(x_a)) - 103 | act_layer.forward(layerb.forward(x_b))).sum() 104 | # the output should be in the order of eps*eps 105 | print(idxes, (delta_out / eps / 2 - gradient[idxes]) / eps / eps) 106 | 107 | 108 | def main(): 109 | x, y = fetch_openml('mnist_784', return_X_y=True, data_home="data", as_frame=False) 110 | x = x.reshape(-1, 1, 28, 28) 111 | 112 | test_ratio = 0.2 113 | test_split = np.random.uniform(0, 1, x.shape[0]) 114 | train_x, train_y = x[test_split >= test_ratio] / \ 115 | x.max(), y.astype(np.int_)[test_split >= test_ratio] 116 | test_x, test_y = x[test_split < test_ratio] / \ 117 | x.max(), y.astype(np.int_)[test_split < 
test_ratio] 118 | 119 | cnn = CNN(x.shape[1:4], 10) 120 | cnn.fit(train_x, train_y) 121 | print('train accuracy', cnn.get_accuracy(train_x, train_y)) 122 | print('test accuracy', cnn.get_accuracy(test_x, test_y)) 123 | 124 | if __name__ == "__main__": 125 | # gradient_check() 126 | main() 127 | -------------------------------------------------------------------------------- /simple_cnn_layers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Layer(object): 5 | 6 | def __init__(self, has_param): 7 | self.gradient_funcs = {'Adam': self.adam, "SGD": self.sgd} 8 | self.learning_rate = 1e-2 9 | self.weight_decay = 1e-4 10 | self.has_param = has_param 11 | 12 | def forward(self, x): 13 | pass 14 | 15 | def gradient(self, grad): 16 | pass 17 | 18 | def backward(self, opt_type): 19 | if self.has_param: 20 | self.regularize() 21 | self.gradient_funcs[opt_type]() 22 | 23 | def regularize(self): 24 | self.w *= (1 - self.weight_decay) 25 | self.b *= (1 - self.weight_decay) 26 | 27 | def adam(self): 28 | beta1 = 0.9 29 | beta2 = 0.999 30 | eps = 1e-8 31 | alpha = self.learning_rate 32 | self.mom_w = beta1 * self.mom_w + (1 - beta1) * self.grad_w 33 | self.cache_w = beta2 * self.cache_w + \ 34 | (1 - beta2) * np.square(self.grad_w) 35 | self.w -= alpha * self.mom_w / (np.sqrt(self.cache_w) + eps) 36 | self.mom_b = beta1 * self.mom_b + (1 - beta1) * self.grad_b 37 | self.cache_b = beta2 * self.cache_b + \ 38 | (1 - beta2) * np.square(self.grad_b) 39 | self.b -= alpha * self.mom_b / (np.sqrt(self.cache_b) + eps) 40 | 41 | def sgd(self): 42 | self.w -= self.learning_rate * self.grad_w 43 | self.b -= self.learning_rate * self.grad_b 44 | 45 | 46 | class Conv(Layer): 47 | 48 | def __init__(self, in_shape, k_size, k_num, stride=1): 49 | super(Conv, self).__init__(has_param=True) 50 | self.in_shape = in_shape 51 | channel, height, width = in_shape 52 | self.k_size = k_size 53 | self.w = np.random.randn(channel * k_size * k_size, k_num) 54 | self.b = np.random.randn(1, k_num) 55 | 56 | self.mom_w = np.zeros_like(self.w) 57 | self.cache_w = np.zeros_like(self.w) 58 | self.mom_b = np.zeros_like(self.b) 59 | self.cache_b = np.zeros_like(self.b) 60 | 61 | self.out_shape = (k_num, (height - k_size + 1) // 62 | stride, (width - k_size + 1) // stride) 63 | self.stride = stride 64 | 65 | def img2col(self, x): 66 | col_matrix = [] 67 | channel, height, width = self.in_shape 68 | for i in range(0, height - self.k_size + 1, self.stride): 69 | for j in range(0, width - self.k_size + 1, self.stride): 70 | # convert each kernel-sized square patch into a row 71 | col_matrix.append( 72 | x[:, i:i + self.k_size, j:j + self.k_size].reshape([-1])) 73 | return np.array(col_matrix) 74 | 75 | def forward(self, x): 76 | out = [] 77 | self.input = [] 78 | for i in range(x.shape[0]): 79 | self.input.append(self.img2col(x[i])) 80 | col_out = self.input[i].dot(self.w) + self.b 81 | out.append(col_out.T.reshape(self.out_shape)) 82 | return np.array(out) 83 | 84 | def col2img(self, grad_colin): 85 | k_size = self.k_size 86 | img = np.zeros(self.in_shape) 87 | for row in range(grad_colin.shape[0]): 88 | i = row // self.out_shape[2] * self.stride 89 | j = row % self.out_shape[2] * self.stride 90 | img[:, i:i + k_size, j:j + 91 | k_size] += grad_colin[row].reshape([self.in_shape[0], k_size, k_size]) 92 | return img 93 | 94 | def gradient(self, grad): 95 | batch_size = grad.shape[0] 96 | b_vec = np.ones((1, self.out_shape[1] * self.out_shape[2])) 97 | grad_out = grad.reshape([batch_size,
self.out_shape[0], -1]) 98 | self.grad_w = np.zeros(self.w.shape) 99 | self.grad_b = np.zeros(self.b.shape) 100 | grad_in = [] 101 | for i in range(batch_size): 102 | grad_out_i = grad_out[i].T 103 | self.grad_w += self.input[i].T.dot(grad_out_i) 104 | self.grad_b += b_vec.dot(grad_out_i) 105 | grad_in.append(self.col2img(grad_out_i.dot(self.w.T))) 106 | self.grad_w /= batch_size 107 | self.grad_b /= batch_size 108 | self.input = None 109 | return np.array(grad_in) 110 | 111 | 112 | class MaxPooling(Layer): 113 | 114 | def __init__(self, in_shape, k_size, stride=None): 115 | super(MaxPooling, self).__init__(has_param=False) 116 | self.in_shape = in_shape 117 | channel, height, width = in_shape 118 | self.k_size = k_size 119 | self.stride = k_size if stride is None else stride 120 | self.out_shape = [channel, height // self.stride, width // self.stride] 121 | 122 | def forward(self, x): 123 | batch_size = x.shape[0] 124 | channel, height, width = self.in_shape 125 | self.mask = np.zeros((batch_size, channel, height, width)) 126 | out = np.zeros( 127 | (batch_size, channel, self.out_shape[1], self.out_shape[2])) 128 | for b_idx in range(batch_size): 129 | for c_idx in range(channel): 130 | for i in range(0, height - self.k_size + 1, self.stride): 131 | for j in range(0, width - self.k_size + 1, self.stride): 132 | out[b_idx, c_idx, i // self.stride, j // self.stride] = \ 133 | np.max(x[b_idx, c_idx, i:i + 134 | self.k_size, j:j + self.k_size]) 135 | max_idx = np.argmax( 136 | x[b_idx, c_idx, i:i + self.k_size, j:j + self.k_size]) 137 | self.mask[b_idx, c_idx, i + max_idx // 138 | self.k_size, j + max_idx % self.k_size] = 1 139 | return out 140 | 141 | def gradient(self, grad_out): 142 | grad_out = np.repeat(grad_out, self.k_size, axis=2) 143 | grad_out = np.repeat(grad_out, self.k_size, axis=3) 144 | return np.multiply(self.mask, grad_out) 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Simple-Implementation-of-ML-Algorithms Using Only Numpy 2 | ### My simplest implementations of common ML algorithms from scratch. 3 | * Most commonly used algorithms including Regressions, Graph Models, Deep Learning, Ensemble Learning, Reinforcement Learning, Optimization Algorithms, Swarm Intelligence, Heuristic Search, and other basic ML algorithms (see the list below for more details). 4 | * For easy understanding, most files implement only a minimal version of the algorithm. 5 | * The implementation uses only numpy. 6 | * Most algorithms are within 100 lines of code.
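Every script is self-contained with a `main()` demo, so it can either be run directly (e.g. `python decision_tree.py`) or imported as a module. A minimal usage sketch (the class and argument names are taken from `decision_tree.py`; it assumes the repository root is on the Python path):

```python
from sklearn.datasets import load_iris
from decision_tree import DecisionTree

data = load_iris()
dt = DecisionTree(metric_type='Gini impurity', depth=4)
dt.fit(data.data, data.target)   # optional sample weights via w=...
dt.print_tree()                  # dump the learned splits
print((dt.predict(data.data) == data.target).mean())  # training accuracy
```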
7 | 8 | ### Example Output 9 | * Decision boundary visualization for implemented classifiers in `decision_boundary_visualization.py` 10 | ![supervised_model](supervised_model.png) 11 | 12 | * Optimization process visualization in `optimization_visualization.py`: ![optimization](optimization.png) 13 | 14 | * Digit images from the generative models trained on the MNIST dataset: 15 | * Deep Convolutional Generative Adversarial Network (DC GAN) in `generative_adversarial_network.py`: ![dc_gan](dc_gan.png) 16 | 17 | * Variational Autoencoders (VAE) in `variational_autoencoder.py`: ![vae](vae.png) 18 | 19 | * Average images by Restricted Boltzmann Machine (RBM) in `restricted_boltzmann_machine.py`: ![rbm](rbm.png) 20 | 21 | 22 | ### Implemented algorithms: 23 | 24 | #### Regression Models 25 | * Linear Regression `linear_regression.py` 26 | * Matrix solver 27 | * SGD/Adam solver 28 | * L1 regularization Lasso 29 | * L2 regularization Ridge 30 | * Logistic Regression `logistic_regression.py` 31 | * Multi-class prediction 32 | * Factorization Machines `factorization_machines.py` 33 | * Regularization 34 | * Classification/regression 35 | 36 | #### Tree Models and Ensemble Learning 37 | * Decision Tree `decision_tree.py` 38 | * Classification/regression 39 | * Different metrics 40 | * Feature importances 41 | * Sample weights 42 | * Random Forest `random_forest.py` 43 | * AdaBoost `adaboost.py` 44 | * Gradient Boosting Decision Tree (GBDT) `gradient_boosting_decision_tree.py` 45 | * Shrinkage 46 | * Line search of multiplier 47 | * XGBoost `xgboost.py` 48 | * XGBoost Regression Tree 49 | * Shrinkage 50 | 51 | #### Deep Learning 52 | * Architecture 53 | * Supervised Model 54 | * Multilayer Perceptron (MLP) `multilayer_perceptron.py` / `simple_mlp.py` 55 | * Convolutional Neural Network (CNN) `convolutional_neural_network.py` 56 | * Convolutional layer with vectorized img2col and col2img 57 | * Recurrent neural network (RNN) `recurrent_neural_network.py` 58 | * Backpropagation through time (BPTT) 59 | * Long short-term memory (LSTM) `long_short_term_memory.py` 60 | * Generative Model 61 | * Restricted Boltzmann Machine (RBM) `restricted_boltzmann_machine.py` 62 | * Deep Belief Network (DBN) `deep_belief_network.py` 63 | * Variational autoencoder (VAE) `variational_autoencoder.py` 64 | * Generative Adversarial Network (GAN) `generative_adversarial_network.py` 65 | * Deep Convolutional GAN (DCGAN) 66 | * Conditional GAN 67 | * Transfer Learning `transfer_learning.py` 68 | * CNN on MNIST - freeze convolutional layers and fine-tune dense layers 69 | * Layers `nn_layers.py` / `simple_cnn_layers.py` 70 | * Feedforward layer (dense) 71 | * Convolutional layer 72 | * Max pooling layer 73 | * Batch normalization layer 74 | * Softmax layer for classification 75 | * Activation layer 76 | * ReLU (Leaky) 77 | * Tanh (Leaky) 78 | * Sigmoid 79 | * Dropout layer (WIP) 80 | * Training `nn_layers.py` / `multilayer_perceptron.py` 81 | * Mini Batch 82 | * He initialization 83 | * Loss functions 84 | * Mean squared error for regression 85 | * Cross entropy for classification 86 | * Log loss for classification 87 | * L1/L2 Regularization 88 | * Gradient check 89 | 90 | #### Optimization Algorithms 91 | * Gradient Descent and variations `optimization_visualization.py` / `multilayer_perceptron.py` 92 | * Stochastic Gradient Descent 93 | * Gradient Descent with Momentum 94 | * Nesterov Momentum 95 | * AdaGrad 96 | * RMSProp 97 | * Adam 98 | * Monte Carlo method for global optimum 99 | * Simulated annealing `optimization_visualization.py`
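The gradient-descent variants above share a single `optimize(opt_algo, grad_func, x, y)` interface in `optimization_visualization.py`, so they can also be compared outside the plotting script. A minimal sketch (the class, function names, and learning rates are taken from that file; the repository root is assumed to be importable):

```python
import numpy as np
from optimization_visualization import Optimization, grad

# the same toy linear-regression problem the visualization uses: y = 3x + 5 + noise
dim = 400
x = np.linspace(-1, 1, dim)
y = 3 * x + 5 + np.random.randn(dim)
x_expand = np.concatenate([x.reshape((1, dim)), np.ones((1, dim))], axis=0)

opt = Optimization()
for algo, lr in [('GD', 0.0035), ('RMSprop', 2), ('Adam', 5)]:
    opt.reset(lr)
    trace = opt.optimize(algo, grad, x_expand, y)  # visited (w, b) positions
    print(algo, 'steps:', trace.shape[0], 'final (w, b):', trace[-1])
```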
100 | 101 | #### Graph Models 102 | * Naive Bayes `naive_bayes.py` 103 | * Multinomial model 104 | * Document tokenizer 105 | * Bayesian Network `bayesian_net.py` 106 | * Conditional probability MLE 107 | * Bayesian inference 108 | * Hidden Markov Model `hidden_markov_model.py` 109 | * Fitting by Baum-Welch 110 | * Prediction by Viterbi 111 | * Markov Random Field `markov_random_field.py` 112 | * Image Segmentation 113 | * Simulated Annealing for optimal Energy Function 114 | 115 | #### Instance-based Learning and Kernel Machines 116 | * k-Nearest Neighbors (kNN) `k_nearest_neighbor.py` 117 | * Learning Vector Quantization (LVQ) `learning_vector_quantization.py` 118 | * Support Vector Machine (SVM) `support_vector_machine.py` 119 | * Soft boundary 120 | * SMO algorithm 121 | * Different heuristics for selecting pairs in SMO 122 | 123 | #### Swarm Intelligence 124 | * Evolutionary Algorithm (EA) `evolutionary_algorithm.py` 125 | * Training an NN model by a Genetic Algorithm 126 | * Selection by Fitness 127 | * Crossover approaches 128 | * Mutation rate 129 | * Ant Colony Optimization (ACO) `ant_colony.py` 130 | * Traveling Salesman Problem 131 | 132 | #### Heuristic Search 133 | * Monte Carlo tree search `monte_carlo_tree_search.py` 134 | * Upper Confidence Bound 1 applied to trees (UCT) 135 | * Minimax `minimax.py` 136 | * Alpha-Beta Pruning 137 | 138 | #### Reinforcement Learning 139 | * Temporal difference learning `temporal_difference.py` 140 | * Tabular Q Learning 141 | * Deep Q-Network (DQN) `deep_q_network.py` 142 | * CNN Target & Policy Net 143 | * Epsilon-Greedy 144 | 145 | #### Unsupervised Learning 146 | * Clustering (WIP) 147 | * k-Means / DBSCAN / spectral / hierarchical 148 | * Dimension Reduction (WIP) 149 | * Self-Organizing Map (SOM) `self_organizing_map.py` 150 | * Principal Component Analysis / Linear Discriminant Analysis / MDS / t-SNE 151 | 152 | Feel free to use the code.
Please contact me if you have any question: xiecng [at] gmail.com 153 | -------------------------------------------------------------------------------- /monte_carlo_tree_search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from minimax import MiniMax, RandomMove # for testing purpose 3 | # implements Monte Carlo tree search for Tic Tac Toe / Gomoku 4 | 5 | 6 | n_size = 3 7 | n_connect = 3 8 | 9 | 10 | def is_done(board): 11 | for i in range(n_size * n_size): 12 | x, y = i % n_size, i // n_size 13 | x_end = x + n_connect 14 | x_rev_end = x - n_connect 15 | y_end = y + n_connect 16 | if ( # - 17 | x_end <= n_size and abs(board[y, x:x_end].sum()) == n_connect 18 | ) or ( # | 19 | y_end <= n_size and abs(board[y:y_end, x].sum()) == n_connect 20 | ) or ( # \ 21 | x_end <= n_size and y_end <= n_size and abs( 22 | board[range(y, y_end), range(x, x_end)].sum()) == n_connect 23 | ) or ( # / 24 | x_rev_end >= -1 and y_end <= n_size and abs( 25 | board[range(y, y_end), range(x, x_rev_end, -1)].sum()) == n_connect 26 | ): 27 | return board[y, x] 28 | return 0 29 | 30 | 31 | def play(agents): 32 | board = np.zeros(n_size * n_size).astype(int) 33 | record = np.zeros(n_size * n_size).astype(int) 34 | winner = 0 35 | n_moves = 0 36 | 37 | for move in range(n_size * n_size): 38 | n_moves += 1 39 | player = move % 2 * 2 - 1 40 | if isinstance(agents[move % 2], MCTS): 41 | action_pos = agents[move % 2].act(board, record, player) 42 | else: 43 | action_pos = agents[move % 2].act(board, player) 44 | record[action_pos] = n_moves 45 | board[action_pos] = player 46 | winner = is_done(board.reshape((n_size, n_size))) 47 | if abs(winner) == 1: 48 | break 49 | return record.reshape((n_size, n_size)), winner 50 | 51 | 52 | def test(agents): 53 | game_records = [0, 0, 0] 54 | for i in range(100): 55 | idx = [0, 1] # np.random.permutation([0, 1]).astype(int) 56 | board, winner = play([agents[idx[0]], agents[idx[1]]]) 57 | game_records[-int(winner) * (2 * idx[0] - 1) + 1] += 1 58 | return game_records 59 | 60 | 61 | class MCTSNode(object): 62 | 63 | def __init__(self, board): 64 | self.board = board 65 | self.simulations = [0, 0, 0] # lose/draw/win 66 | self.n_visit = 0 67 | self.children = {} 68 | self.score = 0 69 | self.done = np.abs(board).sum() == board.shape[ 70 | 0] or is_done(board.reshape(n_size, n_size)) != 0 71 | 72 | def update(self, result): 73 | self.simulations[result + 1] += 1 # -1/0/1 -> lose/draw/win (0,1,2) 74 | self.n_visit += 1 75 | self.score = (self.simulations[2] + 0.5 * self.simulations[1]) / self.n_visit # 1 for draw 76 | 77 | 78 | class MCTS(object): 79 | 80 | def __init__(self): 81 | init_board = np.zeros(n_size * n_size).astype(int) 82 | init_board_str = ''.join([str(i) for i in init_board]) 83 | self.cache = {init_board_str: MCTSNode(init_board)} 84 | self.rm = RandomMove() 85 | self.n_iteration = 4 * n_size * n_size 86 | 87 | def legal_moves(self, board): 88 | return [i for i in range(n_size * n_size) if board[i] == 0] 89 | 90 | def selection(self, node): 91 | max_uct = -np.inf 92 | next_moves = [] 93 | for move in self.legal_moves(node.board): 94 | score = node.children[move].score if move in node.children else 0 95 | child_visits = node.children[ 96 | move].n_visit if move in node.children else 1e-4 97 | this_uct = score + np.sqrt(2 * np.log(node.n_visit) / child_visits) 98 | if max_uct < this_uct: 99 | next_moves = [move] 100 | max_uct = this_uct 101 | elif max_uct == this_uct: 102 | next_moves.append(move) 103 | return 
np.random.choice(next_moves) 104 | 105 | def simulation(self, board, player): # todo add heuristics 106 | winner = is_done(board.reshape((n_size, n_size))) 107 | while np.abs(winner) == 0 and np.abs(board).sum() < board.shape[0]: 108 | board[self.rm.act(board, player)] = player 109 | winner = is_done(board.reshape((n_size, n_size))) 110 | player = -player 111 | return winner 112 | 113 | def search(self, root_node, index_board): 114 | for _ in range(self.n_iteration): 115 | step = np.max(index_board).astype(int) 116 | record = index_board.copy() 117 | node = root_node 118 | while not node.done and node.n_visit > 0: 119 | # selection 120 | next_move = self.selection(node) 121 | if next_move not in node.children: # expansion 122 | child_board = node.board.copy() 123 | child_board[next_move] = -(node.board.sum() * 2 + 1) 124 | child_board_str = ''.join([str(i) for i in child_board]) 125 | if child_board_str not in self.cache: 126 | self.cache[child_board_str] = MCTSNode(child_board) 127 | node.children[next_move] = self.cache[child_board_str] 128 | node = node.children[next_move] 129 | step += 1 130 | record[next_move] = step 131 | # simulation 132 | result = self.simulation(node.board.copy(), node.board.sum() * 2 + 1) 133 | # backpropagation 134 | while step >= 0: 135 | # only updating one branch might affect uct as the n_visit of parent is no longer cnosistent 136 | board_state = (record > 0) * (1-2*(record%2)) 137 | board_str = ''.join([str(i) for i in board_state]) 138 | this_player = 1-2*(step%2) 139 | record = (record!=step) * record 140 | step -= 1 141 | self.cache[board_str].update(result * this_player) 142 | 143 | def act(self, board, index_board, player): 144 | board_str = ''.join([str(int(i)) for i in board]) 145 | node = self.cache[board_str] 146 | self.search(node, index_board) 147 | v_max = np.amax([c.score for m, c in node.children.items()]) 148 | return np.random.choice([m for m, c in node.children.items() if c.score == v_max]) 149 | 150 | 151 | def main(): 152 | minimax = MiniMax(max_depth=9) 153 | mcts = MCTS() 154 | random = RandomMove() 155 | test([mcts, mcts]) 156 | print('\t\t\t\twin/draw/lose') 157 | print('mcts vs. mcts', test([mcts, mcts])) 158 | print('random vs. mcts', test([random, mcts])) 159 | print('mcts vs. random', test([mcts, random])) 160 | print('minimax vs. mcts', test([minimax, mcts])) 161 | print('mcts vs. 
minimax', test([mcts, minimax])) 162 | 163 | if __name__ == "__main__": 164 | main() 165 | -------------------------------------------------------------------------------- /generative_adversarial_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_openml 3 | from nn_layers import FullyConnect, Activation, Softmax, BatchNormalization, Conv, TrasposedConv 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def noise(n_x, n_d): 8 | return np.random.randn(n_x, n_d) 9 | 10 | 11 | def bce_loss(pred, y): 12 | eps = 1e-20 13 | return -((1 - y) * np.log(1 - pred + eps) + y * np.log(pred + eps)).mean() 14 | 15 | 16 | def bce_grad(pred, y): 17 | eps = 1e-20 18 | return (- y / (pred + eps) + (1 - y) / (1 - pred + eps)) / pred.shape[0] 19 | 20 | 21 | class NN(object): 22 | 23 | def __init__(self, layers): 24 | self.layers = layers 25 | 26 | def predict(self, x): 27 | out = x 28 | for layer in self.layers: 29 | out = layer.predict_forward(out) if isinstance( 30 | layer, BatchNormalization) else layer.forward(out) 31 | return out 32 | 33 | def forward(self, x): 34 | out = x 35 | for layer in self.layers: 36 | out = layer.forward(out) 37 | return out 38 | 39 | def gradient(self, grad_loss): 40 | grad = grad_loss 41 | for layer in self.layers[::-1]: 42 | grad = layer.gradient(grad) 43 | return grad 44 | 45 | def backward(self): 46 | for layer in self.layers: 47 | layer.backward() 48 | 49 | 50 | class GAN(object): 51 | 52 | def __init__(self, conditioned=True): 53 | self.n_epochs, self.batch_size = 1, 64 54 | self.gen_input = 100 55 | self.n_classes = 10 56 | self.conditioned = conditioned 57 | self.dc_gan() 58 | 59 | def dc_gan(self): 60 | gen_lr, dis_lr = 4e-3, 1e-3 61 | dense = FullyConnect( 62 | [self.gen_input + self.n_classes if self.conditioned else self.gen_input], 63 | (128, 7, 7), lr=gen_lr, optimizer='RMSProp' 64 | ) 65 | tconv1 = TrasposedConv(dense.out_shape, k_size=4, 66 | k_num=128, stride=2, padding=1, lr=gen_lr, optimizer='RMSProp') 67 | tconv2 = TrasposedConv(tconv1.out_shape, k_size=4, 68 | k_num=128, stride=2, padding=1, lr=gen_lr, optimizer='RMSProp') 69 | tconv3 = TrasposedConv(tconv2.out_shape, k_size=7, 70 | k_num=1, stride=1, padding=3, lr=gen_lr, optimizer='RMSProp') 71 | self.generator = NN([ 72 | dense, 73 | BatchNormalization(tconv1.in_shape, lr=gen_lr, optimizer='RMSProp'), 74 | Activation(act_type='ReLU'), 75 | tconv1, 76 | BatchNormalization(tconv1.out_shape, lr=gen_lr, optimizer='RMSProp'), 77 | Activation(act_type='ReLU'), 78 | tconv2, 79 | BatchNormalization(tconv2.out_shape, lr=gen_lr, optimizer='RMSProp'), 80 | Activation(act_type='ReLU'), 81 | tconv3, 82 | BatchNormalization(tconv3.out_shape, lr=gen_lr, optimizer='RMSProp'), 83 | Activation(act_type='Tanh') 84 | ]) 85 | conv1 = Conv( 86 | (1 + self.n_classes if self.conditioned else 1, 28, 28), 87 | k_size=7, k_num=128, stride=1, padding=3, lr=dis_lr, optimizer='RMSProp' 88 | ) 89 | conv2 = Conv(conv1.out_shape, k_size=4, k_num=128, 90 | stride=2, padding=1, lr=dis_lr, optimizer='RMSProp') 91 | conv3 = Conv(conv2.out_shape, k_size=4, k_num=128, 92 | stride=2, padding=1, lr=dis_lr, optimizer='RMSProp') 93 | self.discriminator = NN([ 94 | conv1, 95 | Activation(act_type='LeakyReLU'), 96 | conv2, 97 | BatchNormalization(conv2.out_shape, lr=dis_lr, optimizer='RMSProp'), 98 | Activation(act_type='LeakyReLU'), 99 | conv3, 100 | BatchNormalization(conv3.out_shape, lr=dis_lr, optimizer='RMSProp'), 101 | 
Activation(act_type='LeakyReLU'), 102 | FullyConnect(conv3.out_shape, [1], lr=dis_lr, optimizer='RMSProp'), 103 | Activation(act_type='Sigmoid') 104 | ]) 105 | 106 | def fit(self, x, labels): 107 | y_true = np.ones((self.batch_size, 1)) 108 | y_false = np.zeros((self.batch_size, 1)) 109 | y_dis = np.concatenate([y_true, y_false], axis=0) 110 | label_channels = np.repeat(labels, 28*28, axis=1).reshape(labels.shape[0], self.n_classes, 28, 28) 111 | 112 | for epoch in range(self.n_epochs): 113 | permut = np.random.permutation( 114 | x.shape[0] // self.batch_size * self.batch_size).reshape([-1, self.batch_size]) 115 | for b_idx in range(permut.shape[0]): 116 | batch_label_channels = label_channels[permut[b_idx, :]] 117 | if self.conditioned: 118 | x_true = np.concatenate((x[permut[b_idx, :]], batch_label_channels), axis=1) 119 | else: 120 | x_true = x[permut[b_idx, :]] 121 | pred_dis_true = self.discriminator.forward(x_true) 122 | self.discriminator.gradient(bce_grad(pred_dis_true, y_true)) 123 | self.discriminator.backward() 124 | 125 | if self.conditioned: 126 | x_gen = self.generator.forward( 127 | np.concatenate((noise(self.batch_size, self.gen_input), labels[permut[b_idx, :]]), axis=1) 128 | ) 129 | x_gen = np.concatenate((x_gen, batch_label_channels), axis=1) 130 | else: 131 | x_gen = self.generator.forward(noise(self.batch_size, self.gen_input)) 132 | pred_dis_gen = self.discriminator.forward(x_gen) 133 | self.discriminator.gradient(bce_grad(pred_dis_gen, y_false)) 134 | self.discriminator.backward() 135 | 136 | pred_gen = self.discriminator.forward(x_gen) 137 | grad = self.discriminator.gradient(bce_grad(pred_gen, y_true)) 138 | if self.conditioned: 139 | self.generator.gradient(grad[:,:1,:,:]) 140 | else: 141 | self.generator.gradient(grad) 142 | self.generator.backward() 143 | print( 144 | f'Epoch {epoch} batch {b_idx} discriminator:', 145 | bce_loss(np.concatenate((pred_dis_true, pred_dis_gen)), y_dis), 146 | 'generator:', bce_loss(pred_gen, y_true) 147 | ) 148 | 149 | 150 | def main(): 151 | x, y = fetch_openml('mnist_784', return_X_y=True, data_home='data', as_frame=False) 152 | x = 2 * (x / x.max()) - 1 153 | labels = np.zeros((y.shape[0], 10)) 154 | labels[range(y.shape[0]), y.astype(np.int_)] = 1 155 | gan = GAN(conditioned=True) 156 | gan.fit(x.reshape((-1, 1, 28, 28)), labels) 157 | 158 | if gan.conditioned: 159 | onehot = np.zeros((30, 10)) 160 | onehot[range(30), np.arange(30)%10] = 1 161 | images = gan.generator.predict( 162 | np.concatenate((noise(30, gan.gen_input), onehot), axis=1) 163 | ) 164 | else: 165 | images = gan.generator.predict(noise(30, gan.gen_input)) 166 | 167 | for i, img in enumerate(np.array(images).reshape(-1, 784)): 168 | plt.subplot(len(images), 10, i + 1) 169 | plt.imshow(img.reshape(28, 28), cmap='gray', vmin=-1, vmax=1) 170 | plt.show() 171 | 172 | if __name__ == "__main__": 173 | main() 174 | -------------------------------------------------------------------------------- /recurrent_neural_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import requests 3 | import re 4 | # TODO add sentence tokenizer 5 | # implements letter-based sentense generation 6 | # example output from a model trained with Alice's Adventures in Wonderland is like 7 | # "jome to a gorking,' Anntaying that o Junn a tell you like the bitious again, you by youed :unnoby i" 8 | 9 | def sigmoid(x): 10 | return 1 / (1 + np.exp(-x)) 11 | 12 | 13 | def tanh(x): 14 | return np.tanh(x) 15 | 16 | 17 | def 
dsigmoid(grad_a, act): 18 | return np.multiply(grad_a, act - np.square(act)) 19 | 20 | 21 | def dtanh(grad_a, act): 22 | return np.multiply(grad_a, 1 - np.square(act)) 23 | 24 | 25 | def softmax(x): 26 | eps = 1e-20 27 | out = np.exp(x - np.max(x, axis=1).reshape(-1, 1)) 28 | return out / (np.sum(out, axis=1).reshape(-1, 1) + eps) 29 | 30 | 31 | def cross_entropy(pred, y): 32 | return -(np.multiply(y, np.log(pred + 1e-20))).sum() 33 | 34 | 35 | class RNN(object): 36 | 37 | def __init__(self, n_input, n_hidden, n_label, n_t): 38 | self.act_func, self.dact_func = tanh, dtanh 39 | self.loss = cross_entropy 40 | self.n_hidden, self.n_label = n_hidden, n_label 41 | self.lr, self.batch_size, self.epochs = 1, 32, 200 42 | self.eps = 1e-20 43 | self.n_t = n_t 44 | self.u = np.random.randn(n_input, self.n_hidden) / n_input 45 | self.w = np.random.randn(self.n_hidden, self.n_hidden) / self.n_hidden 46 | self.b = np.random.randn(1, self.n_hidden) 47 | self.v = np.random.randn(self.n_hidden, n_label) / self.n_hidden 48 | self.c = np.random.randn(1, self.n_label) 49 | 50 | self.mom_u, self.cache_u = np.zeros_like(self.u), np.zeros_like(self.u) 51 | self.mom_v, self.cache_v = np.zeros_like(self.v), np.zeros_like(self.v) 52 | self.mom_w, self.cache_w = np.zeros_like(self.w), np.zeros_like(self.w) 53 | self.mom_b, self.cache_b = np.zeros_like(self.b), np.zeros_like(self.b) 54 | self.mom_c, self.cache_c = np.zeros_like(self.c), np.zeros_like(self.c) 55 | 56 | def fit(self, x, label): 57 | b_size = self.batch_size 58 | n_t, n_data, n_input = x.shape 59 | y = np.zeros((n_t * n_data, self.n_label)) 60 | y[np.arange(n_t * n_data), label.flatten()] = 1 61 | y = y.reshape((n_t, n_data, self.n_label)) 62 | constant = np.ones((1, self.batch_size * n_t)) 63 | 64 | for epoch in range(self.epochs): 65 | permut = np.random.permutation( 66 | n_data // b_size * b_size).reshape(-1, b_size) 67 | for b_idx in range(permut.shape[0]): 68 | x_batch = x[:, permut[b_idx, :]].reshape(n_t * b_size, n_input) 69 | y_batch = y[:, permut[b_idx, :]].reshape( 70 | n_t * b_size, self.n_label) 71 | h = np.zeros((n_t * b_size, self.n_hidden)) 72 | 73 | for t in range(n_t): 74 | t_idx = np.arange(t * b_size, (t + 1) * b_size) 75 | t_idx_prev = t_idx - b_size if t > 0 else t_idx 76 | h[t_idx] = self.act_func(x_batch[t_idx].dot( 77 | self.u) + h[t_idx_prev].dot(self.w) + self.b) 78 | 79 | grad_pred = softmax(h.dot(self.v) + self.c) - y_batch 80 | 81 | grad_h = grad_pred.dot(self.v.T) 82 | for t in reversed(range(1, n_t)): 83 | t_idx = np.arange(t * b_size, (t + 1) * b_size) 84 | grad_h[ 85 | t_idx - b_size] += self.dact_func(grad_h[t_idx], h[t_idx]).dot(self.w.T) 86 | 87 | grad_o = self.dact_func(grad_h, h) 88 | 89 | grad_w = h[:-b_size].T.dot(grad_o[b_size:]) 90 | grad_u = x_batch.T.dot(grad_o) 91 | grad_b = constant.dot(grad_o) 92 | 93 | grad_v = h.T.dot(grad_pred) 94 | grad_c = constant.dot(grad_pred) 95 | 96 | for grads in [grad_u, grad_w, grad_b, grad_v, grad_c]: 97 | np.clip(grads, -10, 10, out=grads) 98 | 99 | self.adam(grad_u=grad_u, grad_w=grad_w, 100 | grad_b=grad_b, grad_v=grad_v, grad_c=grad_c) 101 | self.regularization() 102 | if hasattr(self, 'ix_to_word'): 103 | print(self.sample(np.random.randint(n_input), 104 | np.random.randn(1, self.n_hidden), n_t * 4)) 105 | print(self.loss(self.predict(x).reshape(n_t * n_data, 106 | self.n_label), y.reshape(n_t * n_data, self.n_label))) 107 | 108 | def sgd(self, grad_u, grad_w, grad_b, grad_v, grad_c): 109 | alpha = self.lr / self.batch_size / self.n_t 110 | for params, grads in 
zip([self.u, self.w, self.b, self.v, self.c], [grad_u, grad_w, grad_b, grad_v, grad_c]): 111 | params -= alpha * grads 112 | 113 | def adam(self, grad_u, grad_w, grad_b, grad_v, grad_c): 114 | beta1 = 0.9 115 | beta2 = 0.999 116 | alpha = self.lr / self.batch_size / self.n_t 117 | for params, grads, mom, cache in zip( 118 | [self.u, self.w, self.b, self.v, self.c], 119 | [grad_u, grad_w, grad_b, grad_v, grad_c], 120 | [self.mom_u, self.mom_w, self.mom_b, self.mom_v, self.mom_c], 121 | [self.cache_u, self.cache_w, self.cache_b, 122 | self.cache_v, self.cache_c] 123 | ): 124 | mom += (beta1 - 1) * mom + (1 - beta1) * grads 125 | cache += (beta2 - 1) * cache + (1 - beta2) * np.square(grads) 126 | params -= alpha * mom / (np.sqrt(cache) + self.eps) 127 | 128 | def regularization(self): 129 | lbd = 1e-4 130 | for params in [self.u, self.w, self.b, self.v, self.c]: 131 | params -= lbd * params 132 | 133 | def predict(self, x): 134 | n_t, n_data, n_input = x.shape 135 | h = np.zeros((n_t * n_data, self.n_hidden)) 136 | for t in range(n_t): 137 | t_idx = np.arange(t * n_data, (t + 1) * n_data) 138 | t_idx_prev = t_idx - n_data if t > 0 else t_idx 139 | h[t_idx] = self.act_func( 140 | x[t].dot(self.u) + h[t_idx_prev].dot(self.w) + self.b) 141 | return softmax(h.dot(self.v) + self.c).reshape(n_t, n_data, self.n_label) 142 | 143 | def sample(self, x_idx, h, seq_length): 144 | n_input = self.u.shape[0] 145 | seq = [x_idx] 146 | for t in range(seq_length): 147 | x = np.zeros((1, n_input)) 148 | x[0, seq[-1]] = 1 149 | h = self.act_func(x.dot(self.u) + h.dot(self.w) + self.b) 150 | y = softmax(h.dot(self.v) + self.c) 151 | seq.append(np.random.choice(range(n_input), p=y.flatten())) 152 | return ''.join(np.vectorize(self.ix_to_word.get)(np.array(seq)).tolist()) 153 | 154 | 155 | def binary_add_test(): 156 | binary_dim = 8 157 | max_num = pow(2, binary_dim) 158 | binary = np.flip(np.unpackbits( 159 | np.array([range(max_num)], dtype=np.uint8).T, axis=1), axis=1) 160 | numbers = np.random.randint(max_num / 2, size=(8192, 2)) 161 | x, y = binary[numbers].transpose(2, 0, 1), binary[ 162 | numbers.sum(axis=1)].transpose() 163 | 164 | test_ratio = 0.2 165 | test_split = np.random.uniform(0, 1, numbers.shape[0]) 166 | train_x, test_x = x[:, test_split >= test_ratio, :], x[ 167 | :, test_split < test_ratio, :] 168 | train_y, test_y = y[:, test_split >= test_ratio], y[ 169 | :, test_split < test_ratio] 170 | 171 | rnn = RNN(2, 3, 2, binary_dim) 172 | rnn.fit(train_x, train_y) 173 | # rnn.gradient_check(train_x[:,np.arange(32),:], train_y[:,np.arange(32)]) 174 | print('train loss', (np.argmax(rnn.predict(train_x), axis=2) == 175 | train_y).sum() / (train_y.shape[0] * train_y.shape[1])) 176 | print('test loss', (np.argmax(rnn.predict(test_x), axis=2) 177 | == test_y).sum() / (test_y.shape[0] * test_y.shape[1])) 178 | 179 | 180 | def text_generation(use_word=True): 181 | text = requests.get('http://www.gutenberg.org/cache/epub/11/pg11.txt').text 182 | if use_word: 183 | text = [ 184 | word + ' ' for word in re.sub("[^a-zA-Z]", " ", text).lower().split()] 185 | 186 | words = sorted(list(set(text))) 187 | text_size, vocab_size = len(text), len(words) 188 | 189 | print(f'text has {text_size} characters, {vocab_size} unique.') 190 | word_to_ix = {word: i for i, word in enumerate(words)} 191 | ix_to_word = {i: word for i, word in enumerate(words)} 192 | 193 | seq_length = 25 194 | indices = np.vectorize(word_to_ix.get)(np.array(list(text))) 195 | data = np.zeros((text_size, vocab_size)) 196 | data[np.arange(text_size), 
indices] = 1 197 | n_text = (text_size - 1) // seq_length 198 | x = data[ 199 | :n_text * seq_length].reshape(n_text, seq_length, vocab_size).transpose(1, 0, 2) 200 | y = indices[1: n_text * seq_length + 1].reshape(n_text, seq_length).T 201 | 202 | test_ratio = 0.2 203 | test_split = np.random.uniform(0, 1, x.shape[1]) 204 | train_x, test_x = x[:, test_split >= test_ratio, :], x[ 205 | :, test_split < test_ratio, :] 206 | train_y, test_y = y[:, test_split >= test_ratio], y[ 207 | :, test_split < test_ratio] 208 | 209 | rnn = RNN(vocab_size, 500, vocab_size, seq_length) 210 | rnn.ix_to_word = ix_to_word 211 | # rnn.gradient_check(train_x[:,np.arange(32),:], train_y[:,np.arange(32)]) 212 | rnn.fit(train_x, train_y) 213 | print('train loss', (np.argmax(rnn.predict(train_x), axis=2) == 214 | train_y).sum() / (train_y.shape[0] * train_y.shape[1])) 215 | print('test loss', (np.argmax(rnn.predict(test_x), axis=2) 216 | == test_y).sum() / (test_y.shape[0] * test_y.shape[1])) 217 | 218 | 219 | def main(): 220 | text_generation(use_word=False) 221 | # binary_add_test() 222 | 223 | 224 | if __name__ == "__main__": 225 | main() 226 | -------------------------------------------------------------------------------- /multilayer_perceptron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import fetch_openml 3 | ''' 4 | This is a simple implementation of a multilayer perceptron with backpropagation training 5 | Implemented features: 6 | Activation function: relu/sigmoid/hyperbolic tangent 7 | Regularization: optional L1/L2 to prevent overfitting 8 | Optimization: SGD/Momentum/AdaGrad/RMSprop/Nesterov/Adam 9 | Architecture: layer configurations passed via the layers argument 10 | Hyperparameters: set learning rate, batch size and epochs before start 11 | ''' 12 | 13 | 14 | def relu(x): 15 | return np.maximum(x, 0) 16 | 17 | 18 | def sigmoid(x): 19 | return 1 / (1 + np.exp(-x)) 20 | 21 | 22 | def tanh(x): 23 | return np.tanh(x) 24 | 25 | 26 | def drelu(grad_a, act): 27 | grad_a[act <= 0] = 0 28 | return grad_a 29 | 30 | 31 | def dsigmoid(grad_a, act): 32 | return np.multiply(grad_a, act - np.square(act)) 33 | 34 | 35 | def dtanh(grad_a, act): 36 | return np.multiply(grad_a, 1 - np.square(act)) 37 | 38 | 39 | def softmax(x): 40 | eps = 1e-8 41 | out = np.exp(x - np.max(x, axis=1).reshape(-1, 1)) 42 | return out / (np.sum(out, axis=1).reshape(-1, 1) + eps) 43 | 44 | 45 | def linear(x): 46 | return x 47 | 48 | 49 | def cross_entropy(pred, y): 50 | return -(np.multiply(y, np.log(pred + 1e-4))).mean() 51 | 52 | 53 | def squared_error(pred, y): 54 | return np.square(pred - y).mean() / 2 55 | 56 | 57 | class MLP(object): 58 | 59 | def __init__(self, act_type, opt_type, layers, epochs=20, regression=False, learning_rate=0.01, lmbda=1e-2): 60 | act_funcs = {'ReLU': relu, 'Sigmoid': sigmoid, 'Tanh': tanh} 61 | dacts = {'ReLU': drelu, 'Sigmoid': dsigmoid, 'Tanh': dtanh} 62 | optimizers = {'SGD': self.sgd, 'Momentum': self.momentum, 'Nesterov': self.nesterov, 63 | 'AdaGrad': self.adagrad, 'RMSprop': self.rmsprop, 'Adam': self.adam} 64 | 65 | self.reg = 2 # 0=none, 1=L1, 2=L2 regularization 66 | self.lmbda = lmbda # regularization coefficient 67 | self.gamma = 0.9 68 | self.eps = 1e-8 69 | self.epochs, self.batch_size = epochs, 32 70 | self.learning_rate = learning_rate 71 | self.layer_num = len(layers) - 1 72 | self.n_labels = layers[-1] 73 | self.regression = regression 74 | self.output = linear if self.regression else softmax 75 | self.loss =
squared_error if self.regression else cross_entropy 76 | 77 | self.afunc = act_funcs[act_type] 78 | self.dact = dacts[act_type] 79 | self.optimize = optimizers[opt_type] 80 | 81 | # Randomly initialize weights 82 | self.w, self.b = [np.empty] * \ 83 | self.layer_num, [np.empty] * self.layer_num 84 | self.mom_w, self.cache_w = [np.empty] * \ 85 | self.layer_num, [np.empty] * self.layer_num 86 | self.mom_b, self.cache_b = [np.empty] * \ 87 | self.layer_num, [np.empty] * self.layer_num 88 | 89 | for i in range(self.layer_num): 90 | self.w[i] = np.random.randn(layers[i], layers[i + 1]) 91 | self.b[i] = np.random.randn(1, layers[i + 1]) 92 | self.mom_w[i] = np.zeros_like(self.w[i]) 93 | self.cache_w[i] = np.zeros_like(self.w[i]) 94 | self.mom_b[i] = np.zeros_like(self.b[i]) 95 | self.cache_b[i] = np.zeros_like(self.b[i]) 96 | 97 | def sgd(self, grad_w, grad_b): 98 | alpha = self.learning_rate / self.batch_size 99 | for i in range(self.layer_num): 100 | self.w[i] -= alpha * grad_w[i] 101 | self.b[i] -= alpha * grad_b[i] 102 | 103 | def momentum(self, grad_w, grad_b): 104 | alpha = self.learning_rate / self.batch_size 105 | for i in range(self.layer_num): 106 | self.mom_w[i] = self.gamma * self.mom_w[i] + alpha * grad_w[i] 107 | self.w[i] -= self.mom_w[i] 108 | self.mom_b[i] = self.gamma * self.mom_b[i] + alpha * grad_b[i] 109 | self.b[i] -= self.mom_b[i] 110 | 111 | def nesterov(self, grad_w, grad_b): 112 | alpha = self.learning_rate / self.batch_size 113 | for i in range(self.layer_num): 114 | mom_v_prev = self.mom_w[i] 115 | self.mom_w[i] = self.gamma * self.mom_w[i] + alpha * grad_w[i] 116 | self.w[i] -= ((1 + self.gamma) * self.mom_w[i] - 117 | self.gamma * mom_v_prev) 118 | mom_b_prev = self.mom_b[i] 119 | self.mom_b[i] = self.gamma * self.mom_b[i] + alpha * grad_b[i] 120 | self.b[i] -= ((1 + self.gamma) * self.mom_b[i] - 121 | self.gamma * mom_b_prev) 122 | 123 | def adagrad(self, grad_w, grad_b): 124 | alpha = self.learning_rate / self.batch_size 125 | for i in range(self.layer_num): 126 | self.cache_w[i] += np.square(grad_w[i]) 127 | self.w[i] -= alpha * grad_w[i] / \ 128 | (np.sqrt(self.cache_w[i]) + self.eps) 129 | self.cache_b[i] += np.square(grad_b[i]) 130 | self.b[i] -= alpha * grad_b[i] / \ 131 | (np.sqrt(self.cache_b[i]) + self.eps) 132 | 133 | def rmsprop(self, grad_w, grad_b): 134 | alpha = self.learning_rate / self.batch_size 135 | for i in range(self.layer_num): 136 | self.cache_w[i] = self.gamma * self.cache_w[i] + \ 137 | (1 - self.gamma) * np.square(grad_w[i]) 138 | self.w[i] -= alpha * grad_w[i] / \ 139 | (np.sqrt(self.cache_w[i]) + self.eps) 140 | self.cache_b[i] = self.gamma * self.cache_b[i] + \ 141 | (1 - self.gamma) * np.square(grad_b[i]) 142 | self.b[i] -= alpha * grad_b[i] / \ 143 | (np.sqrt(self.cache_b[i]) + self.eps) 144 | 145 | def adam(self, grad_w, grad_b): 146 | beta1 = 0.9 147 | beta2 = 0.999 148 | alpha = self.learning_rate / self.batch_size 149 | for i in range(self.layer_num): 150 | self.mom_w[i] = beta1 * self.mom_w[i] + (1 - beta1) * grad_w[i] 151 | self.cache_w[i] = beta2 * self.cache_w[i] + \ 152 | (1 - beta2) * np.square(grad_w[i]) 153 | self.w[i] -= alpha * self.mom_w[i] / \ 154 | (np.sqrt(self.cache_w[i]) + self.eps) 155 | self.mom_b[i] = beta1 * self.mom_b[i] + (1 - beta1) * grad_b[i] 156 | self.cache_b[i] = beta2 * self.cache_b[i] + \ 157 | (1 - beta2) * np.square(grad_b[i]) 158 | self.b[i] -= alpha * self.mom_b[i] / \ 159 | (np.sqrt(self.cache_b[i]) + self.eps) 160 | 161 | def regularization(self): 162 | if(self.reg == 0): 163 | return 164 | 
alpha = self.learning_rate * self.lmbda 165 | for i in range(self.layer_num): 166 | if(self.reg == 1): 167 | self.w[i] -= alpha * np.sign(self.w[i]) 168 | elif(self.reg == 2): 169 | self.w[i] -= alpha * self.w[i] 170 | 171 | def predict(self, x): 172 | act = x 173 | for i in range(self.layer_num - 1): 174 | act = self.afunc(act.dot(self.w[i]) + self.b[i]) 175 | return self.output(act.dot(self.w[self.layer_num - 1]) + self.b[self.layer_num - 1]) 176 | 177 | def fit(self, x, labels): 178 | train_num = x.shape[0] 179 | l_num = self.layer_num 180 | bvec = np.ones((1, self.batch_size)) 181 | 182 | if self.regression: 183 | y = labels 184 | else: 185 | y = np.zeros((train_num, self.n_labels)) 186 | y[np.arange(train_num), labels] = 1 187 | 188 | for epoch in range(self.epochs): 189 | # mini batch 190 | permut = np.random.permutation( 191 | train_num // self.batch_size * self.batch_size).reshape(-1, self.batch_size) 192 | for b_idx in range(permut.shape[0]): 193 | # Forward pass: compute predicted out 194 | act = [np.empty] * (l_num + 1) 195 | act[0] = x[permut[b_idx, :]] 196 | for i in range(1, l_num): 197 | act[i] = self.afunc( 198 | act[i - 1].dot(self.w[i - 1]) + self.b[i - 1]) 199 | act[l_num] = self.output( 200 | act[l_num - 1].dot(self.w[l_num - 1]) + self.b[l_num - 1]) 201 | 202 | # Backprop to compute gradients of weights & activaions 203 | grad_a, grad_w, grad_b = [ 204 | np.empty] * (l_num + 1), [np.empty] * l_num, [np.empty] * l_num 205 | grad_a[l_num] = act[l_num] - y[permut[b_idx, :]] 206 | grad_w[l_num - 1] = act[l_num - 1].T.dot(grad_a[l_num]) 207 | grad_b[l_num - 1] = bvec.dot(grad_a[l_num]) 208 | 209 | for i in reversed(range(1, l_num)): 210 | grad_a[i] = grad_a[i + 1].dot(self.w[i].T) 211 | grad_a[i] = self.dact(grad_a[i], act[i]) 212 | grad_w[i - 1] = act[i - 1].T.dot(grad_a[i]) 213 | grad_b[i - 1] = bvec.dot(grad_a[i]) 214 | 215 | # Update weights 216 | self.regularization() 217 | self.optimize(grad_w, grad_b) 218 | print('epoch {}, loss: {}'.format( 219 | epoch, self.loss(self.predict(x), y))) 220 | 221 | 222 | def main(): 223 | x, y = fetch_openml('mnist_784', return_X_y=True, data_home="data", as_frame=False) 224 | test_ratio = 0.2 225 | test_split = np.random.uniform(0, 1, x.shape[0]) 226 | train_x, test_x = x[test_split >= test_ratio] / \ 227 | x.max(), x[test_split < test_ratio] / x.max() 228 | train_y, test_y = y.astype(np.int_)[test_split >= test_ratio], y.astype( 229 | np.int_)[test_split < test_ratio] 230 | 231 | mlp = MLP('ReLU', 'Adam', layers=[x.shape[1], 100, 100, len(np.unique(y))]) 232 | mlp.fit(train_x, train_y) 233 | print(sum(np.argmax(mlp.predict(train_x), axis=1) 234 | == train_y) / train_y.shape[0]) 235 | print(sum(np.argmax(mlp.predict(test_x), axis=1) 236 | == test_y) / test_y.shape[0]) 237 | 238 | 239 | if __name__ == "__main__": 240 | main() 241 | -------------------------------------------------------------------------------- /deep_q_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nn_layers import FullyConnect, Activation, Conv 3 | from minimax import MiniMax, RandomMove 4 | # Double deep q learning (DQN) for Tic Tac Toe / Gomoku 5 | 6 | 7 | n_size = 3 8 | n_connect = 3 9 | 10 | 11 | def is_done(board): 12 | for i in range(n_size * n_size): 13 | x, y = i % n_size, i // n_size 14 | x_end = x + n_connect 15 | x_rev_end = x - n_connect 16 | y_end = y + n_connect 17 | if ( # - 18 | x_end <= n_size and abs(board[y, x:x_end].sum()) == n_connect 19 | ) or ( # | 20 | y_end <= 
n_size and abs(board[y:y_end, x].sum()) == n_connect 21 | ) or ( # \ 22 | x_end <= n_size and y_end <= n_size and abs( 23 | board[range(y, y_end), range(x, x_end)].sum()) == n_connect 24 | ) or ( # / 25 | x_rev_end >= -1 and y_end <= n_size and abs( 26 | board[range(y, y_end), range(x, x_rev_end, -1)].sum()) == n_connect 27 | ): 28 | return board[y, x] 29 | return 0 30 | 31 | 32 | def transform_action(action): # generating more board by flipping and rotating 33 | y = action // n_size 34 | x = action % n_size 35 | pos = [ 36 | (y, x), (x, n_size - 1 - y), (n_size - 1 - 37 | y, n_size - 1 - x), (n_size - 1 - x, y), 38 | (y, n_size - 1 - x), (n_size - 1 - x, 39 | n_size - 1 - y), (n_size - 1 - y, x), (x, y) 40 | ] 41 | return np.array([y * n_size + x for y, x in pos]) 42 | 43 | 44 | class NN(object): 45 | 46 | def __init__(self, layers): 47 | self.layers = layers 48 | 49 | def forward(self, x): 50 | out = x 51 | for layer in self.layers: 52 | out = layer.forward(out) 53 | return out 54 | 55 | def gradient(self, grad_loss): 56 | grad = grad_loss 57 | for layer in self.layers[::-1]: 58 | grad = layer.gradient(grad) 59 | return grad 60 | 61 | def backward(self): 62 | for layer in self.layers: 63 | layer.backward() 64 | 65 | def copy_weights(self, nn): 66 | for layer1, layer2 in zip(self.layers, nn.layers): 67 | if isinstance(layer1, FullyConnect) or isinstance(layer1, Conv): 68 | layer1.w = layer2.w.copy() 69 | layer1.b = layer2.b.copy() 70 | 71 | 72 | class DQN(object): 73 | 74 | def __init__(self, eps=1): 75 | self.n_episodes = 300 76 | self.batch_size = 32 77 | self.n_epochs = 200 78 | self.training_size = self.n_epochs * self.batch_size 79 | self.gamma = 0.99 80 | self.eps = eps 81 | self.eps_decay = 0.99 82 | lr = 0.002 83 | self.policy_net, self.target_net = [NN([ 84 | Conv((3, n_size, n_size), k_size=n_connect, 85 | k_num=16, optimizer='RMSProp'), 86 | Activation(act_type='LeakyReLU'), 87 | FullyConnect([16, n_size - n_connect + 1, n_size - n_connect + 1], [16], 88 | lr=lr, optimizer='RMSProp'), 89 | Activation(act_type='LeakyReLU'), 90 | FullyConnect([16], [16], lr=lr, optimizer='RMSProp'), 91 | Activation(act_type='LeakyReLU'), 92 | FullyConnect([16], [16], lr=lr, optimizer='RMSProp'), 93 | Activation(act_type='LeakyReLU'), 94 | FullyConnect([16], [16], lr=lr, optimizer='RMSProp'), 95 | Activation(act_type='LeakyReLU'), 96 | FullyConnect([16], [n_size * n_size], lr=lr, optimizer='RMSProp'), 97 | # Activation(act_type='Tanh'), 98 | ]) for _ in range(2)] 99 | self.states = np.zeros((0, 3, n_size, n_size)) 100 | self.next_states = np.zeros((0, 3, n_size, n_size)) 101 | self.actions = np.zeros(0).astype(int) 102 | self.rewards = np.zeros(0) 103 | self.unfinish_mask = np.zeros(0) 104 | self.weights = np.zeros(0) 105 | 106 | def replay(self): 107 | permut = np.random.permutation( 108 | self.n_epochs * self.batch_size).reshape([self.n_epochs, self.batch_size]) 109 | loss = 0 110 | for batch_idx in permut: 111 | action_pos = self.actions[batch_idx] 112 | 113 | this_q = np.zeros((self.batch_size, n_size * n_size)) 114 | this_q[range(self.batch_size), action_pos] = self.policy_net.forward( 115 | self.states[batch_idx])[range(self.batch_size), action_pos] 116 | 117 | targets = np.zeros((self.batch_size, n_size * n_size)) 118 | next_q = np.amax(self.target_net.forward( 119 | self.next_states[batch_idx]), axis=1) 120 | targets[range(self.batch_size), action_pos] = self.rewards[ 121 | batch_idx] + self.unfinish_mask[batch_idx] * self.gamma * next_q 122 | 123 | grad = (this_q - targets) * 
self.weights[batch_idx].reshape(-1, 1) 124 | loss += np.square(grad).mean() 125 | self.policy_net.gradient(grad) 126 | self.policy_net.backward() 127 | print('loss', loss / self.n_epochs) 128 | 129 | def act(self, board, player): 130 | state = np.array([[(board == player).reshape( 131 | n_size, n_size), (board == -player).reshape(n_size, n_size), (board == 0).reshape(n_size, n_size)]]) 132 | return self.eps_greedy(state) 133 | 134 | def eps_greedy(self, state): 135 | valid_mask = 1 - state[0, 0, :, :].flatten() - \ 136 | state[0, 1, :, :].flatten() 137 | preds = self.policy_net.forward(state)[0] 138 | max_idx = np.argmax(preds * valid_mask - 139 | (1 - valid_mask) * np.finfo(float).max) 140 | m = sum(valid_mask) 141 | p = self.eps / m * valid_mask 142 | p[max_idx] = 1 - self.eps + self.eps / m 143 | return np.random.choice(n_size * n_size, p=p) 144 | 145 | def fit(self): 146 | random = RandomMove() 147 | minimax = MiniMax(max_depth=9) 148 | agents = [minimax, self] 149 | while self.states.shape[0] < self.training_size: 150 | # np.random.shuffle(agents) 151 | play(agents, self) 152 | for iteration in range(self.n_episodes): 153 | self.eps *= self.eps_decay 154 | # np.random.shuffle(agents) 155 | play(agents, self) 156 | print('iteration:', iteration, 'eps:', self.eps) 157 | for i in range(10): 158 | self.replay() 159 | if iteration % 10 == 0: 160 | self.target_net.copy_weights(self.policy_net) 161 | temp_eps = self.eps 162 | self.eps = 0 163 | print('\t\t\t\twin/draw/lose') 164 | print('minimax vs. dqn', test([minimax, self])) 165 | print('dqn vs. minimax', test([self, minimax])) 166 | print('random vs. dqn', test([random, self])) 167 | print('dqn vs. random', test([self, random])) 168 | self.eps = temp_eps 169 | 170 | def save_play(self, saved_actions, saved_states, winner, n_moves, saved_weights): 171 | self.actions = np.append(self.actions, np.array( 172 | saved_actions))[-self.training_size:] 173 | self.states = np.append(self.states, np.array( 174 | saved_states), axis=0)[-self.training_size:] 175 | self.next_states = np.append( 176 | self.next_states, np.array(saved_states[16:]), axis=0) 177 | self.next_states = np.append(self.next_states, np.zeros( 178 | (16, 3, n_size, n_size)), axis=0)[-self.training_size:] 179 | this_mask, this_rewards = np.ones(n_moves), np.zeros(n_moves) 180 | this_mask[[-2, -1]] = np.array([0, 0]) 181 | this_rewards[[-2, -1]] = np.array([-1 * abs(winner) + ( 182 | 1 - abs(winner)) * 1, 1 * abs(winner) + (1 - abs(winner)) * 1]) 183 | self.unfinish_mask = np.append( 184 | self.unfinish_mask, np.repeat(this_mask, 8))[-self.training_size:] 185 | self.rewards = np.append(self.rewards, np.repeat( 186 | this_rewards, 8))[-self.training_size:] 187 | self.weights = np.append(self.weights, np.array( 188 | saved_weights))[-self.training_size:] 189 | 190 | 191 | def play(agents, cache=None): 192 | boards = np.zeros((8, n_size * n_size)).astype(int) 193 | record = np.zeros(n_size * n_size) 194 | winner = 0 195 | n_moves = 0 196 | saved_actions = [] 197 | saved_states = [] 198 | saved_weights = [] 199 | for move in range(n_size * n_size): 200 | n_moves += 1 201 | player = move % 2 * 2 - 1 202 | action_pos = agents[move % 2].act(boards[0], player) 203 | record[action_pos] = n_moves 204 | action_list = transform_action(action_pos) 205 | for action, current_board in zip(action_list, boards): 206 | saved_actions.append(action) 207 | saved_states.append([ 208 | (current_board == player).reshape(n_size, n_size), 209 | (current_board == -player).reshape(n_size, n_size), 210 | 
(current_board == 0).reshape(n_size, n_size) 211 | ]) 212 | # only do the Q-learning update for the DQN's move 213 | saved_weights.append(1 if isinstance(agents[move % 2], DQN) else 0) 214 | boards[range(8), action_list] = player 215 | winner = is_done(boards[0].reshape((n_size, n_size))) 216 | if abs(winner) == 1: 217 | break 218 | if cache is not None: 219 | cache.save_play(saved_actions, saved_states, 220 | winner, n_moves, saved_weights) 221 | return record.reshape((n_size, n_size)), winner 222 | 223 | 224 | def test(agents): 225 | game_records = [0, 0, 0] 226 | for i in range(100): 227 | idx = [0, 1] # np.random.permutation([0, 1]).astype(int) 228 | board, winner = play([agents[idx[0]], agents[idx[1]]]) 229 | game_records[-int(winner) * (2 * idx[0] - 1) + 1] += 1 230 | return game_records 231 | 232 | 233 | def main(): 234 | dqn = DQN() 235 | dqn.fit() 236 | 237 | if __name__ == "__main__": 238 | main() 239 | -------------------------------------------------------------------------------- /long_short_term_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import requests 3 | import re 4 | 5 | # generates sentences based on letters; the model is trained with Alice's Adventures in Wonderland 6 | # example output "'Oh, I BEL yourt!' Saic 'Alice thing seemst,' 7 | # Alice reminused all cranged at the end of everying and bring rause 8 | 9 | def sigmoid(x): 10 | return 1 / (1 + np.exp(-x)) 11 | 12 | 13 | def tanh(x): 14 | return np.tanh(x) 15 | 16 | 17 | def dsigmoid(grad_a, act): 18 | return grad_a * (act - np.square(act)) 19 | 20 | 21 | def dtanh(grad_a, act): 22 | return grad_a * (1 - np.square(act)) 23 | 24 | 25 | def softmax(x): 26 | eps = 1e-20 27 | out = np.exp(x - np.max(x, axis=1).reshape(-1, 1)) 28 | return out / (np.sum(out, axis=1).reshape(-1, 1) + eps) 29 | 30 | 31 | def cross_entropy(pred, y): 32 | return -(np.multiply(y, np.log(pred + 1e-20))).sum() 33 | 34 | 35 | class LSTM(object): 36 | 37 | def __init__(self, n_input, n_hidden, n_label, n_t): 38 | self.loss = cross_entropy 39 | self.n_hidden, self.n_label = n_hidden, n_label 40 | self.lr, self.batch_size, self.epochs = 1, 32, 200 41 | self.eps = 1e-20 42 | self.n_t = n_t 43 | 44 | self.w_f, self.w_i, self.w_c, self.w_o = [np.random.randn( 45 | n_input, self.n_hidden) / n_input for _ in range(4)] 46 | self.u_f, self.u_i, self.u_c, self.u_o = [np.random.randn( 47 | self.n_hidden, self.n_hidden) / self.n_hidden for _ in range(4)] 48 | self.b_f, self.b_i, self.b_c, self.b_o = [ 49 | np.random.randn(1, self.n_hidden) for _ in range(4)] 50 | self.u_v, self.b_v = np.random.randn( 51 | self.n_hidden, self.n_label) / self.n_hidden, np.random.randn(1, self.n_label) 52 | 53 | self.param_list = [ 54 | self.w_f, self.w_i, self.w_c, self.w_o, 55 | self.u_f, self.u_i, self.u_c, self.u_o, self.u_v, 56 | self.b_f, self.b_i, self.b_c, self.b_o, self.b_v 57 | ] 58 | self.mom_list = [np.zeros_like(param) for param in self.param_list] 59 | self.cache_list = [np.zeros_like(param) for param in self.param_list] 60 | 61 | def fit(self, x, label): 62 | b_size = self.batch_size 63 | n_t, n_data, n_input = x.shape 64 | y = np.zeros((n_t * n_data, self.n_label)) 65 | y[np.arange(n_t * n_data), label.flatten()] = 1 66 | y = y.reshape((n_t, n_data, self.n_label)) 67 | constant = np.ones((1, self.batch_size * n_t)) 68 | 69 | for epoch in range(self.epochs): 70 | permut = np.random.permutation( 71 | n_data // b_size * b_size).reshape(-1, b_size) 72 | for b_idx in range(permut.shape[0]): 73 | x_batch
= x[:, permut[b_idx, :]].reshape(n_t * b_size, n_input) 74 | y_batch = y[:, permut[b_idx, :]].reshape( 75 | n_t * b_size, self.n_label) 76 | h, f, i, c, o, c_bar, grad_f, grad_i, grad_o, grad_c, grad_c_bar = [ 77 | np.zeros((n_t * b_size, self.n_hidden)) for _ in range(11) 78 | ] 79 | 80 | # forward pass 81 | for t in range(n_t): 82 | t_idx = np.arange(t * b_size, (t + 1) * b_size) 83 | t_idx_prev = t_idx - b_size if t > 0 else t_idx 84 | 85 | xt_batch, ht_prev = x_batch[t_idx], h[t_idx_prev] 86 | 87 | f[t_idx] = sigmoid(xt_batch @ self.w_f + ht_prev @ self.u_f + self.b_f) 88 | i[t_idx] = sigmoid(xt_batch @ self.w_i + ht_prev @ self.u_i + self.b_i) 89 | o[t_idx] = sigmoid(xt_batch @ self.w_o + ht_prev @ self.u_o + self.b_o) 90 | c_bar[t_idx] = tanh(xt_batch @ self.w_c + ht_prev @ self.u_c + self.b_c) 91 | c[t_idx] = f[t_idx] * c[t_idx_prev] + \ 92 | i[t_idx] * c_bar[t_idx] 93 | h[t_idx] = o[t_idx] * tanh(c[t_idx]) 94 | 95 | c_prev = np.zeros(c.shape) 96 | c_prev[b_size:, :] = c[:-b_size, :] 97 | h_prev = np.zeros(h.shape) 98 | h_prev[b_size:, :] = h[:-b_size, :] 99 | 100 | # back propagation through time 101 | grad_v = softmax(h @ self.u_v + self.b_v) - y_batch 102 | grad_h = grad_v @ self.u_v.T 103 | 104 | for t in reversed(range(0, n_t)): 105 | t_idx = np.arange(t * b_size, (t + 1) * b_size) 106 | t_idx_next = t_idx + b_size if t < n_t - 1 else t_idx 107 | grad_h[t_idx] += ( 108 | dsigmoid(grad_f[t_idx_next], f[t_idx_next]) @ self.u_f.T + 109 | dsigmoid(grad_i[t_idx_next], i[t_idx_next]) @ self.u_i.T + 110 | dsigmoid(grad_o[t_idx_next], o[t_idx_next]) @ self.u_o.T + 111 | dtanh(grad_c_bar[t_idx_next], c_bar[t_idx_next]) @ self.u_c.T 112 | ) 113 | grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * \ 114 | (1 - np.square(np.tanh(c[t_idx]))) + \ 115 | f[t_idx_next] * grad_c[t_idx_next] 116 | grad_f[t_idx] = grad_c[t_idx] * c_prev[t_idx] 117 | grad_i[t_idx] = grad_c[t_idx] * c_bar[t_idx] 118 | grad_o[t_idx] = grad_h[t_idx] * tanh(c[t_idx]) 119 | grad_c_bar[t_idx] = grad_c[t_idx] * i[t_idx] 120 | 121 | self.adam( 122 | grad_list=[ 123 | x_batch.T @ dsigmoid(grad_f, f), x_batch.T @ dsigmoid(grad_i, i), x_batch.T @ dtanh(grad_c_bar, c_bar), x_batch.T @ dsigmoid(grad_o, o), 124 | h_prev.T @ dsigmoid(grad_f, f), h_prev.T @ dsigmoid(grad_i, i), h_prev.T @ dtanh(grad_c_bar, c_bar), h_prev.T @ dsigmoid(grad_o, o), h.T @ grad_v, 125 | constant @ dsigmoid(grad_f, f), constant @ dsigmoid(grad_i, i), constant @ dtanh(grad_c_bar, c_bar), constant @ dsigmoid(grad_o, o), constant @ grad_v 126 | ] 127 | ) 128 | self.regularization() 129 | print(self.sample(np.random.randint(n_input), np.random.randn( 130 | 1, self.n_hidden), np.random.randn(1, self.n_hidden), n_t * 4)) 131 | print(self.loss(self.predict(x).reshape(n_t * n_data, 132 | self.n_label), y.reshape(n_t * n_data, self.n_label))) 133 | 134 | def sgd(self, grad_list): 135 | alpha = self.lr / self.batch_size / self.n_t 136 | for params, grads in zip(self.param_list, grad_list): 137 | params -= alpha * grads 138 | 139 | def adam(self, grad_list): 140 | beta1 = 0.9 141 | beta2 = 0.999 142 | alpha = self.lr / self.batch_size / self.n_t 143 | for params, grads, mom, cache in zip( 144 | self.param_list, grad_list, self.mom_list, self.cache_list 145 | ): 146 | mom += (beta1 - 1) * mom + (1 - beta1) * grads 147 | cache += (beta2 - 1) * cache + (1 - beta2) * np.square(grads) 148 | params -= alpha * mom / (np.sqrt(cache) + self.eps) 149 | 150 | def regularization(self): 151 | lbd = 1e-5 152 | for params in self.param_list: 153 | params -= lbd * params 154 | 155 | def 
predict(self, x): 156 | n_t, n_data, n_input = x.shape 157 | h, f, i, c, o = [np.zeros((n_t * n_data, self.n_hidden)) 158 | for _ in range(5)] 159 | # forward pass 160 | for t in range(n_t): 161 | t_idx = np.arange(t * n_data, (t + 1) * n_data) 162 | t_idx_prev = t_idx - n_data if t > 0 else t_idx 163 | f[t_idx] = sigmoid(x[t] @ self.w_f + h[t_idx_prev] @ self.u_f + self.b_f) 164 | i[t_idx] = sigmoid(x[t] @ self.w_i + h[t_idx_prev] @ self.u_i + self.b_i) 165 | o[t_idx] = sigmoid(x[t] @ self.w_o + h[t_idx_prev] @ self.u_o + self.b_o) 166 | c[t_idx] = f[t_idx] * c[t_idx_prev] + i[t_idx] * tanh(x[t] @ self.w_c + h[t_idx_prev] @ self.u_c + self.b_c) 167 | h[t_idx] = o[t_idx] * tanh(c[t_idx]) 168 | return softmax(h @ self.u_v + self.b_v).reshape(n_t, n_data, self.n_label) 169 | 170 | def sample(self, x_idx, h, c, seq_length): 171 | n_input = self.w_f.shape[0] 172 | seq = [x_idx] 173 | for t in range(seq_length): 174 | x = np.zeros((1, n_input)) 175 | x[0, seq[-1]] = 1 176 | 177 | f = sigmoid(x @ self.w_f + h @ self.u_f + self.b_f) 178 | i = sigmoid(x @ self.w_i + h @ self.u_i + self.b_i) 179 | o = sigmoid(x @ self.w_o + h @ self.u_o + self.b_o) 180 | c = f * c + i * tanh(x @ self.w_c + h @ self.u_c + self.b_c) 181 | h = o * tanh(c) 182 | y = softmax(h @ self.u_v + self.b_v) 183 | seq.append(np.random.choice(range(n_input), p=y.flatten())) 184 | return ''.join(np.vectorize(self.ix_to_word.get)(np.array(seq)).tolist()) 185 | 186 | 187 | def text_generation(use_word=True): 188 | text = requests.get('http://www.gutenberg.org/cache/epub/11/pg11.txt').text 189 | if use_word: 190 | text = [ 191 | word + ' ' for word in re.sub("[^a-zA-Z]", " ", text).lower().split()] 192 | 193 | words = sorted(list(set(text))) 194 | text_size, vocab_size = len(text), len(words) 195 | 196 | print(f'text has {text_size} characters, {vocab_size} unique.') 197 | word_to_ix = {word: i for i, word in enumerate(words)} 198 | ix_to_word = {i: word for i, word in enumerate(words)} 199 | 200 | seq_length = 50 201 | indices = np.vectorize(word_to_ix.get)(np.array(list(text))) 202 | data = np.zeros((text_size, vocab_size)) 203 | data[np.arange(text_size), indices] = 1 204 | n_text = (text_size - 1) // seq_length 205 | x = data[ 206 | :n_text * seq_length].reshape(n_text, seq_length, vocab_size).transpose(1, 0, 2) 207 | y = indices[1: n_text * seq_length + 1].reshape(n_text, seq_length).T 208 | 209 | test_ratio = 0.2 210 | test_split = np.random.uniform(0, 1, x.shape[1]) 211 | train_x, test_x = x[:, test_split >= test_ratio, :], x[ 212 | :, test_split < test_ratio, :] 213 | train_y, test_y = y[:, test_split >= test_ratio], y[ 214 | :, test_split < test_ratio] 215 | 216 | lstm = LSTM(vocab_size, 500, vocab_size, seq_length) 217 | lstm.ix_to_word = ix_to_word 218 | lstm.fit(train_x, train_y) 219 | print('train loss', (np.argmax(lstm.predict(train_x), axis=2) 220 | == train_y).sum() / (train_y.shape[0] * train_y.shape[1])) 221 | print('test loss', (np.argmax(lstm.predict(test_x), axis=2) 222 | == test_y).sum() / (test_y.shape[0] * test_y.shape[1])) 223 | 224 | 225 | def main(): 226 | text_generation(use_word=False) 227 | 228 | 229 | if __name__ == "__main__": 230 | main() 231 | -------------------------------------------------------------------------------- /nn_layers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # will add dropout 3 | 4 | 5 | def img2col_index(x_shape, k_size, stride=1): 6 | in_c, in_h, in_w = x_shape 7 | out_h, out_w = (in_h - k_size) // stride + 1, 
(in_w - k_size) // stride + 1 8 | n_rows = out_h * out_w 9 | c_idices = np.tile( 10 | np.repeat(np.arange(in_c), k_size * k_size), (n_rows, 1)) 11 | h_off_set = np.repeat(np.arange(0, in_h - k_size + 1, stride), out_w) 12 | h_indices = np.tile(np.repeat(np.arange(k_size), k_size), (n_rows, in_c)) 13 | h_indices += h_off_set.reshape(-1, 1) 14 | w_off_set = np.tile(np.arange(0, in_w - k_size + 1, stride), (1, out_h)) 15 | w_indices = np.tile(np.arange(k_size), (n_rows, k_size * in_c)) 16 | w_indices += w_off_set.reshape(-1, 1) 17 | return c_idices, h_indices, w_indices 18 | 19 | 20 | def img2col(img, k_size, stride=1): 21 | batch_size, in_c, in_h, in_w = img.shape 22 | c_idices, h_indices, w_indices = img2col_index( 23 | [in_c, in_h, in_w], k_size, stride) 24 | return img[:, c_idices, h_indices, w_indices].transpose(1, 0, 2).reshape(-1, in_c * k_size * k_size) 25 | 26 | 27 | def col2img(col, in_shape, k_size, stride): 28 | in_c, in_h, in_w = in_shape 29 | out_h, out_w = (in_h - k_size) // stride + 1, (in_w - k_size) // stride + 1 30 | batch_size = col.shape[0] // out_h // out_w 31 | c_idices, h_indices, w_indices = img2col_index(in_shape, k_size, stride) 32 | img = np.zeros((batch_size, in_c, in_h, in_w)) 33 | np.add.at( 34 | img, 35 | (slice(None), c_idices, h_indices, w_indices), 36 | col.reshape(-1, batch_size, in_c * k_size * k_size).transpose(1, 0, 2) 37 | ) 38 | #img[:, c_idices, h_indices, w_indices] += col.reshape(-1, batch_size, in_c * k_size * k_size).transpose(1,0,2) 39 | return img 40 | 41 | 42 | class Layer(object): 43 | 44 | def __init__(self, lr=1e-3, optimizer="Adam"): 45 | self.gradient_funcs = {"Adam": self.adam, 46 | "SGD": self.sgd, "RMSProp": self.rmsprop} 47 | self.learning_rate = lr 48 | self.weight_decay = 1e-4 49 | self.eps = 1e-20 50 | self.optimizer = optimizer 51 | 52 | def init_momentum_cache(self): 53 | self.mom_w, self.cache_w = np.zeros_like(self.w), np.zeros_like(self.w) 54 | self.mom_b, self.cache_b = np.zeros_like(self.b), np.zeros_like(self.b) 55 | 56 | def forward(self, x): 57 | pass 58 | 59 | def gradient(self, grad): 60 | pass 61 | 62 | def backward(self): 63 | self.regularize() 64 | self.gradient_funcs[self.optimizer]() 65 | del self.grad_w 66 | del self.grad_b 67 | 68 | def regularize(self): 69 | self.w *= (1 - self.weight_decay) 70 | self.b *= (1 - self.weight_decay) 71 | 72 | def adam(self): 73 | beta1 = 0.9 74 | beta2 = 0.999 75 | alpha = self.learning_rate 76 | self.mom_w = beta1 * self.mom_w + (1 - beta1) * self.grad_w 77 | self.cache_w = beta2 * self.cache_w + \ 78 | (1 - beta2) * np.square(self.grad_w) 79 | self.w -= alpha * self.mom_w / (np.sqrt(self.cache_w) + self.eps) 80 | self.mom_b = beta1 * self.mom_b + (1 - beta1) * self.grad_b 81 | self.cache_b = beta2 * self.cache_b + \ 82 | (1 - beta2) * np.square(self.grad_b) 83 | self.b -= alpha * self.mom_b / (np.sqrt(self.cache_b) + self.eps) 84 | 85 | def rmsprop(self): 86 | gamma = 0.9 87 | alpha = self.learning_rate 88 | self.cache_w = gamma * self.cache_w + \ 89 | (1 - gamma) * np.square(self.grad_w) 90 | self.w -= alpha * self.grad_w / (np.sqrt(self.cache_w) + self.eps) 91 | self.cache_b = gamma * self.cache_b + \ 92 | (1 - gamma) * np.square(self.grad_b) 93 | self.b -= alpha * self.grad_b / (np.sqrt(self.cache_b) + self.eps) 94 | 95 | def sgd(self): 96 | self.w -= self.learning_rate * self.grad_w 97 | self.b -= self.learning_rate * self.grad_b 98 | 99 | 100 | class Conv(Layer): 101 | 102 | def __init__(self, in_shape, k_size, k_num, stride=1, padding=0, lr=1e-3, optimizer="Adam"): 103 | 
super(Conv, self).__init__(lr=lr, optimizer=optimizer) 104 | self.in_shape = in_shape 105 | channel, height, width = in_shape 106 | self.k_size = k_size 107 | self.w = np.random.randn( 108 | channel * k_size * k_size, k_num) / np.sqrt(channel / 2) / k_size 109 | self.b = np.zeros((1, k_num)) 110 | self.init_momentum_cache() 111 | assert((height + 2 * padding - k_size) % stride == 0) 112 | assert((width + 2 * padding - k_size) % stride == 0) 113 | self.out_shape = (k_num, (height + 2 * padding - k_size) // 114 | stride + 1, (width + 2 * padding - k_size) // stride + 1) 115 | self.stride, self.padding = stride, padding 116 | 117 | def forward(self, x): 118 | p = self.padding 119 | x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), 'constant') 120 | self.input = img2col(x_padded, self.k_size, self.stride) 121 | out = self.input.dot(self.w) + self.b 122 | out = out.reshape(self.out_shape[1], self.out_shape[ 123 | 2], x.shape[0], self.out_shape[0]) 124 | return out.transpose(2, 3, 0, 1) 125 | 126 | def gradient(self, grad): 127 | batch_size = grad.shape[0] 128 | p = self.padding 129 | padded_inshape = (self.in_shape[0], self.in_shape[ 130 | 1] + 2 * p, self.in_shape[2] + 2 * p) 131 | grad_out = grad.transpose(2, 3, 0, 1).reshape([-1, self.out_shape[0]]) 132 | self.grad_w = self.input.T.dot(grad_out) / batch_size 133 | self.grad_b = np.ones((1, grad_out.shape[0])).dot( 134 | grad_out) / batch_size 135 | del self.input 136 | grad_padded = col2img(grad_out.dot(self.w.T), 137 | padded_inshape, self.k_size, self.stride) 138 | return grad_padded if p == 0 else grad_padded[:, :, p:-p, p:-p] 139 | 140 | 141 | class TrasposedConv(Layer): 142 | 143 | def __init__(self, in_shape, k_size, k_num, stride=1, padding=0, lr=1e-3, optimizer="Adam"): 144 | super(TrasposedConv, self).__init__(lr=lr, optimizer=optimizer) 145 | self.in_shape = in_shape 146 | channel, height, width = in_shape 147 | self.k_size = k_size 148 | self.w = np.random.randn( 149 | channel, k_num * k_size * k_size) / np.sqrt(k_num / 2) / k_size 150 | self.b = np.zeros((1, k_num)) 151 | self.init_momentum_cache() 152 | 153 | self.out_shape = (k_num, stride * (height - 1) + k_size - 154 | 2 * padding, stride * (width - 1) + k_size - 2 * padding) 155 | self.stride, self.padding = stride, padding 156 | 157 | def forward(self, x): 158 | self.input = x.transpose(2, 3, 0, 1).reshape([-1, self.in_shape[0]]) 159 | p = self.padding 160 | padded_outshape = (self.out_shape[0], self.out_shape[ 161 | 1] + 2 * p, self.out_shape[2] + 2 * p) 162 | out_cols = self.input.dot(self.w) 163 | out_padded = col2img(out_cols, padded_outshape, self.k_size, 164 | self.stride) + self.b.reshape((1, -1, 1, 1)) 165 | return out_padded if p == 0 else out_padded[:, :, p:-p, p:-p] 166 | 167 | def gradient(self, grad): 168 | batch_size = grad.shape[0] 169 | p = self.padding 170 | grad_padded = np.pad( 171 | grad, ((0, 0), (0, 0), (p, p), (p, p)), 'constant') 172 | grad_col = img2col(grad_padded, self.k_size, self.stride) 173 | self.grad_w = self.input.T.dot(grad_col) / batch_size 174 | self.grad_b = grad.sum(axis=(0, 2, 3)) / batch_size 175 | del self.input 176 | return grad_col.dot(self.w.T).reshape(self.in_shape[1], self.in_shape[2], batch_size, self.in_shape[0]).transpose(2, 3, 0, 1) 177 | 178 | 179 | class MaxPooling(Layer): 180 | 181 | def __init__(self, in_shape, k_size, stride=None): 182 | super(MaxPooling, self).__init__() 183 | self.in_shape = in_shape 184 | channel, height, width = in_shape 185 | self.k_size = k_size 186 | self.stride = k_size if stride is None 
else stride 187 | self.out_shape = (channel, (height - k_size) // 188 | self.stride + 1, (width - k_size) // self.stride + 1) 189 | 190 | def gradient(self, grad): 191 | grad = np.repeat(grad, self.k_size, axis=2) 192 | grad = np.repeat(grad, self.k_size, axis=3) 193 | return np.multiply(self.mask, grad) 194 | 195 | def forward(self, x): 196 | col = img2col( 197 | x.reshape(-1, 1, self.in_shape[1], self.in_shape[2]), k_size=self.k_size, stride=self.stride) 198 | max_idx = np.argmax(col, axis=1) 199 | col_mask = np.zeros(col.shape) 200 | col_mask[range(col.shape[0]), max_idx] = 1 201 | col_mask = col_mask.reshape(self.out_shape[ 202 | 1] * self.out_shape[2] * x.shape[0], self.in_shape[0] * self.k_size * self.k_size) 203 | self.mask = col2img(col_mask, self.in_shape, self.k_size, self.stride) 204 | out = col[range(col.shape[0]), max_idx].reshape( 205 | self.out_shape[1], self.out_shape[2], x.shape[0], self.in_shape[0]) 206 | return out.transpose(2, 3, 0, 1) 207 | 208 | def backward(self): 209 | pass 210 | 211 | 212 | class Softmax(Layer): 213 | 214 | def __init__(self): 215 | super(Softmax, self).__init__() 216 | 217 | def loss(self, out, y): 218 | return -(np.multiply(y, np.log(out + self.eps))).mean() 219 | 220 | def forward(self, x): 221 | out = np.exp(x - np.max(x, axis=1).reshape([-1, 1])) 222 | self.out = out / (np.sum(out, axis=1).reshape([-1, 1]) + self.eps) 223 | return self.out 224 | 225 | def gradient(self, y): 226 | return self.out - y 227 | 228 | def backward(self): 229 | pass 230 | 231 | 232 | class FullyConnect(Layer): 233 | 234 | def __init__(self, in_shape, out_shape, lr=1e-3, optimizer="Adam"): 235 | super(FullyConnect, self).__init__(lr=lr, optimizer=optimizer) 236 | self.in_shape, self.out_shape = in_shape, out_shape 237 | in_dim, out_dim = np.prod(in_shape), np.prod(out_shape) 238 | self.w = np.random.randn(in_dim, out_dim) / np.sqrt(in_dim / 2) 239 | self.b = np.zeros((1, out_dim)) 240 | self.init_momentum_cache() 241 | 242 | def forward(self, x): 243 | self.input = x.reshape([x.shape[0], np.prod(self.in_shape)]) 244 | return (self.input.dot(self.w) + self.b).reshape([-1] + list(self.out_shape)) 245 | 246 | def gradient(self, grad): 247 | batch_size = grad.shape[0] 248 | grad_out = grad.reshape((batch_size, np.prod(self.out_shape))) 249 | self.grad_w = self.input.T.dot(grad_out) / batch_size 250 | self.grad_b = np.ones((1, batch_size)).dot(grad_out) / batch_size 251 | del self.input 252 | return grad_out.dot(self.w.T).reshape([-1] + list(self.in_shape)) 253 | 254 | 255 | class Activation(Layer): 256 | 257 | def __init__(self, act_type): 258 | super(Activation, self).__init__() 259 | self.act_funcs = {'ReLU': self.relu, 'Sigmoid': self.sigmoid, 260 | 'Tanh': self.tanh, 'LeakyReLU': self.leaky_relu} 261 | self.dact_funcs = {'ReLU': self.drelu, 'Sigmoid': self.dsigmoid, 262 | 'Tanh': self.dtanh, 'LeakyReLU': self.dleaky_relu} 263 | self.act_func = self.act_funcs[act_type] 264 | self.dact_func = self.dact_funcs[act_type] 265 | 266 | def forward(self, x): 267 | self.out = self.act_func(x) 268 | return self.out 269 | 270 | def gradient(self, grad): 271 | return self.dact_func(grad, self.out) 272 | 273 | def relu(self, x): 274 | return x * (x > 0) 275 | 276 | def leaky_relu(self, x): 277 | return x * ((x > 0) * 0.99 + 0.01) 278 | 279 | def sigmoid(self, x): 280 | return 1 / (1 + np.exp(-x)) 281 | 282 | def tanh(self, x): 283 | return np.tanh(x) 284 | 285 | def drelu(self, grad, act): 286 | return grad * (act > 0) 287 | 288 | def dleaky_relu(self, grad, act): 289 | return grad 
* ((act > 0) * 0.99 + 0.01) 290 | 291 | def dsigmoid(self, grad, act): 292 | return np.multiply(grad, act - np.square(act)) 293 | 294 | def dtanh(self, grad, act): 295 | return np.multiply(grad, 1 - np.square(act)) 296 | 297 | def backward(self): 298 | pass 299 | 300 | 301 | class BatchNormalization(Layer): 302 | 303 | def __init__(self, in_shape, lr=1e-3, momentum=0.9, optimizer='Adam'): 304 | super(BatchNormalization, self).__init__(lr=lr, optimizer=optimizer) 305 | self.in_shape = in_shape 306 | self.param_shape = (1, in_shape[0]) if len( 307 | in_shape) == 1 else (1, in_shape[0], 1, 1) 308 | self.agg_axis = 0 if len(in_shape) == 1 else ( 309 | 0, 2, 3) # cnn over channel 310 | self.momentum = momentum 311 | self.weight_decay = 0 312 | self.w, self.b = np.ones(self.param_shape), np.zeros(self.param_shape) 313 | self.init_momentum_cache() 314 | self.global_mean, self.global_var = np.zeros( 315 | self.param_shape), np.ones(self.param_shape) 316 | 317 | def forward(self, x): 318 | batch_mean = x.mean(axis=self.agg_axis).reshape(self.param_shape) 319 | batch_var = x.var(axis=self.agg_axis).reshape(self.param_shape) 320 | self.global_mean = batch_mean * \ 321 | (1.0 - self.momentum) + self.global_mean * self.momentum 322 | self.global_var = batch_var * \ 323 | (1.0 - self.momentum) + self.global_var * self.momentum 324 | self.batch_var_sqrt = np.sqrt(batch_var + self.eps) 325 | self.x_hat = (x - batch_mean) / self.batch_var_sqrt 326 | return self.w * self.x_hat + self.b 327 | 328 | def predict_forward(self, x): 329 | return self.w * (x - self.global_mean) / np.sqrt(self.global_var + self.eps) + self.b 330 | 331 | def gradient(self, grad): 332 | batch_size = grad.shape[0] 333 | self.grad_w = ( 334 | grad * self.x_hat).sum(axis=self.agg_axis).reshape(self.param_shape) / batch_size 335 | self.grad_b = grad.sum(axis=self.agg_axis).reshape( 336 | self.param_shape) / batch_size 337 | grad_x_hat = grad * self.w 338 | return ( 339 | grad_x_hat 340 | - grad_x_hat.mean(axis=self.agg_axis).reshape(self.param_shape) 341 | - self.x_hat * 342 | (grad_x_hat * self.x_hat).mean(axis=self.agg_axis).reshape(self.param_shape) 343 | ) / self.batch_var_sqrt 344 | --------------------------------------------------------------------------------
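A minimal usage sketch of the layer classes in nn_layers.py (illustrative only, not a file from the repository): it mirrors the way the small NN wrapper in deep_q_network.py and generative_adversarial_network.py chains layers, and assumes MNIST is fetched the same way as in multilayer_perceptron.py. It runs one forward pass, one backpropagation pass, and one optimizer step on a single mini-batch.

import numpy as np
from sklearn.datasets import fetch_openml
from nn_layers import FullyConnect, Activation, Softmax

# Illustrative sketch: a tiny fully connected classifier built from nn_layers.
layers = [
    FullyConnect([784], [128], lr=1e-3, optimizer='Adam'),
    Activation(act_type='ReLU'),
    FullyConnect([128], [10], lr=1e-3, optimizer='Adam'),
    Softmax(),
]

x, y = fetch_openml('mnist_784', return_X_y=True, data_home='data', as_frame=False)
x = x / x.max()
onehot = np.zeros((y.shape[0], 10))
onehot[np.arange(y.shape[0]), y.astype(np.int_)] = 1

batch = np.random.choice(x.shape[0], 32, replace=False)
out = x[batch]
for layer in layers:  # forward pass through every layer
    out = layer.forward(out)
grad = onehot[batch]  # Softmax.gradient(y) returns prediction - y
for layer in reversed(layers):  # backpropagate the gradient
    grad = layer.gradient(grad)
for layer in layers:  # parameter update with each layer's own optimizer
    layer.backward()
print('mini-batch loss:', layers[-1].loss(out, onehot[batch]))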