├── .gitignore ├── 02.Perceptron └── perceptron.py ├── 03.KNN ├── __init__.py ├── knn.py ├── knn_kdtree.py └── test_kdtree.py ├── 04.NaiveBayes ├── NaiveBayesMAP.py └── NaiveBayesMLE.py ├── 05.DecisionTree ├── C4.5.py ├── ClassificationCART.py ├── ID3.py ├── RegressionCART.py ├── prune.py └── pruneClassificationCART.py ├── 06.LogisticRegression-MaxEntropy ├── BinaryLogisticRegression.py └── MaxEntropy.py ├── 07.SVM └── SVM.py ├── 08.Boosting ├── AdaBoost.py └── GBDT.py ├── 09.EM ├── GMM.py ├── GMMGradientDescent.py └── benchmark.py ├── 10.HMM ├── Backward.py ├── BaumWelch.py ├── Forward.py ├── HMM.py └── Viterbi.py ├── 11.ConditionalRandomField └── LinearChainConditionalRandomField.py ├── 14.Cluster ├── Agglomerative.py └── KMeans.py ├── 15.SVD └── SVD.py ├── 16.PCA └── PCA.py ├── 17.LSA └── LSA.py ├── 18.PLSA └── PLSA.py ├── 19.MCMC ├── GibbsSampling.py ├── MetropolisHasting.py └── SingleComponentMetropolisHasting.py ├── 20.LDA └── LDA.py ├── 21.PageRank └── PageRank.py ├── README.md ├── __init__.py ├── test_get_solution_domain.py ├── test_heap.py ├── test_information_gain.py ├── test_line_search.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | pytestdebug.log 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | doc/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | ### Emacs 138 | # -*- mode: gitignore; -*- 139 | *~ 140 | \#*\# 141 | /.emacs.desktop 142 | /.emacs.desktop.lock 143 | *.elc 144 | auto-save-list 145 | tramp 146 | .\#* 147 | 148 | # Org-mode 149 | .org-id-locations 150 | *_archive 151 | 152 | # flymake-mode 153 | *_flymake.* 154 | 155 | # eshell files 156 | /eshell/history 157 | /eshell/lastdir 158 | 159 | # elpa packages 160 | /elpa/ 161 | 162 | # reftex files 163 | *.rel 164 | 165 | # AUCTeX auto folder 166 | /auto/ 167 | 168 | # cask packages 169 | .cask/ 170 | dist/ 171 | 172 | # Flycheck 173 | flycheck_*.el 174 | 175 | # server auth directory 176 | /server/ 177 | 178 | # projectiles files 179 | .projectile 180 | 181 | # directory configuration 182 | .dir-locals.el 183 | 184 | # network security 185 | /network-security.data 186 | 187 | GPATH 188 | GRTAGS 189 | GTAGS 190 | -------------------------------------------------------------------------------- /02.Perceptron/perceptron.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | from rich.console import Console 5 | from rich.table import Table 6 | import sys 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | 11 | class Perceptron: 12 | def __init__(self, lr=1e-1, max_iteration=2000, verbose=False): 13 | self.lr = lr 14 | self.verbose = verbose 15 | self.max_iteration = max_iteration 16 | 17 | def _trans(self, x): 18 | return self.w @ x + self.b 19 | 20 | def _predict(self, x): 21 | return 1 if self._trans(x) >= 0. 
else -1 22 | 23 | def fit(self, X, Y): 24 | self.feature_size = X.shape[-1] 25 | # define parameteres 26 | self.w = np.random.rand(self.feature_size) 27 | self.b = np.random.rand(1) 28 | 29 | updated = 1 30 | epoch = 0 31 | # if there is mis-classified sample, train 32 | while updated > 0 and epoch < self.max_iteration: 33 | if self.verbose: 34 | print(f"epoch {epoch} started...") 35 | 36 | updated = 0 37 | # shuffle data 38 | perm = np.random.permutation(len(X)) 39 | for i in perm: 40 | x, y = X[i], Y[i] 41 | # if there is a mis-classified sample 42 | if self._predict(x) != y: 43 | # update the parameters 44 | self.w += self.lr * y * x 45 | self.b += self.lr * y 46 | updated += 1 47 | 48 | if self.verbose: 49 | print(f"epoch {epoch} finishied, {updated} pieces of data mis-classified") 50 | epoch += 1 51 | return 52 | 53 | def predict(self, X): 54 | return np.apply_along_axis(self._predict, axis=-1, arr=X) 55 | 56 | if __name__ == "__main__": 57 | def demonstrate(X, Y, desc): 58 | console = Console(markup=False) 59 | perceptron = Perceptron(verbose=True) 60 | perceptron.fit(X, Y) 61 | 62 | # plot 63 | plt.scatter(X[:, 0], X[:, 1], c=Y) 64 | wbline(perceptron.w, perceptron.b) 65 | plt.title(desc) 66 | plt.show() 67 | 68 | # show in table 69 | pred = perceptron.predict(X) 70 | table = Table('x', 'y', 'pred') 71 | for x, y, y_hat in zip(X, Y, pred): 72 | table.add_row(*map(str, [x, y, y_hat])) 73 | console.print(table) 74 | 75 | # -------------------------- Example 1 ---------------------------------------- 76 | print("Example 1:") 77 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 78 | Y = np.array([1, 1, -1, -1]) 79 | demonstrate(X, Y, "Example 1") 80 | 81 | # -------------------------- Example 2 ---------------------------------------- 82 | print("Example 2: Perceptron cannot solve a simple XOR problem") 83 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 84 | Y = np.array([1, -1, -1, 1]) 85 | demonstrate(X, Y, "Example 2: Perceptron cannot solve a simple XOR problem") 86 | -------------------------------------------------------------------------------- /03.KNN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SleepyBag/Statistical-Learning-Methods/c16edf2d56f9f7c00651c749464b74b9ec039522/03.KNN/__init__.py -------------------------------------------------------------------------------- /03.KNN/knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | from matplotlib import pyplot as plt 4 | from functools import partial 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | 11 | class KNN: 12 | def __init__(self, k=1, distance_func="l2"): 13 | self.k = k 14 | if distance_func == 'l2': 15 | self.distance_func = lambda x, y: np.linalg.norm(x - y) 16 | else: 17 | self.distance_func = distance_func 18 | 19 | def _knn(self, x): 20 | dis = np.apply_along_axis(partial(self.distance_func, y=x), axis=-1, arr=self.X) 21 | topk_ind = np.argpartition(dis, self.k)[:self.k] 22 | return topk_ind 23 | 24 | def _predict(self, x): 25 | topk_ind = self._knn(x) 26 | topk_y = self.Y[topk_ind] 27 | return np.argmax(np.bincount(topk_y)) 28 | 29 | def fit(self, X, Y): 30 | self.X = X 31 | self.Y = Y 32 | self.k = min(self.k, len(self.X)) 33 | 34 | def predict(self, X): 35 | return np.apply_along_axis(self._predict, axis=-1, arr=X) 36 | 37 | if __name__ 
== "__main__": 38 | def demonstrate(X_train, Y_train, X_test, k, desc): 39 | knn = KNN(k=k) 40 | knn.fit(X_train, Y_train) 41 | pred_test = knn.predict(X_test) 42 | 43 | # plot 44 | plt.scatter(X_train[:,0], X_train[:,1], c=Y_train, s=20) 45 | plt.scatter(X_test[:,0], X_test[:,1], c=pred_test, marker=".", s=1) 46 | plt.title(desc) 47 | plt.show() 48 | 49 | # -------------------------- Example 1 ---------------------------------------- 50 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 51 | Y_train = np.array([1, 2, 3, 4, 5]) 52 | # generate grid-shaped test data 53 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 54 | demonstrate(X_train, Y_train, X_test, 1, "Example 1") 55 | 56 | # -------------------------- Example 2 (Imbalanced Data) ------------------------ 57 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 58 | Y_train = np.array([1, 1, 2, 3, 4]) 59 | # generate grid-shaped test data 60 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 61 | demonstrate(X_train, Y_train, X_test, 1, "Example 2") 62 | 63 | # -------------------------- Example 3 (Imbalanced Data) ------------------------ 64 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 65 | Y_train = np.array([1, 1, 2, 2, 2]) 66 | # generate grid-shaped test data 67 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 68 | demonstrate(X_train, Y_train, X_test, 1, "Example 3") 69 | -------------------------------------------------------------------------------- /03.KNN/knn_kdtree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | from matplotlib import pyplot as plt 4 | from rich.console import Console 5 | from rich.table import Table 6 | from functools import partial 7 | import sys 8 | import os 9 | from pathlib import Path 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import * 12 | 13 | class KDTree: 14 | class Node: 15 | def __init__(self, points, labels, axis): 16 | self.points = points 17 | self.labels = labels 18 | self.axis = axis 19 | self.left = None 20 | self.right = None 21 | 22 | def build(self, X, Y, split_axis=0): 23 | if not len(X): 24 | return None 25 | median_ind = np.argpartition(X[:, split_axis], len(X) // 2, axis=0)[len(X) // 2] 26 | split_point = float(X[median_ind, split_axis]) 27 | equal_x = X[X[:, split_axis] == split_point] 28 | equal_y = Y[X[:, split_axis] == split_point] 29 | less_x = X[X[:, split_axis] < split_point] 30 | less_y = Y[X[:, split_axis] < split_point] 31 | greater_x = X[X[:, split_axis] > split_point] 32 | greater_y = Y[X[:, split_axis] > split_point] 33 | node = self.Node(equal_x, equal_y, split_axis) 34 | node.left = self.build(less_x, less_y, 1 - split_axis) 35 | node.right = self.build(greater_x, greater_y, 1 - split_axis) 36 | return node 37 | 38 | def _query(self, root, x, k): 39 | if not root: 40 | return Heap(max_len=k, key=lambda xy: -euc_dis(x, xy[0])) 41 | # Find the region that contains the target point 42 | if x[root.axis] <= root.points[0][root.axis]: 43 | ans = self._query(root.left, x, k) 44 | sibling = root.right 45 | else: 46 | ans = self._query(root.right, x, k) 47 | sibling = root.left 48 | # All the points on the current splitting line are possible answers 49 | for curx, cury in zip(root.points, root.labels): 50 | ans.push((curx, 
cury)) 51 | # If the distance between the target point and the splitting line is 52 | # shorter than the best answer up until, find in the other tree 53 | if len(ans) < k or -ans.top_key() > abs(x[root.axis] - root.points[0][root.axis]): 54 | other_ans = self._query(sibling, x, k) 55 | while other_ans: 56 | otherx, othery = other_ans.pop() 57 | ans.push((otherx, othery)) 58 | return ans 59 | 60 | def query(self, x, k): 61 | return self._query(self.root, x, k) 62 | 63 | def __init__(self, X, Y): 64 | self.root = self.build(X, Y) 65 | 66 | class KNN: 67 | def __init__(self, k=1, distance_func="l2"): 68 | self.k = k 69 | if distance_func == 'l2': 70 | self.distance_func = lambda x, y: np.linalg.norm(x - y) 71 | else: 72 | self.distance_func = distance_func 73 | 74 | def _predict(self, x): 75 | topk = self.tree.query(x, self.k) 76 | topk_y = [y for x, y in topk] 77 | return np.argmax(np.bincount(topk_y)) 78 | 79 | def fit(self, X, Y): 80 | self.tree = KDTree(X, Y) 81 | self.k = min(self.k, len(X)) 82 | 83 | def predict(self, X): 84 | return np.apply_along_axis(self._predict, axis=-1, arr=X) 85 | 86 | if __name__ == "__main__": 87 | def demonstrate(X_train, Y_train, X_test, k, desc): 88 | knn = KNN(k=k) 89 | knn.fit(X_train, Y_train) 90 | pred_test = knn.predict(X_test) 91 | 92 | # plot 93 | plt.scatter(X_train[:,0], X_train[:,1], c=Y_train, s=20) 94 | plt.scatter(X_test[:,0], X_test[:,1], c=pred_test, marker=".", s=1) 95 | plt.title(desc) 96 | plt.show() 97 | 98 | # -------------------------- Example 1 ---------------------------------------- 99 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 100 | Y_train = np.array([1, 2, 3, 4, 5]) 101 | # generate grid-shaped test data 102 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 103 | demonstrate(X_train, Y_train, X_test, 1, "Example 1") 104 | 105 | # -------------------------- Example 2 (Imbalanced Data) ------------------------ 106 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 107 | Y_train = np.array([1, 1, 2, 3, 4]) 108 | # generate grid-shaped test data 109 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 110 | demonstrate(X_train, Y_train, X_test, 1, "Example 2") 111 | 112 | # -------------------------- Example 3 (Imbalanced Data) ------------------------ 113 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 114 | Y_train = np.array([1, 1, 2, 2, 2]) 115 | # generate grid-shaped test data 116 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 117 | demonstrate(X_train, Y_train, X_test, 1, "Example 3") 118 | -------------------------------------------------------------------------------- /03.KNN/test_kdtree.py: -------------------------------------------------------------------------------- 1 | import knn_kdtree 2 | import numpy as np 3 | 4 | X = np.array([[1, 1], [1, 2], [1, 3], [2, 2], [3, 1], [3, 2], [3, 3]]) 5 | Y = np.array([0] * len(X)) 6 | tree = knn_kdtree.KDTree(X, Y) 7 | 8 | def points_equal(a, b): 9 | a = set(map(tuple, a)) 10 | b = set(map(tuple, b)) 11 | return a == b 12 | 13 | assert(points_equal(tree.root.points, [[2, 2]])) 14 | assert(points_equal(tree.root.left.points, [[1, 2]])) 15 | assert(points_equal(tree.root.right.points, [[3, 2]])) 16 | assert(points_equal(tree.root.left.left.points, [[1, 1]])) 17 | assert(points_equal(tree.root.left.right.points, [[1, 3]])) 18 | 
assert(points_equal(tree.root.right.left.points, [[3, 1]])) 19 | assert(points_equal(tree.root.right.right.points, [[3, 3]])) 20 | 21 | assert(points_equal([a[0] for a in tree.query(np.array([2, 1]), 3)], [[1, 1], [2, 2], [3, 1]])) 22 | 23 | X = np.array([[0, 0], [1, 1], [2, 2]]) 24 | Y = np.array([0] * len(X)) 25 | tree = knn_kdtree.KDTree(X, Y) 26 | assert(points_equal([a[0] for a in tree.query(np.array([1, 1]), 3)], X)) 27 | 28 | X = np.array([[0, 0], [1, 1], [2, 2]]) 29 | Y = np.array([0] * len(X)) 30 | tree = knn_kdtree.KDTree(X, Y) 31 | assert(points_equal([a[0] for a in tree.query(np.array([10, 2.001]), 3)], X)) 32 | -------------------------------------------------------------------------------- /04.NaiveBayes/NaiveBayesMAP.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from rich.console import Console 3 | from rich.table import Table 4 | import numpy as np 5 | 6 | class NaiveBayesMAP: 7 | def __init__(self, lamda=1, verbose=False): 8 | # p(a|y), the probability of an attribute a when the data is of label y 9 | # its a three-layer dict 10 | # the first-layer key is y, the value label 11 | # the second-layer key is n, which means the nth attribute 12 | # the thrid-layer key is the value of the nth attribute 13 | self.pa_y = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))) 14 | # p(y), the prior probability of label y 15 | self.py = defaultdict(lambda: 0) 16 | self.verbose = verbose 17 | # parameter lamda means that 18 | # we take each value as it has appeared lamda times before our experiment 19 | self.lamda = lamda 20 | 21 | def fit(self, X, Y): 22 | y_cnt = Counter(Y) 23 | for col in range(len(X[0])): 24 | col_values = set(x[col] for x in X) 25 | for x, y in zip(X, Y): 26 | self.pa_y[y][col][x[col]] += 1 27 | for y in y_cnt: 28 | for a in self.pa_y[y][col]: 29 | self.pa_y[y][col][a] += self.lamda 30 | self.pa_y[y][col][a] /= y_cnt[y] + self.lamda * len(col_values) 31 | for y in y_cnt: 32 | self.py[y] = (y_cnt[y] + self.lamda) / (len(X) + self.lamda * len(y_cnt)) 33 | 34 | if self.verbose: 35 | for y in self.pa_y: 36 | print(f'The prior probability of label {y} is', self.py[y]) 37 | for nth in self.pa_y[y]: 38 | prob = self.pa_y[y][nth] 39 | for a in prob: 40 | print(f'When the label is {y}, the probability that {nth}th attribute be {a} is {prob[a]}') 41 | 42 | def _predict(self, x): 43 | # all the labels 44 | labels = list(self.pa_y.keys()) 45 | probs = [] 46 | for y in labels: 47 | prob = self.py[y] 48 | for i, a in enumerate(x): 49 | prob *= self.pa_y[y][i][a] 50 | probs.append(prob) 51 | if self.verbose: 52 | for y, p in zip(labels, probs): 53 | print(f'The likelihood {x} belongs to {y} is {p}') 54 | return labels[np.argmax(probs)] 55 | 56 | def predict(self, X): 57 | return [self._predict(x) for x in X] 58 | 59 | if __name__ == "__main__": 60 | console = Console(markup=False) 61 | naive_bayes_map = NaiveBayesMAP(verbose=True) 62 | # -------------------------- Example 1 ---------------------------------------- 63 | print("Example 1:") 64 | X = [ 65 | [1,'S'], 66 | [1,'M'], 67 | [1,'M'], 68 | [1,'S'], 69 | [1,'S'], 70 | [2,'S'], 71 | [2,'M'], 72 | [2,'M'], 73 | [2,'L'], 74 | [2,'L'], 75 | [3,'L'], 76 | [3,'M'], 77 | [3,'M'], 78 | [3,'L'], 79 | [3,'L'], 80 | ] 81 | Y = [-1 ,-1 ,1 ,1 ,-1 ,-1 ,-1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,-1] 82 | naive_bayes_map.fit(X, Y) 83 | 84 | # show in table 85 | pred = naive_bayes_map.predict(X) 86 | table = Table('x', 'y', 'pred') 87 | for x, y, y_hat in 
zip(X, Y, pred): 88 | table.add_row(*map(str, [x, y, y_hat])) 89 | console.print(table) 90 | -------------------------------------------------------------------------------- /04.NaiveBayes/NaiveBayesMLE.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from rich.console import Console 3 | from rich.table import Table 4 | import numpy as np 5 | 6 | class NaiveBayesMLE: 7 | def __init__(self, verbose=False): 8 | # p(a|y), the probability of an attribute a when the data is of label y 9 | # its a three-layer dict 10 | # the first-layer key is y, the value label 11 | # the second-layer key is n, which means the nth attribute 12 | # the thrid-layer key is the value of the nth attribute 13 | self.pa_y = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))) 14 | # p(y), the prior probability of label y 15 | self.py = defaultdict(lambda: 0) 16 | self.verbose = verbose 17 | 18 | def fit(self, X, Y): 19 | y_cnt = Counter(Y) 20 | for x, y in zip(X, Y): 21 | for i, a in enumerate(x): 22 | self.pa_y[y][i][a] += 1 / y_cnt[y] 23 | self.py[y] += 1 / len(X) 24 | 25 | if self.verbose: 26 | for y in self.pa_y: 27 | print(f'The prior probability of label {y} is', self.py[y]) 28 | for nth in self.pa_y[y]: 29 | prob = self.pa_y[y][nth] 30 | for a in prob: 31 | print(f'When the label is {y}, the probability that {nth}th attribute be {a} is {prob[a]}') 32 | 33 | def _predict(self, x): 34 | # all the labels 35 | labels = list(self.pa_y.keys()) 36 | probs = [] 37 | for y in labels: 38 | prob = self.py[y] 39 | for i, a in enumerate(x): 40 | prob *= self.pa_y[y][i][a] 41 | probs.append(prob) 42 | if self.verbose: 43 | for y, p in zip(labels, probs): 44 | print(f'The likelihood {x} belongs to {y} is {p}') 45 | return labels[np.argmax(probs)] 46 | 47 | def predict(self, X): 48 | return [self._predict(x) for x in X] 49 | 50 | if __name__ == "__main__": 51 | console = Console(markup=False) 52 | naive_bayes_mle = NaiveBayesMLE(verbose=True) 53 | # -------------------------- Example 1 ---------------------------------------- 54 | print("Example 1:") 55 | X = [ 56 | [1,'S'], 57 | [1,'M'], 58 | [1,'M'], 59 | [1,'S'], 60 | [1,'S'], 61 | [2,'S'], 62 | [2,'M'], 63 | [2,'M'], 64 | [2,'L'], 65 | [2,'L'], 66 | [3,'L'], 67 | [3,'M'], 68 | [3,'M'], 69 | [3,'L'], 70 | [3,'L'], 71 | ] 72 | Y = [-1 ,-1 ,1 ,1 ,-1 ,-1 ,-1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,-1] 73 | naive_bayes_mle.fit(X, Y) 74 | 75 | # show in table 76 | pred = naive_bayes_mle.predict(X) 77 | table = Table('x', 'y', 'pred') 78 | for x, y, y_hat in zip(X, Y, pred): 79 | table.add_row(*map(str, [x, y, y_hat])) 80 | console.print(table) 81 | -------------------------------------------------------------------------------- /05.DecisionTree/C4.5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from rich.console import Console 4 | from rich.table import Table 5 | from math import log 6 | from collections import Counter 7 | import sys 8 | import os 9 | from pathlib import Path 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import * 12 | 13 | class C45: 14 | class Node: 15 | def __init__(self, col, Y): 16 | self.col = col 17 | self.children = {} 18 | self.cnt = Counter(Y) 19 | self.label = self.cnt.most_common(1)[0][0] 20 | 21 | def __init__(self, information_gain_threshold=0., verbose=False): 22 | self.information_gain_threshold = information_gain_threshold 23 | 
self.verbose = verbose 24 | 25 | def build(self, X, Y, selected): 26 | cur = self.Node(None, Y) 27 | if self.verbose: 28 | print("Cur selected columns:", selected) 29 | print("Cur data:") 30 | pprint(X) 31 | print(Y) 32 | split = False 33 | # check if there is no attribute to choose, or there is no need for spilt 34 | if len(selected) != self.column_cnt and len(set(Y)) > 1: 35 | left_columns = list(set(range(self.column_cnt)) - selected) 36 | col_ind, best_information_gain_ratio = argmax(left_columns, key=lambda col: information_gain_ratio(X, Y, col)) 37 | col = left_columns[col_ind] 38 | # if this split is better than not splitting 39 | if best_information_gain_ratio > self.information_gain_threshold: 40 | print(f"Split by {col}th column") 41 | split = True 42 | cur.col = col 43 | for val in set(x[col] for x in X): 44 | ind = [x[col] == val for x in X] 45 | child_X = [x for i, x in zip(ind, X) if i] 46 | child_Y = [y for i, y in zip(ind, Y) if i] 47 | cur.children[val] = self.build(child_X, child_Y, selected | {col}) 48 | if not split: 49 | print("No split") 50 | return cur 51 | 52 | def query(self, root, x): 53 | if root.col is None or x[root.col] not in root.children: 54 | return root.label 55 | return self.query(root.children[x[root.col]], x) 56 | 57 | def fit(self, X, Y): 58 | self.column_cnt = len(X[0]) 59 | self.root = self.build(X, Y, set()) 60 | 61 | def _predict(self, x): 62 | return self.query(self.root, x) 63 | 64 | def predict(self, X): 65 | return [self._predict(x) for x in X] 66 | 67 | if __name__ == "__main__": 68 | console = Console(markup=False) 69 | c45 = C45(verbose=True) 70 | # -------------------------- Example 1 ---------------------------------------- 71 | # unpruned decision tree predict correctly for all training data 72 | print("Example 1:") 73 | X = [ 74 | ['青年', '否', '否', '一般'], 75 | ['青年', '否', '否', '好'], 76 | ['青年', '是', '否', '好'], 77 | ['青年', '是', '是', '一般'], 78 | ['青年', '否', '否', '一般'], 79 | ['老年', '否', '否', '一般'], 80 | ['老年', '否', '否', '好'], 81 | ['老年', '是', '是', '好'], 82 | ['老年', '否', '是', '非常好'], 83 | ['老年', '否', '是', '非常好'], 84 | ['老年', '否', '是', '非常好'], 85 | ['老年', '否', '是', '好'], 86 | ['老年', '是', '否', '好'], 87 | ['老年', '是', '否', '非常好'], 88 | ['老年', '否', '否', '一般'], 89 | ] 90 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 91 | c45.fit(X, Y) 92 | 93 | # show in table 94 | pred = c45.predict(X) 95 | table = Table('x', 'y', 'pred') 96 | for x, y, y_hat in zip(X, Y, pred): 97 | table.add_row(*map(str, [x, y, y_hat])) 98 | console.print(table) 99 | 100 | # -------------------------- Example 2 ---------------------------------------- 101 | # but unpruned decision tree doesn't generalize well for test data 102 | print("Example 2:") 103 | X = [ 104 | ['青年', '否', '否', '一般'], 105 | ['青年', '否', '否', '好'], 106 | ['青年', '是', '是', '一般'], 107 | ['青年', '否', '否', '一般'], 108 | ['老年', '否', '否', '一般'], 109 | ['老年', '否', '否', '好'], 110 | ['老年', '是', '是', '好'], 111 | ['老年', '否', '是', '非常好'], 112 | ['老年', '否', '是', '非常好'], 113 | ['老年', '否', '是', '非常好'], 114 | ['老年', '否', '是', '好'], 115 | ['老年', '否', '否', '一般'], 116 | ] 117 | Y = ['否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否'] 118 | c45.fit(X, Y) 119 | 120 | testX = [ 121 | ['青年', '否', '否', '一般'], 122 | ['青年', '否', '否', '好'], 123 | ['青年', '是', '否', '好'], 124 | ['青年', '是', '是', '一般'], 125 | ['青年', '否', '否', '一般'], 126 | ['老年', '否', '否', '一般'], 127 | ['老年', '否', '否', '好'], 128 | ['老年', '是', '是', '好'], 129 | ['老年', '否', '是', '非常好'], 130 | ['老年', '否', '是', '非常好'], 131 | ['老年', '否', 
'是', '非常好'], 132 | ['老年', '否', '是', '好'], 133 | ['老年', '是', '否', '好'], 134 | ['老年', '是', '否', '非常好'], 135 | ['老年', '否', '否', '一般'], 136 | ] 137 | testY = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 138 | 139 | # show in table 140 | pred = c45.predict(testX) 141 | table = Table('x', 'y', 'pred') 142 | for x, y, y_hat in zip(testX, testY, pred): 143 | table.add_row(*map(str, [x, y, y_hat])) 144 | console.print(table) 145 | -------------------------------------------------------------------------------- /05.DecisionTree/ClassificationCART.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import nan, inf 3 | from pprint import pprint 4 | from rich.console import Console 5 | from rich.table import Table 6 | from collections import Counter 7 | import sys 8 | import os 9 | from pathlib import Path 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import gini 12 | 13 | class ClassificationCART: 14 | class Node: 15 | def __init__(self, col, Y): 16 | self.col = col 17 | self.val = None 18 | self.left, self.right = None, None 19 | self.label = Counter(Y).most_common(1)[0][0] 20 | 21 | def __init__(self, verbose=False): 22 | self.verbose = verbose 23 | 24 | def get_gini_of_split(self, Y1, Y2): 25 | """get the weighted gini index of a split""" 26 | # the gini index of each part is weighted by 27 | # its share of the total number of samples 28 | gini1 = gini(Y1) 29 | gini2 = gini(Y2) 30 | length = len(Y1) + len(Y2) 31 | return len(Y1) / length * gini1 + len(Y2) / length * gini2 32 | 33 | def build(self, X, Y): 34 | cur = self.Node(None, Y) 35 | if self.verbose: 36 | print("Cur data:") 37 | pprint(X) 38 | print(Y) 39 | best_gini = inf 40 | best_col, best_val = -1, nan 41 | # The original content of the book doesn't discuss when to cease.
42 | # So I take the easiest way: cease when the data cannot be splitted 43 | if len(set(Y)) > 1: 44 | for col in range(len(X[0])): 45 | val_set = set(X[:, col]) 46 | if len(val_set) != 1: 47 | for val in val_set: 48 | # Don't split by the minimal value 49 | # because no value is smaller than it 50 | # so the left part is empty 51 | selected_ind = X[:, col] == val 52 | other_ind = X[:, col] != val 53 | selected_Y = Y[selected_ind] 54 | other_Y = Y[other_ind] 55 | cur_gini = self.get_gini_of_split(selected_Y, other_Y) 56 | if cur_gini < best_gini: 57 | best_gini, best_col, best_val = cur_gini, col, val 58 | 59 | # Build left and right child nodes recursively 60 | if self.verbose: 61 | print(f"Split by value {best_val} of {best_col}th column") 62 | selected_ind = X[:, best_col] == best_val 63 | other_ind = X[:, best_col] != best_val 64 | selected_X = X[selected_ind] 65 | other_X = X[other_ind] 66 | selected_Y = Y[selected_ind] 67 | other_Y = Y[other_ind] 68 | 69 | cur.col = best_col 70 | cur.val = best_val 71 | cur.left = self.build(selected_X, selected_Y) 72 | cur.right = self.build(other_X, other_Y) 73 | elif self.verbose: 74 | print("No split") 75 | return cur 76 | 77 | def query(self, root, x): 78 | if root.col is None: 79 | return root.label 80 | elif x[root.col] != root.val: 81 | return self.query(root.right, x) 82 | return self.query(root.left, x) 83 | 84 | def fit(self, X, Y): 85 | self.root = self.build(X, Y) 86 | 87 | def _predict(self, x): 88 | return self.query(self.root, x) 89 | 90 | def predict(self, X): 91 | return [self._predict(x) for x in X] 92 | 93 | if __name__ == "__main__": 94 | console = Console(markup=False) 95 | cart = ClassificationCART(verbose=True) 96 | # -------------------------- Example 1 ---------------------------------------- 97 | print("Example 1:") 98 | X = np.array([ 99 | ['青年', '否', '否', '一般'], 100 | ['青年', '否', '否', '好'], 101 | ['青年', '是', '否', '好'], 102 | ['青年', '是', '是', '一般'], 103 | ['青年', '否', '否', '一般'], 104 | ['老年', '否', '否', '一般'], 105 | ['老年', '否', '否', '好'], 106 | ['老年', '是', '是', '好'], 107 | ['老年', '否', '是', '非常好'], 108 | ['老年', '否', '是', '非常好'], 109 | ['老年', '否', '是', '非常好'], 110 | ['老年', '否', '是', '好'], 111 | ['老年', '是', '否', '好'], 112 | ['老年', '是', '否', '非常好'], 113 | ['老年', '否', '否', '一般'], 114 | ]) 115 | Y = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 116 | cart.fit(X, Y) 117 | 118 | # show in table 119 | pred = cart.predict(X) 120 | table = Table('x', 'y', 'pred') 121 | for x, y, y_hat in zip(X, Y, pred): 122 | table.add_row(*map(str, [x, y, y_hat])) 123 | console.print(table) 124 | 125 | # -------------------------- Example 2 ---------------------------------------- 126 | # but unpruned decision tree doesn't generalize well for test data 127 | print("Example 2:") 128 | X = np.array([ 129 | ['青年', '否', '否', '一般'], 130 | ['青年', '否', '否', '好'], 131 | ['青年', '是', '是', '一般'], 132 | ['青年', '否', '否', '一般'], 133 | ['老年', '否', '否', '一般'], 134 | ['老年', '否', '否', '好'], 135 | ['老年', '是', '是', '好'], 136 | ['老年', '否', '是', '非常好'], 137 | ['老年', '否', '是', '非常好'], 138 | ['老年', '否', '是', '非常好'], 139 | ['老年', '否', '是', '好'], 140 | ['老年', '否', '否', '一般'], 141 | ]) 142 | Y = np.array(['否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否']) 143 | cart.fit(X, Y) 144 | 145 | testX = np.array([ 146 | ['青年', '否', '否', '一般'], 147 | ['青年', '否', '否', '好'], 148 | ['青年', '是', '否', '好'], 149 | ['青年', '是', '是', '一般'], 150 | ['青年', '否', '否', '一般'], 151 | ['老年', '否', '否', '一般'], 152 | ['老年', '否', '否', '好'], 153 | ['老年', '是', 
'是', '好'], 154 | ['老年', '否', '是', '非常好'], 155 | ['老年', '否', '是', '非常好'], 156 | ['老年', '否', '是', '非常好'], 157 | ['老年', '否', '是', '好'], 158 | ['老年', '是', '否', '好'], 159 | ['老年', '是', '否', '非常好'], 160 | ['老年', '否', '否', '一般'], 161 | ]) 162 | testY = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 163 | 164 | # show in table 165 | pred = cart.predict(testX) 166 | table = Table('x', 'y', 'pred') 167 | for x, y, y_hat in zip(testX, testY, pred): 168 | table.add_row(*map(str, [x, y, y_hat])) 169 | console.print(table) 170 | -------------------------------------------------------------------------------- /05.DecisionTree/ID3.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from rich.console import Console 3 | from rich.table import Table 4 | from collections import Counter 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import argmax, information_gain 10 | 11 | 12 | class ID3: 13 | class Node: 14 | def __init__(self, col, Y): 15 | self.col = col 16 | self.children = {} 17 | self.cnt = Counter(Y) 18 | self.label = self.cnt.most_common(1)[0][0] 19 | 20 | def __init__(self, information_gain_threshold=0., verbose=False): 21 | self.information_gain_threshold = information_gain_threshold 22 | self.verbose = verbose 23 | 24 | def build(self, X, Y, selected): 25 | cur = self.Node(None, Y) 26 | if self.verbose: 27 | print("Cur selected columns:", selected) 28 | print("Cur data:") 29 | pprint(X) 30 | print(Y) 31 | split = False 32 | # check if there is no attribute to choose 33 | # or there is no need for spilt 34 | if len(selected) != self.column_cnt and len(set(Y)) > 1: 35 | left_columns = list(set(range(self.column_cnt)) - selected) 36 | col_ind, best_information_gain = argmax(left_columns, 37 | key=lambda col: information_gain(X, Y, col)) 38 | col = left_columns[col_ind] 39 | # if this split is better than not splitting 40 | if best_information_gain > self.information_gain_threshold: 41 | if self.verbose: 42 | print(f"Split by {col}th column") 43 | split = True 44 | cur.col = col 45 | for val in set(x[col] for x in X): 46 | ind = [x[col] == val for x in X] 47 | child_X = [x for i, x in zip(ind, X) if i] 48 | child_Y = [y for i, y in zip(ind, Y) if i] 49 | cur.children[val] = self.build(child_X, child_Y, selected | {col}) 50 | if not split and self.verbose: 51 | print("No split") 52 | return cur 53 | 54 | def query(self, root, x): 55 | if root.col is None or x[root.col] not in root.children: 56 | return root.label 57 | return self.query(root.children[x[root.col]], x) 58 | 59 | def fit(self, X, Y): 60 | self.column_cnt = len(X[0]) 61 | self.root = self.build(X, Y, set()) 62 | 63 | def _predict(self, x): 64 | return self.query(self.root, x) 65 | 66 | def predict(self, X): 67 | return [self._predict(x) for x in X] 68 | 69 | 70 | if __name__ == "__main__": 71 | console = Console(markup=False) 72 | id3 = ID3(verbose=False) 73 | # -------------------------- Example 1 ---------------------------------------- 74 | # unpruned decision tree predict correctly for all training data 75 | print("Example 1:") 76 | X = [ 77 | ['青年', '否', '否', '一般'], 78 | ['青年', '否', '否', '好'], 79 | ['青年', '是', '否', '好'], 80 | ['青年', '是', '是', '一般'], 81 | ['青年', '否', '否', '一般'], 82 | ['老年', '否', '否', '一般'], 83 | ['老年', '否', '否', '好'], 84 | ['老年', '是', '是', '好'], 85 | ['老年', '否', '是', '非常好'], 86 | ['老年', '否', '是', '非常好'], 87 | ['老年', '否', 
'是', '非常好'], 88 | ['老年', '否', '是', '好'], 89 | ['老年', '是', '否', '好'], 90 | ['老年', '是', '否', '非常好'], 91 | ['老年', '否', '否', '一般'], 92 | ] 93 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 94 | id3.fit(X, Y) 95 | 96 | # show in table 97 | pred = id3.predict(X) 98 | table = Table('x', 'y', 'pred') 99 | for x, y, y_hat in zip(X, Y, pred): 100 | table.add_row(*map(str, [x, y, y_hat])) 101 | console.print(table) 102 | 103 | # -------------------------- Example 2 ---------------------------------------- 104 | # but unpruned decision tree doesn't generalize well for test data 105 | print("Example 2:") 106 | X = [ 107 | ['青年', '否', '否', '一般'], 108 | ['青年', '否', '否', '好'], 109 | ['青年', '是', '是', '一般'], 110 | ['青年', '否', '否', '一般'], 111 | ['老年', '否', '否', '一般'], 112 | ['老年', '否', '否', '好'], 113 | ['老年', '是', '是', '好'], 114 | ['老年', '否', '是', '非常好'], 115 | ['老年', '否', '是', '非常好'], 116 | ['老年', '否', '是', '非常好'], 117 | ['老年', '否', '是', '好'], 118 | ['老年', '否', '否', '一般'], 119 | ] 120 | Y = ['否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否'] 121 | id3.fit(X, Y) 122 | 123 | testX = [ 124 | ['青年', '否', '否', '一般'], 125 | ['青年', '否', '否', '好'], 126 | ['青年', '是', '否', '好'], 127 | ['青年', '是', '是', '一般'], 128 | ['青年', '否', '否', '一般'], 129 | ['老年', '否', '否', '一般'], 130 | ['老年', '否', '否', '好'], 131 | ['老年', '是', '是', '好'], 132 | ['老年', '否', '是', '非常好'], 133 | ['老年', '否', '是', '非常好'], 134 | ['老年', '否', '是', '非常好'], 135 | ['老年', '否', '是', '好'], 136 | ['老年', '是', '否', '好'], 137 | ['老年', '是', '否', '非常好'], 138 | ['老年', '否', '否', '一般'], 139 | ] 140 | testY = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 141 | 142 | # show in table 143 | pred = id3.predict(testX) 144 | table = Table('x', 'y', 'pred') 145 | for x, y, y_hat in zip(testX, testY, pred): 146 | table.add_row(*map(str, [x, y, y_hat])) 147 | console.print(table) 148 | -------------------------------------------------------------------------------- /05.DecisionTree/RegressionCART.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from rich.console import Console 4 | from rich.table import Table 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | 11 | 12 | class RegressionCART: 13 | class Node: 14 | def __init__(self, col, Y): 15 | self.col = col 16 | self.val = nan 17 | self.left, self.right = None, None 18 | self.label = Y.mean() 19 | 20 | def __hash__(self): 21 | return id(self) 22 | 23 | def __init__(self, verbose=False, max_depth=inf): 24 | self.verbose = verbose 25 | self.max_depth = max_depth 26 | 27 | def get_se(self, Y_cnt): 28 | """get square error given the count of each Y value""" 29 | mean = sum(y * Y_cnt[y] for y in Y_cnt) / sum(Y_cnt.values()) 30 | square_error = sum((y - mean) ** 2 * Y_cnt[y] for y in Y_cnt) 31 | return square_error 32 | 33 | def get_se_of_split(self, Y1_cnt, Y2_cnt): 34 | """get the square error of a split""" 35 | return self.get_se(Y1_cnt) + self.get_se(Y2_cnt) 36 | 37 | def build(self, X, Y, depth=1): 38 | cur = self.Node(None, Y) 39 | if self.verbose: 40 | print("Cur data:") 41 | pprint(X) 42 | print(Y) 43 | best_se = inf 44 | best_col, best_val = -1, nan 45 | # The orignal content of the book doesn't discuss about when to cease. 
46 | # So I take the easiest way: cease when the data cannot be split, 47 | # i.e., when all the labels are identical 48 | if depth < self.max_depth and len(set(Y)) > 1: 49 | for col in range(len(X[0])): 50 | smaller_Y_cnt = Counter() 51 | larger_Y_cnt = Counter(Y) 52 | sorted_inds = np.argsort(X[:, col]) 53 | # try all the possible split values 54 | for i, ind in enumerate(sorted_inds): 55 | smaller_Y_cnt[Y[ind]] += 1 56 | larger_Y_cnt[Y[ind]] -= 1 57 | # don't split on the largest number, otherwise the right part is empty 58 | if sorted_inds[i] == sorted_inds[-1]: 59 | break 60 | # split only when this is the last one of consecutive identical numbers 61 | if i == len(X) - 1 or X[ind, col] != X[sorted_inds[i + 1], col]: 62 | se = self.get_se_of_split(smaller_Y_cnt, larger_Y_cnt) 63 | if se < best_se: 64 | val = X[ind, col] 65 | best_se, best_col, best_val = se, col, val 66 | 67 | # Build left and right child nodes recursively 68 | if self.verbose: 69 | print(f"Split by value {best_val} of {best_col}th column") 70 | smaller_ind = X[:, best_col] <= best_val 71 | larger_ind = X[:, best_col] > best_val 72 | smaller_X = X[smaller_ind] 73 | larger_X = X[larger_ind] 74 | smaller_Y = Y[smaller_ind] 75 | larger_Y = Y[larger_ind] 76 | 77 | cur.col = best_col 78 | cur.val = best_val 79 | cur.left = self.build(smaller_X, smaller_Y, depth + 1) 80 | cur.right = self.build(larger_X, larger_Y, depth + 1) 81 | elif self.verbose: 82 | print("No split") 83 | return cur 84 | 85 | def _query(self, root, x): 86 | if root.col is None: 87 | return root 88 | elif x[root.col] > root.val: 89 | return self._query(root.right, x) 90 | return self._query(root.left, x) 91 | 92 | def query(self, root, x): 93 | return self._query(root, x).label 94 | 95 | def fit(self, X, Y): 96 | self.root = self.build(X, Y) 97 | 98 | def _predict(self, x): 99 | return self.query(self.root, x) 100 | 101 | def predict(self, X): 102 | return [self._predict(x) for x in X] 103 | 104 | if __name__ == "__main__": 105 | def demonstrate(cart, X, Y, test_X, test_Y, desc): 106 | print(desc) 107 | console = Console(markup=False) 108 | cart.fit(X, Y) 109 | 110 | # show in table 111 | pred = cart.predict(test_X) 112 | table = Table('x', 'y', 'pred') 113 | for x, y, y_hat in zip(test_X, test_Y, pred): 114 | table.add_row(*map(str, [x, y, y_hat])) 115 | console.print(table) 116 | 117 | # -------------------------- Example 1 ---------------------------------------- 118 | cart = RegressionCART(verbose=True) 119 | X = np.arange(1, 11).reshape(-1, 1) 120 | Y = np.array([4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.90, 8.23, 8.70, 9.00]) 121 | demonstrate(cart, X, Y, X, Y, "Example 1:") 122 | 123 | # -------------------------- Example 2 ---------------------------------------- 124 | # show in table 125 | cart = RegressionCART(verbose=True) 126 | test_X = X + .5 127 | test_Y = np.zeros_like(Y) + nan 128 | demonstrate(cart, X, Y, test_X, test_Y, "Example 2:") 129 | 130 | # -------------------------- Example 3 ---------------------------------------- 131 | cart = RegressionCART(verbose=True, max_depth=1) 132 | X = np.arange(1, 11).reshape(-1, 1) 133 | Y = np.array([4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.90, 8.23, 8.70, 9.00]) 134 | demonstrate(cart, X, Y, X, Y, "Example 3: CART stump") 135 | 136 | 137 | # -------------------------- Example 4 ---------------------------------------- 138 | cart = RegressionCART(verbose=True, max_depth=3) 139 | X = np.arange(1, 11).reshape(-1, 1) 140 | Y = np.array([4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.90, 8.23, 8.70, 9.00]) 141 | demonstrate(cart,
X, Y, X, Y, "Example 4: split twice") 142 | -------------------------------------------------------------------------------- /05.DecisionTree/prune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from collections import Counter 4 | from rich.console import Console 5 | from rich.table import Table 6 | import sys 7 | import os 8 | from pathlib import Path 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import * 11 | from ID3 import ID3 12 | 13 | def prune(root, X, Y, alpha=.0, verbose=True): 14 | """ 15 | prune a decision tree recursively. alpha is the weight of tree size in the loss function 16 | reutrn the loss of all the leaf nodes 17 | """ 18 | # calculate the entropy of this subtree if the children of root is trimmed 19 | pruned_entropy = len(X) * entropy(Counter(Y).values()) 20 | pruned_loss = pruned_entropy + alpha 21 | # if root is a leaf node, return loss directly 22 | if not root.children: 23 | return pruned_loss 24 | cur_loss = 0. 25 | # trim child nodes recursively 26 | for col_val in root.children: 27 | child = root.children[col_val] 28 | ind = [x[root.col] == col_val for x in X] 29 | childX = [x for i, x in zip(ind, X) if i] 30 | childY = [y for i, y in zip(ind, Y) if i] 31 | cur_loss += prune(child, childX, childY, alpha, verbose) 32 | # if pruned, return the pruned loss 33 | if verbose: 34 | pprint(X) 35 | print('loss if prune:', pruned_loss) 36 | print('current loss', cur_loss) 37 | if pruned_loss < cur_loss: 38 | root.children.clear() 39 | return pruned_loss 40 | # if not pruned, the loss of node root is the sum loss of all of its children 41 | return cur_loss 42 | 43 | 44 | if __name__ == "__main__": 45 | console = Console(markup=False) 46 | # -------------------------- Example 1 (Small Normalization Param) ------------ 47 | print("Example 1:") 48 | id3 = ID3(verbose=False) 49 | X = [ 50 | ['青年', '否', '否', '一般'], 51 | ['青年', '否', '否', '好'], 52 | ['青年', '是', '否', '好'], 53 | ['青年', '是', '是', '一般'], 54 | ['青年', '否', '否', '一般'], 55 | ['老年', '否', '否', '一般'], 56 | ['老年', '否', '否', '好'], 57 | ['老年', '是', '是', '好'], 58 | ['老年', '否', '是', '非常好'], 59 | ['老年', '否', '是', '非常好'], 60 | ['老年', '否', '是', '非常好'], 61 | ['老年', '否', '是', '好'], 62 | ['老年', '是', '否', '好'], 63 | ['老年', '是', '否', '非常好'], 64 | ['老年', '否', '否', '一般'], 65 | ] 66 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 67 | id3.fit(X, Y) 68 | 69 | # prune with alpha 0. 70 | prune(id3.root, X, Y, 0.) 71 | 72 | # show in table 73 | pred = id3.predict(X) 74 | table = Table('x', 'y', 'pred') 75 | for x, y, y_hat in zip(X, Y, pred): 76 | table.add_row(*map(str, [x, y, y_hat])) 77 | console.print(table) 78 | 79 | # -------------------------- Example 2 (Large Normalization Param) ------------ 80 | print("Example 2:") 81 | id3 = ID3(verbose=False) 82 | X = [ 83 | ['青年', '否', '否', '一般'], 84 | ['青年', '否', '否', '好'], 85 | ['青年', '是', '否', '好'], 86 | ['青年', '是', '是', '一般'], 87 | ['青年', '否', '否', '一般'], 88 | ['老年', '否', '否', '一般'], 89 | ['老年', '否', '否', '好'], 90 | ['老年', '是', '是', '好'], 91 | ['老年', '否', '是', '非常好'], 92 | ['老年', '否', '是', '非常好'], 93 | ['老年', '否', '是', '非常好'], 94 | ['老年', '否', '是', '好'], 95 | ['老年', '是', '否', '好'], 96 | ['老年', '是', '否', '非常好'], 97 | ['老年', '否', '否', '一般'], 98 | ] 99 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 100 | id3.fit(X, Y) 101 | 102 | # prune with large alpha 103 | prune(id3.root, X, Y, 10000.) 
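# with an alpha this large, the size penalty dominates the entropy term, so every subtree is pruned and the tree collapses to a single leaf that predicts the majority training label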
104 | 105 | # show in table 106 | pred = id3.predict(X) 107 | table = Table('x', 'y', 'pred') 108 | for x, y, y_hat in zip(X, Y, pred): 109 | table.add_row(*map(str, [x, y, y_hat])) 110 | console.print(table) 111 | 112 | # -------------------------- Example 3 (Medium Normalization Param) ----------- 113 | print("Example 3:") 114 | id3 = ID3(verbose=False) 115 | X = [ 116 | ['青年', '否', '否', '一般'], 117 | ['青年', '否', '否', '好'], 118 | ['青年', '是', '否', '好'], 119 | ['青年', '是', '是', '一般'], 120 | ['青年', '否', '否', '一般'], 121 | ['老年', '否', '否', '一般'], 122 | ['老年', '否', '否', '好'], 123 | ['老年', '是', '是', '好'], 124 | ['老年', '否', '是', '非常好'], 125 | ['老年', '否', '是', '非常好'], 126 | ['老年', '否', '是', '非常好'], 127 | ['老年', '否', '是', '好'], 128 | ['老年', '是', '否', '好'], 129 | ['老年', '是', '否', '非常好'], 130 | ['老年', '否', '否', '一般'], 131 | ] 132 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 133 | id3.fit(X, Y) 134 | 135 | # prune with medium alpha 136 | prune(id3.root, X, Y, 5.) 137 | 138 | # show in table 139 | pred = id3.predict(X) 140 | table = Table('x', 'y', 'pred') 141 | for x, y, y_hat in zip(X, Y, pred): 142 | table.add_row(*map(str, [x, y, y_hat])) 143 | console.print(table) 144 | -------------------------------------------------------------------------------- /05.DecisionTree/pruneClassificationCART.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from collections import Counter 4 | from rich.console import Console 5 | from rich.table import Table 6 | import sys 7 | import os 8 | from pathlib import Path 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import * 11 | from ClassificationCART import ClassificationCART 12 | 13 | class PrunedCART: 14 | def __init__(self, cart, X, Y, val_X, val_Y, verbose=True): 15 | self.root = cart.root 16 | self.possible_prune_threshold = {np.inf} 17 | self.verbose = verbose 18 | # Stage one: calculate pruning loss for all nodes 19 | self.calculate_prune_loss(self.root, X, Y) 20 | if self.verbose: 21 | print("All the possible threshold values are", self.possible_prune_threshold) 22 | # Stage two: choose the best threshold for pruning 23 | self.prune_threshold = self.choose_threshold(val_X, val_Y, self.possible_prune_threshold) 24 | if self.verbose: 25 | print("The best threshold value is", self.prune_threshold) 26 | 27 | def calculate_prune_loss(self, root, X, Y): 28 | """ 29 | get the pruning loss of a classification CART recursively 30 | tag every internal node with a float `relative_prune_loss`, which indicates the increase in loss per removed node 31 | if the branches under this node are trimmed 32 | 33 | this function also inserts every possible threshold value into self.possible_prune_threshold. 34 | return the loss of all the leaf nodes, and the size of the subtree 35 | """ 36 | # calculate the gini index of this subtree if the children of root are trimmed 37 | pruned_gini = len(X) * gini(Counter(Y).values()) 38 | pruned_loss = pruned_gini 39 | # if root is a leaf node, return loss directly 40 | if root.col is None: 41 | return pruned_loss, 1 42 | 43 | # cur_loss records the loss function when root is not trimmed 44 | cur_loss = 0.
45 | # size records the size of this subtree 46 | size = 1 47 | 48 | selected_ind = X[:, root.col] == root.val 49 | other_ind = X[:, root.col] != root.val 50 | selected_X = X[selected_ind] 51 | other_X = X[other_ind] 52 | selected_Y = Y[selected_ind] 53 | other_Y = Y[other_ind] 54 | 55 | # compute the loss of the left child recursively 56 | child_loss, child_size = self.calculate_prune_loss(root.left, selected_X, selected_Y) 57 | cur_loss += child_loss 58 | size += child_size 59 | 60 | # compute the loss of the right child recursively 61 | child_loss, child_size = self.calculate_prune_loss(root.right, other_X, other_Y) 62 | cur_loss += child_loss 63 | size += child_size 64 | 65 | # the increase in loss per removed node if the branches of this node are pruned 66 | relative_prune_loss = (pruned_loss - cur_loss) / (size - 1) 67 | root.relative_prune_loss = relative_prune_loss 68 | self.possible_prune_threshold.add(relative_prune_loss) 69 | return cur_loss, size 70 | 71 | def query(self, root, x, prune_threshold): 72 | # if root.relative_prune_loss is less than the chosen prune threshold, it is trimmed 73 | if root.col is None or root.relative_prune_loss < prune_threshold: 74 | return root.label 75 | elif x[root.col] != root.val: 76 | return self.query(root.right, x, prune_threshold) 77 | return self.query(root.left, x, prune_threshold) 78 | 79 | def _predict(self, x, prune_threshold): 80 | return self.query(self.root, x, prune_threshold) 81 | 82 | def predict(self, X, prune_threshold=None): 83 | if prune_threshold is None: 84 | prune_threshold = self.prune_threshold 85 | return np.array([self._predict(x, prune_threshold) for x in X]) 86 | 87 | def validate(self, val_X, val_Y, prune_threshold): 88 | """ 89 | I don't think using the gini index for validation, as written in the book, is a good idea, 90 | because the gini index is unsupervised while labels are available in the validation set. 91 | So I choose to use accuracy instead. 92 | """ 93 | pred = self.predict(val_X, prune_threshold) 94 | return (pred == val_Y).mean() 95 | 96 | def choose_threshold(self, val_X, val_Y, possible_prune_threshold): 97 | """ 98 | Choose the best subtree according to the validation set. 99 | Cross-validation here simply refers to predicting on a pre-split validation set. 100 | """ 101 | best_acc = -1. 102 | best_prune_threshold = 0.
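# candidate thresholds are tried in increasing order; because of the >= comparison below, ties in validation accuracy favor the larger threshold, i.e. the smaller, more heavily pruned subtree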
103 | for prune_threshold in sorted(list(possible_prune_threshold)): 104 | cur_acc = self.validate(val_X, val_Y, prune_threshold) 105 | if self.verbose: 106 | print(f"When prune threshold = {prune_threshold}, accuracy is {cur_acc}") 107 | if cur_acc >= best_acc: 108 | best_acc = cur_acc 109 | best_prune_threshold = prune_threshold 110 | return best_prune_threshold 111 | 112 | 113 | if __name__ == "__main__": 114 | console = Console(markup=False) 115 | cart = ClassificationCART(verbose=True) 116 | # -------------------------- Example 1 ---------------------------------------- 117 | print("Example 1:") 118 | X = np.array([ 119 | ['青年', '否', '否', '一般'], 120 | ['青年', '否', '否', '好'], 121 | ['青年', '是', '否', '好'], 122 | ['青年', '是', '是', '一般'], 123 | ['青年', '否', '否', '一般'], 124 | ['老年', '否', '否', '一般'], 125 | ['老年', '否', '否', '好'], 126 | ['老年', '是', '是', '好'], 127 | ['老年', '否', '是', '非常好'], 128 | ['老年', '否', '是', '非常好'], 129 | ['老年', '否', '是', '非常好'], 130 | ['老年', '否', '是', '好'], 131 | ['老年', '是', '否', '好'], 132 | ['老年', '是', '否', '非常好'], 133 | ['老年', '否', '否', '一般'], 134 | ]) 135 | Y = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 136 | cart.fit(X, Y) 137 | 138 | # Here I use the same dataset as the validation set 139 | # Notice that it must be the full tree to be choosed this way 140 | testX = np.array([ 141 | ['青年', '否', '否', '一般'], 142 | ['青年', '否', '否', '好'], 143 | ['青年', '是', '否', '好'], 144 | ['青年', '是', '是', '一般'], 145 | ['青年', '否', '否', '一般'], 146 | ['老年', '否', '否', '一般'], 147 | ['老年', '否', '否', '好'], 148 | ['老年', '是', '是', '好'], 149 | ['老年', '否', '是', '非常好'], 150 | ['老年', '否', '是', '非常好'], 151 | ['老年', '否', '是', '非常好'], 152 | ['老年', '否', '是', '好'], 153 | ['老年', '是', '否', '好'], 154 | ['老年', '是', '否', '非常好'], 155 | ['老年', '否', '否', '一般'], 156 | ]) 157 | testY = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 158 | 159 | pruned_cart = PrunedCART(cart, X, Y, testX, testY) 160 | 161 | # show in table 162 | pred = pruned_cart.predict(testX) 163 | table = Table('x', 'y', 'pred') 164 | for x, y, y_hat in zip(testX, testY, pred): 165 | table.add_row(*map(str, [x, y, y_hat])) 166 | console.print(table) 167 | -------------------------------------------------------------------------------- /06.LogisticRegression-MaxEntropy/BinaryLogisticRegression.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | import sys 4 | import os 5 | from pathlib import Path 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import binary_cross_entropy, sigmoid, wbline 8 | 9 | class LogisticRegression: 10 | def __init__(self, lr=1, max_steps=1000, verbose=True): 11 | self.lr = lr 12 | self.max_steps = max_steps 13 | self.verbose = verbose 14 | 15 | def fit(self, X, Y): 16 | """ 17 | X: of shape [data-size, feature-size] 18 | Y: of shape [data-size] 19 | """ 20 | self.feature_size = X.shape[-1] 21 | # w of shape [feature-size] 22 | self.w = np.random.rand(self.feature_size) 23 | # b of shape [1] 24 | self.b = np.random.rand(1) 25 | 26 | for step in range(self.max_steps): 27 | # pred of shape [data-size] 28 | pred = self._predict(X) 29 | # Bias gradient of shape [data-size] 30 | gradient_b = Y - pred 31 | # Weight gradient of shape [data-size, feature-size] 32 | gradient_w = gradient_b[:, None] * X 33 | # get mean of gradient across all data 34 | gradient_b = gradient_b.mean(axis=0) 35 | gradient_w = 
gradient_w.mean(axis=0) 36 | self.w += gradient_w * self.lr 37 | self.b += gradient_b * self.lr 38 | if self.verbose: 39 | loss = binary_cross_entropy(pred, Y) 40 | print(f"Step {step}, Loss is {loss}...") 41 | 42 | def _predict(self, X): 43 | logit = self.w @ X.transpose() + self.b 44 | p = sigmoid(logit) 45 | return p 46 | 47 | def predict(self, X): 48 | p = self._predict(X) 49 | Y = (p > .5).astype(int) 50 | return Y 51 | 52 | if __name__ == "__main__": 53 | def demonstrate(X, Y, desc): 54 | logistic_regression = LogisticRegression(verbose=True) 55 | logistic_regression.fit(X, Y) 56 | 57 | # plot 58 | plt.title(desc) 59 | plt.scatter(X[:, 0], X[:, 1], c=Y) 60 | wbline(logistic_regression.w, logistic_regression.b) 61 | plt.show() 62 | 63 | # -------------------------- Example 1 ---------------------------------------- 64 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 65 | Y = np.array([1, 1, 0, 0]) 66 | demonstrate(X, Y, "Example 1") 67 | 68 | # -------------------------- Example 2 ---------------------------------------- 69 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 70 | Y = np.array([1, 0, 0, 1]) 71 | demonstrate(X, Y, "Example 2: Logistic Regression still cannot solve a simple XOR problem") 72 | 73 | # -------------------------- Example 3 ---------------------------------------- 74 | X = np.concatenate([np.random.normal([0, 1], size=[40, 2]), 75 | np.random.normal([1, 0], size=[40, 2])]) 76 | Y = np.concatenate([np.ones(40), np.zeros(40)]) 77 | demonstrate(X, Y, "Example 3: Logistic Regression is suitable for tasks that are not strictly linear separable") 78 | -------------------------------------------------------------------------------- /06.LogisticRegression-MaxEntropy/MaxEntropy.py: -------------------------------------------------------------------------------- 1 | from rich.console import Console 2 | from rich.table import Table 3 | import numpy as np 4 | from numpy import linalg 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import softmax, line_search 10 | 11 | class MaxEntropy: 12 | def __init__(self, epsilon=1e-6, max_steps=1000, verbose=True): 13 | self.epsilon = epsilon 14 | self.max_steps = max_steps 15 | self.verbose = verbose 16 | 17 | def _p_w(self, w): 18 | """ 19 | calculate probability table according to w 20 | """ 21 | logit = (w[:, None, None] * self.feature).sum(axis=0) 22 | p_w = softmax(logit, axis=-1) 23 | return p_w 24 | 25 | def _f(self, w): 26 | """ 27 | object function 28 | """ 29 | return \ 30 | ( 31 | np.log( 32 | np.exp( 33 | (w[:, None, None] * self.feature 34 | ).sum(axis=0) 35 | ).sum(axis=-1)) * self.p_data_x 36 | ).sum() - \ 37 | ( 38 | self.p_data * (w[:, None, None] * self.feature).sum(axis=0) 39 | ).sum() 40 | 41 | def _g(self, w): 42 | """ 43 | gradient of object function 44 | """ 45 | p_w = self._p_w(w) 46 | return (self.p_data_x[None, :, None] * p_w[None, :, :] * self.feature 47 | ).sum(axis=(1, 2)) - self.E_feature 48 | 49 | def fit(self, p_data, feature): 50 | """ 51 | optimize max entropy model with BFGS 52 | p_data: matrix of shape [nx, ny], possibility of all (x, y) 53 | feature: matrix of shape[nf, nx, ny], all the feature functions of all (x, y) 54 | """ 55 | # nf is the number of feature functions, and the size of w 56 | self.nf, self.nx, self.ny = feature.shape 57 | self.feature = feature 58 | self.p_data = p_data 59 | self.p_data_x = p_data.sum(axis=-1) 60 | self.E_feature = (p_data[None, :, :] * feature).sum(axis=(1, 2)) 61 | 
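# E_feature[k] is the empirical expectation of the k-th feature function under the data distribution p_data; at the optimum the model expectation computed in _g equals it, making the gradient zero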
62 | # initlaize optimizer 63 | self.w = np.random.rand(self.nf) 64 | B = np.eye(self.nf) 65 | g_next = self._g(self.w) 66 | g_norm = linalg.norm(g_next) 67 | # optimize 68 | for i in range(self.max_steps): 69 | g = g_next 70 | if self.verbose: 71 | print(f"Step {i}, L2 norm of gradient is {g_norm}") 72 | if g_norm < self.epsilon: 73 | break 74 | p = linalg.solve(B, -g) 75 | f_lambda = lambda x: self._f(self.w + x * p) 76 | lamda = line_search(f_lambda, 0, 100, epsilon=self.epsilon) 77 | delta_w = lamda * p 78 | self.w += delta_w 79 | g_next = self._g(self.w) 80 | g_norm = linalg.norm(g_next) 81 | if g_norm < self.epsilon: 82 | print(f"L2 norm of gradient is {g_norm}, stop training...") 83 | break 84 | delta_g = g_next - g 85 | B_delta_w = B @ delta_w 86 | B += np.outer(delta_g, delta_g) / (delta_g @ delta_w) - \ 87 | np.outer(B_delta_w, B_delta_w) / (B_delta_w.T @ delta_w) 88 | self.p_w = self._p_w(self.w) 89 | 90 | def predict(self, x, y): 91 | """predict p(y|x)""" 92 | return self.p_w[x][y] 93 | 94 | # The following examples are proposed by SleepyBag at 95 | # https://www.zhihu.com/question/24094554/answer/1507080982 96 | if __name__ == "__main__": 97 | console = Console(markup=False) 98 | def float2str(x): 99 | return "%.3f" % x 100 | 101 | def demonstrate(data, feature_functions): 102 | max_entropy = MaxEntropy() 103 | max_entropy.fit(data, feature_functions) 104 | 105 | # print results 106 | for i, ff in enumerate(feature_functions): 107 | table = Table(f'feature {i}', 'y=1', 'y=2', 'y=3') 108 | for x in range(2): 109 | table.add_row(f'x={x}', *map(float2str, [ff[x, y] for y in range(3)])) 110 | console.print(table) 111 | table = Table('prob', 'y=1', 'y=2', 'y=3') 112 | for x in range(2): 113 | table.add_row(f'x={x}', *map(float2str, [max_entropy.predict(x, y) for y in range(3)])) 114 | console.print(table) 115 | 116 | # ---------------------- Prepare Data ----------------------------------------- 117 | data = np.array([[.125, .25, .125], 118 | [.5, 0., 0.]]) 119 | table = Table('data', 'y=1', 'y=2', 'y=3') 120 | for x in range(2): 121 | table.add_row(f'x={x}', *map(float2str, [data[x, y] for y in range(3)])) 122 | console.print(table) 123 | 124 | # ---------------------- Example 1--------------------------------------------- 125 | print('Example 1: Single feature function') 126 | feature_functions = np.array([ 127 | [[1, 0, 0], 128 | [0, 0, 0]] 129 | ]) 130 | demonstrate(data, feature_functions) 131 | 132 | # ---------------------- Example 3--------------------------------------------- 133 | print('Example 2: the value of feature function doesn\'t matter for feature function with only one non-zero value') 134 | feature_functions = np.array([ 135 | [[0.5, 0, 0], 136 | [0, 0, 0]] 137 | ]) 138 | demonstrate(data, feature_functions) 139 | 140 | # ---------------------- Example 2--------------------------------------------- 141 | print('Example 3: double feature functions') 142 | feature_functions = np.array([ 143 | [[1, 0, 0], 144 | [0, 0, 0]], 145 | [[0, 0, 0], 146 | [0, 1, 0]] 147 | ]) 148 | demonstrate(data, feature_functions) 149 | 150 | # ---------------------- Example 3--------------------------------------------- 151 | print('Example 4: single feature function with two non-zeros') 152 | feature_functions = np.array([ 153 | [[0, 1, 1], 154 | [0, 0, 0]] 155 | ]) 156 | demonstrate(data, feature_functions) 157 | 158 | # ---------------------- Example 3--------------------------------------------- 159 | print('Example 5: the value of feature function matters for feature function with 
multiple non-zero values') 160 | feature_functions = np.array([ 161 | [[0, 1, .5], 162 | [0, 0, 0]] 163 | ]) 164 | demonstrate(data, feature_functions) 165 | -------------------------------------------------------------------------------- /07.SVM/SVM.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | import sys 4 | import os 5 | from pathlib import Path 6 | from rich.console import Console 7 | from rich.table import Table 8 | from functools import partial 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import wbline 11 | 12 | class SVM: 13 | def __init__(self, C=1e9, epsilon=1e-6, lr=1e-4, max_steps=1000, verbose=True, kernel=np.dot): 14 | """ 15 | kernel: kernel function, of which 16 | the input is two vectors a, b 17 | the output is a scalar value 18 | """ 19 | self.lr = lr 20 | self.max_steps = max_steps 21 | self.verbose = verbose 22 | self.C = C 23 | self.epsilon = epsilon 24 | self.kernel = kernel 25 | 26 | def _smo_objective(self, i, j): 27 | """ 28 | The objective function of one step of SMO 29 | given the choosed alpha i and alpha j 30 | """ 31 | alpha, Y, K = self.alpha, self.Y, self.K 32 | return (alpha[i] * Y[i] * alpha * K[i:] * Y).sum() \ 33 | + (alpha[j] * Y[j] * alpha * K[j:] * Y).sum() \ 34 | - .5 * alpha[i] ** 2 * K[i, i] \ 35 | - .5 * alpha[j] ** 2 * K[j, j] \ 36 | - alpha[i] * alpha[j] * Y[i] * Y[j] * K[i, j]\ 37 | - alpha[i] - alpha[j] 38 | 39 | def _smo_step(self, step_cnt): 40 | if self.verbose: 41 | print(f'SMO step {step_cnt} start...') 42 | alpha = self.alpha 43 | K = self.K 44 | data_size = len(alpha) 45 | 46 | # the prediction of this step 47 | pred = (self.alpha * Y * self.K).sum(axis=-1) + self.b 48 | # the score of pred 49 | score = Y * pred 50 | # discrepency between pred and label 51 | error = pred - Y 52 | 53 | updated = False 54 | 55 | # find the first variable alpha_i 56 | # which violate KKT constraint 57 | # first try to find fake support vectors 58 | # of which 0 < alpha_i < C but score_i isn't 1 59 | i_cands = [i for i in range(data_size) if 60 | 0 < alpha[i] < self.C and abs(score[i] - 1) > self.epsilon or 61 | alpha[i] == 0 and score[i] < 1 or 62 | alpha[i] == self.C and score[i] > 1] 63 | for i in i_cands: 64 | # find the second variable 65 | # which makes alpha_i change most 66 | relative_error = np.abs(error - error[i]) 67 | j_cands = sorted(list(range(data_size)), key=relative_error.__getitem__) 68 | for j in j_cands: 69 | if j == i: 70 | continue 71 | smo_objective_before = self._smo_objective(i, j) 72 | 73 | # upper bound and lower bound of alpha_j 74 | L = max(0, alpha[j] - alpha[i] if Y[i] != Y[j] else alpha[i] + alpha[j] - self.C) 75 | H = min(self.C, self.C + alpha[j] - alpha[i] if Y[i] != Y[j] else alpha[i] + alpha[j]) 76 | 77 | if self.verbose: 78 | print('SMO chooses: ', i, j) 79 | print('alpha[i] and alpha[j] are', alpha[i], alpha[j]) 80 | print('Step begin, current object of dual problem:', smo_objective_before) 81 | 82 | alpha_j_old = alpha[j] 83 | eta = K[i, i] + K[j, j] - 2 * K[i, j] + self.epsilon 84 | # update alpha_j 85 | alpha[j] += Y[j] * (error[i] - error[j]) / eta 86 | # clip 87 | alpha[j] = min(alpha[j], H) 88 | alpha[j] = max(alpha[j], L) 89 | # update alpha_i 90 | alpha[i] += Y[i] * Y[j] * (alpha_j_old - alpha[j]) 91 | # update b 92 | self.b = Y[i] - (alpha * Y * K[i]).sum() 93 | if 0 < alpha[j] < self.C: 94 | self.b = (Y[j] - (alpha * Y * K[j]).sum() + self.b) / 2 95 | 
smo_objective_after = self._smo_objective(i, j) 96 | if self.verbose: 97 | print('Step end, current object of dual problem:', smo_objective_after) 98 | print('alpha[i] and alpha[j] are', alpha[i], alpha[j]) 99 | if smo_objective_before - smo_objective_after > self.epsilon: 100 | updated = True 101 | break 102 | if updated: 103 | break 104 | if self.verbose: 105 | print('SMO step end...') 106 | print() 107 | return len(i_cands) > 0 108 | 109 | def fit(self, X, Y): 110 | """ 111 | optimize SVM with SMO 112 | X: of shape [data-size, feature-size] 113 | Y: of shape [data-size] 114 | """ 115 | self.X, self.Y = X, Y 116 | data_size = len(X) 117 | self.alpha = np.zeros(data_size) 118 | self.b = np.random.rand() 119 | 120 | self.K = np.array([[self.kernel(x1, x2) for x1 in X] for x2 in X]) 121 | print(self.K) 122 | # optimize 123 | step_cnt = 0 124 | while self._smo_step(step_cnt) and step_cnt < self.max_steps: 125 | step_cnt += 1 126 | pass 127 | 128 | # optimized, get w and b 129 | support_vector_ind = 0 < self.alpha 130 | self._support_vectors = X[support_vector_ind] 131 | self._support_Y = Y[support_vector_ind] 132 | self._support_alpha = self.alpha[support_vector_ind] 133 | if self.verbose: 134 | print("Done!") 135 | print('Alphas are as follows:') 136 | print(self.alpha) 137 | print(support_vector_ind) 138 | print('Support vectors are as follows:') 139 | print(self._support_vectors) 140 | 141 | # for demonstration 142 | self.w = ((self.alpha * Y)[:, None] * X).sum(axis=0) 143 | 144 | def _predict(self, x): 145 | return (self._support_Y * self._support_alpha * \ 146 | np.apply_along_axis(partial(self.kernel, x), -1, self._support_vectors)).sum() 147 | 148 | def predict(self, X): 149 | score = np.apply_along_axis(self._predict, -1, X) 150 | # score = (self.w * X).sum(axis=-1) + self.b 151 | pred = (score >= 0).astype(int) * 2 - 1 152 | return pred 153 | 154 | if __name__ == "__main__": 155 | def demonstrate(X, Y, desc, draw=True, **args): 156 | console = Console(markup=False) 157 | svm = SVM(verbose=True, **args) 158 | svm.fit(X, Y) 159 | 160 | # plot 161 | if draw: 162 | plt.scatter(X[:, 0], X[:, 1], c=Y) 163 | wbline(svm.w, svm.b) 164 | plt.title(desc) 165 | plt.show() 166 | 167 | # show in table 168 | pred = svm.predict(X) 169 | table = Table('x', 'y', 'pred') 170 | for x, y, y_hat in zip(X, Y, pred): 171 | table.add_row(*map(str, [x, y, y_hat])) 172 | console.print(table) 173 | 174 | # -------------------------- Example 1 ---------------------------------------- 175 | print("Example 1:") 176 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 177 | Y = np.array([1, 1, -1, -1]) 178 | demonstrate(X, Y, "Example 1") 179 | 180 | # -------------------------- Example 2 ---------------------------------------- 181 | print("Example 2:") 182 | X = np.concatenate((np.random.rand(5, 2), np.random.rand(5, 2) + np.array([1, 1])), axis=0) 183 | Y = np.array([1, 1, 1, 1, 1, -1, -1, -1, -1, -1]) 184 | print(X, Y) 185 | demonstrate(X, Y, "Example 2: randomly generated data") 186 | 187 | # ---------------------- Example 3 -------------------------------------------- 188 | print("Example 3:") 189 | X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]]) 190 | Y = np.array([1, 1, -1, -1]) 191 | demonstrate(X, Y, "Example 3: SVM with dot kernel cannot sovle XOR problem", C=1) 192 | 193 | # ---------------------- Example 4 -------------------------------------------- 194 | def gaussian_kernel(x, y): 195 | return np.exp(-((x - y) ** 2).sum()) 196 | print("Example 4:") 197 | X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]]) 198 
| Y = np.array([1, 1, -1, -1]) 199 | demonstrate(X, Y, "Example 4: SVM with dot kernel cannot sovle XOR problem", draw=False, kernel=gaussian_kernel) 200 | -------------------------------------------------------------------------------- /08.Boosting/AdaBoost.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | import sys 5 | import os 6 | from pathlib import Path 7 | from rich.console import Console 8 | from rich.table import Table 9 | from functools import partial 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import wbline 12 | 13 | class DecisionStump: 14 | """ 15 | A simple classifier. 16 | A decision stump divide dataset by a threshold 17 | Expected one-dimensional X 18 | """ 19 | def __init__(self, verbose=True): 20 | self.verbose = verbose 21 | 22 | def fit(self, X, Y, weight): 23 | # since X is one-dimensional, just flatten it 24 | X = X[:, 0] 25 | possible_thresholds = list(set(X)) 26 | possible_thresholds.append(max(possible_thresholds) + 1) 27 | possible_thresholds.append(min(possible_thresholds) - 1) 28 | # try all possible threshold 29 | best_acc = 0. 30 | best_threshold, best_sign = 0., 0. 31 | for self.sign in [1, -1]: 32 | for self.threshold in possible_thresholds: 33 | pred = self.predict(X) 34 | acc = (pred == Y) @ weight 35 | if acc > best_acc: 36 | best_acc, best_threshold, best_sign = acc, self.threshold, self.sign 37 | self.threshold, self.sign = best_threshold, best_sign 38 | if self.verbose: 39 | print(f'Threshold is {self.threshold}') 40 | 41 | def predict(self, X): 42 | X = X * self.sign 43 | threshold = self.threshold * self.sign 44 | pred = (X > threshold) * 2 - 1 45 | return pred.flatten() 46 | 47 | class AdaBoost: 48 | def __init__(self, BasicModel=DecisionStump, steps=10, verbose=True): 49 | self.BasicModel = BasicModel 50 | self.steps = steps 51 | self.verbose = verbose 52 | 53 | def fit(self, X, Y): 54 | n = len(X) 55 | weight = np.ones(n) / n 56 | self.basic_models = [] 57 | self.model_weights = [] 58 | for i in range(self.steps): 59 | basic_model = self.BasicModel() 60 | basic_model.fit(X, Y, weight) 61 | self.basic_models.append(basic_model) 62 | pred = basic_model.predict(X) 63 | error_rate = (pred != Y) @ weight 64 | model_weight = .5 * log((1 - error_rate) / error_rate) 65 | weight *= np.exp(-model_weight * Y * pred) 66 | weight /= weight.sum() 67 | self.model_weights.append(model_weight) 68 | if self.verbose: 69 | print(f'Step {i}, current error rate is {error_rate}') 70 | print(f'The weight of current model is {model_weight}') 71 | 72 | def predict(self, X): 73 | score = sum(model.predict(X) * weight for model, weight in 74 | zip(self.basic_models, self.model_weights)) 75 | pred = (score > 0.).astype(int) * 2 - 1 76 | return pred 77 | 78 | if __name__ == "__main__": 79 | def demonstrate(X, Y, desc): 80 | print(desc) 81 | console = Console(markup=False) 82 | adaboost = AdaBoost(verbose=True) 83 | adaboost.fit(X, Y) 84 | 85 | # show in table 86 | pred = adaboost.predict(X) 87 | table = Table('x', 'y', 'pred') 88 | for x, y, y_hat in zip(X, Y, pred): 89 | table.add_row(*map(str, [x, y, y_hat])) 90 | console.print(table) 91 | 92 | # -------------------------- Example 1 ---------------------------------------- 93 | X = np.arange(10).reshape(-1, 1) 94 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 95 | demonstrate(X, Y, "Example 1") 96 | 
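# -------------------------- Example 2 (added sketch) -------------------------
# Illustrative addition, not part of the original file: the same labels on a
# reversed feature axis. No single decision stump can fit three alternating
# label runs, but the boosted committee of stumps (10 rounds by default)
# typically drives the training error to zero, as the printed error rates
# should show. It reuses the `demonstrate` helper defined above.
X = np.arange(9, -1, -1).reshape(-1, 1)
Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
demonstrate(X, Y, "Example 2 (added): reversed feature order")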
-------------------------------------------------------------------------------- /08.Boosting/GBDT.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import sys 4 | import os 5 | from pathlib import Path 6 | from rich.console import Console 7 | from rich.table import Table 8 | from functools import partial 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import line_search 11 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '5.DecisionTree')) 12 | from RegressionCART import RegressionCART 13 | 14 | class GBDT: 15 | def __init__(self, 16 | loss_function=lambda label, pred: ((label - pred) ** 2).sum(), 17 | gradient_function=lambda label, pred: 2 * (pred - label), 18 | steps=10, 19 | max_depth=3, 20 | verbose=True): 21 | """ 22 | `loss_function` takes two arguments, label and pred and return a scalar, the loss 23 | `gradient_function` is gradient from loss function to the prediction 24 | It takes two arguments, i.e., label and pred and return the gradient 25 | the loss function should be convex 26 | The default loss function is l2 loss, which makes GBDT an ordinary boosting tree 27 | """ 28 | self.steps = steps 29 | self.verbose = verbose 30 | self.gradient_function = gradient_function 31 | self.loss_function = loss_function 32 | self.max_depth = max_depth 33 | 34 | def _loss_of_const(self, Y, c): 35 | """ 36 | Return the loss when the model take a constant c as the prediction 37 | `Y` is a vector of labels 38 | `c` is a constant scalar 39 | """ 40 | c = (np.ones_like(Y) * c).astype(float) 41 | return self.loss_function(Y, c) 42 | 43 | def fit(self, X, Y): 44 | n = len(X) 45 | self.carts = [] 46 | # the basic value of prediction, so that there can be 'residual' 47 | self.basic_pred = line_search(partial(self._loss_of_const, Y), min(Y), max(Y)) 48 | 49 | cur_pred = np.zeros_like(Y) + self.basic_pred 50 | residual = -self.gradient_function(Y, cur_pred) 51 | for i in range(self.steps): 52 | if self.verbose: 53 | print(f'step {i}') 54 | print(f'Current pred is {cur_pred}') 55 | print(f'Current residual is {residual}') 56 | cart = RegressionCART(verbose=False, max_depth=self.max_depth) 57 | cart.fit(X, residual) 58 | self.carts.append(cart) 59 | # regression trees use l2 loss as loss function, 60 | # the return value leaf nodes should be recorrect 61 | leaf2label=defaultdict(list) 62 | for i, x in enumerate(X): 63 | leaf = cart._query_leaf(cart.root, x) 64 | leaf2label[leaf].append(i) 65 | for leaf in leaf2label: 66 | data_ind = np.stack(leaf2label[leaf]) 67 | leafY = Y[data_ind] 68 | leaf_cur_pred = cur_pred[data_ind] 69 | leaf.label = line_search(lambda c: self.loss_function(leafY, leaf_cur_pred + c), -1e9, 1e9) 70 | 71 | # update the incremental prediction 72 | inc_pred = cart.predict(X) 73 | cur_pred += inc_pred 74 | residual = -self.gradient_function(Y, cur_pred) 75 | 76 | def predict(self, X): 77 | pred = np.zeros(len(X)) + self.basic_pred 78 | for cart in self.carts: 79 | pred += cart.predict(X) 80 | return pred 81 | 82 | if __name__ == "__main__": 83 | def demonstrate(X, Y, max_depth, desc): 84 | print(desc) 85 | console = Console(markup=False) 86 | gbdt = GBDT(verbose=True, max_depth=max_depth) 87 | gbdt.fit(X, Y) 88 | 89 | # show in table 90 | pred = gbdt.predict(X) 91 | table = Table('x', 'y', 'pred') 92 | for x, y, y_hat in zip(X, Y, pred): 93 | table.add_row(*map(str, [x, y, y_hat])) 94 | console.print(table) 95 | 96 | # 
-------------------------- Example 1 ---------------------------------------- 97 | X = np.arange(10).reshape(-1, 1) 98 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 99 | demonstrate(X, Y, 3, "Example 1") 100 | 101 | # -------------------------- Example 2 ---------------------------------------- 102 | X = np.arange(10).reshape(-1, 1) 103 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 104 | demonstrate(X, Y, 1, "Example 2: CART cannot be all stumps") 105 | 106 | # -------------------------- Example 3 ---------------------------------------- 107 | X = np.arange(10).reshape(-1, 1) 108 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 109 | demonstrate(X, Y, 2, "Example 3") 110 | -------------------------------------------------------------------------------- /09.EM/GMM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | 4 | class GMM: 5 | def __init__(self, k, independent_variance=True, max_step=2000, verbose=True): 6 | self.k = k 7 | self.max_step = max_step 8 | self.epsilon = 1e-8 9 | self.verbose = verbose 10 | # specify whether each feature has independent variance - that is, has a diagnol covariance matrix 11 | self.independent_variance = independent_variance 12 | 13 | def fit(self, X): 14 | """ 15 | X: training data of shape [n, feature_size] 16 | """ 17 | n, self.feature_size = X.shape 18 | # the parameter of each gaussian distribution 19 | self.prior = np.ones(self.k) / self.k 20 | self.prior /= self.prior.sum() 21 | if self.independent_variance: 22 | self.std = np.repeat(np.std(X, axis=0, keepdims=True), self.k, axis=0) 23 | self.mean = np.random.normal(X.mean(axis=0), self.std, [self.k, self.feature_size]) 24 | else: 25 | self.cov = np.repeat(np.cov(X.T)[None, ...], self.k, axis=0) 26 | self.mean = np.random.multivariate_normal(X.mean(axis=0), self.cov[0], [self.k]) 27 | 28 | previous_log_likelihood = -np.inf 29 | for step in range(self.max_step): 30 | ########################################## 31 | # Expectation step 32 | ########################################## 33 | # posterior probability of each sample in each Gaussian model 34 | posterior = self.predict(X) 35 | 36 | ########################################## 37 | # Maximization step 38 | ########################################## 39 | # center of each Gaussian model 40 | self.mean = (posterior[:, :, None] * X[None, :, :]).sum(axis=1) / \ 41 | (posterior.sum(axis=1)[:, None] + self.epsilon) 42 | # distance from each sample to each center 43 | dis = X[None, :, :] - self.mean[:, None, :] 44 | if self.independent_variance: 45 | # variance of each Gaussian model 46 | var = (posterior[:, :, None] * dis ** 2).sum(axis=1) / \ 47 | (posterior.sum(axis=1)[:, None] + self.epsilon) 48 | # standard deviation of each Gaussian model, in each dimension 49 | # shape [k, feature_size] 50 | # std[i, j] is the variance of j-th feature in the i-th Gaussian model 51 | self.std = np.sqrt(var) 52 | else: 53 | # covariance of each Gaussian model 54 | # shape [k, feature_size, feature_size] 55 | # cov[i] is the covariance matrix of i-th Gaussian model 56 | self.cov = (dis.transpose(0, 2, 1) @ (posterior[:, :, None] * dis)) / \ 57 | (posterior.sum(axis=1)[:, None, None] + self.epsilon) 58 | self.prior = posterior.sum(axis=1) 59 | self.prior /= (self.prior.sum() + self.epsilon) 60 | 61 | # early stopping 62 | log_likelihood = self.log_likelihood(X) 63 | if self.verbose: 64 | print('After step', step, ', likelihood of model parameters is', 
np.exp(log_likelihood)) 65 | if log_likelihood - previous_log_likelihood < self.epsilon: 66 | break 67 | previous_log_likelihood = log_likelihood 68 | 69 | def pairwise_likelihood(self, X): 70 | """ 71 | return the likelihood of each data piece in X belonging to each Gaussian cluster 72 | """ 73 | # dis[i, j, k] is the distance from i-th center to j-th sample, in k-th dimension 74 | dis = X[None, :, :] - self.mean[:, None, :] 75 | 76 | # calculate log likelihood first, then likelihood 77 | if self.independent_variance: 78 | # data_log_likelihood is of shape [k, n, feature_size] 79 | data_log_likelihood = -dis ** 2 * .5 / (self.std[:, None, :] ** 2 + self.epsilon) \ 80 | - np.log(np.sqrt(2 * np.pi) + self.epsilon) - np.log(self.std[:, None, :] + self.epsilon) 81 | # reduce likelihood to shape [k, n] 82 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 83 | data_log_likelihood = data_log_likelihood.sum(-1) 84 | else: 85 | # data_log_likelihood is of shape [k, n] 86 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 87 | fixed_cov = self.cov + self.epsilon * np.eye(self.feature_size) 88 | data_log_likelihood = -.5 * (dis @ np.linalg.inv(fixed_cov) * dis).sum(axis=-1) \ 89 | -.5 * np.linalg.slogdet(2 * np.pi * fixed_cov)[1][:, None] # slogdet returns [sign, logdet], we just need logdet 90 | 91 | data_likelihood = np.exp(data_log_likelihood) 92 | # the posterior of each datium belonging to a distribution, of shape [k, n] 93 | posterior = self.prior[:, None] * data_likelihood 94 | return posterior 95 | 96 | def log_likelihood(self, X): 97 | """ 98 | return the likelihood of parameter given dataset X. 99 | It is exactly the posterior probability of X given current parametmer 100 | """ 101 | posterior = self.pairwise_likelihood(X) 102 | log_likelihood = np.log(posterior.sum(axis=0)).mean() 103 | return log_likelihood 104 | 105 | def predict(self, X): 106 | """return the probability of each x belonging to each gaussian distribution""" 107 | posterior = self.pairwise_likelihood(X) 108 | posterior /= (posterior.sum(axis=0, keepdims=True) + self.epsilon) 109 | return posterior 110 | 111 | 112 | if __name__ == '__main__': 113 | def demonstrate(desc, X): 114 | gmm = GMM(3, independent_variance=False) 115 | gmm.fit(X) 116 | pred = gmm.predict(X).T 117 | plt.scatter(X[:, 0], X[:, 1], color=pred) 118 | plt.title(desc) 119 | plt.show() 120 | 121 | # ---------------------- Example 1--------------------------------------------- 122 | X = np.concatenate([ 123 | np.random.normal([0, 0], [.3, .3], [100, 2]), 124 | np.random.normal([0, 1], [.3, .3], [100, 2]), 125 | np.random.normal([1, 0], [.3, .3], [100, 2]), 126 | ]) 127 | demonstrate("Example 1", X) 128 | 129 | # ---------------------- Example 2--------------------------------------------- 130 | demonstrate("Example 2: GMM does'nt promise the same result for the same data", X) 131 | 132 | # ---------------------- Example 3--------------------------------------------- 133 | X = np.concatenate([ 134 | np.random.normal([0, 0], [.4, .4], [100, 2]), 135 | np.random.normal([0, 1], [.4, .4], [100, 2]), 136 | np.random.normal([1, 0], [.4, .4], [100, 2]), 137 | ]) 138 | demonstrate("Example 3", X) 139 | 140 | # ---------------------- Example 4--------------------------------------------- 141 | X = np.concatenate([ 142 | np.random.normal([0, 0], [.4, .4], [100, 2]), 143 | np.random.normal([0, 3], [.4, .4], [100, 2]), 144 | np.random.normal([3, 0], [.4, .4], [100, 2]), 145 | ]) 146 | 
demonstrate("Example 4", X) 147 | 148 | # ---------------------- Example 5--------------------------------------------- 149 | X = np.concatenate([ 150 | np.random.normal([0, 0], [.4, .4], [1, 2]), 151 | np.random.normal([0, 3], [.4, .4], [1, 2]), 152 | np.random.normal([3, 0], [.4, .4], [1, 2]), 153 | ]) 154 | demonstrate("Example 5", X) 155 | -------------------------------------------------------------------------------- /09.EM/GMMGradientDescent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from matplotlib import pyplot as plt 4 | 5 | class GMMGradientDescent: 6 | def __init__(self, k, independent_variance=True, max_step=20000, learning_rate=1e-3, verbose=True): 7 | self.k = k 8 | self.max_step = max_step 9 | self.epsilon = 1e-8 10 | self.learning_rate = learning_rate 11 | self.log_sqrt_2pi = math.log(math.sqrt(2 * torch.pi)) 12 | self.verbose = verbose 13 | # specify whether each feature has independent variance - that is, has a diagnol covariance matrix 14 | self.independent_variance = independent_variance 15 | 16 | def fit(self, X): 17 | """ 18 | X: training data of shape [n, feature_size] 19 | """ 20 | n, self.feature_size = X.shape 21 | X = torch.Tensor(X) 22 | # the parameter of each gaussian distribution 23 | self.prior_logit = torch.zeros(self.k) 24 | self.prior_logit.requires_grad_() 25 | if self.independent_variance: 26 | self.log_std = torch.log(X.std(dim=0)).repeat(self.k, 1) 27 | self.log_std.requires_grad_() 28 | self.mean = torch.zeros(self.k, self.feature_size) 29 | self.mean.normal_() 30 | self.mean.requires_grad_() 31 | else: 32 | self.cholesky_inverse_cov = torch.linalg.cholesky(torch.cov(X.T)).repeat(self.k, 1, 1) 33 | self.cholesky_inverse_cov.requires_grad_() 34 | self.mean = torch.zeros(self.k, self.feature_size) 35 | self.mean.normal_() 36 | self.mean.requires_grad_() 37 | self.optimizer = torch.optim.Adam([self.log_std, self.mean, self.prior_logit], lr=self.learning_rate) 38 | 39 | previous_log_likelihood = -math.inf 40 | for step in range(self.max_step): 41 | ########################################## 42 | # Calculate Likelihood 43 | ########################################## 44 | # posterior probability of each sample in each Gaussian model 45 | # it is exactly the likelihood of parameters including mean, std and prior 46 | log_likelihood = self.log_likelihood(X, input_tensor=True, return_tensor=True) 47 | neg_log_likelihood = -log_likelihood.mean() 48 | 49 | if self.verbose: 50 | if step % 1000 == 0: 51 | print('Step', step, ', likelihood is', math.exp(-neg_log_likelihood)) 52 | 53 | ########################################## 54 | # Gradient Descent Step 55 | ########################################## 56 | self.optimizer.zero_grad() 57 | neg_log_likelihood.backward() 58 | self.optimizer.step() 59 | 60 | # early stopping 61 | log_likelihood = self.log_likelihood(X, input_tensor=True) 62 | if self.verbose: 63 | print('After step', step, ', likelihood of model parameters is', np.exp(log_likelihood)) 64 | if log_likelihood - previous_log_likelihood < self.epsilon: 65 | break 66 | previous_log_likelihood = log_likelihood 67 | 68 | def log_likelihood(self, X, input_tensor=False, return_tensor=False): 69 | if not input_tensor: 70 | X = torch.Tensor(X) 71 | pairwise_likelihood = self.pairwise_likelihood(X) 72 | log_likelihood = torch.log(pairwise_likelihood.sum(dim=0)).mean() 73 | if not return_tensor: 74 | log_likelihood = log_likelihood.detach().numpy() 75 | return log_likelihood 76 | 77 | 
def pairwise_likelihood(self, X): 78 | """return the likelihood of each x belonging to each gaussian distribution""" 79 | # dis[i, j, k] is the distance from i-th center to j-th sample, in k-th dimension 80 | dis = X[None, :, :] - self.mean[:, None, :] 81 | 82 | # calculate log likelihood first, then likelihood 83 | if self.independent_variance: 84 | # log_likelihood is of shape [k, n, feature_size] 85 | data_log_likelihood = -dis ** 2 * .5 / (torch.exp(self.log_std[:, None, :]) ** 2 + self.epsilon) \ 86 | - self.log_sqrt_2pi - self.log_std[:, None, :] 87 | # reduce likelihood to shape [k, n] 88 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 89 | data_log_likelihood = data_log_likelihood.sum(dim=-1) 90 | else: 91 | # log_likelihood is of shape [k, n] 92 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 93 | inverse_cov = self.cholesky_inverse_cov @ self.cholesky_inverse_cov.T 94 | data_log_likelihood = -.5 * (dis @ inverse_cov * dis).sum(axis=-1) \ 95 | +.5 * torch.linalg.slogdet(.5 / torch.pi * inverse_cov)[1][:, None] # slogdet returns [sign, logdet], we just need logdet 96 | 97 | likelihood = torch.exp(data_log_likelihood) 98 | # the posterior of each datium belonging to a distribution, of shape [k, n] 99 | pairwise_likelihood = torch.nn.functional.softmax(self.prior_logit)[:, None] * likelihood 100 | return pairwise_likelihood 101 | 102 | def predict(self, X): 103 | posterior = self.pairwise_likelihood(torch.Tensor(X)).detach().numpy() 104 | posterior /= (posterior.sum(axis=0, keepdims=True) + self.epsilon) 105 | return posterior 106 | 107 | 108 | if __name__ == '__main__': 109 | import numpy as np 110 | 111 | def demonstrate(desc, X): 112 | gmm = GMMGradientDescent(3) 113 | gmm.fit(X) 114 | pred = gmm.predict(X).T 115 | plt.scatter(X[:, 0], X[:, 1], color=pred) 116 | plt.title(desc) 117 | plt.show() 118 | 119 | # ---------------------- Example 1--------------------------------------------- 120 | X = np.concatenate([ 121 | np.random.normal([0, 0], [.3, .3], [100, 2]), 122 | np.random.normal([0, 1], [.3, .3], [100, 2]), 123 | np.random.normal([1, 0], [.3, .3], [100, 2]), 124 | ]) 125 | demonstrate("Example 1", X) 126 | 127 | # ---------------------- Example 2--------------------------------------------- 128 | demonstrate("Example 2: GMM does'nt promise the same result for the same data", X) 129 | 130 | # ---------------------- Example 3--------------------------------------------- 131 | X = np.concatenate([ 132 | np.random.normal([0, 0], [.4, .4], [100, 2]), 133 | np.random.normal([0, 1], [.4, .4], [100, 2]), 134 | np.random.normal([1, 0], [.4, .4], [100, 2]), 135 | ]) 136 | demonstrate("Example 3", X) 137 | 138 | # ---------------------- Example 4--------------------------------------------- 139 | X = np.concatenate([ 140 | np.random.normal([0, 0], [.4, .4], [100, 2]), 141 | np.random.normal([0, 3], [.4, .4], [100, 2]), 142 | np.random.normal([3, 0], [.4, .4], [100, 2]), 143 | ]) 144 | demonstrate("Example 4", X) 145 | 146 | # ---------------------- Example 5--------------------------------------------- 147 | X = np.concatenate([ 148 | np.random.normal([0, 0], [.4, .4], [1, 2]), 149 | np.random.normal([0, 3], [.4, .4], [1, 2]), 150 | np.random.normal([3, 0], [.4, .4], [1, 2]), 151 | ]) 152 | demonstrate("Example 5", X) 153 | -------------------------------------------------------------------------------- /09.EM/benchmark.py: -------------------------------------------------------------------------------- 1 | 
 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | 5 | from GMM import GMM 6 | from GMMGradientDescent import GMMGradientDescent 7 | 8 | def compare(X, k): 9 | gmm = GMM(k, verbose=False) 10 | gmm_gradient_descent = GMMGradientDescent(k, verbose=False) 11 | gmm.fit(X) 12 | gmm_gradient_descent.fit(X) 13 | gmm_likelihood = np.exp(gmm.log_likelihood(X)) 14 | gmm_gradient_descent_likelihood = np.exp(gmm_gradient_descent.log_likelihood(X)) 15 | return gmm_likelihood, gmm_gradient_descent_likelihood 16 | 17 | X = np.concatenate([ 18 | np.random.normal([0, 0], [.3, .3], [100, 2]), 19 | np.random.normal([0, 1], [.3, .3], [100, 2]), 20 | np.random.normal([1, 0], [.3, .3], [100, 2]), 21 | ]) 22 | gmm_likelihoods = [] 23 | gmm_gradient_descent_likelihoods = [] 24 | for i in range(50): 25 | print('Running comparison', i) 26 | gmm_likelihood, gmm_gradient_descent_likelihood = compare(X, 3) 27 | gmm_likelihoods.append(gmm_likelihood) 28 | gmm_gradient_descent_likelihoods.append(gmm_gradient_descent_likelihood) 29 | print('likelihood of EM algorithm is', gmm_likelihood) 30 | print('likelihood of gradient descent is', gmm_gradient_descent_likelihood) 31 | 32 | plt.boxplot([gmm_likelihoods, gmm_gradient_descent_likelihoods]) 33 | # plt.axes().set_xticklabels(["EM", "gradient descent"]) 34 | plt.show() -------------------------------------------------------------------------------- /10.HMM/Backward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rich.console import Console 3 | from rich.table import Table 4 | 5 | def backward(state2state, state2observation, initial_state, observation): 6 | """ 7 | Given a HMM with parameter (state2state, state2observation, initial_state) 8 | and the observation, 9 | return the probability of the observation generated by this HMM 10 | 11 | state2state is a matrix shaped of [state_size, state_size] 12 | state2observation is a matrix shaped of [state_size, observation_size] 13 | initial_state is a tensor shaped of [state_size], whose each dimension means the probability of each state 14 | observation is a matrix shaped of [data_size, sequence_length] 15 | 16 | where 17 | 18 | data_size is the number of all the data pieces 19 | state_size is the number of all the possible states 20 | observation_size is the number of all the possible observations 21 | sequence_length is the length of each sequence 22 | 23 | the return value consists of two parts: 24 | the probability of the observation, 25 | and a sequence of probability of each state of each step 26 | """ 27 | state_size, _ = state2state.shape 28 | data_size, sequence_length = observation.shape 29 | 30 | seq_state_likelihood = np.zeros([data_size, sequence_length, state_size]) 31 | state_likelihood = np.ones([state_size, data_size]) 32 | for i in range(sequence_length - 1, -1, -1): 33 | o = observation[:, i] 34 | # given the parameter of HMM and each possible state this step, get the probability of the following observation 35 | state_likelihood = state2state @ state_likelihood 36 | seq_state_likelihood[:, i, :] = state_likelihood.T 37 | # given the observation of this step, get the probability of this state 38 | state_likelihood = state_likelihood * state2observation[:, o] 39 | state_prob = state_likelihood.T * initial_state 40 | return state_prob.sum(axis=-1), seq_state_likelihood 41 | 42 | 43 | if __name__ == '__main__': 44 | def demonstrate(state2state, state2observation, initial_state, observation,
desc): 45 | console = Console(markup=False) 46 | prob = backward(state2state, state2observation, initial_state, observation)[0] 47 | 48 | # show in table 49 | print(desc) 50 | table = Table('sequence', 'prob') 51 | for o, p in zip(observation, prob): 52 | table.add_row(str(o), str(p)) 53 | table.add_row("Sum", str(sum(prob))) 54 | console.print(table) 55 | 56 | # ---------------------- Example 1 -------------------------------------------- 57 | state2state = np.array( 58 | [[.5, .2, .3], 59 | [.3, .5, .2], 60 | [.2, .3, .5]] 61 | ) 62 | state2observation = np.array( 63 | [[.5, .5], 64 | [.4, .6], 65 | [.7, .3]] 66 | ) 67 | initial_state = np.array([.2, .4, .4]) 68 | observation = np.array([ 69 | [0, 0, 0], 70 | [0, 0, 1], 71 | [0, 1, 0], 72 | [0, 1, 1], 73 | [1, 0, 0], 74 | [1, 0, 1], 75 | [1, 1, 0], 76 | [1, 1, 1], 77 | ]) 78 | demonstrate(state2state, state2observation, initial_state, observation, "Example 1") 79 | 80 | # ---------------------- Example 2 -------------------------------------------- 81 | state2state = np.array( 82 | [[.5, .5], 83 | [.5, .5]] 84 | ) 85 | state2observation = np.array( 86 | [[.5, .5], 87 | [.5, .5]] 88 | ) 89 | initial_state = np.array([.5, .5]) 90 | observation = np.array([ 91 | [0], 92 | [1], 93 | ]) 94 | demonstrate(state2state, state2observation, initial_state, observation, "Example 2") 95 | -------------------------------------------------------------------------------- /10.HMM/BaumWelch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '10.HMM')) 8 | from Backward import backward 9 | from Forward import forward 10 | 11 | def baum_welch(observation, state_size, observation_size, epsilon=1e-8, max_iteration=500): 12 | """ 13 | Given a batch of sequence of observation, 14 | return the parameter of the learnt HMM 15 | 16 | observation is a matrix shaped of [data_size, sequence_length] 17 | 18 | where 19 | 20 | data_size is the number of all the data initial_stateeces 21 | sequence_length is the length of each sequence 22 | 23 | """ 24 | data_size, sequence_legnth = observation.shape 25 | 26 | # initial parameters 27 | state2state = np.random.rand(state_size, state_size) 28 | state2observation = np.random.rand(state_size, observation_size) 29 | initial_state = np.random.rand(state_size) 30 | state2state /= state2state.sum(axis=-1, keepdims=True) 31 | state2observation /= state2observation.sum(axis=-1, keepdims=True) 32 | initial_state /= initial_state.sum() 33 | 34 | for _ in range(max_iteration): 35 | pre_state2state, pre_state2observation, pre_initial_state = state2state, state2observation, initial_state 36 | 37 | # Expectation step, from parameters to probability of states 38 | state_prob_forward = forward(state2state, state2observation, initial_state, observation)[1] 39 | state_likelihood_backward = backward(state2state, state2observation, initial_state, observation)[1] 40 | state_likelihood = state_prob_forward * state_likelihood_backward + epsilon 41 | 42 | state_likelihood_wrt_observation = state2observation.T[observation] 43 | 44 | state_prob = state_likelihood / state_likelihood.sum(axis=-1, keepdims=True) 45 | state_trans_prob = state_prob_forward[:, :-1, :, None] * \ 46 | state2state[None, None, :, :] * \ 47 | state_likelihood_wrt_observation[:, 1:, None, :] * \ 48 | 
state_likelihood_backward[:, 1:, None, :] 49 | state_trans_prob /= state_trans_prob.sum(axis=(-1, -2), keepdims=True) 50 | 51 | # Maximization step, from probability of states to parameters 52 | state2state = state_trans_prob.sum(axis=(0, 1)) / state_prob[:, :-1, :].sum(axis=(0, 1))[:, None] 53 | state2state /= state2state.sum(axis=-1, keepdims=True) 54 | state2observation = ((observation[:, :, None] == np.arange(observation_size)[None, None, :])[:, :, None, :] * 55 | state_prob[:, :, :, None]).sum(axis=(0, 1)) / state_prob.sum(axis=(0, 1))[:, None] 56 | initial_state = state_prob[:, 0].mean(axis=0) 57 | 58 | stride = np.mean([abs(pre_state2state - state2state).mean(), 59 | abs(pre_state2observation - state2observation).mean(), 60 | abs(pre_initial_state - initial_state).mean()]) 61 | if stride < epsilon: 62 | break 63 | return state2state, state2observation, initial_state 64 | 65 | 66 | if __name__ == '__main__': 67 | def demonstrate(observation, state_size, observation_size, desc): 68 | print(desc) 69 | state2state, state2observation, initial_state = baum_welch(observation, state_size, observation_size) 70 | print('state2state is:\n', np.round(state2state, 2)) 71 | print('state2observation is:\n', np.round(state2observation, 2)) 72 | print('initial_state is:\n', np.round(initial_state, 2)) 73 | print('') 74 | 75 | # Example 1 76 | observation = np.array([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]]) 77 | state_size = 2 78 | observation_size = 2 79 | demonstrate(observation, state_size, observation_size, "Example 1") 80 | 81 | # Example 2 82 | observation = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) 83 | state_size = 2 84 | observation_size = 2 85 | demonstrate(observation, state_size, observation_size, "Example 2") 86 | 87 | # Example 3 88 | observation = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) 89 | state_size = 2 90 | observation_size = 2 91 | demonstrate(observation, state_size, observation_size, "Example 3") 92 | 93 | # Example 3 94 | observation = np.array([[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) 95 | state_size = 3 96 | observation_size = 3 97 | demonstrate(observation, state_size, observation_size, "Example 4") 98 | -------------------------------------------------------------------------------- /10.HMM/Forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rich.console import Console 3 | from rich.table import Table 4 | 5 | def forward(state2state, state2observation, initial_state, observation): 6 | """ 7 | Given a HMM with parameter (state2state, state2observation, initial_state) 8 | and the observation, 9 | return the probability of the observation generated by this HMM 10 | 11 | state2state is a matrix shaped of [state_size, state_size] 12 | state2observation is a matrix shaped of [state_size, observation_size] 13 | initial_state is a tensor shaped of [state_size], whose each dimension means the probability of each state 14 | observation is a matrix shaped of [data_size, sequence_length] 15 | 16 | where 17 | 18 | data_size is the number of all the data initial_stateeces 19 | state_size is the number of all the possible states 20 | observation_size is the number of all the possible observations 21 | sequence_length is the length of each sequence 22 | 23 | the return value consists of two parts: 24 | the probability of the observation, 25 | and a sequence of probability of each state of each step 26 | """ 27 | state_size, _ = state2state.shape 28 | data_size, sequence_length = observation.shape 29 | 30 | 
seq_state_prob = np.zeros([data_size, sequence_length, state_size]) 31 | state_prob = initial_state[None, :] 32 | for i, o in enumerate(observation.T): 33 | # given the parameters of HMM, get the probability of this state with the previous observation 34 | state_prob = state_prob * state2observation.T[o] 35 | seq_state_prob[:, i, :] = state_prob 36 | # the probability of each state in next step 37 | state_prob = state_prob @ state2state 38 | return state_prob.sum(axis=-1), seq_state_prob 39 | 40 | 41 | if __name__ == '__main__': 42 | def demonstrate(state2state, state2observation, initial_state, observation, desc): 43 | console = Console(markup=False) 44 | prob = forward(state2state, state2observation, initial_state, observation)[0] 45 | 46 | # show in table 47 | print(desc) 48 | table = Table('sequence', 'prob') 49 | for o, p in zip(observation, prob): 50 | table.add_row(str(o), str(p)) 51 | table.add_row("Sum", str(sum(prob))) 52 | console.print(table) 53 | 54 | # ---------------------- Example 1 -------------------------------------------- 55 | state2state = np.array( 56 | [[.5, .2, .3], 57 | [.3, .5, .2], 58 | [.2, .3, .5]] 59 | ) 60 | state2observation = np.array( 61 | [[.5, .5], 62 | [.4, .6], 63 | [.7, .3]] 64 | ) 65 | initial_state = np.array([.2, .4, .4]) 66 | observation = np.array([ 67 | [0, 0, 0], 68 | [0, 0, 1], 69 | [0, 1, 0], 70 | [0, 1, 1], 71 | [1, 0, 0], 72 | [1, 0, 1], 73 | [1, 1, 0], 74 | [1, 1, 1], 75 | ]) 76 | demonstrate(state2state, state2observation, initial_state, observation, "Example 1") 77 | 78 | # ---------------------- Example 2 -------------------------------------------- 79 | state2state = np.array( 80 | [[.5, .5], 81 | [.5, .5]] 82 | ) 83 | state2observation = np.array( 84 | [[.5, .5], 85 | [.5, .5]] 86 | ) 87 | initial_state = np.array([.5, .5]) 88 | observation = np.array([ 89 | [0], 90 | [1], 91 | ]) 92 | demonstrate(state2state, state2observation, initial_state, observation, "Example 2") 93 | 94 | 95 | # ---------------------- Example 2 -------------------------------------------- 96 | state2state = np.array( 97 | [[.0, 1.], 98 | [1., .0]] 99 | ) 100 | state2observation = np.array( 101 | [[1., 0.], 102 | [0., 1.]] 103 | ) 104 | initial_state = np.array([0., 1.]) 105 | observation = np.array([ 106 | [1, 0, 1, 0, 1, 0], 107 | ]) 108 | demonstrate(state2state, state2observation, initial_state, observation, "Example 2") 109 | -------------------------------------------------------------------------------- /10.HMM/HMM.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from functools import partial 4 | import sys 5 | from pathlib import Path 6 | from rich.console import Console 7 | from rich.table import Table 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '10.HMM')) 11 | from BaumWelch import baum_welch 12 | from Viterbi import viterbi 13 | 14 | class HMM: 15 | def __init__(self, state_size, observation_size, max_iteration=2000, verbose=False, epsilon=1e-8): 16 | self.max_iteration = max_iteration 17 | self.verbose = verbose 18 | self.state_size = state_size 19 | self.observation_size = observation_size 20 | self.epsilon = epsilon 21 | 22 | def fit(self, X): 23 | """ 24 | When there is no label in the training data, 25 | HMM uses baum-welch for training. 
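(Baum-Welch is the EM algorithm specialized to HMMs: the E-step runs the forward and backward passes to get state posteriors, and the M-step re-estimates the transition, emission and initial probabilities from them.)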
26 | Otherwise just counting the probability will be fine (not implemented here) 27 | """ 28 | self.state2state, self.state2observation, self.initial_state = \ 29 | baum_welch(X, self.state_size, self.observation_size, self.epsilon, self.max_iteration) 30 | 31 | def predict(self, X): 32 | """HMM uses viterbi for predicting""" 33 | Y = np.zeros_like(X) 34 | Y = np.apply_along_axis( 35 | partial(viterbi, self.state2state, self.state2observation, self.initial_state), -1, X) 36 | return Y 37 | 38 | 39 | if __name__ == '__main__': 40 | def demonstrate(X, testX, desc): 41 | console = Console(markup=False) 42 | 43 | vocab = set(X.flatten()) 44 | vocab_size = len(vocab) 45 | word2num = {word: num for num, word in enumerate(vocab)} 46 | 47 | f_word2num = np.vectorize(lambda word: word2num[word]) 48 | 49 | numX, num_testX = map(f_word2num, (X, testX)) 50 | 51 | hmm = HMM(4, vocab_size) 52 | hmm.fit(numX) 53 | pred = hmm.predict(num_testX) 54 | 55 | # show in table 56 | print(desc) 57 | table = Table() 58 | for x, p in zip(testX, pred): 59 | table.add_row(*map(str, x)) 60 | table.add_row(*map(str, p)) 61 | console.print(table) 62 | 63 | 64 | # ---------------------- Example 1 -------------------------------------------- 65 | X = np.array([s.split() for s in 66 | ['i am good .', 67 | 'i am bad .', 68 | 'you are good .', 69 | 'you are bad .', 70 | 'it is good .', 71 | 'it is bad .', 72 | ] 73 | ]) 74 | testX = X 75 | demonstrate(X, testX, "Example 1") 76 | 77 | # ---------------------- Example 2 -------------------------------------------- 78 | testX = np.array([s.split() for s in 79 | ['you is good .', 80 | 'i are bad .', 81 | 'it are good .'] 82 | ]) 83 | testX = np.concatenate([X, testX]) 84 | demonstrate(X, testX, "Example 2") 85 | -------------------------------------------------------------------------------- /10.HMM/Viterbi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def viterbi(state2state, state2observation, initial_state, observation): 4 | """ 5 | Given a HMM with parameter (state2state, state2observation, initial_state) 6 | and the observation, 7 | return the most possible state sequence 8 | 9 | state2state is a matrix shaped of [state_size, state_size] 10 | state2observation is a matrix shaped of [state_size, observation_size] 11 | initial_state is a tensor shaped of [state_size], whose each dimension means the probability of each state 12 | observation is a tensor shaped of [sequence_length] 13 | observation_size is the number of all the possible observations 14 | """ 15 | sequence_length, = observation.shape 16 | state_size, _ = state2state.shape 17 | 18 | state_prob = initial_state 19 | pre_state = np.zeros([sequence_length, state_size]).astype(int) 20 | for i, o in enumerate(observation): 21 | state_prob *= state2observation[:, o] 22 | if i != sequence_length - 1: 23 | trans_prob = state_prob[:, None] * state2state 24 | pre_state[i + 1] = trans_prob.argmax(axis=0) 25 | state_prob = trans_prob.max(axis=0) 26 | ans = np.zeros(sequence_length).astype(int) 27 | ans[-1] = state_prob.argmax() 28 | for i in range(sequence_length - 2, -1, -1): 29 | ans[i] = pre_state[i + 1, ans[i + 1]] 30 | return ans 31 | 32 | 33 | if __name__ == '__main__': 34 | state2state = np.array([[.5, .2, .3], 35 | [.3, .5, .2], 36 | [.2, .3, .5]]) 37 | state2observation = np.array([[.5, .5], 38 | [.4, .6], 39 | [.7, .3]]) 40 | initial_state = np.array([.2, .4, .4]) 41 | observation = np.array([0, 1, 0]) 42 | print(viterbi(state2state, 
state2observation, initial_state, observation)) 43 | -------------------------------------------------------------------------------- /11.ConditionalRandomField/LinearChainConditionalRandomField.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | import os 3 | from matplotlib.tri.triinterpolate import LinearTriInterpolator 4 | import numpy as np 5 | from functools import partial 6 | import sys 7 | from pathlib import Path 8 | from rich.console import Console 9 | from rich.table import Table 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import * 12 | 13 | class LinearChainConditionalRandomField: 14 | def __init__(self, feature_funcs, trans_feature_funcs, sequence_length, n_x, n_y, max_iteration=100, verbose=False): 15 | """ 16 | `feature_funcs` are a group of functions s(y_i, X, i) in a list 17 | `trans_feature_funcs` are a group of functions t(y_{i-1}, y_i, X, i) in a list 18 | `sequence_length` is the length of each input sequence 19 | `n_x` is the number of possible values of each item in a sequence x 20 | `n_y` is the number of possible values of each item in a sequence y 21 | """ 22 | self.feature_funcs = feature_funcs 23 | self.trans_feature_funcs = trans_feature_funcs 24 | self.n_x = n_x 25 | self.n_y = n_y 26 | self.sequence_length = sequence_length 27 | self.max_iteration = max_iteration 28 | self.verbose = verbose 29 | 30 | def get_trans(self, x): 31 | """get transition matrix given observed sequence x""" 32 | trans_feature = np.zeros([self.sequence_length, self.n_y, self.n_y]) 33 | for i in range(self.sequence_length): 34 | for y_i_1 in range(self.n_y): 35 | for y_i in range(self.n_y): 36 | for j, func in enumerate(self.used_feature_funcs): 37 | trans_feature[i, y_i_1, y_i] += self.w_feature_funcs[j] * func(y_i, x, i) 38 | if i > 0: 39 | for y_i_1 in range(self.n_y): 40 | for y_i in range(self.n_y): 41 | for j, func in enumerate(self.used_trans_feature_funcs): 42 | trans_feature[i, y_i_1, y_i] += self.w_trans_feature_funcs[j] * func(y_i_1, y_i, x, i) 43 | return np.exp(trans_feature) 44 | 45 | def fit(self, X, Y): 46 | """ 47 | X is a two dimensional matrix of observation sequence 48 | Y is a two dimensional matrix of hidden state sequence 49 | optimize weights by Improved Iterative Scaling 50 | """ 51 | E_feature = np.zeros(len(self.feature_funcs)) 52 | E_trans_feature = np.zeros(len(self.trans_feature_funcs)) 53 | 54 | # Because each x is a sequence, it's vector space is too large to iterate. 55 | # We need to store all the possible sequence x during the training time 56 | # and only iterate over existing x. 57 | p_x = {tuple(x): 0. 
for x in X} 58 | 59 | for x, y in zip(X, Y): 60 | x_key = tuple(x) 61 | p_x[x_key] += 1 / len(X) 62 | for i, yi in enumerate(y): 63 | for j, func in enumerate(self.feature_funcs): 64 | E_feature[j] += func(yi, x, i) / len(X) 65 | for i in range(1, self.sequence_length): 66 | yi_1, yi = y[i - 1], y[i] 67 | for j, func in enumerate(self.trans_feature_funcs): 68 | E_trans_feature[j] += func(yi_1, yi, x, i) / len(X) 69 | 70 | # features that don't show in training data are useless, filter them 71 | self.used_feature_funcs = [func for E, func in zip(E_feature, self.feature_funcs) if E != 0] 72 | self.used_trans_feature_funcs = [func for E, func in zip(E_trans_feature, self.trans_feature_funcs) if E != 0] 73 | E_feature = E_feature[E_feature.nonzero()] 74 | E_trans_feature = E_trans_feature[E_trans_feature.nonzero()] 75 | self.w_feature_funcs = np.zeros(len(self.used_feature_funcs)) 76 | self.w_trans_feature_funcs = np.zeros(len(self.used_trans_feature_funcs)) 77 | 78 | # pre-calculate all the possible values of feature functions 79 | feature = np.zeros([len(self.used_feature_funcs), len(p_x), self.sequence_length, self.n_y]) 80 | trans_feature = np.zeros([len(self.used_trans_feature_funcs), len(p_x), self.sequence_length, self.n_y, self.n_y]) 81 | for x_i, x_key in enumerate(p_x): 82 | x = np.array(x_key) 83 | for func_i, func in enumerate(self.used_trans_feature_funcs): 84 | for i in range(1, self.sequence_length): 85 | for y_i_1 in range(self.n_y): 86 | for y_i in range(self.n_y): 87 | trans_feature[func_i, x_i, i, y_i_1, y_i] = func(y_i_1, y_i, x, i) 88 | for func_i, func in enumerate(self.used_feature_funcs): 89 | for i in range(self.sequence_length): 90 | for y_i in range(self.n_y): 91 | feature[func_i, x_i, i, y_i] = func(y_i, x, i) 92 | 93 | # pre-calculate the max number of features, given x 94 | max_feature = np.zeros(len(p_x), dtype=int) 95 | sum_trans_feature = trans_feature.sum(axis=0) 96 | sum_feature = feature.sum(axis=0) 97 | for x_i, x_key in enumerate(p_x): 98 | cur_max_feature = np.zeros(self.n_y) 99 | for i in range(self.sequence_length): 100 | cur_max_feature = (cur_max_feature[:, None] + sum_trans_feature[x_i, i]).max(axis=0) + sum_feature[x_i, i] 101 | max_feature[x_i] = cur_max_feature.max() 102 | n_coef = max(max_feature) + 1 103 | 104 | # train 105 | for iteration in range(self.max_iteration): 106 | if self.verbose: 107 | print(f'Iteration {iteration} starts...') 108 | loss = 0. 
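# (added commentary, not part of the original source) The nested loop below is
# one sweep of Improved Iterative Scaling, as named in the fit() docstring.
# For every feature function f_k it looks for an increment delta_k such that
# the empirical expectation E_data[f_k] equals
#     sum_x p~(x) * E_{p(y|x)}[f_k] * beta^{T(x)},
# where beta = exp(delta_k) and T(x) is the (maximum) total feature count of
# sequence x, precomputed in `max_feature`. Grouping the x's by T(x) turns
# this condition into a polynomial in beta whose coefficients are accumulated
# in `coef`; its root is found with Newton's method and delta_k = log(beta)
# is added to the corresponding weight. The forward/backward passes inside
# the loop supply E_{p(y|x)}[f_k] without enumerating all label sequences y.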
109 | for funcs, w, E_experience in zip( 110 | [self.used_feature_funcs, self.used_trans_feature_funcs], 111 | [self.w_feature_funcs, self.w_trans_feature_funcs], 112 | [E_feature, E_trans_feature]): 113 | for func_i in range(len(funcs)): 114 | # if funcs is self.used_trans_feature_funcs: 115 | coef = np.zeros(n_coef) 116 | # only iterater over possible x 117 | for x_i, x_key in enumerate(p_x): 118 | cur_p_x = p_x[x_key] 119 | x = np.array(x_key) 120 | 121 | trans = self.get_trans(x) 122 | # forward algorithm 123 | cur_prob = np.ones(self.n_y) 124 | forward_prob = np.zeros([self.sequence_length + 1, self.n_y]) 125 | forward_prob[0] = cur_prob 126 | for i in range(self.sequence_length): 127 | cur_prob = cur_prob @ trans[i] 128 | forward_prob[i + 1] = cur_prob 129 | # backward algorithm 130 | cur_prob = np.ones(self.n_y) 131 | backward_prob = np.zeros([self.sequence_length + 1, self.n_y]) 132 | backward_prob[-1] = cur_prob 133 | for i in range(self.sequence_length - 1, -1, -1): 134 | cur_prob = trans[i] @ cur_prob 135 | backward_prob[i] = cur_prob 136 | 137 | if iteration < 10: 138 | np.testing.assert_almost_equal( 139 | forward_prob[-1].sum(), 140 | backward_prob[0].sum() 141 | ) 142 | for i in range(1, self.sequence_length + 1): 143 | np.testing.assert_almost_equal( 144 | forward_prob[i] @ backward_prob[i], 145 | forward_prob[-1].sum() 146 | ) 147 | for i in range(0, self.sequence_length): 148 | np.testing.assert_almost_equal( 149 | (np.outer(forward_prob[i], backward_prob[i + 1]) * trans[i]).sum(), 150 | forward_prob[-1].sum() 151 | ) 152 | 153 | # calculate expectation of each feature_function given x 154 | cur_E_feature = 0. 155 | if funcs is self.used_feature_funcs: 156 | for i in range(1, self.sequence_length + 1): 157 | cur_E_feature += ( 158 | forward_prob[i] * backward_prob[i] * feature[func_i, x_i, i - 1] 159 | ).sum() 160 | elif funcs is self.used_trans_feature_funcs: 161 | for i in range(0, self.sequence_length): 162 | cur_E_feature += ( 163 | np.outer(forward_prob[i], backward_prob[i + 1]) * trans[i] * trans_feature[func_i, x_i, i] 164 | ).sum() 165 | else: 166 | raise Exception("Unknown function set!") 167 | cur_E_feature /= forward_prob[-1].sum() 168 | 169 | coef[max_feature[x_i]] += cur_p_x * cur_E_feature 170 | 171 | # update w 172 | dw_i = log(newton( 173 | lambda x: sum(c * x ** i for i, c in enumerate(coef)) - E_experience[func_i], 174 | lambda x: sum(i * c * x ** (i - 1) for i, c in enumerate(coef) if i > 0), 175 | 1 176 | )) 177 | w[func_i] += dw_i 178 | loss += abs(E_experience[func_i] - coef.sum()) 179 | loss /= len(self.feature_funcs) + len(self.trans_feature_funcs) 180 | if self.verbose: 181 | print(f'Iteration {iteration} ends, Loss: {loss}') 182 | 183 | def predict(self, X): 184 | """ 185 | predict state sequence y using viterbi algorithm 186 | X is a group of sequence x in a two-dimensional array 187 | """ 188 | 189 | ans = np.zeros([len(X), self.sequence_length]) 190 | for x_i, x in enumerate(X): 191 | # pre-calculate all the possible values of feature functions 192 | feature = np.zeros([len(self.used_feature_funcs), self.sequence_length, self.n_y]) 193 | trans_feature = np.zeros([len(self.used_trans_feature_funcs), self.sequence_length, self.n_y, self.n_y]) 194 | for func_i, func in enumerate(self.used_trans_feature_funcs): 195 | for i in range(1, self.sequence_length): 196 | for y_i_1 in range(self.n_y): 197 | for y_i in range(self.n_y): 198 | trans_feature[func_i, i, y_i_1, y_i] = func(y_i_1, y_i, x, i) 199 | for func_i, func in 
enumerate(self.used_feature_funcs): 200 | for i in range(self.sequence_length): 201 | for y_i in range(self.n_y): 202 | feature[func_i, i, y_i] = func(y_i, x, i) 203 | feature = (self.w_feature_funcs[:, None, None] * feature).sum(axis=0) 204 | trans_feature = (self.w_trans_feature_funcs[:, None, None, None] * trans_feature).sum(axis=0) 205 | 206 | # viterbi 207 | pre_state = np.zeros([self.sequence_length, self.n_y], dtype=int) - 1 208 | prob = np.zeros([self.sequence_length, self.n_y]) 209 | cur_prob = np.ones(self.n_y) 210 | for i in range(self.sequence_length): 211 | trans_prob = cur_prob[:, None] + trans_feature[i] 212 | pre_state[i] = trans_prob.argmax(axis=0) 213 | cur_prob = trans_prob.max(axis=0) + feature[i] 214 | prob[i] = cur_prob 215 | 216 | # back track the trace 217 | cur_state = prob[-1].argmax() 218 | for i in range(self.sequence_length - 1, -1, -1): 219 | ans[x_i, i] = cur_state 220 | cur_state = pre_state[i, cur_state] 221 | return ans 222 | 223 | 224 | if __name__ == '__main__': 225 | def demonstrate(X, Y, testX, n_y, desc): 226 | console = Console(markup=False) 227 | 228 | vocab = set(X.flatten()) 229 | vocab_size = len(vocab) 230 | word2num = {word: num for num, word in enumerate(vocab)} 231 | 232 | f_word2num = np.vectorize(lambda word: word2num[word]) 233 | 234 | numX, num_testX = map(f_word2num, (X, testX)) 235 | 236 | sequence_length = numX.shape[-1] 237 | 238 | class FeatureFunc: 239 | def __init__(self, x_i, y_i): 240 | self.x_i = x_i 241 | self.y_i = y_i 242 | 243 | def __call__(self, y_i, x, i): 244 | return int(y_i == self.y_i and x[i] == self.x_i) 245 | 246 | class TransFeatureFunc: 247 | def __init__(self, y_i_1, y_i): 248 | self.y_i = y_i 249 | self.y_i_1 = y_i_1 250 | 251 | def __call__(self, y_i_1, y_i, x, i): 252 | return int(y_i_1 == self.y_i_1 and y_i == self.y_i) 253 | 254 | feature_funcs = [FeatureFunc(x_i, y_i) 255 | for x_i in range(vocab_size) 256 | for y_i in range(n_y)] 257 | trans_feature_funcs = [TransFeatureFunc(y_i_1, y_i) 258 | for y_i_1 in range(n_y) 259 | for y_i in range(n_y)] 260 | 261 | linear_chain_conditional_random_field = LinearChainConditionalRandomField( 262 | feature_funcs, 263 | trans_feature_funcs, 264 | sequence_length, 265 | vocab_size, 266 | n_y, 267 | verbose=True 268 | ) 269 | linear_chain_conditional_random_field.fit(numX, Y) 270 | pred = linear_chain_conditional_random_field.predict(num_testX) 271 | 272 | # show in table 273 | print(desc) 274 | table = Table() 275 | for x, p in zip(testX, pred): 276 | table.add_row(*map(str, x)) 277 | table.add_row(*map(str, p)) 278 | console.print(table) 279 | 280 | 281 | # ---------------------- Example 1 -------------------------------------------- 282 | X = np.array([s.split() for s in 283 | ['i am good .', 284 | 'i am bad .', 285 | 'you are good .', 286 | 'you are bad .', 287 | 'it is good .', 288 | 'it is bad .', 289 | ] 290 | ]) 291 | Y = np.array([ 292 | [0, 1, 2, 3], 293 | [0, 1, 2, 3], 294 | [0, 1, 2, 3], 295 | [0, 1, 2, 3], 296 | [0, 1, 2, 3], 297 | ]) 298 | testX = np.array([s.split() for s in 299 | ['you is good .', 300 | 'i are bad .', 301 | 'it are good .'] 302 | ]) 303 | testX = np.concatenate([X, testX]) 304 | demonstrate(X, Y, testX, 4, "Example 1") 305 | 306 | # ---------------------- Example 1 -------------------------------------------- 307 | X = np.array([s.split() for s in 308 | ['i be good .', 309 | 'you be good .', 310 | 'be good . .', 311 | 'i love you .', 312 | 'he be . 
.', 313 | ] 314 | ]) 315 | # pronoun: 0, verb: 1, adjective: 2, ".": 3 316 | Y = np.array([ 317 | [0, 1, 2, 3], 318 | [0, 1, 2, 3], 319 | [1, 2, 3, 3], 320 | [0, 1, 0, 3], 321 | [0, 1, 3, 3], 322 | ]) 323 | testX = np.array([s.split() for s in 324 | ['you be good .', 325 | 'he love you .', 326 | 'i love good .', 327 | '. be love .', 328 | '. love be .', 329 | '. . be good'] 330 | ]) 331 | testX = np.concatenate([X, testX]) 332 | demonstrate(X, Y, testX, 4, "Example 2") 333 | -------------------------------------------------------------------------------- /14.Cluster/Agglomerative.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import euc_dis 8 | 9 | class Agglomerative: 10 | def __init__(self, k): 11 | self.k = k 12 | 13 | def get_root(self, i): 14 | if self.parent[i] != i: 15 | self.parent[i] = self.get_root(self.parent[i]) 16 | return self.parent[i] 17 | 18 | def fit_predict(self, X): 19 | """ 20 | X is a matrix shaped of [data_size, feature_size] 21 | """ 22 | data_size, feature_size = X.shape 23 | self.cluster_num = data_size 24 | 25 | self.parent = [i for i in range(data_size)] 26 | dis = euc_dis(X[:, None, :], X[None, :, :]) 27 | sorted_a, sorted_b = np.unravel_index(np.argsort(dis, axis=None), dis.shape) 28 | for a, b in zip(sorted_a, sorted_b): 29 | root_a, root_b = self.get_root(a), self.get_root(b) 30 | if root_a != root_b: 31 | if root_a > root_b: 32 | root_a, root_b = root_b, root_a 33 | self.parent[root_b] = root_a 34 | 35 | self.cluster_num -= 1 36 | if self.cluster_num <= self.k: 37 | break 38 | 39 | root = [self.get_root(i) for i in range(data_size)] 40 | root_map = {n: i for i, n in enumerate(sorted(list(set(root))))} 41 | return [root_map[r] for r in root] 42 | 43 | 44 | if __name__ == "__main__": 45 | def demonstrate(X, k, desc): 46 | agglomerative = Agglomerative(k=k) 47 | pred = agglomerative.fit_predict(X) 48 | 49 | # plot 50 | plt.scatter(X[:,0], X[:,1], c=pred, s=20) 51 | plt.title(desc) 52 | plt.show() 53 | 54 | # -------------------------- Example 1 ---------------------------------------- 55 | X = np.array([[0, 0], [0, 1], [1, 0], [2, 2], [2, 1], [1, 2]]) 56 | # generate grid-shaped test data 57 | demonstrate(X, 2, "Example 1") 58 | 59 | # -------------------------- Example 2 ---------------------------------------- 60 | X = np.concatenate([ 61 | np.random.normal([0, 0], [.3, .3], [100, 2]), 62 | np.random.normal([0, 1], [.3, .3], [100, 2]), 63 | np.random.normal([1, 0], [.3, .3], [100, 2]), 64 | ]) 65 | # generate grid-shaped test data 66 | demonstrate(X, 3, "Example 2: it is very sensitive to noise") 67 | -------------------------------------------------------------------------------- /14.Cluster/KMeans.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import euc_dis 8 | 9 | class KMeans: 10 | def __init__(self, k, max_iterations=1000, verbose=False): 11 | self.k = k 12 | self.max_iterations = max_iterations 13 | self.verbose = verbose 14 | 15 | def fit(self, X): 16 | """ 17 | X is a matrix shaped of [data_size, feature_size] 18 | """ 19 | X = X.astype(float) 20 | data_size, 
feature_size = X.shape 21 | 22 | self.centers = X[np.random.choice(data_size, self.k, replace=False)] 23 | pre_centers = self.centers - 1 24 | step = 0 25 | if self.verbose: 26 | print('Initial centroids:', self.centers) 27 | while (pre_centers != self.centers).any(): 28 | pre_centers = self.centers.copy() 29 | # distance from each data sample to the centroid 30 | # dis[i, j] is the distance from i-th data sample to the j-th centroid 31 | # shape: [data_size, k] 32 | dis = euc_dis(X[:, None, :], self.centers[None, :, :]) 33 | # assignment of each data sample to centroid 34 | # cluster[i] is the index of cluster of i-th data sample 35 | # shape: [data_size] 36 | cluster = dis.argmin(axis=-1) 37 | for i in range(self.k): 38 | self.centers[i] = X[cluster == i].mean(axis=0) 39 | step += 1 40 | if self.verbose: 41 | print('Step', step) 42 | print('Assignment:', cluster) 43 | print('Centroids:', self.centers) 44 | if step == self.max_iterations: 45 | break 46 | 47 | def predict(self, X): 48 | dis = euc_dis(X[:, None, :], self.centers[None, :, :]) 49 | return dis.argmin(axis=-1) 50 | 51 | if __name__ == "__main__": 52 | def demonstrate(X, k, desc): 53 | k_means = KMeans(k=k, verbose=True) 54 | k_means.fit(X) 55 | pred = k_means.predict(X) 56 | 57 | # plot 58 | plt.scatter(k_means.centers[:, 0], k_means.centers[:,1], marker='x', label='centroids') 59 | plt.scatter(X[:,0], X[:,1], c=pred, s=20, label='data samples') 60 | plt.legend() 61 | plt.title(desc) 62 | plt.show() 63 | 64 | # -------------------------- Example 1 ---------------------------------------- 65 | X = np.array([[0, 0], [0, 1], [1, 0], [2, 2], [2, 1], [1, 2]]).astype(float) 66 | demonstrate(X, 2, "Example 1") 67 | 68 | # -------------------------- Example 2 ---------------------------------------- 69 | X = np.concatenate([ 70 | np.random.normal([0, 0], [.3, .3], [100, 2]), 71 | np.random.normal([0, 1], [.3, .3], [100, 2]), 72 | np.random.normal([1, 0], [.3, .3], [100, 2]), 73 | ]).astype(float) 74 | demonstrate(X, 3, "Example 2") 75 | 76 | # -------------------------- Example 3 ---------------------------------------- 77 | X = np.array([[0, 0], [0, 1], [0, 3]]).astype(float) 78 | demonstrate(X, 2, "Example 3: K-Means doesn't always return the best answer. 
(try to run multiple times!)") 79 | -------------------------------------------------------------------------------- /15.SVD/SVD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def svd(A): 9 | """ 10 | given an m x n matrix, 11 | return the result of SVD, 12 | as a tuple of (U, Sigma, V) 13 | """ 14 | m , n = A.shape 15 | 16 | symmetry = A.T @ A 17 | rank = np.linalg.matrix_rank(symmetry) 18 | eigen_values, eigen_vectors = np.linalg.eig(symmetry) 19 | eigen_order = eigen_values.argsort()[::-1] 20 | eigen_values = eigen_values[eigen_order] 21 | 22 | eigen_values = eigen_values[: rank] 23 | eigen_vectors = eigen_vectors[:, eigen_order] 24 | # V is of shape [n, n] 25 | V = eigen_vectors 26 | eigen_vectors = eigen_vectors[:, : rank] 27 | 28 | singular_values = np.sqrt(eigen_values) 29 | singular_matrix = np.zeros_like(A) 30 | for i, v in enumerate(singular_values): 31 | singular_matrix[i][i] = v 32 | 33 | U1 = A @ eigen_vectors / singular_values 34 | U2 = get_solution_domain(row_echelon(A.T)) 35 | U = np.concatenate([U1, U2], axis=-1) 36 | return U, singular_matrix, V 37 | 38 | 39 | if __name__ == '__main__': 40 | def demonstrate(A, desc): 41 | print(desc) 42 | U, singular_matrix, V = svd(A) 43 | print("U is:") 44 | print(U) 45 | print("Singular matrix is:") 46 | print(singular_matrix) 47 | print("V is:") 48 | print(V) 49 | print("The reconstructed matrix is:") 50 | print(U @ singular_matrix @ V.T) 51 | 52 | A = np.array([[1, 1], 53 | [2, 2], 54 | [0, 0]]).astype(float) 55 | demonstrate(A, 'Example 1') 56 | 57 | A = np.array([[1, 0, 0, 0], 58 | [0, 0, 0, 4], 59 | [0, 3, 0, 0], 60 | [0, 0, 0, 0], 61 | [2, 0, 0, 0]]).astype(float) 62 | demonstrate(A, 'Example 2') 63 | 64 | A = np.array([[3, 1], 65 | [2, 1]]).astype(float) 66 | demonstrate(A, 'Example 3') 67 | -------------------------------------------------------------------------------- /16.PCA/PCA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '15.SVD')) 8 | from SVD import svd 9 | 10 | def pca(X, k=5): 11 | """ 12 | given a normlized matrix X, each of whose column is a sample 13 | the dimension of the principle component, k 14 | return the principle component matrix 15 | """ 16 | m, n = X.shape 17 | X_trans = 1 / sqrt(n - 1) * X.T 18 | _, _, V = svd(X_trans) 19 | V = V[:, :k] 20 | return V.T @ X 21 | 22 | if __name__ == '__main__': 23 | def demonstrate(X, k, desc): 24 | print(desc) 25 | X -= X.mean(axis=-1, keepdims=True) 26 | X_trans = pca(X, k=k) 27 | print(X_trans) 28 | 29 | X = np.array([[1, 1], 30 | [2, 2], 31 | [0, 0]]).astype(float) 32 | demonstrate(X, 1, 'Example 1') 33 | 34 | X = np.array([[1, 0, 0, 0], 35 | [0, 0, 0, 4], 36 | [0, 3, 0, 0], 37 | [0, 0, 0, 0], 38 | [2, 0, 0, 0]]).astype(float) 39 | demonstrate(X, 1, 'Example 2') 40 | 41 | X = np.array([[3, 1], 42 | [2, 1]]).astype(float) 43 | demonstrate(X, 1, 'Example 3') 44 | 45 | X = np.array([[0, 0], 46 | [-1, 1]]).astype(float) 47 | demonstrate(X, 1, 'Example 3') 48 | -------------------------------------------------------------------------------- /17.LSA/LSA.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def lsa(word_text, k=5, max_iteration=1000): 9 | """ 10 | given a word-text matrix 11 | the dimension of the principle component, k 12 | optimize using the algorithm proposed by Lee and Seung 13 | return the word-topic matrix and text-topic matrix 14 | """ 15 | n_word, n_text = word_text.shape 16 | word_topic = np.random.rand(n_word, k) 17 | topic_text = np.random.rand(k, n_text) 18 | for i in range(max_iteration): 19 | word_topic *= (word_text @ topic_text.T) / (word_topic @ topic_text @ topic_text.T) 20 | topic_text *= (word_topic.T @ word_text) / (word_topic.T @ word_topic @ topic_text) 21 | return word_topic, topic_text.T 22 | 23 | if __name__ == '__main__': 24 | def demonstrate(X, k, desc): 25 | print(desc) 26 | word_topic, text_topic = lsa(X, k=k) 27 | print("The topic vectors of all the words are") 28 | print(word_topic) 29 | print("The topic vectors of all the texts are") 30 | print(text_topic) 31 | print("The recovered word-text matrix is") 32 | print(np.round(word_topic @ text_topic.T)) 33 | 34 | X = np.array([ 35 | [0, 0, 1, 1, 0, 0, 0, 0, 0], 36 | [0, 0, 0, 0, 0, 1, 0, 0, 1], 37 | [0, 1, 0, 0, 0, 0, 0, 1, 0], 38 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 39 | [1, 0, 0, 0, 0, 1, 0, 0, 0], 40 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 41 | [1, 0, 1, 0, 0, 0, 0, 0, 0], 42 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 43 | [0, 0, 0, 0, 0, 2, 0, 0, 1], 44 | [1, 0, 1, 0, 0, 0, 0, 1, 0], 45 | [0, 0, 0, 1, 1, 0, 0, 0, 0], 46 | ]).astype(float) 47 | demonstrate(X, 3, 'Example 1') 48 | -------------------------------------------------------------------------------- /18.PLSA/PLSA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def plsa(word_text, k=5, max_iteration=1000, epsilon=1e-8): 9 | """ 10 | given a word-text matrix 11 | the dimension of the principle component, k 12 | optimize using EM algorithm 13 | return the word-topic matrix and text-topic matrix 14 | """ 15 | n_word, n_text = word_text.shape 16 | p_topic_when_text = np.random.rand(n_text, k) 17 | p_word_when_topic = np.random.rand(k, n_word) 18 | 19 | text_word = word_text.T 20 | text_word_cnt = text_word.sum(axis=-1, keepdims=True) 21 | for i in range(max_iteration): 22 | # E step: calculate the expectation of each topic for each word-text pair 23 | p_topic_when_text_word = p_topic_when_text[:, :, None] * p_word_when_topic[None, :, :] 24 | p_topic_when_text_word /= p_topic_when_text_word.sum(axis=1, keepdims=True) + epsilon 25 | 26 | # M step, maximazation the likelihood of the observation, i.e., the word-text matrix 27 | topic_cnt = text_word[:, None, :] * p_topic_when_text_word 28 | p_word_when_topic = (topic_cnt).sum(axis=0) / \ 29 | (topic_cnt).sum(axis=0).sum(axis=-1, keepdims=True) 30 | p_topic_when_text = (text_word[:, None, :] * p_topic_when_text_word).sum(axis=-1) / text_word_cnt 31 | return p_topic_when_text, p_word_when_topic 32 | 33 | if __name__ == '__main__': 34 | def demonstrate(X, k, desc): 35 | print(desc) 36 | p_topic_when_text, p_word_when_topic = plsa(X, k=k) 37 | print("The probabilities of each topic for each text are") 38 | print(np.round(p_topic_when_text, 2)) 
39 | print("The probabilities of each word for each topic are") 40 | print(np.round(p_word_when_topic, 2)) 41 | print("The recovered text-wordcnt matrix is") 42 | print(np.round((p_topic_when_text @ p_word_when_topic).T, 2)) 43 | print() 44 | 45 | X = np.array([ 46 | [0, 0, 1, 1, 0, 0, 0, 0, 0], 47 | [0, 0, 0, 0, 0, 1, 0, 0, 1], 48 | [0, 1, 0, 0, 0, 0, 0, 1, 0], 49 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 50 | [1, 0, 0, 0, 0, 1, 0, 0, 0], 51 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 52 | [1, 0, 1, 0, 0, 0, 0, 0, 0], 53 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 54 | [0, 0, 0, 0, 0, 2, 0, 0, 1], 55 | [1, 0, 1, 0, 0, 0, 0, 1, 0], 56 | [0, 0, 0, 1, 1, 0, 0, 0, 0], 57 | ]).astype(float) 58 | demonstrate(X, 3, 'Example 1') 59 | 60 | X = np.array([ 61 | [0, 0, 1, 1, 0, 0, 0, 0, 0], 62 | [0, 0, 0, 0, 0, 1, 0, 0, 1], 63 | [0, 1, 0, 0, 0, 0, 0, 1, 0], 64 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 65 | [1, 0, 0, 0, 0, 1, 0, 0, 0], 66 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 67 | [1, 0, 1, 0, 0, 0, 0, 0, 0], 68 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 69 | [0, 0, 0, 0, 0, 2, 0, 0, 1], 70 | [1, 0, 1, 0, 0, 0, 0, 1, 0], 71 | [0, 0, 0, 1, 1, 0, 0, 0, 0], 72 | ]).astype(float) 73 | demonstrate(X, max(X.shape), 'Example 2: You can recogonize the original matrix from the recovered one if k is large enough') 74 | -------------------------------------------------------------------------------- /19.MCMC/GibbsSampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.stats import gaussian_kde 4 | 5 | 6 | def gibbs_sampling(dim, conditional_sampler, x0=None, burning_steps=1000, max_steps=10000, epsilon=1e-8, verbose=False): 7 | """ 8 | Given a conditionl sampler which samples from p(x_j | x_1, x_2, ... x_n) 9 | return a list of samples x ~ p, where p is the original distribution of the conditional distribution. 10 | x0 is the initial value of x. If not specified, it's set as zero vector. 11 | conditional_sampler takes (x, j) as parameters 12 | """ 13 | x = np.zeros(dim) if x0 is None else x0 14 | samples = np.zeros([max_steps - burning_steps, dim]) 15 | for i in range(max_steps): 16 | for j in range(dim): 17 | x[j] = conditional_sampler(x, j) 18 | if verbose: 19 | print("New value of x is", x_new) 20 | if i >= burning_steps: 21 | samples[i - burning_steps] = x 22 | return samples 23 | 24 | 25 | if __name__ == '__main__': 26 | def demonstrate(dim, p, desc, **args): 27 | samples = gibbs_sampling(dim, p, **args) 28 | z = gaussian_kde(samples.T)(samples.T) 29 | plt.scatter(samples[:, 0], samples[:, 1], c=z, marker='.') 30 | plt.plot(samples[: 100, 0], samples[: 100, 1], 'r-') 31 | plt.title(desc) 32 | plt.show() 33 | 34 | # example 1: 35 | mean = np.array([2, 3]) 36 | covariance = np.array([[1, 0], 37 | [0, 1]]) 38 | covariance_inv = np.linalg.inv(covariance) 39 | det_convariance = 1 40 | def gaussian_sampler1(x, j): 41 | return np.random.normal() 42 | demonstrate(2, gaussian_sampler1, "Gaussian distribution with mean of 0 and 0") 43 | 44 | # example 2: 45 | mean = np.array([2, 3]) 46 | covariance = np.array([[1, 0], 47 | [0, 1]]) 48 | covariance_inv = np.linalg.inv(covariance) 49 | det_convariance = 1 50 | def gaussian_sampler2(x, j): 51 | if j == 0: 52 | return np.random.normal(2) 53 | else: 54 | return np.random.normal(3) 55 | demonstrate(2, gaussian_sampler2, "Gaussian distribution with mean of 2 and 3") 56 | 57 | # example 3: 58 | def blocks_sampler(x, j): 59 | sample = np.random.random() 60 | if sample > .5: 61 | sample += 1. 
62 | return sample 63 | demonstrate(2, blocks_sampler, "Four blocks") 64 | 65 | # example 4: 66 | def blocks_sampler(x, j): 67 | sample = np.random.random() 68 | if sample > .5: 69 | sample += 100. 70 | return sample 71 | demonstrate(2, blocks_sampler, "Four blocks with large gap.") 72 | -------------------------------------------------------------------------------- /19.MCMC/MetropolisHasting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.stats import gaussian_kde 4 | 5 | 6 | def gaussian_kernel(x1, x2): 7 | return np.exp(-((x1 - x2) ** 2).sum()) 8 | 9 | def gaussian_sampler(x): 10 | return np.random.normal(x) 11 | 12 | def metropolis_hasting(dim, p, q=gaussian_kernel, q_sampler=gaussian_sampler, x0=None, burning_steps=1000, max_steps=10000, epsilon=1e-8, verbose=False): 13 | """ 14 | Given a distribution function p (it doesn't need to be a probability, a likelihood function is enough), 15 | and the recommended distribution q, 16 | return a list of samples x ~ p, 17 | where the number of samples is max_steps - burning_steps. 18 | q_sampler is a function taking an x as input and return a sample of q(x_new | x_old). 19 | q is a distribution function representing q(x_new | x_old). 20 | q takes (x_old, x_new) as parameters. 21 | """ 22 | x = np.zeros(dim) if x0 is None else x0 23 | samples = np.zeros([max_steps - burning_steps, dim]) 24 | for i in range(max_steps): 25 | x_new = q_sampler(x) 26 | accept_prob = (p(x_new) + epsilon) / (p(x) + epsilon) * q(x, x_new) / q(x_new, x) 27 | if verbose: 28 | print("New value of x is", x_new) 29 | if np.random.random() < accept_prob: 30 | x = x_new 31 | elif verbose: 32 | print("New value is dropped") 33 | if i >= burning_steps: 34 | samples[i - burning_steps] = x 35 | return samples 36 | 37 | 38 | if __name__ == '__main__': 39 | def demonstrate(dim, p, desc, **args): 40 | samples = metropolis_hasting(dim, p, **args) 41 | z = gaussian_kde(samples.T)(samples.T) 42 | plt.scatter(samples[:, 0], samples[:, 1], c=z, marker='.') 43 | plt.plot(samples[: 100, 0], samples[: 100, 1], 'r-') 44 | plt.title(desc) 45 | plt.show() 46 | 47 | # example 1: 48 | mean = np.array([2, 3]) 49 | covariance = np.array([[1, 0], 50 | [0, 1]]) 51 | covariance_inv = np.linalg.inv(covariance) 52 | det_convariance = 1 53 | def gaussian1(x): 54 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 55 | demonstrate(2, gaussian1, "Gaussian distribution with mean of 2 and 3") 56 | 57 | # example 2: 58 | mean = np.array([2, 3]) 59 | covariance = np.array([[1, .5], 60 | [.5, 1]]) 61 | covariance_inv = np.linalg.inv(covariance) 62 | det_convariance = 1 63 | def gaussian2(x): 64 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 65 | demonstrate(2, gaussian2, "Gaussian distribution with mean of 2 and 3") 66 | 67 | # example 3: 68 | def blocks(x): 69 | if (0 < x[0] < 1 or 2 < x[0] < 3) and (0 < x[1] < 1 or 2 < x[1] < 3): 70 | return 1 71 | return 0 72 | demonstrate(2, blocks, "Four blocks") 73 | 74 | # example 4: 75 | def blocks(x): 76 | if (0 < x[0] < 1 or 200 < x[0] < 300) and (0 < x[1] < 1 or 200 < x[1] < 300): 77 | return 1 78 | return 0 79 | demonstrate(2, blocks, "Four blocks with large gap. 
(Monte Carlo doesn't solve everything)") 80 | -------------------------------------------------------------------------------- /19.MCMC/SingleComponentMetropolisHasting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.stats import gaussian_kde 4 | 5 | 6 | def gaussian_kernel(x, j, xj_new): 7 | return np.exp(-(x[j] - xj_new) ** 2) 8 | 9 | def gaussian_sampler(x, j): 10 | return np.random.normal(x[j]) 11 | 12 | def single_component_metropolis_hasting(dim, p, q=gaussian_kernel, q_sampler=gaussian_sampler, x0=None, burning_steps=1000, max_steps=10000, epsilon=1e-8, verbose=False): 13 | """ 14 | Given a distribution function p (it doesn't need to be a probability, a likelihood function is enough), 15 | and the proposal distribution q, 16 | return a list of samples x ~ p, 17 | where the number of samples is max_steps - burning_steps. 18 | q_sampler is a function taking (x, j) as input and returning a sample of q(xj_new | xj_old, old_x_without_xj) 19 | q is a distribution function representing q(xj_new | xj_old, old_x_without_xj). 20 | q takes (x, j, xj_new) as parameters, 21 | where x is the value of the variable at the last step, 22 | j is the index of the parameter chosen to be updated, 23 | xj_new is the new value of x_j. 24 | x0 is the initial value of x. If not specified, it's set as zero vector. 25 | """ 26 | x = np.zeros(dim) if x0 is None else x0 27 | samples = np.zeros([max_steps - burning_steps, dim]) 28 | for i in range(max_steps): 29 | for j in range(dim): 30 | xj_new = q_sampler(x, j) 31 | x_new = x.copy() 32 | x_new[j] = xj_new 33 | accept_prob = (p(x_new) + epsilon) / (p(x) + epsilon) * q(x, j, xj_new) / q(x_new, j, x[j]) 34 | if verbose: 35 | print("New value of x is", x_new) 36 | if np.random.random() < accept_prob: 37 | x = x_new 38 | elif verbose: 39 | print("New value is dropped") 40 | if i >= burning_steps: 41 | samples[i - burning_steps] = x 42 | return samples 43 | 44 | 45 | if __name__ == '__main__': 46 | def demonstrate(dim, p, desc, **args): 47 | samples = single_component_metropolis_hasting(dim, p, **args) 48 | z = gaussian_kde(samples.T)(samples.T) 49 | plt.scatter(samples[:, 0], samples[:, 1], c=z, marker='.') 50 | plt.plot(samples[: 100, 0], samples[: 100, 1], 'r-') 51 | plt.title(desc) 52 | plt.show() 53 | 54 | # example 1: 55 | mean = np.array([2, 3]) 56 | covariance = np.array([[1, 0], 57 | [0, 1]]) 58 | covariance_inv = np.linalg.inv(covariance) 59 | det_convariance = 1 60 | def gaussian1(x): 61 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 62 | demonstrate(2, gaussian1, "Gaussian distribution with mean of 2 and 3") 63 | 64 | # example 2: 65 | mean = np.array([2, 3]) 66 | covariance = np.array([[1, .5], 67 | [.5, 1]]) 68 | covariance_inv = np.linalg.inv(covariance) 69 | det_convariance = 1 70 | def gaussian2(x): 71 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 72 | demonstrate(2, gaussian2, "Gaussian distribution with mean of 2 and 3") 73 | 74 | # example 3: 75 | def blocks(x): 76 | if (0 < x[0] < 1 or 2 < x[0] < 3) and (0 < x[1] < 1 or 2 < x[1] < 3): 77 | return 1 78 | return 0 79 | demonstrate(2, blocks, "Four blocks") 80 | 81 | # example 4: 82 | def blocks(x): 83 | if (0 < x[0] < 1 or 200 < x[0] < 300) and (0 < x[1] < 1 or 200 < x[1] < 300): 84 | return 1 85 | return 0 86 | demonstrate(2, blocks, "Four blocks with large gap.
(Monte Carlo doesn't solve everything)") 87 | -------------------------------------------------------------------------------- /20.LDA/LDA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | from itertools import chain 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import * 8 | 9 | def lda(texts, word_prior_cnt=None, topic_prior_cnt=None, k=5, max_iteration=1000, epsilon=1e-8): 10 | """ 11 | given a list of token lists, tokens are integers from [0, n_word]. 12 | return the topic distribution of each document, 13 | and the word distribution of each topic. 14 | """ 15 | n_word = max(chain(*texts)) + 1 16 | n_text = len(texts) 17 | 18 | n_text_topic = np.zeros([n_text, k]) + epsilon 19 | n_topic_word = np.zeros([k, n_word]) + epsilon 20 | if topic_prior_cnt is not None: 21 | n_text_topic += topic_prior_cnt[None, :] 22 | if word_prior_cnt is not None: 23 | n_topic_word += word_prior_cnt[None, :] 24 | 25 | topic = [[np.random.choice(k) for word in text] for text in texts] 26 | for i, (text, text_topic) in enumerate(zip(texts, topic)): 27 | for word, word_topic in zip(text, text_topic): 28 | n_text_topic[i, word_topic] += 1 29 | n_topic_word[word_topic, word] += 1 30 | 31 | for step in range(max_iteration): 32 | for i, (text, text_topic) in enumerate(zip(texts, topic)): 33 | for j, (word, word_topic) in enumerate(zip(text, text_topic)): 34 | # reduce the current value from the count 35 | n_text_topic[i, word_topic] -= 1 36 | n_topic_word[word_topic, word] -= 1 37 | # infer the current value from count of others 38 | likelihood_word_topic = n_topic_word[:, word] / n_topic_word.sum(axis=-1) 39 | likelihood_topic = n_text_topic[i, :] / n_text_topic[i, :].sum(axis=-1) 40 | likelihood_topic *= likelihood_word_topic 41 | p_topic = likelihood_topic / likelihood_topic.sum() 42 | # update count 43 | topic[i][j] = np.random.choice(k, p=p_topic) 44 | n_text_topic[i, topic[i][j]] += 1 45 | n_topic_word[topic[i][j], word] += 1 46 | 47 | p_topic_when_text = n_text_topic / n_text_topic.sum(axis=-1, keepdims=True) 48 | p_word_when_topic = n_topic_word / n_topic_word.sum(axis=-1, keepdims=True) 49 | return p_topic_when_text, p_word_when_topic 50 | 51 | if __name__ == '__main__': 52 | def demonstrate(X, k, desc, **args): 53 | print(desc) 54 | p_topic_when_text, p_word_when_topic = lda(X, k=k, **args) 55 | print("The probabilities of each topic for each text are") 56 | print(np.round(p_topic_when_text, 2)) 57 | print("The probabilities of each word for each topic are") 58 | print(np.round(p_word_when_topic, 2)) 59 | print("The recovered text-wordcnt matrix is") 60 | print(np.round((p_topic_when_text @ p_word_when_topic), 2)) 61 | print() 62 | 63 | n_vocab = 9 64 | X = [ 65 | [2, 3], 66 | [5, 8], 67 | [1, 7], 68 | [6, 8], 69 | [0, 5], 70 | [0, 1, 2, 3, 4, 5, 6, 7, 8], 71 | [0, 2], 72 | [6, 8], 73 | [5, 5, 8], 74 | [0, 2, 7], 75 | [3, 4] 76 | ] 77 | demonstrate(X, 3, 'Example 1') 78 | demonstrate(X, 8, 'Example 2: You can recogonize the original matrix from the recovered one if k is large enough') 79 | 80 | k = 8 81 | word_prior_cnt = np.ones(n_vocab) * 2 82 | topic_prior_cnt = np.ones(k) * 2 83 | demonstrate(X, k, 'Example 3: The influence of prior', word_prior_cnt=word_prior_cnt, topic_prior_cnt=topic_prior_cnt) 84 | 85 | k = 8 86 | word_prior_cnt = np.ones(n_vocab) * 2 87 | topic_prior_cnt = np.zeros(k) 88 | topic_prior_cnt[3] = 5 89 | demonstrate(X, k, 
'Example 4: The influence of prior', word_prior_cnt=word_prior_cnt, topic_prior_cnt=topic_prior_cnt) 90 | -------------------------------------------------------------------------------- /21.PageRank/PageRank.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def pageRank(graph, d, max_iteration=1000, epsilon=1e-8): 9 | """ 10 | given an n * n link graph 11 | graph[i, j] = 1 means that there is a link from i to j 12 | d is the damping factor in the definition of PageRank 13 | return the probability of a user visiting each page 14 | """ 15 | n, _ = graph.shape 16 | p = np.ones(n) / n 17 | graph /= (graph.sum(axis=-1, keepdims=True) + epsilon) 18 | graph = graph.T 19 | for i in range(max_iteration): 20 | pre_p = p 21 | p = d * graph @ p + (1 - d) / n 22 | if max(p - pre_p) < epsilon: 23 | break 24 | return p 25 | 26 | if __name__ == '__main__': 27 | def demonstrate(graph, d, desc): 28 | print(desc) 29 | p = pageRank(graph, d=d) 30 | print('The probability of each node is', np.round(p, 2)) 31 | 32 | graph = np.array( 33 | [[0, 1, 1, 1], 34 | [1, 0, 0, 1], 35 | [0, 0, 1, 0], 36 | [0, 1, 1, 0]] 37 | ).astype(float) 38 | demonstrate(graph, .8, 'Example 1') 39 | 40 | graph = np.array( 41 | [[0, 1, 1], 42 | [0, 0, 1], 43 | [1, 0, 0]] 44 | ).astype(float) 45 | demonstrate(graph, .85, 'Example 2') 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical-Learning-Methods (中文文档请往下翻) 2 | 3 | Implementations of all the algorithms introduced in _Statistical Learning Methods_ by Li Hang. 4 | 5 | ## Features 6 | 7 | - **Complete**. All the algorithms introduced by this book are implemented, including 8 | - kNN powered by kd-tree. 9 | - maximum entropy model. I cannot find any other repo that implements this algorithm. 10 | - linear chain conditional random field. I cannot find this model in any other similar repo. 11 | - HMM powered by Baum-Welch. Most repos only provide an HMM trained by counting. 12 | - **Detailed**. All the algorithms are implemented thoroughly. I try my best not to skip any detail. For example, 13 | - about how to select the best of the pruned CART subtrees by cross-validation, I asked Dr. Li Hang by e-mail and got a detailed answer. Many thanks to Dr. Li Hang for his patience and kindness. 14 | - **Matrix calculation**. `for` loops are stripped off wherever possible; most of the algorithms are implemented with matrix calculations supported by `numpy`. 15 | - **Extensible**. It is easy to apply the code to new datasets because, to a large extent, all the algorithms are controllable through parameters. 16 | - **Examples**. Each algorithm comes with some examples. Just run the model file and you will see them. If you have better examples that help others understand the model, please feel free to open a PR. 17 | 18 | ## Dependencies 19 | 20 | - Python3 21 | - numpy 22 | - matplotlib 23 | - [rich](https://github.com/willmcgugan/rich) 24 | 25 | ## Usage 26 | 27 | Just run any single file located in each chapter. You will see examples of the algorithm.
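For example, `python 02.Perceptron/perceptron.py` shows the perceptron demo; any other chapter's file can be run the same way.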
28 | 29 | --- 30 | 31 | # 统计学习方法 32 | 33 | 李航博士《统计学习方法》一书的**硬核** Python 实现。 34 | 35 | ## 项目特色 36 | 37 | GitHub 上有许多实现《统计学习方法》的仓库。本仓库与它们的不同之处在于: 38 | 39 | - **完整性**。实现了**所有**模型。包括 40 | - KD 树支持的 KNN 模型。 41 | - **最大熵模型**。我没有找到其他任何一个仓库实现了该算法。 42 | - **线性链条件随机场**。我同样没有找到其他任何一个仓库实现了该算法。这个模型花费了我一个月的时间去理解和实现。 43 | - Baum-Welch 算法支持的 HMM 算法。大多数仓库实现的 HMM 算法都是简单的计数模型。 44 | - **细节**。所有的算法我都在尽力**完全**实现。比如说 45 | - 有关如何用交叉验证法选取剪枝的 CART 树,我特意邮件询问了李航博士并得到了耐心的解答。在此非常感谢李航博士的支持! 46 | - **矩阵运算**。我不喜欢用循环。你可以看到本仓库中的算法使用了大量的矩阵运算来避免使用循环。 47 | - **可扩展性**。其他仓库的算法可能会在可扩展性上偷懒。比如 GMM 模型可能只实现了两个聚类的简单版本用于演示。而本仓库中的算法尽量将所有可调节部分作为模型参数,以供自由修改使用。 48 | - **示例**。每个算法都加上了我认为会增强读者对算法理解的例子。当然我认为这部分目前还是不太完善的。如果你对如何举例有更好的见解,欢迎给我提 PR。 49 | 50 | ## 项目依赖 51 | 52 | - Python3 53 | - numpy 54 | - matplotlib 55 | - [rich](https://github.com/willmcgugan/rich) 56 | 57 | ## 如何使用 58 | 59 | 直接使用 Python 运行任意一个文件夹内的模型文件,你就可以看到算法示例了。 60 | 61 | ## 目录 62 | 63 | - [第 2 章 - 感知机](02.Perceptron) 64 | - [感知机](02.Perceptron/perceptron.py) 65 | - [第 3 章 - k 近邻法](03.KNN) 66 | - [k 近邻模型](03.KNN/knn.py) 67 | - [k 近邻模型 - 使用 KD 树实现](03.KNN/knn_kdtree.py) 68 | - [第 4 章 - 朴素贝叶斯法](04.NaiveBayes) 69 | - [使用极大似然估计的朴素贝叶斯模型](04.NaiveBayes/NaiveBayesMLE.py) 70 | - [使用贝叶斯估计的朴素贝叶斯模型](04.NaiveBayes/NaiveBayesMAP.py) 71 | - [第 5 章 - 决策树](05.DecisionTree) 72 | - [ID3 决策树](05.DecisionTree/ID3.py) 73 | - [C4.5 决策树](05.DecisionTree/C4.5.py) 74 | - [决策树剪枝算法](05.DecisionTree/prune.py) 75 | - [分类 CART 决策树](05.DecisionTree/ClassificationCART.py) 76 | - [分类 CART 决策树剪枝算法](05.DecisionTree/pruneClassificationCART.py) 77 | - [回归 CART 决策树](05.DecisionTree/RegressionCART.py) 78 | - [第 6 章 - 逻辑斯谛回归与最大熵模型](06.LogisticRegression-MaxEntropy) 79 | - [逻辑斯谛回归模型](06.LogisticRegression-MaxEntropy/BinaryLogisticRegression.py) 80 | - [最大熵模型](06.LogisticRegression-MaxEntropy/MaxEntropy.py) 81 | - [第 7 章 - 支持向量机](07.SVM) 82 | - [支持向量机](07.SVM/SVM.py) 83 | - [第 8 章 - 提升方法](08.Boosting) 84 | - [AdaBoost](08.Boosting/AdaBoost.py) 85 | - [梯度提升树](08.Boosting/GBDT.py) 86 | - [第 9 章 - EM 算法及其推广](09.EM) 87 | - [高斯混合模型](09.EM/GMM.py) 88 | - [第 10 章 - 隐马尔科夫模型](10.HMM) 89 | - [前向算法](10.HMM/Forward.py) 90 | - [后向算法](10.HMM/Backward.py) 91 | - [维特比算法](10.HMM/Viterbi.py) 92 | - [Baum-Welch 算法](10.HMM/BaumWelch.py) 93 | - [使用 Baum-Welch 算法训练的隐马尔可夫模型](10.HMM/HMM.py) 94 | - [第 11 章 - 条件随机场](11.ConditionalRandomField) 95 | - [线性链条件随机场](11.ConditionalRandomField/LinearChainConditionalRandomField.py) 96 | - [第 14 章 - 聚类方法](14.Cluster) 97 | - [层次聚类](14.Cluster/Agglomerative.py) 98 | - [k 均值聚类](14.Cluster/KMeans.py) 99 | - [第 15 章 - 奇异值分解](15.SVD) 100 | - [奇异值分解](15.SVD/SVD.py) 101 | - [第 16 章 - 主成分分析](16.PCA) 102 | - [主成分分析](16.PCA/PCA.py) 103 | - [第 17 章 - 潜在语义分析](17.LSA) 104 | - [潜在语义分析模型](17.LSA/LSA.py) 105 | - [第 18 章 - 概率潜在语义分析](18.PLSA) 106 | - [概率潜在语义分析模型](18.PLSA/PLSA.py) 107 | - [第 19 章 - 马尔可夫蒙特卡罗法](19.MCMC) 108 | - [Metropolis-Hasting 算法](19.MCMC/MetropolisHasting.py) 109 | - [单分量的 Metropolis-Hasting 算法](19.MCMC/SingleComponentMetropolisHasting.py) 110 | - [吉布斯采样](19.MCMC/GibbsSampling.py) 111 | - [第 20 章 - 潜在狄利克雷分配](20.LDA) 112 | - [潜在狄利克雷分配模型](20.LDA/LDA.py) 113 | - [第 21 章 - PageRank 算法](21.PageRank) 114 | - [PageRank 算法](21.PageRank/PageRank.py) 115 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SleepyBag/Statistical-Learning-Methods/c16edf2d56f9f7c00651c749464b74b9ec039522/__init__.py 
-------------------------------------------------------------------------------- /test_get_solution_domain.py: -------------------------------------------------------------------------------- 1 | from utils import row_echelon, get_solution_domain 2 | import numpy as np 3 | 4 | for i in range(100): 5 | print('processing ', i) 6 | a = np.random.rand(100, 50) 7 | re = row_echelon(a) 8 | assert(len(re) == np.linalg.matrix_rank(a)) 9 | zero = a @ get_solution_domain(re) 10 | assert((zero == 0.).all()) 11 | 12 | for i in range(100): 13 | print('processing ', i) 14 | a = np.random.rand(5, 10) 15 | re = row_echelon(a) 16 | assert(len(re) == np.linalg.matrix_rank(a)) 17 | zero = a @ get_solution_domain(re) 18 | assert((abs(zero) < 1e-8).all()) 19 | -------------------------------------------------------------------------------- /test_heap.py: -------------------------------------------------------------------------------- 1 | from utils import Heap 2 | 3 | heap = Heap([3, 1, 2]) 4 | heap.push(1) 5 | heap.push(2) 6 | a = [i for i in heap] 7 | 8 | assert(a == [1, 1, 2, 2, 3]) 9 | -------------------------------------------------------------------------------- /test_information_gain.py: -------------------------------------------------------------------------------- 1 | from utils import information_gain, entropy 2 | from collections import Counter 3 | from math import fabs 4 | 5 | eps = 1e-3 6 | 7 | X = [ 8 | ['青年', '否', '否', '一般'], 9 | ['青年', '否', '否', '好'], 10 | ['青年', '是', '否', '好'], 11 | ['青年', '是', '是', '一般'], 12 | ['青年', '否', '否', '一般'], 13 | ['中年', '否', '否', '一般'], 14 | ['中年', '否', '否', '好'], 15 | ['中年', '是', '是', '好'], 16 | ['中年', '否', '是', '非常好'], 17 | ['中年', '否', '是', '非常好'], 18 | ['老年', '否', '是', '非常好'], 19 | ['老年', '否', '是', '好'], 20 | ['老年', '是', '否', '好'], 21 | ['老年', '是', '否', '非常好'], 22 | ['老年', '否', '否', '一般'], 23 | ] 24 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 25 | 26 | assert(fabs(entropy(Counter(Y).values()) - .971) < eps) 27 | assert(fabs(information_gain(X, Y, 0) - .083) < eps) 28 | assert(fabs(information_gain(X, Y, 1) - .324) < eps) 29 | assert(fabs(information_gain(X, Y, 2) - .420) < eps) 30 | assert(fabs(information_gain(X, Y, 3) - .363) < eps) 31 | -------------------------------------------------------------------------------- /test_line_search.py: -------------------------------------------------------------------------------- 1 | from utils import line_search 2 | 3 | class F: 4 | def __init__(self, n): 5 | self.n = n 6 | 7 | def __call__(self, x): 8 | return (x - self.n) ** 2 9 | 10 | f = F(0) 11 | epsilon = 1e-6 12 | for i in range(-1000, 1000): 13 | f.n = i 14 | assert(abs(line_search(f, -2000, 2000, epsilon) - i) <= epsilon) 15 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | import heapq 4 | from math import inf, nan 5 | from math import log, sqrt 6 | from collections import Counter 7 | 8 | # ------------------ Basic Structures ----------------------------------------- 9 | class Heap: 10 | def __init__(self, arr=None, key=lambda x: x, max_len=inf): 11 | self.key = key 12 | self.max_len = max_len 13 | if not arr: 14 | self.h = [] 15 | else: 16 | self.h = [(self.key(i), i) for i in arr] 17 | heapq.heapify(self.h) 18 | self.i = 0 19 | 20 | def __len__(self): 21 | return len(self.h) 22 | 23 | def __bool__(self): 24 | return len(self.h) != 0 
25 | 26 | def __iter__(self): 27 | while self: 28 | yield self.pop() 29 | 30 | def push(self, x): 31 | # insert an number to the middle so that `x` will be never compared 32 | # because maybe `x` doesn't have comparing operator defined 33 | heapq.heappush(self.h, (self.key(x), self.i, x)) 34 | self.i += 1 35 | if len(self.h) > self.max_len: 36 | self.pop() 37 | 38 | def top(self): 39 | return self.h[0][-1] 40 | 41 | def top_key(self): 42 | return self.h[0][0] 43 | 44 | def pop(self): 45 | return heapq.heappop(self.h)[-1] 46 | 47 | # ------------------ Functions ------------------------------------------------ 48 | def argmax(arr, key=lambda x: x): 49 | arr = [key(a) for a in arr] 50 | ans = max(arr) 51 | return arr.index(ans), ans 52 | 53 | def argmin(arr, key=lambda x: x): 54 | arr = [key(a) for a in arr] 55 | ans = min(arr) 56 | return arr.index(ans), ans 57 | 58 | def sigmoid(x): 59 | return 1 / (np.exp(-x) + 1) 60 | 61 | def binary_cross_entropy(pred, Y): 62 | loss = -(Y * np.log(pred) + (1 - Y) * np.log(1 - pred)).sum() 63 | return loss 64 | 65 | def softmax(logits, axis=-1): 66 | exps = np.exp(logits) 67 | return exps / exps.sum(axis=axis, keepdims=True) 68 | 69 | def line_search(f, l, r, epsilon=1e-6): 70 | """find the minimum point of a convex function""" 71 | lrate = (3 - sqrt(5)) / 2 72 | rrate = (sqrt(5) - 1) / 2 73 | fll, frr = None, None 74 | while r - l >= epsilon: 75 | if fll is None: 76 | ll = l + (r - l) * lrate 77 | fll = f(ll) 78 | if frr is None: 79 | rr = l + (r - l) * rrate 80 | frr = f(rr) 81 | if fll < frr: 82 | r, rr = rr, ll 83 | frr, fll = fll, None 84 | elif fll > frr: 85 | l, ll = ll, rr 86 | fll, frr = frr, None 87 | else: 88 | l, r = ll, rr 89 | fll, frr = None, None 90 | return (l + r) / 2 91 | 92 | def newton(f, g, x0, epsilon=1e-6): 93 | """ 94 | Find the zero point wehre f(x) = 0 of function f 95 | g(x) is the gradient function of f 96 | """ 97 | prex = x0 98 | x = x0 - f(x0) / g(x0) 99 | while abs(x - prex) > epsilon: 100 | prex, x = x, x - f(x) / g(x) 101 | return x 102 | 103 | def one_hot(i, size): 104 | """Given a hot number the tensor size, return the one-hot tensor""" 105 | ans = np.zeros(size) 106 | ans[i] = 1 107 | return ans 108 | 109 | def row_echelon(A): 110 | """ 111 | eliminate a matrix to row echelon form with gaussian elimination 112 | """ 113 | # convert A to row echolon form 114 | row_cnt, col_cnt = A.shape 115 | col = 0 116 | rank = 0 117 | # from top to the bottom 118 | for i in range(row_cnt): 119 | find = False 120 | while not find and col < col_cnt: 121 | # look for the first non-zero value in current column 122 | for j in range(i, row_cnt): 123 | if A[j][col] != 0.: 124 | if i != j: 125 | A[[i, j]] = A[[j, i]] 126 | A[i] /= A[i][col] 127 | find = True 128 | # if non-zero value found, start elimination 129 | for k in range(i + 1, row_cnt): 130 | A[k] -= A[i] * A[k][col] 131 | rank += 1 132 | break 133 | # if not found, check the next column 134 | else: 135 | col += 1 136 | col += 1 137 | # from bottom to the top 138 | for i in range(row_cnt - 1, -1, -1): 139 | # find the first non-zero value and eliminate 140 | for col in range(col_cnt): 141 | if A[i][col] != 0.: 142 | # start elimination 143 | for k in range(i - 1, -1, -1): 144 | A[k] -= A[i] * A[k][col] / A[i][col] 145 | break 146 | return A[: rank] 147 | 148 | def get_solution_domain(A): 149 | """ 150 | get a group of linearly independent solutions of Ax=0, which are normalized 151 | the input A is supposed to be in row echelon form 152 | """ 153 | row_cnt, col_cnt = A.shape 154 | 
A = row_echelon(A) 155 | col = 0 156 | nonzero_cols = [] 157 | ans = [] 158 | for i in range(row_cnt): 159 | while col != col_cnt and A[i][col] == 0.: 160 | ans.append(one_hot(col, col_cnt)) 161 | for j, j_col in enumerate(nonzero_cols): 162 | print(j, j_col) 163 | ans[-1][j_col] = -A[j][col] 164 | col += 1 165 | # record the first nonzero value of each row 166 | nonzero_cols.append(col) 167 | col += 1 168 | 169 | for col in range(col, col_cnt): 170 | ans.append(one_hot(col, col_cnt)) 171 | for i, j in enumerate(nonzero_cols): 172 | ans[-1][j] = -A[i][col] 173 | if ans: 174 | ans = np.stack(ans) 175 | ans /= np.linalg.norm(ans, axis=-1, keepdims=True) 176 | else: 177 | ans = np.zeros([0, col_cnt]) 178 | return ans.T 179 | 180 | # ------------------ Decision Trees ------------------------------------------- 181 | def entropy(p): 182 | s = sum(p) 183 | p = [i / s for i in p] 184 | ans = sum(-i * log(i, 2) for i in p) 185 | return ans 186 | 187 | def entropy_of_split(X, Y, col): 188 | """calculate the conditional entropy of splitting data by col""" 189 | val_cnt = Counter(x[col] for x in X) 190 | ans = 0 191 | for val in val_cnt: 192 | weight = val_cnt[val] / len(X) 193 | entr = entropy(Counter(y for x, y in zip(X, Y) if x[col] == val).values()) 194 | ans += weight * entr 195 | return ans 196 | 197 | def information_gain(X, Y, col): 198 | entropy_of_X = entropy(Counter(Y).values()) 199 | entropy_of_col = entropy_of_split(X, Y, col) 200 | return entropy_of_X - entropy_of_col 201 | 202 | def information_gain_ratio(X, Y, col): 203 | information_gain_of_col = information_gain(X, Y, col) 204 | entropy_of_col = entropy(Counter(x[col] for x in X).values()) 205 | return information_gain_of_col / entropy_of_col 206 | 207 | def gini(Y): 208 | cnt = Counter(Y) 209 | ans = 0. 210 | for y in cnt: 211 | ans += (cnt[y] / len(Y)) ** 2 212 | return 1 - ans 213 | 214 | # ------------------ Geometry ------------------------------------------------- 215 | def kbline(k, b, **args): 216 | """Plot a line from slope and intercept""" 217 | axes = plt.gca() 218 | x_vals = np.array(axes.get_xlim()) 219 | y_vals = b + k * x_vals 220 | plt.plot(x_vals, y_vals, **args) 221 | 222 | def wbline(w, b, **args): 223 | if w[1] == 0: 224 | plt.vlines(-b / w[0], *plt.gca().get_ylim(), **args) 225 | else: 226 | k = -w[0] / w[1] 227 | b /= -w[1] 228 | kbline(k, b, **args) 229 | 230 | def euc_dis(a, b): 231 | return np.linalg.norm(a - b, axis=-1) 232 | --------------------------------------------------------------------------------