├── .gitignore ├── 02.Perceptron └── perceptron.py ├── 03.KNN ├── __init__.py ├── knn.py ├── knn_kdtree.py └── test_kdtree.py ├── 04.NaiveBayes ├── NaiveBayesMAP.py └── NaiveBayesMLE.py ├── 05.DecisionTree ├── C4.5.py ├── ClassificationCART.py ├── ID3.py ├── RegressionCART.py ├── prune.py └── pruneClassificationCART.py ├── 06.LogisticRegression-MaxEntropy ├── BinaryLogisticRegression.py └── MaxEntropy.py ├── 07.SVM └── SVM.py ├── 08.Boosting ├── AdaBoost.py └── GBDT.py ├── 09.EM ├── GMM.py ├── GMMGradientDescent.py └── benchmark.py ├── 10.HMM ├── Backward.py ├── BaumWelch.py ├── Forward.py ├── HMM.py └── Viterbi.py ├── 11.ConditionalRandomField └── LinearChainConditionalRandomField.py ├── 14.Cluster ├── Agglomerative.py └── KMeans.py ├── 15.SVD └── SVD.py ├── 16.PCA └── PCA.py ├── 17.LSA └── LSA.py ├── 18.PLSA └── PLSA.py ├── 19.MCMC ├── GibbsSampling.py ├── MetropolisHasting.py └── SingleComponentMetropolisHasting.py ├── 20.LDA └── LDA.py ├── 21.PageRank └── PageRank.py ├── README.md ├── __init__.py ├── test_get_solution_domain.py ├── test_heap.py ├── test_information_gain.py ├── test_line_search.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | pytestdebug.log 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | doc/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | ### Emacs 138 | # -*- mode: gitignore; -*- 139 | *~ 140 | \#*\# 141 | /.emacs.desktop 142 | /.emacs.desktop.lock 143 | *.elc 144 | auto-save-list 145 | tramp 146 | .\#* 147 | 148 | # Org-mode 149 | .org-id-locations 150 | *_archive 151 | 152 | # flymake-mode 153 | *_flymake.* 154 | 155 | # eshell files 156 | /eshell/history 157 | /eshell/lastdir 158 | 159 | # elpa packages 160 | /elpa/ 161 | 162 | # reftex files 163 | *.rel 164 | 165 | # AUCTeX auto folder 166 | /auto/ 167 | 168 | # cask packages 169 | .cask/ 170 | dist/ 171 | 172 | # Flycheck 173 | flycheck_*.el 174 | 175 | # server auth directory 176 | /server/ 177 | 178 | # projectiles files 179 | .projectile 180 | 181 | # directory configuration 182 | .dir-locals.el 183 | 184 | # network security 185 | /network-security.data 186 | 187 | GPATH 188 | GRTAGS 189 | GTAGS 190 | -------------------------------------------------------------------------------- /02.Perceptron/perceptron.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | from rich.console import Console 5 | from rich.table import Table 6 | import sys 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | 11 | class Perceptron: 12 | def __init__(self, lr=1e-1, max_iteration=2000, verbose=False): 13 | self.lr = lr 14 | self.verbose = verbose 15 | self.max_iteration = max_iteration 16 | 17 | def _trans(self, x): 18 | return self.w @ x + self.b 19 | 20 | def _predict(self, x): 21 | return 1 if self._trans(x) >= 0. 
else -1 22 | 23 | def fit(self, X, Y): 24 | self.feature_size = X.shape[-1] 25 | # define parameteres 26 | self.w = np.random.rand(self.feature_size) 27 | self.b = np.random.rand(1) 28 | 29 | updated = 1 30 | epoch = 0 31 | # if there is mis-classified sample, train 32 | while updated > 0 and epoch < self.max_iteration: 33 | if self.verbose: 34 | print(f"epoch {epoch} started...") 35 | 36 | updated = 0 37 | # shuffle data 38 | perm = np.random.permutation(len(X)) 39 | for i in perm: 40 | x, y = X[i], Y[i] 41 | # if there is a mis-classified sample 42 | if self._predict(x) != y: 43 | # update the parameters 44 | self.w += self.lr * y * x 45 | self.b += self.lr * y 46 | updated += 1 47 | 48 | if self.verbose: 49 | print(f"epoch {epoch} finishied, {updated} pieces of data mis-classified") 50 | epoch += 1 51 | return 52 | 53 | def predict(self, X): 54 | return np.apply_along_axis(self._predict, axis=-1, arr=X) 55 | 56 | if __name__ == "__main__": 57 | def demonstrate(X, Y, desc): 58 | console = Console(markup=False) 59 | perceptron = Perceptron(verbose=True) 60 | perceptron.fit(X, Y) 61 | 62 | # plot 63 | plt.scatter(X[:, 0], X[:, 1], c=Y) 64 | wbline(perceptron.w, perceptron.b) 65 | plt.title(desc) 66 | plt.show() 67 | 68 | # show in table 69 | pred = perceptron.predict(X) 70 | table = Table('x', 'y', 'pred') 71 | for x, y, y_hat in zip(X, Y, pred): 72 | table.add_row(*map(str, [x, y, y_hat])) 73 | console.print(table) 74 | 75 | # -------------------------- Example 1 ---------------------------------------- 76 | print("Example 1:") 77 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 78 | Y = np.array([1, 1, -1, -1]) 79 | demonstrate(X, Y, "Example 1") 80 | 81 | # -------------------------- Example 2 ---------------------------------------- 82 | print("Example 2: Perceptron cannot solve a simple XOR problem") 83 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 84 | Y = np.array([1, -1, -1, 1]) 85 | demonstrate(X, Y, "Example 2: Perceptron cannot solve a simple XOR problem") 86 | -------------------------------------------------------------------------------- /03.KNN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SleepyBag/Statistical-Learning-Methods/c16edf2d56f9f7c00651c749464b74b9ec039522/03.KNN/__init__.py -------------------------------------------------------------------------------- /03.KNN/knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | from matplotlib import pyplot as plt 4 | from functools import partial 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | 11 | class KNN: 12 | def __init__(self, k=1, distance_func="l2"): 13 | self.k = k 14 | if distance_func == 'l2': 15 | self.distance_func = lambda x, y: np.linalg.norm(x - y) 16 | else: 17 | self.distance_func = distance_func 18 | 19 | def _knn(self, x): 20 | dis = np.apply_along_axis(partial(self.distance_func, y=x), axis=-1, arr=self.X) 21 | topk_ind = np.argpartition(dis, self.k)[:self.k] 22 | return topk_ind 23 | 24 | def _predict(self, x): 25 | topk_ind = self._knn(x) 26 | topk_y = self.Y[topk_ind] 27 | return np.argmax(np.bincount(topk_y)) 28 | 29 | def fit(self, X, Y): 30 | self.X = X 31 | self.Y = Y 32 | self.k = min(self.k, len(self.X)) 33 | 34 | def predict(self, X): 35 | return np.apply_along_axis(self._predict, axis=-1, arr=X) 36 | 37 | if __name__ 
== "__main__": 38 | def demonstrate(X_train, Y_train, X_test, k, desc): 39 | knn = KNN(k=k) 40 | knn.fit(X_train, Y_train) 41 | pred_test = knn.predict(X_test) 42 | 43 | # plot 44 | plt.scatter(X_train[:,0], X_train[:,1], c=Y_train, s=20) 45 | plt.scatter(X_test[:,0], X_test[:,1], c=pred_test, marker=".", s=1) 46 | plt.title(desc) 47 | plt.show() 48 | 49 | # -------------------------- Example 1 ---------------------------------------- 50 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 51 | Y_train = np.array([1, 2, 3, 4, 5]) 52 | # generate grid-shaped test data 53 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 54 | demonstrate(X_train, Y_train, X_test, 1, "Example 1") 55 | 56 | # -------------------------- Example 2 (Imbalanced Data) ------------------------ 57 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 58 | Y_train = np.array([1, 1, 2, 3, 4]) 59 | # generate grid-shaped test data 60 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 61 | demonstrate(X_train, Y_train, X_test, 1, "Example 2") 62 | 63 | # -------------------------- Example 3 (Imbalanced Data) ------------------------ 64 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 65 | Y_train = np.array([1, 1, 2, 2, 2]) 66 | # generate grid-shaped test data 67 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 68 | demonstrate(X_train, Y_train, X_test, 1, "Example 3") 69 | -------------------------------------------------------------------------------- /03.KNN/knn_kdtree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | from matplotlib import pyplot as plt 4 | from rich.console import Console 5 | from rich.table import Table 6 | from functools import partial 7 | import sys 8 | import os 9 | from pathlib import Path 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import * 12 | 13 | class KDTree: 14 | class Node: 15 | def __init__(self, points, labels, axis): 16 | self.points = points 17 | self.labels = labels 18 | self.axis = axis 19 | self.left = None 20 | self.right = None 21 | 22 | def build(self, X, Y, split_axis=0): 23 | if not len(X): 24 | return None 25 | median_ind = np.argpartition(X[:, split_axis], len(X) // 2, axis=0)[len(X) // 2] 26 | split_point = float(X[median_ind, split_axis]) 27 | equal_x = X[X[:, split_axis] == split_point] 28 | equal_y = Y[X[:, split_axis] == split_point] 29 | less_x = X[X[:, split_axis] < split_point] 30 | less_y = Y[X[:, split_axis] < split_point] 31 | greater_x = X[X[:, split_axis] > split_point] 32 | greater_y = Y[X[:, split_axis] > split_point] 33 | node = self.Node(equal_x, equal_y, split_axis) 34 | node.left = self.build(less_x, less_y, 1 - split_axis) 35 | node.right = self.build(greater_x, greater_y, 1 - split_axis) 36 | return node 37 | 38 | def _query(self, root, x, k): 39 | if not root: 40 | return Heap(max_len=k, key=lambda xy: -euc_dis(x, xy[0])) 41 | # Find the region that contains the target point 42 | if x[root.axis] <= root.points[0][root.axis]: 43 | ans = self._query(root.left, x, k) 44 | sibling = root.right 45 | else: 46 | ans = self._query(root.right, x, k) 47 | sibling = root.left 48 | # All the points on the current splitting line are possible answers 49 | for curx, cury in zip(root.points, root.labels): 50 | ans.push((curx, 
cury)) 51 | # If the distance between the target point and the splitting line is 52 | # shorter than the best answer up until, find in the other tree 53 | if len(ans) < k or -ans.top_key() > abs(x[root.axis] - root.points[0][root.axis]): 54 | other_ans = self._query(sibling, x, k) 55 | while other_ans: 56 | otherx, othery = other_ans.pop() 57 | ans.push((otherx, othery)) 58 | return ans 59 | 60 | def query(self, x, k): 61 | return self._query(self.root, x, k) 62 | 63 | def __init__(self, X, Y): 64 | self.root = self.build(X, Y) 65 | 66 | class KNN: 67 | def __init__(self, k=1, distance_func="l2"): 68 | self.k = k 69 | if distance_func == 'l2': 70 | self.distance_func = lambda x, y: np.linalg.norm(x - y) 71 | else: 72 | self.distance_func = distance_func 73 | 74 | def _predict(self, x): 75 | topk = self.tree.query(x, self.k) 76 | topk_y = [y for x, y in topk] 77 | return np.argmax(np.bincount(topk_y)) 78 | 79 | def fit(self, X, Y): 80 | self.tree = KDTree(X, Y) 81 | self.k = min(self.k, len(X)) 82 | 83 | def predict(self, X): 84 | return np.apply_along_axis(self._predict, axis=-1, arr=X) 85 | 86 | if __name__ == "__main__": 87 | def demonstrate(X_train, Y_train, X_test, k, desc): 88 | knn = KNN(k=k) 89 | knn.fit(X_train, Y_train) 90 | pred_test = knn.predict(X_test) 91 | 92 | # plot 93 | plt.scatter(X_train[:,0], X_train[:,1], c=Y_train, s=20) 94 | plt.scatter(X_test[:,0], X_test[:,1], c=pred_test, marker=".", s=1) 95 | plt.title(desc) 96 | plt.show() 97 | 98 | # -------------------------- Example 1 ---------------------------------------- 99 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 100 | Y_train = np.array([1, 2, 3, 4, 5]) 101 | # generate grid-shaped test data 102 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 103 | demonstrate(X_train, Y_train, X_test, 1, "Example 1") 104 | 105 | # -------------------------- Example 2 (Imbalanced Data) ------------------------ 106 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 107 | Y_train = np.array([1, 1, 2, 3, 4]) 108 | # generate grid-shaped test data 109 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 110 | demonstrate(X_train, Y_train, X_test, 1, "Example 2") 111 | 112 | # -------------------------- Example 3 (Imbalanced Data) ------------------------ 113 | X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [.5, .5]]) 114 | Y_train = np.array([1, 1, 2, 2, 2]) 115 | # generate grid-shaped test data 116 | X_test = np.concatenate(np.stack(np.meshgrid(np.linspace(-1, 2, 100), np.linspace(-1, 2, 100)), axis=-1)) 117 | demonstrate(X_train, Y_train, X_test, 1, "Example 3") 118 | -------------------------------------------------------------------------------- /03.KNN/test_kdtree.py: -------------------------------------------------------------------------------- 1 | import knn_kdtree 2 | import numpy as np 3 | 4 | X = np.array([[1, 1], [1, 2], [1, 3], [2, 2], [3, 1], [3, 2], [3, 3]]) 5 | Y = np.array([0] * len(X)) 6 | tree = knn_kdtree.KDTree(X, Y) 7 | 8 | def points_equal(a, b): 9 | a = set(map(tuple, a)) 10 | b = set(map(tuple, b)) 11 | return a == b 12 | 13 | assert(points_equal(tree.root.points, [[2, 2]])) 14 | assert(points_equal(tree.root.left.points, [[1, 2]])) 15 | assert(points_equal(tree.root.right.points, [[3, 2]])) 16 | assert(points_equal(tree.root.left.left.points, [[1, 1]])) 17 | assert(points_equal(tree.root.left.right.points, [[1, 3]])) 18 | 
assert(points_equal(tree.root.right.left.points, [[3, 1]])) 19 | assert(points_equal(tree.root.right.right.points, [[3, 3]])) 20 | 21 | assert(points_equal([a[0] for a in tree.query(np.array([2, 1]), 3)], [[1, 1], [2, 2], [3, 1]])) 22 | 23 | X = np.array([[0, 0], [1, 1], [2, 2]]) 24 | Y = np.array([0] * len(X)) 25 | tree = knn_kdtree.KDTree(X, Y) 26 | assert(points_equal([a[0] for a in tree.query(np.array([1, 1]), 3)], X)) 27 | 28 | X = np.array([[0, 0], [1, 1], [2, 2]]) 29 | Y = np.array([0] * len(X)) 30 | tree = knn_kdtree.KDTree(X, Y) 31 | assert(points_equal([a[0] for a in tree.query(np.array([10, 2.001]), 3)], X)) 32 | -------------------------------------------------------------------------------- /04.NaiveBayes/NaiveBayesMAP.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from rich.console import Console 3 | from rich.table import Table 4 | import numpy as np 5 | 6 | class NaiveBayesMAP: 7 | def __init__(self, lamda=1, verbose=False): 8 | # p(a|y), the probability of an attribute a when the data is of label y 9 | # its a three-layer dict 10 | # the first-layer key is y, the value label 11 | # the second-layer key is n, which means the nth attribute 12 | # the thrid-layer key is the value of the nth attribute 13 | self.pa_y = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))) 14 | # p(y), the prior probability of label y 15 | self.py = defaultdict(lambda: 0) 16 | self.verbose = verbose 17 | # parameter lamda means that 18 | # we take each value as it has appeared lamda times before our experiment 19 | self.lamda = lamda 20 | 21 | def fit(self, X, Y): 22 | y_cnt = Counter(Y) 23 | for col in range(len(X[0])): 24 | col_values = set(x[col] for x in X) 25 | for x, y in zip(X, Y): 26 | self.pa_y[y][col][x[col]] += 1 27 | for y in y_cnt: 28 | for a in self.pa_y[y][col]: 29 | self.pa_y[y][col][a] += self.lamda 30 | self.pa_y[y][col][a] /= y_cnt[y] + self.lamda * len(col_values) 31 | for y in y_cnt: 32 | self.py[y] = (y_cnt[y] + self.lamda) / (len(X) + self.lamda * len(y_cnt)) 33 | 34 | if self.verbose: 35 | for y in self.pa_y: 36 | print(f'The prior probability of label {y} is', self.py[y]) 37 | for nth in self.pa_y[y]: 38 | prob = self.pa_y[y][nth] 39 | for a in prob: 40 | print(f'When the label is {y}, the probability that {nth}th attribute be {a} is {prob[a]}') 41 | 42 | def _predict(self, x): 43 | # all the labels 44 | labels = list(self.pa_y.keys()) 45 | probs = [] 46 | for y in labels: 47 | prob = self.py[y] 48 | for i, a in enumerate(x): 49 | prob *= self.pa_y[y][i][a] 50 | probs.append(prob) 51 | if self.verbose: 52 | for y, p in zip(labels, probs): 53 | print(f'The likelihood {x} belongs to {y} is {p}') 54 | return labels[np.argmax(probs)] 55 | 56 | def predict(self, X): 57 | return [self._predict(x) for x in X] 58 | 59 | if __name__ == "__main__": 60 | console = Console(markup=False) 61 | naive_bayes_map = NaiveBayesMAP(verbose=True) 62 | # -------------------------- Example 1 ---------------------------------------- 63 | print("Example 1:") 64 | X = [ 65 | [1,'S'], 66 | [1,'M'], 67 | [1,'M'], 68 | [1,'S'], 69 | [1,'S'], 70 | [2,'S'], 71 | [2,'M'], 72 | [2,'M'], 73 | [2,'L'], 74 | [2,'L'], 75 | [3,'L'], 76 | [3,'M'], 77 | [3,'M'], 78 | [3,'L'], 79 | [3,'L'], 80 | ] 81 | Y = [-1 ,-1 ,1 ,1 ,-1 ,-1 ,-1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,-1] 82 | naive_bayes_map.fit(X, Y) 83 | 84 | # show in table 85 | pred = naive_bayes_map.predict(X) 86 | table = Table('x', 'y', 'pred') 87 | for x, y, y_hat in 
zip(X, Y, pred): 88 | table.add_row(*map(str, [x, y, y_hat])) 89 | console.print(table) 90 | -------------------------------------------------------------------------------- /04.NaiveBayes/NaiveBayesMLE.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from rich.console import Console 3 | from rich.table import Table 4 | import numpy as np 5 | 6 | class NaiveBayesMLE: 7 | def __init__(self, verbose=False): 8 | # p(a|y), the probability of an attribute a when the data is of label y 9 | # its a three-layer dict 10 | # the first-layer key is y, the value label 11 | # the second-layer key is n, which means the nth attribute 12 | # the thrid-layer key is the value of the nth attribute 13 | self.pa_y = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))) 14 | # p(y), the prior probability of label y 15 | self.py = defaultdict(lambda: 0) 16 | self.verbose = verbose 17 | 18 | def fit(self, X, Y): 19 | y_cnt = Counter(Y) 20 | for x, y in zip(X, Y): 21 | for i, a in enumerate(x): 22 | self.pa_y[y][i][a] += 1 / y_cnt[y] 23 | self.py[y] += 1 / len(X) 24 | 25 | if self.verbose: 26 | for y in self.pa_y: 27 | print(f'The prior probability of label {y} is', self.py[y]) 28 | for nth in self.pa_y[y]: 29 | prob = self.pa_y[y][nth] 30 | for a in prob: 31 | print(f'When the label is {y}, the probability that {nth}th attribute be {a} is {prob[a]}') 32 | 33 | def _predict(self, x): 34 | # all the labels 35 | labels = list(self.pa_y.keys()) 36 | probs = [] 37 | for y in labels: 38 | prob = self.py[y] 39 | for i, a in enumerate(x): 40 | prob *= self.pa_y[y][i][a] 41 | probs.append(prob) 42 | if self.verbose: 43 | for y, p in zip(labels, probs): 44 | print(f'The likelihood {x} belongs to {y} is {p}') 45 | return labels[np.argmax(probs)] 46 | 47 | def predict(self, X): 48 | return [self._predict(x) for x in X] 49 | 50 | if __name__ == "__main__": 51 | console = Console(markup=False) 52 | naive_bayes_mle = NaiveBayesMLE(verbose=True) 53 | # -------------------------- Example 1 ---------------------------------------- 54 | print("Example 1:") 55 | X = [ 56 | [1,'S'], 57 | [1,'M'], 58 | [1,'M'], 59 | [1,'S'], 60 | [1,'S'], 61 | [2,'S'], 62 | [2,'M'], 63 | [2,'M'], 64 | [2,'L'], 65 | [2,'L'], 66 | [3,'L'], 67 | [3,'M'], 68 | [3,'M'], 69 | [3,'L'], 70 | [3,'L'], 71 | ] 72 | Y = [-1 ,-1 ,1 ,1 ,-1 ,-1 ,-1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,-1] 73 | naive_bayes_mle.fit(X, Y) 74 | 75 | # show in table 76 | pred = naive_bayes_mle.predict(X) 77 | table = Table('x', 'y', 'pred') 78 | for x, y, y_hat in zip(X, Y, pred): 79 | table.add_row(*map(str, [x, y, y_hat])) 80 | console.print(table) 81 | -------------------------------------------------------------------------------- /05.DecisionTree/C4.5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from rich.console import Console 4 | from rich.table import Table 5 | from math import log 6 | from collections import Counter 7 | import sys 8 | import os 9 | from pathlib import Path 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import * 12 | 13 | class C45: 14 | class Node: 15 | def __init__(self, col, Y): 16 | self.col = col 17 | self.children = {} 18 | self.cnt = Counter(Y) 19 | self.label = self.cnt.most_common(1)[0][0] 20 | 21 | def __init__(self, information_gain_threshold=0., verbose=False): 22 | self.information_gain_threshold = information_gain_threshold 23 | 
self.verbose = verbose 24 | 25 | def build(self, X, Y, selected): 26 | cur = self.Node(None, Y) 27 | if self.verbose: 28 | print("Cur selected columns:", selected) 29 | print("Cur data:") 30 | pprint(X) 31 | print(Y) 32 | split = False 33 | # check if there is no attribute to choose, or there is no need for spilt 34 | if len(selected) != self.column_cnt and len(set(Y)) > 1: 35 | left_columns = list(set(range(self.column_cnt)) - selected) 36 | col_ind, best_information_gain_ratio = argmax(left_columns, key=lambda col: information_gain_ratio(X, Y, col)) 37 | col = left_columns[col_ind] 38 | # if this split is better than not splitting 39 | if best_information_gain_ratio > self.information_gain_threshold: 40 | print(f"Split by {col}th column") 41 | split = True 42 | cur.col = col 43 | for val in set(x[col] for x in X): 44 | ind = [x[col] == val for x in X] 45 | child_X = [x for i, x in zip(ind, X) if i] 46 | child_Y = [y for i, y in zip(ind, Y) if i] 47 | cur.children[val] = self.build(child_X, child_Y, selected | {col}) 48 | if not split: 49 | print("No split") 50 | return cur 51 | 52 | def query(self, root, x): 53 | if root.col is None or x[root.col] not in root.children: 54 | return root.label 55 | return self.query(root.children[x[root.col]], x) 56 | 57 | def fit(self, X, Y): 58 | self.column_cnt = len(X[0]) 59 | self.root = self.build(X, Y, set()) 60 | 61 | def _predict(self, x): 62 | return self.query(self.root, x) 63 | 64 | def predict(self, X): 65 | return [self._predict(x) for x in X] 66 | 67 | if __name__ == "__main__": 68 | console = Console(markup=False) 69 | c45 = C45(verbose=True) 70 | # -------------------------- Example 1 ---------------------------------------- 71 | # unpruned decision tree predict correctly for all training data 72 | print("Example 1:") 73 | X = [ 74 | ['青年', '否', '否', '一般'], 75 | ['青年', '否', '否', '好'], 76 | ['青年', '是', '否', '好'], 77 | ['青年', '是', '是', '一般'], 78 | ['青年', '否', '否', '一般'], 79 | ['老年', '否', '否', '一般'], 80 | ['老年', '否', '否', '好'], 81 | ['老年', '是', '是', '好'], 82 | ['老年', '否', '是', '非常好'], 83 | ['老年', '否', '是', '非常好'], 84 | ['老年', '否', '是', '非常好'], 85 | ['老年', '否', '是', '好'], 86 | ['老年', '是', '否', '好'], 87 | ['老年', '是', '否', '非常好'], 88 | ['老年', '否', '否', '一般'], 89 | ] 90 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 91 | c45.fit(X, Y) 92 | 93 | # show in table 94 | pred = c45.predict(X) 95 | table = Table('x', 'y', 'pred') 96 | for x, y, y_hat in zip(X, Y, pred): 97 | table.add_row(*map(str, [x, y, y_hat])) 98 | console.print(table) 99 | 100 | # -------------------------- Example 2 ---------------------------------------- 101 | # but unpruned decision tree doesn't generalize well for test data 102 | print("Example 2:") 103 | X = [ 104 | ['青年', '否', '否', '一般'], 105 | ['青年', '否', '否', '好'], 106 | ['青年', '是', '是', '一般'], 107 | ['青年', '否', '否', '一般'], 108 | ['老年', '否', '否', '一般'], 109 | ['老年', '否', '否', '好'], 110 | ['老年', '是', '是', '好'], 111 | ['老年', '否', '是', '非常好'], 112 | ['老年', '否', '是', '非常好'], 113 | ['老年', '否', '是', '非常好'], 114 | ['老年', '否', '是', '好'], 115 | ['老年', '否', '否', '一般'], 116 | ] 117 | Y = ['否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否'] 118 | c45.fit(X, Y) 119 | 120 | testX = [ 121 | ['青年', '否', '否', '一般'], 122 | ['青年', '否', '否', '好'], 123 | ['青年', '是', '否', '好'], 124 | ['青年', '是', '是', '一般'], 125 | ['青年', '否', '否', '一般'], 126 | ['老年', '否', '否', '一般'], 127 | ['老年', '否', '否', '好'], 128 | ['老年', '是', '是', '好'], 129 | ['老年', '否', '是', '非常好'], 130 | ['老年', '否', '是', '非常好'], 131 | ['老年', '否', 
'是', '非常好'], 132 | ['老年', '否', '是', '好'], 133 | ['老年', '是', '否', '好'], 134 | ['老年', '是', '否', '非常好'], 135 | ['老年', '否', '否', '一般'], 136 | ] 137 | testY = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 138 | 139 | # show in table 140 | pred = c45.predict(testX) 141 | table = Table('x', 'y', 'pred') 142 | for x, y, y_hat in zip(testX, testY, pred): 143 | table.add_row(*map(str, [x, y, y_hat])) 144 | console.print(table) 145 | -------------------------------------------------------------------------------- /05.DecisionTree/ClassificationCART.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import nan, inf 3 | from pprint import pprint 4 | from rich.console import Console 5 | from rich.table import Table 6 | from collections import Counter 7 | import sys 8 | import os 9 | from pathlib import Path 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import gini 12 | 13 | class ClassificationCART: 14 | class Node: 15 | def __init__(self, col, Y): 16 | self.col = col 17 | self.val = None 18 | self.left, self.right = None, None 19 | self.label = Counter(Y).most_common(1)[0][0] 20 | 21 | def __init__(self, verbose=False): 22 | self.verbose = verbose 23 | 24 | def get_gini_of_split(self, Y1, Y2): 25 | """get the weighted gini index of a split""" 26 | # the gini index of each part is weighted by 27 | # its share of the total number of samples 28 | gini1 = gini(Y1) 29 | gini2 = gini(Y2) 30 | length = len(Y1) + len(Y2) 31 | return len(Y1) / length * gini1 + len(Y2) / length * gini2 32 | 33 | def build(self, X, Y): 34 | cur = self.Node(None, Y) 35 | if self.verbose: 36 | print("Cur data:") 37 | pprint(X) 38 | print(Y) 39 | best_gini = inf 40 | best_col, best_val = -1, nan 41 | # The original content of the book doesn't discuss when to cease.
42 | # So I take the easiest way: cease when the data cannot be splitted 43 | if len(set(Y)) > 1: 44 | for col in range(len(X[0])): 45 | val_set = set(X[:, col]) 46 | if len(val_set) != 1: 47 | for val in val_set: 48 | # Don't split by the minimal value 49 | # because no value is smaller than it 50 | # so the left part is empty 51 | selected_ind = X[:, col] == val 52 | other_ind = X[:, col] != val 53 | selected_Y = Y[selected_ind] 54 | other_Y = Y[other_ind] 55 | cur_gini = self.get_gini_of_split(selected_Y, other_Y) 56 | if cur_gini < best_gini: 57 | best_gini, best_col, best_val = cur_gini, col, val 58 | 59 | # Build left and right child nodes recursively 60 | if self.verbose: 61 | print(f"Split by value {best_val} of {best_col}th column") 62 | selected_ind = X[:, best_col] == best_val 63 | other_ind = X[:, best_col] != best_val 64 | selected_X = X[selected_ind] 65 | other_X = X[other_ind] 66 | selected_Y = Y[selected_ind] 67 | other_Y = Y[other_ind] 68 | 69 | cur.col = best_col 70 | cur.val = best_val 71 | cur.left = self.build(selected_X, selected_Y) 72 | cur.right = self.build(other_X, other_Y) 73 | elif self.verbose: 74 | print("No split") 75 | return cur 76 | 77 | def query(self, root, x): 78 | if root.col is None: 79 | return root.label 80 | elif x[root.col] != root.val: 81 | return self.query(root.right, x) 82 | return self.query(root.left, x) 83 | 84 | def fit(self, X, Y): 85 | self.root = self.build(X, Y) 86 | 87 | def _predict(self, x): 88 | return self.query(self.root, x) 89 | 90 | def predict(self, X): 91 | return [self._predict(x) for x in X] 92 | 93 | if __name__ == "__main__": 94 | console = Console(markup=False) 95 | cart = ClassificationCART(verbose=True) 96 | # -------------------------- Example 1 ---------------------------------------- 97 | print("Example 1:") 98 | X = np.array([ 99 | ['青年', '否', '否', '一般'], 100 | ['青年', '否', '否', '好'], 101 | ['青年', '是', '否', '好'], 102 | ['青年', '是', '是', '一般'], 103 | ['青年', '否', '否', '一般'], 104 | ['老年', '否', '否', '一般'], 105 | ['老年', '否', '否', '好'], 106 | ['老年', '是', '是', '好'], 107 | ['老年', '否', '是', '非常好'], 108 | ['老年', '否', '是', '非常好'], 109 | ['老年', '否', '是', '非常好'], 110 | ['老年', '否', '是', '好'], 111 | ['老年', '是', '否', '好'], 112 | ['老年', '是', '否', '非常好'], 113 | ['老年', '否', '否', '一般'], 114 | ]) 115 | Y = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 116 | cart.fit(X, Y) 117 | 118 | # show in table 119 | pred = cart.predict(X) 120 | table = Table('x', 'y', 'pred') 121 | for x, y, y_hat in zip(X, Y, pred): 122 | table.add_row(*map(str, [x, y, y_hat])) 123 | console.print(table) 124 | 125 | # -------------------------- Example 2 ---------------------------------------- 126 | # but unpruned decision tree doesn't generalize well for test data 127 | print("Example 2:") 128 | X = np.array([ 129 | ['青年', '否', '否', '一般'], 130 | ['青年', '否', '否', '好'], 131 | ['青年', '是', '是', '一般'], 132 | ['青年', '否', '否', '一般'], 133 | ['老年', '否', '否', '一般'], 134 | ['老年', '否', '否', '好'], 135 | ['老年', '是', '是', '好'], 136 | ['老年', '否', '是', '非常好'], 137 | ['老年', '否', '是', '非常好'], 138 | ['老年', '否', '是', '非常好'], 139 | ['老年', '否', '是', '好'], 140 | ['老年', '否', '否', '一般'], 141 | ]) 142 | Y = np.array(['否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否']) 143 | cart.fit(X, Y) 144 | 145 | testX = np.array([ 146 | ['青年', '否', '否', '一般'], 147 | ['青年', '否', '否', '好'], 148 | ['青年', '是', '否', '好'], 149 | ['青年', '是', '是', '一般'], 150 | ['青年', '否', '否', '一般'], 151 | ['老年', '否', '否', '一般'], 152 | ['老年', '否', '否', '好'], 153 | ['老年', '是', 
'是', '好'], 154 | ['老年', '否', '是', '非常好'], 155 | ['老年', '否', '是', '非常好'], 156 | ['老年', '否', '是', '非常好'], 157 | ['老年', '否', '是', '好'], 158 | ['老年', '是', '否', '好'], 159 | ['老年', '是', '否', '非常好'], 160 | ['老年', '否', '否', '一般'], 161 | ]) 162 | testY = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 163 | 164 | # show in table 165 | pred = cart.predict(testX) 166 | table = Table('x', 'y', 'pred') 167 | for x, y, y_hat in zip(testX, testY, pred): 168 | table.add_row(*map(str, [x, y, y_hat])) 169 | console.print(table) 170 | -------------------------------------------------------------------------------- /05.DecisionTree/ID3.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from rich.console import Console 3 | from rich.table import Table 4 | from collections import Counter 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import argmax, information_gain 10 | 11 | 12 | class ID3: 13 | class Node: 14 | def __init__(self, col, Y): 15 | self.col = col 16 | self.children = {} 17 | self.cnt = Counter(Y) 18 | self.label = self.cnt.most_common(1)[0][0] 19 | 20 | def __init__(self, information_gain_threshold=0., verbose=False): 21 | self.information_gain_threshold = information_gain_threshold 22 | self.verbose = verbose 23 | 24 | def build(self, X, Y, selected): 25 | cur = self.Node(None, Y) 26 | if self.verbose: 27 | print("Cur selected columns:", selected) 28 | print("Cur data:") 29 | pprint(X) 30 | print(Y) 31 | split = False 32 | # check if there is no attribute to choose 33 | # or there is no need for spilt 34 | if len(selected) != self.column_cnt and len(set(Y)) > 1: 35 | left_columns = list(set(range(self.column_cnt)) - selected) 36 | col_ind, best_information_gain = argmax(left_columns, 37 | key=lambda col: information_gain(X, Y, col)) 38 | col = left_columns[col_ind] 39 | # if this split is better than not splitting 40 | if best_information_gain > self.information_gain_threshold: 41 | if self.verbose: 42 | print(f"Split by {col}th column") 43 | split = True 44 | cur.col = col 45 | for val in set(x[col] for x in X): 46 | ind = [x[col] == val for x in X] 47 | child_X = [x for i, x in zip(ind, X) if i] 48 | child_Y = [y for i, y in zip(ind, Y) if i] 49 | cur.children[val] = self.build(child_X, child_Y, selected | {col}) 50 | if not split and self.verbose: 51 | print("No split") 52 | return cur 53 | 54 | def query(self, root, x): 55 | if root.col is None or x[root.col] not in root.children: 56 | return root.label 57 | return self.query(root.children[x[root.col]], x) 58 | 59 | def fit(self, X, Y): 60 | self.column_cnt = len(X[0]) 61 | self.root = self.build(X, Y, set()) 62 | 63 | def _predict(self, x): 64 | return self.query(self.root, x) 65 | 66 | def predict(self, X): 67 | return [self._predict(x) for x in X] 68 | 69 | 70 | if __name__ == "__main__": 71 | console = Console(markup=False) 72 | id3 = ID3(verbose=False) 73 | # -------------------------- Example 1 ---------------------------------------- 74 | # unpruned decision tree predict correctly for all training data 75 | print("Example 1:") 76 | X = [ 77 | ['青年', '否', '否', '一般'], 78 | ['青年', '否', '否', '好'], 79 | ['青年', '是', '否', '好'], 80 | ['青年', '是', '是', '一般'], 81 | ['青年', '否', '否', '一般'], 82 | ['老年', '否', '否', '一般'], 83 | ['老年', '否', '否', '好'], 84 | ['老年', '是', '是', '好'], 85 | ['老年', '否', '是', '非常好'], 86 | ['老年', '否', '是', '非常好'], 87 | ['老年', '否', 
'是', '非常好'], 88 | ['老年', '否', '是', '好'], 89 | ['老年', '是', '否', '好'], 90 | ['老年', '是', '否', '非常好'], 91 | ['老年', '否', '否', '一般'], 92 | ] 93 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 94 | id3.fit(X, Y) 95 | 96 | # show in table 97 | pred = id3.predict(X) 98 | table = Table('x', 'y', 'pred') 99 | for x, y, y_hat in zip(X, Y, pred): 100 | table.add_row(*map(str, [x, y, y_hat])) 101 | console.print(table) 102 | 103 | # -------------------------- Example 2 ---------------------------------------- 104 | # but unpruned decision tree doesn't generalize well for test data 105 | print("Example 2:") 106 | X = [ 107 | ['青年', '否', '否', '一般'], 108 | ['青年', '否', '否', '好'], 109 | ['青年', '是', '是', '一般'], 110 | ['青年', '否', '否', '一般'], 111 | ['老年', '否', '否', '一般'], 112 | ['老年', '否', '否', '好'], 113 | ['老年', '是', '是', '好'], 114 | ['老年', '否', '是', '非常好'], 115 | ['老年', '否', '是', '非常好'], 116 | ['老年', '否', '是', '非常好'], 117 | ['老年', '否', '是', '好'], 118 | ['老年', '否', '否', '一般'], 119 | ] 120 | Y = ['否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否'] 121 | id3.fit(X, Y) 122 | 123 | testX = [ 124 | ['青年', '否', '否', '一般'], 125 | ['青年', '否', '否', '好'], 126 | ['青年', '是', '否', '好'], 127 | ['青年', '是', '是', '一般'], 128 | ['青年', '否', '否', '一般'], 129 | ['老年', '否', '否', '一般'], 130 | ['老年', '否', '否', '好'], 131 | ['老年', '是', '是', '好'], 132 | ['老年', '否', '是', '非常好'], 133 | ['老年', '否', '是', '非常好'], 134 | ['老年', '否', '是', '非常好'], 135 | ['老年', '否', '是', '好'], 136 | ['老年', '是', '否', '好'], 137 | ['老年', '是', '否', '非常好'], 138 | ['老年', '否', '否', '一般'], 139 | ] 140 | testY = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 141 | 142 | # show in table 143 | pred = id3.predict(testX) 144 | table = Table('x', 'y', 'pred') 145 | for x, y, y_hat in zip(testX, testY, pred): 146 | table.add_row(*map(str, [x, y, y_hat])) 147 | console.print(table) 148 | -------------------------------------------------------------------------------- /05.DecisionTree/RegressionCART.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from rich.console import Console 4 | from rich.table import Table 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | 11 | 12 | class RegressionCART: 13 | class Node: 14 | def __init__(self, col, Y): 15 | self.col = col 16 | self.val = nan 17 | self.left, self.right = None, None 18 | self.label = Y.mean() 19 | 20 | def __hash__(self): 21 | return id(self) 22 | 23 | def __init__(self, verbose=False, max_depth=inf): 24 | self.verbose = verbose 25 | self.max_depth = max_depth 26 | 27 | def get_se(self, Y_cnt): 28 | """get square error given the count of each Y value""" 29 | mean = sum(y * Y_cnt[y] for y in Y_cnt) / sum(Y_cnt.values()) 30 | square_error = sum((y - mean) ** 2 * Y_cnt[y] for y in Y_cnt) 31 | return square_error 32 | 33 | def get_se_of_split(self, Y1_cnt, Y2_cnt): 34 | """get the square error of a split""" 35 | return self.get_se(Y1_cnt) + self.get_se(Y2_cnt) 36 | 37 | def build(self, X, Y, depth=1): 38 | cur = self.Node(None, Y) 39 | if self.verbose: 40 | print("Cur data:") 41 | pprint(X) 42 | print(Y) 43 | best_se = inf 44 | best_col, best_val = -1, nan 45 | # The orignal content of the book doesn't discuss about when to cease. 
46 | # So I take the easiest way: cease when the data cannot be split, 47 | # i.e., when all the labels are identical 48 | if depth < self.max_depth and len(set(Y)) > 1: 49 | for col in range(len(X[0])): 50 | smaller_Y_cnt = Counter() 51 | larger_Y_cnt = Counter(Y) 52 | sorted_inds = np.argsort(X[:, col]) 53 | # try all the possible split values 54 | for i, ind in enumerate(sorted_inds): 55 | smaller_Y_cnt[Y[ind]] += 1 56 | larger_Y_cnt[Y[ind]] -= 1 57 | # don't split on the largest number, otherwise the right part is empty 58 | if sorted_inds[i] == sorted_inds[-1]: 59 | break 60 | # split only when this is the last one of consecutive identical numbers 61 | if i == len(X) - 1 or X[ind, col] != X[sorted_inds[i + 1], col]: 62 | se = self.get_se_of_split(smaller_Y_cnt, larger_Y_cnt) 63 | if se < best_se: 64 | val = X[ind, col] 65 | best_se, best_col, best_val = se, col, val 66 | 67 | # Build left and right child nodes recursively 68 | if self.verbose: 69 | print(f"Split by value {best_val} of {best_col}th column") 70 | smaller_ind = X[:, best_col] <= best_val 71 | larger_ind = X[:, best_col] > best_val 72 | smaller_X = X[smaller_ind] 73 | larger_X = X[larger_ind] 74 | smaller_Y = Y[smaller_ind] 75 | larger_Y = Y[larger_ind] 76 | 77 | cur.col = best_col 78 | cur.val = best_val 79 | cur.left = self.build(smaller_X, smaller_Y, depth + 1) 80 | cur.right = self.build(larger_X, larger_Y, depth + 1) 81 | elif self.verbose: 82 | print("No split") 83 | return cur 84 | 85 | def _query(self, root, x): 86 | if root.col is None: 87 | return root 88 | elif x[root.col] > root.val: 89 | return self._query(root.right, x) 90 | return self._query(root.left, x) 91 | 92 | def query(self, root, x): 93 | return self._query(root, x).label 94 | 95 | def fit(self, X, Y): 96 | self.root = self.build(X, Y) 97 | 98 | def _predict(self, x): 99 | return self.query(self.root, x) 100 | 101 | def predict(self, X): 102 | return [self._predict(x) for x in X] 103 | 104 | if __name__ == "__main__": 105 | def demonstrate(cart, X, Y, test_X, test_Y, desc): 106 | print(desc) 107 | console = Console(markup=False) 108 | cart.fit(X, Y) 109 | 110 | # show in table 111 | pred = cart.predict(test_X) 112 | table = Table('x', 'y', 'pred') 113 | for x, y, y_hat in zip(test_X, test_Y, pred): 114 | table.add_row(*map(str, [x, y, y_hat])) 115 | console.print(table) 116 | 117 | # -------------------------- Example 1 ---------------------------------------- 118 | cart = RegressionCART(verbose=True) 119 | X = np.arange(1, 11).reshape(-1, 1) 120 | Y = np.array([4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.90, 8.23, 8.70, 9.00]) 121 | demonstrate(cart, X, Y, X, Y, "Example 1:") 122 | 123 | # -------------------------- Example 2 ---------------------------------------- 124 | # show in table 125 | cart = RegressionCART(verbose=True) 126 | test_X = X + .5 127 | test_Y = np.zeros_like(Y) + nan 128 | demonstrate(cart, X, Y, test_X, test_Y, "Example 2:") 129 | 130 | # -------------------------- Example 3 ---------------------------------------- 131 | cart = RegressionCART(verbose=True, max_depth=1) 132 | X = np.arange(1, 11).reshape(-1, 1) 133 | Y = np.array([4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.90, 8.23, 8.70, 9.00]) 134 | demonstrate(cart, X, Y, X, Y, "Example 3: CART stump") 135 | 136 | 137 | # -------------------------- Example 4 ---------------------------------------- 138 | cart = RegressionCART(verbose=True, max_depth=3) 139 | X = np.arange(1, 11).reshape(-1, 1) 140 | Y = np.array([4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.90, 8.23, 8.70, 9.00]) 141 | demonstrate(cart,
X, Y, X, Y, "Example 4: split twice") 142 | -------------------------------------------------------------------------------- /05.DecisionTree/prune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from collections import Counter 4 | from rich.console import Console 5 | from rich.table import Table 6 | import sys 7 | import os 8 | from pathlib import Path 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import * 11 | from ID3 import ID3 12 | 13 | def prune(root, X, Y, alpha=.0, verbose=True): 14 | """ 15 | prune a decision tree recursively. alpha is the weight of tree size in the loss function 16 | reutrn the loss of all the leaf nodes 17 | """ 18 | # calculate the entropy of this subtree if the children of root is trimmed 19 | pruned_entropy = len(X) * entropy(Counter(Y).values()) 20 | pruned_loss = pruned_entropy + alpha 21 | # if root is a leaf node, return loss directly 22 | if not root.children: 23 | return pruned_loss 24 | cur_loss = 0. 25 | # trim child nodes recursively 26 | for col_val in root.children: 27 | child = root.children[col_val] 28 | ind = [x[root.col] == col_val for x in X] 29 | childX = [x for i, x in zip(ind, X) if i] 30 | childY = [y for i, y in zip(ind, Y) if i] 31 | cur_loss += prune(child, childX, childY, alpha, verbose) 32 | # if pruned, return the pruned loss 33 | if verbose: 34 | pprint(X) 35 | print('loss if prune:', pruned_loss) 36 | print('current loss', cur_loss) 37 | if pruned_loss < cur_loss: 38 | root.children.clear() 39 | return pruned_loss 40 | # if not pruned, the loss of node root is the sum loss of all of its children 41 | return cur_loss 42 | 43 | 44 | if __name__ == "__main__": 45 | console = Console(markup=False) 46 | # -------------------------- Example 1 (Small Normalization Param) ------------ 47 | print("Example 1:") 48 | id3 = ID3(verbose=False) 49 | X = [ 50 | ['青年', '否', '否', '一般'], 51 | ['青年', '否', '否', '好'], 52 | ['青年', '是', '否', '好'], 53 | ['青年', '是', '是', '一般'], 54 | ['青年', '否', '否', '一般'], 55 | ['老年', '否', '否', '一般'], 56 | ['老年', '否', '否', '好'], 57 | ['老年', '是', '是', '好'], 58 | ['老年', '否', '是', '非常好'], 59 | ['老年', '否', '是', '非常好'], 60 | ['老年', '否', '是', '非常好'], 61 | ['老年', '否', '是', '好'], 62 | ['老年', '是', '否', '好'], 63 | ['老年', '是', '否', '非常好'], 64 | ['老年', '否', '否', '一般'], 65 | ] 66 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 67 | id3.fit(X, Y) 68 | 69 | # prune with alpha 0. 70 | prune(id3.root, X, Y, 0.) 71 | 72 | # show in table 73 | pred = id3.predict(X) 74 | table = Table('x', 'y', 'pred') 75 | for x, y, y_hat in zip(X, Y, pred): 76 | table.add_row(*map(str, [x, y, y_hat])) 77 | console.print(table) 78 | 79 | # -------------------------- Example 2 (Large Normalization Param) ------------ 80 | print("Example 2:") 81 | id3 = ID3(verbose=False) 82 | X = [ 83 | ['青年', '否', '否', '一般'], 84 | ['青年', '否', '否', '好'], 85 | ['青年', '是', '否', '好'], 86 | ['青年', '是', '是', '一般'], 87 | ['青年', '否', '否', '一般'], 88 | ['老年', '否', '否', '一般'], 89 | ['老年', '否', '否', '好'], 90 | ['老年', '是', '是', '好'], 91 | ['老年', '否', '是', '非常好'], 92 | ['老年', '否', '是', '非常好'], 93 | ['老年', '否', '是', '非常好'], 94 | ['老年', '否', '是', '好'], 95 | ['老年', '是', '否', '好'], 96 | ['老年', '是', '否', '非常好'], 97 | ['老年', '否', '否', '一般'], 98 | ] 99 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 100 | id3.fit(X, Y) 101 | 102 | # prune with large alpha 103 | prune(id3.root, X, Y, 10000.) 
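# with an alpha this large, the size penalty dominates the entropy term, so every subtree is pruned and the tree collapses to a single leaf that predicts the majority training label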
104 | 105 | # show in table 106 | pred = id3.predict(X) 107 | table = Table('x', 'y', 'pred') 108 | for x, y, y_hat in zip(X, Y, pred): 109 | table.add_row(*map(str, [x, y, y_hat])) 110 | console.print(table) 111 | 112 | # -------------------------- Example 3 (Medium Normalization Param) ----------- 113 | print("Example 3:") 114 | id3 = ID3(verbose=False) 115 | X = [ 116 | ['青年', '否', '否', '一般'], 117 | ['青年', '否', '否', '好'], 118 | ['青年', '是', '否', '好'], 119 | ['青年', '是', '是', '一般'], 120 | ['青年', '否', '否', '一般'], 121 | ['老年', '否', '否', '一般'], 122 | ['老年', '否', '否', '好'], 123 | ['老年', '是', '是', '好'], 124 | ['老年', '否', '是', '非常好'], 125 | ['老年', '否', '是', '非常好'], 126 | ['老年', '否', '是', '非常好'], 127 | ['老年', '否', '是', '好'], 128 | ['老年', '是', '否', '好'], 129 | ['老年', '是', '否', '非常好'], 130 | ['老年', '否', '否', '一般'], 131 | ] 132 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 133 | id3.fit(X, Y) 134 | 135 | # prune with medium alpha 136 | prune(id3.root, X, Y, 5.) 137 | 138 | # show in table 139 | pred = id3.predict(X) 140 | table = Table('x', 'y', 'pred') 141 | for x, y, y_hat in zip(X, Y, pred): 142 | table.add_row(*map(str, [x, y, y_hat])) 143 | console.print(table) 144 | -------------------------------------------------------------------------------- /05.DecisionTree/pruneClassificationCART.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pprint import pprint 3 | from collections import Counter 4 | from rich.console import Console 5 | from rich.table import Table 6 | import sys 7 | import os 8 | from pathlib import Path 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import * 11 | from ClassificationCART import ClassificationCART 12 | 13 | class PrunedCART: 14 | def __init__(self, cart, X, Y, val_X, val_Y, verbose=True): 15 | self.root = cart.root 16 | self.possible_prune_threshold = {np.inf} 17 | self.verbose = verbose 18 | # Stage one: calculate pruning loss for all nodes 19 | self.calculate_prune_loss(self.root, X, Y) 20 | if self.verbose: 21 | print("All the possible threshold values are", self.possible_prune_threshold) 22 | # Stage two: choose the best threshold for pruning 23 | self.prune_threshold = self.choose_threshold(val_X, val_Y, self.possible_prune_threshold) 24 | if self.verbose: 25 | print("The best threshold value is", self.prune_threshold) 26 | 27 | def calculate_prune_loss(self, root, X, Y): 28 | """ 29 | get the pruning loss of a classification CART recursively 30 | tag every internal node with a float `relative_prune_loss`, which indicates the increase in loss per removed node 31 | if the branches under this node are trimmed 32 | 33 | this function also inserts every possible threshold value into self.possible_prune_threshold. 34 | return the loss of all the leaf nodes, and the size of the subtree 35 | """ 36 | # calculate the gini index of this subtree if the children of root are trimmed 37 | pruned_gini = len(X) * gini(Counter(Y).values()) 38 | pruned_loss = pruned_gini 39 | # if root is a leaf node, return loss directly 40 | if root.col is None: 41 | return pruned_loss, 1 42 | 43 | # cur_loss records the loss function when root is not trimmed 44 | cur_loss = 0.
45 | # size records the size of this subtree 46 | size = 1 47 | 48 | selected_ind = X[:, root.col] == root.val 49 | other_ind = X[:, root.col] != root.val 50 | selected_X = X[selected_ind] 51 | other_X = X[other_ind] 52 | selected_Y = Y[selected_ind] 53 | other_Y = Y[other_ind] 54 | 55 | # compute the loss of the left child recursively 56 | child_loss, child_size = self.calculate_prune_loss(root.left, selected_X, selected_Y) 57 | cur_loss += child_loss 58 | size += child_size 59 | 60 | # compute the loss of the right child recursively 61 | child_loss, child_size = self.calculate_prune_loss(root.right, other_X, other_Y) 62 | cur_loss += child_loss 63 | size += child_size 64 | 65 | # the increase in loss per removed node if the branches of this node are pruned 66 | relative_prune_loss = (pruned_loss - cur_loss) / (size - 1) 67 | root.relative_prune_loss = relative_prune_loss 68 | self.possible_prune_threshold.add(relative_prune_loss) 69 | return cur_loss, size 70 | 71 | def query(self, root, x, prune_threshold): 72 | # if root.relative_prune_loss is less than the chosen prune threshold, it is trimmed 73 | if root.col is None or root.relative_prune_loss < prune_threshold: 74 | return root.label 75 | elif x[root.col] != root.val: 76 | return self.query(root.right, x, prune_threshold) 77 | return self.query(root.left, x, prune_threshold) 78 | 79 | def _predict(self, x, prune_threshold): 80 | return self.query(self.root, x, prune_threshold) 81 | 82 | def predict(self, X, prune_threshold=None): 83 | if prune_threshold is None: 84 | prune_threshold = self.prune_threshold 85 | return np.array([self._predict(x, prune_threshold) for x in X]) 86 | 87 | def validate(self, val_X, val_Y, prune_threshold): 88 | """ 89 | I don't think using the gini index for validation, as written in the book, is a good idea, 90 | because the gini index is unsupervised while labels are available in the validation set. 91 | So I choose to use accuracy instead. 92 | """ 93 | pred = self.predict(val_X, prune_threshold) 94 | return (pred == val_Y).mean() 95 | 96 | def choose_threshold(self, val_X, val_Y, possible_prune_threshold): 97 | """ 98 | Choose the best subtree according to the validation set. 99 | Cross-validation here simply refers to predicting on a pre-split validation set. 100 | """ 101 | best_acc = -1. 102 | best_prune_threshold = 0.
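# candidate thresholds are tried in increasing order; because of the >= comparison below, ties in validation accuracy favor the larger threshold, i.e. the smaller, more heavily pruned subtree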
103 | for prune_threshold in sorted(list(possible_prune_threshold)): 104 | cur_acc = self.validate(val_X, val_Y, prune_threshold) 105 | if self.verbose: 106 | print(f"When prune threshold = {prune_threshold}, accuracy is {cur_acc}") 107 | if cur_acc >= best_acc: 108 | best_acc = cur_acc 109 | best_prune_threshold = prune_threshold 110 | return best_prune_threshold 111 | 112 | 113 | if __name__ == "__main__": 114 | console = Console(markup=False) 115 | cart = ClassificationCART(verbose=True) 116 | # -------------------------- Example 1 ---------------------------------------- 117 | print("Example 1:") 118 | X = np.array([ 119 | ['青年', '否', '否', '一般'], 120 | ['青年', '否', '否', '好'], 121 | ['青年', '是', '否', '好'], 122 | ['青年', '是', '是', '一般'], 123 | ['青年', '否', '否', '一般'], 124 | ['老年', '否', '否', '一般'], 125 | ['老年', '否', '否', '好'], 126 | ['老年', '是', '是', '好'], 127 | ['老年', '否', '是', '非常好'], 128 | ['老年', '否', '是', '非常好'], 129 | ['老年', '否', '是', '非常好'], 130 | ['老年', '否', '是', '好'], 131 | ['老年', '是', '否', '好'], 132 | ['老年', '是', '否', '非常好'], 133 | ['老年', '否', '否', '一般'], 134 | ]) 135 | Y = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 136 | cart.fit(X, Y) 137 | 138 | # Here I use the same dataset as the validation set 139 | # Notice that it must be the full tree to be choosed this way 140 | testX = np.array([ 141 | ['青年', '否', '否', '一般'], 142 | ['青年', '否', '否', '好'], 143 | ['青年', '是', '否', '好'], 144 | ['青年', '是', '是', '一般'], 145 | ['青年', '否', '否', '一般'], 146 | ['老年', '否', '否', '一般'], 147 | ['老年', '否', '否', '好'], 148 | ['老年', '是', '是', '好'], 149 | ['老年', '否', '是', '非常好'], 150 | ['老年', '否', '是', '非常好'], 151 | ['老年', '否', '是', '非常好'], 152 | ['老年', '否', '是', '好'], 153 | ['老年', '是', '否', '好'], 154 | ['老年', '是', '否', '非常好'], 155 | ['老年', '否', '否', '一般'], 156 | ]) 157 | testY = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) 158 | 159 | pruned_cart = PrunedCART(cart, X, Y, testX, testY) 160 | 161 | # show in table 162 | pred = pruned_cart.predict(testX) 163 | table = Table('x', 'y', 'pred') 164 | for x, y, y_hat in zip(testX, testY, pred): 165 | table.add_row(*map(str, [x, y, y_hat])) 166 | console.print(table) 167 | -------------------------------------------------------------------------------- /06.LogisticRegression-MaxEntropy/BinaryLogisticRegression.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | import sys 4 | import os 5 | from pathlib import Path 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import binary_cross_entropy, sigmoid, wbline 8 | 9 | class LogisticRegression: 10 | def __init__(self, lr=1, max_steps=1000, verbose=True): 11 | self.lr = lr 12 | self.max_steps = max_steps 13 | self.verbose = verbose 14 | 15 | def fit(self, X, Y): 16 | """ 17 | X: of shape [data-size, feature-size] 18 | Y: of shape [data-size] 19 | """ 20 | self.feature_size = X.shape[-1] 21 | # w of shape [feature-size] 22 | self.w = np.random.rand(self.feature_size) 23 | # b of shape [1] 24 | self.b = np.random.rand(1) 25 | 26 | for step in range(self.max_steps): 27 | # pred of shape [data-size] 28 | pred = self._predict(X) 29 | # Bias gradient of shape [data-size] 30 | gradient_b = Y - pred 31 | # Weight gradient of shape [data-size, feature-size] 32 | gradient_w = gradient_b[:, None] * X 33 | # get mean of gradient across all data 34 | gradient_b = gradient_b.mean(axis=0) 35 | gradient_w = 
gradient_w.mean(axis=0) 36 | self.w += gradient_w * self.lr 37 | self.b += gradient_b * self.lr 38 | if self.verbose: 39 | loss = binary_cross_entropy(pred, Y) 40 | print(f"Step {step}, Loss is {loss}...") 41 | 42 | def _predict(self, X): 43 | logit = self.w @ X.transpose() + self.b 44 | p = sigmoid(logit) 45 | return p 46 | 47 | def predict(self, X): 48 | p = self._predict(X) 49 | Y = (p > .5).astype(int) 50 | return Y 51 | 52 | if __name__ == "__main__": 53 | def demonstrate(X, Y, desc): 54 | logistic_regression = LogisticRegression(verbose=True) 55 | logistic_regression.fit(X, Y) 56 | 57 | # plot 58 | plt.title(desc) 59 | plt.scatter(X[:, 0], X[:, 1], c=Y) 60 | wbline(logistic_regression.w, logistic_regression.b) 61 | plt.show() 62 | 63 | # -------------------------- Example 1 ---------------------------------------- 64 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 65 | Y = np.array([1, 1, 0, 0]) 66 | demonstrate(X, Y, "Example 1") 67 | 68 | # -------------------------- Example 2 ---------------------------------------- 69 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 70 | Y = np.array([1, 0, 0, 1]) 71 | demonstrate(X, Y, "Example 2: Logistic Regression still cannot solve a simple XOR problem") 72 | 73 | # -------------------------- Example 3 ---------------------------------------- 74 | X = np.concatenate([np.random.normal([0, 1], size=[40, 2]), 75 | np.random.normal([1, 0], size=[40, 2])]) 76 | Y = np.concatenate([np.ones(40), np.zeros(40)]) 77 | demonstrate(X, Y, "Example 3: Logistic Regression is suitable for tasks that are not strictly linear separable") 78 | -------------------------------------------------------------------------------- /06.LogisticRegression-MaxEntropy/MaxEntropy.py: -------------------------------------------------------------------------------- 1 | from rich.console import Console 2 | from rich.table import Table 3 | import numpy as np 4 | from numpy import linalg 5 | import sys 6 | import os 7 | from pathlib import Path 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import softmax, line_search 10 | 11 | class MaxEntropy: 12 | def __init__(self, epsilon=1e-6, max_steps=1000, verbose=True): 13 | self.epsilon = epsilon 14 | self.max_steps = max_steps 15 | self.verbose = verbose 16 | 17 | def _p_w(self, w): 18 | """ 19 | calculate probability table according to w 20 | """ 21 | logit = (w[:, None, None] * self.feature).sum(axis=0) 22 | p_w = softmax(logit, axis=-1) 23 | return p_w 24 | 25 | def _f(self, w): 26 | """ 27 | object function 28 | """ 29 | return \ 30 | ( 31 | np.log( 32 | np.exp( 33 | (w[:, None, None] * self.feature 34 | ).sum(axis=0) 35 | ).sum(axis=-1)) * self.p_data_x 36 | ).sum() - \ 37 | ( 38 | self.p_data * (w[:, None, None] * self.feature).sum(axis=0) 39 | ).sum() 40 | 41 | def _g(self, w): 42 | """ 43 | gradient of object function 44 | """ 45 | p_w = self._p_w(w) 46 | return (self.p_data_x[None, :, None] * p_w[None, :, :] * self.feature 47 | ).sum(axis=(1, 2)) - self.E_feature 48 | 49 | def fit(self, p_data, feature): 50 | """ 51 | optimize max entropy model with BFGS 52 | p_data: matrix of shape [nx, ny], possibility of all (x, y) 53 | feature: matrix of shape[nf, nx, ny], all the feature functions of all (x, y) 54 | """ 55 | # nf is the number of feature functions, and the size of w 56 | self.nf, self.nx, self.ny = feature.shape 57 | self.feature = feature 58 | self.p_data = p_data 59 | self.p_data_x = p_data.sum(axis=-1) 60 | self.E_feature = (p_data[None, :, :] * feature).sum(axis=(1, 2)) 61 | 
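# E_feature[k] is the empirical expectation of the k-th feature function under the data distribution p_data; at the optimum the model expectation computed in _g equals it, making the gradient zero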
62 | # initlaize optimizer 63 | self.w = np.random.rand(self.nf) 64 | B = np.eye(self.nf) 65 | g_next = self._g(self.w) 66 | g_norm = linalg.norm(g_next) 67 | # optimize 68 | for i in range(self.max_steps): 69 | g = g_next 70 | if self.verbose: 71 | print(f"Step {i}, L2 norm of gradient is {g_norm}") 72 | if g_norm < self.epsilon: 73 | break 74 | p = linalg.solve(B, -g) 75 | f_lambda = lambda x: self._f(self.w + x * p) 76 | lamda = line_search(f_lambda, 0, 100, epsilon=self.epsilon) 77 | delta_w = lamda * p 78 | self.w += delta_w 79 | g_next = self._g(self.w) 80 | g_norm = linalg.norm(g_next) 81 | if g_norm < self.epsilon: 82 | print(f"L2 norm of gradient is {g_norm}, stop training...") 83 | break 84 | delta_g = g_next - g 85 | B_delta_w = B @ delta_w 86 | B += np.outer(delta_g, delta_g) / (delta_g @ delta_w) - \ 87 | np.outer(B_delta_w, B_delta_w) / (B_delta_w.T @ delta_w) 88 | self.p_w = self._p_w(self.w) 89 | 90 | def predict(self, x, y): 91 | """predict p(y|x)""" 92 | return self.p_w[x][y] 93 | 94 | # The following examples are proposed by SleepyBag at 95 | # https://www.zhihu.com/question/24094554/answer/1507080982 96 | if __name__ == "__main__": 97 | console = Console(markup=False) 98 | def float2str(x): 99 | return "%.3f" % x 100 | 101 | def demonstrate(data, feature_functions): 102 | max_entropy = MaxEntropy() 103 | max_entropy.fit(data, feature_functions) 104 | 105 | # print results 106 | for i, ff in enumerate(feature_functions): 107 | table = Table(f'feature {i}', 'y=1', 'y=2', 'y=3') 108 | for x in range(2): 109 | table.add_row(f'x={x}', *map(float2str, [ff[x, y] for y in range(3)])) 110 | console.print(table) 111 | table = Table('prob', 'y=1', 'y=2', 'y=3') 112 | for x in range(2): 113 | table.add_row(f'x={x}', *map(float2str, [max_entropy.predict(x, y) for y in range(3)])) 114 | console.print(table) 115 | 116 | # ---------------------- Prepare Data ----------------------------------------- 117 | data = np.array([[.125, .25, .125], 118 | [.5, 0., 0.]]) 119 | table = Table('data', 'y=1', 'y=2', 'y=3') 120 | for x in range(2): 121 | table.add_row(f'x={x}', *map(float2str, [data[x, y] for y in range(3)])) 122 | console.print(table) 123 | 124 | # ---------------------- Example 1--------------------------------------------- 125 | print('Example 1: Single feature function') 126 | feature_functions = np.array([ 127 | [[1, 0, 0], 128 | [0, 0, 0]] 129 | ]) 130 | demonstrate(data, feature_functions) 131 | 132 | # ---------------------- Example 3--------------------------------------------- 133 | print('Example 2: the value of feature function doesn\'t matter for feature function with only one non-zero value') 134 | feature_functions = np.array([ 135 | [[0.5, 0, 0], 136 | [0, 0, 0]] 137 | ]) 138 | demonstrate(data, feature_functions) 139 | 140 | # ---------------------- Example 2--------------------------------------------- 141 | print('Example 3: double feature functions') 142 | feature_functions = np.array([ 143 | [[1, 0, 0], 144 | [0, 0, 0]], 145 | [[0, 0, 0], 146 | [0, 1, 0]] 147 | ]) 148 | demonstrate(data, feature_functions) 149 | 150 | # ---------------------- Example 3--------------------------------------------- 151 | print('Example 4: single feature function with two non-zeros') 152 | feature_functions = np.array([ 153 | [[0, 1, 1], 154 | [0, 0, 0]] 155 | ]) 156 | demonstrate(data, feature_functions) 157 | 158 | # ---------------------- Example 3--------------------------------------------- 159 | print('Example 5: the value of feature function matters for feature function with 
multiple non-zero values') 160 | feature_functions = np.array([ 161 | [[0, 1, .5], 162 | [0, 0, 0]] 163 | ]) 164 | demonstrate(data, feature_functions) 165 | -------------------------------------------------------------------------------- /07.SVM/SVM.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | import sys 4 | import os 5 | from pathlib import Path 6 | from rich.console import Console 7 | from rich.table import Table 8 | from functools import partial 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import wbline 11 | 12 | class SVM: 13 | def __init__(self, C=1e9, epsilon=1e-6, lr=1e-4, max_steps=1000, verbose=True, kernel=np.dot): 14 | """ 15 | kernel: kernel function, of which 16 | the input is two vectors a, b 17 | the output is a scalar value 18 | """ 19 | self.lr = lr 20 | self.max_steps = max_steps 21 | self.verbose = verbose 22 | self.C = C 23 | self.epsilon = epsilon 24 | self.kernel = kernel 25 | 26 | def _smo_objective(self, i, j): 27 | """ 28 | The objective function of one step of SMO 29 | given the choosed alpha i and alpha j 30 | """ 31 | alpha, Y, K = self.alpha, self.Y, self.K 32 | return (alpha[i] * Y[i] * alpha * K[i:] * Y).sum() \ 33 | + (alpha[j] * Y[j] * alpha * K[j:] * Y).sum() \ 34 | - .5 * alpha[i] ** 2 * K[i, i] \ 35 | - .5 * alpha[j] ** 2 * K[j, j] \ 36 | - alpha[i] * alpha[j] * Y[i] * Y[j] * K[i, j]\ 37 | - alpha[i] - alpha[j] 38 | 39 | def _smo_step(self, step_cnt): 40 | if self.verbose: 41 | print(f'SMO step {step_cnt} start...') 42 | alpha = self.alpha 43 | K = self.K 44 | data_size = len(alpha) 45 | 46 | # the prediction of this step 47 | pred = (self.alpha * Y * self.K).sum(axis=-1) + self.b 48 | # the score of pred 49 | score = Y * pred 50 | # discrepency between pred and label 51 | error = pred - Y 52 | 53 | updated = False 54 | 55 | # find the first variable alpha_i 56 | # which violate KKT constraint 57 | # first try to find fake support vectors 58 | # of which 0 < alpha_i < C but score_i isn't 1 59 | i_cands = [i for i in range(data_size) if 60 | 0 < alpha[i] < self.C and abs(score[i] - 1) > self.epsilon or 61 | alpha[i] == 0 and score[i] < 1 or 62 | alpha[i] == self.C and score[i] > 1] 63 | for i in i_cands: 64 | # find the second variable 65 | # which makes alpha_i change most 66 | relative_error = np.abs(error - error[i]) 67 | j_cands = sorted(list(range(data_size)), key=relative_error.__getitem__) 68 | for j in j_cands: 69 | if j == i: 70 | continue 71 | smo_objective_before = self._smo_objective(i, j) 72 | 73 | # upper bound and lower bound of alpha_j 74 | L = max(0, alpha[j] - alpha[i] if Y[i] != Y[j] else alpha[i] + alpha[j] - self.C) 75 | H = min(self.C, self.C + alpha[j] - alpha[i] if Y[i] != Y[j] else alpha[i] + alpha[j]) 76 | 77 | if self.verbose: 78 | print('SMO chooses: ', i, j) 79 | print('alpha[i] and alpha[j] are', alpha[i], alpha[j]) 80 | print('Step begin, current object of dual problem:', smo_objective_before) 81 | 82 | alpha_j_old = alpha[j] 83 | eta = K[i, i] + K[j, j] - 2 * K[i, j] + self.epsilon 84 | # update alpha_j 85 | alpha[j] += Y[j] * (error[i] - error[j]) / eta 86 | # clip 87 | alpha[j] = min(alpha[j], H) 88 | alpha[j] = max(alpha[j], L) 89 | # update alpha_i 90 | alpha[i] += Y[i] * Y[j] * (alpha_j_old - alpha[j]) 91 | # update b 92 | self.b = Y[i] - (alpha * Y * K[i]).sum() 93 | if 0 < alpha[j] < self.C: 94 | self.b = (Y[j] - (alpha * Y * K[j]).sum() + self.b) / 2 95 | 
smo_objective_after = self._smo_objective(i, j) 96 | if self.verbose: 97 | print('Step end, current object of dual problem:', smo_objective_after) 98 | print('alpha[i] and alpha[j] are', alpha[i], alpha[j]) 99 | if smo_objective_before - smo_objective_after > self.epsilon: 100 | updated = True 101 | break 102 | if updated: 103 | break 104 | if self.verbose: 105 | print('SMO step end...') 106 | print() 107 | return len(i_cands) > 0 108 | 109 | def fit(self, X, Y): 110 | """ 111 | optimize SVM with SMO 112 | X: of shape [data-size, feature-size] 113 | Y: of shape [data-size] 114 | """ 115 | self.X, self.Y = X, Y 116 | data_size = len(X) 117 | self.alpha = np.zeros(data_size) 118 | self.b = np.random.rand() 119 | 120 | self.K = np.array([[self.kernel(x1, x2) for x1 in X] for x2 in X]) 121 | print(self.K) 122 | # optimize 123 | step_cnt = 0 124 | while self._smo_step(step_cnt) and step_cnt < self.max_steps: 125 | step_cnt += 1 126 | pass 127 | 128 | # optimized, get w and b 129 | support_vector_ind = 0 < self.alpha 130 | self._support_vectors = X[support_vector_ind] 131 | self._support_Y = Y[support_vector_ind] 132 | self._support_alpha = self.alpha[support_vector_ind] 133 | if self.verbose: 134 | print("Done!") 135 | print('Alphas are as follows:') 136 | print(self.alpha) 137 | print(support_vector_ind) 138 | print('Support vectors are as follows:') 139 | print(self._support_vectors) 140 | 141 | # for demonstration 142 | self.w = ((self.alpha * Y)[:, None] * X).sum(axis=0) 143 | 144 | def _predict(self, x): 145 | return (self._support_Y * self._support_alpha * \ 146 | np.apply_along_axis(partial(self.kernel, x), -1, self._support_vectors)).sum() 147 | 148 | def predict(self, X): 149 | score = np.apply_along_axis(self._predict, -1, X) 150 | # score = (self.w * X).sum(axis=-1) + self.b 151 | pred = (score >= 0).astype(int) * 2 - 1 152 | return pred 153 | 154 | if __name__ == "__main__": 155 | def demonstrate(X, Y, desc, draw=True, **args): 156 | console = Console(markup=False) 157 | svm = SVM(verbose=True, **args) 158 | svm.fit(X, Y) 159 | 160 | # plot 161 | if draw: 162 | plt.scatter(X[:, 0], X[:, 1], c=Y) 163 | wbline(svm.w, svm.b) 164 | plt.title(desc) 165 | plt.show() 166 | 167 | # show in table 168 | pred = svm.predict(X) 169 | table = Table('x', 'y', 'pred') 170 | for x, y, y_hat in zip(X, Y, pred): 171 | table.add_row(*map(str, [x, y, y_hat])) 172 | console.print(table) 173 | 174 | # -------------------------- Example 1 ---------------------------------------- 175 | print("Example 1:") 176 | X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 177 | Y = np.array([1, 1, -1, -1]) 178 | demonstrate(X, Y, "Example 1") 179 | 180 | # -------------------------- Example 2 ---------------------------------------- 181 | print("Example 2:") 182 | X = np.concatenate((np.random.rand(5, 2), np.random.rand(5, 2) + np.array([1, 1])), axis=0) 183 | Y = np.array([1, 1, 1, 1, 1, -1, -1, -1, -1, -1]) 184 | print(X, Y) 185 | demonstrate(X, Y, "Example 2: randomly generated data") 186 | 187 | # ---------------------- Example 3 -------------------------------------------- 188 | print("Example 3:") 189 | X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]]) 190 | Y = np.array([1, 1, -1, -1]) 191 | demonstrate(X, Y, "Example 3: SVM with dot kernel cannot sovle XOR problem", C=1) 192 | 193 | # ---------------------- Example 4 -------------------------------------------- 194 | def gaussian_kernel(x, y): 195 | return np.exp(-((x - y) ** 2).sum()) 196 | print("Example 4:") 197 | X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]]) 198 
| Y = np.array([1, 1, -1, -1]) 199 | demonstrate(X, Y, "Example 4: SVM with dot kernel cannot sovle XOR problem", draw=False, kernel=gaussian_kernel) 200 | -------------------------------------------------------------------------------- /08.Boosting/AdaBoost.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | from matplotlib import pyplot as plt 3 | import numpy as np 4 | import sys 5 | import os 6 | from pathlib import Path 7 | from rich.console import Console 8 | from rich.table import Table 9 | from functools import partial 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import wbline 12 | 13 | class DecisionStump: 14 | """ 15 | A simple classifier. 16 | A decision stump divide dataset by a threshold 17 | Expected one-dimensional X 18 | """ 19 | def __init__(self, verbose=True): 20 | self.verbose = verbose 21 | 22 | def fit(self, X, Y, weight): 23 | # since X is one-dimensional, just flatten it 24 | X = X[:, 0] 25 | possible_thresholds = list(set(X)) 26 | possible_thresholds.append(max(possible_thresholds) + 1) 27 | possible_thresholds.append(min(possible_thresholds) - 1) 28 | # try all possible threshold 29 | best_acc = 0. 30 | best_threshold, best_sign = 0., 0. 31 | for self.sign in [1, -1]: 32 | for self.threshold in possible_thresholds: 33 | pred = self.predict(X) 34 | acc = (pred == Y) @ weight 35 | if acc > best_acc: 36 | best_acc, best_threshold, best_sign = acc, self.threshold, self.sign 37 | self.threshold, self.sign = best_threshold, best_sign 38 | if self.verbose: 39 | print(f'Threshold is {self.threshold}') 40 | 41 | def predict(self, X): 42 | X = X * self.sign 43 | threshold = self.threshold * self.sign 44 | pred = (X > threshold) * 2 - 1 45 | return pred.flatten() 46 | 47 | class AdaBoost: 48 | def __init__(self, BasicModel=DecisionStump, steps=10, verbose=True): 49 | self.BasicModel = BasicModel 50 | self.steps = steps 51 | self.verbose = verbose 52 | 53 | def fit(self, X, Y): 54 | n = len(X) 55 | weight = np.ones(n) / n 56 | self.basic_models = [] 57 | self.model_weights = [] 58 | for i in range(self.steps): 59 | basic_model = self.BasicModel() 60 | basic_model.fit(X, Y, weight) 61 | self.basic_models.append(basic_model) 62 | pred = basic_model.predict(X) 63 | error_rate = (pred != Y) @ weight 64 | model_weight = .5 * log((1 - error_rate) / error_rate) 65 | weight *= np.exp(-model_weight * Y * pred) 66 | weight /= weight.sum() 67 | self.model_weights.append(model_weight) 68 | if self.verbose: 69 | print(f'Step {i}, current error rate is {error_rate}') 70 | print(f'The weight of current model is {model_weight}') 71 | 72 | def predict(self, X): 73 | score = sum(model.predict(X) * weight for model, weight in 74 | zip(self.basic_models, self.model_weights)) 75 | pred = (score > 0.).astype(int) * 2 - 1 76 | return pred 77 | 78 | if __name__ == "__main__": 79 | def demonstrate(X, Y, desc): 80 | print(desc) 81 | console = Console(markup=False) 82 | adaboost = AdaBoost(verbose=True) 83 | adaboost.fit(X, Y) 84 | 85 | # show in table 86 | pred = adaboost.predict(X) 87 | table = Table('x', 'y', 'pred') 88 | for x, y, y_hat in zip(X, Y, pred): 89 | table.add_row(*map(str, [x, y, y_hat])) 90 | console.print(table) 91 | 92 | # -------------------------- Example 1 ---------------------------------------- 93 | X = np.arange(10).reshape(-1, 1) 94 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 95 | demonstrate(X, Y, "Example 1") 96 | 
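# -------------------------- Example 2 (added sketch) -------------------------
# Illustrative addition, not part of the original file: the same labels on a
# reversed feature axis. No single decision stump can fit three alternating
# label runs, but the boosted committee of stumps (10 rounds by default)
# typically drives the training error to zero, as the printed error rates
# should show. It reuses the `demonstrate` helper defined above.
X = np.arange(9, -1, -1).reshape(-1, 1)
Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
demonstrate(X, Y, "Example 2 (added): reversed feature order")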
-------------------------------------------------------------------------------- /08.Boosting/GBDT.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import sys 4 | import os 5 | from pathlib import Path 6 | from rich.console import Console 7 | from rich.table import Table 8 | from functools import partial 9 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 10 | from utils import line_search 11 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '5.DecisionTree')) 12 | from RegressionCART import RegressionCART 13 | 14 | class GBDT: 15 | def __init__(self, 16 | loss_function=lambda label, pred: ((label - pred) ** 2).sum(), 17 | gradient_function=lambda label, pred: 2 * (pred - label), 18 | steps=10, 19 | max_depth=3, 20 | verbose=True): 21 | """ 22 | `loss_function` takes two arguments, label and pred and return a scalar, the loss 23 | `gradient_function` is gradient from loss function to the prediction 24 | It takes two arguments, i.e., label and pred and return the gradient 25 | the loss function should be convex 26 | The default loss function is l2 loss, which makes GBDT an ordinary boosting tree 27 | """ 28 | self.steps = steps 29 | self.verbose = verbose 30 | self.gradient_function = gradient_function 31 | self.loss_function = loss_function 32 | self.max_depth = max_depth 33 | 34 | def _loss_of_const(self, Y, c): 35 | """ 36 | Return the loss when the model take a constant c as the prediction 37 | `Y` is a vector of labels 38 | `c` is a constant scalar 39 | """ 40 | c = (np.ones_like(Y) * c).astype(float) 41 | return self.loss_function(Y, c) 42 | 43 | def fit(self, X, Y): 44 | n = len(X) 45 | self.carts = [] 46 | # the basic value of prediction, so that there can be 'residual' 47 | self.basic_pred = line_search(partial(self._loss_of_const, Y), min(Y), max(Y)) 48 | 49 | cur_pred = np.zeros_like(Y) + self.basic_pred 50 | residual = -self.gradient_function(Y, cur_pred) 51 | for i in range(self.steps): 52 | if self.verbose: 53 | print(f'step {i}') 54 | print(f'Current pred is {cur_pred}') 55 | print(f'Current residual is {residual}') 56 | cart = RegressionCART(verbose=False, max_depth=self.max_depth) 57 | cart.fit(X, residual) 58 | self.carts.append(cart) 59 | # regression trees use l2 loss as loss function, 60 | # the return value leaf nodes should be recorrect 61 | leaf2label=defaultdict(list) 62 | for i, x in enumerate(X): 63 | leaf = cart._query_leaf(cart.root, x) 64 | leaf2label[leaf].append(i) 65 | for leaf in leaf2label: 66 | data_ind = np.stack(leaf2label[leaf]) 67 | leafY = Y[data_ind] 68 | leaf_cur_pred = cur_pred[data_ind] 69 | leaf.label = line_search(lambda c: self.loss_function(leafY, leaf_cur_pred + c), -1e9, 1e9) 70 | 71 | # update the incremental prediction 72 | inc_pred = cart.predict(X) 73 | cur_pred += inc_pred 74 | residual = -self.gradient_function(Y, cur_pred) 75 | 76 | def predict(self, X): 77 | pred = np.zeros(len(X)) + self.basic_pred 78 | for cart in self.carts: 79 | pred += cart.predict(X) 80 | return pred 81 | 82 | if __name__ == "__main__": 83 | def demonstrate(X, Y, max_depth, desc): 84 | print(desc) 85 | console = Console(markup=False) 86 | gbdt = GBDT(verbose=True, max_depth=max_depth) 87 | gbdt.fit(X, Y) 88 | 89 | # show in table 90 | pred = gbdt.predict(X) 91 | table = Table('x', 'y', 'pred') 92 | for x, y, y_hat in zip(X, Y, pred): 93 | table.add_row(*map(str, [x, y, y_hat])) 94 | console.print(table) 95 | 96 | # 
-------------------------- Example 1 ---------------------------------------- 97 | X = np.arange(10).reshape(-1, 1) 98 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 99 | demonstrate(X, Y, 3, "Example 1") 100 | 101 | # -------------------------- Example 2 ---------------------------------------- 102 | X = np.arange(10).reshape(-1, 1) 103 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 104 | demonstrate(X, Y, 1, "Example 2: CART cannot be all stumps") 105 | 106 | # -------------------------- Example 3 ---------------------------------------- 107 | X = np.arange(10).reshape(-1, 1) 108 | Y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) 109 | demonstrate(X, Y, 2, "Example 3") 110 | -------------------------------------------------------------------------------- /09.EM/GMM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | 4 | class GMM: 5 | def __init__(self, k, independent_variance=True, max_step=2000, verbose=True): 6 | self.k = k 7 | self.max_step = max_step 8 | self.epsilon = 1e-8 9 | self.verbose = verbose 10 | # specify whether each feature has independent variance - that is, has a diagnol covariance matrix 11 | self.independent_variance = independent_variance 12 | 13 | def fit(self, X): 14 | """ 15 | X: training data of shape [n, feature_size] 16 | """ 17 | n, self.feature_size = X.shape 18 | # the parameter of each gaussian distribution 19 | self.prior = np.ones(self.k) / self.k 20 | self.prior /= self.prior.sum() 21 | if self.independent_variance: 22 | self.std = np.repeat(np.std(X, axis=0, keepdims=True), self.k, axis=0) 23 | self.mean = np.random.normal(X.mean(axis=0), self.std, [self.k, self.feature_size]) 24 | else: 25 | self.cov = np.repeat(np.cov(X.T)[None, ...], self.k, axis=0) 26 | self.mean = np.random.multivariate_normal(X.mean(axis=0), self.cov[0], [self.k]) 27 | 28 | previous_log_likelihood = -np.inf 29 | for step in range(self.max_step): 30 | ########################################## 31 | # Expectation step 32 | ########################################## 33 | # posterior probability of each sample in each Gaussian model 34 | posterior = self.predict(X) 35 | 36 | ########################################## 37 | # Maximization step 38 | ########################################## 39 | # center of each Gaussian model 40 | self.mean = (posterior[:, :, None] * X[None, :, :]).sum(axis=1) / \ 41 | (posterior.sum(axis=1)[:, None] + self.epsilon) 42 | # distance from each sample to each center 43 | dis = X[None, :, :] - self.mean[:, None, :] 44 | if self.independent_variance: 45 | # variance of each Gaussian model 46 | var = (posterior[:, :, None] * dis ** 2).sum(axis=1) / \ 47 | (posterior.sum(axis=1)[:, None] + self.epsilon) 48 | # standard deviation of each Gaussian model, in each dimension 49 | # shape [k, feature_size] 50 | # std[i, j] is the variance of j-th feature in the i-th Gaussian model 51 | self.std = np.sqrt(var) 52 | else: 53 | # covariance of each Gaussian model 54 | # shape [k, feature_size, feature_size] 55 | # cov[i] is the covariance matrix of i-th Gaussian model 56 | self.cov = (dis.transpose(0, 2, 1) @ (posterior[:, :, None] * dis)) / \ 57 | (posterior.sum(axis=1)[:, None, None] + self.epsilon) 58 | self.prior = posterior.sum(axis=1) 59 | self.prior /= (self.prior.sum() + self.epsilon) 60 | 61 | # early stopping 62 | log_likelihood = self.log_likelihood(X) 63 | if self.verbose: 64 | print('After step', step, ', likelihood of model parameters is', 
np.exp(log_likelihood)) 65 | if log_likelihood - previous_log_likelihood < self.epsilon: 66 | break 67 | previous_log_likelihood = log_likelihood 68 | 69 | def pairwise_likelihood(self, X): 70 | """ 71 | return the likelihood of each data piece in X belonging to each Gaussian cluster 72 | """ 73 | # dis[i, j, k] is the distance from i-th center to j-th sample, in k-th dimension 74 | dis = X[None, :, :] - self.mean[:, None, :] 75 | 76 | # calculate log likelihood first, then likelihood 77 | if self.independent_variance: 78 | # data_log_likelihood is of shape [k, n, feature_size] 79 | data_log_likelihood = -dis ** 2 * .5 / (self.std[:, None, :] ** 2 + self.epsilon) \ 80 | - np.log(np.sqrt(2 * np.pi) + self.epsilon) - np.log(self.std[:, None, :] + self.epsilon) 81 | # reduce likelihood to shape [k, n] 82 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 83 | data_log_likelihood = data_log_likelihood.sum(-1) 84 | else: 85 | # data_log_likelihood is of shape [k, n] 86 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 87 | fixed_cov = self.cov + self.epsilon * np.eye(self.feature_size) 88 | data_log_likelihood = -.5 * (dis @ np.linalg.inv(fixed_cov) * dis).sum(axis=-1) \ 89 | -.5 * np.linalg.slogdet(2 * np.pi * fixed_cov)[1][:, None] # slogdet returns [sign, logdet], we just need logdet 90 | 91 | data_likelihood = np.exp(data_log_likelihood) 92 | # the posterior of each datium belonging to a distribution, of shape [k, n] 93 | posterior = self.prior[:, None] * data_likelihood 94 | return posterior 95 | 96 | def log_likelihood(self, X): 97 | """ 98 | return the likelihood of parameter given dataset X. 99 | It is exactly the posterior probability of X given current parametmer 100 | """ 101 | posterior = self.pairwise_likelihood(X) 102 | log_likelihood = np.log(posterior.sum(axis=0)).mean() 103 | return log_likelihood 104 | 105 | def predict(self, X): 106 | """return the probability of each x belonging to each gaussian distribution""" 107 | posterior = self.pairwise_likelihood(X) 108 | posterior /= (posterior.sum(axis=0, keepdims=True) + self.epsilon) 109 | return posterior 110 | 111 | 112 | if __name__ == '__main__': 113 | def demonstrate(desc, X): 114 | gmm = GMM(3, independent_variance=False) 115 | gmm.fit(X) 116 | pred = gmm.predict(X).T 117 | plt.scatter(X[:, 0], X[:, 1], color=pred) 118 | plt.title(desc) 119 | plt.show() 120 | 121 | # ---------------------- Example 1--------------------------------------------- 122 | X = np.concatenate([ 123 | np.random.normal([0, 0], [.3, .3], [100, 2]), 124 | np.random.normal([0, 1], [.3, .3], [100, 2]), 125 | np.random.normal([1, 0], [.3, .3], [100, 2]), 126 | ]) 127 | demonstrate("Example 1", X) 128 | 129 | # ---------------------- Example 2--------------------------------------------- 130 | demonstrate("Example 2: GMM does'nt promise the same result for the same data", X) 131 | 132 | # ---------------------- Example 3--------------------------------------------- 133 | X = np.concatenate([ 134 | np.random.normal([0, 0], [.4, .4], [100, 2]), 135 | np.random.normal([0, 1], [.4, .4], [100, 2]), 136 | np.random.normal([1, 0], [.4, .4], [100, 2]), 137 | ]) 138 | demonstrate("Example 3", X) 139 | 140 | # ---------------------- Example 4--------------------------------------------- 141 | X = np.concatenate([ 142 | np.random.normal([0, 0], [.4, .4], [100, 2]), 143 | np.random.normal([0, 3], [.4, .4], [100, 2]), 144 | np.random.normal([3, 0], [.4, .4], [100, 2]), 145 | ]) 146 | 
demonstrate("Example 4", X) 147 | 148 | # ---------------------- Example 5--------------------------------------------- 149 | X = np.concatenate([ 150 | np.random.normal([0, 0], [.4, .4], [1, 2]), 151 | np.random.normal([0, 3], [.4, .4], [1, 2]), 152 | np.random.normal([3, 0], [.4, .4], [1, 2]), 153 | ]) 154 | demonstrate("Example 5", X) 155 | -------------------------------------------------------------------------------- /09.EM/GMMGradientDescent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from matplotlib import pyplot as plt 4 | 5 | class GMMGradientDescent: 6 | def __init__(self, k, independent_variance=True, max_step=20000, learning_rate=1e-3, verbose=True): 7 | self.k = k 8 | self.max_step = max_step 9 | self.epsilon = 1e-8 10 | self.learning_rate = learning_rate 11 | self.log_sqrt_2pi = math.log(math.sqrt(2 * torch.pi)) 12 | self.verbose = verbose 13 | # specify whether each feature has independent variance - that is, has a diagnol covariance matrix 14 | self.independent_variance = independent_variance 15 | 16 | def fit(self, X): 17 | """ 18 | X: training data of shape [n, feature_size] 19 | """ 20 | n, self.feature_size = X.shape 21 | X = torch.Tensor(X) 22 | # the parameter of each gaussian distribution 23 | self.prior_logit = torch.zeros(self.k) 24 | self.prior_logit.requires_grad_() 25 | if self.independent_variance: 26 | self.log_std = torch.log(X.std(dim=0)).repeat(self.k, 1) 27 | self.log_std.requires_grad_() 28 | self.mean = torch.zeros(self.k, self.feature_size) 29 | self.mean.normal_() 30 | self.mean.requires_grad_() 31 | else: 32 | self.cholesky_inverse_cov = torch.linalg.cholesky(torch.cov(X.T)).repeat(self.k, 1, 1) 33 | self.cholesky_inverse_cov.requires_grad_() 34 | self.mean = torch.zeros(self.k, self.feature_size) 35 | self.mean.normal_() 36 | self.mean.requires_grad_() 37 | self.optimizer = torch.optim.Adam([self.log_std, self.mean, self.prior_logit], lr=self.learning_rate) 38 | 39 | previous_log_likelihood = -math.inf 40 | for step in range(self.max_step): 41 | ########################################## 42 | # Calculate Likelihood 43 | ########################################## 44 | # posterior probability of each sample in each Gaussian model 45 | # it is exactly the likelihood of parameters including mean, std and prior 46 | log_likelihood = self.log_likelihood(X, input_tensor=True, return_tensor=True) 47 | neg_log_likelihood = -log_likelihood.mean() 48 | 49 | if self.verbose: 50 | if step % 1000 == 0: 51 | print('Step', step, ', likelihood is', math.exp(-neg_log_likelihood)) 52 | 53 | ########################################## 54 | # Gradient Descent Step 55 | ########################################## 56 | self.optimizer.zero_grad() 57 | neg_log_likelihood.backward() 58 | self.optimizer.step() 59 | 60 | # early stopping 61 | log_likelihood = self.log_likelihood(X, input_tensor=True) 62 | if self.verbose: 63 | print('After step', step, ', likelihood of model parameters is', np.exp(log_likelihood)) 64 | if log_likelihood - previous_log_likelihood < self.epsilon: 65 | break 66 | previous_log_likelihood = log_likelihood 67 | 68 | def log_likelihood(self, X, input_tensor=False, return_tensor=False): 69 | if not input_tensor: 70 | X = torch.Tensor(X) 71 | pairwise_likelihood = self.pairwise_likelihood(X) 72 | log_likelihood = torch.log(pairwise_likelihood.sum(dim=0)).mean() 73 | if not return_tensor: 74 | log_likelihood = log_likelihood.detach().numpy() 75 | return log_likelihood 76 | 77 | 
def pairwise_likelihood(self, X): 78 | """return the likelihood of each x belonging to each gaussian distribution""" 79 | # dis[i, j, k] is the distance from i-th center to j-th sample, in k-th dimension 80 | dis = X[None, :, :] - self.mean[:, None, :] 81 | 82 | # calculate log likelihood first, then likelihood 83 | if self.independent_variance: 84 | # log_likelihood is of shape [k, n, feature_size] 85 | data_log_likelihood = -dis ** 2 * .5 / (torch.exp(self.log_std[:, None, :]) ** 2 + self.epsilon) \ 86 | - self.log_sqrt_2pi - self.log_std[:, None, :] 87 | # reduce likelihood to shape [k, n] 88 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 89 | data_log_likelihood = data_log_likelihood.sum(dim=-1) 90 | else: 91 | # log_likelihood is of shape [k, n] 92 | # data_log_likelihood[i, j] is the likelihood of j-th sample belonging to i-th center 93 | inverse_cov = self.cholesky_inverse_cov @ self.cholesky_inverse_cov.T 94 | data_log_likelihood = -.5 * (dis @ inverse_cov * dis).sum(axis=-1) \ 95 | +.5 * torch.linalg.slogdet(.5 / torch.pi * inverse_cov)[1][:, None] # slogdet returns [sign, logdet], we just need logdet 96 | 97 | likelihood = torch.exp(data_log_likelihood) 98 | # the posterior of each datium belonging to a distribution, of shape [k, n] 99 | pairwise_likelihood = torch.nn.functional.softmax(self.prior_logit)[:, None] * likelihood 100 | return pairwise_likelihood 101 | 102 | def predict(self, X): 103 | posterior = self.pairwise_likelihood(torch.Tensor(X)).detach().numpy() 104 | posterior /= (posterior.sum(axis=0, keepdims=True) + self.epsilon) 105 | return posterior 106 | 107 | 108 | if __name__ == '__main__': 109 | import numpy as np 110 | 111 | def demonstrate(desc, X): 112 | gmm = GMMGradientDescent(3) 113 | gmm.fit(X) 114 | pred = gmm.predict(X).T 115 | plt.scatter(X[:, 0], X[:, 1], color=pred) 116 | plt.title(desc) 117 | plt.show() 118 | 119 | # ---------------------- Example 1--------------------------------------------- 120 | X = np.concatenate([ 121 | np.random.normal([0, 0], [.3, .3], [100, 2]), 122 | np.random.normal([0, 1], [.3, .3], [100, 2]), 123 | np.random.normal([1, 0], [.3, .3], [100, 2]), 124 | ]) 125 | demonstrate("Example 1", X) 126 | 127 | # ---------------------- Example 2--------------------------------------------- 128 | demonstrate("Example 2: GMM does'nt promise the same result for the same data", X) 129 | 130 | # ---------------------- Example 3--------------------------------------------- 131 | X = np.concatenate([ 132 | np.random.normal([0, 0], [.4, .4], [100, 2]), 133 | np.random.normal([0, 1], [.4, .4], [100, 2]), 134 | np.random.normal([1, 0], [.4, .4], [100, 2]), 135 | ]) 136 | demonstrate("Example 3", X) 137 | 138 | # ---------------------- Example 4--------------------------------------------- 139 | X = np.concatenate([ 140 | np.random.normal([0, 0], [.4, .4], [100, 2]), 141 | np.random.normal([0, 3], [.4, .4], [100, 2]), 142 | np.random.normal([3, 0], [.4, .4], [100, 2]), 143 | ]) 144 | demonstrate("Example 4", X) 145 | 146 | # ---------------------- Example 5--------------------------------------------- 147 | X = np.concatenate([ 148 | np.random.normal([0, 0], [.4, .4], [1, 2]), 149 | np.random.normal([0, 3], [.4, .4], [1, 2]), 150 | np.random.normal([3, 0], [.4, .4], [1, 2]), 151 | ]) 152 | demonstrate("Example 5", X) 153 | -------------------------------------------------------------------------------- /09.EM/benchmark.py: -------------------------------------------------------------------------------- 1 | 
 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | 5 | from GMM import GMM 6 | from GMMGradientDescent import GMMGradientDescent 7 | 8 | def compare(X, k): 9 | gmm = GMM(k, verbose=False) 10 | gmm_gradient_descent = GMMGradientDescent(k, verbose=False) 11 | gmm.fit(X) 12 | gmm_gradient_descent.fit(X) 13 | gmm_likelihood = np.exp(gmm.log_likelihood(X)) 14 | gmm_gradient_descent_likelihood = np.exp(gmm_gradient_descent.log_likelihood(X)) 15 | return gmm_likelihood, gmm_gradient_descent_likelihood 16 | 17 | X = np.concatenate([ 18 | np.random.normal([0, 0], [.3, .3], [100, 2]), 19 | np.random.normal([0, 1], [.3, .3], [100, 2]), 20 | np.random.normal([1, 0], [.3, .3], [100, 2]), 21 | ]) 22 | gmm_likelihoods = [] 23 | gmm_gradient_descent_likelihoods = [] 24 | for i in range(50): 25 | print('Running comparison', i) 26 | gmm_likelihood, gmm_gradient_descent_likelihood = compare(X, 3) 27 | gmm_likelihoods.append(gmm_likelihood) 28 | gmm_gradient_descent_likelihoods.append(gmm_gradient_descent_likelihood) 29 | print('likelihood of EM algorithm is', gmm_likelihood) 30 | print('likelihood of gradient descent is', gmm_gradient_descent_likelihood) 31 | 32 | plt.boxplot([gmm_likelihoods, gmm_gradient_descent_likelihoods]) 33 | # plt.axes().set_xticklabels(["EM", "gradient descent"]) 34 | plt.show() -------------------------------------------------------------------------------- /10.HMM/Backward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rich.console import Console 3 | from rich.table import Table 4 | 5 | def backward(state2state, state2observation, initial_state, observation): 6 | """ 7 | Given a HMM with parameter (state2state, state2observation, initial_state) 8 | and the observation, 9 | return the probability of the observation generated by this HMM 10 | 11 | state2state is a matrix shaped of [state_size, state_size] 12 | state2observation is a matrix shaped of [state_size, observation_size] 13 | initial_state is a tensor shaped of [state_size], whose each dimension means the probability of each state 14 | observation is a matrix shaped of [data_size, sequence_length] 15 | 16 | where 17 | 18 | data_size is the number of all the data pieces 19 | state_size is the number of all the possible states 20 | observation_size is the number of all the possible observations 21 | sequence_length is the length of each sequence 22 | 23 | the return value consists of two parts: 24 | the probability of the observation, 25 | and a sequence of probability of each state of each step 26 | """ 27 | state_size, _ = state2state.shape 28 | data_size, sequence_length = observation.shape 29 | 30 | seq_state_likelihood = np.zeros([data_size, sequence_length, state_size]) 31 | state_likelihood = np.ones([state_size, data_size]) 32 | for i in range(sequence_length - 1, -1, -1): 33 | o = observation[:, i] 34 | # given the parameter of HMM and each possible state this step, get the probability of the following observation 35 | state_likelihood = state2state @ state_likelihood 36 | seq_state_likelihood[:, i, :] = state_likelihood.T 37 | # given the observation of this step, get the probability of this state 38 | state_likelihood = state_likelihood * state2observation[:, o] 39 | state_prob = state_likelihood.T * initial_state 40 | return state_prob.sum(axis=-1), seq_state_likelihood 41 | 42 | 43 | if __name__ == '__main__': 44 | def demonstrate(state2state, state2observation, initial_state, observation,
desc): 45 | console = Console(markup=False) 46 | prob = backward(state2state, state2observation, initial_state, observation)[0] 47 | 48 | # show in table 49 | print(desc) 50 | table = Table('sequence', 'prob') 51 | for o, p in zip(observation, prob): 52 | table.add_row(str(o), str(p)) 53 | table.add_row("Sum", str(sum(prob))) 54 | console.print(table) 55 | 56 | # ---------------------- Example 1 -------------------------------------------- 57 | state2state = np.array( 58 | [[.5, .2, .3], 59 | [.3, .5, .2], 60 | [.2, .3, .5]] 61 | ) 62 | state2observation = np.array( 63 | [[.5, .5], 64 | [.4, .6], 65 | [.7, .3]] 66 | ) 67 | initial_state = np.array([.2, .4, .4]) 68 | observation = np.array([ 69 | [0, 0, 0], 70 | [0, 0, 1], 71 | [0, 1, 0], 72 | [0, 1, 1], 73 | [1, 0, 0], 74 | [1, 0, 1], 75 | [1, 1, 0], 76 | [1, 1, 1], 77 | ]) 78 | demonstrate(state2state, state2observation, initial_state, observation, "Example 1") 79 | 80 | # ---------------------- Example 2 -------------------------------------------- 81 | state2state = np.array( 82 | [[.5, .5], 83 | [.5, .5]] 84 | ) 85 | state2observation = np.array( 86 | [[.5, .5], 87 | [.5, .5]] 88 | ) 89 | initial_state = np.array([.5, .5]) 90 | observation = np.array([ 91 | [0], 92 | [1], 93 | ]) 94 | demonstrate(state2state, state2observation, initial_state, observation, "Example 2") 95 | -------------------------------------------------------------------------------- /10.HMM/BaumWelch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '10.HMM')) 8 | from Backward import backward 9 | from Forward import forward 10 | 11 | def baum_welch(observation, state_size, observation_size, epsilon=1e-8, max_iteration=500): 12 | """ 13 | Given a batch of sequence of observation, 14 | return the parameter of the learnt HMM 15 | 16 | observation is a matrix shaped of [data_size, sequence_length] 17 | 18 | where 19 | 20 | data_size is the number of all the data initial_stateeces 21 | sequence_length is the length of each sequence 22 | 23 | """ 24 | data_size, sequence_legnth = observation.shape 25 | 26 | # initial parameters 27 | state2state = np.random.rand(state_size, state_size) 28 | state2observation = np.random.rand(state_size, observation_size) 29 | initial_state = np.random.rand(state_size) 30 | state2state /= state2state.sum(axis=-1, keepdims=True) 31 | state2observation /= state2observation.sum(axis=-1, keepdims=True) 32 | initial_state /= initial_state.sum() 33 | 34 | for _ in range(max_iteration): 35 | pre_state2state, pre_state2observation, pre_initial_state = state2state, state2observation, initial_state 36 | 37 | # Expectation step, from parameters to probability of states 38 | state_prob_forward = forward(state2state, state2observation, initial_state, observation)[1] 39 | state_likelihood_backward = backward(state2state, state2observation, initial_state, observation)[1] 40 | state_likelihood = state_prob_forward * state_likelihood_backward + epsilon 41 | 42 | state_likelihood_wrt_observation = state2observation.T[observation] 43 | 44 | state_prob = state_likelihood / state_likelihood.sum(axis=-1, keepdims=True) 45 | state_trans_prob = state_prob_forward[:, :-1, :, None] * \ 46 | state2state[None, None, :, :] * \ 47 | state_likelihood_wrt_observation[:, 1:, None, :] * \ 48 | 
state_likelihood_backward[:, 1:, None, :] 49 | state_trans_prob /= state_trans_prob.sum(axis=(-1, -2), keepdims=True) 50 | 51 | # Maximization step, from probability of states to parameters 52 | state2state = state_trans_prob.sum(axis=(0, 1)) / state_prob[:, :-1, :].sum(axis=(0, 1))[:, None] 53 | state2state /= state2state.sum(axis=-1, keepdims=True) 54 | state2observation = ((observation[:, :, None] == np.arange(observation_size)[None, None, :])[:, :, None, :] * 55 | state_prob[:, :, :, None]).sum(axis=(0, 1)) / state_prob.sum(axis=(0, 1))[:, None] 56 | initial_state = state_prob[:, 0].mean(axis=0) 57 | 58 | stride = np.mean([abs(pre_state2state - state2state).mean(), 59 | abs(pre_state2observation - state2observation).mean(), 60 | abs(pre_initial_state - initial_state).mean()]) 61 | if stride < epsilon: 62 | break 63 | return state2state, state2observation, initial_state 64 | 65 | 66 | if __name__ == '__main__': 67 | def demonstrate(observation, state_size, observation_size, desc): 68 | print(desc) 69 | state2state, state2observation, initial_state = baum_welch(observation, state_size, observation_size) 70 | print('state2state is:\n', np.round(state2state, 2)) 71 | print('state2observation is:\n', np.round(state2observation, 2)) 72 | print('initial_state is:\n', np.round(initial_state, 2)) 73 | print('') 74 | 75 | # Example 1 76 | observation = np.array([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]]) 77 | state_size = 2 78 | observation_size = 2 79 | demonstrate(observation, state_size, observation_size, "Example 1") 80 | 81 | # Example 2 82 | observation = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) 83 | state_size = 2 84 | observation_size = 2 85 | demonstrate(observation, state_size, observation_size, "Example 2") 86 | 87 | # Example 3 88 | observation = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) 89 | state_size = 2 90 | observation_size = 2 91 | demonstrate(observation, state_size, observation_size, "Example 3") 92 | 93 | # Example 3 94 | observation = np.array([[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) 95 | state_size = 3 96 | observation_size = 3 97 | demonstrate(observation, state_size, observation_size, "Example 4") 98 | -------------------------------------------------------------------------------- /10.HMM/Forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rich.console import Console 3 | from rich.table import Table 4 | 5 | def forward(state2state, state2observation, initial_state, observation): 6 | """ 7 | Given a HMM with parameter (state2state, state2observation, initial_state) 8 | and the observation, 9 | return the probability of the observation generated by this HMM 10 | 11 | state2state is a matrix shaped of [state_size, state_size] 12 | state2observation is a matrix shaped of [state_size, observation_size] 13 | initial_state is a tensor shaped of [state_size], whose each dimension means the probability of each state 14 | observation is a matrix shaped of [data_size, sequence_length] 15 | 16 | where 17 | 18 | data_size is the number of all the data initial_stateeces 19 | state_size is the number of all the possible states 20 | observation_size is the number of all the possible observations 21 | sequence_length is the length of each sequence 22 | 23 | the return value consists of two parts: 24 | the probability of the observation, 25 | and a sequence of probability of each state of each step 26 | """ 27 | state_size, _ = state2state.shape 28 | data_size, sequence_length = observation.shape 29 | 30 | 
seq_state_prob = np.zeros([data_size, sequence_length, state_size]) 31 | state_prob = initial_state[None, :] 32 | for i, o in enumerate(observation.T): 33 | # given the parameters of HMM, get the probability of this state with the previous observation 34 | state_prob = state_prob * state2observation.T[o] 35 | seq_state_prob[:, i, :] = state_prob 36 | # the probability of each state in next step 37 | state_prob = state_prob @ state2state 38 | return state_prob.sum(axis=-1), seq_state_prob 39 | 40 | 41 | if __name__ == '__main__': 42 | def demonstrate(state2state, state2observation, initial_state, observation, desc): 43 | console = Console(markup=False) 44 | prob = forward(state2state, state2observation, initial_state, observation)[0] 45 | 46 | # show in table 47 | print(desc) 48 | table = Table('sequence', 'prob') 49 | for o, p in zip(observation, prob): 50 | table.add_row(str(o), str(p)) 51 | table.add_row("Sum", str(sum(prob))) 52 | console.print(table) 53 | 54 | # ---------------------- Example 1 -------------------------------------------- 55 | state2state = np.array( 56 | [[.5, .2, .3], 57 | [.3, .5, .2], 58 | [.2, .3, .5]] 59 | ) 60 | state2observation = np.array( 61 | [[.5, .5], 62 | [.4, .6], 63 | [.7, .3]] 64 | ) 65 | initial_state = np.array([.2, .4, .4]) 66 | observation = np.array([ 67 | [0, 0, 0], 68 | [0, 0, 1], 69 | [0, 1, 0], 70 | [0, 1, 1], 71 | [1, 0, 0], 72 | [1, 0, 1], 73 | [1, 1, 0], 74 | [1, 1, 1], 75 | ]) 76 | demonstrate(state2state, state2observation, initial_state, observation, "Example 1") 77 | 78 | # ---------------------- Example 2 -------------------------------------------- 79 | state2state = np.array( 80 | [[.5, .5], 81 | [.5, .5]] 82 | ) 83 | state2observation = np.array( 84 | [[.5, .5], 85 | [.5, .5]] 86 | ) 87 | initial_state = np.array([.5, .5]) 88 | observation = np.array([ 89 | [0], 90 | [1], 91 | ]) 92 | demonstrate(state2state, state2observation, initial_state, observation, "Example 2") 93 | 94 | 95 | # ---------------------- Example 2 -------------------------------------------- 96 | state2state = np.array( 97 | [[.0, 1.], 98 | [1., .0]] 99 | ) 100 | state2observation = np.array( 101 | [[1., 0.], 102 | [0., 1.]] 103 | ) 104 | initial_state = np.array([0., 1.]) 105 | observation = np.array([ 106 | [1, 0, 1, 0, 1, 0], 107 | ]) 108 | demonstrate(state2state, state2observation, initial_state, observation, "Example 2") 109 | -------------------------------------------------------------------------------- /10.HMM/HMM.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from functools import partial 4 | import sys 5 | from pathlib import Path 6 | from rich.console import Console 7 | from rich.table import Table 8 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 9 | from utils import * 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '10.HMM')) 11 | from BaumWelch import baum_welch 12 | from Viterbi import viterbi 13 | 14 | class HMM: 15 | def __init__(self, state_size, observation_size, max_iteration=2000, verbose=False, epsilon=1e-8): 16 | self.max_iteration = max_iteration 17 | self.verbose = verbose 18 | self.state_size = state_size 19 | self.observation_size = observation_size 20 | self.epsilon = epsilon 21 | 22 | def fit(self, X): 23 | """ 24 | When there is no label in the training data, 25 | HMM uses baum-welch for training. 
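(Baum-Welch is the EM algorithm specialized to HMMs: the E-step runs the forward and backward passes to get state posteriors, and the M-step re-estimates the transition, emission and initial probabilities from them.)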
26 | Otherwise just counting the probability will be fine (not implemented here) 27 | """ 28 | self.state2state, self.state2observation, self.initial_state = \ 29 | baum_welch(X, self.state_size, self.observation_size, self.epsilon, self.max_iteration) 30 | 31 | def predict(self, X): 32 | """HMM uses viterbi for predicting""" 33 | Y = np.zeros_like(X) 34 | Y = np.apply_along_axis( 35 | partial(viterbi, self.state2state, self.state2observation, self.initial_state), -1, X) 36 | return Y 37 | 38 | 39 | if __name__ == '__main__': 40 | def demonstrate(X, testX, desc): 41 | console = Console(markup=False) 42 | 43 | vocab = set(X.flatten()) 44 | vocab_size = len(vocab) 45 | word2num = {word: num for num, word in enumerate(vocab)} 46 | 47 | f_word2num = np.vectorize(lambda word: word2num[word]) 48 | 49 | numX, num_testX = map(f_word2num, (X, testX)) 50 | 51 | hmm = HMM(4, vocab_size) 52 | hmm.fit(numX) 53 | pred = hmm.predict(num_testX) 54 | 55 | # show in table 56 | print(desc) 57 | table = Table() 58 | for x, p in zip(testX, pred): 59 | table.add_row(*map(str, x)) 60 | table.add_row(*map(str, p)) 61 | console.print(table) 62 | 63 | 64 | # ---------------------- Example 1 -------------------------------------------- 65 | X = np.array([s.split() for s in 66 | ['i am good .', 67 | 'i am bad .', 68 | 'you are good .', 69 | 'you are bad .', 70 | 'it is good .', 71 | 'it is bad .', 72 | ] 73 | ]) 74 | testX = X 75 | demonstrate(X, testX, "Example 1") 76 | 77 | # ---------------------- Example 2 -------------------------------------------- 78 | testX = np.array([s.split() for s in 79 | ['you is good .', 80 | 'i are bad .', 81 | 'it are good .'] 82 | ]) 83 | testX = np.concatenate([X, testX]) 84 | demonstrate(X, testX, "Example 2") 85 | -------------------------------------------------------------------------------- /10.HMM/Viterbi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def viterbi(state2state, state2observation, initial_state, observation): 4 | """ 5 | Given a HMM with parameter (state2state, state2observation, initial_state) 6 | and the observation, 7 | return the most possible state sequence 8 | 9 | state2state is a matrix shaped of [state_size, state_size] 10 | state2observation is a matrix shaped of [state_size, observation_size] 11 | initial_state is a tensor shaped of [state_size], whose each dimension means the probability of each state 12 | observation is a tensor shaped of [sequence_length] 13 | observation_size is the number of all the possible observations 14 | """ 15 | sequence_length, = observation.shape 16 | state_size, _ = state2state.shape 17 | 18 | state_prob = initial_state 19 | pre_state = np.zeros([sequence_length, state_size]).astype(int) 20 | for i, o in enumerate(observation): 21 | state_prob *= state2observation[:, o] 22 | if i != sequence_length - 1: 23 | trans_prob = state_prob[:, None] * state2state 24 | pre_state[i + 1] = trans_prob.argmax(axis=0) 25 | state_prob = trans_prob.max(axis=0) 26 | ans = np.zeros(sequence_length).astype(int) 27 | ans[-1] = state_prob.argmax() 28 | for i in range(sequence_length - 2, -1, -1): 29 | ans[i] = pre_state[i + 1, ans[i + 1]] 30 | return ans 31 | 32 | 33 | if __name__ == '__main__': 34 | state2state = np.array([[.5, .2, .3], 35 | [.3, .5, .2], 36 | [.2, .3, .5]]) 37 | state2observation = np.array([[.5, .5], 38 | [.4, .6], 39 | [.7, .3]]) 40 | initial_state = np.array([.2, .4, .4]) 41 | observation = np.array([0, 1, 0]) 42 | print(viterbi(state2state, 
state2observation, initial_state, observation)) 43 | -------------------------------------------------------------------------------- /11.ConditionalRandomField/LinearChainConditionalRandomField.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | import os 3 | from matplotlib.tri.triinterpolate import LinearTriInterpolator 4 | import numpy as np 5 | from functools import partial 6 | import sys 7 | from pathlib import Path 8 | from rich.console import Console 9 | from rich.table import Table 10 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 11 | from utils import * 12 | 13 | class LinearChainConditionalRandomField: 14 | def __init__(self, feature_funcs, trans_feature_funcs, sequence_length, n_x, n_y, max_iteration=100, verbose=False): 15 | """ 16 | `feature_funcs` are a group of functions s(y_i, X, i) in a list 17 | `trans_feature_funcs` are a group of functions t(y_{i-1}, y_i, X, i) in a list 18 | `sequence_length` is the length of each input sequence 19 | `n_x` is the number of possible values of each item in a sequence x 20 | `n_y` is the number of possible values of each item in a sequence y 21 | """ 22 | self.feature_funcs = feature_funcs 23 | self.trans_feature_funcs = trans_feature_funcs 24 | self.n_x = n_x 25 | self.n_y = n_y 26 | self.sequence_length = sequence_length 27 | self.max_iteration = max_iteration 28 | self.verbose = verbose 29 | 30 | def get_trans(self, x): 31 | """get transition matrix given observed sequence x""" 32 | trans_feature = np.zeros([self.sequence_length, self.n_y, self.n_y]) 33 | for i in range(self.sequence_length): 34 | for y_i_1 in range(self.n_y): 35 | for y_i in range(self.n_y): 36 | for j, func in enumerate(self.used_feature_funcs): 37 | trans_feature[i, y_i_1, y_i] += self.w_feature_funcs[j] * func(y_i, x, i) 38 | if i > 0: 39 | for y_i_1 in range(self.n_y): 40 | for y_i in range(self.n_y): 41 | for j, func in enumerate(self.used_trans_feature_funcs): 42 | trans_feature[i, y_i_1, y_i] += self.w_trans_feature_funcs[j] * func(y_i_1, y_i, x, i) 43 | return np.exp(trans_feature) 44 | 45 | def fit(self, X, Y): 46 | """ 47 | X is a two dimensional matrix of observation sequence 48 | Y is a two dimensional matrix of hidden state sequence 49 | optimize weights by Improved Iterative Scaling 50 | """ 51 | E_feature = np.zeros(len(self.feature_funcs)) 52 | E_trans_feature = np.zeros(len(self.trans_feature_funcs)) 53 | 54 | # Because each x is a sequence, it's vector space is too large to iterate. 55 | # We need to store all the possible sequence x during the training time 56 | # and only iterate over existing x. 57 | p_x = {tuple(x): 0. 
for x in X} 58 | 59 | for x, y in zip(X, Y): 60 | x_key = tuple(x) 61 | p_x[x_key] += 1 / len(X) 62 | for i, yi in enumerate(y): 63 | for j, func in enumerate(self.feature_funcs): 64 | E_feature[j] += func(yi, x, i) / len(X) 65 | for i in range(1, self.sequence_length): 66 | yi_1, yi = y[i - 1], y[i] 67 | for j, func in enumerate(self.trans_feature_funcs): 68 | E_trans_feature[j] += func(yi_1, yi, x, i) / len(X) 69 | 70 | # features that don't show in training data are useless, filter them 71 | self.used_feature_funcs = [func for E, func in zip(E_feature, self.feature_funcs) if E != 0] 72 | self.used_trans_feature_funcs = [func for E, func in zip(E_trans_feature, self.trans_feature_funcs) if E != 0] 73 | E_feature = E_feature[E_feature.nonzero()] 74 | E_trans_feature = E_trans_feature[E_trans_feature.nonzero()] 75 | self.w_feature_funcs = np.zeros(len(self.used_feature_funcs)) 76 | self.w_trans_feature_funcs = np.zeros(len(self.used_trans_feature_funcs)) 77 | 78 | # pre-calculate all the possible values of feature functions 79 | feature = np.zeros([len(self.used_feature_funcs), len(p_x), self.sequence_length, self.n_y]) 80 | trans_feature = np.zeros([len(self.used_trans_feature_funcs), len(p_x), self.sequence_length, self.n_y, self.n_y]) 81 | for x_i, x_key in enumerate(p_x): 82 | x = np.array(x_key) 83 | for func_i, func in enumerate(self.used_trans_feature_funcs): 84 | for i in range(1, self.sequence_length): 85 | for y_i_1 in range(self.n_y): 86 | for y_i in range(self.n_y): 87 | trans_feature[func_i, x_i, i, y_i_1, y_i] = func(y_i_1, y_i, x, i) 88 | for func_i, func in enumerate(self.used_feature_funcs): 89 | for i in range(self.sequence_length): 90 | for y_i in range(self.n_y): 91 | feature[func_i, x_i, i, y_i] = func(y_i, x, i) 92 | 93 | # pre-calculate the max number of features, given x 94 | max_feature = np.zeros(len(p_x), dtype=int) 95 | sum_trans_feature = trans_feature.sum(axis=0) 96 | sum_feature = feature.sum(axis=0) 97 | for x_i, x_key in enumerate(p_x): 98 | cur_max_feature = np.zeros(self.n_y) 99 | for i in range(self.sequence_length): 100 | cur_max_feature = (cur_max_feature[:, None] + sum_trans_feature[x_i, i]).max(axis=0) + sum_feature[x_i, i] 101 | max_feature[x_i] = cur_max_feature.max() 102 | n_coef = max(max_feature) + 1 103 | 104 | # train 105 | for iteration in range(self.max_iteration): 106 | if self.verbose: 107 | print(f'Iteration {iteration} starts...') 108 | loss = 0. 
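# (added commentary, not part of the original source) The nested loop below is
# one sweep of Improved Iterative Scaling, as named in the fit() docstring.
# For every feature function f_k it looks for an increment delta_k such that
# the empirical expectation E_data[f_k] equals
#     sum_x p~(x) * E_{p(y|x)}[f_k] * beta^{T(x)},
# where beta = exp(delta_k) and T(x) is the (maximum) total feature count of
# sequence x, precomputed in `max_feature`. Grouping the x's by T(x) turns
# this condition into a polynomial in beta whose coefficients are accumulated
# in `coef`; its root is found with Newton's method and delta_k = log(beta)
# is added to the corresponding weight. The forward/backward passes inside
# the loop supply E_{p(y|x)}[f_k] without enumerating all label sequences y.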
109 | for funcs, w, E_experience in zip( 110 | [self.used_feature_funcs, self.used_trans_feature_funcs], 111 | [self.w_feature_funcs, self.w_trans_feature_funcs], 112 | [E_feature, E_trans_feature]): 113 | for func_i in range(len(funcs)): 114 | # if funcs is self.used_trans_feature_funcs: 115 | coef = np.zeros(n_coef) 116 | # only iterater over possible x 117 | for x_i, x_key in enumerate(p_x): 118 | cur_p_x = p_x[x_key] 119 | x = np.array(x_key) 120 | 121 | trans = self.get_trans(x) 122 | # forward algorithm 123 | cur_prob = np.ones(self.n_y) 124 | forward_prob = np.zeros([self.sequence_length + 1, self.n_y]) 125 | forward_prob[0] = cur_prob 126 | for i in range(self.sequence_length): 127 | cur_prob = cur_prob @ trans[i] 128 | forward_prob[i + 1] = cur_prob 129 | # backward algorithm 130 | cur_prob = np.ones(self.n_y) 131 | backward_prob = np.zeros([self.sequence_length + 1, self.n_y]) 132 | backward_prob[-1] = cur_prob 133 | for i in range(self.sequence_length - 1, -1, -1): 134 | cur_prob = trans[i] @ cur_prob 135 | backward_prob[i] = cur_prob 136 | 137 | if iteration < 10: 138 | np.testing.assert_almost_equal( 139 | forward_prob[-1].sum(), 140 | backward_prob[0].sum() 141 | ) 142 | for i in range(1, self.sequence_length + 1): 143 | np.testing.assert_almost_equal( 144 | forward_prob[i] @ backward_prob[i], 145 | forward_prob[-1].sum() 146 | ) 147 | for i in range(0, self.sequence_length): 148 | np.testing.assert_almost_equal( 149 | (np.outer(forward_prob[i], backward_prob[i + 1]) * trans[i]).sum(), 150 | forward_prob[-1].sum() 151 | ) 152 | 153 | # calculate expectation of each feature_function given x 154 | cur_E_feature = 0. 155 | if funcs is self.used_feature_funcs: 156 | for i in range(1, self.sequence_length + 1): 157 | cur_E_feature += ( 158 | forward_prob[i] * backward_prob[i] * feature[func_i, x_i, i - 1] 159 | ).sum() 160 | elif funcs is self.used_trans_feature_funcs: 161 | for i in range(0, self.sequence_length): 162 | cur_E_feature += ( 163 | np.outer(forward_prob[i], backward_prob[i + 1]) * trans[i] * trans_feature[func_i, x_i, i] 164 | ).sum() 165 | else: 166 | raise Exception("Unknown function set!") 167 | cur_E_feature /= forward_prob[-1].sum() 168 | 169 | coef[max_feature[x_i]] += cur_p_x * cur_E_feature 170 | 171 | # update w 172 | dw_i = log(newton( 173 | lambda x: sum(c * x ** i for i, c in enumerate(coef)) - E_experience[func_i], 174 | lambda x: sum(i * c * x ** (i - 1) for i, c in enumerate(coef) if i > 0), 175 | 1 176 | )) 177 | w[func_i] += dw_i 178 | loss += abs(E_experience[func_i] - coef.sum()) 179 | loss /= len(self.feature_funcs) + len(self.trans_feature_funcs) 180 | if self.verbose: 181 | print(f'Iteration {iteration} ends, Loss: {loss}') 182 | 183 | def predict(self, X): 184 | """ 185 | predict state sequence y using viterbi algorithm 186 | X is a group of sequence x in a two-dimensional array 187 | """ 188 | 189 | ans = np.zeros([len(X), self.sequence_length]) 190 | for x_i, x in enumerate(X): 191 | # pre-calculate all the possible values of feature functions 192 | feature = np.zeros([len(self.used_feature_funcs), self.sequence_length, self.n_y]) 193 | trans_feature = np.zeros([len(self.used_trans_feature_funcs), self.sequence_length, self.n_y, self.n_y]) 194 | for func_i, func in enumerate(self.used_trans_feature_funcs): 195 | for i in range(1, self.sequence_length): 196 | for y_i_1 in range(self.n_y): 197 | for y_i in range(self.n_y): 198 | trans_feature[func_i, i, y_i_1, y_i] = func(y_i_1, y_i, x, i) 199 | for func_i, func in 
enumerate(self.used_feature_funcs): 200 | for i in range(self.sequence_length): 201 | for y_i in range(self.n_y): 202 | feature[func_i, i, y_i] = func(y_i, x, i) 203 | feature = (self.w_feature_funcs[:, None, None] * feature).sum(axis=0) 204 | trans_feature = (self.w_trans_feature_funcs[:, None, None, None] * trans_feature).sum(axis=0) 205 | 206 | # viterbi 207 | pre_state = np.zeros([self.sequence_length, self.n_y], dtype=int) - 1 208 | prob = np.zeros([self.sequence_length, self.n_y]) 209 | cur_prob = np.ones(self.n_y) 210 | for i in range(self.sequence_length): 211 | trans_prob = cur_prob[:, None] + trans_feature[i] 212 | pre_state[i] = trans_prob.argmax(axis=0) 213 | cur_prob = trans_prob.max(axis=0) + feature[i] 214 | prob[i] = cur_prob 215 | 216 | # back track the trace 217 | cur_state = prob[-1].argmax() 218 | for i in range(self.sequence_length - 1, -1, -1): 219 | ans[x_i, i] = cur_state 220 | cur_state = pre_state[i, cur_state] 221 | return ans 222 | 223 | 224 | if __name__ == '__main__': 225 | def demonstrate(X, Y, testX, n_y, desc): 226 | console = Console(markup=False) 227 | 228 | vocab = set(X.flatten()) 229 | vocab_size = len(vocab) 230 | word2num = {word: num for num, word in enumerate(vocab)} 231 | 232 | f_word2num = np.vectorize(lambda word: word2num[word]) 233 | 234 | numX, num_testX = map(f_word2num, (X, testX)) 235 | 236 | sequence_length = numX.shape[-1] 237 | 238 | class FeatureFunc: 239 | def __init__(self, x_i, y_i): 240 | self.x_i = x_i 241 | self.y_i = y_i 242 | 243 | def __call__(self, y_i, x, i): 244 | return int(y_i == self.y_i and x[i] == self.x_i) 245 | 246 | class TransFeatureFunc: 247 | def __init__(self, y_i_1, y_i): 248 | self.y_i = y_i 249 | self.y_i_1 = y_i_1 250 | 251 | def __call__(self, y_i_1, y_i, x, i): 252 | return int(y_i_1 == self.y_i_1 and y_i == self.y_i) 253 | 254 | feature_funcs = [FeatureFunc(x_i, y_i) 255 | for x_i in range(vocab_size) 256 | for y_i in range(n_y)] 257 | trans_feature_funcs = [TransFeatureFunc(y_i_1, y_i) 258 | for y_i_1 in range(n_y) 259 | for y_i in range(n_y)] 260 | 261 | linear_chain_conditional_random_field = LinearChainConditionalRandomField( 262 | feature_funcs, 263 | trans_feature_funcs, 264 | sequence_length, 265 | vocab_size, 266 | n_y, 267 | verbose=True 268 | ) 269 | linear_chain_conditional_random_field.fit(numX, Y) 270 | pred = linear_chain_conditional_random_field.predict(num_testX) 271 | 272 | # show in table 273 | print(desc) 274 | table = Table() 275 | for x, p in zip(testX, pred): 276 | table.add_row(*map(str, x)) 277 | table.add_row(*map(str, p)) 278 | console.print(table) 279 | 280 | 281 | # ---------------------- Example 1 -------------------------------------------- 282 | X = np.array([s.split() for s in 283 | ['i am good .', 284 | 'i am bad .', 285 | 'you are good .', 286 | 'you are bad .', 287 | 'it is good .', 288 | 'it is bad .', 289 | ] 290 | ]) 291 | Y = np.array([ 292 | [0, 1, 2, 3], 293 | [0, 1, 2, 3], 294 | [0, 1, 2, 3], 295 | [0, 1, 2, 3], 296 | [0, 1, 2, 3], 297 | ]) 298 | testX = np.array([s.split() for s in 299 | ['you is good .', 300 | 'i are bad .', 301 | 'it are good .'] 302 | ]) 303 | testX = np.concatenate([X, testX]) 304 | demonstrate(X, Y, testX, 4, "Example 1") 305 | 306 | # ---------------------- Example 1 -------------------------------------------- 307 | X = np.array([s.split() for s in 308 | ['i be good .', 309 | 'you be good .', 310 | 'be good . .', 311 | 'i love you .', 312 | 'he be . 
.', 313 | ] 314 | ]) 315 | # pronoun: 0, verb: 1, adjective: 2, ".": 3 316 | Y = np.array([ 317 | [0, 1, 2, 3], 318 | [0, 1, 2, 3], 319 | [1, 2, 3, 3], 320 | [0, 1, 0, 3], 321 | [0, 1, 3, 3], 322 | ]) 323 | testX = np.array([s.split() for s in 324 | ['you be good .', 325 | 'he love you .', 326 | 'i love good .', 327 | '. be love .', 328 | '. love be .', 329 | '. . be good'] 330 | ]) 331 | testX = np.concatenate([X, testX]) 332 | demonstrate(X, Y, testX, 4, "Example 2") 333 | -------------------------------------------------------------------------------- /14.Cluster/Agglomerative.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import euc_dis 8 | 9 | class Agglomerative: 10 | def __init__(self, k): 11 | self.k = k 12 | 13 | def get_root(self, i): 14 | if self.parent[i] != i: 15 | self.parent[i] = self.get_root(self.parent[i]) 16 | return self.parent[i] 17 | 18 | def fit_predict(self, X): 19 | """ 20 | X is a matrix shaped of [data_size, feature_size] 21 | """ 22 | data_size, feature_size = X.shape 23 | self.cluster_num = data_size 24 | 25 | self.parent = [i for i in range(data_size)] 26 | dis = euc_dis(X[:, None, :], X[None, :, :]) 27 | sorted_a, sorted_b = np.unravel_index(np.argsort(dis, axis=None), dis.shape) 28 | for a, b in zip(sorted_a, sorted_b): 29 | root_a, root_b = self.get_root(a), self.get_root(b) 30 | if root_a != root_b: 31 | if root_a > root_b: 32 | root_a, root_b = root_b, root_a 33 | self.parent[root_b] = root_a 34 | 35 | self.cluster_num -= 1 36 | if self.cluster_num <= self.k: 37 | break 38 | 39 | root = [self.get_root(i) for i in range(data_size)] 40 | root_map = {n: i for i, n in enumerate(sorted(list(set(root))))} 41 | return [root_map[r] for r in root] 42 | 43 | 44 | if __name__ == "__main__": 45 | def demonstrate(X, k, desc): 46 | agglomerative = Agglomerative(k=k) 47 | pred = agglomerative.fit_predict(X) 48 | 49 | # plot 50 | plt.scatter(X[:,0], X[:,1], c=pred, s=20) 51 | plt.title(desc) 52 | plt.show() 53 | 54 | # -------------------------- Example 1 ---------------------------------------- 55 | X = np.array([[0, 0], [0, 1], [1, 0], [2, 2], [2, 1], [1, 2]]) 56 | # generate grid-shaped test data 57 | demonstrate(X, 2, "Example 1") 58 | 59 | # -------------------------- Example 2 ---------------------------------------- 60 | X = np.concatenate([ 61 | np.random.normal([0, 0], [.3, .3], [100, 2]), 62 | np.random.normal([0, 1], [.3, .3], [100, 2]), 63 | np.random.normal([1, 0], [.3, .3], [100, 2]), 64 | ]) 65 | # generate grid-shaped test data 66 | demonstrate(X, 3, "Example 2: it is very sensitive to noise") 67 | -------------------------------------------------------------------------------- /14.Cluster/KMeans.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import euc_dis 8 | 9 | class KMeans: 10 | def __init__(self, k, max_iterations=1000, verbose=False): 11 | self.k = k 12 | self.max_iterations = max_iterations 13 | self.verbose = verbose 14 | 15 | def fit(self, X): 16 | """ 17 | X is a matrix shaped of [data_size, feature_size] 18 | """ 19 | X = X.astype(float) 20 | data_size, 
feature_size = X.shape 21 | 22 | self.centers = X[np.random.choice(data_size, self.k, replace=False)] 23 | pre_centers = self.centers - 1 24 | step = 0 25 | if self.verbose: 26 | print('Initial centroids:', self.centers) 27 | while (pre_centers != self.centers).any(): 28 | pre_centers = self.centers.copy() 29 | # distance from each data sample to the centroid 30 | # dis[i, j] is the distance from i-th data sample to the j-th centroid 31 | # shape: [data_size, k] 32 | dis = euc_dis(X[:, None, :], self.centers[None, :, :]) 33 | # assignment of each data sample to centroid 34 | # cluster[i] is the index of cluster of i-th data sample 35 | # shape: [data_size] 36 | cluster = dis.argmin(axis=-1) 37 | for i in range(self.k): 38 | self.centers[i] = X[cluster == i].mean(axis=0) 39 | step += 1 40 | if self.verbose: 41 | print('Step', step) 42 | print('Assignment:', cluster) 43 | print('Centroids:', self.centers) 44 | if step == self.max_iterations: 45 | break 46 | 47 | def predict(self, X): 48 | dis = euc_dis(X[:, None, :], self.centers[None, :, :]) 49 | return dis.argmin(axis=-1) 50 | 51 | if __name__ == "__main__": 52 | def demonstrate(X, k, desc): 53 | k_means = KMeans(k=k, verbose=True) 54 | k_means.fit(X) 55 | pred = k_means.predict(X) 56 | 57 | # plot 58 | plt.scatter(k_means.centers[:, 0], k_means.centers[:,1], marker='x', label='centroids') 59 | plt.scatter(X[:,0], X[:,1], c=pred, s=20, label='data samples') 60 | plt.legend() 61 | plt.title(desc) 62 | plt.show() 63 | 64 | # -------------------------- Example 1 ---------------------------------------- 65 | X = np.array([[0, 0], [0, 1], [1, 0], [2, 2], [2, 1], [1, 2]]).astype(float) 66 | demonstrate(X, 2, "Example 1") 67 | 68 | # -------------------------- Example 2 ---------------------------------------- 69 | X = np.concatenate([ 70 | np.random.normal([0, 0], [.3, .3], [100, 2]), 71 | np.random.normal([0, 1], [.3, .3], [100, 2]), 72 | np.random.normal([1, 0], [.3, .3], [100, 2]), 73 | ]).astype(float) 74 | demonstrate(X, 3, "Example 2") 75 | 76 | # -------------------------- Example 3 ---------------------------------------- 77 | X = np.array([[0, 0], [0, 1], [0, 3]]).astype(float) 78 | demonstrate(X, 2, "Example 3: K-Means doesn't always return the best answer. 
(try to run multiple times!)") 79 | -------------------------------------------------------------------------------- /15.SVD/SVD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def svd(A): 9 | """ 10 | given an m x n matrix, 11 | return the result of SVD, 12 | as a tuple of (U, Sigma, V) 13 | """ 14 | m , n = A.shape 15 | 16 | symmetry = A.T @ A 17 | rank = np.linalg.matrix_rank(symmetry) 18 | eigen_values, eigen_vectors = np.linalg.eig(symmetry) 19 | eigen_order = eigen_values.argsort()[::-1] 20 | eigen_values = eigen_values[eigen_order] 21 | 22 | eigen_values = eigen_values[: rank] 23 | eigen_vectors = eigen_vectors[:, eigen_order] 24 | # V is of shape [n, n] 25 | V = eigen_vectors 26 | eigen_vectors = eigen_vectors[:, : rank] 27 | 28 | singular_values = np.sqrt(eigen_values) 29 | singular_matrix = np.zeros_like(A) 30 | for i, v in enumerate(singular_values): 31 | singular_matrix[i][i] = v 32 | 33 | U1 = A @ eigen_vectors / singular_values 34 | U2 = get_solution_domain(row_echelon(A.T)) 35 | U = np.concatenate([U1, U2], axis=-1) 36 | return U, singular_matrix, V 37 | 38 | 39 | if __name__ == '__main__': 40 | def demonstrate(A, desc): 41 | print(desc) 42 | U, singular_matrix, V = svd(A) 43 | print("U is:") 44 | print(U) 45 | print("Singular matrix is:") 46 | print(singular_matrix) 47 | print("V is:") 48 | print(V) 49 | print("The reconstructed matrix is:") 50 | print(U @ singular_matrix @ V.T) 51 | 52 | A = np.array([[1, 1], 53 | [2, 2], 54 | [0, 0]]).astype(float) 55 | demonstrate(A, 'Example 1') 56 | 57 | A = np.array([[1, 0, 0, 0], 58 | [0, 0, 0, 4], 59 | [0, 3, 0, 0], 60 | [0, 0, 0, 0], 61 | [2, 0, 0, 0]]).astype(float) 62 | demonstrate(A, 'Example 2') 63 | 64 | A = np.array([[3, 1], 65 | [2, 1]]).astype(float) 66 | demonstrate(A, 'Example 3') 67 | -------------------------------------------------------------------------------- /16.PCA/PCA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / '15.SVD')) 8 | from SVD import svd 9 | 10 | def pca(X, k=5): 11 | """ 12 | given a normlized matrix X, each of whose column is a sample 13 | the dimension of the principle component, k 14 | return the principle component matrix 15 | """ 16 | m, n = X.shape 17 | X_trans = 1 / sqrt(n - 1) * X.T 18 | _, _, V = svd(X_trans) 19 | V = V[:, :k] 20 | return V.T @ X 21 | 22 | if __name__ == '__main__': 23 | def demonstrate(X, k, desc): 24 | print(desc) 25 | X -= X.mean(axis=-1, keepdims=True) 26 | X_trans = pca(X, k=k) 27 | print(X_trans) 28 | 29 | X = np.array([[1, 1], 30 | [2, 2], 31 | [0, 0]]).astype(float) 32 | demonstrate(X, 1, 'Example 1') 33 | 34 | X = np.array([[1, 0, 0, 0], 35 | [0, 0, 0, 4], 36 | [0, 3, 0, 0], 37 | [0, 0, 0, 0], 38 | [2, 0, 0, 0]]).astype(float) 39 | demonstrate(X, 1, 'Example 2') 40 | 41 | X = np.array([[3, 1], 42 | [2, 1]]).astype(float) 43 | demonstrate(X, 1, 'Example 3') 44 | 45 | X = np.array([[0, 0], 46 | [-1, 1]]).astype(float) 47 | demonstrate(X, 1, 'Example 3') 48 | -------------------------------------------------------------------------------- /17.LSA/LSA.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def lsa(word_text, k=5, max_iteration=1000): 9 | """ 10 | given a word-text matrix 11 | the dimension of the principle component, k 12 | optimize using the algorithm proposed by Lee and Seung 13 | return the word-topic matrix and text-topic matrix 14 | """ 15 | n_word, n_text = word_text.shape 16 | word_topic = np.random.rand(n_word, k) 17 | topic_text = np.random.rand(k, n_text) 18 | for i in range(max_iteration): 19 | word_topic *= (word_text @ topic_text.T) / (word_topic @ topic_text @ topic_text.T) 20 | topic_text *= (word_topic.T @ word_text) / (word_topic.T @ word_topic @ topic_text) 21 | return word_topic, topic_text.T 22 | 23 | if __name__ == '__main__': 24 | def demonstrate(X, k, desc): 25 | print(desc) 26 | word_topic, text_topic = lsa(X, k=k) 27 | print("The topic vectors of all the words are") 28 | print(word_topic) 29 | print("The topic vectors of all the texts are") 30 | print(text_topic) 31 | print("The recovered word-text matrix is") 32 | print(np.round(word_topic @ text_topic.T)) 33 | 34 | X = np.array([ 35 | [0, 0, 1, 1, 0, 0, 0, 0, 0], 36 | [0, 0, 0, 0, 0, 1, 0, 0, 1], 37 | [0, 1, 0, 0, 0, 0, 0, 1, 0], 38 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 39 | [1, 0, 0, 0, 0, 1, 0, 0, 0], 40 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 41 | [1, 0, 1, 0, 0, 0, 0, 0, 0], 42 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 43 | [0, 0, 0, 0, 0, 2, 0, 0, 1], 44 | [1, 0, 1, 0, 0, 0, 0, 1, 0], 45 | [0, 0, 0, 1, 1, 0, 0, 0, 0], 46 | ]).astype(float) 47 | demonstrate(X, 3, 'Example 1') 48 | -------------------------------------------------------------------------------- /18.PLSA/PLSA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def plsa(word_text, k=5, max_iteration=1000, epsilon=1e-8): 9 | """ 10 | given a word-text matrix 11 | the dimension of the principle component, k 12 | optimize using EM algorithm 13 | return the word-topic matrix and text-topic matrix 14 | """ 15 | n_word, n_text = word_text.shape 16 | p_topic_when_text = np.random.rand(n_text, k) 17 | p_word_when_topic = np.random.rand(k, n_word) 18 | 19 | text_word = word_text.T 20 | text_word_cnt = text_word.sum(axis=-1, keepdims=True) 21 | for i in range(max_iteration): 22 | # E step: calculate the expectation of each topic for each word-text pair 23 | p_topic_when_text_word = p_topic_when_text[:, :, None] * p_word_when_topic[None, :, :] 24 | p_topic_when_text_word /= p_topic_when_text_word.sum(axis=1, keepdims=True) + epsilon 25 | 26 | # M step, maximazation the likelihood of the observation, i.e., the word-text matrix 27 | topic_cnt = text_word[:, None, :] * p_topic_when_text_word 28 | p_word_when_topic = (topic_cnt).sum(axis=0) / \ 29 | (topic_cnt).sum(axis=0).sum(axis=-1, keepdims=True) 30 | p_topic_when_text = (text_word[:, None, :] * p_topic_when_text_word).sum(axis=-1) / text_word_cnt 31 | return p_topic_when_text, p_word_when_topic 32 | 33 | if __name__ == '__main__': 34 | def demonstrate(X, k, desc): 35 | print(desc) 36 | p_topic_when_text, p_word_when_topic = plsa(X, k=k) 37 | print("The probabilities of each topic for each text are") 38 | print(np.round(p_topic_when_text, 2)) 
39 | print("The probabilities of each word for each topic are") 40 | print(np.round(p_word_when_topic, 2)) 41 | print("The recovered text-wordcnt matrix is") 42 | print(np.round((p_topic_when_text @ p_word_when_topic).T, 2)) 43 | print() 44 | 45 | X = np.array([ 46 | [0, 0, 1, 1, 0, 0, 0, 0, 0], 47 | [0, 0, 0, 0, 0, 1, 0, 0, 1], 48 | [0, 1, 0, 0, 0, 0, 0, 1, 0], 49 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 50 | [1, 0, 0, 0, 0, 1, 0, 0, 0], 51 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 52 | [1, 0, 1, 0, 0, 0, 0, 0, 0], 53 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 54 | [0, 0, 0, 0, 0, 2, 0, 0, 1], 55 | [1, 0, 1, 0, 0, 0, 0, 1, 0], 56 | [0, 0, 0, 1, 1, 0, 0, 0, 0], 57 | ]).astype(float) 58 | demonstrate(X, 3, 'Example 1') 59 | 60 | X = np.array([ 61 | [0, 0, 1, 1, 0, 0, 0, 0, 0], 62 | [0, 0, 0, 0, 0, 1, 0, 0, 1], 63 | [0, 1, 0, 0, 0, 0, 0, 1, 0], 64 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 65 | [1, 0, 0, 0, 0, 1, 0, 0, 0], 66 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 67 | [1, 0, 1, 0, 0, 0, 0, 0, 0], 68 | [0, 0, 0, 0, 0, 0, 1, 0, 1], 69 | [0, 0, 0, 0, 0, 2, 0, 0, 1], 70 | [1, 0, 1, 0, 0, 0, 0, 1, 0], 71 | [0, 0, 0, 1, 1, 0, 0, 0, 0], 72 | ]).astype(float) 73 | demonstrate(X, max(X.shape), 'Example 2: You can recogonize the original matrix from the recovered one if k is large enough') 74 | -------------------------------------------------------------------------------- /19.MCMC/GibbsSampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.stats import gaussian_kde 4 | 5 | 6 | def gibbs_sampling(dim, conditional_sampler, x0=None, burning_steps=1000, max_steps=10000, epsilon=1e-8, verbose=False): 7 | """ 8 | Given a conditionl sampler which samples from p(x_j | x_1, x_2, ... x_n) 9 | return a list of samples x ~ p, where p is the original distribution of the conditional distribution. 10 | x0 is the initial value of x. If not specified, it's set as zero vector. 11 | conditional_sampler takes (x, j) as parameters 12 | """ 13 | x = np.zeros(dim) if x0 is None else x0 14 | samples = np.zeros([max_steps - burning_steps, dim]) 15 | for i in range(max_steps): 16 | for j in range(dim): 17 | x[j] = conditional_sampler(x, j) 18 | if verbose: 19 | print("New value of x is", x_new) 20 | if i >= burning_steps: 21 | samples[i - burning_steps] = x 22 | return samples 23 | 24 | 25 | if __name__ == '__main__': 26 | def demonstrate(dim, p, desc, **args): 27 | samples = gibbs_sampling(dim, p, **args) 28 | z = gaussian_kde(samples.T)(samples.T) 29 | plt.scatter(samples[:, 0], samples[:, 1], c=z, marker='.') 30 | plt.plot(samples[: 100, 0], samples[: 100, 1], 'r-') 31 | plt.title(desc) 32 | plt.show() 33 | 34 | # example 1: 35 | mean = np.array([2, 3]) 36 | covariance = np.array([[1, 0], 37 | [0, 1]]) 38 | covariance_inv = np.linalg.inv(covariance) 39 | det_convariance = 1 40 | def gaussian_sampler1(x, j): 41 | return np.random.normal() 42 | demonstrate(2, gaussian_sampler1, "Gaussian distribution with mean of 0 and 0") 43 | 44 | # example 2: 45 | mean = np.array([2, 3]) 46 | covariance = np.array([[1, 0], 47 | [0, 1]]) 48 | covariance_inv = np.linalg.inv(covariance) 49 | det_convariance = 1 50 | def gaussian_sampler2(x, j): 51 | if j == 0: 52 | return np.random.normal(2) 53 | else: 54 | return np.random.normal(3) 55 | demonstrate(2, gaussian_sampler2, "Gaussian distribution with mean of 2 and 3") 56 | 57 | # example 3: 58 | def blocks_sampler(x, j): 59 | sample = np.random.random() 60 | if sample > .5: 61 | sample += 1. 
62 | return sample 63 | demonstrate(2, blocks_sampler, "Four blocks") 64 | 65 | # example 4: 66 | def blocks_sampler(x, j): 67 | sample = np.random.random() 68 | if sample > .5: 69 | sample += 100. 70 | return sample 71 | demonstrate(2, blocks_sampler, "Four blocks with large gap.") 72 | -------------------------------------------------------------------------------- /19.MCMC/MetropolisHasting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.stats import gaussian_kde 4 | 5 | 6 | def gaussian_kernel(x1, x2): 7 | return np.exp(-((x1 - x2) ** 2).sum()) 8 | 9 | def gaussian_sampler(x): 10 | return np.random.normal(x) 11 | 12 | def metropolis_hasting(dim, p, q=gaussian_kernel, q_sampler=gaussian_sampler, x0=None, burning_steps=1000, max_steps=10000, epsilon=1e-8, verbose=False): 13 | """ 14 | Given a distribution function p (it doesn't need to be a probability, a likelihood function is enough), 15 | and the recommended distribution q, 16 | return a list of samples x ~ p, 17 | where the number of samples is max_steps - burning_steps. 18 | q_sampler is a function taking an x as input and return a sample of q(x_new | x_old). 19 | q is a distribution function representing q(x_new | x_old). 20 | q takes (x_old, x_new) as parameters. 21 | """ 22 | x = np.zeros(dim) if x0 is None else x0 23 | samples = np.zeros([max_steps - burning_steps, dim]) 24 | for i in range(max_steps): 25 | x_new = q_sampler(x) 26 | accept_prob = (p(x_new) + epsilon) / (p(x) + epsilon) * q(x, x_new) / q(x_new, x) 27 | if verbose: 28 | print("New value of x is", x_new) 29 | if np.random.random() < accept_prob: 30 | x = x_new 31 | elif verbose: 32 | print("New value is dropped") 33 | if i >= burning_steps: 34 | samples[i - burning_steps] = x 35 | return samples 36 | 37 | 38 | if __name__ == '__main__': 39 | def demonstrate(dim, p, desc, **args): 40 | samples = metropolis_hasting(dim, p, **args) 41 | z = gaussian_kde(samples.T)(samples.T) 42 | plt.scatter(samples[:, 0], samples[:, 1], c=z, marker='.') 43 | plt.plot(samples[: 100, 0], samples[: 100, 1], 'r-') 44 | plt.title(desc) 45 | plt.show() 46 | 47 | # example 1: 48 | mean = np.array([2, 3]) 49 | covariance = np.array([[1, 0], 50 | [0, 1]]) 51 | covariance_inv = np.linalg.inv(covariance) 52 | det_convariance = 1 53 | def gaussian1(x): 54 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 55 | demonstrate(2, gaussian1, "Gaussian distribution with mean of 2 and 3") 56 | 57 | # example 2: 58 | mean = np.array([2, 3]) 59 | covariance = np.array([[1, .5], 60 | [.5, 1]]) 61 | covariance_inv = np.linalg.inv(covariance) 62 | det_convariance = 1 63 | def gaussian2(x): 64 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 65 | demonstrate(2, gaussian2, "Gaussian distribution with mean of 2 and 3") 66 | 67 | # example 3: 68 | def blocks(x): 69 | if (0 < x[0] < 1 or 2 < x[0] < 3) and (0 < x[1] < 1 or 2 < x[1] < 3): 70 | return 1 71 | return 0 72 | demonstrate(2, blocks, "Four blocks") 73 | 74 | # example 4: 75 | def blocks(x): 76 | if (0 < x[0] < 1 or 200 < x[0] < 300) and (0 < x[1] < 1 or 200 < x[1] < 300): 77 | return 1 78 | return 0 79 | demonstrate(2, blocks, "Four blocks with large gap. 
(Monte Carlo doesn't solve everything)") 80 | -------------------------------------------------------------------------------- /19.MCMC/SingleComponentMetropolisHasting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.stats import gaussian_kde 4 | 5 | 6 | def gaussian_kernel(x, j, xj_new): 7 | return np.exp(-(x[j] - xj_new) ** 2) 8 | 9 | def gaussian_sampler(x, j): 10 | return np.random.normal(x[j]) 11 | 12 | def single_component_metropolis_hasting(dim, p, q=gaussian_kernel, q_sampler=gaussian_sampler, x0=None, burning_steps=1000, max_steps=10000, epsilon=1e-8, verbose=False): 13 | """ 14 | Given a distribution function p (it doesn't need to be a probability, a likelihood function is enough), 15 | and the proposal distribution q, 16 | return a list of samples x ~ p, 17 | where the number of samples is max_steps - burning_steps. 18 | q_sampler is a function taking (x, j) as input and returning a sample of q(xj_new | xj_old, old_x_without_xj) 19 | q is a distribution function representing q(xj_new | xj_old, old_x_without_xj). 20 | q takes (x, j, xj_new) as parameters, 21 | where x is the value of the variable at the last step, 22 | j is the index of the parameter chosen to be updated, 23 | xj_new is the new value of x_j. 24 | x0 is the initial value of x. If not specified, it's set as zero vector. 25 | """ 26 | x = np.zeros(dim) if x0 is None else x0 27 | samples = np.zeros([max_steps - burning_steps, dim]) 28 | for i in range(max_steps): 29 | for j in range(dim): 30 | xj_new = q_sampler(x, j) 31 | x_new = x.copy() 32 | x_new[j] = xj_new 33 | accept_prob = (p(x_new) + epsilon) / (p(x) + epsilon) * q(x, j, xj_new) / q(x_new, j, x[j]) 34 | if verbose: 35 | print("New value of x is", x_new) 36 | if np.random.random() < accept_prob: 37 | x = x_new 38 | elif verbose: 39 | print("New value is dropped") 40 | if i >= burning_steps: 41 | samples[i - burning_steps] = x 42 | return samples 43 | 44 | 45 | if __name__ == '__main__': 46 | def demonstrate(dim, p, desc, **args): 47 | samples = single_component_metropolis_hasting(dim, p, **args) 48 | z = gaussian_kde(samples.T)(samples.T) 49 | plt.scatter(samples[:, 0], samples[:, 1], c=z, marker='.') 50 | plt.plot(samples[: 100, 0], samples[: 100, 1], 'r-') 51 | plt.title(desc) 52 | plt.show() 53 | 54 | # example 1: 55 | mean = np.array([2, 3]) 56 | covariance = np.array([[1, 0], 57 | [0, 1]]) 58 | covariance_inv = np.linalg.inv(covariance) 59 | det_convariance = 1 60 | def gaussian1(x): 61 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 62 | demonstrate(2, gaussian1, "Gaussian distribution with mean of 2 and 3") 63 | 64 | # example 2: 65 | mean = np.array([2, 3]) 66 | covariance = np.array([[1, .5], 67 | [.5, 1]]) 68 | covariance_inv = np.linalg.inv(covariance) 69 | det_convariance = 1 70 | def gaussian2(x): 71 | return np.exp(-.5 * (x - mean).T @ covariance_inv @ (x - mean)) 72 | demonstrate(2, gaussian2, "Gaussian distribution with mean of 2 and 3") 73 | 74 | # example 3: 75 | def blocks(x): 76 | if (0 < x[0] < 1 or 2 < x[0] < 3) and (0 < x[1] < 1 or 2 < x[1] < 3): 77 | return 1 78 | return 0 79 | demonstrate(2, blocks, "Four blocks") 80 | 81 | # example 4: 82 | def blocks(x): 83 | if (0 < x[0] < 1 or 200 < x[0] < 300) and (0 < x[1] < 1 or 200 < x[1] < 300): 84 | return 1 85 | return 0 86 | demonstrate(2, blocks, "Four blocks with large gap.
(Monte Carlo doesn't solve everything)") 87 | -------------------------------------------------------------------------------- /20.LDA/LDA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | from itertools import chain 6 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 7 | from utils import * 8 | 9 | def lda(texts, word_prior_cnt=None, topic_prior_cnt=None, k=5, max_iteration=1000, epsilon=1e-8): 10 | """ 11 | given a list of token lists, tokens are integers from [0, n_word]. 12 | return the topic distribution of each document, 13 | and the word distribution of each topic. 14 | """ 15 | n_word = max(chain(*texts)) + 1 16 | n_text = len(texts) 17 | 18 | n_text_topic = np.zeros([n_text, k]) + epsilon 19 | n_topic_word = np.zeros([k, n_word]) + epsilon 20 | if topic_prior_cnt is not None: 21 | n_text_topic += topic_prior_cnt[None, :] 22 | if word_prior_cnt is not None: 23 | n_topic_word += word_prior_cnt[None, :] 24 | 25 | topic = [[np.random.choice(k) for word in text] for text in texts] 26 | for i, (text, text_topic) in enumerate(zip(texts, topic)): 27 | for word, word_topic in zip(text, text_topic): 28 | n_text_topic[i, word_topic] += 1 29 | n_topic_word[word_topic, word] += 1 30 | 31 | for step in range(max_iteration): 32 | for i, (text, text_topic) in enumerate(zip(texts, topic)): 33 | for j, (word, word_topic) in enumerate(zip(text, text_topic)): 34 | # reduce the current value from the count 35 | n_text_topic[i, word_topic] -= 1 36 | n_topic_word[word_topic, word] -= 1 37 | # infer the current value from count of others 38 | likelihood_word_topic = n_topic_word[:, word] / n_topic_word.sum(axis=-1) 39 | likelihood_topic = n_text_topic[i, :] / n_text_topic[i, :].sum(axis=-1) 40 | likelihood_topic *= likelihood_word_topic 41 | p_topic = likelihood_topic / likelihood_topic.sum() 42 | # update count 43 | topic[i][j] = np.random.choice(k, p=p_topic) 44 | n_text_topic[i, topic[i][j]] += 1 45 | n_topic_word[topic[i][j], word] += 1 46 | 47 | p_topic_when_text = n_text_topic / n_text_topic.sum(axis=-1, keepdims=True) 48 | p_word_when_topic = n_topic_word / n_topic_word.sum(axis=-1, keepdims=True) 49 | return p_topic_when_text, p_word_when_topic 50 | 51 | if __name__ == '__main__': 52 | def demonstrate(X, k, desc, **args): 53 | print(desc) 54 | p_topic_when_text, p_word_when_topic = lda(X, k=k, **args) 55 | print("The probabilities of each topic for each text are") 56 | print(np.round(p_topic_when_text, 2)) 57 | print("The probabilities of each word for each topic are") 58 | print(np.round(p_word_when_topic, 2)) 59 | print("The recovered text-wordcnt matrix is") 60 | print(np.round((p_topic_when_text @ p_word_when_topic), 2)) 61 | print() 62 | 63 | n_vocab = 9 64 | X = [ 65 | [2, 3], 66 | [5, 8], 67 | [1, 7], 68 | [6, 8], 69 | [0, 5], 70 | [0, 1, 2, 3, 4, 5, 6, 7, 8], 71 | [0, 2], 72 | [6, 8], 73 | [5, 5, 8], 74 | [0, 2, 7], 75 | [3, 4] 76 | ] 77 | demonstrate(X, 3, 'Example 1') 78 | demonstrate(X, 8, 'Example 2: You can recogonize the original matrix from the recovered one if k is large enough') 79 | 80 | k = 8 81 | word_prior_cnt = np.ones(n_vocab) * 2 82 | topic_prior_cnt = np.ones(k) * 2 83 | demonstrate(X, k, 'Example 3: The influence of prior', word_prior_cnt=word_prior_cnt, topic_prior_cnt=topic_prior_cnt) 84 | 85 | k = 8 86 | word_prior_cnt = np.ones(n_vocab) * 2 87 | topic_prior_cnt = np.zeros(k) 88 | topic_prior_cnt[3] = 5 89 | demonstrate(X, k, 
'Example 4: The influence of prior', word_prior_cnt=word_prior_cnt, topic_prior_cnt=topic_prior_cnt) 90 | -------------------------------------------------------------------------------- /21.PageRank/PageRank.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import os 4 | from pathlib import Path 5 | sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent)) 6 | from utils import * 7 | 8 | def pageRank(graph, d, max_iteration=1000, epsilon=1e-8): 9 | """ 10 | given an n * n link graph 11 | graph[i, j] = 1 means that there is a link from i to j 12 | d is the damping factor in the definition of PageRank 13 | return the probability of a user visiting each page 14 | """ 15 | n, _ = graph.shape 16 | p = np.ones(n) / n 17 | graph /= (graph.sum(axis=-1, keepdims=True) + epsilon) 18 | graph = graph.T 19 | for i in range(max_iteration): 20 | pre_p = p 21 | p = d * graph @ p + (1 - d) / n 22 | if max(p - pre_p) < epsilon: 23 | break 24 | return p 25 | 26 | if __name__ == '__main__': 27 | def demonstrate(graph, d, desc): 28 | print(desc) 29 | p = pageRank(graph, d=d) 30 | print('The probability of each node is', np.round(p, 2)) 31 | 32 | graph = np.array( 33 | [[0, 1, 1, 1], 34 | [1, 0, 0, 1], 35 | [0, 0, 1, 0], 36 | [0, 1, 1, 0]] 37 | ).astype(float) 38 | demonstrate(graph, .8, 'Example 1') 39 | 40 | graph = np.array( 41 | [[0, 1, 1], 42 | [0, 0, 1], 43 | [1, 0, 0]] 44 | ).astype(float) 45 | demonstrate(graph, .85, 'Example 2') 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical-Learning-Methods (中文文档请往下翻) 2 | 3 | Implementations of all the algorithms introduced in _Statistical Learning Methods_ by Li Hang. 4 | 5 | ## Features 6 | 7 | - **Complete**. All the algorithms introduced by this book are implemented, including 8 | - kNN powered by kd-tree. 9 | - maximum entropy model. I cannot find any other repo that implements this algorithm. 10 | - linear chain conditional random field. I cannot find this model in any other similar repo. 11 | - HMM powered by Baum-Welch. Most repos only provide an HMM trained by counting. 12 | - **Detailed**. All the algorithms are implemented thoroughly. I try my best not to skip any detail. For example, 13 | - about how to select the best of the pruned CART subtrees by cross-validation, I asked Dr. Li Hang by e-mail and got a detailed answer. Many thanks to Dr. Li Hang for his patience and kindness. 14 | - **Matrix calculation**. `for` loops are stripped off wherever possible; most of the algorithms are implemented with matrix calculations supported by `numpy`. 15 | - **Extensible**. It is easy to apply the code to new datasets because, to a large extent, all the algorithms are controllable through parameters. 16 | - **Examples**. Each algorithm comes with some examples. Just run the model file and you will see them. If you have better examples that help others understand the model, please feel free to open a PR. 17 | 18 | ## Dependencies 19 | 20 | - Python3 21 | - numpy 22 | - matplotlib 23 | - [rich](https://github.com/willmcgugan/rich) 24 | 25 | ## Usage 26 | 27 | Just run any single file located in each chapter. You will see examples of the algorithm.
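For example, `python 02.Perceptron/perceptron.py` shows the perceptron demo; any other chapter's file can be run the same way.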
28 | 29 | --- 30 | 31 | # 统计学习方法 32 | 33 | 李航博士《统计学习方法》一书的**硬核** Python 实现。 34 | 35 | ## 项目特色 36 | 37 | GitHub 上有许多实现《统计学习方法》的仓库。本仓库与它们的不同之处在于: 38 | 39 | - **完整性**。实现了**所有**模型。包括 40 | - KD 树支持的 KNN 模型。 41 | - **最大熵模型**。我没有找到其他任何一个仓库实现了该算法。 42 | - **线性链条件随机场**。我同样没有找到其他任何一个仓库实现了该算法。这个模型花费了我一个月的时间去理解和实现。 43 | - Baum-Welch 算法支持的 HMM 算法。大多数仓库实现的 HMM 算法都是简单的计数模型。 44 | - **细节**。所有的算法我都在尽力**完全**实现。比如说 45 | - 有关如何用交叉验证法选取剪枝的 CART 树,我特意邮件询问了李航博士并得到了耐心的解答。在此非常感谢李航博士的支持! 46 | - **矩阵运算**。我不喜欢用循环。你可以看到本仓库中的算法使用了大量的矩阵运算来避免使用循环。 47 | - **可扩展性**。其他仓库的算法可能会在可扩展性上偷懒。比如 GMM 模型可能只实现了两个聚类的简单版本用于演示。而本仓库中的算法尽量将所有可调节部分作为模型参数,以供自由修改使用。 48 | - **示例**。每个算法都加上了我认为会增强读者对算法理解的例子。当然我认为这部分目前还是不太完善的。如果你对如何举例有更好的见解,欢迎给我提 PR。 49 | 50 | ## 项目依赖 51 | 52 | - Python3 53 | - numpy 54 | - matplotlib 55 | - [rich](https://github.com/willmcgugan/rich) 56 | 57 | ## 如何使用 58 | 59 | 直接使用 Python 运行任意一个文件夹内的模型文件,你就可以看到算法示例了。 60 | 61 | ## 目录 62 | 63 | - [第 2 章 - 感知机](02.Perceptron) 64 | - [感知机](02.Perceptron/perceptron.py) 65 | - [第 3 章 - k 近邻法](03.KNN) 66 | - [k 近邻模型](03.KNN/knn.py) 67 | - [k 近邻模型 - 使用 KD 树实现](03.KNN/knn_kdtree.py) 68 | - [第 4 章 - 朴素贝叶斯法](04.NaiveBayes) 69 | - [使用极大似然估计的朴素贝叶斯模型](04.NaiveBayes/NaiveBayesMLE.py) 70 | - [使用贝叶斯估计的朴素贝叶斯模型](04.NaiveBayes/NaiveBayesMAP.py) 71 | - [第 5 章 - 决策树](05.DecisionTree) 72 | - [ID3 决策树](05.DecisionTree/ID3.py) 73 | - [C4.5 决策树](05.DecisionTree/C4.5.py) 74 | - [决策树剪枝算法](05.DecisionTree/prune.py) 75 | - [分类 CART 决策树](05.DecisionTree/ClassificationCART.py) 76 | - [分类 CART 决策树剪枝算法](05.DecisionTree/pruneClassificationCART.py) 77 | - [回归 CART 决策树](05.DecisionTree/RegressionCART.py) 78 | - [第 6 章 - 逻辑斯谛回归与最大熵模型](06.LogisticRegression-MaxEntropy) 79 | - [逻辑斯谛回归模型](06.LogisticRegression-MaxEntropy/BinaryLogisticRegression.py) 80 | - [最大熵模型](06.LogisticRegression-MaxEntropy/MaxEntropy.py) 81 | - [第 7 章 - 支持向量机](07.SVM) 82 | - [支持向量机](07.SVM/SVM.py) 83 | - [第 8 章 - 提升方法](08.Boosting) 84 | - [AdaBoost](08.Boosting/AdaBoost.py) 85 | - [梯度提升树](08.Boosting/GBDT.py) 86 | - [第 9 章 - EM 算法及其推广](09.EM) 87 | - [高斯混合模型](09.EM/GMM.py) 88 | - [第 10 章 - 隐马尔科夫模型](10.HMM) 89 | - [前向算法](10.HMM/Forward.py) 90 | - [后向算法](10.HMM/Backward.py) 91 | - [维特比算法](10.HMM/Viterbi.py) 92 | - [Baum-Welch 算法](10.HMM/BaumWelch.py) 93 | - [使用 Baum-Welch 算法训练的隐马尔可夫模型](10.HMM/HMM.py) 94 | - [第 11 章 - 条件随机场](11.ConditionalRandomField) 95 | - [线性链条件随机场](11.ConditionalRandomField/LinearChainConditionalRandomField.py) 96 | - [第 14 章 - 聚类方法](14.Cluster) 97 | - [层次聚类](14.Cluster/Agglomerative.py) 98 | - [k 均值聚类](14.Cluster/KMeans.py) 99 | - [第 15 章 - 奇异值分解](15.SVD) 100 | - [奇异值分解](15.SVD/SVD.py) 101 | - [第 16 章 - 主成分分析](16.PCA) 102 | - [主成分分析](16.PCA/PCA.py) 103 | - [第 17 章 - 潜在语义分析](17.LSA) 104 | - [潜在语义分析模型](17.LSA/LSA.py) 105 | - [第 18 章 - 概率潜在语义分析](18.PLSA) 106 | - [概率潜在语义分析模型](18.PLSA/PLSA.py) 107 | - [第 19 章 - 马尔可夫蒙特卡罗法](19.MCMC) 108 | - [Metropolis-Hasting 算法](19.MCMC/MetropolisHasting.py) 109 | - [单分量的 Metropolis-Hasting 算法](19.MCMC/SingleComponentMetropolisHasting.py) 110 | - [吉布斯采样](19.MCMC/GibbsSampling.py) 111 | - [第 20 章 - 潜在狄利克雷分配](20.LDA) 112 | - [潜在狄利克雷分配模型](20.LDA/LDA.py) 113 | - [第 21 章 - PageRank 算法](21.PageRank) 114 | - [PageRank 算法](21.PageRank/PageRank.py) 115 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SleepyBag/Statistical-Learning-Methods/c16edf2d56f9f7c00651c749464b74b9ec039522/__init__.py 
-------------------------------------------------------------------------------- /test_get_solution_domain.py: -------------------------------------------------------------------------------- 1 | from utils import row_echelon, get_solution_domain 2 | import numpy as np 3 | 4 | for i in range(100): 5 | print('processing ', i) 6 | a = np.random.rand(100, 50) 7 | re = row_echelon(a) 8 | assert(len(re) == np.linalg.matrix_rank(a)) 9 | zero = a @ get_solution_domain(re) 10 | assert((zero == 0.).all()) 11 | 12 | for i in range(100): 13 | print('processing ', i) 14 | a = np.random.rand(5, 10) 15 | re = row_echelon(a) 16 | assert(len(re) == np.linalg.matrix_rank(a)) 17 | zero = a @ get_solution_domain(re) 18 | assert((abs(zero) < 1e-8).all()) 19 | -------------------------------------------------------------------------------- /test_heap.py: -------------------------------------------------------------------------------- 1 | from utils import Heap 2 | 3 | heap = Heap([3, 1, 2]) 4 | heap.push(1) 5 | heap.push(2) 6 | a = [i for i in heap] 7 | 8 | assert(a == [1, 1, 2, 2, 3]) 9 | -------------------------------------------------------------------------------- /test_information_gain.py: -------------------------------------------------------------------------------- 1 | from utils import information_gain, entropy 2 | from collections import Counter 3 | from math import fabs 4 | 5 | eps = 1e-3 6 | 7 | X = [ 8 | ['青年', '否', '否', '一般'], 9 | ['青年', '否', '否', '好'], 10 | ['青年', '是', '否', '好'], 11 | ['青年', '是', '是', '一般'], 12 | ['青年', '否', '否', '一般'], 13 | ['中年', '否', '否', '一般'], 14 | ['中年', '否', '否', '好'], 15 | ['中年', '是', '是', '好'], 16 | ['中年', '否', '是', '非常好'], 17 | ['中年', '否', '是', '非常好'], 18 | ['老年', '否', '是', '非常好'], 19 | ['老年', '否', '是', '好'], 20 | ['老年', '是', '否', '好'], 21 | ['老年', '是', '否', '非常好'], 22 | ['老年', '否', '否', '一般'], 23 | ] 24 | Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'] 25 | 26 | assert(fabs(entropy(Counter(Y).values()) - .971) < eps) 27 | assert(fabs(information_gain(X, Y, 0) - .083) < eps) 28 | assert(fabs(information_gain(X, Y, 1) - .324) < eps) 29 | assert(fabs(information_gain(X, Y, 2) - .420) < eps) 30 | assert(fabs(information_gain(X, Y, 3) - .363) < eps) 31 | -------------------------------------------------------------------------------- /test_line_search.py: -------------------------------------------------------------------------------- 1 | from utils import line_search 2 | 3 | class F: 4 | def __init__(self, n): 5 | self.n = n 6 | 7 | def __call__(self, x): 8 | return (x - self.n) ** 2 9 | 10 | f = F(0) 11 | epsilon = 1e-6 12 | for i in range(-1000, 1000): 13 | f.n = i 14 | assert(abs(line_search(f, -2000, 2000, epsilon) - i) <= epsilon) 15 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | import heapq 4 | from math import inf, nan 5 | from math import log, sqrt 6 | from collections import Counter 7 | 8 | # ------------------ Basic Structures ----------------------------------------- 9 | class Heap: 10 | def __init__(self, arr=None, key=lambda x: x, max_len=inf): 11 | self.key = key 12 | self.max_len = max_len 13 | if not arr: 14 | self.h = [] 15 | else: 16 | self.h = [(self.key(i), i) for i in arr] 17 | heapq.heapify(self.h) 18 | self.i = 0 19 | 20 | def __len__(self): 21 | return len(self.h) 22 | 23 | def __bool__(self): 24 | return len(self.h) != 0 
25 | 26 | def __iter__(self): 27 | while self: 28 | yield self.pop() 29 | 30 | def push(self, x): 31 | # insert an number to the middle so that `x` will be never compared 32 | # because maybe `x` doesn't have comparing operator defined 33 | heapq.heappush(self.h, (self.key(x), self.i, x)) 34 | self.i += 1 35 | if len(self.h) > self.max_len: 36 | self.pop() 37 | 38 | def top(self): 39 | return self.h[0][-1] 40 | 41 | def top_key(self): 42 | return self.h[0][0] 43 | 44 | def pop(self): 45 | return heapq.heappop(self.h)[-1] 46 | 47 | # ------------------ Functions ------------------------------------------------ 48 | def argmax(arr, key=lambda x: x): 49 | arr = [key(a) for a in arr] 50 | ans = max(arr) 51 | return arr.index(ans), ans 52 | 53 | def argmin(arr, key=lambda x: x): 54 | arr = [key(a) for a in arr] 55 | ans = min(arr) 56 | return arr.index(ans), ans 57 | 58 | def sigmoid(x): 59 | return 1 / (np.exp(-x) + 1) 60 | 61 | def binary_cross_entropy(pred, Y): 62 | loss = -(Y * np.log(pred) + (1 - Y) * np.log(1 - pred)).sum() 63 | return loss 64 | 65 | def softmax(logits, axis=-1): 66 | exps = np.exp(logits) 67 | return exps / exps.sum(axis=axis, keepdims=True) 68 | 69 | def line_search(f, l, r, epsilon=1e-6): 70 | """find the minimum point of a convex function""" 71 | lrate = (3 - sqrt(5)) / 2 72 | rrate = (sqrt(5) - 1) / 2 73 | fll, frr = None, None 74 | while r - l >= epsilon: 75 | if fll is None: 76 | ll = l + (r - l) * lrate 77 | fll = f(ll) 78 | if frr is None: 79 | rr = l + (r - l) * rrate 80 | frr = f(rr) 81 | if fll < frr: 82 | r, rr = rr, ll 83 | frr, fll = fll, None 84 | elif fll > frr: 85 | l, ll = ll, rr 86 | fll, frr = frr, None 87 | else: 88 | l, r = ll, rr 89 | fll, frr = None, None 90 | return (l + r) / 2 91 | 92 | def newton(f, g, x0, epsilon=1e-6): 93 | """ 94 | Find the zero point wehre f(x) = 0 of function f 95 | g(x) is the gradient function of f 96 | """ 97 | prex = x0 98 | x = x0 - f(x0) / g(x0) 99 | while abs(x - prex) > epsilon: 100 | prex, x = x, x - f(x) / g(x) 101 | return x 102 | 103 | def one_hot(i, size): 104 | """Given a hot number the tensor size, return the one-hot tensor""" 105 | ans = np.zeros(size) 106 | ans[i] = 1 107 | return ans 108 | 109 | def row_echelon(A): 110 | """ 111 | eliminate a matrix to row echelon form with gaussian elimination 112 | """ 113 | # convert A to row echolon form 114 | row_cnt, col_cnt = A.shape 115 | col = 0 116 | rank = 0 117 | # from top to the bottom 118 | for i in range(row_cnt): 119 | find = False 120 | while not find and col < col_cnt: 121 | # look for the first non-zero value in current column 122 | for j in range(i, row_cnt): 123 | if A[j][col] != 0.: 124 | if i != j: 125 | A[[i, j]] = A[[j, i]] 126 | A[i] /= A[i][col] 127 | find = True 128 | # if non-zero value found, start elimination 129 | for k in range(i + 1, row_cnt): 130 | A[k] -= A[i] * A[k][col] 131 | rank += 1 132 | break 133 | # if not found, check the next column 134 | else: 135 | col += 1 136 | col += 1 137 | # from bottom to the top 138 | for i in range(row_cnt - 1, -1, -1): 139 | # find the first non-zero value and eliminate 140 | for col in range(col_cnt): 141 | if A[i][col] != 0.: 142 | # start elimination 143 | for k in range(i - 1, -1, -1): 144 | A[k] -= A[i] * A[k][col] / A[i][col] 145 | break 146 | return A[: rank] 147 | 148 | def get_solution_domain(A): 149 | """ 150 | get a group of linearly independent solutions of Ax=0, which are normalized 151 | the input A is supposed to be in row echelon form 152 | """ 153 | row_cnt, col_cnt = A.shape 154 | 
A = row_echelon(A) 155 | col = 0 156 | nonzero_cols = [] 157 | ans = [] 158 | for i in range(row_cnt): 159 | while col != col_cnt and A[i][col] == 0.: 160 | ans.append(one_hot(col, col_cnt)) 161 | for j, j_col in enumerate(nonzero_cols): 162 | print(j, j_col) 163 | ans[-1][j_col] = -A[j][col] 164 | col += 1 165 | # record the first nonzero value of each row 166 | nonzero_cols.append(col) 167 | col += 1 168 | 169 | for col in range(col, col_cnt): 170 | ans.append(one_hot(col, col_cnt)) 171 | for i, j in enumerate(nonzero_cols): 172 | ans[-1][j] = -A[i][col] 173 | if ans: 174 | ans = np.stack(ans) 175 | ans /= np.linalg.norm(ans, axis=-1, keepdims=True) 176 | else: 177 | ans = np.zeros([0, col_cnt]) 178 | return ans.T 179 | 180 | # ------------------ Decision Trees ------------------------------------------- 181 | def entropy(p): 182 | s = sum(p) 183 | p = [i / s for i in p] 184 | ans = sum(-i * log(i, 2) for i in p) 185 | return ans 186 | 187 | def entropy_of_split(X, Y, col): 188 | """calculate the conditional entropy of splitting data by col""" 189 | val_cnt = Counter(x[col] for x in X) 190 | ans = 0 191 | for val in val_cnt: 192 | weight = val_cnt[val] / len(X) 193 | entr = entropy(Counter(y for x, y in zip(X, Y) if x[col] == val).values()) 194 | ans += weight * entr 195 | return ans 196 | 197 | def information_gain(X, Y, col): 198 | entropy_of_X = entropy(Counter(Y).values()) 199 | entropy_of_col = entropy_of_split(X, Y, col) 200 | return entropy_of_X - entropy_of_col 201 | 202 | def information_gain_ratio(X, Y, col): 203 | information_gain_of_col = information_gain(X, Y, col) 204 | entropy_of_col = entropy(Counter(x[col] for x in X).values()) 205 | return information_gain_of_col / entropy_of_col 206 | 207 | def gini(Y): 208 | cnt = Counter(Y) 209 | ans = 0. 210 | for y in cnt: 211 | ans += (cnt[y] / len(Y)) ** 2 212 | return 1 - ans 213 | 214 | # ------------------ Geometry ------------------------------------------------- 215 | def kbline(k, b, **args): 216 | """Plot a line from slope and intercept""" 217 | axes = plt.gca() 218 | x_vals = np.array(axes.get_xlim()) 219 | y_vals = b + k * x_vals 220 | plt.plot(x_vals, y_vals, **args) 221 | 222 | def wbline(w, b, **args): 223 | if w[1] == 0: 224 | plt.vlines(-b / w[0], *plt.gca().get_ylim(), **args) 225 | else: 226 | k = -w[0] / w[1] 227 | b /= -w[1] 228 | kbline(k, b, **args) 229 | 230 | def euc_dis(a, b): 231 | return np.linalg.norm(a - b, axis=-1) 232 | --------------------------------------------------------------------------------