├── .gitignore ├── ANN └── ANN_tutorial.py ├── GBM └── gbm.py ├── ImportanceSampling ├── ImportanceSampling.ipynb └── ImportanceSampling.py ├── KL-Divergence └── kl-divergence.ipynb ├── KMeans └── kMeans.py ├── Multi-regression └── multivariable_reg.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | .idea 3 | */.ipynb_checkpoints 4 | .ipynb_checkpoints 5 | */__pycache__ 6 | .DS_Store -------------------------------------------------------------------------------- /ANN/ANN_tutorial.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris 3 | from sklearn.preprocessing import OneHotEncoder 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | class fcnn(): 8 | 9 | def __init__(self, train_X, train_y, hid, lr=0.01): 10 | self.input = train_X.shape[1] 11 | self.hid = hid 12 | self.output = train_y.shape[1] 13 | self.train_X = train_X 14 | self.train_y = train_y 15 | self.lr = lr 16 | # initialise weights 17 | self.W1 = np.random.randn(self.input, self.hid) / np.sqrt(self.input) 18 | self.B1 = np.zeros((1, self.hid)) 19 | self.W2 = np.random.randn(self.hid, self.output) / np.sqrt(self.hid) 20 | self.B2 = np.zeros((1, self.output)) 21 | 22 | # active function for the first layer 23 | def tanh(self, X_fc1): 24 | X_tanh = np.tanh(X_fc1) 25 | 26 | return X_tanh 27 | 28 | # softmax for the output layer 29 | # input X_fc2 is the result after assign W2 30 | def softmax(self, X_fc2): 31 | X_output = np.exp(X_fc2) / float(np.sum(np.exp(X_fc2))) 32 | 33 | return X_output 34 | 35 | # define loss function 36 | # y is actual value 37 | # p-1 38 | def cross_loss(self, X_output, y): 39 | index = np.where(y == 1) 40 | loss = -np.log(X_output[:, index]) 41 | # the loss with derivative 42 | loss_dev = X_output-y 43 | 44 | return loss, loss_dev 45 | 46 | # forward propagation 47 | def fit(self, epoch=1): 48 | # iterate whole data set 49 | for step in range(epoch): 50 | print 'step', step 51 | for i in range(self.train_X.shape[0]): 52 | # forward propagation 53 | X_input = self.train_X[i, :].reshape(1, self.train_X.shape[1]) 54 | X_fc1 = np.dot(X_input, self.W1) + self.B1 55 | X_tanh = self.tanh(X_fc1) 56 | X_fc2 = np.dot(X_tanh, self.W2) + self.B2 57 | X_output = self.softmax(X_fc2) 58 | 59 | # calculate loss 60 | loss, loss_dev = self.cross_loss(X_output, self.train_y[i, :]) 61 | print 'loss', float(loss) 62 | 63 | # according loss to do the back propagation 64 | delta_output = loss_dev 65 | delta_input = np.dot(delta_output, self.W2.T) * (1 - np.power(X_tanh, 2)) 66 | 67 | # update weights 68 | self.W2 += -self.lr * np.dot(X_tanh.T, delta_output) 69 | self.B2 += -self.lr * delta_output 70 | self.W1 += -self.lr * np.dot(X_input.T, delta_input) 71 | self.B1 += -self.lr * delta_input 72 | 73 | def predict(self, test_X, test_y): 74 | result = np.zeros((test_X.shape[0], test_y.shape[1])) 75 | correct = 0 76 | for i in range(test_X.shape[0]): 77 | X_input = test_X[i, :] 78 | X_fc1 = np.dot(X_input, self.W1) + self.B1 79 | X_sig = self.tanh(X_fc1) 80 | X_fc2 = np.dot(X_sig, self.W2) + self.B2 81 | X_output = self.softmax(X_fc2) 82 | result[i, :] = X_output 83 | # calculate precision 84 | if np.argmax(X_output) == np.argmax(test_y[i, :]): 85 | correct += 1 86 | 87 | accuracy = float(correct)/test_X.shape[0] 88 | print 'accuracy', accuracy 89 | 90 | return result 91 | 92 | 93 | iris = load_iris() 94 | train_X = iris.data 95 | enc = OneHotEncoder() 96 | train_y = enc.fit_transform(iris.target.reshape((150, 1)), iris.target).toarray() 97 | 98 | train_X2, test_X2, train_y2, test_y2 = train_test_split(train_X, train_y) 99 | 100 | # training ... 101 | 102 | fc = fcnn(train_X2, train_y2, hid=10) 103 | 104 | fc.fit(epoch=3) 105 | 106 | # calculate accuracy 107 | 108 | output = fc.predict(test_X2, test_y2) 109 | 110 | -------------------------------------------------------------------------------- /GBM/gbm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.tree import DecisionTreeRegressor 5 | 6 | x = np.linspace(0, 50, 51) 7 | y = np.random.randn(51) 8 | 9 | plt.scatter(x, y) 10 | 11 | # GBM regressor 12 | 13 | 14 | def gbm(x, y, M): 15 | x = x.reshape(-1, 1) 16 | # var to store predictions 17 | all_preds = np.zeros((M, len(y))) 18 | # initialization 19 | pred = np.mean(y) 20 | # iterate 21 | # for each iteration fit the residual 22 | for i in range(M): 23 | print 'iteration', i+1 24 | all_preds[i, :] = pred 25 | residual = y - pred 26 | clf = DecisionTreeRegressor(max_depth=1) 27 | clf = clf.fit(x, residual) 28 | h = clf.predict(x) 29 | pred += h 30 | 31 | return all_preds 32 | 33 | 34 | preds = gbm(x, y, 12) 35 | 36 | plt.scatter(x, y) 37 | plt.plot(x, preds[0]) 38 | 39 | plt.scatter(x, y) 40 | plt.plot(x, preds[9]) 41 | 42 | # plot to see difference each iteration 43 | fig = plt.figure(figsize=[8, 8]) 44 | for i in range(12): 45 | ax = fig.add_subplot(3, 4, i+1) 46 | ax.set_title('iteration {}'.format(i+1)) 47 | ax.scatter(x, y, s=2.5) 48 | ax.plot(x, preds[i], c='r') 49 | -------------------------------------------------------------------------------- /ImportanceSampling/ImportanceSampling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Importance Sampling\n", 8 | "---\n", 9 | "Using sampling to approximate a distribution\n", 10 | "\n", 11 | "$$E[f(x)] = \\int f(x)p(x) dx \\approx \\frac{1}{n}\\sum_{i} f(x_i)$$\n", 12 | "where $ x \\sim p(x)$\n", 13 | "\n", 14 | "$$E[f(x)] = \\int f(x)p(x) dx = \\int f(x)\\frac{p(x)}{q(x)}q(x) dx \\approx \\frac{1}{n} \\sum_{i} f(x_i)\\frac{p(x_i)}{q(x_i)}$$\n", 15 | "\n", 16 | "where $ x \\sim q(x)$\n", 17 | "\n", 18 | "Idea of importance sampling: draw the sample from a proposal distribution and re-weight the integral using importance weights so that the correct distribution is targeted\n", 19 | "\n", 20 | "$$Var(X) = E[X^2] - E[X]^2$$\n", 21 | "\n", 22 | "**Reference**\n", 23 | "\n", 24 | "- [1](https://www.youtube.com/watch?v=3Mw6ivkDVZc)\n", 25 | "- [2](https://astrostatistics.psu.edu/su14/lectures/cisewski_is.pdf)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 69, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stderr", 35 | "output_type": "stream", 36 | "text": [ 37 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 38 | " return f(*args, **kwds)\n", 39 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 40 | " return f(*args, **kwds)\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import numpy as np\n", 46 | "import scipy.stats as stats\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "import seaborn as sns" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 161, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "def f_x(x):\n", 58 | " return 1/(1 + np.exp(-x))\n", 59 | "\n", 60 | "def distribution(mu=0, sigma=1):\n", 61 | " # return probability given a value\n", 62 | " distribution = stats.norm(mu, sigma)\n", 63 | " return distribution" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 121, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "" 75 | ] 76 | }, 77 | "execution_count": 121, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | }, 81 | { 82 | "data": { 83 | "image/png": "\n", 84 | "text/plain": [ 85 | "
" 86 | ] 87 | }, 88 | "metadata": {}, 89 | "output_type": "display_data" 90 | } 91 | ], 92 | "source": [ 93 | "plt.figure(figsize=[6, 4])\n", 94 | "x = np.linspace(0, 4, 50) # x ranges from 0 to 4\n", 95 | "y = [f_x(i) for i in x]\n", 96 | "\n", 97 | "plt.plot(x, y, label=\"$f(x)$\")\n", 98 | "\n", 99 | "plt.xlabel(\"x\", size=18)\n", 100 | "plt.ylabel(\"y\", size=18)\n", 101 | "plt.legend(prop={\"size\": 14})" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Sampling" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 169, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# pre-setting\n", 118 | "n = 1000\n", 119 | "\n", 120 | "mu_target = 3.5\n", 121 | "sigma_target = 1\n", 122 | "mu_appro = 3\n", 123 | "sigma_appro = 1\n", 124 | "\n", 125 | "p_x = distribution(mu_target, sigma_target)\n", 126 | "q_x = distribution(mu_appro, sigma_appro)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 170, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stderr", 136 | "output_type": "stream", 137 | "text": [ 138 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", 139 | " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n", 140 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", 141 | " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n" 142 | ] 143 | }, 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "" 148 | ] 149 | }, 150 | "execution_count": 170, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | }, 154 | { 155 | "data": { 156 | "image/png": "\n", 157 | "text/plain": [ 158 | "
" 159 | ] 160 | }, 161 | "metadata": {}, 162 | "output_type": "display_data" 163 | } 164 | ], 165 | "source": [ 166 | "plt.figure(figsize=[10, 4])\n", 167 | "\n", 168 | "sns.distplot([np.random.normal(mu_target, sigma_target) for _ in range(3000)], label=\"distribution $p(x)$\")\n", 169 | "sns.distplot([np.random.normal(mu_appro, sigma_appro) for _ in range(3000)], label=\"distribution $q(x)$\")\n", 170 | "\n", 171 | "plt.title(\"Distributions\", size=16)\n", 172 | "plt.legend()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 178, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "simulate value 0.9542816022260111\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "# value\n", 190 | "s = 0\n", 191 | "for i in range(n):\n", 192 | " # draw a sample\n", 193 | " x_i = np.random.normal(mu_target, sigma_target)\n", 194 | " s += f_x(x_i)\n", 195 | "print(\"simulate value\", s/n)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 172, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "average 0.9495227171370471 variance 0.3043862985463373\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "# calculate value sampling from a different distribution\n", 213 | "\n", 214 | "value_list = []\n", 215 | "for i in range(n):\n", 216 | " # sample from different distribution\n", 217 | " x_i = np.random.normal(mu_appro, sigma_appro)\n", 218 | " value = f_x(x_i)*(p_x.pdf(x_i) / q_x.pdf(x_i))\n", 219 | " \n", 220 | " value_list.append(value)\n", 221 | "\n", 222 | "print(\"average {} variance {}\".format(np.mean(value_list), np.var(value_list)))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Different $q(x)$" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 179, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# pre-setting\n", 239 | "n = 5000\n", 240 | "\n", 241 | "mu_target = 3.5\n", 242 | "sigma_target = 1\n", 243 | "mu_appro = 1\n", 244 | "sigma_appro = 1\n", 245 | "\n", 246 | "p_x = distribution(mu_target, sigma_target)\n", 247 | "q_x = distribution(mu_appro, sigma_appro)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 182, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stderr", 257 | "output_type": "stream", 258 | "text": [ 259 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", 260 | " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n", 261 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", 262 | " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n" 263 | ] 264 | }, 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "" 269 | ] 270 | }, 271 | "execution_count": 182, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | }, 275 | { 276 | "data": { 277 | "image/png": "\n", 278 | "text/plain": [ 279 | "
" 280 | ] 281 | }, 282 | "metadata": {}, 283 | "output_type": "display_data" 284 | } 285 | ], 286 | "source": [ 287 | "plt.figure(figsize=[10, 4])\n", 288 | "\n", 289 | "sns.distplot([np.random.normal(mu_target, sigma_target) for _ in range(3000)], label=\"distribution $p(x)$\")\n", 290 | "sns.distplot([np.random.normal(mu_appro, sigma_appro) for _ in range(3000)], label=\"distribution $q(x)$\")\n", 291 | "\n", 292 | "plt.title(\"Distributions\", size=16)\n", 293 | "plt.legend()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 181, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "average 0.9959984807502844 variance 83.36158359644132\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "# calculate value sampling from a different distribution\n", 311 | "\n", 312 | "value_list = []\n", 313 | "# need larger steps\n", 314 | "for i in range(n):\n", 315 | " # sample from different distribution\n", 316 | " x_i = np.random.normal(mu_appro, sigma_appro)\n", 317 | " value = f_x(x_i)*(p_x.pdf(x_i) / q_x.pdf(x_i))\n", 318 | " \n", 319 | " value_list.append(value)\n", 320 | "\n", 321 | "print(\"average {} variance {}\".format(np.mean(value_list), np.var(value_list)))" 322 | ] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "Python 3", 328 | "language": "python", 329 | "name": "python3" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 3 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython3", 341 | "version": "3.6.5" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 2 346 | } 347 | -------------------------------------------------------------------------------- /ImportanceSampling/ImportanceSampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | 7 | def f_x(x): 8 | return 1/(1 + np.exp(-x)) 9 | 10 | 11 | def distribution(mu=0, sigma=1): 12 | # return probability given a value 13 | distribution = stats.norm(mu, sigma) 14 | return distribution 15 | 16 | 17 | if __name__ == "__main__": 18 | # pre-setting 19 | n = 1000 20 | 21 | mu_target = 3.5 22 | sigma_target = 1 23 | mu_appro = 3 24 | sigma_appro = 1 25 | 26 | p_x = distribution(mu_target, sigma_target) 27 | q_x = distribution(mu_appro, sigma_appro) 28 | 29 | plt.figure(figsize=[10, 4]) 30 | 31 | sns.distplot([np.random.normal(mu_target, sigma_target) for _ in range(3000)], label="distribution $p(x)$") 32 | sns.distplot([np.random.normal(mu_appro, sigma_appro) for _ in range(3000)], label="distribution $q(x)$") 33 | 34 | plt.title("Distributions", size=16) 35 | plt.legend() 36 | 37 | # value 38 | s = 0 39 | for i in range(n): 40 | # draw a sample 41 | x_i = np.random.normal(mu_target, sigma_target) 42 | s += f_x(x_i) 43 | print("simulate value", s / n) 44 | 45 | # calculate value sampling from a different distribution 46 | 47 | value_list = [] 48 | for i in range(n): 49 | # sample from different distribution 50 | x_i = np.random.normal(mu_appro, sigma_appro) 51 | value = f_x(x_i) * (p_x.pdf(x_i) / q_x.pdf(x_i)) 52 | 53 | value_list.append(value) 54 | 55 | print("average {} variance {}".format(np.mean(value_list), np.var(value_list))) 56 | 57 | # pre-setting different q(x) 58 | n = 5000 59 | 60 | mu_target = 3.5 61 | sigma_target = 1 62 | mu_appro = 1 63 | sigma_appro = 1 64 | 65 | p_x = distribution(mu_target, sigma_target) 66 | q_x = distribution(mu_appro, sigma_appro) 67 | 68 | plt.figure(figsize=[10, 4]) 69 | 70 | sns.distplot([np.random.normal(mu_target, sigma_target) for _ in range(3000)], label="distribution $p(x)$") 71 | sns.distplot([np.random.normal(mu_appro, sigma_appro) for _ in range(3000)], label="distribution $q(x)$") 72 | 73 | plt.title("Distributions", size=16) 74 | plt.legend() 75 | 76 | # calculate value sampling from a different distribution 77 | 78 | value_list = [] 79 | # need larger steps 80 | for i in range(n): 81 | # sample from different distribution 82 | x_i = np.random.normal(mu_appro, sigma_appro) 83 | value = f_x(x_i) * (p_x.pdf(x_i) / q_x.pdf(x_i)) 84 | 85 | value_list.append(value) 86 | 87 | print("average {} variance {}".format(np.mean(value_list), np.var(value_list))) -------------------------------------------------------------------------------- /KL-Divergence/kl-divergence.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "For discrete variable:\n", 8 | "$$ D_{KL}(p || q) = \\sum_{i}p(x_i)(logp(x_i) - logq(x_i)) = \\sum_{i}p(x_i)(log\\frac{p(x_i)}{q(x_i)})$$\n", 9 | "\n", 10 | "For continuous variable:\n", 11 | "$$ D_{KL}(p || q) = \\int p(x)(logp(x) - logq(x)) dx = \\int p(x)(log\\frac{p(x)}{q(x)}) dx $$\n", 12 | "\n", 13 | "**Reference**:\n", 14 | "- [1](https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained)\n", 15 | "- [2](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 39, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%matplotlib inline\n", 25 | "\n", 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import scipy.stats as stats\n", 29 | "import seaborn as sns" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### Generate random distribution" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 21, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "def sample_gen(n=1000):\n", 46 | " samples = []\n", 47 | " for _ in range(n):\n", 48 | " if np.random.uniform() <= 0.5:\n", 49 | " x = np.random.uniform(-2, 2)\n", 50 | " else:\n", 51 | " x = np.random.normal(1, 1)\n", 52 | " samples.append(x)\n", 53 | " return samples" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 25, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | "/Users/jeremy.zhang/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", 66 | " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n" 67 | ] 68 | }, 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "" 73 | ] 74 | }, 75 | "execution_count": 25, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | }, 79 | { 80 | "data": { 81 | "image/png": "\n", 82 | "text/plain": [ 83 | "
" 84 | ] 85 | }, 86 | "metadata": {}, 87 | "output_type": "display_data" 88 | } 89 | ], 90 | "source": [ 91 | "samples = sample_gen(n=2000)\n", 92 | "plt.figure(figsize=[10, 6])\n", 93 | "sns.distplot(samples)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 49, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "" 105 | ] 106 | }, 107 | "execution_count": 49, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | }, 111 | { 112 | "data": { 113 | "image/png": "\n", 114 | "text/plain": [ 115 | "
" 116 | ] 117 | }, 118 | "metadata": {}, 119 | "output_type": "display_data" 120 | } 121 | ], 122 | "source": [ 123 | "n = 2000\n", 124 | "uni_samples = np.random.uniform(-1, 3, n)\n", 125 | "norm_samples = np.random.normal(1, 1, n)\n", 126 | "\n", 127 | "plt.figure(figsize=[10, 6])\n", 128 | "\n", 129 | "sns.distplot(samples, hist=False, label=\"sample distribution\")\n", 130 | "sns.distplot(uni_samples, hist=False, label=\"uniform approximation\")\n", 131 | "sns.distplot(norm_samples, hist=False, label=\"normal approximation\")\n", 132 | "\n", 133 | "plt.legend" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Limit range to `[-1, 3]` " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 103, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "array([-2.11403038, -1.76619909, -1.41836779, -1.0705365 , -0.72270521,\n", 152 | " -0.37487392, -0.02704263, 0.32078866, 0.66861996, 1.01645125,\n", 153 | " 1.36428254, 1.71211383, 2.05994512, 2.40777641, 2.75560771,\n", 154 | " 3.103439 , 3.45127029, 3.79910158, 4.14693287, 4.49476416])" 155 | ] 156 | }, 157 | "execution_count": 103, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "lower, upper = min(samples), max(samples)\n", 164 | "\n", 165 | "space = np.linspace(lower, upper, 20)\n", 166 | "space" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 67, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "dist_uni = stats.norm(1, 1)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 68, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "-8.379093586553296" 187 | ] 188 | }, 189 | "execution_count": 68, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "np.sum([1/2000*(np.log2((1/2000)/dist_uni.pdf(s))) for s in samples])" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 69, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "-8.380821783940933" 207 | ] 208 | }, 209 | "execution_count": 69, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "np.sum([1/2000*(np.log2((1/2000)/(1/6))) for s in samples])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 104, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "" 227 | ] 228 | }, 229 | "execution_count": 104, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | }, 233 | { 234 | "data": { 235 | "image/png": "\n", 236 | "text/plain": [ 237 | "
" 238 | ] 239 | }, 240 | "metadata": {}, 241 | "output_type": "display_data" 242 | } 243 | ], 244 | "source": [ 245 | "s1 = np.random.normal(2, 1, 2000)\n", 246 | "s2 = np.random.normal(2.2, 1, 2000)\n", 247 | "\n", 248 | "plt.figure(figsize=[10, 6])\n", 249 | "\n", 250 | "sns.distplot(s1, hist=False, label=\"s1\")\n", 251 | "sns.distplot(s2, hist=False, label=\"s2\")" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 119, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "-0.24610114352357795" 263 | ] 264 | }, 265 | "execution_count": 119, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "dist_s1 = stats.norm(2, 1)\n", 272 | "dist_s2 = stats.norm(2.2, 1)\n", 273 | "\n", 274 | "KL = 0\n", 275 | "for i in range(1000):\n", 276 | " xi = np.random.normal(2, 1)\n", 277 | " pxi = dist_s1.pdf(xi)\n", 278 | " qxi = dist_s2.pdf(xi)\n", 279 | " \n", 280 | " kli = xi*np.log2(pxi/qxi)\n", 281 | " KL += kli\n", 282 | "KL/1000" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.6.5" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 2 307 | } 308 | -------------------------------------------------------------------------------- /KMeans/kMeans.py: -------------------------------------------------------------------------------- 1 | # use numpy to implement k-means algorithm 2 | 3 | import numpy as np 4 | import random 5 | import matplotlib.pyplot as plt 6 | 7 | x1 = np.random.normal(loc=1, scale=.4, size=50) 8 | y1 = np.random.normal(loc=1, scale=.3, size=50) 9 | x2 = np.random.normal(loc=3, scale=.2, size=40) 10 | y2 = np.random.normal(loc=2, scale=.3, size=40) 11 | 12 | plt.scatter(x1, y1, c='r') 13 | plt.scatter(x2, y2, c='b') 14 | 15 | x = np.concatenate((x1, x2)) 16 | y = np.concatenate((y1, y2)) 17 | 18 | data = np.stack((x, y), axis=1) # (90, 2) 19 | 20 | 21 | class kMeans(): 22 | 23 | def __init__(self, k, max_iter=100): 24 | self.k = k 25 | self.max_iter = max_iter 26 | self.centroids = None 27 | 28 | def distane(self, pt, centroids): 29 | # compute distance using euclidean 30 | # return the cluster corresponds to the min distance 31 | min_dist = 10000 32 | min_index = -1 33 | for i in range(len(centroids)): 34 | curr_dist = np.sum((pt-centroids[i])**2) 35 | if curr_dist < min_dist: 36 | min_dist = curr_dist 37 | min_index = i 38 | 39 | return min_index 40 | 41 | def updata_centroids(self, data, classes): 42 | for i in range(self.k): 43 | class_i_data = data[np.where(np.array(classes) == i)] 44 | self.centroids[i] = np.mean(class_i_data, axis=0) 45 | 46 | def fit(self, data): 47 | self.centroids = random.sample(data, self.k) 48 | # iteration 49 | iter = 0 50 | while iter < self.max_iter: 51 | iter += 1 52 | # a list to store class index 53 | classes = [] 54 | # iterate all points 55 | for point in data: 56 | curr_class = self.distane(point, self.centroids) 57 | classes.append(curr_class) 58 | # update centroids 59 | self.updata_centroids(data, classes) 60 | 61 | return np.array(classes) 62 | 63 | 64 | km = kMeans(k=2, max_iter=10) 65 | res = km.fit(data) 66 | 67 | color = ['r', 'c', 'g', 'k', 'b', 'm', 'y', 'w'] 68 | 69 | for i in range(data.shape[0]): 70 | plt.scatter(data[i, 0], data[i, 1], c=color[res[i]], s=40) 71 | 72 | -------------------------------------------------------------------------------- /Multi-regression/multivariable_reg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.model_selection import train_test_split 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | class muti_reg: 8 | 9 | def __init__(self): 10 | self.b = None 11 | 12 | def fit(self, train_X, train_y): 13 | train_X = train_X.reshape(len(train_y), -1) 14 | train_y = train_y.reshape(len(train_y), 1) 15 | X = np.insert(train_X, 0, 1, axis=1) 16 | self.b = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(train_y) # (x.T*x)-1x.T*y 17 | 18 | def predict(self, test_X): 19 | test_X = test_X.reshape(test_X.shape[0], -1) 20 | X = np.insert(test_X, 0, 1, axis=1) 21 | output = np.dot(X, self.b) 22 | 23 | return output.flatten() 24 | 25 | 26 | def error(y_true, y_pred): 27 | n = len(y_true) 28 | sse = np.sum((y_true-y_pred)**2) 29 | mse = sse/float(n) 30 | rmse = np.sqrt(mse) 31 | sst = np.sum((y_true-np.mean(y_true))**2) 32 | r_square = 1-sse/sst 33 | # ad_r_square = 1-(1-r_square**2)*(n-1)/(n-k) 34 | # print ad_r_square 35 | print 'SSE:', sse, '\nMSE:', mse, '\nRMSE', rmse, '\nR_Square:', r_square 36 | 37 | 38 | # uni-variable regression 39 | 40 | x = np.linspace(1, 10, 20) 41 | y = 2*x+3+np.random.randn(20) 42 | 43 | plt.plot(x, y, 'bo') 44 | 45 | mr = muti_reg() 46 | mr.fit(x, y) 47 | y_pred = mr.predict(x) 48 | 49 | error(y, y_pred) 50 | 51 | # plt.grid() 52 | plt.scatter(x, y) 53 | plt.plot(x, 2*x+3, '-r', linewidth=2, label='actual') 54 | plt.plot(x, y_pred, '--b', label='predict') 55 | plt.legend(loc='upper left') 56 | 57 | # multi-linear-regression 58 | 59 | state = pd.read_csv('state.csv') 60 | data = state.drop([state.columns[0], 'Murder'], axis=1) 61 | data_y = state.Murder 62 | 63 | train_X, test_X, train_y, test_y = train_test_split(np.array(data), np.array(data_y)) 64 | 65 | mr2 = muti_reg() 66 | mr2.fit(train_X, train_y) 67 | test_pred = mr2.predict(test_X) 68 | 69 | error(test_y, test_pred) 70 | 71 | # SSE: 34.9067045869 72 | # MSE: 2.68513112207 73 | # RMSE 1.6386369708 74 | # R_Square: 0.658421407881 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Algorithm Implemention 2 | Implement algorithms with built-in functions 3 | 4 | ## Importance Sampling 5 | - [Importance Sampling](https://medium.com/@zhangyue9306/importance-sampling-introduction-e76b2c32e744) 6 | --------------------------------------------------------------------------------