├── .DS_Store ├── Animated_Illustrations ├── .DS_Store ├── ATTRIBUTION.txt ├── density_estimation.png ├── density_estimation.py ├── gaussian_mixture_model.png ├── gaussian_mixture_model.py ├── gradient_descent.png ├── gradient_descent.py ├── kernel_regression.png ├── kernel_regression.py ├── kmeans alpha 0.3.png ├── kmeans.py ├── linear_regression.png └── linear_regression_fit.py ├── LICENSE ├── PCA_MNIST.py ├── README.md ├── UMAP_MNIST.py ├── autoencoder_MNIST.py ├── data.txt ├── density_estimation.py ├── gaussian_mixture_model.py ├── gradient_descent.py ├── kernel_regression.py ├── kernel_trick.py ├── kmeans.py ├── linear_regression_fit.py ├── multivariate_gaussian.py ├── pdf.py ├── pmf.py ├── prediction_strength.py ├── standard_logistic_function.py ├── under_over_fitting.py └── vector.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/.DS_Store -------------------------------------------------------------------------------- /Animated_Illustrations/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/.DS_Store -------------------------------------------------------------------------------- /Animated_Illustrations/ATTRIBUTION.txt: -------------------------------------------------------------------------------- 1 | The animated illustrations are produced by Ranjan Piyush (https://www.linkedin.com/in/ranjan-piyush-34b29856/) based on the original book's source code. 
-------------------------------------------------------------------------------- /Animated_Illustrations/density_estimation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/density_estimation.png -------------------------------------------------------------------------------- /Animated_Illustrations/density_estimation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | import math 6 | 7 | from sklearn.neighbors import KernelDensity 8 | 9 | import scipy.integrate as integrate 10 | from sklearn.kernel_ridge import KernelRidge 11 | 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 14 | matplotlib.rcParams.update({'font.size': 18}) 15 | 16 | mu1, sigma1 = 3.0, 1.0 17 | mu2, sigma2 = 8.0, 1.5 18 | 19 | def sample_points(): 20 | s1 = np.random.normal(mu1, sigma1, 50) 21 | s2 = np.random.normal(mu2, sigma2, 50) 22 | return list(s1) + list(s2) 23 | 24 | # generate points used to plot 25 | x_plot = np.linspace(-3,15,100) 26 | 27 | # generate points and keep a subset of them 28 | x = sample_points() 29 | 30 | ##colors = ['red','blue','orange','green','black','purple','yellow','magenta', 31 | ## 'pink','grey'] 32 | lw = 2 33 | 34 | def kernel(x1, x2, bi = 5.0): 35 | z = (x1 - x2) / bi 36 | return (1.0/math.sqrt(2.0 * 3.14)) * math.exp((-1.0/2.0)*(z**2)) 37 | 38 | def fb(xx, data, bi): 39 | return (1/(len(data)*bi)) * sum([kernel(xx, xi, bi) for xi in data]) 40 | 41 | def fbi(i, data, bi): 42 | data_minus_i = [] 43 | for ii in range(len(data)): 44 | if i != ii: 45 | data_minus_i.append(data[ii]) 46 | return (1/(len(data_minus_i)*bi)) * sum([kernel(data[i], xi, bi) for xi in data_minus_i]) 47 | 48 | def sum_pdf(x): 49 | result = 
[] 50 | for i in range(len(x)): 51 | result.append((sp.stats.norm.pdf(x, mu1, sigma1)[i] + sp.stats.norm.pdf(x, mu2, sigma2)[i])/2.0) 52 | #result.append(sp.stats.norm.pdf(x, mu1, sigma1)[i]) 53 | return result 54 | 55 | b = np.linspace(0.01, 5.0, 100) 56 | 57 | score = [] 58 | for bi in b: 59 | def fb2(xx): 60 | return fb(xx, x, bi)**2 61 | 62 | s = integrate.quad(fb2, -np.inf, np.inf)[0] - 2.0*np.mean([fbi(i, x, bi) for i in range(len(x))]) 63 | score.append(s) 64 | 65 | plt.figure(1) 66 | plt.plot(b,score) 67 | plt.xlabel("$b$") 68 | plt.ylabel("$l$") 69 | plt.tight_layout() 70 | plt.xticks(np.arange(0,5,0.5)) 71 | #plt.show() 72 | fig1 = plt.gcf() 73 | ##fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 74 | ##fig1.savefig('../../Illustrations/density-estimation-loss.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 75 | ##fig1.savefig('../../Illustrations/density-estimation-loss.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 76 | ##fig1.savefig('../../Illustrations/density-estimation-loss.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 77 | minb = [bi for bi, s in zip(b, score) if s == min(score)][0] 78 | print(minb) 79 | 80 | import numpy 81 | 82 | def fig2data(fig): 83 | """ 84 | @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it 85 | @param fig a matplotlib figure 86 | @return a numpy 3D array of RGB values 87 | """ 88 | # draw the renderer 89 | fig.canvas.draw() 90 | 91 | # Get the RGBA buffer from the figure 92 | w,h = fig.canvas.get_width_height() 93 | buf = numpy.array(fig.canvas.renderer._renderer) 94 | 95 | return buf 96 | 97 | seq = [] 98 | for count,degree in enumerate([round(minb,2)] + np.arange(0.05,1.5,0.05)): 99 | plt.figure(count+2) 100 | axes = plt.gca() 101 | axes.set_xlim([-3,15]) 102 | axes.set_ylim([0,0.3]) 103 | plt.xlabel("$x$") 104 | plt.ylabel("pdf") 105 | degree = round(degree,2) 106 | ## heading = 
'Iteration '+str(count) 107 | plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples") 108 | plt.plot(x_plot, [fb(xp ,x, degree) for xp in x_plot],color='blue',linewidth=lw, label="$\\hat{f}_b$, $b = " + str(degree) + "$") 109 | plt.plot(x_plot,sum_pdf(x_plot), label="true pdf") 110 | ## plt.title(heading) 111 | 112 | plt.legend(loc='upper right',prop = {'size':9}) 113 | plt.tight_layout() 114 | 115 | fig1 = plt.gcf() 116 | nfig = fig2data(fig1) 117 | seq.append(nfig) 118 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 119 | ## fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 120 | ## fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 121 | ## fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 122 | #plt.show() 123 | 124 | import os 125 | ## Get the directory address of current python file 126 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 127 | os.chdir(curr_dir) ## Set the current directory as working directory 128 | 129 | ## The package used to create gif files 130 | import numpngw 131 | numpngw.write_apng('density_estimation.png',seq,delay = 500) 132 | 133 | -------------------------------------------------------------------------------- /Animated_Illustrations/gaussian_mixture_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/gaussian_mixture_model.png -------------------------------------------------------------------------------- /Animated_Illustrations/gaussian_mixture_model.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | import math 6 | 7 | from sklearn.neighbors import KernelDensity 8 | 9 | import scipy.integrate as integrate 10 | from sklearn.kernel_ridge import KernelRidge 11 | 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 14 | matplotlib.rcParams.update({'font.size': 18}) 15 | 16 | mu1, sigma1 = 3.0, 1.0 17 | mu2, sigma2 = 8.0, 3.5 18 | 19 | def sample_points(): 20 | s1 = np.random.normal(mu1, math.sqrt(sigma1), 50) 21 | 22 | s2 = np.random.normal(mu2, math.sqrt(sigma2), 50) 23 | 24 | return list(s1) + list(s2) 25 | 26 | def compute_bi(mu1local, sigma1local, mu2local, sigma2local, phi1local, phi2local): 27 | bis = [] 28 | for xi in x: 29 | bis.append((sp.stats.norm.pdf(xi, mu1local, math.sqrt(sigma1local)) * phi1local)/(sp.stats.norm.pdf(xi, mu1local, math.sqrt(sigma1local)) * phi1local + sp.stats.norm.pdf(xi, mu2local, math.sqrt(sigma2local)) * phi2local)) 30 | return bis 31 | 32 | # generate points used to plot 33 | x_plot = np.linspace(-2, 12, 100) 34 | 35 | # generate points and keep a subset of them 36 | x = sample_points() 37 | 38 | colors = ['red', 'blue', 'orange', 'green'] 39 | lw = 2 40 | 41 | mu1_estimate = 1.0 42 | mu2_estimate = 2.0 43 | sigma1_estimate = 1.0 44 | sigma2_estimate = 2.0 45 | 46 | phi1_estimate = 0.5 47 | phi2_estimate = 0.5 48 | 49 | import numpy 50 | 51 | def fig2data(fig): 52 | """ 53 | @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it 54 | @param fig a matplotlib figure 55 | @return a numpy 3D array of RGB values 56 | """ 57 | # draw the renderer 58 | fig.canvas.draw() 59 | 60 | # Get the RGBA buffer from the figure 61 | w,h = fig.canvas.get_width_height() 62 | buf = numpy.array(fig.canvas.renderer._renderer) 63 | 64 | return buf 65 | 66 | seq = [] 67 | count = 
0 68 | while True: 69 | plt.figure(count) 70 | axes = plt.gca() 71 | axes.set_xlim([-2,14]) 72 | axes.set_ylim([0,0.8]) 73 | plt.xlabel("$x$") 74 | plt.ylabel("pdf") 75 | heading = "Iteration "+str(count) 76 | plt.title(heading) 77 | plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples") 78 | plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1_estimate, math.sqrt(sigma1_estimate)) for xp in x_plot], color=colors[1], linewidth=lw, label="$f(x_i \\mid \\mu_1 ,\\sigma_1^2)$") 79 | plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2_estimate, math.sqrt(sigma2_estimate)) for xp in x_plot], color=colors[3], linewidth=lw, label="$f(x_i \\mid \\mu_2 ,\\sigma_2^2)$") 80 | plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1, math.sqrt(sigma1)) for xp in x_plot], color=colors[0], label="true pdf") 81 | plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2, math.sqrt(sigma2)) for xp in x_plot], color=colors[0]) 82 | 83 | plt.legend(loc='upper right',prop={'size': 9}) 84 | plt.tight_layout() 85 | 86 | fig1 = plt.gcf() 87 | ##fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 88 | ## fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 89 | ## fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 90 | ## fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 91 | nfig = fig2data(fig1) 92 | seq.append(nfig) 93 | ##plt.show() 94 | 95 | bis1 = compute_bi(mu1_estimate, sigma1_estimate, mu2_estimate, sigma2_estimate, phi1_estimate, phi2_estimate) 96 | bis2 = compute_bi(mu2_estimate, sigma2_estimate, mu1_estimate, sigma1_estimate, phi2_estimate, phi1_estimate) 97 | 98 | #print bis1[:5] 99 | #print bis2[:5] 100 | 101 | mu1_estimate = sum([bis1[i] * x[i] for i in range(len(x))]) / 
sum([bis1[i] for i in range(len(x))]) 102 | mu2_estimate = sum([bis2[i] * x[i] for i in range(len(x))]) / sum([bis2[i] for i in range(len(x))]) 103 | 104 | sigma1_estimate = sum([bis1[i] * (x[i] - mu1_estimate)**2 for i in range(len(x))]) / sum([bis1[i] for i in range(len(x))]) 105 | sigma2_estimate = sum([bis2[i] * (x[i] - mu2_estimate)**2 for i in range(len(x))]) / sum([bis2[i] for i in range(len(x))]) 106 | 107 | #print mu1_estimate, mu2_estimate 108 | #print sigma1_estimate, sigma2_estimate 109 | 110 | phi1_estimate = sum([bis1[i] for i in range(len(x))])/float(len(x)) 111 | phi2_estimate = 1.0 - phi1_estimate 112 | 113 | print(phi1_estimate) 114 | 115 | count += 1 116 | 117 | plt.close(count) 118 | 119 | if count > 50: 120 | break 121 | 122 | import os 123 | ## Get the directory address of current python file 124 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 125 | os.chdir(curr_dir) ## Set the current directory as working directory 126 | 127 | ## The package used to create gif files 128 | import numpngw 129 | numpngw.write_apng('GMM.png',seq,delay = 250) 130 | -------------------------------------------------------------------------------- /Animated_Illustrations/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/gradient_descent.png -------------------------------------------------------------------------------- /Animated_Illustrations/gradient_descent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | import matplotlib 6 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 7 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 8 | matplotlib.rcParams.update({'font.size': 18}) 9 | 10 | 11 | def plot_original_data(): 12 | df = pd.read_csv("data.csv") 13 | 
plt.scatter(df['Spendings'], df['Sales'], color='#1f77b4',marker = 'o') 14 | plt.xlabel("Spendings, M$") 15 | plt.ylabel("Sales, Units") 16 | plt.title("Sales vs radio ad spendings") 17 | #plt.show() 18 | axes = plt.gca() 19 | axes.set_xlim([0,50]) 20 | axes.set_ylim([0,35]) 21 | plt.tight_layout() 22 | fig1 = plt.gcf() 23 | nfig = fig2data(fig1) 24 | seq.append(nfig) 25 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 26 | ## fig1.savefig('../../Illustrations/gradient_descent-1.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 27 | ## fig1.savefig('../../Illustrations/gradient_descent-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 28 | ## fig1.savefig('../../Illustrations/gradient_descent-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 29 | 30 | def update_w_and_b(spendings,sales,w,b,alpha): 31 | dr_dw = 0.0 32 | dr_db = 0.0 33 | N = len(spendings) 34 | for i in range(N): 35 | dr_dw += -2*spendings[i]*(sales[i] - (w*spendings[i] + b)) 36 | dr_db += -2*(sales[i] - (w*spendings[i] + b)) 37 | # update w and b 38 | w = w - (dr_dw/float(N))*alpha 39 | b = b - (dr_db/float(N))*alpha 40 | return w,b 41 | 42 | def fig2data(fig): 43 | """ 44 | @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it 45 | @param fig a matplotlib figure 46 | @return a numpy 3D array of RGB values 47 | """ 48 | # draw the renderer 49 | fig.canvas.draw() 50 | # Get the RGBA buffer from the figure 51 | w,h = fig.canvas.get_width_height() 52 | buf = np.array(fig.canvas.renderer._renderer) 53 | return buf 54 | 55 | seq = [] 56 | plot_original_data() 57 | def train(spendings,sales,w,b,alpha,epochs): 58 | image_counter = 2; 59 | for e in range(epochs): 60 | w, b = update_w_and_b(spendings,sales,w,b,alpha) 61 | # log the progress 62 | if (e==0) or (e<3000 and e%400==0) or (e%3000==0): 63 | print("epoch: ", str(e), "loss: "+str(loss(spendings,sales,w,b))) 64 | 
print("w, b: ",w,b) 65 | plt.figure(image_counter) 66 | plt.xlabel("Spendings, M$") 67 | plt.ylabel("Sales, Units") 68 | axes = plt.gca() 69 | axes.set_xlim([0,50]) 70 | axes.set_ylim([0,35]) 71 | plt.scatter(spendings,sales,color='#1f77b4',marker='o' ) 72 | X_plot = np.linspace(0,50,50) 73 | plt.plot(X_plot,X_plot*w + b) 74 | heading = 'epoch = '+str(e)+' loss = '+str(round(loss(spendings,sales,w,b))) 75 | plt.title(heading) 76 | #plt.show() 77 | plt.tight_layout() 78 | fig1 = plt.gcf() 79 | nfig = fig2data(fig1) 80 | seq.append(nfig) 81 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 82 | ## fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 83 | ## fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 84 | ## fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 85 | image_counter += 1 86 | return w,b 87 | 88 | def loss(spendings,sales,w,b): 89 | N = len(spendings) 90 | total_error = 0.0 91 | for i in range(N): 92 | total_error += (sales[i] - (w*spendings[i] + b))**2 93 | return total_error/N 94 | 95 | df = pd.read_csv("data.csv") 96 | x = df['Spendings'] 97 | y = df['Sales'] 98 | w,b = train(x,y,0.0,0.0,0.001,16000) 99 | 100 | def predict(x,w,b): 101 | return w*x + b 102 | x_new = 23.0 103 | y_new = predict(x_new, w, b) 104 | print(y_new) 105 | 106 | import os 107 | ## Get the directory address of current python file 108 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 109 | os.chdir(curr_dir) ## Set the current directory as working directory 110 | 111 | ## The package used to create gif files 112 | import numpngw 113 | numpngw.write_apng('gradient_descent.png',seq,delay = 750) 114 | 
-------------------------------------------------------------------------------- /Animated_Illustrations/kernel_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/kernel_regression.png -------------------------------------------------------------------------------- /Animated_Illustrations/kernel_regression.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import math 5 | 6 | from sklearn.linear_model import Ridge 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from sklearn.pipeline import make_pipeline 9 | from sklearn.kernel_ridge import KernelRidge 10 | 11 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 12 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 13 | matplotlib.rcParams.update({'font.size': 25}) 14 | 15 | def f(x): 16 | """ function to approximate by polynomial interpolation""" 17 | return x * (x) 18 | 19 | 20 | # generate points used to plot 21 | x_plot = np.linspace(-5,2,100) 22 | 23 | # generate points and keep a subset of them 24 | x = np.linspace(-5,2,100) 25 | rng = np.random.RandomState(0) 26 | rng.shuffle(x) 27 | x = np.sort(x[:50]) 28 | noize = [(-5 + np.random.random()*5) for i in range(len(x))] 29 | y = f(x) + noize 30 | 31 | # create matrix versions of these arrays 32 | X = x[:, np.newaxis] 33 | X_plot = x_plot[:, np.newaxis] 34 | 35 | ##colors = ['red', 'blue', 'orange'] 36 | lw = 2 37 | 38 | def kernel(x1,x2,b=2): 39 | z = (x1-x2)/b 40 | return (1/math.sqrt(2*3.14))*np.exp(-z**2/2) 41 | 42 | def fig2data(fig): 43 | """ 44 | @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it 45 | @param fig a matplotlib figure 46 | @return a numpy 3D array of RGB values 47 | """ 48 | # draw the renderer 49 | fig.canvas.draw() 
50 | 51 | # Get the RGBA buffer from the figure 52 | w,h = fig.canvas.get_width_height() 53 | buf = np.array(fig.canvas.renderer._renderer) 54 | 55 | return buf 56 | 57 | seq = [] 58 | fit = ["fit", "small overfit", "big overfit"] 59 | for count, degree in enumerate(np.arange(0.05,1.5,0.05)): 60 | plt.figure(count) 61 | axes = plt.gca() 62 | axes.set_xlim([-5,2]) 63 | axes.set_ylim([-10,30]) 64 | degree = round(degree,2) 65 | plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples") 66 | model = KernelRidge(alpha=0.01, kernel=kernel, kernel_params = {'b':degree}) 67 | model.fit(X, y) 68 | y_plot = model.predict(X_plot) 69 | plt.plot(x_plot, y_plot, color='green', linewidth=lw, 70 | label="b = " + str(degree)) 71 | 72 | ## heading = 'Iteration '+str(count) 73 | ## plt.title(heading) 74 | plt.legend(loc='upper right',prop={'size': 9}) 75 | fig1 = plt.gcf() 76 | nfig = fig2data(fig1) 77 | seq.append(nfig) 78 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 79 | ## fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 80 | ## fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 81 | ## fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 82 | 83 | 84 | ##plt.show() 85 | 86 | import os 87 | ## Get the directory address of current python file 88 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 89 | os.chdir(curr_dir) ## Set the current directory as working directory 90 | 91 | ## The package used to create gif files 92 | import numpngw 93 | numpngw.write_apng('kernel_regression.png',seq,delay = 500) 94 | 95 | -------------------------------------------------------------------------------- /Animated_Illustrations/kmeans alpha 0.3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/kmeans alpha 0.3.png -------------------------------------------------------------------------------- /Animated_Illustrations/kmeans.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from sklearn.datasets.samples_generator import make_blobs 6 | from sklearn.metrics import pairwise_distances_argmin 7 | from random import shuffle, random 8 | from matplotlib.ticker import NullLocator 9 | from scipy.spatial import Voronoi 10 | 11 | 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 14 | matplotlib.rcParams.update({'font.size': 18}) 15 | 16 | x, _ = make_blobs(n_samples=50, centers=3, cluster_std=0.6, random_state=0) 17 | 18 | #plt.scatter(x[:, 0], x[:, 1], s=50) 19 | 20 | def voronoi_finite_polygons_2d(vor, radius=None): 21 | """ 22 | Reconstruct infinite voronoi regions in a 2D diagram to finite 23 | regions. 24 | 25 | Parameters 26 | ---------- 27 | vor : Voronoi 28 | Input diagram 29 | radius : float, optional 30 | Distance to 'points at infinity'. 31 | 32 | Returns 33 | ------- 34 | regions : list of tuples 35 | Indices of vertices in each revised Voronoi regions. 36 | vertices : list of tuples 37 | Coordinates for revised Voronoi vertices. Same as coordinates 38 | of input vertices, with 'points at infinity' appended to the 39 | end. 
40 | 41 | """ 42 | 43 | if vor.points.shape[1] != 2: 44 | raise ValueError("Requires 2D input") 45 | 46 | new_regions = [] 47 | new_vertices = vor.vertices.tolist() 48 | 49 | center = vor.points.mean(axis=0) 50 | if radius is None: 51 | radius = vor.points.ptp().max()*2 52 | 53 | # Construct a map containing all ridges for a given point 54 | all_ridges = {} 55 | for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices): 56 | all_ridges.setdefault(p1, []).append((p2, v1, v2)) 57 | all_ridges.setdefault(p2, []).append((p1, v1, v2)) 58 | 59 | # Reconstruct infinite regions 60 | for p1, region in enumerate(vor.point_region): 61 | vertices = vor.regions[region] 62 | 63 | if all([v >= 0 for v in vertices]): 64 | # finite region 65 | new_regions.append(vertices) 66 | continue 67 | 68 | # reconstruct a non-finite region 69 | ridges = all_ridges[p1] 70 | new_region = [v for v in vertices if v >= 0] 71 | 72 | for p2, v1, v2 in ridges: 73 | if v2 < 0: 74 | v1, v2 = v2, v1 75 | if v1 >= 0: 76 | # finite ridge: already in the region 77 | continue 78 | 79 | # Compute the missing endpoint of an infinite ridge 80 | 81 | t = vor.points[p2] - vor.points[p1] # tangent 82 | t /= np.linalg.norm(t) 83 | n = np.array([-t[1], t[0]]) # normal 84 | 85 | midpoint = vor.points[[p1, p2]].mean(axis=0) 86 | direction = np.sign(np.dot(midpoint - center, n)) * n 87 | far_point = vor.vertices[v2] + direction * radius 88 | 89 | new_region.append(len(new_vertices)) 90 | new_vertices.append(far_point.tolist()) 91 | 92 | # sort region counterclockwise 93 | vs = np.asarray([new_vertices[v] for v in new_region]) 94 | c = vs.mean(axis=0) 95 | angles = np.arctan2(vs[:,1] - c[1], vs[:,0] - c[0]) 96 | new_region = np.array(new_region)[np.argsort(angles)] 97 | 98 | # finish 99 | new_regions.append(new_region.tolist()) 100 | 101 | return new_regions, np.asarray(new_vertices) 102 | 103 | def fig2data(fig): 104 | """ 105 | @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and 
return it 106 | @param fig a matplotlib figure 107 | @return a numpy 3D array of RGB values 108 | """ 109 | # draw the renderer 110 | fig.canvas.draw() 111 | 112 | # Get the RGBA buffer from the figure 113 | w,h = fig.canvas.get_width_height() 114 | buf = np.array(fig.canvas.renderer._renderer) 115 | 116 | return buf 117 | 118 | seq = [] 119 | 120 | def find_clusters(x, n_clusters): 121 | # randomly set cluster centroids 122 | x_list = list(x) 123 | shuffle(x_list) 124 | centroids = np.array([[2 * random(), 4 * random()], [2 * random(), 4 * random()], [2 * random(), 4 * random()]]) 125 | 126 | counter = 0 127 | 128 | plt.figure(counter) 129 | 130 | plt.scatter(x[:, 0], x[:, 1], s=50) 131 | 132 | ax = plt.gca() 133 | ax.set_xlabel('$x_1$') 134 | ax.set_ylabel('$x_2$') 135 | plt.xlim(-3.0, 4.0) 136 | plt.ylim(-1, 6) 137 | 138 | fig1 = plt.gcf() 139 | 140 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 141 | ## 142 | ## fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 143 | ## fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 144 | ## fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 145 | 146 | #plt.show() 147 | 148 | 149 | counter = 1 150 | 151 | while True: 152 | 153 | plt.figure(counter) 154 | axes = plt.gca() 155 | 156 | # assign labels based on closest centroid 157 | labels = pairwise_distances_argmin(x, centroids) 158 | 159 | plt.scatter(x[:, 0], x[:, 1], c=[l + 1 for l in labels], s=50, cmap='tab10', zorder=2); 160 | 161 | plt.scatter(centroids[:, 0], centroids[:, 1], c=[1,2,3], s=200, cmap='tab10', marker="s", facecolors='none', zorder=2); 162 | plt.xlim(-3.0, 4.0) 163 | plt.ylim(-1, 6) 164 | 165 | vor = Voronoi(centroids) 166 | 167 | # plot 168 | regions, vertices = 
voronoi_finite_polygons_2d(vor, 300) 169 | print("--") 170 | print(regions) 171 | ## print("--") 172 | ## print(vertices) 173 | 174 | # colorize 175 | for region in regions: 176 | polygon = vertices[region] 177 | plt.fill(*zip(*polygon), alpha=0.3, zorder=1) 178 | 179 | ax = plt.gca() 180 | ax.set_xlabel('$x_1$') 181 | ax.set_ylabel('$x_2$') 182 | 183 | heading = 'Iteration '+str(counter) 184 | plt.title(heading) 185 | plt.tight_layout() 186 | 187 | fig1 = plt.gcf() 188 | nfig = fig2data(fig1) 189 | seq.append(nfig) 190 | #ax.set_axis_off() 191 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 192 | #plt.margins(0,0) 193 | #ax.xaxis.set_major_locator(NullLocator()) 194 | #ax.yaxis.set_major_locator(NullLocator()) 195 | 196 | 197 | ## fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 198 | ## fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 199 | ## fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 200 | 201 | #plt.show() 202 | 203 | # find new centroids as the average of examples 204 | new_centroids = np.array([x[labels == i].mean(0) for i in range(n_clusters)]) 205 | 206 | # check for convergence 207 | if np.all(centroids == new_centroids): 208 | break 209 | centroids = new_centroids 210 | 211 | counter += 1 212 | 213 | return centroids, labels 214 | 215 | centroids, labels = find_clusters(x,3) 216 | 217 | import os 218 | ## Get the directory address of current python file 219 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 220 | os.chdir(curr_dir) ## Set the current directory as working directory 221 | 222 | ## The package used to create gif files 223 | import numpngw 224 | numpngw.write_apng('kmeans.png',seq,delay = 500) 225 | 
-------------------------------------------------------------------------------- /Animated_Illustrations/linear_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/linear_regression.png -------------------------------------------------------------------------------- /Animated_Illustrations/linear_regression_fit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.linear_model import Ridge 5 | from sklearn.preprocessing import PolynomialFeatures 6 | from sklearn.pipeline import make_pipeline 7 | 8 | import matplotlib 9 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 10 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 11 | matplotlib.rcParams.update({'font.size': 18}) 12 | 13 | def f(x): 14 | """ function to approximate by polynomial interpolation""" 15 | return 0.5 * x 16 | 17 | 18 | # generate points used to plot 19 | x_plot = np.linspace(-10, 10, 100) 20 | 21 | # generate points and keep a subset of them 22 | x = np.linspace(-10, 10, 100) 23 | rng = np.random.RandomState(0) 24 | rng.shuffle(x) 25 | x = np.sort(x[:10]) 26 | noize = [(-2 + np.random.random()*2) for i in range(len(x))] 27 | y = f(x) + noize 28 | 29 | # create matrix versions of these arrays 30 | X = x[:, np.newaxis] 31 | X_plot = x_plot[:, np.newaxis] 32 | 33 | colors = ['red', 'red']#, 'orange' 34 | lw = 2 35 | 36 | def fig2data(fig): 37 | """ 38 | @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it 39 | @param fig a matplotlib figure 40 | @return a numpy 3D array of RGB values 41 | """ 42 | # draw the renderer 43 | fig.canvas.draw() 44 | 45 | # Get the RGBA buffer from the figure 46 | w,h = fig.canvas.get_width_height() 47 | buf = np.array(fig.canvas.renderer._renderer) 48 | 49 | return 
buf 50 | 51 | seq = [] 52 | type_of_regression = ["linear regression", "regression of degree 10"] 53 | fit = ["fit", "overfit"] 54 | for count, degree in enumerate(range(1,11)):#, 2, 15 55 | plt.figure(count) 56 | axes = plt.gca() 57 | axes.set_xlim([-10,10]) 58 | axes.set_ylim([-10,10]) 59 | plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples") 60 | plt.xticks([-10.0, -5.0, 0.0, 5.0, 10.0]) 61 | plt.yticks([-10.0, -5.0, 0.0, 5.0, 10.0]) 62 | model = make_pipeline(PolynomialFeatures(degree), Ridge()) 63 | model.fit(X, y) 64 | y_plot = model.predict(X_plot) 65 | plt.plot(x_plot, y_plot, color='red', linewidth=lw, 66 | label='linear regression of degree '+ str(degree)) 67 | 68 | plt.legend(loc='best') 69 | plt.tight_layout() 70 | fig1 = plt.gcf() 71 | nfig = fig2data(fig1) 72 | seq.append(nfig) 73 | ## fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 74 | ## fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 75 | ## fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 76 | ## fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 77 | 78 | 79 | #plt.show() 80 | 81 | import os 82 | ## Get the directory address of current python file 83 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 84 | os.chdir(curr_dir) ## Set the current directory as working directory 85 | 86 | ## The package used to create gif files 87 | import numpngw 88 | numpngw.write_apng('linear_regression.png',seq,delay = 500) 89 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free 
of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
"""Project the MNIST digits onto their first two principal components.

The 784-dimensional images are reduced to 2-D with PCA, drawn as a scatter
plot coloured by digit label, and saved as EPS/PDF/PNG illustrations.
"""
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import math

from sklearn.decomposition import PCA
# fetch_mldata has been removed from scikit-learn (mldata.org is offline);
# fetch_openml is the documented replacement.
from sklearn.datasets import fetch_openml

matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
matplotlib.rcParams.update({'font.size': 25})

# as_frame=False keeps .data/.target as NumPy arrays, which is what the
# slicing and plotting below expect.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

# OpenML serves the digit labels as strings; scatter() needs numbers for c=.
labels = mnist.target.astype(int)

reducer = PCA(n_components=2)
embedding = reducer.fit_transform(mnist.data)

plt.figure()

plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="Spectral", s=0.1)

# The embedding axes have no meaningful units, so hide the tick labels.
plt.gca().get_xaxis().set_ticklabels([])
plt.gca().get_yaxis().set_ticklabels([])

ax = plt.gca()
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')

fig1 = plt.gcf()

fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
fig1.savefig('../../Illustrations/PCA-MNIST.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
fig1.savefig('../../Illustrations/PCA-MNIST.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
fig1.savefig('../../Illustrations/PCA-MNIST.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)

plt.show()
"""Embed the MNIST digits in 2-D with UMAP and plot the result.

The embedding is drawn as a scatter plot coloured by digit label and saved
as EPS/PDF/PNG illustrations.
"""
import umap
# fetch_mldata has been removed from scikit-learn (mldata.org is offline);
# fetch_openml is the documented replacement.
from sklearn.datasets import fetch_openml
import matplotlib
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
matplotlib.rcParams.update({'font.size': 25})


# as_frame=False keeps .data/.target as NumPy arrays, which is what the
# slicing and plotting below expect.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

# OpenML serves the digit labels as strings; scatter() needs numbers for c=.
labels = mnist.target.astype(int)

# Fixed random_state so the published figure can be regenerated.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(mnist.data)

plt.figure()

plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="Spectral", s=0.1)

# The embedding axes have no meaningful units, so hide the tick labels.
plt.gca().get_xaxis().set_ticklabels([])
plt.gca().get_yaxis().set_ticklabels([])

ax = plt.gca()
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')

fig1 = plt.gcf()

fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
fig1.savefig('../../Illustrations/UMAP-MNIST.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
fig1.savefig('../../Illustrations/UMAP-MNIST.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
fig1.savefig('../../Illustrations/UMAP-MNIST.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)

plt.show()
matplotlib.pyplot as plt 6 | 7 | import keras 8 | from keras.models import Sequential, Model 9 | from keras.layers import Dense 10 | from keras.optimizers import Adam 11 | 12 | from keras.datasets import mnist 13 | 14 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 15 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 16 | matplotlib.rcParams.update({'font.size': 25}) 17 | 18 | from sklearn.datasets import fetch_mldata 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 23 | x_train = x_train.reshape(60000, 784) / 255.0 24 | x_test = x_test.reshape(10000, 784) / 255.0 25 | 26 | m = Sequential() 27 | m.add(Dense(512, activation='elu', input_shape=(784,))) 28 | m.add(Dense(128, activation='elu')) 29 | m.add(Dense(2, activation='linear', name="bottleneck")) 30 | m.add(Dense(128, activation='elu')) 31 | m.add(Dense(512, activation='elu')) 32 | m.add(Dense(784, activation='sigmoid')) 33 | m.compile(loss='mean_squared_error', optimizer = Adam()) 34 | history = m.fit(x_train, x_train, batch_size=128, epochs=5, verbose=1, 35 | validation_data=(x_test, x_test)) 36 | 37 | encoder = Model(m.input, m.get_layer('bottleneck').output) 38 | embedding = encoder.predict(x_train) # bottleneck representation 39 | 40 | plt.figure() 41 | 42 | plt.scatter(embedding[:,0], embedding[:,1], c=y_train, s=0.1, cmap='Spectral') 43 | 44 | plt.gca().get_xaxis().set_ticklabels([]) 45 | plt.gca().get_yaxis().set_ticklabels([]) 46 | 47 | ax = plt.gca() 48 | ax.set_xlabel('$x_1$') 49 | ax.set_ylabel('$x_2$') 50 | 51 | fig1 = plt.gcf() 52 | 53 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 54 | fig1.savefig('../../Illustrations/autoencoder-MNIST.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 55 | fig1.savefig('../../Illustrations/autoencoder-MNIST.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 56 | 
# Parameters (mean, standard deviation) of the two Gaussians that make up
# the true density being estimated.
mu1, sigma1 = 3.0, 1.0
mu2, sigma2 = 8.0, 1.5


def sample_points():
    """Draw the training set: 50 samples from each of the two Gaussians.

    Returns:
        A list of 100 floats, the first 50 drawn from N(mu1, sigma1) and the
        last 50 from N(mu2, sigma2).
    """
    s1 = np.random.normal(mu1, sigma1, 50)
    s2 = np.random.normal(mu2, sigma2, 50)
    return list(s1) + list(s2)


def kernel(x1, x2, bi=2.0):
    """Evaluate the standard Gaussian kernel at the scaled distance (x1 - x2) / bi.

    Args:
        x1: Point at which the density is evaluated.
        x2: Training example the kernel is centred on.
        bi: Bandwidth; must be non-zero.

    Returns:
        exp(-z**2 / 2) / sqrt(2 * pi) with z = (x1 - x2) / bi.
    """
    z = (x1 - x2) / bi
    # math.pi rather than the literal 3.14: with the truncated constant the
    # kernel does not integrate to 1, so the estimate is not a proper density.
    return math.exp(-0.5 * z ** 2) / math.sqrt(2.0 * math.pi)


def fb(xx, data, bi):
    """Kernel density estimate at ``xx`` built from ``data`` with bandwidth ``bi``.

    Args:
        xx: Point at which to evaluate the estimate.
        data: Non-empty sequence of training examples.
        bi: Bandwidth.

    Returns:
        (1 / (N * bi)) * sum_i kernel(xx, data[i], bi).
    """
    # Dividing the float sum (instead of multiplying by 1/(N*bi)) keeps the
    # result correct under Python 2 even when ``bi`` is an int.
    return sum(kernel(xx, xi, bi) for xi in data) / (len(data) * bi)


def fbi(i, data, bi):
    """Leave-one-out density estimate at ``data[i]``.

    The estimate is built from every example except ``data[i]`` itself and is
    used by the bandwidth-selection loss.

    Args:
        i: Index of the held-out example.
        data: Sequence with at least two training examples.
        bi: Bandwidth.

    Returns:
        (1 / ((N - 1) * bi)) * sum_{j != i} kernel(data[i], data[j], bi).
    """
    held_out = data[i]
    total = sum(kernel(held_out, xj, bi) for j, xj in enumerate(data) if j != i)
    return total / ((len(data) - 1) * bi)


def sum_pdf(x):
    """Evaluate the true density: the equal-weight mixture of the two Gaussians.

    Args:
        x: 1-D sequence or array of evaluation points.

    Returns:
        A list with one density value per element of ``x``.
    """
    # Imported here so the function does not depend on ``sp.stats`` having
    # been loaded as a side effect of some other import.
    from scipy.stats import norm

    points = np.asarray(x, dtype=float)
    # One vectorised evaluation per component; the original recomputed both
    # pdfs over the whole array for every single element.
    density = (norm.pdf(points, mu1, sigma1) + norm.pdf(points, mu2, sigma2)) / 2.0
    return list(density)
def compute_bi(mu1local, sigma1local, mu2local, sigma2local, phi1local, phi2local, data=None):
    """Posterior probability that each example belongs to the first component.

    This is the E-step of EM for a two-component 1-D Gaussian mixture.

    Args:
        mu1local: Mean of the component whose responsibility is returned.
        sigma1local: Variance of that component (its square root is used as
            the standard deviation).
        mu2local: Mean of the other component.
        sigma2local: Variance of the other component.
        phi1local: Mixing weight of the first component.
        phi2local: Mixing weight of the other component.
        data: Examples to score. Defaults to the module-level training set
            ``x``, which is what the original implementation always used.

    Returns:
        A list with one responsibility per example:
        phi1 * N(x | mu1, var1) / (phi1 * N(x | mu1, var1) + phi2 * N(x | mu2, var2)).
        An entry is NaN when both densities underflow to zero.
    """
    # Imported here so the function does not depend on ``sp.stats`` having
    # been loaded as a side effect of some other import.
    from scipy.stats import norm

    points = np.asarray(x if data is None else data, dtype=float)
    # Each weighted density is evaluated once for all points; the original
    # computed the first one twice per example inside a Python loop.
    first = norm.pdf(points, mu1local, math.sqrt(sigma1local)) * phi1local
    second = norm.pdf(points, mu2local, math.sqrt(sigma2local)) * phi2local
    return list(first / (first + second))
# Initial guesses for the EM algorithm. The sigma*_estimate values are
# variances: every use below takes their square root before passing them on
# as a standard deviation.
mu1_estimate = 1.0
mu2_estimate = 2.0
sigma1_estimate = 1.0
sigma2_estimate = 2.0

phi1_estimate = 0.5
phi2_estimate = 0.5

# 51 EM iterations (count = 0..50); one illustration is saved per iteration.
for count in range(51):
    plt.figure(count)
    axes = plt.gca()
    axes.set_xlim([-2,12])
    axes.set_ylim([0,0.8])
    plt.xlabel("$x$")
    plt.ylabel("pdf")
    plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples")
    # Current estimates of the two components ...
    plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1_estimate, math.sqrt(sigma1_estimate)) for xp in x_plot], color=colors[1], linewidth=lw, label="$f(x_i \\mid \\mu_1 ,\\sigma_1^2)$")
    plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2_estimate, math.sqrt(sigma2_estimate)) for xp in x_plot], color=colors[3], linewidth=lw, label="$f(x_i \\mid \\mu_2 ,\\sigma_2^2)$")
    # ... against the true components the data was sampled from.
    plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1, math.sqrt(sigma1)) for xp in x_plot], color=colors[0], label="true pdf")
    plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2, math.sqrt(sigma2)) for xp in x_plot], color=colors[0])

    plt.legend(loc='upper right')
    plt.tight_layout()

    fig1 = plt.gcf()
    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    # Close the figure that was just saved. The original called
    # plt.close(count) after incrementing count, which targeted a figure that
    # did not exist yet, so all 51 figures stayed open.
    plt.close(fig1)

    # E-step: responsibility of each component for each example.
    bis1 = compute_bi(mu1_estimate, sigma1_estimate, mu2_estimate, sigma2_estimate, phi1_estimate, phi2_estimate)
    bis2 = compute_bi(mu2_estimate, sigma2_estimate, mu1_estimate, sigma1_estimate, phi2_estimate, phi1_estimate)

    bis1_total = sum(bis1)
    bis2_total = sum(bis2)

    # M-step: responsibility-weighted means ...
    mu1_estimate = sum(b1 * xi for b1, xi in zip(bis1, x)) / bis1_total
    mu2_estimate = sum(b2 * xi for b2, xi in zip(bis2, x)) / bis2_total

    # ... variances around the freshly updated means ...
    sigma1_estimate = sum(b1 * (xi - mu1_estimate)**2 for b1, xi in zip(bis1, x)) / bis1_total
    sigma2_estimate = sum(b2 * (xi - mu2_estimate)**2 for b2, xi in zip(bis2, x)) / bis2_total

    # ... and mixing weights.
    phi1_estimate = bis1_total / float(len(x))
    phi2_estimate = 1.0 - phi1_estimate

    print(phi1_estimate)
def update_w_and_b(spendings, sales, w, b, alpha):
    """Perform one gradient-descent step for the line ``sales = w * spendings + b``.

    The gradient is that of the mean squared error over all examples.

    Args:
        spendings: Sequence of feature values.
        sales: Sequence of target values, indexed like ``spendings``.
        w: Current slope.
        b: Current intercept.
        alpha: Learning rate.

    Returns:
        The updated ``(w, b)`` pair.

    Raises:
        ValueError: If ``spendings`` is empty (the mean gradient is undefined).
    """
    N = len(spendings)
    if N == 0:
        raise ValueError("cannot take a gradient step on an empty dataset")

    dr_dw = 0.0
    dr_db = 0.0
    for i in range(N):
        residual = sales[i] - (w * spendings[i] + b)
        dr_dw += -2 * spendings[i] * residual
        dr_db += -2 * residual

    # update w and b
    w = w - (dr_dw/float(N)) * alpha
    b = b - (dr_db/float(N)) * alpha

    return w, b


def train(spendings, sales, w, b, alpha, epochs):
    """Run ``epochs`` gradient-descent steps, logging and plotting progress.

    On selected epochs (0, every 400th below 3000, and every 3000th) the loss
    and parameters are printed and the current regression line is saved as
    EPS/PDF/PNG illustrations numbered from 2 upwards.

    Args:
        spendings: Sequence of feature values.
        sales: Sequence of target values.
        w: Initial slope.
        b: Initial intercept.
        alpha: Learning rate.
        epochs: Number of gradient steps to take.

    Returns:
        The final ``(w, b)`` pair.
    """
    image_counter = 2
    for e in range(epochs):
        w, b = update_w_and_b(spendings, sales, w, b, alpha)

        # log the progress
        if (e == 0) or (e < 3000 and e % 400 == 0) or (e % 3000 == 0):
            print("epoch: ", str(e), "loss: "+str(loss(spendings, sales, w, b)))
            print("w, b: ", w, b)
            fig1 = plt.figure(image_counter)
            axes = plt.gca()
            axes.set_xlim([0,50])
            axes.set_ylim([0,30])
            plt.scatter(spendings, sales)
            X_plot = np.linspace(0,50,50)
            plt.plot(X_plot, X_plot*w + b)
            fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
            # The snapshot is on disk; close it so figures do not pile up
            # over a long training run.
            plt.close(fig1)
            image_counter += 1
    return w, b


def loss(spendings, sales, w, b):
    """Return the mean squared error of the line ``w * spendings + b``.

    Args:
        spendings: Sequence of feature values.
        sales: Sequence of target values, indexed like ``spendings``.
        w: Slope.
        b: Intercept.

    Returns:
        The average of the squared residuals.

    Raises:
        ValueError: If ``spendings`` is empty (the mean is undefined).
    """
    N = len(spendings)
    if N == 0:
        raise ValueError("cannot compute the loss of an empty dataset")
    total_error = 0.0
    for i in range(N):
        total_error += (sales[i] - (w*spendings[i] + b))**2
    return total_error / N
def f(x):
    """Return the noise-free target function x**2 that the regression approximates."""
    return x * (x)


def kernel(x1, x2, b = 2):
    """Gaussian kernel of the scaled difference (x1 - x2) / b.

    Passed to ``KernelRidge`` as a callable kernel, with ``b`` supplied
    through ``kernel_params``.

    Args:
        x1: First point (scalar or NumPy array).
        x2: Second point (scalar or NumPy array).
        b: Bandwidth; must be non-zero.

    Returns:
        exp(-z**2 / 2) / sqrt(2 * pi) with z = (x1 - x2) / b, computed
        element-wise for array inputs.
    """
    z = (x1 - x2) / b
    # math.pi rather than the literal 3.14 so the kernel is a correctly
    # normalised Gaussian.
    return (1/math.sqrt(2 * math.pi)) * np.exp(-z**2/2)
def _circle_y(x1, radius):
    """Return, for each x, a y on the circle of the given radius with a random sign."""
    result = []
    for x in x1:
        # Draw the side before anything else so a seeded ``random`` yields
        # the same sequence of signs as the original implementation.
        side = random.uniform(0, 1)
        radicand = radius * radius - x * x
        if radicand < 0:
            raise ValueError(
                "x = %r lies outside the circle of radius %r" % (x, radius))
        sq = math.sqrt(radicand)
        if side > 0.5:
            sq = sq * (-1)
        result.append(sq)
    return np.asarray(result)


def f_outer(x1):
    """Map each x to a point on the circle of radius 10, upper or lower half at random.

    Args:
        x1: Iterable of x coordinates with |x| <= 10.

    Returns:
        NumPy array of y coordinates, one per x, satisfying x**2 + y**2 == 100.

    Raises:
        ValueError: If some |x| exceeds 10.
    """
    return _circle_y(x1, 10)


def f_inner(x1):
    """Map each x to a point on the circle of radius 3, upper or lower half at random.

    Args:
        x1: Iterable of x coordinates with |x| <= 3.

    Returns:
        NumPy array of y coordinates, one per x, satisfying x**2 + y**2 == 9.

    Raises:
        ValueError: If some |x| exceeds 3.
    """
    return _circle_y(x1, 3)
x_inner = np.sort(x_inner[:30]) 46 | x_outer = np.sort(x_outer[:30]) 47 | 48 | noize = [(-1 + np.random.random()) for i in range(len(x_inner))] 49 | y_inner = f_inner(x_inner) + noize 50 | 51 | noize = [(-1 + np.random.random()) for i in range(len(x_outer))] 52 | y_outer = f_outer(x_outer) + noize 53 | 54 | colors = ['blue', 'red']#, 'orange' 55 | lw = 2 56 | 57 | type_of_regression = ["linear regression", "regression of degree 10"] 58 | fit = ["fit", "overfit"] 59 | 60 | plt.figure(1) 61 | axes = plt.gca() 62 | axes.set_xlim([-11,11]) 63 | axes.set_ylim([-11,11]) 64 | 65 | plt.scatter(x_inner, y_inner, color='navy', s=30, marker='o') 66 | plt.scatter(x_outer, y_outer, color='red', s=30, marker='o') 67 | 68 | fig1 = plt.gcf() 69 | 70 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 71 | fig1.savefig('../../Illustrations/kernel-trick-0.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 72 | fig1.savefig('../../Illustrations/kernel-trick-0.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 73 | fig1.savefig('../../Illustrations/kernel-trick-0.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 74 | 75 | x_inner_transformed = np.asarray([x * x for x in x_inner]) 76 | y_inner_transformed = np.asarray([math.sqrt(2) * x * y for x, y in zip(x_inner, y_inner)]) 77 | z_inner_transformed = np.asarray([y * y for y in y_inner]) 78 | 79 | x_outer_transformed = np.asarray([x * x for x in x_outer]) 80 | y_outer_transformed = np.asarray([math.sqrt(2) * x * y for x, y in zip(x_outer, y_outer)]) 81 | z_outer_transformed = np.asarray([y * y for y in y_outer]) 82 | 83 | fig = plt.figure(2) 84 | ax = Axes3D(fig) 85 | ax.set_yticks([-75, 0, 75]) 86 | #ax.set_xlim([-10,120]) 87 | #$ax.set_ylim([-120,120]) 88 | #ax.set_zlim([-120,120]) 89 | 90 | ax.scatter(x_inner_transformed, y_inner_transformed, z_inner_transformed, color='navy', marker='o') 91 | ax.scatter(x_outer_transformed, y_outer_transformed, 
def voronoi_finite_polygons_2d(vor, radius=None):
    """
    Reconstruct infinite voronoi regions in a 2D diagram to finite
    regions.

    Credit: https://gist.github.com/pv/8036995

    Parameters
    ----------
    vor : Voronoi
        Input diagram
    radius : float, optional
        Distance to 'points at infinity'. Defaults to twice the
        peak-to-peak range of the input points.

    Returns
    -------
    regions : list of lists
        Indices of vertices in each revised Voronoi regions, one entry
        per input point, in input-point order.
    vertices : ndarray
        Coordinates for revised Voronoi vertices. Same as coordinates
        of input vertices, with 'points at infinity' appended to the
        end.

    Raises
    ------
    ValueError
        If the diagram is not two-dimensional.
    """

    if vor.points.shape[1] != 2:
        raise ValueError("Requires 2D input")

    new_regions = []
    new_vertices = vor.vertices.tolist()

    center = vor.points.mean(axis=0)
    if radius is None:
        # np.ptp() instead of the ndarray.ptp() method, which NumPy 2.0 removed.
        radius = np.ptp(vor.points) * 2

    # Construct a map containing all ridges for a given point
    all_ridges = {}
    for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices):
        all_ridges.setdefault(p1, []).append((p2, v1, v2))
        all_ridges.setdefault(p2, []).append((p1, v1, v2))

    # Reconstruct infinite regions
    for p1, region in enumerate(vor.point_region):
        vertices = vor.regions[region]

        if all(v >= 0 for v in vertices):
            # finite region
            new_regions.append(vertices)
            continue

        # reconstruct a non-finite region
        ridges = all_ridges[p1]
        new_region = [v for v in vertices if v >= 0]

        for p2, v1, v2 in ridges:
            if v2 < 0:
                v1, v2 = v2, v1
            if v1 >= 0:
                # finite ridge: already in the region
                continue

            # Compute the missing endpoint of an infinite ridge

            t = vor.points[p2] - vor.points[p1]  # tangent
            t /= np.linalg.norm(t)
            n = np.array([-t[1], t[0]])  # normal

            # Point the normal away from the centre of the input points.
            midpoint = vor.points[[p1, p2]].mean(axis=0)
            direction = np.sign(np.dot(midpoint - center, n)) * n
            far_point = vor.vertices[v2] + direction * radius

            new_region.append(len(new_vertices))
            new_vertices.append(far_point.tolist())

        # sort region counterclockwise
        vs = np.asarray([new_vertices[v] for v in new_region])
        c = vs.mean(axis=0)
        angles = np.arctan2(vs[:,1] - c[1], vs[:,0] - c[0])
        new_region = np.array(new_region)[np.argsort(angles)]

        # finish
        new_regions.append(new_region.tolist())

    return new_regions, np.asarray(new_vertices)


def find_clusters(x, n_clusters):
    """Run k-means on ``x`` and save one illustration per iteration.

    Figure 0 shows the raw data. Every later figure shows the current
    assignment, the centroids and their Voronoi cells. All figures are saved
    as EPS/PDF/PNG files.

    Args:
        x: Array of 2-D examples; columns 0 and 1 are plotted.
        n_clusters: Number of centroids. The script calls this with 3.
            NOTE(review): the Voronoi step presumably needs at least three
            centroids in general position; smaller values are untested.

    Returns:
        ``(centroids, labels)``: the final centroid coordinates and the index
        of the closest centroid for every example.
    """
    # Randomly set cluster centroids, one per requested cluster (the
    # original always created exactly three regardless of n_clusters).
    centroids = np.array([[2 * random(), 4 * random()] for _ in range(n_clusters)])

    counter = 0

    plt.figure(counter)

    plt.scatter(x[:, 0], x[:, 1], s=50)

    ax = plt.gca()
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    plt.xlim(-3.0, 4.0)
    plt.ylim(-1, 6)

    fig1 = plt.gcf()

    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)

    fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)

    counter = 1

    while True:

        plt.figure(counter)

        # assign labels based on closest centroid
        labels = pairwise_distances_argmin(x, centroids)

        plt.scatter(x[:, 0], x[:, 1], c=[l + 1 for l in labels], s=50, cmap='tab10', zorder=2)

        # One colour index per centroid, matching the label + 1 scheme above.
        plt.scatter(centroids[:, 0], centroids[:, 1], c=list(range(1, n_clusters + 1)), s=200, cmap='tab10', marker="s", facecolors='none', zorder=2)
        plt.xlim(-3.0, 4.0)
        plt.ylim(-1, 6)

        vor = Voronoi(centroids)

        # plot
        regions, vertices = voronoi_finite_polygons_2d(vor, 300)
        print("--")
        print(regions)
        print("--")
        print(vertices)

        # colorize
        for region in regions:
            polygon = vertices[region]
            plt.fill(*zip(*polygon), alpha=0.4, zorder=1)

        ax = plt.gca()
        ax.set_xlabel('$x_1$')
        ax.set_ylabel('$x_2$')

        fig1 = plt.gcf()

        fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)

        fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
        fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
        fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)

        # Find new centroids as the average of the assigned examples. A
        # centroid that attracted no example keeps its position: averaging
        # an empty slice would turn it into NaN and break both the Voronoi
        # step and the convergence test.
        new_centroids = np.array([
            x[labels == i].mean(0) if np.any(labels == i) else centroids[i]
            for i in range(n_clusters)
        ])

        # check for convergence
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        counter += 1

    return centroids, labels
np.sort(x[:10]) 26 | noize = [(-2 + np.random.random()*2) for i in range(len(x))] 27 | y = f(x) + noize 28 | 29 | # create matrix versions of these arrays 30 | X = x[:, np.newaxis] 31 | X_plot = x_plot[:, np.newaxis] 32 | 33 | colors = ['red', 'red']#, 'orange' 34 | lw = 2 35 | 36 | 37 | type_of_regression = ["linear regression", "regression of degree 10"] 38 | fit = ["fit", "overfit"] 39 | for count, degree in enumerate([1,10]):#, 2, 15 40 | plt.figure(count) 41 | axes = plt.gca() 42 | axes.set_xlim([-10,10]) 43 | axes.set_ylim([-10,10]) 44 | plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples") 45 | plt.xticks([-10.0, -5.0, 0.0, 5.0, 10.0]) 46 | plt.yticks([-10.0, -5.0, 0.0, 5.0, 10.0]) 47 | model = make_pipeline(PolynomialFeatures(degree), Ridge()) 48 | model.fit(X, y) 49 | y_plot = model.predict(X_plot) 50 | plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw, 51 | label=type_of_regression[count]) 52 | 53 | plt.legend(loc='best') 54 | fig1 = plt.gcf() 55 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0) 56 | fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 57 | fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 58 | fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0) 59 | 60 | 61 | plt.show() 62 | -------------------------------------------------------------------------------- /multivariate_gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | 4 | from scipy.stats import multivariate_normal 5 | from sklearn.linear_model import Ridge 6 | from sklearn.preprocessing import PolynomialFeatures 7 | from sklearn.pipeline import make_pipeline 8 | from 
# Imported for its side effect: older Matplotlib releases appear to need it so
# that projection='3d' is registered.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
matplotlib.rcParams.update({'font.size': 18})

import matplotlib.pyplot as plt

mean = [0, 0]
# A covariance matrix must be symmetric. The previous value had 4/5 above the
# diagonal and 3/4 below it (and was labelled "diagonal"). The off-diagonal
# term is now the same on both sides and written as a float so it does not
# depend on integer-division rules.
cov = [[1.0, 0.75], [0.75, 2.0]]

# Figure 1: scatter plot of 200 samples drawn from the 2-D Gaussian.
x, y = np.random.multivariate_normal(mean, cov, 200).T
fig = plt.figure(1)
plt.plot(x, y, 'o')
plt.axis('equal')
plt.xlabel('$x^{(1)}$')
plt.ylabel('$x^{(2)}$')

fig.subplots_adjust(top=0.98, bottom=0.1, right=0.98, left=0.02, hspace=0, wspace=0)
fig.savefig('../../Illustrations/multivariate-gaussian-0.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0.1)
fig.savefig('../../Illustrations/multivariate-gaussian-0.pdf', format='pdf', dpi=1000, bbox_inches='tight', pad_inches=0.1)
fig.savefig('../../Illustrations/multivariate-gaussian-0.png', dpi=1000, bbox_inches='tight', pad_inches=0.1)

# Figure 2: wireframe of the density with the same samples drawn at z = 0.
fig1 = plt.figure(2)

# Ask the figure for a 3-D axes instead of instantiating Axes3D(fig1):
# recent Matplotlib no longer attaches a directly constructed Axes3D to the
# figure, which leaves the saved image empty.
ax = fig1.add_subplot(111, projection='3d')

# Evaluate the pdf on a regular grid; pos[i, j] holds the (x, y) pair.
x1, y1 = np.mgrid[-5:5:.2, -5:5:.2]
pos = np.empty(x1.shape + (2,))
pos[:, :, 0] = x1
pos[:, :, 1] = y1
rv = multivariate_normal(mean, cov)
ax.plot_wireframe(x1, y1, rv.pdf(pos), rstride=2, cstride=2, color='gray')

z = [0] * len(x)
ax.scatter(x, y, z)

ax.set_xlabel('$x^{(1)}$')
ax.set_ylabel('$x^{(2)}$')
ax.set_zlabel('pdf')
ax.set_zticks([])
ax.set_xticks([])
ax.set_yticks([])

fig1.subplots_adjust(top=0.98, bottom=0.1, right=0.9, left=0.08, hspace=0, wspace=0)
fig1.savefig('../../Illustrations/multivariate-gaussian-1.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
fig1.savefig('../../Illustrations/multivariate-gaussian-1.pdf', format='pdf', dpi=1000, bbox_inches='tight', pad_inches=0)
fig1.savefig('../../Illustrations/multivariate-gaussian-1.png', dpi=1000, bbox_inches='tight', pad_inches=0)
# Parameters (mean, standard deviation) of the two mixture components.
mu1, sigma1 = 3, 0.4
mu2, sigma2 = 5, 0.6


def sample_points():
    """Draw 20 points from each of the two Gaussians and return them as one list."""
    s1 = np.random.normal(mu1, sigma1, 20)
    s2 = np.random.normal(mu2, sigma2, 20)
    return list(s1) + list(s2)


# generate points used to plot
x_plot = np.linspace(0, 8, 100)

# generate the sample
x = sample_points()

lw = 2


def kernel(x1, x2, b=2):
    """Return the standard Gaussian kernel evaluated at ``(x1 - x2) / b``.

    Parameters
    ----------
    x1, x2 : float or array
        Points whose scaled difference is fed to the kernel.
    b : float
        Bandwidth.
    """
    z = (x1 - x2) / float(b)
    # math.pi instead of the literal 3.14: the truncated constant makes the
    # kernel integrate to slightly more than one.
    return (1.0 / math.sqrt(2 * math.pi)) * np.exp(-z ** 2 / 2)


def fb(x, data, b):
    """Kernel density estimate at ``x`` built from ``data`` with bandwidth ``b``.

    Raises ``ZeroDivisionError`` when ``data`` is empty or ``b`` is zero.
    """
    # 1.0 keeps the scale factor a float even under integer-division rules.
    return 1.0 / (len(data) * b) * sum(kernel(x, xi, b) for xi in data)


def sum_pdf(x):
    """Return the equal-weight mixture density of the two Gaussians at ``x``.

    Parameters
    ----------
    x : sequence of float
        Evaluation points.

    Returns
    -------
    list
        One density value per element of ``x``.
    """
    # Imported here so the function does not depend on ``scipy.stats`` having
    # been loaded as a side effect of some other import.
    from scipy.stats import norm

    # Evaluate both component densities once for the whole input instead of
    # recomputing the full arrays for every index.
    mixture = (norm.pdf(x, mu1, sigma1) + norm.pdf(x, mu2, sigma2)) / 2
    return list(mixture)
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
import numpy as np
import matplotlib

# Book-wide typography: STIX fonts for text and math, 18 pt.
matplotlib.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'font.size': 18,
})

# Probability mass function of a discrete variable with four outcomes.
x = np.arange(4)
pr = [0.1, 0.3, 0.4, 0.2]

# Fix the vertical range before drawing the bars.
axes = plt.gca()
axes.set_ylim([0,0.6])

plt.bar(x, pr, color="red")
plt.xticks(x, ('1', '2', '3', '4'))  # outcomes are displayed 1-based
plt.yticks(np.arange(0, 0.7, 0.1))
plt.xlabel("$x$")
plt.ylabel("$pmf$")

fig1 = plt.gcf()
fig1.subplots_adjust(top=0.98, bottom=0.1, right=0.98, left=0.12, hspace=0, wspace=0)

# Same figure in three formats; the png format is inferred from the suffix.
for target, fmt in (
    ('../../Illustrations/pmf.eps', 'eps'),
    ('../../Illustrations/pmf.pdf', 'pdf'),
    ('../../Illustrations/pmf.png', None),
):
    fig1.savefig(target, format=fmt, dpi=1000, bbox_inches='tight', pad_inches=0.1)


plt.show()
/prediction_strength.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import random 6 | import sys 7 | import math 8 | 9 | from sklearn.datasets.samples_generator import make_blobs 10 | from sklearn.metrics import pairwise_distances_argmin 11 | from random import shuffle 12 | from scipy.spatial import Voronoi 13 | from scipy.spatial import distance 14 | 15 | matplotlib.rcParams['mathtext.fontset'] = 'stix' 16 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 17 | matplotlib.rcParams.update({'font.size': 25}) 18 | 19 | random_state = 0 20 | 21 | ## how many clusters do you want in your synthetic data? 22 | centers = 2 23 | 24 | x, _ = make_blobs(n_samples=300, centers=centers, cluster_std=0.6, random_state=random_state) 25 | 26 | plt.figure(10000) 27 | plt.scatter(x[:, 0], x[:, 1], s=20, cmap='viridis'); 28 | plt.xlim(-1, 4.0) 29 | plt.ylim(-1, math.ceil(max(x[:, 1]))) 30 | plt.xticks(np.arange(int(min(x[:, 0])), math.ceil(max(x[:, 0]))+1, 1)) 31 | plt.yticks(np.arange(int(min(x[:, 1])), math.ceil(max(x[:, 1]))+1, 2), rotation='vertical') 32 | 33 | ax = plt.gca() 34 | ax.set_xlabel('$x_1$') 35 | ax.set_ylabel('$x_2$') 36 | 37 | fig1 = plt.gcf() 38 | fig1.subplots_adjust(top = 0.98, bottom = 0.16, right = 0.98, left = 0.12, hspace = 0, wspace = 0) 39 | fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '.eps', format='eps', dpi=1000) 40 | fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '.pdf', format='pdf', dpi=1000) 41 | fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '.png', dpi=1000) 42 | 43 | x_list = list(x) 44 | 45 | random.Random(random_state).shuffle(x_list) 46 | 47 | x_split = {} 48 | 49 | x_split["train"] = np.array(x_list[:len(x_list)/2]) 50 | 51 | x_split["test"] = np.array(x_list[len(x_list)/2:]) 
def find_clusters(x, n_clusters, current_split):
    """Run k-means on one half of the data and return its centroids and labels.

    Parameters
    ----------
    x
        Not used. The examples are read from the module-level
        ``x_split[current_split]``; the parameter is kept so existing calls
        keep working.
    n_clusters : int
        Number of centroids.
    current_split : str
        Key into ``x_split`` selecting the examples to cluster.

    Returns
    -------
    (centroids, labels)
        Final centroids and, per example, the index of its closest centroid.

    Raises
    ------
    ValueError
        If ``n_clusters`` exceeds the number of examples in the split.
    """
    points = x_split[current_split]
    if n_clusters > len(points):
        raise ValueError("n_clusters exceeds the number of examples in the split")

    # Start from n_clusters examples picked at random from the split.
    shuffled = list(points)
    shuffle(shuffled)
    centroids = np.array(shuffled[:n_clusters])

    while True:
        # assign labels based on closest centroid
        labels = pairwise_distances_argmin(points, centroids)

        # New centroid = mean of its examples. A centroid without examples
        # keeps its position; the mean of an empty slice is NaN and would
        # make the convergence test below fail forever.
        new_centroids = centroids.copy()
        for i in range(n_clusters):
            members = points[labels == i]
            if len(members):
                new_centroids[i] = members.mean(0)

        # check for convergence
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, labels


def get_examples_from_cluster(j, test_points, test_labels):
    """Return the elements of ``test_points`` whose label equals ``j``."""
    return [e for e, l in zip(test_points, test_labels) if l == j]


def get_closest_centroid(example, centroids):
    """Return the centroid with the smallest Euclidean distance to ``example``.

    The first centroid wins on ties. ``0`` is returned when ``centroids`` is
    empty.
    """
    min_distance = sys.float_info.max
    min_centroid = 0
    for c in centroids:
        d = distance.euclidean(example, c)  # computed once per centroid
        if d < min_distance:
            min_distance = d
            min_centroid = c
    return min_centroid


def compute_strength(k, train_centroids, test_points, test_labels):
    """Return the prediction strength of a ``k``-cluster solution.

    For each test cluster ``j`` this is the fraction of ordered pairs of
    distinct test points in ``j`` that share the same nearest training
    centroid. The result is the minimum of that fraction over the clusters.

    Parameters
    ----------
    k : int
        Number of clusters; labels ``0 .. k-1`` are inspected.
    train_centroids : sequence of points
        Centroids obtained on the training half.
    test_points : sequence of points
        Examples of the test half.
    test_labels : sequence of int
        Cluster label of every test point from clustering the test half.

    Returns
    -------
    float
        Minimum per-cluster agreement, or ``0.0`` when no cluster has at
        least two points.

    Notes
    -----
    Clusters with fewer than two points contain no pair and are skipped
    instead of triggering a division by zero. NOTE(review): confirm that
    skipping, rather than scoring such clusters 0, is the intended
    convention. Pairs are excluded by position, not by coordinate equality,
    so duplicated points are counted as distinct examples.
    """
    points = np.asarray(test_points, dtype=float)
    labels = np.asarray(test_labels)
    centroids = np.asarray(train_centroids, dtype=float)
    if len(points) == 0 or len(centroids) == 0:
        return 0.0

    # Nearest training centroid of every test point, computed once. Squared
    # distances give the same argmin as Euclidean distances.
    gaps = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    nearest = np.argmin((gaps ** 2).sum(axis=2), axis=1)

    ss = []
    for j in range(k):
        members = nearest[labels == j]
        n_j = len(members)
        if n_j < 2:
            continue
        # counts[c] points of cluster j fall to training centroid c, giving
        # counts[c] * (counts[c] - 1) ordered pairs that agree.
        counts = np.bincount(members)
        agreeing = float((counts * (counts - 1)).sum())
        ss.append(agreeing / (n_j * (n_j - 1)))

    return min(ss) if ss else 0.0
def sigmoid(x):
    """
    Evaluate the standard logistic function 1 / (1 + exp(-x)).

    x may be a scalar or a NumPy array; arrays are processed element-wise.
    """
    # exp(-log(1 + exp(-x))) is algebraically the same expression, but
    # logaddexp evaluates log(exp(0) + exp(-x)) without forming exp(-x), so
    # large negative x no longer triggers an overflow warning.
    return np.exp(-np.logaddexp(0.0, np.negative(x)))
# create matrix versions of these arrays
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]

colors = ['red', 'blue', 'orange']
lw = 2

# One figure per polynomial degree; fit[count] names the regime it shows.
fit = ["underfit", "fit", "overfit"]
for count, degree in enumerate([1, 2, 15]):
    plt.figure(count)
    axes = plt.gca()
    axes.set_xlim([-5,2])
    axes.set_ylim([-10,30])
    plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples")

    # Ridge regression on polynomial features of the given degree.
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw,
             label=("degree %d (" + fit[count] + ")") % degree)

    plt.legend(loc='best')
    fig1 = plt.gcf()
    fig1.subplots_adjust(top=0.98, bottom=0.1, right=0.98, left=0.08, hspace=0, wspace=0)

    # All three formats share one stem. The .eps name previously lacked the
    # hyphen before the index ('under-over-fit0.eps'), unlike .pdf and .png.
    stem = '../../Illustrations/under-over-fit-' + str(count)
    fig1.savefig(stem + '.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
    fig1.savefig(stem + '.pdf', format='pdf', dpi=1000, bbox_inches='tight', pad_inches=0)
    fig1.savefig(stem + '.png', dpi=1000, bbox_inches='tight', pad_inches=0)


plt.show()
0.1, right = 0.98, left = 0.12, hspace = 0, wspace = 0) 17 | fig1.savefig('../../Illustrations/vector-0.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1) 18 | fig1.savefig('../../Illustrations/vector-0.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1) 19 | fig1.savefig('../../Illustrations/vector-0.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1) 20 | plt.show() 21 | 22 | plt.figure(2) 23 | plt.scatter([2, -2, 1], [3, 5, 0], color=['r','b','g']) 24 | plt.xlim(-3, 3) 25 | plt.ylim(-1, 6) 26 | plt.xlabel('$x^{(1)}$') 27 | plt.ylabel('$x^{(2)}$') 28 | fig1 = plt.gcf() 29 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.12, hspace = 0, wspace = 0) 30 | fig1.savefig('../../Illustrations/vector-1.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1) 31 | fig1.savefig('../../Illustrations/vector-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1) 32 | fig1.savefig('../../Illustrations/vector-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1) 33 | plt.show() 34 | --------------------------------------------------------------------------------