├── .DS_Store
├── Animated_Illustrations
    ├── .DS_Store
    ├── ATTRIBUTION.txt
    ├── density_estimation.png
    ├── density_estimation.py
    ├── gaussian_mixture_model.png
    ├── gaussian_mixture_model.py
    ├── gradient_descent.png
    ├── gradient_descent.py
    ├── kernel_regression.png
    ├── kernel_regression.py
    ├── kmeans alpha 0.3.png
    ├── kmeans.py
    ├── linear_regression.png
    └── linear_regression_fit.py
├── LICENSE
├── PCA_MNIST.py
├── README.md
├── UMAP_MNIST.py
├── autoencoder_MNIST.py
├── data.txt
├── density_estimation.py
├── gaussian_mixture_model.py
├── gradient_descent.py
├── kernel_regression.py
├── kernel_trick.py
├── kmeans.py
├── linear_regression_fit.py
├── multivariate_gaussian.py
├── pdf.py
├── pmf.py
├── prediction_strength.py
├── standard_logistic_function.py
├── under_over_fitting.py
└── vector.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/.DS_Store


--------------------------------------------------------------------------------
/Animated_Illustrations/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/.DS_Store


--------------------------------------------------------------------------------
/Animated_Illustrations/ATTRIBUTION.txt:
--------------------------------------------------------------------------------
1 | The animated illustrations are produced by Ranjan Piyush (https://www.linkedin.com/in/ranjan-piyush-34b29856/) based on the original book's source code.


--------------------------------------------------------------------------------
/Animated_Illustrations/density_estimation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/density_estimation.png


--------------------------------------------------------------------------------
/Animated_Illustrations/density_estimation.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy as sp
  3 | import matplotlib
  4 | import matplotlib.pyplot as plt
  5 | import math
  6 | 
  7 | from sklearn.neighbors import KernelDensity
  8 | 
  9 | import scipy.integrate as integrate
 10 | from sklearn.kernel_ridge import KernelRidge
 11 | 
 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 14 | matplotlib.rcParams.update({'font.size': 18})
 15 | 
 16 | mu1, sigma1 = 3.0, 1.0
 17 | mu2, sigma2 = 8.0, 1.5
 18 | 
 19 | def sample_points():  # draw 50 points from each of the two normals (mu1, sigma1) and (mu2, sigma2)
 20 |     s1 = np.random.normal(mu1, sigma1, 50)
 21 |     s2 = np.random.normal(mu2, sigma2, 50)
 22 |     return list(s1) + list(s2)  # one flat Python list: the first normal's draws, then the second's
 23 | 
 24 | # generate points used to plot
 25 | x_plot = np.linspace(-3,15,100)
 26 | 
 27 | # draw the training sample (all generated points are kept)
 28 | x = sample_points()
 29 | 
 30 | ##colors = ['red','blue','orange','green','black','purple','yellow','magenta',
 31 | ##          'pink','grey']
 32 | lw = 2
 33 | 
 34 | def kernel(x1, x2, bi = 5.0):  # Gaussian kernel evaluated at the scaled difference (x1 - x2) / bi
 35 |     z = (x1 - x2) / bi
 36 |     return (1.0/math.sqrt(2.0 * 3.14)) * math.exp((-1.0/2.0)*(z**2))  # NOTE(review): 3.14 approximates pi, so the normalising constant is slightly off; math.pi would be exact
 37 | 
 38 | def fb(xx, data, bi):  # kernel density estimate at xx: sum of kernel(xx, xi, bi) over data, divided by len(data)*bi
 39 |     return (1/(len(data)*bi)) * sum([kernel(xx, xi, bi) for xi in data])
 40 | 
 41 | def fbi(i, data, bi):  # leave-one-out estimate: density at data[i] computed from every other element of data
 42 |     data_minus_i = []
 43 |     for ii in range(len(data)):
 44 |         if i != ii:  # skip the held-out example itself
 45 |             data_minus_i.append(data[ii])
 46 |     return (1/(len(data_minus_i)*bi)) * sum([kernel(data[i], xi, bi) for xi in data_minus_i])
 47 | 
 48 | def sum_pdf(x):  # true density: average of the two normal pdfs, returned as a list with one value per element of x
 49 |     result = []
 50 |     for i in range(len(x)):
 51 |         result.append((sp.stats.norm.pdf(x, mu1, sigma1)[i] + sp.stats.norm.pdf(x, mu2, sigma2)[i])/2.0)  # NOTE(review): both pdfs are re-evaluated over all of x on every iteration
 52 |         #result.append(sp.stats.norm.pdf(x, mu1, sigma1)[i])
 53 |     return result
 54 | 
 55 | b = np.linspace(0.01, 5.0, 100)  # candidate bandwidths
 56 | 
 57 | score = []  # one loss value per candidate bandwidth, in the same order as b
 58 | for bi in b:
 59 |     def fb2(xx):  # squared density estimate for the current bandwidth; integrated below
 60 |         return fb(xx, x, bi)**2
 61 | 
 62 |     s = integrate.quad(fb2, -np.inf, np.inf)[0] - 2.0*np.mean([fbi(i, x, bi) for i in range(len(x))])  # integral of fb^2 minus twice the mean leave-one-out density
 63 |     score.append(s)
 64 | 
 65 | plt.figure(1)
 66 | plt.plot(b,score)
 67 | plt.xlabel("$b$")
 68 | plt.ylabel("$l$")
 69 | plt.tight_layout()
 70 | plt.xticks(np.arange(0,5,0.5))
 71 | #plt.show()
 72 | fig1 = plt.gcf()
 73 | ##fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 74 | ##fig1.savefig('../../Illustrations/density-estimation-loss.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 75 | ##fig1.savefig('../../Illustrations/density-estimation-loss.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 76 | ##fig1.savefig('../../Illustrations/density-estimation-loss.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 77 | minb = [bi for bi, s in zip(b, score) if s == min(score)][0]
 78 | print(minb)
 79 | 
 80 | import numpy
 81 | 
 82 | def fig2data(fig):
 83 |     """
 84 |     @brief Draw a Matplotlib figure and return the renderer's pixel buffer as a numpy array
 85 |     @param fig a matplotlib figure
 86 |     @return a numpy array copied from the renderer buffer (presumably height x width x RGBA -- TODO confirm)
 87 |     """
 88 |     # draw the renderer
 89 |     fig.canvas.draw()
 90 |  
 91 |     # Read the pixel buffer from the figure; w and h are fetched but not used afterwards
 92 |     w,h = fig.canvas.get_width_height()
 93 |     buf = numpy.array(fig.canvas.renderer._renderer)  # NOTE(review): relies on the private _renderer attribute
 94 |     
 95 |     return buf
 96 | 
 97 | seq = []
 98 | for count,degree in enumerate([round(minb,2)] + np.arange(0.05,1.5,0.05)):
 99 |     plt.figure(count+2)
100 |     axes = plt.gca()
101 |     axes.set_xlim([-3,15])
102 |     axes.set_ylim([0,0.3])
103 |     plt.xlabel("$x$")
104 |     plt.ylabel("pdf")
105 |     degree = round(degree,2)
106 | ##    heading = 'Iteration '+str(count)
107 |     plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples")
108 |     plt.plot(x_plot, [fb(xp ,x, degree) for xp in x_plot],color='blue',linewidth=lw, label="$\\hat{f}_b$, $b = " + str(degree) + "$")
109 |     plt.plot(x_plot,sum_pdf(x_plot), label="true pdf")
110 | ##    plt.title(heading)
111 |     
112 |     plt.legend(loc='upper right',prop = {'size':9})
113 |     plt.tight_layout()
114 | 
115 |     fig1 = plt.gcf()
116 |     nfig = fig2data(fig1)
117 |     seq.append(nfig)
118 | ##    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
119 | ##    fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
120 | ##    fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
121 | ##    fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
122 |     #plt.show()
123 | 
124 | import os
125 | ## Get the directory address of current python file
126 | curr_dir = os.path.dirname(os.path.realpath(__file__))
127 | os.chdir(curr_dir) ## Set the current directory as working directory
128 | 
129 | ## numpngw writes the collected frames as an animated PNG (APNG) file
130 | import numpngw
131 | numpngw.write_apng('density_estimation.png',seq,delay = 500)
132 | 
133 | 


--------------------------------------------------------------------------------
/Animated_Illustrations/gaussian_mixture_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/gaussian_mixture_model.png


--------------------------------------------------------------------------------
/Animated_Illustrations/gaussian_mixture_model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy as sp
  3 | import matplotlib
  4 | import matplotlib.pyplot as plt
  5 | import math
  6 | 
  7 | from sklearn.neighbors import KernelDensity
  8 | 
  9 | import scipy.integrate as integrate
 10 | from sklearn.kernel_ridge import KernelRidge
 11 | 
 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 14 | matplotlib.rcParams.update({'font.size': 18})
 15 | 
 16 | mu1, sigma1 = 3.0, 1.0
 17 | mu2, sigma2 = 8.0, 3.5
 18 | 
 19 | def sample_points():  # draw 50 points from each true component; sigma1 and sigma2 are treated as variances (square-rooted below)
 20 |     s1 = np.random.normal(mu1, math.sqrt(sigma1), 50)
 21 | 
 22 |     s2 = np.random.normal(mu2, math.sqrt(sigma2), 50)
 23 | 
 24 |     return list(s1) + list(s2)
 25 | 
 26 | def compute_bi(mu1local, sigma1local, mu2local, sigma2local, phi1local, phi2local):  # for each xi in the module-level x: weighted pdf of the first component over the sum of both weighted pdfs
 27 |     bis = []
 28 |     for xi in x:
 29 |         bis.append((sp.stats.norm.pdf(xi, mu1local, math.sqrt(sigma1local)) * phi1local)/(sp.stats.norm.pdf(xi, mu1local, math.sqrt(sigma1local)) * phi1local + sp.stats.norm.pdf(xi, mu2local, math.sqrt(sigma2local)) * phi2local))  # the sigma*local arguments are square-rooted before being passed as the pdf scale
 30 |     return bis
 31 | 
 32 | # generate points used to plot
 33 | x_plot = np.linspace(-2, 12, 100)
 34 | 
 35 | # draw the training sample (all generated points are kept)
 36 | x = sample_points()
 37 | 
 38 | colors = ['red', 'blue', 'orange', 'green']
 39 | lw = 2
 40 | 
 41 | mu1_estimate = 1.0
 42 | mu2_estimate = 2.0
 43 | sigma1_estimate = 1.0
 44 | sigma2_estimate = 2.0
 45 | 
 46 | phi1_estimate = 0.5
 47 | phi2_estimate = 0.5
 48 | 
 49 | import numpy
 50 | 
 51 | def fig2data(fig):
 52 |     """
 53 |     @brief Draw a Matplotlib figure and return the renderer's pixel buffer as a numpy array
 54 |     @param fig a matplotlib figure
 55 |     @return a numpy array copied from the renderer buffer (presumably height x width x RGBA -- TODO confirm)
 56 |     """
 57 |     # draw the renderer
 58 |     fig.canvas.draw()
 59 |  
 60 |     # Read the pixel buffer from the figure; w and h are fetched but not used afterwards
 61 |     w,h = fig.canvas.get_width_height()
 62 |     buf = numpy.array(fig.canvas.renderer._renderer)  # NOTE(review): relies on the private _renderer attribute
 63 |     
 64 |     return buf
 65 | 
 66 | seq = []
 67 | count = 0
 68 | while True:
 69 |     plt.figure(count)
 70 |     axes = plt.gca()
 71 |     axes.set_xlim([-2,14])
 72 |     axes.set_ylim([0,0.8])
 73 |     plt.xlabel("$x$")
 74 |     plt.ylabel("pdf")
 75 |     heading = "Iteration "+str(count)
 76 |     plt.title(heading)
 77 |     plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples")
 78 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1_estimate, math.sqrt(sigma1_estimate)) for xp in x_plot], color=colors[1], linewidth=lw, label="$f(x_i \\mid \\mu_1 ,\\sigma_1^2)$")
 79 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2_estimate, math.sqrt(sigma2_estimate)) for xp in x_plot], color=colors[3], linewidth=lw, label="$f(x_i \\mid \\mu_2 ,\\sigma_2^2)$")
 80 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1, math.sqrt(sigma1)) for xp in x_plot], color=colors[0], label="true pdf")
 81 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2, math.sqrt(sigma2)) for xp in x_plot], color=colors[0])
 82 | 
 83 |     plt.legend(loc='upper right',prop={'size': 9})
 84 |     plt.tight_layout()
 85 | 
 86 |     fig1 = plt.gcf()
 87 |     ##fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 88 | ##    fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 89 | ##    fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 90 | ##    fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 91 |     nfig = fig2data(fig1)
 92 |     seq.append(nfig)
 93 |     ##plt.show()
 94 | 
 95 |     bis1 = compute_bi(mu1_estimate, sigma1_estimate, mu2_estimate, sigma2_estimate, phi1_estimate, phi2_estimate)
 96 |     bis2 = compute_bi(mu2_estimate, sigma2_estimate, mu1_estimate, sigma1_estimate, phi2_estimate, phi1_estimate)
 97 | 
 98 |     #print bis1[:5]
 99 |     #print bis2[:5]
100 | 
101 |     mu1_estimate = sum([bis1[i] * x[i] for i in range(len(x))]) / sum([bis1[i] for i in range(len(x))])
102 |     mu2_estimate = sum([bis2[i] * x[i] for i in range(len(x))]) / sum([bis2[i] for i in range(len(x))])
103 | 
104 |     sigma1_estimate = sum([bis1[i] * (x[i] - mu1_estimate)**2 for i in range(len(x))]) / sum([bis1[i] for i in range(len(x))])
105 |     sigma2_estimate = sum([bis2[i] * (x[i] - mu2_estimate)**2 for i in range(len(x))]) / sum([bis2[i] for i in range(len(x))])
106 | 
107 |     #print mu1_estimate, mu2_estimate
108 |     #print sigma1_estimate, sigma2_estimate
109 | 
110 |     phi1_estimate = sum([bis1[i] for i in range(len(x))])/float(len(x))
111 |     phi2_estimate = 1.0 - phi1_estimate
112 | 
113 |     print(phi1_estimate)
114 | 
115 |     count += 1
116 | 
117 |     plt.close(count)
118 | 
119 |     if count > 50:
120 |         break
121 | 
122 | import os
123 | ## Get the directory address of current python file
124 | curr_dir = os.path.dirname(os.path.realpath(__file__))
125 | os.chdir(curr_dir) ## Set the current directory as working directory
126 | 
127 | ## numpngw writes the collected frames as an animated PNG (APNG) file
128 | import numpngw
129 | numpngw.write_apng('GMM.png',seq,delay = 250)
130 | 


--------------------------------------------------------------------------------
/Animated_Illustrations/gradient_descent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/gradient_descent.png


--------------------------------------------------------------------------------
/Animated_Illustrations/gradient_descent.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | import pandas as pd
  4 | 
  5 | import matplotlib
  6 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
  7 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
  8 | matplotlib.rcParams.update({'font.size': 18})
  9 | 
 10 | 
 11 | def plot_original_data():  # scatter-plot the raw data from data.csv and append the rendered frame to the module-level seq
 12 |     df = pd.read_csv("data.csv")
 13 |     plt.scatter(df['Spendings'], df['Sales'], color='#1f77b4',marker = 'o')
 14 |     plt.xlabel("Spendings, M$")
 15 |     plt.ylabel("Sales, Units")
 16 |     plt.title("Sales vs radio ad spendings")
 17 |     #plt.show()
 18 |     axes = plt.gca()
 19 |     axes.set_xlim([0,50])
 20 |     axes.set_ylim([0,35])
 21 |     plt.tight_layout()
 22 |     fig1 = plt.gcf()
 23 |     nfig = fig2data(fig1)  # fig2data and seq are defined later in the file, before this function is called
 24 |     seq.append(nfig)
 25 | ##    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 26 | ##    fig1.savefig('../../Illustrations/gradient_descent-1.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 27 | ##    fig1.savefig('../../Illustrations/gradient_descent-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 28 | ##    fig1.savefig('../../Illustrations/gradient_descent-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 29 | 
 30 | def update_w_and_b(spendings,sales,w,b,alpha):  # one gradient-descent step for the line w*spendings + b under mean squared error; returns the new (w, b)
 31 |     dr_dw = 0.0
 32 |     dr_db = 0.0
 33 |     N = len(spendings)
 34 |     for i in range(N):
 35 |         dr_dw += -2*spendings[i]*(sales[i] - (w*spendings[i] + b))  # derivative of the squared residual with respect to w
 36 |         dr_db += -2*(sales[i] - (w*spendings[i] + b))  # derivative of the squared residual with respect to b
 37 |     # update w and b: step against the gradient averaged over N examples, scaled by alpha
 38 |     w = w - (dr_dw/float(N))*alpha
 39 |     b = b - (dr_db/float(N))*alpha
 40 |     return w,b
 41 | 
 42 | def fig2data(fig):
 43 |     """
 44 |     @brief Draw a Matplotlib figure and return the renderer's pixel buffer as a numpy array
 45 |     @param fig a matplotlib figure
 46 |     @return a numpy array copied from the renderer buffer (presumably height x width x RGBA -- TODO confirm)
 47 |     """
 48 |     # draw the renderer
 49 |     fig.canvas.draw()
 50 |     # Read the pixel buffer from the figure; w and h are fetched but not used afterwards
 51 |     w,h = fig.canvas.get_width_height()
 52 |     buf = np.array(fig.canvas.renderer._renderer)  # NOTE(review): relies on the private _renderer attribute
 53 |     return buf
 54 | 
 55 | seq = []
 56 | plot_original_data()
 57 | def train(spendings,sales,w,b,alpha,epochs):  # run `epochs` gradient-descent steps, rendering a frame into seq at selected epochs; returns the final (w, b)
 58 |     image_counter = 2;
 59 |     for e in range(epochs):
 60 |         w, b = update_w_and_b(spendings,sales,w,b,alpha)
 61 |         # log the progress: at epoch 0, every 400 epochs below 3000, and every 3000 epochs
 62 |         if (e==0) or (e<3000 and e%400==0) or (e%3000==0):
 63 |             print("epoch: ", str(e), "loss: "+str(loss(spendings,sales,w,b)))
 64 |             print("w, b: ",w,b)
 65 |             plt.figure(image_counter)
 66 |             plt.xlabel("Spendings, M$")
 67 |             plt.ylabel("Sales, Units")
 68 |             axes = plt.gca()
 69 |             axes.set_xlim([0,50])
 70 |             axes.set_ylim([0,35])
 71 |             plt.scatter(spendings,sales,color='#1f77b4',marker='o' )
 72 |             X_plot = np.linspace(0,50,50)
 73 |             plt.plot(X_plot,X_plot*w + b)  # the current regression line
 74 |             heading = 'epoch = '+str(e)+' loss = '+str(round(loss(spendings,sales,w,b)))
 75 |             plt.title(heading)
 76 |             #plt.show()
 77 |             plt.tight_layout()
 78 |             fig1 = plt.gcf()
 79 |             nfig = fig2data(fig1)
 80 |             seq.append(nfig)  # seq is the module-level list of frames
 81 | ##            fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 82 | ##            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 83 | ##            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 84 | ##            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 85 |             image_counter += 1
 86 |     return w,b
 87 | 
 88 | def loss(spendings,sales,w,b):  # mean squared error of the line w*spendings + b against sales
 89 |     N = len(spendings)
 90 |     total_error = 0.0
 91 |     for i in range(N):
 92 |         total_error += (sales[i] - (w*spendings[i] + b))**2
 93 |     return total_error/N
 94 | 
 95 | df = pd.read_csv("data.csv")
 96 | x = df['Spendings']
 97 | y = df['Sales']
 98 | w,b = train(x,y,0.0,0.0,0.001,16000)
 99 | 
100 | def predict(x,w,b):  # evaluate the fitted line at x
101 |     return w*x + b
102 | x_new = 23.0
103 | y_new = predict(x_new, w, b)
104 | print(y_new)
105 | 
106 | import os
107 | ## Get the directory address of current python file
108 | curr_dir = os.path.dirname(os.path.realpath(__file__))
109 | os.chdir(curr_dir) ## Set the current directory as working directory
110 | 
111 | ## numpngw writes the collected frames as an animated PNG (APNG) file
112 | import numpngw
113 | numpngw.write_apng('gradient_descent.png',seq,delay = 750)
114 | 


--------------------------------------------------------------------------------
/Animated_Illustrations/kernel_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/kernel_regression.png


--------------------------------------------------------------------------------
/Animated_Illustrations/kernel_regression.py:
--------------------------------------------------------------------------------
 1 | import matplotlib
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import math
 5 | 
 6 | from sklearn.linear_model import Ridge
 7 | from sklearn.preprocessing import PolynomialFeatures
 8 | from sklearn.pipeline import make_pipeline
 9 | from sklearn.kernel_ridge import KernelRidge
10 | 
11 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
12 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
13 | matplotlib.rcParams.update({'font.size': 25})
14 | 
15 | def f(x):
16 |     """ target function used to generate the training data: x squared"""
17 |     return x * (x)
18 | 
19 | 
20 | # generate points used to plot
21 | x_plot = np.linspace(-5,2,100)
22 | 
23 | # generate points and keep a subset of them
24 | x = np.linspace(-5,2,100)
25 | rng = np.random.RandomState(0)
26 | rng.shuffle(x)
27 | x = np.sort(x[:50])
28 | noize = [(-5 + np.random.random()*5) for i in range(len(x))]
29 | y = f(x) + noize
30 | 
31 | # create matrix versions of these arrays
32 | X = x[:, np.newaxis]
33 | X_plot = x_plot[:, np.newaxis]
34 | 
35 | ##colors = ['red', 'blue', 'orange']
36 | lw = 2
37 | 
38 | def kernel(x1,x2,b=2):  # Gaussian kernel evaluated at the scaled difference (x1 - x2) / b
39 |     z = (x1-x2)/b
40 |     return (1/math.sqrt(2*3.14))*np.exp(-z**2/2)  # NOTE(review): 3.14 approximates pi; math.pi would give the exact constant
41 | 
42 | def fig2data(fig):
43 |     """
44 |     @brief Draw a Matplotlib figure and return the renderer's pixel buffer as a numpy array
45 |     @param fig a matplotlib figure
46 |     @return a numpy array copied from the renderer buffer (presumably height x width x RGBA -- TODO confirm)
47 |     """
48 |     # draw the renderer
49 |     fig.canvas.draw()
50 |  
51 |     # Read the pixel buffer from the figure; w and h are fetched but not used afterwards
52 |     w,h = fig.canvas.get_width_height()
53 |     buf = np.array(fig.canvas.renderer._renderer)  # NOTE(review): relies on the private _renderer attribute
54 |     
55 |     return buf
56 | 
57 | seq = []  # rendered frames, one per bandwidth value
58 | fit = ["fit", "small overfit", "big overfit"]  # not referenced by the loop below
59 | for count, degree in enumerate(np.arange(0.05,1.5,0.05)):  # despite its name, degree is the kernel bandwidth b
60 |     plt.figure(count)
61 |     axes = plt.gca()
62 |     axes.set_xlim([-5,2])
63 |     axes.set_ylim([-10,30])
64 |     degree = round(degree,2)  # rounded so the legend label stays short
65 |     plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples")
66 |     model = KernelRidge(alpha=0.01, kernel=kernel, kernel_params = {'b':degree})  # the custom kernel above receives b through kernel_params
67 |     model.fit(X, y)
68 |     y_plot = model.predict(X_plot)
69 |     plt.plot(x_plot, y_plot, color='green', linewidth=lw,
70 |              label="b = " + str(degree))
71 | 
72 | ##    heading = 'Iteration '+str(count)
73 | ##    plt.title(heading)
74 |     plt.legend(loc='upper right',prop={'size': 9})
75 |     fig1 = plt.gcf()
76 |     nfig = fig2data(fig1)
77 |     seq.append(nfig)
78 | ##    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
79 | ##    fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
80 | ##    fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
81 | ##    fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
82 | 
83 | 
84 | ##plt.show()
85 | 
86 | import os
87 | ## Get the directory address of current python file
88 | curr_dir = os.path.dirname(os.path.realpath(__file__))
89 | os.chdir(curr_dir) ## Set the current directory as working directory
90 | 
91 | ## numpngw writes the collected frames as an animated PNG (APNG) file
92 | import numpngw
93 | numpngw.write_apng('kernel_regression.png',seq,delay = 500)
94 | 
95 | 


--------------------------------------------------------------------------------
/Animated_Illustrations/kmeans alpha 0.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/kmeans alpha 0.3.png


--------------------------------------------------------------------------------
/Animated_Illustrations/kmeans.py:
--------------------------------------------------------------------------------
  1 | import matplotlib
  2 | import matplotlib.pyplot as plt
  3 | import numpy as np
  4 | 
  5 | from sklearn.datasets.samples_generator import make_blobs
  6 | from sklearn.metrics import pairwise_distances_argmin
  7 | from random import shuffle, random
  8 | from matplotlib.ticker import NullLocator
  9 | from scipy.spatial import Voronoi
 10 | 
 11 | 
 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 14 | matplotlib.rcParams.update({'font.size': 18})
 15 | 
 16 | x, _ = make_blobs(n_samples=50, centers=3, cluster_std=0.6, random_state=0)
 17 | 
 18 | #plt.scatter(x[:, 0], x[:, 1], s=50)
 19 | 
 20 | def voronoi_finite_polygons_2d(vor, radius=None):
 21 |     """
 22 |     Reconstruct infinite voronoi regions in a 2D diagram to finite
 23 |     regions.
 24 | 
 25 |     Parameters
 26 |     ----------
 27 |     vor : Voronoi
 28 |         Input diagram
 29 |     radius : float, optional
 30 |         Distance to 'points at infinity'.
 31 | 
 32 |     Returns
 33 |     -------
 34 |     regions : list of tuples
 35 |         Indices of vertices in each revised Voronoi regions.
 36 |     vertices : list of tuples
 37 |         Coordinates for revised Voronoi vertices. Same as coordinates
 38 |         of input vertices, with 'points at infinity' appended to the
 39 |         end.
 40 | 
 41 |     """
 42 | 
 43 |     if vor.points.shape[1] != 2:
 44 |         raise ValueError("Requires 2D input")
 45 | 
 46 |     new_regions = []
 47 |     new_vertices = vor.vertices.tolist()
 48 | 
 49 |     center = vor.points.mean(axis=0)
 50 |     if radius is None:
 51 |         radius = vor.points.ptp().max()*2  # NOTE(review): the ndarray.ptp method was removed in NumPy 2.0; np.ptp(vor.points) is the portable spelling
 52 | 
 53 |     # Construct a map containing all ridges for a given point
 54 |     all_ridges = {}
 55 |     for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices):
 56 |         all_ridges.setdefault(p1, []).append((p2, v1, v2))
 57 |         all_ridges.setdefault(p2, []).append((p1, v1, v2))
 58 | 
 59 |     # Reconstruct infinite regions
 60 |     for p1, region in enumerate(vor.point_region):
 61 |         vertices = vor.regions[region]
 62 | 
 63 |         if all([v >= 0 for v in vertices]):
 64 |             # finite region: every vertex index is non-negative, keep as is
 65 |             new_regions.append(vertices)
 66 |             continue
 67 | 
 68 |         # reconstruct a non-finite region
 69 |         ridges = all_ridges[p1]
 70 |         new_region = [v for v in vertices if v >= 0]
 71 | 
 72 |         for p2, v1, v2 in ridges:
 73 |             if v2 < 0:  # order the pair so that a negative (missing) vertex index ends up in v1
 74 |                 v1, v2 = v2, v1
 75 |             if v1 >= 0:
 76 |                 # finite ridge: already in the region
 77 |                 continue
 78 | 
 79 |             # Compute the missing endpoint of an infinite ridge
 80 | 
 81 |             t = vor.points[p2] - vor.points[p1] # tangent
 82 |             t /= np.linalg.norm(t)
 83 |             n = np.array([-t[1], t[0]])  # normal
 84 | 
 85 |             midpoint = vor.points[[p1, p2]].mean(axis=0)
 86 |             direction = np.sign(np.dot(midpoint - center, n)) * n  # orient the normal away from the centre of the input points
 87 |             far_point = vor.vertices[v2] + direction * radius
 88 | 
 89 |             new_region.append(len(new_vertices))
 90 |             new_vertices.append(far_point.tolist())
 91 | 
 92 |         # sort region counterclockwise (by angle around the mean of its vertices)
 93 |         vs = np.asarray([new_vertices[v] for v in new_region])
 94 |         c = vs.mean(axis=0)
 95 |         angles = np.arctan2(vs[:,1] - c[1], vs[:,0] - c[0])
 96 |         new_region = np.array(new_region)[np.argsort(angles)]
 97 | 
 98 |         # finish
 99 |         new_regions.append(new_region.tolist())
100 | 
101 |     return new_regions, np.asarray(new_vertices)
102 | 
103 | def fig2data(fig):
104 |     """
105 |     @brief Draw a Matplotlib figure and return the renderer's pixel buffer as a numpy array
106 |     @param fig a matplotlib figure
107 |     @return a numpy array copied from the renderer buffer (presumably height x width x RGBA -- TODO confirm)
108 |     """
109 |     # draw the renderer
110 |     fig.canvas.draw()
111 |  
112 |     # Read the pixel buffer from the figure; w and h are fetched but not used afterwards
113 |     w,h = fig.canvas.get_width_height()
114 |     buf = np.array(fig.canvas.renderer._renderer)  # NOTE(review): relies on the private _renderer attribute
115 |     
116 |     return buf
117 | 
118 | seq = []
119 | 
120 | def find_clusters(x, n_clusters):
121 |     # randomly set cluster centroids
122 |     x_list = list(x)
123 |     shuffle(x_list)
124 |     centroids = np.array([[2 * random(), 4 * random()], [2 * random(), 4 * random()], [2 * random(), 4 * random()]])
125 | 
126 |     counter = 0
127 | 
128 |     plt.figure(counter)
129 | 
130 |     plt.scatter(x[:, 0], x[:, 1], s=50)
131 | 
132 |     ax = plt.gca()
133 |     ax.set_xlabel('$x_1$')
134 |     ax.set_ylabel('$x_2$')
135 |     plt.xlim(-3.0, 4.0)
136 |     plt.ylim(-1, 6)
137 | 
138 |     fig1 = plt.gcf()
139 | 
140 | ##    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
141 | ##
142 | ##    fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
143 | ##    fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
144 | ##    fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
145 | 
146 |     #plt.show()
147 | 
148 |     
149 |     counter = 1
150 | 
151 |     while True:
152 | 
153 |         plt.figure(counter)
154 |         axes = plt.gca()
155 | 
156 |         # assign labels based on closest centroid
157 |         labels = pairwise_distances_argmin(x, centroids)
158 | 
159 |         plt.scatter(x[:, 0], x[:, 1], c=[l + 1 for l in labels], s=50, cmap='tab10', zorder=2);
160 | 
161 |         plt.scatter(centroids[:, 0], centroids[:, 1], c=[1,2,3], s=200, cmap='tab10', marker="s", facecolors='none', zorder=2);
162 |         plt.xlim(-3.0, 4.0)
163 |         plt.ylim(-1, 6)
164 | 
165 |         vor = Voronoi(centroids)
166 | 
167 |         # plot
168 |         regions, vertices = voronoi_finite_polygons_2d(vor, 300)
169 |         print("--")
170 |         print(regions)
171 | ##        print("--")
172 | ##        print(vertices)
173 | 
174 |         # colorize
175 |         for region in regions:
176 |             polygon = vertices[region]
177 |             plt.fill(*zip(*polygon), alpha=0.3, zorder=1)
178 | 
179 |         ax = plt.gca()
180 |         ax.set_xlabel('$x_1$')
181 |         ax.set_ylabel('$x_2$')
182 | 
183 |         heading = 'Iteration '+str(counter)
184 |         plt.title(heading)
185 |         plt.tight_layout()
186 |         
187 |         fig1 = plt.gcf()
188 |         nfig = fig2data(fig1)
189 |         seq.append(nfig)
190 |         #ax.set_axis_off()
191 | ##        fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
192 |         #plt.margins(0,0)
193 |         #ax.xaxis.set_major_locator(NullLocator())
194 |         #ax.yaxis.set_major_locator(NullLocator())
195 | 
196 |         
197 | ##        fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
198 | ##        fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
199 | ##        fig1.savefig('../../Illustrations/kmeans-' + str(counter) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
200 | 
201 |         #plt.show()
202 |         
203 |         # find new centroids as the average of examples
204 |         new_centroids = np.array([x[labels == i].mean(0) for i in range(n_clusters)])
205 |         
206 |         # check for convergence
207 |         if np.all(centroids == new_centroids):
208 |             break
209 |         centroids = new_centroids
210 | 
211 |         counter += 1
212 |     
213 |     return centroids, labels
214 | 
215 | centroids, labels = find_clusters(x,3)
216 | 
217 | import os
218 | ## Get the directory address of current python file
219 | curr_dir = os.path.dirname(os.path.realpath(__file__))
220 | os.chdir(curr_dir) ## Set the current directory as working directory
221 | 
222 | ## numpngw writes the collected frames as an animated PNG (APNG) file
223 | import numpngw
224 | numpngw.write_apng('kmeans.png',seq,delay = 500)
225 | 


--------------------------------------------------------------------------------
/Animated_Illustrations/linear_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aburkov/theMLbook/a15a9e28851fa49e0983b38727b75b46a1cce03f/Animated_Illustrations/linear_regression.png


--------------------------------------------------------------------------------
/Animated_Illustrations/linear_regression_fit.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | from sklearn.linear_model import Ridge
 5 | from sklearn.preprocessing import PolynomialFeatures
 6 | from sklearn.pipeline import make_pipeline
 7 | 
 8 | import matplotlib
 9 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
10 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
11 | matplotlib.rcParams.update({'font.size': 18})
12 | 
def f(x):
    """Return the noise-free target ``0.5 * x`` that the fitted models approximate."""
    return 0.5 * x


# dense, evenly spaced grid on which the fitted curves are drawn
x_plot = np.linspace(-10, 10, 100)

# training inputs: shuffle the grid with a seeded generator, keep the first
# 10 points and sort them
rng = np.random.RandomState(0)
x = np.linspace(-10, 10, 100)
rng.shuffle(x)
x = np.sort(x[:10])

# Additive noise, uniform on [-2, 0).  It is drawn from the same seeded
# generator as the shuffle (the unseeded global np.random was used before),
# so every run yields the same training set and hence the same animation.
noize = -2 + rng.random_sample(len(x)) * 2
y = f(x) + noize

# column-vector versions of the inputs, shape (n_samples, 1)
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]

colors = ['red', 'red']#, 'orange'
lw = 2
35 | 
def fig2data(fig):
    """
    @brief Render a Matplotlib figure and return its pixels as a numpy array
    @param fig a matplotlib figure whose canvas provides buffer_rgba() (Agg-based canvases do)
    @return a numpy 3D array of RGBA values, shaped like the canvas buffer (rows, columns, 4)
    """
    # the canvas buffer is only valid after the figure has been drawn
    fig.canvas.draw()

    # buffer_rgba() is the public accessor for the rendered pixels; the
    # private renderer._renderer attribute is not part of Matplotlib's API.
    # np.array copies, so the frame stays valid if the canvas is redrawn.
    return np.array(fig.canvas.buffer_rgba())
50 | 
seq = []  # rendered frames, one per polynomial degree
type_of_regression = ["linear regression", "regression of degree 10"]
fit = ["fit", "overfit"]

# Fit a ridge-regularised polynomial for every degree from 1 to 10 and
# capture each plot as one animation frame.
for count, degree in enumerate(range(1, 11)):
    fig1 = plt.figure(count)
    axes = plt.gca()
    axes.set_xlim([-10, 10])
    axes.set_ylim([-10, 10])
    plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples")
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks([-10.0, -5.0, 0.0, 5.0, 10.0])

    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    plt.plot(x_plot, model.predict(X_plot), color='red', linewidth=lw,
             label='linear regression of degree ' + str(degree))

    plt.legend(loc='best')
    plt.tight_layout()

    # rasterise the finished figure and keep it as the next frame
    seq.append(fig2data(fig1))

import os

# write the animation next to this script regardless of the working directory
curr_dir = os.path.dirname(os.path.realpath(__file__))
os.chdir(curr_dir)

# numpngw assembles the collected frames into an animated PNG (APNG)
import numpngw

numpngw.write_apng('linear_regression.png', seq, delay=500)
89 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/PCA_MNIST.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy as sp
 3 | import matplotlib
 4 | import matplotlib.pyplot as plt
 5 | import math
 6 | 
 7 | from sklearn.decomposition import PCA
 8 | 
 9 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
10 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
11 | matplotlib.rcParams.update({'font.size': 25})
12 | 
13 | from sklearn.datasets import fetch_mldata
14 | import matplotlib.pyplot as plt
15 | 
16 | 
17 | mnist = fetch_mldata("MNIST original")
18 | 
19 | reducer = PCA(n_components=2)
20 | embedding = reducer.fit_transform(mnist.data)
21 | 
22 | plt.figure()
23 | 
24 | plt.scatter(embedding[:, 0], embedding[:, 1], c=mnist.target, cmap="Spectral", s=0.1)
25 | 
26 | plt.gca().get_xaxis().set_ticklabels([])
27 | plt.gca().get_yaxis().set_ticklabels([])
28 | 
29 | ax = plt.gca()
30 | ax.set_xlabel('$x_1$')
31 | ax.set_ylabel('$x_2$')
32 | 
33 | fig1 = plt.gcf()
34 | 
35 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
36 | fig1.savefig('../../Illustrations/PCA-MNIST.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
37 | fig1.savefig('../../Illustrations/PCA-MNIST.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
38 | fig1.savefig('../../Illustrations/PCA-MNIST.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
39 | 
40 | plt.show()
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The Hundred-Page Machine Learning Book
2 | The Python code to reproduce the illustrations from [The Hundred-Page Machine Learning Book](http://themlbook.com/).
3 | 
4 | ![](http://themlbook.com/images/og-image3.png)
5 | 
6 | **WARNING!** To avoid buying counterfeit on Amazon, click on **[See All Buying Options](https://www.amazon.com/gp/offer-listing/199957950X/)** and choose "Amazon.com" and not a third-party seller.
7 | 


--------------------------------------------------------------------------------
/UMAP_MNIST.py:
--------------------------------------------------------------------------------
 1 | import umap
 2 | from sklearn.datasets import fetch_mldata
 3 | import matplotlib
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | from sklearn.decomposition import PCA
 7 | 
 8 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 9 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
10 | matplotlib.rcParams.update({'font.size': 25})
11 | 
12 | 
13 | mnist = fetch_mldata("MNIST original")
14 | 
15 | reducer = umap.UMAP(random_state=42)
16 | embedding = reducer.fit_transform(mnist.data)
17 | 
18 | plt.figure()
19 | 
20 | plt.scatter(embedding[:, 0], embedding[:, 1], c=mnist.target, cmap="Spectral", s=0.1)
21 | 
22 | plt.gca().get_xaxis().set_ticklabels([])
23 | plt.gca().get_yaxis().set_ticklabels([])
24 | 
25 | ax = plt.gca()
26 | ax.set_xlabel('$x_1$')
27 | ax.set_ylabel('$x_2$')
28 | 
29 | fig1 = plt.gcf()
30 | 
31 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
32 | fig1.savefig('../../Illustrations/UMAP-MNIST.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
33 | fig1.savefig('../../Illustrations/UMAP-MNIST.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
34 | fig1.savefig('../../Illustrations/UMAP-MNIST.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
35 | 
36 | plt.show()
37 | 


--------------------------------------------------------------------------------
/autoencoder_MNIST.py:
--------------------------------------------------------------------------------
 1 | import pylab as plt
 2 | import numpy as np
 3 | 
 4 | import matplotlib
 5 | import matplotlib.pyplot as plt
 6 | 
 7 | import keras
 8 | from keras.models import Sequential, Model
 9 | from keras.layers import Dense
10 | from keras.optimizers import Adam
11 | 
12 | from keras.datasets import mnist
13 | 
14 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
15 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
16 | matplotlib.rcParams.update({'font.size': 25})
17 | 
18 | from sklearn.datasets import fetch_mldata
19 | import matplotlib.pyplot as plt
20 | 
21 | 
22 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
23 | x_train = x_train.reshape(60000, 784) / 255.0
24 | x_test = x_test.reshape(10000, 784) / 255.0
25 | 
26 | m = Sequential()
27 | m.add(Dense(512,  activation='elu', input_shape=(784,)))
28 | m.add(Dense(128,  activation='elu'))
29 | m.add(Dense(2,    activation='linear', name="bottleneck"))
30 | m.add(Dense(128,  activation='elu'))
31 | m.add(Dense(512,  activation='elu'))
32 | m.add(Dense(784,  activation='sigmoid'))
33 | m.compile(loss='mean_squared_error', optimizer = Adam())
34 | history = m.fit(x_train, x_train, batch_size=128, epochs=5, verbose=1, 
35 |                 validation_data=(x_test, x_test))
36 | 
37 | encoder = Model(m.input, m.get_layer('bottleneck').output)
38 | embedding = encoder.predict(x_train)  # bottleneck representation
39 | 
40 | plt.figure()
41 | 
42 | plt.scatter(embedding[:,0], embedding[:,1], c=y_train, s=0.1, cmap='Spectral')
43 | 
44 | plt.gca().get_xaxis().set_ticklabels([])
45 | plt.gca().get_yaxis().set_ticklabels([])
46 | 
47 | ax = plt.gca()
48 | ax.set_xlabel('$x_1$')
49 | ax.set_ylabel('$x_2$')
50 | 
51 | fig1 = plt.gcf()
52 | 
53 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
54 | fig1.savefig('../../Illustrations/autoencoder-MNIST.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
55 | fig1.savefig('../../Illustrations/autoencoder-MNIST.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
56 | fig1.savefig('../../Illustrations/autoencoder-MNIST.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
57 | 
58 | plt.show()
59 | 


--------------------------------------------------------------------------------
/data.txt:
--------------------------------------------------------------------------------
1 | The dataset for the gradient descent example can be downloaded from: http://themlbook.com/wiki/doku.php?id=gradient_descent
2 | 


--------------------------------------------------------------------------------
/density_estimation.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import numpy as np
  3 | import scipy as sp
  4 | import matplotlib
  5 | import matplotlib.pyplot as plt
  6 | import math
  7 | 
  8 | from sklearn.neighbors import KernelDensity
  9 | 
 10 | import scipy.integrate as integrate
 11 | from sklearn.kernel_ridge import KernelRidge
 12 | 
 13 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 14 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 15 | matplotlib.rcParams.update({'font.size': 18})
 16 | 
 17 | mu1, sigma1 = 3.0, 1.0
 18 | mu2, sigma2 = 8.0, 1.5
 19 | 
 20 | def sample_points():
 21 |     s1 = np.random.normal(mu1, sigma1, 50)
 22 | 
 23 |     s2 = np.random.normal(mu2, sigma2, 50)
 24 | 
 25 |     return list(s1) + list(s2)
 26 | 
 27 | # generate points used to plot
 28 | x_plot = np.linspace(0, 12, 100)
 29 | 
 30 | # generate points and keep a subset of them
 31 | x = sample_points()
 32 | 
 33 | colors = ['red', 'blue', 'orange', 'green']
 34 | lw = 2
 35 | 
 36 | def kernel(x1, x2, bi = 2.0):
 37 |     z = (x1 - x2) / bi
 38 |     return (1.0/math.sqrt(2.0 * 3.14)) * math.exp((-1.0/2.0)*(z**2))
 39 | 
 40 | def fb(xx, data, bi):
 41 |     return (1/(len(data)*bi)) * sum([kernel(xx, xi, bi) for xi in data])
 42 | 
 43 | def fbi(i, data, bi):
 44 |     data_minus_i = []
 45 |     for ii in range(len(data)):
 46 |         if i != ii:
 47 |             data_minus_i.append(data[ii])
 48 |     return (1/(len(data_minus_i)*bi)) * sum([kernel(data[i], xi, bi) for xi in data_minus_i])
 49 | 
 50 | 
 51 | def sum_pdf(x):
 52 |     result = []
 53 |     for i in range(len(x)):
 54 |         result.append((sp.stats.norm.pdf(x, mu1, sigma1)[i] + sp.stats.norm.pdf(x, mu2, sigma2)[i])/2.0)
 55 |         #result.append(sp.stats.norm.pdf(x, mu1, sigma1)[i])
 56 |     return result
 57 | 
 58 | b = np.linspace(0.01, 3.0, 100)
 59 | 
 60 | score = []
 61 | for bi in b:
 62 |     def fb2(xx):
 63 |         return fb(xx, x, bi)**2
 64 | 
 65 |     s = integrate.quad(fb2, -np.inf, np.inf)[0] - 2.0*np.mean([fbi(i, x, bi) for i in range(len(x))])
 66 |     score.append(s)
 67 | 
 68 | plt.figure(1)
 69 | plt.plot(b,score)
 70 | plt.xlabel("$b$")
 71 | plt.ylabel("$l$")
 72 | plt.tight_layout()
 73 | plt.xticks(np.arange(0, 3.5, 0.5))
 74 | #plt.show()
 75 | fig1 = plt.gcf()
 76 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 77 | fig1.savefig('../../Illustrations/density-estimation-loss.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 78 | fig1.savefig('../../Illustrations/density-estimation-loss.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 79 | fig1.savefig('../../Illustrations/density-estimation-loss.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 80 | minb = [bi for bi, s in zip(b, score) if s == min(score)][0]
 81 | print(minb)
 82 | 
 83 | 
 84 | for count, degree in enumerate([round(minb, 2)] + [0.2, 2.0]):
 85 |     plt.figure(count+2)
 86 |     axes = plt.gca()
 87 |     axes.set_xlim([0,12])
 88 |     axes.set_ylim([0,0.3])
 89 |     plt.xlabel("$x$")
 90 |     plt.ylabel("pdf")
 91 |     plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples")
 92 |     plt.plot(x_plot, [fb(xp ,x, degree) for xp in x_plot], color=colors[count], linewidth=lw, label="$\\hat{f}_b$, $b = " + str(degree) + "$")
 93 |     plt.plot(x_plot,sum_pdf(x_plot), label="true pdf")
 94 | 
 95 |     plt.legend(loc='upper right')
 96 |     plt.tight_layout()
 97 | 
 98 |     fig1 = plt.gcf()
 99 |     fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
100 |     fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
101 |     fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
102 |     fig1.savefig('../../Illustrations/density-estimation-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
103 |     plt.show()
104 | 
105 | 


--------------------------------------------------------------------------------
/gaussian_mixture_model.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import numpy as np
  3 | import scipy as sp
  4 | import matplotlib
  5 | import matplotlib.pyplot as plt
  6 | import math
  7 | 
  8 | from sklearn.neighbors import KernelDensity
  9 | 
 10 | import scipy.integrate as integrate
 11 | from sklearn.kernel_ridge import KernelRidge
 12 | 
 13 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 14 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 15 | matplotlib.rcParams.update({'font.size': 18})
 16 | 
 17 | mu1, sigma1 = 3.0, 1.0
 18 | mu2, sigma2 = 8.0, 3.5
 19 | 
 20 | def sample_points():
 21 |     s1 = np.random.normal(mu1, math.sqrt(sigma1), 50)
 22 | 
 23 |     s2 = np.random.normal(mu2, math.sqrt(sigma2), 50)
 24 | 
 25 |     return list(s1) + list(s2)
 26 | 
 27 | def compute_bi(mu1local, sigma1local, mu2local, sigma2local, phi1local, phi2local):
 28 |     bis = []
 29 |     for xi in x:
 30 |         bis.append((sp.stats.norm.pdf(xi, mu1local, math.sqrt(sigma1local)) * phi1local)/(sp.stats.norm.pdf(xi, mu1local, math.sqrt(sigma1local)) * phi1local + sp.stats.norm.pdf(xi, mu2local, math.sqrt(sigma2local)) * phi2local))
 31 |     return bis
 32 | 
 33 | # generate points used to plot
 34 | x_plot = np.linspace(-2, 12, 100)
 35 | 
 36 | # generate points and keep a subset of them
 37 | x = sample_points()
 38 | 
 39 | colors = ['red', 'blue', 'orange', 'green']
 40 | lw = 2
 41 | 
 42 | mu1_estimate = 1.0
 43 | mu2_estimate = 2.0
 44 | sigma1_estimate = 1.0
 45 | sigma2_estimate = 2.0
 46 | 
 47 | phi1_estimate = 0.5
 48 | phi2_estimate = 0.5
 49 | 
 50 | count = 0
 51 | while True:
 52 |     plt.figure(count)
 53 |     axes = plt.gca()
 54 |     axes.set_xlim([-2,12])
 55 |     axes.set_ylim([0,0.8])
 56 |     plt.xlabel("$x$")
 57 |     plt.ylabel("pdf")
 58 |     plt.scatter(x, [0.005] * len(x), color='navy', s=30, marker=2, label="training examples")
 59 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1_estimate, math.sqrt(sigma1_estimate)) for xp in x_plot], color=colors[1], linewidth=lw, label="$f(x_i \\mid \\mu_1 ,\\sigma_1^2)$")
 60 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2_estimate, math.sqrt(sigma2_estimate)) for xp in x_plot], color=colors[3], linewidth=lw, label="$f(x_i \\mid \\mu_2 ,\\sigma_2^2)$")
 61 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu1, math.sqrt(sigma1)) for xp in x_plot], color=colors[0], label="true pdf")
 62 |     plt.plot(x_plot, [sp.stats.norm.pdf(xp, mu2, math.sqrt(sigma2)) for xp in x_plot], color=colors[0])
 63 | 
 64 |     plt.legend(loc='upper right')
 65 |     plt.tight_layout()
 66 | 
 67 |     fig1 = plt.gcf()
 68 |     fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 69 |     fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 70 |     fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 71 |     fig1.savefig('../../Illustrations/gaussian-mixture-model-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 72 |     #plt.show()
 73 | 
 74 |     bis1 = compute_bi(mu1_estimate, sigma1_estimate, mu2_estimate, sigma2_estimate, phi1_estimate, phi2_estimate)
 75 |     bis2 = compute_bi(mu2_estimate, sigma2_estimate, mu1_estimate, sigma1_estimate, phi2_estimate, phi1_estimate)
 76 | 
 77 |     #print bis1[:5]
 78 |     #print bis2[:5]
 79 | 
 80 |     mu1_estimate = sum([bis1[i] * x[i] for i in range(len(x))]) / sum([bis1[i] for i in range(len(x))])
 81 |     mu2_estimate = sum([bis2[i] * x[i] for i in range(len(x))]) / sum([bis2[i] for i in range(len(x))])
 82 | 
 83 |     sigma1_estimate = sum([bis1[i] * (x[i] - mu1_estimate)**2 for i in range(len(x))]) / sum([bis1[i] for i in range(len(x))])
 84 |     sigma2_estimate = sum([bis2[i] * (x[i] - mu2_estimate)**2 for i in range(len(x))]) / sum([bis2[i] for i in range(len(x))])
 85 | 
 86 |     #print mu1_estimate, mu2_estimate
 87 |     #print sigma1_estimate, sigma2_estimate
 88 | 
 89 |     phi1_estimate = sum([bis1[i] for i in range(len(x))])/float(len(x))
 90 |     phi2_estimate = 1.0 - phi1_estimate
 91 | 
 92 |     print(phi1_estimate)
 93 | 
 94 |     count += 1
 95 | 
 96 |     plt.close(count)
 97 | 
 98 |     if count > 50:
 99 |         break
100 | 
101 | 


--------------------------------------------------------------------------------
/gradient_descent.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | import matplotlib
 6 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 7 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 8 | matplotlib.rcParams.update({'font.size': 18})
 9 | 
10 | 
def plot_original_data():
    """Scatter-plot the two tab-separated columns of data.txt on the current
    figure and save it as gradient_descent-1 in eps, pdf and png form."""
    spendings, sales = np.loadtxt("data.txt", delimiter="\t", unpack=True)

    plt.scatter(spendings, sales, color='#1f77b4', marker='o')

    plt.xlabel("Spendings, M$")
    plt.ylabel("Sales, Units")
    plt.title("Sales as a function of radio ad spendings.")
    #plt.show()

    fig1 = plt.gcf()
    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    for ext in ('eps', 'pdf', 'png'):
        fig1.savefig('../../Illustrations/gradient_descent-1.' + ext, format=ext, dpi=1000, bbox_inches = 'tight', pad_inches = 0)
25 | 
def update_w_and_b(spendings, sales, w, b, alpha):
    """Take one gradient-descent step for the line ``w * spendings + b``.

    The gradients of the mean squared error with respect to ``w`` and ``b``
    are accumulated over all examples, averaged, and applied with learning
    rate ``alpha``.  Returns the updated ``(w, b)``.
    """
    N = len(spendings)
    grad_w = 0.0
    grad_b = 0.0

    for i in range(N):
        residual = sales[i] - (w * spendings[i] + b)
        grad_w += -2 * spendings[i] * residual
        grad_b += -2 * residual

    # step against the averaged gradient
    new_w = w - (grad_w/float(N)) * alpha
    new_b = b - (grad_b/float(N)) * alpha
    return new_w, new_b
40 | 
def train(spendings, sales, w, b, alpha, epochs):
    """Run ``epochs`` gradient-descent updates and return the final ``(w, b)``.

    On selected epochs the loss and parameters are printed and the current
    line is plotted over the data and saved as gradient_descent-<k>, with k
    counting up from 2.
    """
    image_counter = 2
    for e in range(epochs):
        w, b = update_w_and_b(spendings, sales, w, b, alpha)

        # snapshot every 400th epoch below 3000, and every 3000th epoch
        snapshot = (e < 3000 and e % 400 == 0) or (e % 3000 == 0)
        if not snapshot:
            continue

        print("epoch: ", str(e), "loss: "+str(loss(spendings, sales, w, b)))
        print("w, b: ", w, b)

        fig1 = plt.figure(image_counter)
        axes = plt.gca()
        axes.set_xlim([0,50])
        axes.set_ylim([0,30])
        plt.scatter(spendings, sales)
        X_plot = np.linspace(0,50,50)
        plt.plot(X_plot, X_plot*w + b)
        #plt.show()

        fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
        for ext in ('eps', 'pdf', 'png'):
            fig1.savefig('../../Illustrations/gradient_descent-' + str(image_counter) + '.' + ext, format=ext, dpi=1000, bbox_inches = 'tight', pad_inches = 0)
        image_counter += 1
    return w, b
65 | 
def loss(spendings, sales, w, b):
    """Return the mean squared error of the line ``w * spendings + b`` against ``sales``."""
    N = len(spendings)
    squared_errors = ((sales[i] - (w*spendings[i] + b))**2 for i in range(N))
    return sum(squared_errors, 0.0) / N
72 | 
# two tab-separated columns: spendings (x) and sales (y)
x, y = np.loadtxt("data.txt", delimiter= "\t", unpack = True)

# Plot the raw data first: plot_original_data() draws on the current figure,
# and train() makes its own numbered figures current.
plot_original_data()

# Fit the line.  With this call commented out, `w` and `b` were never
# defined and the prediction below stopped with a NameError.
w, b = train(x, y, 0.0, 0.0, 0.001, 15000)

def predict(x, w, b):
    """Return the model output ``w*x + b`` for input ``x``."""
    return w*x + b

x_new = 23.0
y_new = predict(x_new, w, b)
print(y_new)
83 | 


--------------------------------------------------------------------------------
/kernel_regression.py:
--------------------------------------------------------------------------------
 1 | import matplotlib
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import math
 5 | 
 6 | from sklearn.linear_model import Ridge
 7 | from sklearn.preprocessing import PolynomialFeatures
 8 | from sklearn.pipeline import make_pipeline
 9 | from sklearn.kernel_ridge import KernelRidge
10 | 
11 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
12 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
13 | matplotlib.rcParams.update({'font.size': 25})
14 | 
def f(x):
    """Return the noise-free target ``x * x`` that the kernel regression approximates."""
    return x * (x)


# dense, evenly spaced grid on which the fitted curves are drawn
x_plot = np.linspace(-5, 2, 100)

# training inputs: shuffle the grid with a seeded generator, keep the first
# 50 points and sort them
rng = np.random.RandomState(0)
x = np.linspace(-5, 2, 100)
rng.shuffle(x)
x = np.sort(x[:50])

# Additive noise, uniform on [-5, 0).  It is drawn from the same seeded
# generator as the shuffle (the unseeded global np.random was used before),
# so every run yields the same training set and hence the same figures.
noize = -5 + rng.random_sample(len(x)) * 5
y = f(x) + noize

# column-vector versions of the inputs, shape (n_samples, 1)
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]

colors = ['red', 'blue', 'orange']
lw = 2
37 | 
def kernel(x1, x2, b = 2):
    """Gaussian kernel between two points with bandwidth ``b``.

    ``x1`` and ``x2`` may be scalars or 1-D feature vectors (the script passes
    this function to KernelRidge as a callable kernel); the result is always
    a scalar: the standard normal density at ``||x1 - x2|| / b``.
    """
    z = (np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)) / b
    # Summing the squared components gives a true scalar even for length-1
    # vectors; math.pi replaces the truncated 3.14.
    return (1/math.sqrt(2 * math.pi)) * np.exp(-np.sum(z**2)/2)
41 | 
fit = ["fit", "small overfit", "big overfit"]

# one figure per kernel bandwidth b
for count, degree in enumerate([0.1, 0.5, 3]):
    plt.figure(count)
    axes = plt.gca()
    axes.set_xlim([-5,2])
    axes.set_ylim([-10,30])
    plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples")

    # kernel ridge regression with the custom kernel; the bandwidth reaches
    # kernel() through kernel_params
    model = KernelRidge(alpha=0.01, kernel=kernel, kernel_params = {'b':degree})
    model.fit(X, y)
    plt.plot(x_plot, model.predict(X_plot), color=colors[count], linewidth=lw,
             label="b = " + str(degree))

    plt.legend(loc='upper right')
    fig1 = plt.gcf()
    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    for ext in ('eps', 'pdf', 'png'):
        fig1.savefig('../../Illustrations/kernel-regression-' + str(count) + '.' + ext, format=ext, dpi=1000, bbox_inches = 'tight', pad_inches = 0)


plt.show()
64 | 


--------------------------------------------------------------------------------
/kernel_trick.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import math
  5 | 
  6 | from sklearn.linear_model import Ridge
  7 | from sklearn.preprocessing import PolynomialFeatures
  8 | from sklearn.pipeline import make_pipeline
  9 | from mpl_toolkits.mplot3d import Axes3D
 10 | 
 11 | import matplotlib
 12 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 13 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 14 | matplotlib.rcParams.update({'font.size': 18})
 15 | 
 16 | def f_outer(x1):
 17 | 	result = []
 18 | 	for x in x1:
 19 | 		side = random.uniform(0, 1)
 20 | 		sq = math.sqrt(10 * 10 - x * x)
 21 | 		if side > 0.5:
 22 | 			sq = sq * (-1)
 23 | 		result.append(sq)
 24 | 	return np.asarray(result)
 25 | 
 26 | def f_inner(x1):
 27 | 	result = []
 28 | 	for x in x1:
 29 | 		side = random.uniform(0, 1)
 30 | 		sq = math.sqrt(3 * 3 - x * x)
 31 | 		if side > 0.5:
 32 | 			sq = sq * (-1)
 33 | 		result.append(sq)
 34 | 	return np.asarray(result)
 35 | 
 36 | 
 37 | # generate points and keep a subset of them
 38 | x_inner = np.linspace(-3, 3, 100)
 39 | x_outer = np.linspace(-10, 10, 100)
 40 | 
 41 | rng = np.random.RandomState(0)
 42 | rng.shuffle(x_inner)
 43 | rng.shuffle(x_outer)
 44 | 
 45 | x_inner = np.sort(x_inner[:30])
 46 | x_outer = np.sort(x_outer[:30])
 47 | 
 48 | noize = [(-1 + np.random.random()) for i in range(len(x_inner))]
 49 | y_inner = f_inner(x_inner) + noize
 50 | 
 51 | noize = [(-1 + np.random.random()) for i in range(len(x_outer))]
 52 | y_outer = f_outer(x_outer) + noize
 53 | 
 54 | colors = ['blue', 'red']#, 'orange'
 55 | lw = 2
 56 | 
 57 | type_of_regression = ["linear regression", "regression of degree 10"]
 58 | fit = ["fit", "overfit"]
 59 | 
 60 | plt.figure(1)
 61 | axes = plt.gca()
 62 | axes.set_xlim([-11,11])
 63 | axes.set_ylim([-11,11])
 64 | 
 65 | plt.scatter(x_inner, y_inner, color='navy', s=30, marker='o')
 66 | plt.scatter(x_outer, y_outer, color='red', s=30, marker='o')
 67 | 
 68 | fig1 = plt.gcf()
 69 | 
 70 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 71 | fig1.savefig('../../Illustrations/kernel-trick-0.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 72 | fig1.savefig('../../Illustrations/kernel-trick-0.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 73 | fig1.savefig('../../Illustrations/kernel-trick-0.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 74 | 
 75 | x_inner_transformed = np.asarray([x * x for x in x_inner])
 76 | y_inner_transformed = np.asarray([math.sqrt(2) * x * y for x, y in zip(x_inner, y_inner)])
 77 | z_inner_transformed = np.asarray([y * y for y in y_inner])
 78 | 
 79 | x_outer_transformed = np.asarray([x * x for x in x_outer])
 80 | y_outer_transformed = np.asarray([math.sqrt(2) * x * y for x, y in zip(x_outer, y_outer)])
 81 | z_outer_transformed = np.asarray([y * y for y in y_outer])
 82 | 
 83 | fig = plt.figure(2)
 84 | ax = Axes3D(fig)
 85 | ax.set_yticks([-75, 0, 75])
 86 | #ax.set_xlim([-10,120])
 87 | #$ax.set_ylim([-120,120])
 88 | #ax.set_zlim([-120,120])
 89 | 
 90 | ax.scatter(x_inner_transformed, y_inner_transformed, z_inner_transformed, color='navy', marker='o')
 91 | ax.scatter(x_outer_transformed, y_outer_transformed, z_outer_transformed, color='red', marker='o')
 92 | 
 93 | ax.view_init(14, -77)
 94 | 
 95 | fig.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
 96 | fig.savefig('../../Illustrations/kernel-trick-1.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 97 | fig.savefig('../../Illustrations/kernel-trick-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 98 | fig.savefig('../../Illustrations/kernel-trick-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
 99 | 
100 | #plt.show()
101 | 


--------------------------------------------------------------------------------
/kmeans.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import matplotlib
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | 
  6 | from sklearn.datasets.samples_generator import make_blobs
  7 | from sklearn.metrics import pairwise_distances_argmin
  8 | from random import shuffle, random
  9 | from matplotlib.ticker import NullLocator
 10 | from scipy.spatial import Voronoi
 11 | 
 12 | 
 13 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 14 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 15 | matplotlib.rcParams.update({'font.size': 18})
 16 | 
 17 | x, _ = make_blobs(n_samples=50, centers=3, cluster_std=0.6, random_state=0)
 18 | 
 19 | #plt.scatter(x[:, 0], x[:, 1], s=50)
 20 | 
 21 | def voronoi_finite_polygons_2d(vor, radius=None):
 22 |     """
 23 |     
 24 |     Credit: https://gist.github.com/pv/8036995
 25 |     
 26 |     Reconstruct infinite voronoi regions in a 2D diagram to finite
 27 |     regions.
 28 | 
 29 |     Parameters
 30 |     ----------
 31 |     vor : Voronoi
 32 |         Input diagram
 33 |     radius : float, optional
 34 |         Distance to 'points at infinity'.
 35 | 
 36 |     Returns
 37 |     -------
 38 |     regions : list of tuples
 39 |         Indices of vertices in each revised Voronoi regions.
 40 |     vertices : list of tuples
 41 |         Coordinates for revised Voronoi vertices. Same as coordinates
 42 |         of input vertices, with 'points at infinity' appended to the
 43 |         end.
 44 | 
 45 |     """
 46 | 
 47 |     if vor.points.shape[1] != 2:
 48 |         raise ValueError("Requires 2D input")
 49 | 
 50 |     new_regions = []
 51 |     new_vertices = vor.vertices.tolist()
 52 | 
 53 |     center = vor.points.mean(axis=0)
 54 |     if radius is None:
 55 |         radius = vor.points.ptp().max()*2
 56 | 
 57 |     # Construct a map containing all ridges for a given point
 58 |     all_ridges = {}
 59 |     for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices):
 60 |         all_ridges.setdefault(p1, []).append((p2, v1, v2))
 61 |         all_ridges.setdefault(p2, []).append((p1, v1, v2))
 62 | 
 63 |     # Reconstruct infinite regions
 64 |     for p1, region in enumerate(vor.point_region):
 65 |         vertices = vor.regions[region]
 66 | 
 67 |         if all([v >= 0 for v in vertices]):
 68 |             # finite region
 69 |             new_regions.append(vertices)
 70 |             continue
 71 | 
 72 |         # reconstruct a non-finite region
 73 |         ridges = all_ridges[p1]
 74 |         new_region = [v for v in vertices if v >= 0]
 75 | 
 76 |         for p2, v1, v2 in ridges:
 77 |             if v2 < 0:
 78 |                 v1, v2 = v2, v1
 79 |             if v1 >= 0:
 80 |                 # finite ridge: already in the region
 81 |                 continue
 82 | 
 83 |             # Compute the missing endpoint of an infinite ridge
 84 | 
 85 |             t = vor.points[p2] - vor.points[p1] # tangent
 86 |             t /= np.linalg.norm(t)
 87 |             n = np.array([-t[1], t[0]])  # normal
 88 | 
 89 |             midpoint = vor.points[[p1, p2]].mean(axis=0)
 90 |             direction = np.sign(np.dot(midpoint - center, n)) * n
 91 |             far_point = vor.vertices[v2] + direction * radius
 92 | 
 93 |             new_region.append(len(new_vertices))
 94 |             new_vertices.append(far_point.tolist())
 95 | 
 96 |         # sort region counterclockwise
 97 |         vs = np.asarray([new_vertices[v] for v in new_region])
 98 |         c = vs.mean(axis=0)
 99 |         angles = np.arctan2(vs[:,1] - c[1], vs[:,0] - c[0])
100 |         new_region = np.array(new_region)[np.argsort(angles)]
101 | 
102 |         # finish
103 |         new_regions.append(new_region.tolist())
104 | 
105 |     return new_regions, np.asarray(new_vertices)
106 | 
def _save_kmeans_figure(fig, counter):
    # Write fig as kmeans-<counter>.eps/.pdf/.png into the Illustrations folder.
    fig.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    stem = '../../Illustrations/kmeans-' + str(counter)
    fig.savefig(stem + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig.savefig(stem + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig.savefig(stem + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)


def find_clusters(x, n_clusters):
    """Run k-means on ``x`` and save one illustration per iteration.

    Figure 0 shows the raw points. Each following figure shows the points
    coloured by their closest centroid, the centroids as hollow squares and
    the Voronoi cells of the centroids as shaded polygons.

    Parameters
    ----------
    x : ndarray
        Examples to cluster; columns 0 and 1 are plotted.
    n_clusters : int
        Number of centroids. The centroids are also fed to
        ``scipy.spatial.Voronoi``, so the value must be one that Voronoi
        accepts (the script uses 3).

    Returns
    -------
    centroids : ndarray
        Final centroid coordinates, one row per cluster.
    labels : ndarray
        Index of the closest centroid for every example.
    """
    # randomly set cluster centroids inside [0, 2) x [0, 4)
    centroids = np.array([[2 * random(), 4 * random()] for _ in range(n_clusters)])
    centroid_colors = list(range(1, n_clusters + 1))

    counter = 0

    plt.figure(counter)

    plt.scatter(x[:, 0], x[:, 1], s=50)

    ax = plt.gca()
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    plt.xlim(-3.0, 4.0)
    plt.ylim(-1, 6)

    _save_kmeans_figure(plt.gcf(), counter)

    #plt.show()

    counter = 1

    while True:

        plt.figure(counter)

        # assign labels based on closest centroid
        labels = pairwise_distances_argmin(x, centroids)

        # vmin/vmax pin the colour scale so that a point and its centroid get
        # the same colour even when some cluster currently has no points
        plt.scatter(x[:, 0], x[:, 1], c=[l + 1 for l in labels], s=50, cmap='tab10',
                    vmin=1, vmax=n_clusters, zorder=2)

        plt.scatter(centroids[:, 0], centroids[:, 1], c=centroid_colors, s=200, cmap='tab10',
                    vmin=1, vmax=n_clusters, marker="s", facecolors='none', zorder=2)
        plt.xlim(-3.0, 4.0)
        plt.ylim(-1, 6)

        vor = Voronoi(centroids)

        # plot
        regions, vertices = voronoi_finite_polygons_2d(vor, 300)
        print("--")
        print(regions)
        print("--")
        print(vertices)

        # colorize
        for region in regions:
            polygon = vertices[region]
            plt.fill(*zip(*polygon), alpha=0.4, zorder=1)

        ax = plt.gca()
        ax.set_xlabel('$x_1$')
        ax.set_ylabel('$x_2$')

        _save_kmeans_figure(plt.gcf(), counter)

        #plt.show()

        # find new centroids as the average of examples; a centroid without
        # any assigned example keeps its position instead of becoming NaN
        new_centroids = np.array([
            x[labels == i].mean(0) if np.any(labels == i) else centroids[i]
            for i in range(n_clusters)
        ])

        # check for convergence
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        counter += 1

    return centroids, labels
196 | 
197 | centroids, labels = find_clusters(x, 3)
198 | 


--------------------------------------------------------------------------------
/linear_regression_fit.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | from sklearn.linear_model import Ridge
 5 | from sklearn.preprocessing import PolynomialFeatures
 6 | from sklearn.pipeline import make_pipeline
 7 | 
 8 | import matplotlib
 9 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
10 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
11 | matplotlib.rcParams.update({'font.size': 18})
12 | 
def f(x):
    """Return half of ``x``: the linear function the noisy examples are built from."""
    slope = 0.5
    return slope * x
16 | 
17 | 
# generate points used to plot
x_plot = np.linspace(-10, 10, 100)

# generate points and keep a subset of them
x = np.linspace(-10, 10, 100)
rng = np.random.RandomState(0)
rng.shuffle(x)
x = np.sort(x[:10])
# Noise uniform in [-2, 0), drawn from the seeded generator so that the saved
# figures are identical from one run to the next.
noize = -2 + rng.random_sample(len(x)) * 2
y = f(x) + noize

# create matrix versions of these arrays
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]

colors = ['red', 'red']#, 'orange'
lw = 2


type_of_regression = ["linear regression", "regression of degree 10"]
fit = ["fit", "overfit"]
# One figure per polynomial degree: training examples plus the fitted curve.
for count, degree in enumerate([1,10]):#, 2, 15
    plt.figure(count)
    axes = plt.gca()
    axes.set_xlim([-10,10])
    axes.set_ylim([-10,10])
    plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples")
    plt.xticks([-10.0, -5.0, 0.0, 5.0, 10.0])
    plt.yticks([-10.0, -5.0, 0.0, 5.0, 10.0])
    # polynomial feature expansion followed by ridge regression
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw,
             label=type_of_regression[count])

    plt.legend(loc='best')
    fig1 = plt.gcf()
    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/linear-regression-' + fit[count] + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)


plt.show()
62 | 


--------------------------------------------------------------------------------
/multivariate_gaussian.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib
 3 | 
 4 | from scipy.stats import multivariate_normal
 5 | from sklearn.linear_model import Ridge
 6 | from sklearn.preprocessing import PolynomialFeatures
 7 | from sklearn.pipeline import make_pipeline
 8 | from mpl_toolkits.mplot3d import Axes3D
 9 | 
10 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
11 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
12 | matplotlib.rcParams.update({'font.size': 18})
13 | 
14 | import matplotlib.pyplot as plt
15 | 
mean = [0, 0]
# A covariance matrix must be symmetric; the previous [[1, 4/5], [3/4, 2]]
# was not (and was not diagonal either). Both off-diagonal entries are 0.75.
cov = [[1, 0.75], [0.75, 2]]

# Figure 1: scatter plot of 200 samples drawn from the distribution
x, y = np.random.multivariate_normal(mean, cov, 200).T
fig = plt.figure(1)
plt.plot(x, y, 'o')
plt.axis('equal')
plt.xlabel('$x^{(1)}$')
plt.ylabel('$x^{(2)}$')

fig.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.02, hspace = 0, wspace = 0)
fig.savefig('../../Illustrations/multivariate-gaussian-0.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
fig.savefig('../../Illustrations/multivariate-gaussian-0.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
fig.savefig('../../Illustrations/multivariate-gaussian-0.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)

# Figure 2: the same samples on the z = 0 plane under a wireframe of the pdf
fig1 = plt.figure(2)

# add_subplot registers the 3D axes with the figure; constructing
# Axes3D(fig1) directly no longer does so on recent Matplotlib releases,
# which leaves the saved figure empty.
ax = fig1.add_subplot(111, projection='3d')

# evaluate the pdf on a regular grid over [-5, 5) x [-5, 5)
x1, y1 = np.mgrid[-5:5:.2, -5:5:.2]
pos = np.empty(x1.shape + (2,))
pos[:, :, 0] = x1; pos[:, :, 1] = y1
rv = multivariate_normal(mean, cov)
#ax.plot_surface(x1, y1, rv.pdf(pos), rstride=1, cstride=1, alpha=0.8, cmap='viridis', edgecolor='none')
ax.plot_wireframe(x1, y1, rv.pdf(pos), rstride=2, cstride=2, color='gray')

z = [0] * len(x)
ax.scatter(x, y, z)

ax.set_xlabel('$x^{(1)}$')
ax.set_ylabel('$x^{(2)}$')
ax.set_zlabel('pdf');
ax.set_zticks([])
ax.set_xticks([])
ax.set_yticks([])

#ax.view_init(14, -77)

fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.9, left = 0.08, hspace = 0, wspace = 0)
fig1.savefig('../../Illustrations/multivariate-gaussian-1.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
fig1.savefig('../../Illustrations/multivariate-gaussian-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
fig1.savefig('../../Illustrations/multivariate-gaussian-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)

#plt.show()
60 | 


--------------------------------------------------------------------------------
/pdf.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy as sp
 3 | import matplotlib.pyplot as plt
 4 | import math
 5 | 
 6 | import matplotlib
 7 | 
 8 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 9 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
10 | matplotlib.rcParams.update({'font.size': 18})
11 | 
12 | from sklearn.kernel_ridge import KernelRidge
13 | 
# Mean and standard deviation of the two Gaussian components
mu1 = 3
sigma1 = 0.4
mu2 = 5
sigma2 = 0.6


def sample_points():
    """Draw 20 values from each Gaussian component and return them as one list.

    The values of the first component come first, followed by those of the
    second one.
    """
    draws = []
    for mu, sigma in ((mu1, sigma1), (mu2, sigma2)):
        draws.extend(np.random.normal(mu, sigma, 20))
    return draws


# generate points used to plot
x_plot = np.linspace(0, 8, 100)

# random sample drawn from the two components
x = sample_points()

lw = 2
31 | 
def kernel(x1, x2, b = 2):
    """Gaussian kernel evaluated at the scaled difference ``(x1 - x2) / b``.

    Parameters
    ----------
    x1, x2 : float or ndarray
        Points to compare.
    b : float, optional
        Bandwidth the difference is divided by (default 2).

    Returns
    -------
    float or ndarray
        ``exp(-z**2 / 2) / sqrt(2 * pi)`` with ``z = (x1 - x2) / b``.
    """
    z = (x1 - x2) / b
    # math.pi rather than the 3.14 approximation, so that the kernel is the
    # exact standard normal density
    return (1/math.sqrt(2 * math.pi)) * np.exp(-z**2/2)
35 | 
def fb(x, data, b):
    """Kernel estimate at ``x``: the kernel values of ``x`` against every
    element of ``data`` are summed and scaled by ``1 / (len(data) * b)``."""
    kernel_total = sum([kernel(x, sample, b) for sample in data])
    scale = 1/(len(data)*b)
    return scale * kernel_total
38 | 
def sum_pdf(x):
    """Average of the two Gaussian densities evaluated at every element of ``x``.

    Parameters
    ----------
    x : sequence or ndarray
        Points at which the density is evaluated.

    Returns
    -------
    list
        One value per element of ``x``: the mean of the normal pdf with
        parameters (mu1, sigma1) and the normal pdf with parameters
        (mu2, sigma2).
    """
    # Imported here so that the scipy.stats submodule is guaranteed to be
    # loaded; a bare ``import scipy`` does not promise that.
    from scipy.stats import norm

    # Evaluate each density once for all points instead of once per point.
    density = (norm.pdf(x, mu1, sigma1) + norm.pdf(x, mu2, sigma2)) / 2
    return list(density)
44 | 
# Figure 0: the density curve with the area under it shaded
fig1 = plt.figure(0)
axes = fig1.gca()
axes.set_ylim([0,0.6])
axes.plot(x_plot, sum_pdf(x_plot), color='red')

# finer grid used for the shaded area
section = np.arange(0, 8, 1/20.)
axes.fill_between(section, sum_pdf(section), color='#e6eeff')
axes.text(3.2, 0.04, "Area = 1.0", fontsize=18)
axes.set_xlabel("$x$")
axes.set_ylabel("$pdf$")

#plt.legend(loc='lower left')
fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.12, hspace = 0, wspace = 0)

# options shared by the three output formats
save_options = dict(dpi=1000, bbox_inches='tight', pad_inches=0.1)
fig1.savefig('../../Illustrations/pdf.eps', format='eps', **save_options)
fig1.savefig('../../Illustrations/pdf.pdf', format='pdf', **save_options)
fig1.savefig('../../Illustrations/pdf.png', **save_options)

plt.show()
63 | 


--------------------------------------------------------------------------------
/pmf.py:
--------------------------------------------------------------------------------
 1 | from matplotlib.ticker import FuncFormatter
 2 | import matplotlib.pyplot as plt
 3 | import numpy as np
 4 | import matplotlib
 5 | 
 6 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 7 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 8 | matplotlib.rcParams.update({'font.size': 18})
 9 | 
# Bar positions and the probability assigned to each of the four values
x = np.arange(4)
pr = [0.1, 0.3, 0.4, 0.2]

fig1 = plt.gcf()
axes = fig1.gca()
axes.set_ylim([0,0.6])

# one red bar per value, labelled 1..4 on the horizontal axis
axes.bar(x, pr, color="red")
axes.set_xticks(x)
axes.set_xticklabels(('1', '2', '3', '4'))
axes.set_yticks(np.arange(0, 0.7, 0.1))
axes.set_xlabel("$x$")
axes.set_ylabel("$pmf$")

fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.12, hspace = 0, wspace = 0)

# options shared by the three output formats
save_options = dict(dpi=1000, bbox_inches='tight', pad_inches=0.1)
fig1.savefig('../../Illustrations/pmf.eps', format='eps', **save_options)
fig1.savefig('../../Illustrations/pmf.pdf', format='pdf', **save_options)
fig1.savefig('../../Illustrations/pmf.png', **save_options)


plt.show()
30 | 


--------------------------------------------------------------------------------
/prediction_strength.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import matplotlib
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import random
  6 | import sys
  7 | import math
  8 | 
  9 | from sklearn.datasets.samples_generator import make_blobs
 10 | from sklearn.metrics import pairwise_distances_argmin
 11 | from random import shuffle
 12 | from scipy.spatial import Voronoi
 13 | from scipy.spatial import distance
 14 | 
 15 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 16 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 17 | matplotlib.rcParams.update({'font.size': 25})
 18 | 
 19 | random_state = 0
 20 | 
 21 | ## how many clusters do you want in your synthetic data?
 22 | centers = 2
 23 | 
 24 | x, _ = make_blobs(n_samples=300, centers=centers, cluster_std=0.6, random_state=random_state)
 25 | 
 26 | plt.figure(10000)
 27 | plt.scatter(x[:, 0], x[:, 1], s=20, cmap='viridis');
 28 | plt.xlim(-1, 4.0)
 29 | plt.ylim(-1, math.ceil(max(x[:, 1])))
 30 | plt.xticks(np.arange(int(min(x[:, 0])), math.ceil(max(x[:, 0]))+1, 1))
 31 | plt.yticks(np.arange(int(min(x[:, 1])), math.ceil(max(x[:, 1]))+1, 2), rotation='vertical')
 32 | 
 33 | ax = plt.gca()
 34 | ax.set_xlabel('$x_1$')
 35 | ax.set_ylabel('$x_2$')
 36 | 
 37 | fig1 = plt.gcf()
 38 | fig1.subplots_adjust(top = 0.98, bottom = 0.16, right = 0.98, left = 0.12, hspace = 0, wspace = 0)
 39 | fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '.eps', format='eps', dpi=1000)
 40 | fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '.pdf', format='pdf', dpi=1000)
 41 | fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '.png', dpi=1000)
 42 | 
 43 | x_list = list(x)
 44 | 
 45 | random.Random(random_state).shuffle(x_list)
 46 | 
 47 | x_split = {}
 48 | 
 49 | x_split["train"] = np.array(x_list[:len(x_list)/2])
 50 | 
 51 | x_split["test"] = np.array(x_list[len(x_list)/2:])
 52 | 
 53 | centroids_splits = {}
 54 | labels_splits = {}
 55 | counter = 100
 56 | 
 57 | def find_clusters(x, n_clusters, current_split):
 58 | 
 59 |     current_split_suffled = list(x_split[current_split])[:]
 60 |     shuffle(current_split_suffled)
 61 |     current_split_suffled = np.array(current_split_suffled)
 62 | 
 63 |     centroids = np.array(current_split_suffled[:n_clusters])
 64 | 
 65 |     while True:
 66 | 
 67 |         # assign labels based on closest centroid
 68 |         #print centroids
 69 | 
 70 |         #print "len train", len(x_split[current_split])
 71 |         labels = pairwise_distances_argmin(x_split[current_split], centroids)
 72 |         #print "len labels", len(labels)
 73 | 
 74 |         
 75 |         # find new centroids as the average of examples
 76 |         new_centroids = np.array([x_split[current_split][labels == i].mean(0) for i in range(n_clusters)])
 77 |         
 78 |         # check for convergence
 79 |         if np.all(centroids == new_centroids):
 80 |             break
 81 |         centroids = new_centroids
 82 | 
 83 |     return centroids, labels
 84 | 
 85 | def get_examples_from_cluster(j, test_points, test_labels):
 86 |     examples = []
 87 |     for e, l in zip(test_points, test_labels):
 88 |         if l == j:
 89 |             examples.append(e)
 90 |     return examples
 91 | 
 92 | def get_closest_centroid(example, centroids):
 93 |     min_distance = sys.float_info.max
 94 |     min_centroid = 0
 95 |     for c in centroids:
 96 |         if distance.euclidean(example, c) < min_distance:
 97 |             min_distance = distance.euclidean(example, c)
 98 |             min_centroid = c
 99 |     return min_centroid
100 | 
def compute_strength(k, train_centroids, test_points, test_labels):
    """Score how well the training centroids reproduce the test clustering.

    For every cluster ``j`` of the test labels, the function computes the
    fraction of ordered pairs of test points from that cluster (points with
    identical coordinates are not paired) that have the same closest training
    centroid. The smallest of these fractions is returned.

    Parameters
    ----------
    k : int
        Number of clusters; labels ``0 .. k-1`` are evaluated.
    train_centroids : sequence of points
        Centroids found on the training half.
    test_points : sequence of points
        Examples of the test half.
    test_labels : sequence of int
        Cluster label of every test point.

    Returns
    -------
    float
        Minimum over the clusters of the per-cluster fraction.

    Raises
    ------
    ZeroDivisionError
        If some cluster contains fewer than two test points.
    """
    n_points = len(test_points)
    keys = [tuple(p) for p in test_points]

    # closest training centroid of every test point, computed once per point
    # rather than once per pair
    nearest = [tuple(get_closest_centroid(p, train_centroids)) for p in test_points]

    # D[a, b] is 1 when the distinct points a and b share a training centroid
    D = np.zeros(shape=(n_points, n_points))
    for a in range(n_points):
        for b in range(n_points):
            if keys[a] != keys[b] and nearest[a] == nearest[b]:
                D[a, b] = 1.0

    ss = []
    for j in range(k):
        members = [i for i, label in enumerate(test_labels) if label == j]
        s = 0
        for a in members:
            for b in members:
                if keys[a] != keys[b]:
                    s += D[a, b]
        size = len(members)
        # normalise by the number of ordered pairs in the cluster
        s = (1.0/(float(size)*float(size - 1)))*s
        ss += [s]

    return min(ss)
121 | 
# Cluster both halves for every candidate k and record the resulting strength
ks = [1,2,3,4,5,6,7,8]
strengths = []
for k in ks:
    print("k", k)
    for current_split in ("train", "test"):
        counter += 1
        centroids, labels = find_clusters(x, k, current_split)

        centroids_splits[current_split] = centroids
        labels_splits[current_split] = labels
    s = compute_strength(k, centroids_splits["train"], x_split["test"], labels_splits["test"])
    strengths.append(s)
    print(s)

# Figure 10001: strength as a function of k
fig1 = plt.figure(10001)
ax = fig1.gca()
ax.plot(ks, strengths)
plt.xticks(np.arange(1, 9, 1))
plt.yticks(np.arange(0, 1.05, 0.2), rotation='vertical')

ax.set_xlabel('$k$')
ax.set_ylabel('$\\operatorname{ps}(k)$')

fig1.subplots_adjust(top = 0.98, bottom = 0.15, right = 0.98, left = 0.15, hspace = 0, wspace = 0)
fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '_search.eps', format='eps', dpi=1000)
fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '_search.pdf', format='pdf', dpi=1000)
fig1.savefig('../../Illustrations/prediction_strength_centers_' + str(centers) + '_search.png', dpi=1000)
150 | 


--------------------------------------------------------------------------------
/standard_logistic_function.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pylab as plt
 2 | import matplotlib
 3 | import numpy as np
 4 | 
 5 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 6 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 7 | matplotlib.rcParams.update({'font.size': 18})
 8 | 
 9 | def sigmoid(x):
10 |     """
11 |     evaluate the boltzman function with midpoint xmid and time constant tau
12 |     over x
13 |     """
14 |     return 1. / (1. + np.exp(-x))
15 | 
16 | 
# Evaluate the function on 100 evenly spaced points of [-6, 6]
x = np.linspace(-6, 6, 100)
S = sigmoid(x)

fig1 = plt.gcf()
axes = fig1.gca()
axes.plot(x, S, color='red', lw=2)
axes.set_xlabel("$x$")
axes.set_ylabel("$f(x)$")

fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)

# options shared by the three output formats
save_options = dict(dpi=1000, bbox_inches='tight', pad_inches=0)
fig1.savefig('../../Illustrations/standard_logistic_function.eps', format='eps', **save_options)
fig1.savefig('../../Illustrations/standard_logistic_function.pdf', format='pdf', **save_options)
fig1.savefig('../../Illustrations/standard_logistic_function.png', **save_options)

plt.show()
30 | 


--------------------------------------------------------------------------------
/under_over_fitting.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | from sklearn.linear_model import Ridge
 5 | from sklearn.preprocessing import PolynomialFeatures
 6 | from sklearn.pipeline import make_pipeline
 7 | 
 8 | import matplotlib
 9 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
10 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
11 | matplotlib.rcParams.update({'font.size': 25})
12 | 
13 | 
def f(x):
    """Return the square of ``x``: the quadratic function the noisy examples are built from."""
    squared = x * x
    return squared
17 | 
18 | 
# generate points used to plot
x_plot = np.linspace(-5, 2, 100)

# generate points and keep a subset of them
x = np.linspace(-5, 2, 100)
rng = np.random.RandomState(0)
rng.shuffle(x)
x = np.sort(x[:20])
# Noise uniform in [-5, 0), drawn from the seeded generator so that the saved
# figures are identical from one run to the next.
noize = -5 + rng.random_sample(len(x)) * 5
y = f(x) + noize

# create matrix versions of these arrays
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]

colors = ['red', 'blue', 'orange']
lw = 2

fit = ["underfit", "fit", "overfit"]
# One figure per polynomial degree: training examples plus the fitted curve.
for count, degree in enumerate([1, 2, 15]):
    plt.figure(count)
    axes = plt.gca()
    axes.set_xlim([-5,2])
    axes.set_ylim([-10,30])
    plt.scatter(x, y, color='navy', s=30, marker='o', label="training examples")
    # polynomial feature expansion followed by ridge regression
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw,
             label=("degree %d (" + fit[count] + ")") % degree)

    plt.legend(loc='best')
    fig1 = plt.gcf()
    fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.08, hspace = 0, wspace = 0)
    # the .eps name now carries the same "under-over-fit-" prefix as the
    # .pdf and .png files (the hyphen before the index was missing)
    fig1.savefig('../../Illustrations/under-over-fit-' + str(count) + '.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/under-over-fit-' + str(count) + '.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0)
    fig1.savefig('../../Illustrations/under-over-fit-' + str(count) + '.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0)


plt.show()
59 | 


--------------------------------------------------------------------------------
/vector.py:
--------------------------------------------------------------------------------
 1 | import matplotlib
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | matplotlib.rcParams['mathtext.fontset'] = 'stix'
 5 | matplotlib.rcParams['font.family'] = 'STIXGeneral'
 6 | matplotlib.rcParams.update({'font.size': 18})
 7 | 
 8 | 
 9 | plt.figure(1)
10 | plt.quiver([0, 0, 0], [0, 0, 0], [2, -2, 1], [3, 5, 0], color=['r','b','g'], angles='xy', scale_units='xy', scale=1)
11 | plt.xlim(-3, 3)
12 | plt.ylim(-1, 6)
13 | plt.xlabel('$x^{(1)}$')
14 | plt.ylabel('$x^{(2)}$')
15 | fig1 = plt.gcf()
16 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.12, hspace = 0, wspace = 0)
17 | fig1.savefig('../../Illustrations/vector-0.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
18 | fig1.savefig('../../Illustrations/vector-0.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
19 | fig1.savefig('../../Illustrations/vector-0.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
20 | plt.show()
21 | 
22 | plt.figure(2)
23 | plt.scatter([2, -2, 1], [3, 5, 0], color=['r','b','g'])
24 | plt.xlim(-3, 3)
25 | plt.ylim(-1, 6)
26 | plt.xlabel('$x^{(1)}$')
27 | plt.ylabel('$x^{(2)}$')
28 | fig1 = plt.gcf()
29 | fig1.subplots_adjust(top = 0.98, bottom = 0.1, right = 0.98, left = 0.12, hspace = 0, wspace = 0)
30 | fig1.savefig('../../Illustrations/vector-1.eps', format='eps', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
31 | fig1.savefig('../../Illustrations/vector-1.pdf', format='pdf', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
32 | fig1.savefig('../../Illustrations/vector-1.png', dpi=1000, bbox_inches = 'tight', pad_inches = 0.1)
33 | plt.show()
34 | 


--------------------------------------------------------------------------------