├── code ├── activation_functions │ ├── relu_plot.py │ ├── sine_plot.py │ ├── softplus_plot.py │ ├── softsign_plot.py │ ├── repu_plot.py │ ├── clipping_plot.py │ ├── leaky_relu_plot.py │ ├── swish_plot.py │ ├── gelu_plot.py │ ├── plot_util.py │ ├── logistic_plot.py │ ├── heaviside_plot.py │ ├── elu_plot.py │ └── tanh_plot.py ├── fc-ann2.py ├── loss_functions │ ├── l1loss_plot.py │ ├── mseloss_plot.py │ ├── crossentropyloss_plot.py │ ├── kldloss_plot.py │ └── huberloss_plot.py ├── gradient_plot1.py ├── brownian_motion.py ├── conv-ann-ex.py ├── res-ann.py ├── gradient_plot2.py ├── conv-ann.py ├── optimization_methods │ ├── adagrad.py │ ├── rmsprop.py │ ├── momentum_sgd_bias_adj.py │ ├── rmsprop_bias_adj.py │ ├── adadelta.py │ ├── adam.py │ ├── nesterov_sgd.py │ ├── midpoint_sgd.py │ ├── momentum_sgd.py │ ├── sgd2.py │ └── sgd.py ├── fc-ann.py ├── fc-ann-manual.py ├── example_GD_momentum_plots.py ├── kolmogorov.py ├── dgm.py ├── pinn.py ├── mnist_optim.py └── mnist.py └── README.md /code/activation_functions/relu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x)) 11 | 12 | plt.savefig("../../plots/relu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/sine_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2*np.pi,2*np.pi), (-1.5,1.5)) 7 | 8 | x = np.linspace(-2*np.pi, 2*np.pi, 100) 9 | 10 | ax.plot(x, np.sin(x)) 11 | 12 | plt.savefig("../../plots/sine.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/softplus_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-4,4), (-.5,4)) 7 | 8 | x = np.linspace(-4, 4, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), label='ReLU') 11 | ax.plot(x, tf.keras.activations.softplus(x), label='softplus') 12 | ax.legend() 13 | 14 | plt.savefig("../../plots/softplus.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/softsign_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-5,5), (-1.5,1.5)) 7 | 8 | x = np.linspace(-5, 5, 100) 9 | 10 | ax.plot(x, tf.keras.activations.tanh(x), label='tanh') 11 | ax.plot(x, tf.keras.activations.softsign(x), label='softsign') 12 | ax.legend() 13 | 14 | plt.savefig("../../plots/softsign.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code for the book *Mathematical Introduction to Deep Learning: Methods, Implementations, and Theory* 2 | 3 | This repository is a companion to the book *Mathematical 
Introduction to Deep Learning: Methods, Implementations, and Theory* by Arnulf Jentzen, Benno Kuckuck, and Philippe von Wurstemberger. It contains all of the Python code from the book. 4 | 5 | The book is currently available as a preprint [on the arXiv](https://arxiv.org/abs/2310.20360). -------------------------------------------------------------------------------- /code/activation_functions/repu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,3)) 7 | ax.set_ylim(-.5, 3) 8 | 9 | x = np.linspace(-2, 2, 100) 10 | 11 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 12 | ax.plot(x, tf.keras.activations.relu(x)**2, label='RePU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/repu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/clipping_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 11 | ax.plot(x, tf.keras.activations.relu(x, max_value=1), 12 | label='(0,1)-clipping') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/clipping.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/leaky_relu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 11 | ax.plot(x, tf.keras.activations.relu(x, alpha=0.1), 12 | label='leaky ReLU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/leaky_relu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/fc-ann2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | # A Module whose forward method is simply a composition of Modules 5 | # can be represented using the torch.nn.Sequential class 6 | model = nn.Sequential( 7 | nn.Linear(3, 20), 8 | nn.ReLU(), 9 | nn.Linear(20, 30), 10 | nn.ReLU(), 11 | nn.Linear(30, 1), 12 | ) 13 | 14 | # Prints a summary of the model architecture 15 | print(model) 16 | 17 | x0 = torch.Tensor([1, 2, 3]) 18 | print(model(x0)) -------------------------------------------------------------------------------- /code/activation_functions/swish_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-4,3), (-.5,3)) 7 | 8 | x = np.linspace(-4, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), label='ReLU') 11 | ax.plot(x, tf.keras.activations.gelu(x), label='GELU') 12 | ax.plot(x, tf.keras.activations.swish(x), label='swish') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/swish.pdf", bbox_inches='tight') 
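The swish_plot.py script above and the gelu_plot.py script below both lean on tf.keras.activations for the functions they draw. As a point of reference, here is a minimal NumPy-only sketch of the definitions behind those calls (this is not a file from the repository, and it assumes SciPy is available for the error function):

```python
import numpy as np
from scipy.special import erf

def sigmoid(x):
    # standard logistic function
    return 1.0 / (1.0 + np.exp(-x))

def swish(x):
    # swish / SiLU with beta = 1: x * sigmoid(x)
    return x * sigmoid(x)

def gelu(x):
    # exact (non-approximate) GELU: x * Phi(x),
    # where Phi is the standard normal CDF
    return x * 0.5 * (1.0 + erf(x / np.sqrt(2.0)))
```

Substituting these for the corresponding tf.keras.activations calls in the two plotting scripts should reproduce the same curves up to floating-point error.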
-------------------------------------------------------------------------------- /code/activation_functions/gelu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-4,3), (-.5,3)) 7 | 8 | x = np.linspace(-4, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), label='ReLU') 11 | ax.plot(x, tf.keras.activations.softplus(x), label='softplus') 12 | ax.plot(x, tf.keras.activations.gelu(x), label='GELU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/gelu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/plot_util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | def setup_axis(xlim, ylim): 4 | _, ax = plt.subplots() 5 | 6 | ax.set_aspect("equal") 7 | ax.set_xlim(xlim) 8 | ax.set_ylim(ylim) 9 | ax.spines["left"].set_position("zero") 10 | ax.spines["bottom"].set_position("zero") 11 | ax.spines["right"].set_color("none") 12 | ax.spines["top"].set_color("none") 13 | for s in ax.spines.values(): 14 | s.set_zorder(0) 15 | 16 | return ax 17 | -------------------------------------------------------------------------------- /code/activation_functions/logistic_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-.5,1.5)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x, max_value=1), 11 | label='(0,1)-clipping') 12 | ax.plot(x, tf.keras.activations.sigmoid(x), 13 | label='standard logistic') 14 | ax.legend() 15 | 16 | plt.savefig("../../plots/logistic.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/heaviside_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-.5,1.5)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 | ax.plot(x[0:50], [0]*50, 'C0') 11 | ax.plot(x[50:100], [1]*50, 'C0', label='Heaviside') 12 | ax.plot(x, tf.keras.activations.sigmoid(x), 'C1', 13 | label='standard logistic') 14 | ax.legend() 15 | 16 | plt.savefig("../../plots/heaviside.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/loss_functions/l1loss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | mae_loss = tf.keras.losses.MeanAbsoluteError( 11 | reduction=tf.keras.losses.Reduction.NONE) 12 | zero = tf.zeros([100,1]) 13 | 14 | ax.plot(x, mae_loss(x.reshape([100,1]),zero), 15 | label='ℓ¹-error') 16 | ax.legend() 17 | 18 | plt.savefig("../../plots/l1loss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/elu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-1,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 11 | ax.plot(x, tf.keras.activations.relu(x, alpha=0.1), linewidth=2, label='leaky ReLU') 12 | ax.plot(x, tf.keras.activations.elu(x), linewidth=0.9, label='ELU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/elu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/loss_functions/mseloss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | mse_loss = tf.keras.losses.MeanSquaredError( 11 | reduction=tf.keras.losses.Reduction.NONE) 12 | zero = tf.zeros([100,1]) 13 | 14 | ax.plot(x, mse_loss(x.reshape([100,1]),zero), 15 | label='Mean squared error') 16 | ax.legend() 17 | 18 | plt.savefig("../../plots/mseloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/gradient_plot1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def f(x): 5 | return x**4 - 3 * x**2 6 | 7 | def nabla_f(x): 8 | return 4 * x**3 - 6 * x 9 | 10 | plt.figure() 11 | 12 | # Plot graph of f 13 | x = np.linspace(-2,2,100) 14 | plt.plot(x,f(x)) 15 | 16 | # Plot arrows 17 | for x in np.linspace(-1.9,1.9,21): 18 | d = nabla_f(x) 19 | plt.arrow(x, f(x), -.05 * d, 0, 20 | length_includes_head=True, head_width=0.08, 21 | head_length=0.05, color='b') 22 | 23 | plt.savefig("../plots/gradient_plot1.pdf") -------------------------------------------------------------------------------- /code/activation_functions/tanh_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-1.5,1.5)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x+1, max_value=2)-1, 11 | label='(-1,1)-clipping') 12 | ax.plot(x, tf.keras.activations.sigmoid(x), 13 | label='standard logistic') 14 | ax.plot(x, tf.keras.activations.tanh(x), label='tanh') 15 | ax.legend() 16 | 17 | plt.savefig("../../plots/tanh.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/brownian_motion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def generate_brownian_motion(T, N): 5 | increments = np.random.randn(N) * np.sqrt(T/N) 6 | BM = np.cumsum(increments) 7 | BM = np.insert(BM, 0, 0) 8 | return BM 9 | 10 | T = 1 11 | N = 1000 12 | t_values = np.linspace(0, T, N+1) 13 | 14 | fig, axarr = plt.subplots(2, 2) 15 | 16 | for i in range(2): 17 | for j in range(2): 18 | BM = generate_brownian_motion(T, N) 19 | axarr[i, j].plot(t_values, BM) 20 | 21 | plt.tight_layout() 22 | plt.savefig('../plots/brownian_motions.pdf') 23 | plt.show() -------------------------------------------------------------------------------- /code/loss_functions/crossentropyloss_plot.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((0,1), (0,3)) 7 | 8 | ax.set_aspect(.3) 9 | 10 | x = np.linspace(0, 1, 100) 11 | 12 | cce_loss = tf.keras.losses.CategoricalCrossentropy( 13 | reduction=tf.keras.losses.Reduction.NONE) 14 | y = tf.constant([[0.3, 0.7]] * 100, shape=(100, 2)) 15 | 16 | X = tf.stack([x,1-x], axis=1) 17 | 18 | ax.plot(x, cce_loss(y,X), label='Cross-entropy') 19 | ax.legend() 20 | 21 | plt.savefig("../../plots/crossentropyloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/conv-ann-ex.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | model = nn.Sequential( 6 | nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(2, 2)), 7 | nn.ReLU(), 8 | nn.Conv2d(in_channels=2, out_channels=1, kernel_size=(1, 1)), 9 | ) 10 | 11 | with torch.no_grad(): 12 | model[0].weight.set_( 13 | torch.Tensor([[[[0, 0], [0, 0]]], [[[1, 0], [0, 1]]]]) 14 | ) 15 | model[0].bias.set_(torch.Tensor([1, -1])) 16 | model[2].weight.set_(torch.Tensor([[[[-2]], [[2]]]])) 17 | model[2].bias.set_(torch.Tensor([3])) 18 | 19 | x0 = torch.Tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]) 20 | print(model(x0)) 21 | -------------------------------------------------------------------------------- /code/res-ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class ResidualANN(nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | self.affine1 = nn.Linear(3, 10) 8 | self.activation1 = nn.ReLU() 9 | self.affine2 = nn.Linear(10, 20) 10 | self.activation2 = nn.ReLU() 11 | self.affine3 = nn.Linear(20, 10) 12 | self.activation3 = nn.ReLU() 13 | self.affine4 = nn.Linear(10, 1) 14 | 15 | def forward(self, x0): 16 | x1 = self.activation1(self.affine1(x0)) 17 | x2 = self.activation2(self.affine2(x1)) 18 | x3 = self.activation3(x1 + self.affine3(x2)) 19 | x4 = self.affine4(x3) 20 | return x4 21 | -------------------------------------------------------------------------------- /code/loss_functions/kldloss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((0,1), (0,3)) 7 | 8 | ax.set_aspect(.3) 9 | 10 | x = np.linspace(0, 1, 100) 11 | 12 | kld_loss = tf.keras.losses.KLDivergence( 13 | reduction=tf.keras.losses.Reduction.NONE) 14 | cce_loss = tf.keras.losses.CategoricalCrossentropy( 15 | reduction=tf.keras.losses.Reduction.NONE) 16 | y = tf.constant([[0.3, 0.7]] * 100, shape=(100, 2)) 17 | 18 | X = tf.stack([x,1-x], axis=1) 19 | 20 | ax.plot(x, kld_loss(y,X), label='Kullback-Leibler divergence') 21 | ax.plot(x, cce_loss(y,X), label='Cross-entropy') 22 | ax.legend() 23 | 24 | plt.savefig("../../plots/kldloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/loss_functions/huberloss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-.5,4)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 
| mse_loss = tf.keras.losses.MeanSquaredError( 11 | reduction=tf.keras.losses.Reduction.NONE) 12 | mae_loss = tf.keras.losses.MeanAbsoluteError( 13 | reduction=tf.keras.losses.Reduction.NONE) 14 | huber_loss = tf.keras.losses.Huber( 15 | reduction=tf.keras.losses.Reduction.NONE) 16 | 17 | zero = tf.zeros([100,1]) 18 | 19 | ax.plot(x, mse_loss(x.reshape([100,1]),zero)/2., 20 | label='Scaled mean squared error') 21 | ax.plot(x, mae_loss(x.reshape([100,1]),zero), 22 | label='ℓ¹-error') 23 | ax.plot(x, huber_loss(x.reshape([100,1]),zero), 24 | label='1-Huber-error') 25 | ax.legend() 26 | 27 | plt.savefig("../../plots/huberloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/gradient_plot2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | K = [1., 10.] 5 | vartheta = np.array([1., 1.]) 6 | 7 | def f(x, y): 8 | result = K[0] / 2. * np.abs(x - vartheta[0])**2 \ 9 | + K[1] / 2. * np.abs(y - vartheta[1])**2 10 | return result 11 | 12 | def nabla_f(x): 13 | return K * (x - vartheta) 14 | 15 | plt.figure() 16 | 17 | # Plot contour lines of f 18 | x = np.linspace(-3., 7., 100) 19 | y = np.linspace(-2., 4., 100) 20 | X, Y = np.meshgrid(x, y) 21 | Z = f(X, Y) 22 | cp = plt.contour(X, Y, Z, colors="black", 23 | levels = [0.5,2,4,8,16], 24 | linestyles=":") 25 | 26 | # Plot arrows along contour lines 27 | for l in [0.5,2,4,8,16]: 28 | for d in np.linspace(0, 2.*np.pi, 10, endpoint=False): 29 | x = np.cos(d) / ((K[0] / (2*l))**.5) + vartheta[0] 30 | y = np.sin(d) / ((K[1] / (2*l))**.5) + vartheta[1] 31 | grad = nabla_f(np.array([x,y])) 32 | plt.arrow(x, y, -.05 * grad[0], -.05 * grad[1], 33 | length_includes_head=True, head_width=.08, 34 | head_length=.1, color='b') 35 | 36 | plt.savefig("../plots/gradient_plot2.pdf") -------------------------------------------------------------------------------- /code/conv-ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ConvolutionalANN(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | # The convolutional layer defined here takes any tensor of 9 | # shape (1, n, m) [a single input] or (N, 1, n, m) [a batch 10 | # of N inputs] where N, n, m are natural numbers satisfying 11 | # n >= 3 and m >= 3. 
12 | self.conv1 = nn.Conv2d( 13 | in_channels=1, out_channels=5, kernel_size=(3, 3) 14 | ) 15 | self.activation1 = nn.ReLU() 16 | self.conv2 = nn.Conv2d( 17 | in_channels=5, out_channels=5, kernel_size=(5, 3) 18 | ) 19 | 20 | def forward(self, x0): 21 | x1 = self.activation1(self.conv1(x0)) 22 | print(x1.shape) 23 | x2 = self.conv2(x1) 24 | print(x2.shape) 25 | return x2 26 | 27 | 28 | model = ConvolutionalANN() 29 | x0 = torch.rand(1, 20, 20) 30 | # This will print the shapes of the outputs of the two layers of 31 | # the model, in this case: 32 | # torch.Size([5, 18, 18]) 33 | # torch.Size([5, 14, 16]) 34 | model(x0) 35 | -------------------------------------------------------------------------------- /code/optimization_methods/adagrad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.02 20 | eps = 1e-10 21 | 22 | sum_sq_grad = [p.clone().detach().fill_(eps) for p in net.parameters()] 23 | 24 | for n in range(N): 25 | indices = torch.randint(0, M, (J,)) 26 | 27 | x = X[indices] 28 | y = Y[indices] 29 | 30 | net.zero_grad() 31 | 32 | loss_val = loss(net(x), y) 33 | loss_val.backward() 34 | 35 | with torch.no_grad(): 36 | for a, p in zip(sum_sq_grad, net.parameters()): 37 | a.add_(p.grad * p.grad) 38 | p.sub_(lr * a.rsqrt() * p.grad) 39 | 40 | if n % 1000 == 0: 41 | with torch.no_grad(): 42 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 43 | y = torch.sin(x) 44 | loss_val = loss(net(x), y) 45 | print(f"Iteration: {n+1}, Loss: {loss_val}") 46 | -------------------------------------------------------------------------------- /code/fc-ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FullyConnectedANN(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | # Define the layers of the network in terms of Modules. 9 | # nn.Linear(3, 20) represents an affine function defined 10 | # by a 20x3 weight matrix and a 20-dimensional bias vector. 11 | self.affine1 = nn.Linear(3, 20) 12 | # The torch.nn.ReLU class simply wraps the 13 | # torch.nn.functional.relu function as a Module. 
14 | self.activation1 = nn.ReLU() 15 | self.affine2 = nn.Linear(20, 30) 16 | self.activation2 = nn.ReLU() 17 | self.affine3 = nn.Linear(30, 1) 18 | 19 | def forward(self, x0): 20 | x1 = self.activation1(self.affine1(x0)) 21 | x2 = self.activation2(self.affine2(x1)) 22 | x3 = self.affine3(x2) 23 | return x3 24 | 25 | 26 | model = FullyConnectedANN() 27 | 28 | x0 = torch.Tensor([1, 2, 3]) 29 | print(model(x0)) 30 | 31 | # Assigning a Module to an instance variable of a Module registers 32 | # all of the former's parameters as parameters of the latter 33 | for p in model.parameters(): 34 | print(p) -------------------------------------------------------------------------------- /code/optimization_methods/rmsprop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.001 20 | beta = 0.9 21 | eps = 1e-10 22 | 23 | moments = [p.clone().detach().zero_() for p in net.parameters()] 24 | 25 | for n in range(N): 26 | indices = torch.randint(0, M, (J,)) 27 | 28 | x = X[indices] 29 | y = Y[indices] 30 | 31 | net.zero_grad() 32 | 33 | loss_val = loss(net(x), y) 34 | loss_val.backward() 35 | 36 | with torch.no_grad(): 37 | for m, p in zip(moments, net.parameters()): 38 | m.mul_(beta) 39 | m.add_((1 - beta) * p.grad * p.grad) 40 | p.sub_(lr * (eps + m).rsqrt() * p.grad) 41 | 42 | if n % 1000 == 0: 43 | with torch.no_grad(): 44 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 45 | y = torch.sin(x) 46 | loss_val = loss(net(x), y) 47 | print(f"Iteration: {n+1}, Loss: {loss_val}") 48 | -------------------------------------------------------------------------------- /code/optimization_methods/momentum_sgd_bias_adj.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.01 20 | alpha = 0.99 21 | adj = 1 22 | 23 | momentum = [p.clone().detach().zero_() for p in net.parameters()] 24 | 25 | for n in range(N): 26 | indices = torch.randint(0, M, (J,)) 27 | 28 | x = X[indices] 29 | y = Y[indices] 30 | 31 | net.zero_grad() 32 | 33 | loss_val = loss(net(x), y) 34 | loss_val.backward() 35 | 36 | adj *= alpha 37 | 38 | with torch.no_grad(): 39 | for m, p in zip(momentum, net.parameters()): 40 | m.mul_(alpha) 41 | m.add_((1-alpha) * p.grad) 42 | p.sub_(lr * m / (1 - adj)) 43 | 44 | if n % 1000 == 0: 45 | with torch.no_grad(): 46 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 47 | y = torch.sin(x) 48 | loss_val = loss(net(x), y) 49 | print(f"Iteration: {n+1}, Loss: {loss_val}") 50 | -------------------------------------------------------------------------------- /code/optimization_methods/rmsprop_bias_adj.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 
12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.001 20 | beta = 0.9 21 | eps = 1e-10 22 | adj = 1 23 | 24 | moments = [p.clone().detach().zero_() for p in net.parameters()] 25 | 26 | for n in range(N): 27 | indices = torch.randint(0, M, (J,)) 28 | 29 | x = X[indices] 30 | y = Y[indices] 31 | 32 | net.zero_grad() 33 | 34 | loss_val = loss(net(x), y) 35 | loss_val.backward() 36 | 37 | with torch.no_grad(): 38 | adj *= beta 39 | for m, p in zip(moments, net.parameters()): 40 | m.mul_(beta) 41 | m.add_((1 - beta) * p.grad * p.grad) 42 | p.sub_(lr * (eps + (m / (1 - adj)).sqrt()).reciprocal() * p.grad) 43 | 44 | if n % 1000 == 0: 45 | with torch.no_grad(): 46 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 47 | y = torch.sin(x) 48 | loss_val = loss(net(x), y) 49 | print(f"Iteration: {n+1}, Loss: {loss_val}") 50 | -------------------------------------------------------------------------------- /code/optimization_methods/adadelta.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | beta = 0.9 20 | delta = 0.9 21 | eps = 1e-10 22 | 23 | moments = [p.clone().detach().zero_() for p in net.parameters()] 24 | Delta = [p.clone().detach().zero_() for p in net.parameters()] 25 | 26 | for n in range(N): 27 | indices = torch.randint(0, M, (J,)) 28 | 29 | x = X[indices] 30 | y = Y[indices] 31 | 32 | net.zero_grad() 33 | 34 | loss_val = loss(net(x), y) 35 | loss_val.backward() 36 | 37 | with torch.no_grad(): 38 | for m, D, p in zip(moments, Delta, net.parameters()): 39 | m.mul_(beta) 40 | m.add_((1 - beta) * p.grad * p.grad) 41 | inc = ((eps + D) / (eps + m)).sqrt() * p.grad 42 | p.sub_(inc) 43 | D.mul_(delta) 44 | D.add_((1 - delta) * inc * inc) 45 | 46 | if n % 1000 == 0: 47 | with torch.no_grad(): 48 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 49 | y = torch.sin(x) 50 | loss_val = loss(net(x), y) 51 | print(f"Iteration: {n+1}, Loss: {loss_val}") 52 | -------------------------------------------------------------------------------- /code/optimization_methods/adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.0001 20 | alpha = 0.9 21 | beta = 0.999 22 | eps = 1e-8 23 | adj = 1. 24 | adj2 = 1. 
25 | 26 | m = [p.clone().detach().zero_() for p in net.parameters()] 27 | MM = [p.clone().detach().zero_() for p in net.parameters()] 28 | 29 | for n in range(N): 30 | indices = torch.randint(0, M, (J,)) 31 | 32 | x = X[indices] 33 | y = Y[indices] 34 | 35 | net.zero_grad() 36 | 37 | loss_val = loss(net(x), y) 38 | loss_val.backward() 39 | 40 | with torch.no_grad(): 41 | adj *= alpha 42 | adj2 *= beta 43 | for m_p, M_p, p in zip(m, MM, net.parameters()): 44 | m_p.mul_(alpha) 45 | m_p.add_((1 - alpha) * p.grad) 46 | M_p.mul_(beta) 47 | M_p.add_((1 - beta) * p.grad * p.grad) 48 | p.sub_(lr * m_p / ((1 - adj) * (eps + (M_p / (1 - adj2)).sqrt()))) 49 | 50 | if n % 1000 == 0: 51 | with torch.no_grad(): 52 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 53 | y = torch.sin(x) 54 | loss_val = loss(net(x), y) 55 | print(f"Iteration: {n+1}, Loss: {loss_val}") 56 | -------------------------------------------------------------------------------- /code/optimization_methods/nesterov_sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.003 20 | alpha = 0.999 21 | 22 | m = [p.clone().detach().zero_() for p in net.parameters()] 23 | 24 | for n in range(N): 25 | indices = torch.randint(0, M, (J,)) 26 | 27 | x = X[indices] 28 | y = Y[indices] 29 | 30 | net.zero_grad() 31 | 32 | # Remember the original parameters 33 | params = [p.clone().detach() for p in net.parameters()] 34 | 35 | for p, m_p in zip(params, m): 36 | p.sub_(lr * alpha * m_p) 37 | 38 | # Compute the loss 39 | loss_val = loss(net(x), y) 40 | # Compute the gradients with respect to the parameters 41 | loss_val.backward() 42 | 43 | with torch.no_grad(): 44 | for p, m_p, q in zip(net.parameters(), m, params): 45 | m_p.mul_(alpha) 46 | m_p.add_((1 - alpha) * p.grad) 47 | q.sub_(lr * m_p) 48 | p.copy_(q) 49 | 50 | if n % 1000 == 0: 51 | with torch.no_grad(): 52 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 53 | y = torch.sin(x) 54 | loss_val = loss(net(x), y) 55 | print(f"Iteration: {n+1}, Loss: {loss_val}") 56 | -------------------------------------------------------------------------------- /code/fc-ann-manual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # To define a neural network, we define a class that inherits from 7 | # torch.nn.Module 8 | class FullyConnectedANN(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | # In the constructor, we define the weights and biases. 12 | # Wrapping the tensors in torch.nn.Parameter objects tells 13 | # PyTorch that these are parameters that should be 14 | # optimized during training. 
15 | self.W1 = nn.Parameter( 16 | torch.Tensor([[1, 0], [0, -1], [-2, 2]]) 17 | ) 18 | self.B1 = nn.Parameter(torch.Tensor([0, 2, -1])) 19 | self.W2 = nn.Parameter(torch.Tensor([[1, -2, 3]])) 20 | self.B2 = nn.Parameter(torch.Tensor([1])) 21 | 22 | # The realization function of the network 23 | def forward(self, x0): 24 | x1 = F.relu(self.W1 @ x0 + self.B1) 25 | x2 = self.W2 @ x1 + self.B2 26 | return x2 27 | 28 | 29 | model = FullyConnectedANN() 30 | 31 | x0 = torch.Tensor([1, 2]) 32 | # Print the output of the realization function for input x0 33 | print(model.forward(x0)) 34 | 35 | # As a consequence of inheriting from torch.nn.Module we can just 36 | # "call" the model itself (which will call the forward method 37 | # implicitly) 38 | print(model(x0)) 39 | 40 | # Wrapping a tensor in a Parameter object and assigning it to an 41 | # instance variable of the Module makes PyTorch register it as a 42 | # parameter. We can access all parameters via the parameters 43 | # method. 44 | for p in model.parameters(): 45 | print(p) 46 | -------------------------------------------------------------------------------- /code/example_GD_momentum_plots.py: -------------------------------------------------------------------------------- 1 | # Example for GD and momentum GD 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Number of steps for the schemes 7 | N = 8 8 | 9 | # Problem setting 10 | d = 2 11 | K = [1., 10.] 12 | 13 | vartheta = np.array([1., 1.]) 14 | xi = np.array([5., 3.]) 15 | 16 | def f(x, y): 17 | result = K[0] / 2. * np.abs(x - vartheta[0])**2 \ 18 | + K[1] / 2. * np.abs(y - vartheta[1])**2 19 | return result 20 | 21 | def nabla_f(x): 22 | return K * (x - vartheta) 23 | 24 | # Coefficients for GD 25 | gamma_GD = 2 /(K[0] + K[1]) 26 | 27 | # Coefficients for momentum 28 | gamma_momentum = 0.3 29 | alpha = 0.5 30 | 31 | # Placeholder for processes 32 | Theta = np.zeros((N+1, d)) 33 | M = np.zeros((N+1, d)) 34 | m = np.zeros((N+1, d)) 35 | 36 | Theta[0] = xi 37 | M[0] = xi 38 | 39 | # Perform gradient descent 40 | for i in range(N): 41 | Theta[i+1] = Theta[i] - gamma_GD * nabla_f(Theta[i]) 42 | 43 | # Perform momentum GD 44 | for i in range(N): 45 | m[i+1] = alpha * m[i] + (1 - alpha) * nabla_f(M[i]) 46 | M[i+1] = M[i] - gamma_momentum * m[i+1] 47 | 48 | 49 | ### Plot ### 50 | plt.figure() 51 | 52 | # Plot the gradient descent process 53 | plt.plot(Theta[:, 0], Theta[:, 1], 54 | label = "GD", color = "c", 55 | linestyle = "--", marker = "*") 56 | 57 | # Plot the momentum gradient descent process 58 | plt.plot(M[:, 0], M[:, 1], 59 | label = "Momentum", color = "orange", marker = "*") 60 | 61 | # Target value 62 | plt.scatter(vartheta[0],vartheta[1], 63 | label = "vartheta", color = "red", marker = "x") 64 | 65 | # Plot contour lines of f 66 | x = np.linspace(-3., 7., 100) 67 | y = np.linspace(-2., 4., 100) 68 | X, Y = np.meshgrid(x, y) 69 | Z = f(X, Y) 70 | cp = plt.contour(X, Y, Z, colors="black", 71 | levels = [0.5,2,4,8,16], 72 | linestyles=":") 73 | 74 | plt.legend() 75 | plt.savefig("../plots/GD_momentum_plots.pdf") 76 | -------------------------------------------------------------------------------- /code/optimization_methods/midpoint_sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = 
torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.003 20 | 21 | for n in range(N): 22 | indices = torch.randint(0, M, (J,)) 23 | 24 | x = X[indices] 25 | y = Y[indices] 26 | 27 | net.zero_grad() 28 | 29 | # Remember the original parameters 30 | params = [p.clone().detach() for p in net.parameters()] 31 | # Compute the loss 32 | loss_val = loss(net(x), y) 33 | # Compute the gradients with respect to the parameters 34 | loss_val.backward() 35 | 36 | with torch.no_grad(): 37 | # Make a half-step in the direction of the negative 38 | # gradient 39 | for p in net.parameters(): 40 | if p.grad is not None: 41 | p.sub_(0.5 * lr * p.grad) 42 | 43 | net.zero_grad() 44 | # Compute the loss and the gradients at the midpoint 45 | loss_val = loss(net(x), y) 46 | loss_val.backward() 47 | 48 | with torch.no_grad(): 49 | # Subtract the scaled gradient at the midpoint from the 50 | # original parameters 51 | for param, midpoint_param in zip( 52 | params, net.parameters() 53 | ): 54 | param.sub_(lr * midpoint_param.grad) 55 | 56 | # Copy the new parameters into the model 57 | for param, p in zip(params, net.parameters()): 58 | p.copy_(param) 59 | 60 | if n % 1000 == 0: 61 | with torch.no_grad(): 62 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 63 | y = torch.sin(x) 64 | loss_val = loss(net(x), y) 65 | print(f"Iteration: {n+1}, Loss: {loss_val}") 66 | -------------------------------------------------------------------------------- /code/optimization_methods/momentum_sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | M = 10000 7 | 8 | torch.manual_seed(0) 9 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 10 | Y = torch.sin(X) 11 | 12 | J = 64 13 | 14 | N = 100000 15 | 16 | loss = nn.MSELoss() 17 | lr = 0.01 18 | alpha = 0.999 19 | 20 | fig, axs = plt.subplots(1, 4, figsize=(12, 3), sharey='row') 21 | 22 | net = nn.Sequential( 23 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 24 | ) 25 | 26 | for i, alpha in enumerate([0, 0.9, 0.99, 0.999]): 27 | print(f"alpha = {alpha}") 28 | 29 | for lr in [0.1, 0.03, 0.01, 0.003]: 30 | torch.manual_seed(0) 31 | net.apply( 32 | lambda m: m.reset_parameters() 33 | if isinstance(m, nn.Linear) 34 | else None 35 | ) 36 | 37 | momentum = [ 38 | p.clone().detach().zero_() for p in net.parameters() 39 | ] 40 | 41 | losses = [] 42 | print(f"lr = {lr}") 43 | 44 | for n in range(N): 45 | indices = torch.randint(0, M, (J,)) 46 | 47 | x = X[indices] 48 | y = Y[indices] 49 | 50 | net.zero_grad() 51 | 52 | loss_val = loss(net(x), y) 53 | loss_val.backward() 54 | 55 | with torch.no_grad(): 56 | for m, p in zip(momentum, net.parameters()): 57 | m.mul_(alpha) 58 | m.add_((1 - alpha) * p.grad) 59 | p.sub_(lr * m) 60 | 61 | if n % 100 == 0: 62 | with torch.no_grad(): 63 | x = (torch.rand((1000, 1)) - 0.5) * 4 * np.pi 64 | y = torch.sin(x) 65 | loss_val = loss(net(x), y) 66 | losses.append(loss_val.item()) 67 | 68 | axs[i].plot(losses, label=f"$\\gamma = {lr}$") 69 | 70 | axs[i].set_yscale("log") 71 | axs[i].set_ylim([1e-6, 1]) 72 | axs[i].set_title(f"$\\alpha = {alpha}$") 73 | 74 | axs[0].legend() 75 | 76 | plt.tight_layout() 77 | plt.savefig("../plots/sgd_momentum.pdf", bbox_inches='tight') 78 | -------------------------------------------------------------------------------- /code/optimization_methods/sgd2.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | def plot_heatmap(ax, g): 7 | x = np.linspace(-2 * np.pi, 2 * np.pi, 100) 8 | y = np.linspace(-2 * np.pi, 2 * np.pi, 100) 9 | x, y = np.meshgrid(x, y) 10 | 11 | # flatten the grid to [num_points, 2] and convert to tensor 12 | grid = np.vstack([x.flatten(), y.flatten()]).T 13 | grid_torch = torch.from_numpy(grid).float() 14 | 15 | # pass the grid through the network 16 | z = g(grid_torch) 17 | 18 | # reshape the predictions back to a 2D grid 19 | Z = z.numpy().reshape(x.shape) 20 | 21 | # plot the heatmap 22 | ax.imshow(Z, origin='lower', extent=(-2 * np.pi, 2 * np.pi, 23 | -2 * np.pi, 2 * np.pi)) 24 | 25 | M = 10000 26 | 27 | def f(x): 28 | return torch.sin(x).prod(dim=1, keepdim=True) 29 | 30 | torch.manual_seed(0) 31 | X = torch.rand((M, 2)) * 4 * np.pi - 2 * np.pi 32 | Y = f(X) 33 | 34 | J = 32 35 | 36 | N = 100000 37 | 38 | loss = nn.MSELoss() 39 | gamma = 0.05 40 | 41 | fig, axs = plt.subplots( 42 | 3, 3, figsize=(12, 12), sharex="col", sharey="row", 43 | ) 44 | 45 | net = nn.Sequential( 46 | nn.Linear(2, 50), 47 | nn.Softplus(), 48 | nn.Linear(50,50), 49 | nn.Softplus(), 50 | nn.Linear(50, 1) 51 | ) 52 | 53 | plot_after = [0, 100, 300, 1000, 3000, 10000, 30000, 100000] 54 | 55 | for n in range(N + 1): 56 | indices = torch.randint(0, M, (J,)) 57 | 58 | x = X[indices] 59 | y = Y[indices] 60 | 61 | net.zero_grad() 62 | 63 | loss_val = loss(net(x), y) 64 | loss_val.backward() 65 | 66 | with torch.no_grad(): 67 | for p in net.parameters(): 68 | p.sub_(gamma * p.grad) 69 | 70 | if n in plot_after: 71 | i = plot_after.index(n) 72 | 73 | with torch.no_grad(): 74 | plot_heatmap(axs[i // 3][i % 3], net) 75 | axs[i // 3][i % 3].set_title(f"Batch {n}") 76 | 77 | with torch.no_grad(): 78 | plot_heatmap(axs[2][2], f) 79 | axs[2][2].set_title("Target") 80 | 81 | plt.tight_layout() 82 | plt.savefig("../../plots/sgd2.pdf", bbox_inches="tight") 83 | -------------------------------------------------------------------------------- /code/optimization_methods/sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | M = 10000 # number of training samples 7 | 8 | # We fix a random seed. This is not necessary for training a 9 | # neural network, but we use it here to ensure that the same 10 | # plot is created on every run. 11 | torch.manual_seed(0) 12 | 13 | # Here, we define the training set. 14 | # Create a tensor of shape (M, 1) with entries sampled from a 15 | # uniform distribution on [-2 * pi, 2 * pi) 16 | X = (torch.rand((M, 1)) - 0.5) * 4 * np.pi 17 | # We use the sine as the target function, so this defines the 18 | # desired outputs. 
19 | Y = torch.sin(X) 20 | 21 | J = 32 # the batch size 22 | N = 100000 # the number of SGD iterations 23 | 24 | loss = nn.MSELoss() # the mean squared error loss function 25 | gamma = 0.003 # the learning rate 26 | 27 | # Define a network with a single hidden layer of 200 neurons and 28 | # tanh activation function 29 | net = nn.Sequential( 30 | nn.Linear(1, 200), nn.Tanh(), nn.Linear(200, 1) 31 | ) 32 | 33 | # Set up a 3x3 grid of plots 34 | fig, axs = plt.subplots( 35 | 3, 36 | 3, 37 | figsize=(12, 8), 38 | sharex="col", 39 | sharey="row", 40 | ) 41 | 42 | # Plot the target function 43 | x = torch.linspace(-2 * np.pi, 2 * np.pi, 1000).reshape((1000, 1)) 44 | y = torch.sin(x) 45 | for ax in axs.flatten(): 46 | ax.plot(x, y, label="Target") 47 | ax.set_xlim([-2 * np.pi, 2 * np.pi]) 48 | ax.set_ylim([-1.1, 1.1]) 49 | 50 | plot_after = [1, 30, 100, 300, 1000, 3000, 10000, 30000, 100000] 51 | 52 | # The training loop 53 | for n in range(N): 54 | # Choose J samples randomly from the training set 55 | indices = torch.randint(0, M, (J,)) 56 | X_batch = X[indices] 57 | Y_batch = Y[indices] 58 | 59 | net.zero_grad() # Zero out the gradients 60 | 61 | loss_val = loss(net(X_batch), Y_batch) # Compute the loss 62 | loss_val.backward() # Compute the gradients 63 | 64 | # Update the parameters 65 | with torch.no_grad(): 66 | for p in net.parameters(): 67 | # Subtract the scaled gradient in-place 68 | p.sub_(gamma * p.grad) 69 | 70 | if n + 1 in plot_after: 71 | # Plot the realization function of the ANN 72 | i = plot_after.index(n + 1) 73 | ax = axs[i // 3][i % 3] 74 | ax.set_title(f"Batch {n+1}") 75 | 76 | with torch.no_grad(): 77 | ax.plot(x, net(x), label="ANN realization") 78 | 79 | axs[0][0].legend(loc="upper right") 80 | 81 | plt.tight_layout() 82 | plt.savefig("../../plots/sgd.pdf", bbox_inches="tight") 83 | -------------------------------------------------------------------------------- /code/kolmogorov.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib.pyplot as plt 3 | 4 | # Use the GPU if available 5 | dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | 7 | # Computes an approximation of E[|phi(sqrt(2*rho*T) W + xi) - 8 | # N(xi)|²] with W a standard normal random variable using the rows 9 | # of x as # independent realizations of the random variable xi 10 | def loss(N, rho, phi, t, x): 11 | W = torch.randn_like(x).to(dev) 12 | return (phi(torch.sqrt(2 * rho * t) * W + x) - 13 | N(torch.cat((t,x),1))).square().mean() 14 | 15 | d = 2 # the input dimension 16 | a, b = -5.0, 5.0 # the domain will be [a,b]^d 17 | T = 2.0 # the time horizon 18 | rho = 1.0 # the diffusivity 19 | 20 | # Define the initial value 21 | def phi(x): 22 | return x.cos().sum(axis=1, keepdim=True) 23 | 24 | # Define a neural network with two hidden layers with 50 neurons 25 | # each using ReLU activations 26 | N = torch.nn.Sequential( 27 | torch.nn.Linear(d+1, 50), torch.nn.ReLU(), 28 | torch.nn.Linear(50, 50), torch.nn.ReLU(), 29 | torch.nn.Linear(50, 1) 30 | ).to(dev) 31 | 32 | # Configure the training parameters and optimization algorithm 33 | steps = 3000 34 | batch_size = 256 35 | optimizer = torch.optim.Adam(N.parameters()) 36 | 37 | # Train the network 38 | for step in range(steps): 39 | # Generate uniformly distributed samples from [a,b]^d 40 | x = (torch.rand(batch_size, d) * (b-a) + a).to(dev) 41 | t = T * torch.rand(batch_size, 1).to(dev) 42 | 43 | optimizer.zero_grad() 44 | # Compute the loss 45 | L = loss(N, rho, phi, 
t, x) 46 | # Compute the gradients 47 | L.backward() 48 | # Apply changes to weights and biases of N 49 | optimizer.step() 50 | 51 | # Plot the result at M+1 timesteps 52 | M = 5 53 | mesh = 128 54 | 55 | def toNumpy(t): 56 | return t.detach().cpu().numpy().reshape((mesh,mesh)) 57 | 58 | fig, axs = plt.subplots(2,3,subplot_kw=dict(projection='3d')) 59 | fig.set_size_inches(16, 10) 60 | fig.set_dpi(300) 61 | 62 | for i in range(M+1): 63 | x = torch.linspace(a, b, mesh) 64 | y = torch.linspace(a, b, mesh) 65 | x, y = torch.meshgrid(x, y, indexing='xy') 66 | x = x.reshape((mesh*mesh,1)).to(dev) 67 | y = y.reshape((mesh*mesh,1)).to(dev) 68 | z = N(torch.cat((i*T/M*torch.ones(128*128,1).to(dev), x, y), 69 | 1)) 70 | 71 | axs[i//3,i%3].set_title(f"t = {i * T / M}") 72 | axs[i//3,i%3].set_zlim(-2,2) 73 | axs[i//3,i%3].plot_surface(toNumpy(x), toNumpy(y), toNumpy(z), 74 | cmap='viridis') 75 | 76 | fig.savefig(f"../plots/kolmogorov.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/dgm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib.pyplot as plt 3 | from torch.autograd import grad 4 | from matplotlib.gridspec import GridSpec 5 | from matplotlib.cm import ScalarMappable 6 | 7 | 8 | dev = torch.device("cuda:0" if torch.cuda.is_available() else 9 | "cpu") 10 | 11 | T = 3.0 # the time horizom 12 | 13 | # The initial value 14 | def phi(x): 15 | return x.sin().prod(axis=1, keepdims=True) 16 | 17 | torch.manual_seed(0) 18 | 19 | # We use a network with 4 hidden layers of 50 neurons each and the 20 | # Swish activation function (called SiLU in PyTorch) 21 | N = torch.nn.Sequential( 22 | torch.nn.Linear(3, 50), torch.nn.SiLU(), 23 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 24 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 25 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 26 | torch.nn.Linear(50, 1), 27 | ).to(dev) 28 | 29 | optimizer = torch.optim.Adam(N.parameters(), lr=3e-4) 30 | 31 | J = 256 # the batch size 32 | 33 | for i in range(30000): 34 | # Choose a random batch of training samples 35 | x = torch.randn(J, 2).to(dev) * 2 36 | t = torch.rand(J, 1).to(dev) * T 37 | 38 | x1 = x[:, 0:1] 39 | x2 = x[:, 1:2] 40 | 41 | x1.requires_grad_() 42 | x2.requires_grad_() 43 | t.requires_grad_() 44 | 45 | optimizer.zero_grad() 46 | 47 | # Denoting by u the realization function of the ANN, compute 48 | # u(0, x) for each x in the batch 49 | u0 = N(torch.hstack((torch.zeros_like(t), x))) 50 | # Compute the loss for the initial condition 51 | initial_loss = (u0 - phi(x)).square().mean() 52 | 53 | # Compute the partial derivatives using automatic 54 | # differentiation 55 | u = N(torch.hstack((t, x1, x2))) 56 | ones = torch.ones_like(u) 57 | u_t = grad(u, t, ones, create_graph=True)[0] 58 | u_x1 = grad(u, x1, ones, create_graph=True)[0] 59 | u_x2 = grad(u, x2, ones, create_graph=True)[0] 60 | ones = torch.ones_like(u_x1) 61 | u_x1x1 = grad(u_x1, x1, ones, create_graph=True)[0] 62 | u_x2x2 = grad(u_x2, x2, ones, create_graph=True)[0] 63 | 64 | # Compute the loss for the PDE 65 | Laplace = u_x1x1 + u_x2x2 66 | pde_loss = (u_t - (0.005 * Laplace + u - u**3)).square().mean() 67 | 68 | # Compute the total loss and perform a gradient step 69 | loss = initial_loss + pde_loss 70 | loss.backward() 71 | optimizer.step() 72 | 73 | 74 | ### Plot the solution at different times 75 | 76 | mesh = 128 77 | a, b = -torch.pi, torch.pi 78 | 79 | gs = GridSpec(2, 4, width_ratios=[1, 1, 1, 0.05]) 80 | fig 
= plt.figure(figsize=(16, 10), dpi=300) 81 | 82 | x, y = torch.meshgrid( 83 | torch.linspace(a, b, mesh), 84 | torch.linspace(a, b, mesh), 85 | indexing="xy" 86 | ) 87 | x = x.reshape((mesh * mesh, 1)).to(dev) 88 | y = y.reshape((mesh * mesh, 1)).to(dev) 89 | 90 | for i in range(6): 91 | t = torch.full((mesh * mesh, 1), i * T / 5).to(dev) 92 | z = N(torch.cat((t, x, y), 1)) 93 | z = z.detach().cpu().numpy().reshape((mesh, mesh)) 94 | 95 | ax = fig.add_subplot(gs[i // 3, i % 3]) 96 | ax.set_title(f"t = {i * T / 5}") 97 | ax.imshow( 98 | z, cmap="viridis", extent=[a, b, a, b], vmin=-1.2, vmax=1.2 99 | ) 100 | 101 | # Add the colorbar to the figure 102 | norm = plt.Normalize(vmin=-1.2, vmax=1.2) 103 | sm = ScalarMappable(cmap="viridis", norm=norm) 104 | cax = fig.add_subplot(gs[:, 3]) 105 | fig.colorbar(sm, cax=cax, orientation='vertical') 106 | 107 | fig.savefig("../plots/dgm.pdf", bbox_inches="tight") -------------------------------------------------------------------------------- /code/pinn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib.pyplot as plt 3 | from torch.autograd import grad 4 | from matplotlib.gridspec import GridSpec 5 | from matplotlib.cm import ScalarMappable 6 | 7 | 8 | dev = torch.device("cuda:0" if torch.cuda.is_available() else 9 | "cpu") 10 | 11 | T = 3.0 # the time horizom 12 | M = 20000 # the number of training samples 13 | 14 | torch.manual_seed(0) 15 | 16 | x_data = torch.randn(M, 2).to(dev) * 2 17 | t_data = torch.rand(M, 1).to(dev) * T 18 | 19 | # The initial value 20 | def phi(x): 21 | return x.square().sum(axis=1, keepdims=True).sin() 22 | 23 | # We use a network with 4 hidden layers of 50 neurons each and the 24 | # Swish activation function (called SiLU in PyTorch) 25 | N = torch.nn.Sequential( 26 | torch.nn.Linear(3, 50), torch.nn.SiLU(), 27 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 28 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 29 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 30 | torch.nn.Linear(50, 1), 31 | ).to(dev) 32 | 33 | optimizer = torch.optim.Adam(N.parameters(), lr=3e-4) 34 | 35 | J = 256 # the batch size 36 | 37 | for i in range(20000): 38 | # Choose a random batch of training samples 39 | indices = torch.randint(0, M, (J,)) 40 | x = x_data[indices, :] 41 | t = t_data[indices, :] 42 | 43 | x1, x2 = x[:, 0:1], x[:, 1:2] 44 | 45 | x1.requires_grad_() 46 | x2.requires_grad_() 47 | t.requires_grad_() 48 | 49 | optimizer.zero_grad() 50 | 51 | # Denoting by u the realization function of the ANN, compute 52 | # u(0, x) for each x in the batch 53 | u0 = N(torch.hstack((torch.zeros_like(t), x))) 54 | # Compute the loss for the initial condition 55 | initial_loss = (u0 - phi(x)).square().mean() 56 | 57 | # Compute the partial derivatives using automatic 58 | # differentiation 59 | u = N(torch.hstack((t, x1, x2))) 60 | ones = torch.ones_like(u) 61 | u_t = grad(u, t, ones, create_graph=True)[0] 62 | u_x1 = grad(u, x1, ones, create_graph=True)[0] 63 | u_x2 = grad(u, x2, ones, create_graph=True)[0] 64 | ones = torch.ones_like(u_x1) 65 | u_x1x1 = grad(u_x1, x1, ones, create_graph=True)[0] 66 | u_x2x2 = grad(u_x2, x2, ones, create_graph=True)[0] 67 | 68 | # Compute the loss for the PDE 69 | Laplace = u_x1x1 + u_x2x2 70 | pde_loss = (u_t - (0.005 * Laplace + u - u**3)).square().mean() 71 | 72 | # Compute the total loss and perform a gradient step 73 | loss = initial_loss + pde_loss 74 | loss.backward() 75 | optimizer.step() 76 | 77 | 78 | ### Plot the solution at different times 79 | 80 
| mesh = 128 81 | a, b = -3, 3 82 | 83 | gs = GridSpec(2, 4, width_ratios=[1, 1, 1, 0.05]) 84 | fig = plt.figure(figsize=(16, 10), dpi=300) 85 | 86 | x, y = torch.meshgrid( 87 | torch.linspace(a, b, mesh), 88 | torch.linspace(a, b, mesh), 89 | indexing="xy" 90 | ) 91 | x = x.reshape((mesh * mesh, 1)).to(dev) 92 | y = y.reshape((mesh * mesh, 1)).to(dev) 93 | 94 | for i in range(6): 95 | t = torch.full((mesh * mesh, 1), i * T / 5).to(dev) 96 | z = N(torch.cat((t, x, y), 1)) 97 | z = z.detach().cpu().numpy().reshape((mesh, mesh)) 98 | 99 | ax = fig.add_subplot(gs[i // 3, i % 3]) 100 | ax.set_title(f"t = {i * T / 5}") 101 | ax.imshow( 102 | z, cmap="viridis", extent=[a, b, a, b], vmin=-1.2, vmax=1.2 103 | ) 104 | 105 | # Add the colorbar to the figure 106 | norm = plt.Normalize(vmin=-1.2, vmax=1.2) 107 | sm = ScalarMappable(cmap="viridis", norm=norm) 108 | cax = fig.add_subplot(gs[:, 3]) 109 | fig.colorbar(sm, cax=cax, orientation='vertical') 110 | 111 | fig.savefig("../plots/pinn.pdf", bbox_inches="tight") 112 | -------------------------------------------------------------------------------- /code/mnist_optim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.datasets as datasets 3 | import torchvision.transforms as transforms 4 | import torch.nn as nn 5 | import torch.utils.data as data 6 | import torch.optim as optim 7 | import matplotlib.pyplot as plt 8 | from matplotlib.ticker import ScalarFormatter, NullFormatter 9 | import copy 10 | 11 | # Set device as GPU if available or CPU otherwise 12 | device = torch.device( 13 | "cuda" if torch.cuda.is_available() else "cpu" 14 | ) 15 | 16 | # Fix a random seed 17 | torch.manual_seed(0) 18 | 19 | # Load the MNIST training and test datasets 20 | mnist_train = datasets.MNIST( 21 | "./data", 22 | train=True, 23 | transform=transforms.ToTensor(), 24 | download=True, 25 | ) 26 | mnist_test = datasets.MNIST( 27 | "./data", 28 | train=False, 29 | transform=transforms.ToTensor(), 30 | download=True, 31 | ) 32 | train_loader = data.DataLoader( 33 | mnist_train, batch_size=64, shuffle=True 34 | ) 35 | test_loader = data.DataLoader( 36 | mnist_test, batch_size=64, shuffle=False 37 | ) 38 | 39 | # Define a neural network 40 | net = nn.Sequential( # input shape (N, 1, 28, 28) 41 | nn.Conv2d(1, 5, 5), # (N, 5, 24, 24) 42 | nn.ReLU(), 43 | nn.Conv2d(5, 5, 3), # (N, 5, 22, 22) 44 | nn.ReLU(), 45 | nn.Conv2d(5, 3, 3), # (N, 3, 20, 20) 46 | nn.ReLU(), 47 | nn.Flatten(), # (N, 3 * 16 * 16) = (N, 1200) 48 | nn.Linear(1200, 128), # (N, 128) 49 | nn.ReLU(), 50 | nn.Linear(128, 10), # output shape (N, 10) 51 | ).to(device) 52 | 53 | # Save the initial state of the neural network 54 | initial_state = copy.deepcopy(net.state_dict()) 55 | 56 | # Define the loss function 57 | loss_fn = nn.CrossEntropyLoss() 58 | 59 | # Define the optimizers that we want to compare. 
Each entry in the 60 | # list is a tuple of a label (for the plot) and an optimizer 61 | optimizers = [ 62 | # For SGD we use a learning rate of 0.001 63 | ( 64 | "SGD", 65 | optim.SGD(net.parameters(), lr=1e-3), 66 | ), 67 | ( 68 | "SGD with momentum", 69 | optim.SGD(net.parameters(), lr=1e-3, momentum=0.9), 70 | ), 71 | ( 72 | "Nesterov SGD", 73 | optim.SGD( 74 | net.parameters(), lr=1e-3, momentum=0.9, nesterov=True 75 | ), 76 | ), 77 | # For the adaptive optimization methods we use the default 78 | # hyperparameters 79 | ( 80 | "RMSprop", 81 | optim.RMSprop(net.parameters()), 82 | ), 83 | ( 84 | "Adagrad", 85 | optim.Adagrad(net.parameters()), 86 | ), 87 | ( 88 | "Adadelta", 89 | optim.Adadelta(net.parameters()), 90 | ), 91 | ( 92 | "Adam", 93 | optim.Adam(net.parameters()), 94 | ), 95 | ] 96 | 97 | def compute_test_loss_and_accuracy(): 98 | total_test_loss = 0.0 99 | correct_count = 0 100 | with torch.no_grad(): 101 | for images, labels in test_loader: 102 | images = images.to(device) 103 | labels = labels.to(device) 104 | 105 | output = net(images) 106 | loss = loss_fn(output, labels) 107 | 108 | total_test_loss += loss.item() * images.size(0) 109 | pred_labels = torch.max(output, dim=1).indices 110 | correct_count += torch.sum( 111 | pred_labels == labels 112 | ).item() 113 | 114 | avg_test_loss = total_test_loss / len(mnist_test) 115 | accuracy = correct_count / len(mnist_test) 116 | 117 | return (avg_test_loss, accuracy) 118 | 119 | 120 | loss_plots = [] 121 | accuracy_plots = [] 122 | 123 | test_interval = 100 124 | 125 | for _, optimizer in optimizers: 126 | train_losses = [] 127 | accuracies = [] 128 | print(optimizer) 129 | 130 | with torch.no_grad(): 131 | net.load_state_dict(initial_state) 132 | 133 | i = 0 134 | for e in range(5): 135 | print(f"Epoch {e+1}") 136 | for images, labels in train_loader: 137 | images = images.to(device) 138 | labels = labels.to(device) 139 | 140 | optimizer.zero_grad() 141 | output = net(images) 142 | loss = loss_fn(output, labels) 143 | loss.backward() 144 | optimizer.step() 145 | 146 | train_losses.append(loss.item()) 147 | 148 | if (i + 1) % test_interval == 0: 149 | ( 150 | test_loss, 151 | accuracy, 152 | ) = compute_test_loss_and_accuracy() 153 | print(accuracy) 154 | accuracies.append(accuracy) 155 | 156 | i += 1 157 | 158 | loss_plots.append(train_losses) 159 | accuracy_plots.append(accuracies) 160 | 161 | WINDOW = 200 162 | 163 | _, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12)) 164 | ax1.set_yscale("log") 165 | ax2.set_yscale("logit") 166 | ax2.yaxis.set_major_formatter(ScalarFormatter()) 167 | ax2.yaxis.set_minor_formatter(NullFormatter()) 168 | for (label, _), train_losses, accuracies in zip( 169 | optimizers, loss_plots, accuracy_plots 170 | ): 171 | ax1.plot( 172 | [ 173 | sum(train_losses[max(0,i-WINDOW) : i]) / min(i, WINDOW) 174 | for i in range(1,len(train_losses)) 175 | ], 176 | label=label, 177 | ) 178 | ax2.plot( 179 | range(0, len(accuracies) * test_interval, test_interval), 180 | accuracies, 181 | label=label, 182 | ) 183 | 184 | ax1.legend() 185 | 186 | plt.tight_layout() 187 | plt.savefig("../plots/mnist_optim.pdf", bbox_inches="tight") 188 | -------------------------------------------------------------------------------- /code/mnist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.datasets as datasets 3 | import torchvision.transforms as transforms 4 | import torch.nn as nn 5 | import torch.utils.data as data 6 | import torch.optim as optim 7 
| import matplotlib.pyplot as plt 8 | from matplotlib.ticker import ScalarFormatter, NullFormatter 9 | 10 | # We use the GPU if available. Otherwise, we use the CPU. 11 | device = torch.device( 12 | "cuda" if torch.cuda.is_available() else "cpu" 13 | ) 14 | 15 | # We fix a random seed. This is not necessary for training a 16 | # neural network, but we use it here to ensure that the same 17 | # plot is created on every run. 18 | torch.manual_seed(0) 19 | 20 | # The torch.utils.data.Dataset class is an abstraction for a 21 | # collection of instances that has a length and can be indexed 22 | # (usually by integers). 23 | # The torchvision.datasets module contains functions for loading 24 | # popular machine learning datasets, possibly downloading and 25 | # transforming the data. 26 | 27 | # Here we load the MNIST dataset, containing 28x28 grayscale images 28 | # of handwritten digits with corresponding labels in 29 | # {0, 1, ..., 9}. 30 | 31 | # First load the training portion of the dataset, downloading it 32 | # from an online source to the local folder ./data (if it is not 33 | # yet there) and transforming the data to PyTorch Tensors. 34 | mnist_train = datasets.MNIST( 35 | "./data", 36 | train=True, 37 | transform=transforms.ToTensor(), 38 | download=True, 39 | ) 40 | # Next load the test portion 41 | mnist_test = datasets.MNIST( 42 | "./data", 43 | train=False, 44 | transform=transforms.ToTensor(), 45 | download=True, 46 | ) 47 | 48 | # The torch.utils.data.DataLoader class allows iterating over 49 | # datasets for training and validation. It supports, e.g., 50 | # batching and shuffling of datasets. 51 | 52 | # Construct a DataLoader that, when iterated over, returns 53 | # minibatches of 64 instances drawn from a random permutation 54 | # of the training dataset 55 | train_loader = data.DataLoader( 56 | mnist_train, batch_size=64, shuffle=True 57 | ) 58 | # The loader for the test dataset does not need shuffling 59 | test_loader = data.DataLoader( 60 | mnist_test, batch_size=64, shuffle=False 61 | ) 62 | 63 | # Define a neural network with 3 convolutional layers, each 64 | # followed by a ReLU activation, and then two affine layers, 65 | # the first of which is followed by a ReLU activation 66 | net = nn.Sequential( # input shape (N, 1, 28, 28) 67 | nn.Conv2d(1, 5, 5), # (N, 5, 24, 24) 68 | nn.ReLU(), 69 | nn.Conv2d(5, 5, 5), # (N, 5, 20, 20) 70 | nn.ReLU(), 71 | nn.Conv2d(5, 3, 5), # (N, 3, 16, 16) 72 | nn.ReLU(), 73 | nn.Flatten(), # (N, 3 * 16 * 16) = (N, 768) 74 | nn.Linear(768, 128), # (N, 128) 75 | nn.ReLU(), 76 | nn.Linear(128, 10), # output shape (N, 10) 77 | ).to(device) 78 | 79 | # Define the loss function. For every natural number d, for 80 | # e_1, e_2, ..., e_d the standard basis vectors in R^d, for L the 81 | # d-dimensional cross-entropy loss function, and for A the 82 | # d-dimensional softmax activation function, the function loss_fn 83 | # defined here satisfies for all x in R^d and all integers 84 | # i in {0, 1, ..., d-1} that 85 | # loss_fn(x, i) = L(A(x), e_{i+1}). 86 | # The function loss_fn also accepts batches of inputs, in which 87 | # case it will return the mean of the corresponding outputs. 88 | loss_fn = nn.CrossEntropyLoss() 89 | 90 | # Define the optimizer. We use the Adam SGD optimization method. 91 | optimizer = optim.Adam(net.parameters(), lr=1e-3) 92 | 93 | # This function computes the average loss of the model over the 94 | # entire test set and the accuracy of the model's predictions.
95 | def compute_test_loss_and_accuracy(): 96 | total_test_loss = 0.0 97 | correct_count = 0 98 | with torch.no_grad(): 99 | # On each iteration the test_loader will yield a 100 | # minibatch of images with corresponding labels 101 | for images, labels in test_loader: 102 | # Move the data to the device 103 | images = images.to(device) 104 | labels = labels.to(device) 105 | # Compute the output of the neural network on the 106 | # current minibatch 107 | output = net(images) 108 | # Compute the mean of the cross-entropy losses 109 | loss = loss_fn(output, labels) 110 | # For the cumulative total_test_loss, we multiply the loss by 111 | # the batch size (usually 64, as specified above, but might be 112 | # less for the final batch). 113 | total_test_loss += loss.item() * images.size(0) 114 | # For each input, the predicted label is the index of 115 | # the maximal component in the output vector. 116 | pred_labels = torch.max(output, dim=1).indices 117 | # pred_labels == labels compares the two vectors 118 | # componentwise and returns a vector of booleans. 119 | # Summing over this vector counts the number of True 120 | # entries. 121 | correct_count += torch.sum( 122 | pred_labels == labels 123 | ).item() 124 | avg_test_loss = total_test_loss / len(mnist_test) 125 | accuracy = correct_count / len(mnist_test) 126 | return (avg_test_loss, accuracy) 127 | 128 | 129 | # Initialize a list that holds the computed loss on every 130 | # batch during training 131 | train_losses = [] 132 | 133 | # Every 10 batches, we will compute the loss on the entire test 134 | # set as well as the accuracy of the model's predictions on the 135 | # entire test set. We do this for the purpose of illustrating in 136 | # the produced plot the generalization capability of the ANN. 137 | # Computing these losses and accuracies so frequently with such a 138 | # relatively large set of datapoints (compared to a single 139 | # training minibatch) is extremely computationally expensive, 140 | # however (most of the training runtime will be spent computing 141 | # these values), and so is not advisable during normal neural network training. 142 | # Usually, the test set is only used at the very end to judge the 143 | # performance of the final trained network. Often, a third set of 144 | # datapoints, called the validation set (not used to train the 145 | # network directly nor to evaluate it at the end), is used to 146 | # judge overfitting or to tune hyperparameters. 147 | test_interval = 10 148 | test_losses = [] 149 | accuracies = [] 150 | 151 | # We run the training for 5 epochs, i.e., 5 full iterations 152 | # through the training set. 153 | i = 0 154 | for e in range(5): 155 | for images, labels in train_loader: 156 | # Move the data to the device 157 | images = images.to(device) 158 | labels = labels.to(device) 159 | 160 | # Zero out the gradients 161 | optimizer.zero_grad() 162 | # Compute the output of the neural network on the current 163 | # minibatch 164 | output = net(images) 165 | # Compute the cross-entropy loss 166 | loss = loss_fn(output, labels) 167 | # Compute the gradients 168 | loss.backward() 169 | # Update the parameters of the neural network 170 | optimizer.step() 171 | 172 | # Append the current loss to the list of training losses. 173 | # Note that tracking the training loss comes at 174 | # essentially no computational cost (since we have to 175 | # compute these values anyway) and so is typically done 176 | # during neural network training to gauge the training 177 | # progress.
178 | train_losses.append(loss.item()) 179 | 180 | if (i + 1) % test_interval == 0: 181 | # Compute the average loss on the test set and the 182 | # accuracy of the model and add the values to the 183 | # corresponding list 184 | test_loss, accuracy = compute_test_loss_and_accuracy() 185 | test_losses.append(test_loss) 186 | accuracies.append(accuracy) 187 | 188 | i += 1 189 | 190 | fig, ax1 = plt.subplots(figsize=(12, 8)) 191 | # We plot the training losses, test losses, and accuracies in the 192 | # same plot, but using two different y-axes 193 | ax2 = ax1.twinx() 194 | 195 | # Use a logarithmic scale for the losses 196 | ax1.set_yscale("log") 197 | # Use a logit scale for the accuracies 198 | ax2.set_yscale("logit") 199 | ax2.set_ylim((0.3, 0.99)) 200 | N = len(test_losses) * test_interval 201 | ax2.set_xlim((0, N)) 202 | # Plot the training losses 203 | (training_loss_line,) = ax1.plot( 204 | train_losses, 205 | label="Training loss (left axis)", 206 | ) 207 | # Plot test losses 208 | (test_loss_line,) = ax1.plot( 209 | range(0, N, test_interval), 210 | test_losses, 211 | label="Test loss (left axis)", 212 | ) 213 | # Plot the accuracies 214 | (accuracies_line,) = ax2.plot( 215 | range(0, N, test_interval), 216 | accuracies, 217 | label="Accuracy (right axis)", 218 | color="red", 219 | ) 220 | ax2.yaxis.set_major_formatter(ScalarFormatter()) 221 | ax2.yaxis.set_minor_formatter(NullFormatter()) 222 | 223 | # Put all the labels in a common legend 224 | lines = [training_loss_line, test_loss_line, accuracies_line] 225 | labels = [l.get_label() for l in lines] 226 | ax2.legend(lines, labels) 227 | 228 | plt.tight_layout() 229 | plt.savefig("../plots/mnist.pdf", bbox_inches="tight") 230 | --------------------------------------------------------------------------------
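A note on the loss function used in mnist.py: the comment preceding loss_fn = nn.CrossEntropyLoss() states that this loss, applied to a logit vector x and a class index i, equals the cross-entropy between the softmax of x and the corresponding one-hot basis vector, i.e. the negative logarithm of the softmax probability assigned to class i. The snippet below is not part of the repository; it is a minimal standalone sketch that checks this identity numerically, assuming only a standard PyTorch installation, with the dimension 10 and the class index 3 chosen arbitrarily for illustration.

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()

# An arbitrary logit vector x in R^10 and an arbitrary class index i
x = torch.randn(10)
i = 3

# nn.CrossEntropyLoss expects a batch dimension, so we add one
lhs = loss_fn(x.unsqueeze(0), torch.tensor([i]))

# Negative log of the softmax probability of class i, i.e. the
# cross-entropy of softmax(x) against the one-hot target vector
probs = torch.softmax(x, dim=0)
rhs = -torch.log(probs[i])

# The two values agree up to floating-point error
print(torch.allclose(lhs, rhs))

Running the snippet should print True; nn.CrossEntropyLoss computes the same quantity internally via log-softmax, which is numerically more stable than taking the logarithm of the softmax explicitly.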