├── code ├── activation_functions │ ├── relu_plot.py │ ├── sine_plot.py │ ├── softplus_plot.py │ ├── softsign_plot.py │ ├── repu_plot.py │ ├── clipping_plot.py │ ├── leaky_relu_plot.py │ ├── swish_plot.py │ ├── gelu_plot.py │ ├── plot_util.py │ ├── logistic_plot.py │ ├── heaviside_plot.py │ ├── elu_plot.py │ └── tanh_plot.py ├── fc-ann2.py ├── loss_functions │ ├── l1loss_plot.py │ ├── mseloss_plot.py │ ├── crossentropyloss_plot.py │ ├── kldloss_plot.py │ └── huberloss_plot.py ├── gradient_plot1.py ├── brownian_motion.py ├── conv-ann-ex.py ├── res-ann.py ├── gradient_plot2.py ├── conv-ann.py ├── optimization_methods │ ├── adagrad.py │ ├── rmsprop.py │ ├── momentum_sgd_bias_adj.py │ ├── rmsprop_bias_adj.py │ ├── adadelta.py │ ├── adam.py │ ├── nesterov_sgd.py │ ├── midpoint_sgd.py │ ├── momentum_sgd.py │ ├── sgd2.py │ └── sgd.py ├── fc-ann.py ├── fc-ann-manual.py ├── example_GD_momentum_plots.py ├── kolmogorov.py ├── dgm.py ├── pinn.py ├── mnist_optim.py └── mnist.py └── README.md /code/activation_functions/relu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x)) 11 | 12 | plt.savefig("../../plots/relu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/sine_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2*np.pi,2*np.pi), (-1.5,1.5)) 7 | 8 | x = np.linspace(-2*np.pi, 2*np.pi, 100) 9 | 10 | ax.plot(x, np.sin(x)) 11 | 12 | plt.savefig("../../plots/sine.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/softplus_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-4,4), (-.5,4)) 7 | 8 | x = np.linspace(-4, 4, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), label='ReLU') 11 | ax.plot(x, tf.keras.activations.softplus(x), label='softplus') 12 | ax.legend() 13 | 14 | plt.savefig("../../plots/softplus.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/softsign_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-5,5), (-1.5,1.5)) 7 | 8 | x = np.linspace(-5, 5, 100) 9 | 10 | ax.plot(x, tf.keras.activations.tanh(x), label='tanh') 11 | ax.plot(x, tf.keras.activations.softsign(x), label='softsign') 12 | ax.legend() 13 | 14 | plt.savefig("../../plots/softsign.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code for the book *Mathematical Introduction to Deep Learning: Methods, Implementations, and Theory* 2 | 3 | This repository is a companion to the book *Mathematical 
Introduction to Deep Learning: Methods, Implementations, and Theory* by Arnulf Jentzen, Benno Kuckuck, and Philippe von Wurstemberger. It contains all of the Python code from the book. 4 | 5 | The book is currently available as a preprint [on the arXiv](https://arxiv.org/abs/2310.20360). -------------------------------------------------------------------------------- /code/activation_functions/repu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,3)) 7 | ax.set_ylim(-.5, 3) 8 | 9 | x = np.linspace(-2, 2, 100) 10 | 11 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 12 | ax.plot(x, tf.keras.activations.relu(x)**2, label='RePU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/repu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/clipping_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 11 | ax.plot(x, tf.keras.activations.relu(x, max_value=1), 12 | label='(0,1)-clipping') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/clipping.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/leaky_relu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 11 | ax.plot(x, tf.keras.activations.relu(x, alpha=0.1), 12 | label='leaky ReLU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/leaky_relu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/fc-ann2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | # A Module whose forward method is simply a composition of Modules 5 | # can be represented using the torch.nn.Sequential class 6 | model = nn.Sequential( 7 | nn.Linear(3, 20), 8 | nn.ReLU(), 9 | nn.Linear(20, 30), 10 | nn.ReLU(), 11 | nn.Linear(30, 1), 12 | ) 13 | 14 | # Prints a summary of the model architecture 15 | print(model) 16 | 17 | x0 = torch.Tensor([1, 2, 3]) 18 | print(model(x0)) -------------------------------------------------------------------------------- /code/activation_functions/swish_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-4,3), (-.5,3)) 7 | 8 | x = np.linspace(-4, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), label='ReLU') 11 | ax.plot(x, tf.keras.activations.gelu(x), label='GELU') 12 | ax.plot(x, tf.keras.activations.swish(x), label='swish') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/swish.pdf", bbox_inches='tight') 
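The swish_plot.py script above and the gelu_plot.py script below both lean on tf.keras.activations for the functions they draw. As a point of reference, here is a minimal NumPy-only sketch of the definitions behind those calls (this is not a file from the repository, and it assumes SciPy is available for the error function):

```python
import numpy as np
from scipy.special import erf

def sigmoid(x):
    # standard logistic function
    return 1.0 / (1.0 + np.exp(-x))

def swish(x):
    # swish / SiLU with beta = 1: x * sigmoid(x)
    return x * sigmoid(x)

def gelu(x):
    # exact (non-approximate) GELU: x * Phi(x),
    # where Phi is the standard normal CDF
    return x * 0.5 * (1.0 + erf(x / np.sqrt(2.0)))
```

Substituting these for the corresponding tf.keras.activations calls in the two plotting scripts should reproduce the same curves up to floating-point error.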
-------------------------------------------------------------------------------- /code/activation_functions/gelu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-4,3), (-.5,3)) 7 | 8 | x = np.linspace(-4, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), label='ReLU') 11 | ax.plot(x, tf.keras.activations.softplus(x), label='softplus') 12 | ax.plot(x, tf.keras.activations.gelu(x), label='GELU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/gelu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/plot_util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | def setup_axis(xlim, ylim): 4 | _, ax = plt.subplots() 5 | 6 | ax.set_aspect("equal") 7 | ax.set_xlim(xlim) 8 | ax.set_ylim(ylim) 9 | ax.spines["left"].set_position("zero") 10 | ax.spines["bottom"].set_position("zero") 11 | ax.spines["right"].set_color("none") 12 | ax.spines["top"].set_color("none") 13 | for s in ax.spines.values(): 14 | s.set_zorder(0) 15 | 16 | return ax 17 | -------------------------------------------------------------------------------- /code/activation_functions/logistic_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-.5,1.5)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x, max_value=1), 11 | label='(0,1)-clipping') 12 | ax.plot(x, tf.keras.activations.sigmoid(x), 13 | label='standard logistic') 14 | ax.legend() 15 | 16 | plt.savefig("../../plots/logistic.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/heaviside_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-.5,1.5)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 | ax.plot(x[0:50], [0]*50, 'C0') 11 | ax.plot(x[50:100], [1]*50, 'C0', label='Heaviside') 12 | ax.plot(x, tf.keras.activations.sigmoid(x), 'C1', 13 | label='standard logistic') 14 | ax.legend() 15 | 16 | plt.savefig("../../plots/heaviside.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/loss_functions/l1loss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | mae_loss = tf.keras.losses.MeanAbsoluteError( 11 | reduction=tf.keras.losses.Reduction.NONE) 12 | zero = tf.zeros([100,1]) 13 | 14 | ax.plot(x, mae_loss(x.reshape([100,1]),zero), 15 | label='ℓ¹-error') 16 | ax.legend() 17 | 18 | plt.savefig("../../plots/l1loss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/activation_functions/elu_plot.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-1,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x), linewidth=3, label='ReLU') 11 | ax.plot(x, tf.keras.activations.relu(x, alpha=0.1), linewidth=2, label='leaky ReLU') 12 | ax.plot(x, tf.keras.activations.elu(x), linewidth=0.9, label='ELU') 13 | ax.legend() 14 | 15 | plt.savefig("../../plots/elu.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/loss_functions/mseloss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-2,2), (-.5,2)) 7 | 8 | x = np.linspace(-2, 2, 100) 9 | 10 | mse_loss = tf.keras.losses.MeanSquaredError( 11 | reduction=tf.keras.losses.Reduction.NONE) 12 | zero = tf.zeros([100,1]) 13 | 14 | ax.plot(x, mse_loss(x.reshape([100,1]),zero), 15 | label='Mean squared error') 16 | ax.legend() 17 | 18 | plt.savefig("../../plots/mseloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/gradient_plot1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def f(x): 5 | return x**4 - 3 * x**2 6 | 7 | def nabla_f(x): 8 | return 4 * x**3 - 6 * x 9 | 10 | plt.figure() 11 | 12 | # Plot graph of f 13 | x = np.linspace(-2,2,100) 14 | plt.plot(x,f(x)) 15 | 16 | # Plot arrows 17 | for x in np.linspace(-1.9,1.9,21): 18 | d = nabla_f(x) 19 | plt.arrow(x, f(x), -.05 * d, 0, 20 | length_includes_head=True, head_width=0.08, 21 | head_length=0.05, color='b') 22 | 23 | plt.savefig("../plots/gradient_plot1.pdf") -------------------------------------------------------------------------------- /code/activation_functions/tanh_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-1.5,1.5)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 | ax.plot(x, tf.keras.activations.relu(x+1, max_value=2)-1, 11 | label='(-1,1)-clipping') 12 | ax.plot(x, tf.keras.activations.sigmoid(x), 13 | label='standard logistic') 14 | ax.plot(x, tf.keras.activations.tanh(x), label='tanh') 15 | ax.legend() 16 | 17 | plt.savefig("../../plots/tanh.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/brownian_motion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def generate_brownian_motion(T, N): 5 | increments = np.random.randn(N) * np.sqrt(T/N) 6 | BM = np.cumsum(increments) 7 | BM = np.insert(BM, 0, 0) 8 | return BM 9 | 10 | T = 1 11 | N = 1000 12 | t_values = np.linspace(0, T, N+1) 13 | 14 | fig, axarr = plt.subplots(2, 2) 15 | 16 | for i in range(2): 17 | for j in range(2): 18 | BM = generate_brownian_motion(T, N) 19 | axarr[i, j].plot(t_values, BM) 20 | 21 | plt.tight_layout() 22 | plt.savefig('../plots/brownian_motions.pdf') 23 | plt.show() -------------------------------------------------------------------------------- /code/loss_functions/crossentropyloss_plot.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((0,1), (0,3)) 7 | 8 | ax.set_aspect(.3) 9 | 10 | x = np.linspace(0, 1, 100) 11 | 12 | cce_loss = tf.keras.losses.CategoricalCrossentropy( 13 | reduction=tf.keras.losses.Reduction.NONE) 14 | y = tf.constant([[0.3, 0.7]] * 100, shape=(100, 2)) 15 | 16 | X = tf.stack([x,1-x], axis=1) 17 | 18 | ax.plot(x, cce_loss(y,X), label='Cross-entropy') 19 | ax.legend() 20 | 21 | plt.savefig("../../plots/crossentropyloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/conv-ann-ex.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | model = nn.Sequential( 6 | nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(2, 2)), 7 | nn.ReLU(), 8 | nn.Conv2d(in_channels=2, out_channels=1, kernel_size=(1, 1)), 9 | ) 10 | 11 | with torch.no_grad(): 12 | model[0].weight.set_( 13 | torch.Tensor([[[[0, 0], [0, 0]]], [[[1, 0], [0, 1]]]]) 14 | ) 15 | model[0].bias.set_(torch.Tensor([1, -1])) 16 | model[2].weight.set_(torch.Tensor([[[[-2]], [[2]]]])) 17 | model[2].bias.set_(torch.Tensor([3])) 18 | 19 | x0 = torch.Tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]) 20 | print(model(x0)) 21 | -------------------------------------------------------------------------------- /code/res-ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class ResidualANN(nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | self.affine1 = nn.Linear(3, 10) 8 | self.activation1 = nn.ReLU() 9 | self.affine2 = nn.Linear(10, 20) 10 | self.activation2 = nn.ReLU() 11 | self.affine3 = nn.Linear(20, 10) 12 | self.activation3 = nn.ReLU() 13 | self.affine4 = nn.Linear(10, 1) 14 | 15 | def forward(self, x0): 16 | x1 = self.activation1(self.affine1(x0)) 17 | x2 = self.activation2(self.affine2(x1)) 18 | x3 = self.activation3(x1 + self.affine3(x2)) 19 | x4 = self.affine4(x3) 20 | return x4 21 | -------------------------------------------------------------------------------- /code/loss_functions/kldloss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((0,1), (0,3)) 7 | 8 | ax.set_aspect(.3) 9 | 10 | x = np.linspace(0, 1, 100) 11 | 12 | kld_loss = tf.keras.losses.KLDivergence( 13 | reduction=tf.keras.losses.Reduction.NONE) 14 | cce_loss = tf.keras.losses.CategoricalCrossentropy( 15 | reduction=tf.keras.losses.Reduction.NONE) 16 | y = tf.constant([[0.3, 0.7]] * 100, shape=(100, 2)) 17 | 18 | X = tf.stack([x,1-x], axis=1) 19 | 20 | ax.plot(x, kld_loss(y,X), label='Kullback-Leibler divergence') 21 | ax.plot(x, cce_loss(y,X), label='Cross-entropy') 22 | ax.legend() 23 | 24 | plt.savefig("../../plots/kldloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/loss_functions/huberloss_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import matplotlib.pyplot as plt 4 | import plot_util 5 | 6 | ax = plot_util.setup_axis((-3,3), (-.5,4)) 7 | 8 | x = np.linspace(-3, 3, 100) 9 | 10 
| mse_loss = tf.keras.losses.MeanSquaredError( 11 | reduction=tf.keras.losses.Reduction.NONE) 12 | mae_loss = tf.keras.losses.MeanAbsoluteError( 13 | reduction=tf.keras.losses.Reduction.NONE) 14 | huber_loss = tf.keras.losses.Huber( 15 | reduction=tf.keras.losses.Reduction.NONE) 16 | 17 | zero = tf.zeros([100,1]) 18 | 19 | ax.plot(x, mse_loss(x.reshape([100,1]),zero)/2., 20 | label='Scaled mean squared error') 21 | ax.plot(x, mae_loss(x.reshape([100,1]),zero), 22 | label='ℓ¹-error') 23 | ax.plot(x, huber_loss(x.reshape([100,1]),zero), 24 | label='1-Huber-error') 25 | ax.legend() 26 | 27 | plt.savefig("../../plots/huberloss.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/gradient_plot2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | K = [1., 10.] 5 | vartheta = np.array([1., 1.]) 6 | 7 | def f(x, y): 8 | result = K[0] / 2. * np.abs(x - vartheta[0])**2 \ 9 | + K[1] / 2. * np.abs(y - vartheta[1])**2 10 | return result 11 | 12 | def nabla_f(x): 13 | return K * (x - vartheta) 14 | 15 | plt.figure() 16 | 17 | # Plot contour lines of f 18 | x = np.linspace(-3., 7., 100) 19 | y = np.linspace(-2., 4., 100) 20 | X, Y = np.meshgrid(x, y) 21 | Z = f(X, Y) 22 | cp = plt.contour(X, Y, Z, colors="black", 23 | levels = [0.5,2,4,8,16], 24 | linestyles=":") 25 | 26 | # Plot arrows along contour lines 27 | for l in [0.5,2,4,8,16]: 28 | for d in np.linspace(0, 2.*np.pi, 10, endpoint=False): 29 | x = np.cos(d) / ((K[0] / (2*l))**.5) + vartheta[0] 30 | y = np.sin(d) / ((K[1] / (2*l))**.5) + vartheta[1] 31 | grad = nabla_f(np.array([x,y])) 32 | plt.arrow(x, y, -.05 * grad[0], -.05 * grad[1], 33 | length_includes_head=True, head_width=.08, 34 | head_length=.1, color='b') 35 | 36 | plt.savefig("../plots/gradient_plot2.pdf") -------------------------------------------------------------------------------- /code/conv-ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ConvolutionalANN(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | # The convolutional layer defined here takes any tensor of 9 | # shape (1, n, m) [a single input] or (N, 1, n, m) [a batch 10 | # of N inputs] where N, n, m are natural numbers satisfying 11 | # n >= 3 and m >= 3. 
12 | self.conv1 = nn.Conv2d( 13 | in_channels=1, out_channels=5, kernel_size=(3, 3) 14 | ) 15 | self.activation1 = nn.ReLU() 16 | self.conv2 = nn.Conv2d( 17 | in_channels=5, out_channels=5, kernel_size=(5, 3) 18 | ) 19 | 20 | def forward(self, x0): 21 | x1 = self.activation1(self.conv1(x0)) 22 | print(x1.shape) 23 | x2 = self.conv2(x1) 24 | print(x2.shape) 25 | return x2 26 | 27 | 28 | model = ConvolutionalANN() 29 | x0 = torch.rand(1, 20, 20) 30 | # This will print the shapes of the outputs of the two layers of 31 | # the model, in this case: 32 | # torch.Size([5, 18, 18]) 33 | # torch.Size([5, 14, 16]) 34 | model(x0) 35 | -------------------------------------------------------------------------------- /code/optimization_methods/adagrad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.02 20 | eps = 1e-10 21 | 22 | sum_sq_grad = [p.clone().detach().fill_(eps) for p in net.parameters()] 23 | 24 | for n in range(N): 25 | indices = torch.randint(0, M, (J,)) 26 | 27 | x = X[indices] 28 | y = Y[indices] 29 | 30 | net.zero_grad() 31 | 32 | loss_val = loss(net(x), y) 33 | loss_val.backward() 34 | 35 | with torch.no_grad(): 36 | for a, p in zip(sum_sq_grad, net.parameters()): 37 | a.add_(p.grad * p.grad) 38 | p.sub_(lr * a.rsqrt() * p.grad) 39 | 40 | if n % 1000 == 0: 41 | with torch.no_grad(): 42 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 43 | y = torch.sin(x) 44 | loss_val = loss(net(x), y) 45 | print(f"Iteration: {n+1}, Loss: {loss_val}") 46 | -------------------------------------------------------------------------------- /code/fc-ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FullyConnectedANN(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | # Define the layers of the network in terms of Modules. 9 | # nn.Linear(3, 20) represents an affine function defined 10 | # by a 20x3 weight matrix and a 20-dimensional bias vector. 11 | self.affine1 = nn.Linear(3, 20) 12 | # The torch.nn.ReLU class simply wraps the 13 | # torch.nn.functional.relu function as a Module. 
14 | self.activation1 = nn.ReLU() 15 | self.affine2 = nn.Linear(20, 30) 16 | self.activation2 = nn.ReLU() 17 | self.affine3 = nn.Linear(30, 1) 18 | 19 | def forward(self, x0): 20 | x1 = self.activation1(self.affine1(x0)) 21 | x2 = self.activation2(self.affine2(x1)) 22 | x3 = self.affine3(x2) 23 | return x3 24 | 25 | 26 | model = FullyConnectedANN() 27 | 28 | x0 = torch.Tensor([1, 2, 3]) 29 | print(model(x0)) 30 | 31 | # Assigning a Module to an instance variable of a Module registers 32 | # all of the former's parameters as parameters of the latter 33 | for p in model.parameters(): 34 | print(p) -------------------------------------------------------------------------------- /code/optimization_methods/rmsprop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.001 20 | beta = 0.9 21 | eps = 1e-10 22 | 23 | moments = [p.clone().detach().zero_() for p in net.parameters()] 24 | 25 | for n in range(N): 26 | indices = torch.randint(0, M, (J,)) 27 | 28 | x = X[indices] 29 | y = Y[indices] 30 | 31 | net.zero_grad() 32 | 33 | loss_val = loss(net(x), y) 34 | loss_val.backward() 35 | 36 | with torch.no_grad(): 37 | for m, p in zip(moments, net.parameters()): 38 | m.mul_(beta) 39 | m.add_((1 - beta) * p.grad * p.grad) 40 | p.sub_(lr * (eps + m).rsqrt() * p.grad) 41 | 42 | if n % 1000 == 0: 43 | with torch.no_grad(): 44 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 45 | y = torch.sin(x) 46 | loss_val = loss(net(x), y) 47 | print(f"Iteration: {n+1}, Loss: {loss_val}") 48 | -------------------------------------------------------------------------------- /code/optimization_methods/momentum_sgd_bias_adj.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.01 20 | alpha = 0.99 21 | adj = 1 22 | 23 | momentum = [p.clone().detach().zero_() for p in net.parameters()] 24 | 25 | for n in range(N): 26 | indices = torch.randint(0, M, (J,)) 27 | 28 | x = X[indices] 29 | y = Y[indices] 30 | 31 | net.zero_grad() 32 | 33 | loss_val = loss(net(x), y) 34 | loss_val.backward() 35 | 36 | adj *= alpha 37 | 38 | with torch.no_grad(): 39 | for m, p in zip(momentum, net.parameters()): 40 | m.mul_(alpha) 41 | m.add_((1-alpha) * p.grad) 42 | p.sub_(lr * m / (1 - adj)) 43 | 44 | if n % 1000 == 0: 45 | with torch.no_grad(): 46 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 47 | y = torch.sin(x) 48 | loss_val = loss(net(x), y) 49 | print(f"Iteration: {n+1}, Loss: {loss_val}") 50 | -------------------------------------------------------------------------------- /code/optimization_methods/rmsprop_bias_adj.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 
12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.001 20 | beta = 0.9 21 | eps = 1e-10 22 | adj = 1 23 | 24 | moments = [p.clone().detach().zero_() for p in net.parameters()] 25 | 26 | for n in range(N): 27 | indices = torch.randint(0, M, (J,)) 28 | 29 | x = X[indices] 30 | y = Y[indices] 31 | 32 | net.zero_grad() 33 | 34 | loss_val = loss(net(x), y) 35 | loss_val.backward() 36 | 37 | with torch.no_grad(): 38 | adj *= beta 39 | for m, p in zip(moments, net.parameters()): 40 | m.mul_(beta) 41 | m.add_((1 - beta) * p.grad * p.grad) 42 | p.sub_(lr * (eps + (m / (1 - adj)).sqrt()).reciprocal() * p.grad) 43 | 44 | if n % 1000 == 0: 45 | with torch.no_grad(): 46 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 47 | y = torch.sin(x) 48 | loss_val = loss(net(x), y) 49 | print(f"Iteration: {n+1}, Loss: {loss_val}") 50 | -------------------------------------------------------------------------------- /code/optimization_methods/adadelta.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | beta = 0.9 20 | delta = 0.9 21 | eps = 1e-10 22 | 23 | moments = [p.clone().detach().zero_() for p in net.parameters()] 24 | Delta = [p.clone().detach().zero_() for p in net.parameters()] 25 | 26 | for n in range(N): 27 | indices = torch.randint(0, M, (J,)) 28 | 29 | x = X[indices] 30 | y = Y[indices] 31 | 32 | net.zero_grad() 33 | 34 | loss_val = loss(net(x), y) 35 | loss_val.backward() 36 | 37 | with torch.no_grad(): 38 | for m, D, p in zip(moments, Delta, net.parameters()): 39 | m.mul_(beta) 40 | m.add_((1 - beta) * p.grad * p.grad) 41 | inc = ((eps + D) / (eps + m)).sqrt() * p.grad 42 | p.sub_(inc) 43 | D.mul_(delta) 44 | D.add_((1 - delta) * inc * inc) 45 | 46 | if n % 1000 == 0: 47 | with torch.no_grad(): 48 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 49 | y = torch.sin(x) 50 | loss_val = loss(net(x), y) 51 | print(f"Iteration: {n+1}, Loss: {loss_val}") 52 | -------------------------------------------------------------------------------- /code/optimization_methods/adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.0001 20 | alpha = 0.9 21 | beta = 0.999 22 | eps = 1e-8 23 | adj = 1. 24 | adj2 = 1. 
25 | 26 | m = [p.clone().detach().zero_() for p in net.parameters()] 27 | MM = [p.clone().detach().zero_() for p in net.parameters()] 28 | 29 | for n in range(N): 30 | indices = torch.randint(0, M, (J,)) 31 | 32 | x = X[indices] 33 | y = Y[indices] 34 | 35 | net.zero_grad() 36 | 37 | loss_val = loss(net(x), y) 38 | loss_val.backward() 39 | 40 | with torch.no_grad(): 41 | adj *= alpha 42 | adj2 *= beta 43 | for m_p, M_p, p in zip(m, MM, net.parameters()): 44 | m_p.mul_(alpha) 45 | m_p.add_((1 - alpha) * p.grad) 46 | M_p.mul_(beta) 47 | M_p.add_((1 - beta) * p.grad * p.grad) 48 | p.sub_(lr * m_p / ((1 - adj) * (eps + (M_p / (1 - adj2)).sqrt()))) 49 | 50 | if n % 1000 == 0: 51 | with torch.no_grad(): 52 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 53 | y = torch.sin(x) 54 | loss_val = loss(net(x), y) 55 | print(f"Iteration: {n+1}, Loss: {loss_val}") 56 | -------------------------------------------------------------------------------- /code/optimization_methods/nesterov_sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.003 20 | alpha = 0.999 21 | 22 | m = [p.clone().detach().zero_() for p in net.parameters()] 23 | 24 | for n in range(N): 25 | indices = torch.randint(0, M, (J,)) 26 | 27 | x = X[indices] 28 | y = Y[indices] 29 | 30 | net.zero_grad() 31 | 32 | # Remember the original parameters 33 | params = [p.clone().detach() for p in net.parameters()] 34 | 35 | for p, m_p in zip(params, m): 36 | p.sub_(lr * alpha * m_p) 37 | 38 | # Compute the loss 39 | loss_val = loss(net(x), y) 40 | # Compute the gradients with respect to the parameters 41 | loss_val.backward() 42 | 43 | with torch.no_grad(): 44 | for p, m_p, q in zip(net.parameters(), m, params): 45 | m_p.mul_(alpha) 46 | m_p.add_((1 - alpha) * p.grad) 47 | q.sub_(lr * m_p) 48 | p.copy_(q) 49 | 50 | if n % 1000 == 0: 51 | with torch.no_grad(): 52 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 53 | y = torch.sin(x) 54 | loss_val = loss(net(x), y) 55 | print(f"Iteration: {n+1}, Loss: {loss_val}") 56 | -------------------------------------------------------------------------------- /code/fc-ann-manual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # To define a neural network, we define a class that inherits from 7 | # torch.nn.Module 8 | class FullyConnectedANN(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | # In the constructor, we define the weights and biases. 12 | # Wrapping the tensors in torch.nn.Parameter objects tells 13 | # PyTorch that these are parameters that should be 14 | # optimized during training. 
15 | self.W1 = nn.Parameter( 16 | torch.Tensor([[1, 0], [0, -1], [-2, 2]]) 17 | ) 18 | self.B1 = nn.Parameter(torch.Tensor([0, 2, -1])) 19 | self.W2 = nn.Parameter(torch.Tensor([[1, -2, 3]])) 20 | self.B2 = nn.Parameter(torch.Tensor([1])) 21 | 22 | # The realization function of the network 23 | def forward(self, x0): 24 | x1 = F.relu(self.W1 @ x0 + self.B1) 25 | x2 = self.W2 @ x1 + self.B2 26 | return x2 27 | 28 | 29 | model = FullyConnectedANN() 30 | 31 | x0 = torch.Tensor([1, 2]) 32 | # Print the output of the realization function for input x0 33 | print(model.forward(x0)) 34 | 35 | # As a consequence of inheriting from torch.nn.Module we can just 36 | # "call" the model itself (which will call the forward method 37 | # implicitly) 38 | print(model(x0)) 39 | 40 | # Wrapping a tensor in a Parameter object and assigning it to an 41 | # instance variable of the Module makes PyTorch register it as a 42 | # parameter. We can access all parameters via the parameters 43 | # method. 44 | for p in model.parameters(): 45 | print(p) 46 | -------------------------------------------------------------------------------- /code/example_GD_momentum_plots.py: -------------------------------------------------------------------------------- 1 | # Example for GD and momentum GD 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Number of steps for the schemes 7 | N = 8 8 | 9 | # Problem setting 10 | d = 2 11 | K = [1., 10.] 12 | 13 | vartheta = np.array([1., 1.]) 14 | xi = np.array([5., 3.]) 15 | 16 | def f(x, y): 17 | result = K[0] / 2. * np.abs(x - vartheta[0])**2 \ 18 | + K[1] / 2. * np.abs(y - vartheta[1])**2 19 | return result 20 | 21 | def nabla_f(x): 22 | return K * (x - vartheta) 23 | 24 | # Coefficients for GD 25 | gamma_GD = 2 /(K[0] + K[1]) 26 | 27 | # Coefficients for momentum 28 | gamma_momentum = 0.3 29 | alpha = 0.5 30 | 31 | # Placeholder for processes 32 | Theta = np.zeros((N+1, d)) 33 | M = np.zeros((N+1, d)) 34 | m = np.zeros((N+1, d)) 35 | 36 | Theta[0] = xi 37 | M[0] = xi 38 | 39 | # Perform gradient descent 40 | for i in range(N): 41 | Theta[i+1] = Theta[i] - gamma_GD * nabla_f(Theta[i]) 42 | 43 | # Perform momentum GD 44 | for i in range(N): 45 | m[i+1] = alpha * m[i] + (1 - alpha) * nabla_f(M[i]) 46 | M[i+1] = M[i] - gamma_momentum * m[i+1] 47 | 48 | 49 | ### Plot ### 50 | plt.figure() 51 | 52 | # Plot the gradient descent process 53 | plt.plot(Theta[:, 0], Theta[:, 1], 54 | label = "GD", color = "c", 55 | linestyle = "--", marker = "*") 56 | 57 | # Plot the momentum gradient descent process 58 | plt.plot(M[:, 0], M[:, 1], 59 | label = "Momentum", color = "orange", marker = "*") 60 | 61 | # Target value 62 | plt.scatter(vartheta[0],vartheta[1], 63 | label = "vartheta", color = "red", marker = "x") 64 | 65 | # Plot contour lines of f 66 | x = np.linspace(-3., 7., 100) 67 | y = np.linspace(-2., 4., 100) 68 | X, Y = np.meshgrid(x, y) 69 | Z = f(X, Y) 70 | cp = plt.contour(X, Y, Z, colors="black", 71 | levels = [0.5,2,4,8,16], 72 | linestyles=":") 73 | 74 | plt.legend() 75 | plt.savefig("../plots/GD_momentum_plots.pdf") 76 | -------------------------------------------------------------------------------- /code/optimization_methods/midpoint_sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | net = nn.Sequential( 6 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 7 | ) 8 | 9 | M = 1000 10 | 11 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 12 | Y = 
torch.sin(X) 13 | 14 | J = 64 15 | 16 | N = 150000 17 | 18 | loss = nn.MSELoss() 19 | lr = 0.003 20 | 21 | for n in range(N): 22 | indices = torch.randint(0, M, (J,)) 23 | 24 | x = X[indices] 25 | y = Y[indices] 26 | 27 | net.zero_grad() 28 | 29 | # Remember the original parameters 30 | params = [p.clone().detach() for p in net.parameters()] 31 | # Compute the loss 32 | loss_val = loss(net(x), y) 33 | # Compute the gradients with respect to the parameters 34 | loss_val.backward() 35 | 36 | with torch.no_grad(): 37 | # Make a half-step in the direction of the negative 38 | # gradient 39 | for p in net.parameters(): 40 | if p.grad is not None: 41 | p.sub_(0.5 * lr * p.grad) 42 | 43 | net.zero_grad() 44 | # Compute the loss and the gradients at the midpoint 45 | loss_val = loss(net(x), y) 46 | loss_val.backward() 47 | 48 | with torch.no_grad(): 49 | # Subtract the scaled gradient at the midpoint from the 50 | # original parameters 51 | for param, midpoint_param in zip( 52 | params, net.parameters() 53 | ): 54 | param.sub_(lr * midpoint_param.grad) 55 | 56 | # Copy the new parameters into the model 57 | for param, p in zip(params, net.parameters()): 58 | p.copy_(param) 59 | 60 | if n % 1000 == 0: 61 | with torch.no_grad(): 62 | x = torch.rand((1000, 1)) * 4 * np.pi - 2 * np.pi 63 | y = torch.sin(x) 64 | loss_val = loss(net(x), y) 65 | print(f"Iteration: {n+1}, Loss: {loss_val}") 66 | -------------------------------------------------------------------------------- /code/optimization_methods/momentum_sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | M = 10000 7 | 8 | torch.manual_seed(0) 9 | X = torch.rand((M, 1)) * 4 * np.pi - 2 * np.pi 10 | Y = torch.sin(X) 11 | 12 | J = 64 13 | 14 | N = 100000 15 | 16 | loss = nn.MSELoss() 17 | lr = 0.01 18 | alpha = 0.999 19 | 20 | fig, axs = plt.subplots(1, 4, figsize=(12, 3), sharey='row') 21 | 22 | net = nn.Sequential( 23 | nn.Linear(1, 200), nn.ReLU(), nn.Linear(200, 1) 24 | ) 25 | 26 | for i, alpha in enumerate([0, 0.9, 0.99, 0.999]): 27 | print(f"alpha = {alpha}") 28 | 29 | for lr in [0.1, 0.03, 0.01, 0.003]: 30 | torch.manual_seed(0) 31 | net.apply( 32 | lambda m: m.reset_parameters() 33 | if isinstance(m, nn.Linear) 34 | else None 35 | ) 36 | 37 | momentum = [ 38 | p.clone().detach().zero_() for p in net.parameters() 39 | ] 40 | 41 | losses = [] 42 | print(f"lr = {lr}") 43 | 44 | for n in range(N): 45 | indices = torch.randint(0, M, (J,)) 46 | 47 | x = X[indices] 48 | y = Y[indices] 49 | 50 | net.zero_grad() 51 | 52 | loss_val = loss(net(x), y) 53 | loss_val.backward() 54 | 55 | with torch.no_grad(): 56 | for m, p in zip(momentum, net.parameters()): 57 | m.mul_(alpha) 58 | m.add_((1 - alpha) * p.grad) 59 | p.sub_(lr * m) 60 | 61 | if n % 100 == 0: 62 | with torch.no_grad(): 63 | x = (torch.rand((1000, 1)) - 0.5) * 4 * np.pi 64 | y = torch.sin(x) 65 | loss_val = loss(net(x), y) 66 | losses.append(loss_val.item()) 67 | 68 | axs[i].plot(losses, label=f"$\\gamma = {lr}$") 69 | 70 | axs[i].set_yscale("log") 71 | axs[i].set_ylim([1e-6, 1]) 72 | axs[i].set_title(f"$\\alpha = {alpha}$") 73 | 74 | axs[0].legend() 75 | 76 | plt.tight_layout() 77 | plt.savefig("../plots/sgd_momentum.pdf", bbox_inches='tight') 78 | -------------------------------------------------------------------------------- /code/optimization_methods/sgd2.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | def plot_heatmap(ax, g): 7 | x = np.linspace(-2 * np.pi, 2 * np.pi, 100) 8 | y = np.linspace(-2 * np.pi, 2 * np.pi, 100) 9 | x, y = np.meshgrid(x, y) 10 | 11 | # flatten the grid to [num_points, 2] and convert to tensor 12 | grid = np.vstack([x.flatten(), y.flatten()]).T 13 | grid_torch = torch.from_numpy(grid).float() 14 | 15 | # pass the grid through the network 16 | z = g(grid_torch) 17 | 18 | # reshape the predictions back to a 2D grid 19 | Z = z.numpy().reshape(x.shape) 20 | 21 | # plot the heatmap 22 | ax.imshow(Z, origin='lower', extent=(-2 * np.pi, 2 * np.pi, 23 | -2 * np.pi, 2 * np.pi)) 24 | 25 | M = 10000 26 | 27 | def f(x): 28 | return torch.sin(x).prod(dim=1, keepdim=True) 29 | 30 | torch.manual_seed(0) 31 | X = torch.rand((M, 2)) * 4 * np.pi - 2 * np.pi 32 | Y = f(X) 33 | 34 | J = 32 35 | 36 | N = 100000 37 | 38 | loss = nn.MSELoss() 39 | gamma = 0.05 40 | 41 | fig, axs = plt.subplots( 42 | 3, 3, figsize=(12, 12), sharex="col", sharey="row", 43 | ) 44 | 45 | net = nn.Sequential( 46 | nn.Linear(2, 50), 47 | nn.Softplus(), 48 | nn.Linear(50,50), 49 | nn.Softplus(), 50 | nn.Linear(50, 1) 51 | ) 52 | 53 | plot_after = [0, 100, 300, 1000, 3000, 10000, 30000, 100000] 54 | 55 | for n in range(N + 1): 56 | indices = torch.randint(0, M, (J,)) 57 | 58 | x = X[indices] 59 | y = Y[indices] 60 | 61 | net.zero_grad() 62 | 63 | loss_val = loss(net(x), y) 64 | loss_val.backward() 65 | 66 | with torch.no_grad(): 67 | for p in net.parameters(): 68 | p.sub_(gamma * p.grad) 69 | 70 | if n in plot_after: 71 | i = plot_after.index(n) 72 | 73 | with torch.no_grad(): 74 | plot_heatmap(axs[i // 3][i % 3], net) 75 | axs[i // 3][i % 3].set_title(f"Batch {n}") 76 | 77 | with torch.no_grad(): 78 | plot_heatmap(axs[2][2], f) 79 | axs[2][2].set_title("Target") 80 | 81 | plt.tight_layout() 82 | plt.savefig("../../plots/sgd2.pdf", bbox_inches="tight") 83 | -------------------------------------------------------------------------------- /code/optimization_methods/sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | M = 10000 # number of training samples 7 | 8 | # We fix a random seed. This is not necessary for training a 9 | # neural network, but we use it here to ensure that the same 10 | # plot is created on every run. 11 | torch.manual_seed(0) 12 | 13 | # Here, we define the training set. 14 | # Create a tensor of shape (M, 1) with entries sampled from a 15 | # uniform distribution on [-2 * pi, 2 * pi) 16 | X = (torch.rand((M, 1)) - 0.5) * 4 * np.pi 17 | # We use the sine as the target function, so this defines the 18 | # desired outputs. 
19 | Y = torch.sin(X) 20 | 21 | J = 32 # the batch size 22 | N = 100000 # the number of SGD iterations 23 | 24 | loss = nn.MSELoss() # the mean squared error loss function 25 | gamma = 0.003 # the learning rate 26 | 27 | # Define a network with a single hidden layer of 200 neurons and 28 | # tanh activation function 29 | net = nn.Sequential( 30 | nn.Linear(1, 200), nn.Tanh(), nn.Linear(200, 1) 31 | ) 32 | 33 | # Set up a 3x3 grid of plots 34 | fig, axs = plt.subplots( 35 | 3, 36 | 3, 37 | figsize=(12, 8), 38 | sharex="col", 39 | sharey="row", 40 | ) 41 | 42 | # Plot the target function 43 | x = torch.linspace(-2 * np.pi, 2 * np.pi, 1000).reshape((1000, 1)) 44 | y = torch.sin(x) 45 | for ax in axs.flatten(): 46 | ax.plot(x, y, label="Target") 47 | ax.set_xlim([-2 * np.pi, 2 * np.pi]) 48 | ax.set_ylim([-1.1, 1.1]) 49 | 50 | plot_after = [1, 30, 100, 300, 1000, 3000, 10000, 30000, 100000] 51 | 52 | # The training loop 53 | for n in range(N): 54 | # Choose J samples randomly from the training set 55 | indices = torch.randint(0, M, (J,)) 56 | X_batch = X[indices] 57 | Y_batch = Y[indices] 58 | 59 | net.zero_grad() # Zero out the gradients 60 | 61 | loss_val = loss(net(X_batch), Y_batch) # Compute the loss 62 | loss_val.backward() # Compute the gradients 63 | 64 | # Update the parameters 65 | with torch.no_grad(): 66 | for p in net.parameters(): 67 | # Subtract the scaled gradient in-place 68 | p.sub_(gamma * p.grad) 69 | 70 | if n + 1 in plot_after: 71 | # Plot the realization function of the ANN 72 | i = plot_after.index(n + 1) 73 | ax = axs[i // 3][i % 3] 74 | ax.set_title(f"Batch {n+1}") 75 | 76 | with torch.no_grad(): 77 | ax.plot(x, net(x), label="ANN realization") 78 | 79 | axs[0][0].legend(loc="upper right") 80 | 81 | plt.tight_layout() 82 | plt.savefig("../../plots/sgd.pdf", bbox_inches="tight") 83 | -------------------------------------------------------------------------------- /code/kolmogorov.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib.pyplot as plt 3 | 4 | # Use the GPU if available 5 | dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | 7 | # Computes an approximation of E[|phi(sqrt(2*rho*T) W + xi) - 8 | # N(xi)|²] with W a standard normal random variable using the rows 9 | # of x as # independent realizations of the random variable xi 10 | def loss(N, rho, phi, t, x): 11 | W = torch.randn_like(x).to(dev) 12 | return (phi(torch.sqrt(2 * rho * t) * W + x) - 13 | N(torch.cat((t,x),1))).square().mean() 14 | 15 | d = 2 # the input dimension 16 | a, b = -5.0, 5.0 # the domain will be [a,b]^d 17 | T = 2.0 # the time horizon 18 | rho = 1.0 # the diffusivity 19 | 20 | # Define the initial value 21 | def phi(x): 22 | return x.cos().sum(axis=1, keepdim=True) 23 | 24 | # Define a neural network with two hidden layers with 50 neurons 25 | # each using ReLU activations 26 | N = torch.nn.Sequential( 27 | torch.nn.Linear(d+1, 50), torch.nn.ReLU(), 28 | torch.nn.Linear(50, 50), torch.nn.ReLU(), 29 | torch.nn.Linear(50, 1) 30 | ).to(dev) 31 | 32 | # Configure the training parameters and optimization algorithm 33 | steps = 3000 34 | batch_size = 256 35 | optimizer = torch.optim.Adam(N.parameters()) 36 | 37 | # Train the network 38 | for step in range(steps): 39 | # Generate uniformly distributed samples from [a,b]^d 40 | x = (torch.rand(batch_size, d) * (b-a) + a).to(dev) 41 | t = T * torch.rand(batch_size, 1).to(dev) 42 | 43 | optimizer.zero_grad() 44 | # Compute the loss 45 | L = loss(N, rho, phi, 
t, x) 46 | # Compute the gradients 47 | L.backward() 48 | # Apply changes to weights and biases of N 49 | optimizer.step() 50 | 51 | # Plot the result at M+1 timesteps 52 | M = 5 53 | mesh = 128 54 | 55 | def toNumpy(t): 56 | return t.detach().cpu().numpy().reshape((mesh,mesh)) 57 | 58 | fig, axs = plt.subplots(2,3,subplot_kw=dict(projection='3d')) 59 | fig.set_size_inches(16, 10) 60 | fig.set_dpi(300) 61 | 62 | for i in range(M+1): 63 | x = torch.linspace(a, b, mesh) 64 | y = torch.linspace(a, b, mesh) 65 | x, y = torch.meshgrid(x, y, indexing='xy') 66 | x = x.reshape((mesh*mesh,1)).to(dev) 67 | y = y.reshape((mesh*mesh,1)).to(dev) 68 | z = N(torch.cat((i*T/M*torch.ones(128*128,1).to(dev), x, y), 69 | 1)) 70 | 71 | axs[i//3,i%3].set_title(f"t = {i * T / M}") 72 | axs[i//3,i%3].set_zlim(-2,2) 73 | axs[i//3,i%3].plot_surface(toNumpy(x), toNumpy(y), toNumpy(z), 74 | cmap='viridis') 75 | 76 | fig.savefig(f"../plots/kolmogorov.pdf", bbox_inches='tight') -------------------------------------------------------------------------------- /code/dgm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib.pyplot as plt 3 | from torch.autograd import grad 4 | from matplotlib.gridspec import GridSpec 5 | from matplotlib.cm import ScalarMappable 6 | 7 | 8 | dev = torch.device("cuda:0" if torch.cuda.is_available() else 9 | "cpu") 10 | 11 | T = 3.0 # the time horizom 12 | 13 | # The initial value 14 | def phi(x): 15 | return x.sin().prod(axis=1, keepdims=True) 16 | 17 | torch.manual_seed(0) 18 | 19 | # We use a network with 4 hidden layers of 50 neurons each and the 20 | # Swish activation function (called SiLU in PyTorch) 21 | N = torch.nn.Sequential( 22 | torch.nn.Linear(3, 50), torch.nn.SiLU(), 23 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 24 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 25 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 26 | torch.nn.Linear(50, 1), 27 | ).to(dev) 28 | 29 | optimizer = torch.optim.Adam(N.parameters(), lr=3e-4) 30 | 31 | J = 256 # the batch size 32 | 33 | for i in range(30000): 34 | # Choose a random batch of training samples 35 | x = torch.randn(J, 2).to(dev) * 2 36 | t = torch.rand(J, 1).to(dev) * T 37 | 38 | x1 = x[:, 0:1] 39 | x2 = x[:, 1:2] 40 | 41 | x1.requires_grad_() 42 | x2.requires_grad_() 43 | t.requires_grad_() 44 | 45 | optimizer.zero_grad() 46 | 47 | # Denoting by u the realization function of the ANN, compute 48 | # u(0, x) for each x in the batch 49 | u0 = N(torch.hstack((torch.zeros_like(t), x))) 50 | # Compute the loss for the initial condition 51 | initial_loss = (u0 - phi(x)).square().mean() 52 | 53 | # Compute the partial derivatives using automatic 54 | # differentiation 55 | u = N(torch.hstack((t, x1, x2))) 56 | ones = torch.ones_like(u) 57 | u_t = grad(u, t, ones, create_graph=True)[0] 58 | u_x1 = grad(u, x1, ones, create_graph=True)[0] 59 | u_x2 = grad(u, x2, ones, create_graph=True)[0] 60 | ones = torch.ones_like(u_x1) 61 | u_x1x1 = grad(u_x1, x1, ones, create_graph=True)[0] 62 | u_x2x2 = grad(u_x2, x2, ones, create_graph=True)[0] 63 | 64 | # Compute the loss for the PDE 65 | Laplace = u_x1x1 + u_x2x2 66 | pde_loss = (u_t - (0.005 * Laplace + u - u**3)).square().mean() 67 | 68 | # Compute the total loss and perform a gradient step 69 | loss = initial_loss + pde_loss 70 | loss.backward() 71 | optimizer.step() 72 | 73 | 74 | ### Plot the solution at different times 75 | 76 | mesh = 128 77 | a, b = -torch.pi, torch.pi 78 | 79 | gs = GridSpec(2, 4, width_ratios=[1, 1, 1, 0.05]) 80 | fig 
= plt.figure(figsize=(16, 10), dpi=300) 81 | 82 | x, y = torch.meshgrid( 83 | torch.linspace(a, b, mesh), 84 | torch.linspace(a, b, mesh), 85 | indexing="xy" 86 | ) 87 | x = x.reshape((mesh * mesh, 1)).to(dev) 88 | y = y.reshape((mesh * mesh, 1)).to(dev) 89 | 90 | for i in range(6): 91 | t = torch.full((mesh * mesh, 1), i * T / 5).to(dev) 92 | z = N(torch.cat((t, x, y), 1)) 93 | z = z.detach().cpu().numpy().reshape((mesh, mesh)) 94 | 95 | ax = fig.add_subplot(gs[i // 3, i % 3]) 96 | ax.set_title(f"t = {i * T / 5}") 97 | ax.imshow( 98 | z, cmap="viridis", extent=[a, b, a, b], vmin=-1.2, vmax=1.2 99 | ) 100 | 101 | # Add the colorbar to the figure 102 | norm = plt.Normalize(vmin=-1.2, vmax=1.2) 103 | sm = ScalarMappable(cmap="viridis", norm=norm) 104 | cax = fig.add_subplot(gs[:, 3]) 105 | fig.colorbar(sm, cax=cax, orientation='vertical') 106 | 107 | fig.savefig("../plots/dgm.pdf", bbox_inches="tight") -------------------------------------------------------------------------------- /code/pinn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib.pyplot as plt 3 | from torch.autograd import grad 4 | from matplotlib.gridspec import GridSpec 5 | from matplotlib.cm import ScalarMappable 6 | 7 | 8 | dev = torch.device("cuda:0" if torch.cuda.is_available() else 9 | "cpu") 10 | 11 | T = 3.0 # the time horizom 12 | M = 20000 # the number of training samples 13 | 14 | torch.manual_seed(0) 15 | 16 | x_data = torch.randn(M, 2).to(dev) * 2 17 | t_data = torch.rand(M, 1).to(dev) * T 18 | 19 | # The initial value 20 | def phi(x): 21 | return x.square().sum(axis=1, keepdims=True).sin() 22 | 23 | # We use a network with 4 hidden layers of 50 neurons each and the 24 | # Swish activation function (called SiLU in PyTorch) 25 | N = torch.nn.Sequential( 26 | torch.nn.Linear(3, 50), torch.nn.SiLU(), 27 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 28 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 29 | torch.nn.Linear(50, 50), torch.nn.SiLU(), 30 | torch.nn.Linear(50, 1), 31 | ).to(dev) 32 | 33 | optimizer = torch.optim.Adam(N.parameters(), lr=3e-4) 34 | 35 | J = 256 # the batch size 36 | 37 | for i in range(20000): 38 | # Choose a random batch of training samples 39 | indices = torch.randint(0, M, (J,)) 40 | x = x_data[indices, :] 41 | t = t_data[indices, :] 42 | 43 | x1, x2 = x[:, 0:1], x[:, 1:2] 44 | 45 | x1.requires_grad_() 46 | x2.requires_grad_() 47 | t.requires_grad_() 48 | 49 | optimizer.zero_grad() 50 | 51 | # Denoting by u the realization function of the ANN, compute 52 | # u(0, x) for each x in the batch 53 | u0 = N(torch.hstack((torch.zeros_like(t), x))) 54 | # Compute the loss for the initial condition 55 | initial_loss = (u0 - phi(x)).square().mean() 56 | 57 | # Compute the partial derivatives using automatic 58 | # differentiation 59 | u = N(torch.hstack((t, x1, x2))) 60 | ones = torch.ones_like(u) 61 | u_t = grad(u, t, ones, create_graph=True)[0] 62 | u_x1 = grad(u, x1, ones, create_graph=True)[0] 63 | u_x2 = grad(u, x2, ones, create_graph=True)[0] 64 | ones = torch.ones_like(u_x1) 65 | u_x1x1 = grad(u_x1, x1, ones, create_graph=True)[0] 66 | u_x2x2 = grad(u_x2, x2, ones, create_graph=True)[0] 67 | 68 | # Compute the loss for the PDE 69 | Laplace = u_x1x1 + u_x2x2 70 | pde_loss = (u_t - (0.005 * Laplace + u - u**3)).square().mean() 71 | 72 | # Compute the total loss and perform a gradient step 73 | loss = initial_loss + pde_loss 74 | loss.backward() 75 | optimizer.step() 76 | 77 | 78 | ### Plot the solution at different times 79 | 80 
| mesh = 128 81 | a, b = -3, 3 82 | 83 | gs = GridSpec(2, 4, width_ratios=[1, 1, 1, 0.05]) 84 | fig = plt.figure(figsize=(16, 10), dpi=300) 85 | 86 | x, y = torch.meshgrid( 87 | torch.linspace(a, b, mesh), 88 | torch.linspace(a, b, mesh), 89 | indexing="xy" 90 | ) 91 | x = x.reshape((mesh * mesh, 1)).to(dev) 92 | y = y.reshape((mesh * mesh, 1)).to(dev) 93 | 94 | for i in range(6): 95 | t = torch.full((mesh * mesh, 1), i * T / 5).to(dev) 96 | z = N(torch.cat((t, x, y), 1)) 97 | z = z.detach().cpu().numpy().reshape((mesh, mesh)) 98 | 99 | ax = fig.add_subplot(gs[i // 3, i % 3]) 100 | ax.set_title(f"t = {i * T / 5}") 101 | ax.imshow( 102 | z, cmap="viridis", extent=[a, b, a, b], vmin=-1.2, vmax=1.2 103 | ) 104 | 105 | # Add the colorbar to the figure 106 | norm = plt.Normalize(vmin=-1.2, vmax=1.2) 107 | sm = ScalarMappable(cmap="viridis", norm=norm) 108 | cax = fig.add_subplot(gs[:, 3]) 109 | fig.colorbar(sm, cax=cax, orientation='vertical') 110 | 111 | fig.savefig("../plots/pinn.pdf", bbox_inches="tight") 112 | -------------------------------------------------------------------------------- /code/mnist_optim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.datasets as datasets 3 | import torchvision.transforms as transforms 4 | import torch.nn as nn 5 | import torch.utils.data as data 6 | import torch.optim as optim 7 | import matplotlib.pyplot as plt 8 | from matplotlib.ticker import ScalarFormatter, NullFormatter 9 | import copy 10 | 11 | # Set device as GPU if available or CPU otherwise 12 | device = torch.device( 13 | "cuda" if torch.cuda.is_available() else "cpu" 14 | ) 15 | 16 | # Fix a random seed 17 | torch.manual_seed(0) 18 | 19 | # Load the MNIST training and test datasets 20 | mnist_train = datasets.MNIST( 21 | "./data", 22 | train=True, 23 | transform=transforms.ToTensor(), 24 | download=True, 25 | ) 26 | mnist_test = datasets.MNIST( 27 | "./data", 28 | train=False, 29 | transform=transforms.ToTensor(), 30 | download=True, 31 | ) 32 | train_loader = data.DataLoader( 33 | mnist_train, batch_size=64, shuffle=True 34 | ) 35 | test_loader = data.DataLoader( 36 | mnist_test, batch_size=64, shuffle=False 37 | ) 38 | 39 | # Define a neural network 40 | net = nn.Sequential( # input shape (N, 1, 28, 28) 41 | nn.Conv2d(1, 5, 5), # (N, 5, 24, 24) 42 | nn.ReLU(), 43 | nn.Conv2d(5, 5, 3), # (N, 5, 22, 22) 44 | nn.ReLU(), 45 | nn.Conv2d(5, 3, 3), # (N, 3, 20, 20) 46 | nn.ReLU(), 47 | nn.Flatten(), # (N, 3 * 16 * 16) = (N, 1200) 48 | nn.Linear(1200, 128), # (N, 128) 49 | nn.ReLU(), 50 | nn.Linear(128, 10), # output shape (N, 10) 51 | ).to(device) 52 | 53 | # Save the initial state of the neural network 54 | initial_state = copy.deepcopy(net.state_dict()) 55 | 56 | # Define the loss function 57 | loss_fn = nn.CrossEntropyLoss() 58 | 59 | # Define the optimizers that we want to compare. 
Each entry in the 60 | # list is a tuple of a label (for the plot) and an optimizer 61 | optimizers = [ 62 | # For SGD we use a learning rate of 0.001 63 | ( 64 | "SGD", 65 | optim.SGD(net.parameters(), lr=1e-3), 66 | ), 67 | ( 68 | "SGD with momentum", 69 | optim.SGD(net.parameters(), lr=1e-3, momentum=0.9), 70 | ), 71 | ( 72 | "Nesterov SGD", 73 | optim.SGD( 74 | net.parameters(), lr=1e-3, momentum=0.9, nesterov=True 75 | ), 76 | ), 77 | # For the adaptive optimization methods we use the default 78 | # hyperparameters 79 | ( 80 | "RMSprop", 81 | optim.RMSprop(net.parameters()), 82 | ), 83 | ( 84 | "Adagrad", 85 | optim.Adagrad(net.parameters()), 86 | ), 87 | ( 88 | "Adadelta", 89 | optim.Adadelta(net.parameters()), 90 | ), 91 | ( 92 | "Adam", 93 | optim.Adam(net.parameters()), 94 | ), 95 | ] 96 | 97 | def compute_test_loss_and_accuracy(): 98 | total_test_loss = 0.0 99 | correct_count = 0 100 | with torch.no_grad(): 101 | for images, labels in test_loader: 102 | images = images.to(device) 103 | labels = labels.to(device) 104 | 105 | output = net(images) 106 | loss = loss_fn(output, labels) 107 | 108 | total_test_loss += loss.item() * images.size(0) 109 | pred_labels = torch.max(output, dim=1).indices 110 | correct_count += torch.sum( 111 | pred_labels == labels 112 | ).item() 113 | 114 | avg_test_loss = total_test_loss / len(mnist_test) 115 | accuracy = correct_count / len(mnist_test) 116 | 117 | return (avg_test_loss, accuracy) 118 | 119 | 120 | loss_plots = [] 121 | accuracy_plots = [] 122 | 123 | test_interval = 100 124 | 125 | for _, optimizer in optimizers: 126 | train_losses = [] 127 | accuracies = [] 128 | print(optimizer) 129 | 130 | with torch.no_grad(): 131 | net.load_state_dict(initial_state) 132 | 133 | i = 0 134 | for e in range(5): 135 | print(f"Epoch {e+1}") 136 | for images, labels in train_loader: 137 | images = images.to(device) 138 | labels = labels.to(device) 139 | 140 | optimizer.zero_grad() 141 | output = net(images) 142 | loss = loss_fn(output, labels) 143 | loss.backward() 144 | optimizer.step() 145 | 146 | train_losses.append(loss.item()) 147 | 148 | if (i + 1) % test_interval == 0: 149 | ( 150 | test_loss, 151 | accuracy, 152 | ) = compute_test_loss_and_accuracy() 153 | print(accuracy) 154 | accuracies.append(accuracy) 155 | 156 | i += 1 157 | 158 | loss_plots.append(train_losses) 159 | accuracy_plots.append(accuracies) 160 | 161 | WINDOW = 200 162 | 163 | _, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12)) 164 | ax1.set_yscale("log") 165 | ax2.set_yscale("logit") 166 | ax2.yaxis.set_major_formatter(ScalarFormatter()) 167 | ax2.yaxis.set_minor_formatter(NullFormatter()) 168 | for (label, _), train_losses, accuracies in zip( 169 | optimizers, loss_plots, accuracy_plots 170 | ): 171 | ax1.plot( 172 | [ 173 | sum(train_losses[max(0,i-WINDOW) : i]) / min(i, WINDOW) 174 | for i in range(1,len(train_losses)) 175 | ], 176 | label=label, 177 | ) 178 | ax2.plot( 179 | range(0, len(accuracies) * test_interval, test_interval), 180 | accuracies, 181 | label=label, 182 | ) 183 | 184 | ax1.legend() 185 | 186 | plt.tight_layout() 187 | plt.savefig("../plots/mnist_optim.pdf", bbox_inches="tight") 188 | -------------------------------------------------------------------------------- /code/mnist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.datasets as datasets 3 | import torchvision.transforms as transforms 4 | import torch.nn as nn 5 | import torch.utils.data as data 6 | import torch.optim as optim 7 
| import matplotlib.pyplot as plt 8 | from matplotlib.ticker import ScalarFormatter, NullFormatter 9 | 10 | # We use the GPU if available. Otherwise, we use the CPU. 11 | device = torch.device( 12 | "cuda" if torch.cuda.is_available() else "cpu" 13 | ) 14 | 15 | # We fix a random seed. This is not necessary for training a 16 | # neural network, but we use it here to ensure that the same 17 | # plot is created on every run. 18 | torch.manual_seed(0) 19 | 20 | # The torch.utils.data.Dataset class is an abstraction for a 21 | # collection of instances that has a length and can be indexed 22 | # (usually by integers). 23 | # The torchvision.datasets module contains functions for loading 24 | # popular machine learning datasets, possibly downloading and 25 | # transforming the data. 26 | 27 | # Here we load the MNIST dataset, containing 28x28 grayscale images 28 | # of handwritten digits with corresponding labels in 29 | # {0, 1, ..., 9}. 30 | 31 | # First load the training portion of the dataset, downloading it 32 | # from an online source to the local folder ./data (if it is not 33 | # yet there) and transforming the data to PyTorch Tensors. 34 | mnist_train = datasets.MNIST( 35 | "./data", 36 | train=True, 37 | transform=transforms.ToTensor(), 38 | download=True, 39 | ) 40 | # Next load the test portion 41 | mnist_test = datasets.MNIST( 42 | "./data", 43 | train=False, 44 | transform=transforms.ToTensor(), 45 | download=True, 46 | ) 47 | 48 | # The torch.utils.data.DataLoader class allows iterating over 49 | # datasets for training and validation. It supports, e.g., 50 | # batching and shuffling of datasets. 51 | 52 | # Construct a DataLoader that, when iterated over, returns 53 | # minibatches of 64 instances drawn from a random permutation 54 | # of the training dataset 55 | train_loader = data.DataLoader( 56 | mnist_train, batch_size=64, shuffle=True 57 | ) 58 | # The loader for the test dataset does not need shuffling 59 | test_loader = data.DataLoader( 60 | mnist_test, batch_size=64, shuffle=False 61 | ) 62 | 63 | # Define a neural network with 3 convolutional layers, each 64 | # followed by a ReLU activation, and then two affine layers, 65 | # the first of which is followed by a ReLU activation 66 | net = nn.Sequential( # input shape (N, 1, 28, 28) 67 | nn.Conv2d(1, 5, 5), # (N, 5, 24, 24) 68 | nn.ReLU(), 69 | nn.Conv2d(5, 5, 5), # (N, 5, 20, 20) 70 | nn.ReLU(), 71 | nn.Conv2d(5, 3, 5), # (N, 3, 16, 16) 72 | nn.ReLU(), 73 | nn.Flatten(), # (N, 3 * 16 * 16) = (N, 768) 74 | nn.Linear(768, 128), # (N, 128) 75 | nn.ReLU(), 76 | nn.Linear(128, 10), # output shape (N, 10) 77 | ).to(device) 78 | 79 | # Define the loss function. For every natural number d, for 80 | # e_1, e_2, ..., e_d the standard basis vectors in R^d, for L the 81 | # d-dimensional cross-entropy loss function, and for A the 82 | # d-dimensional softmax activation function, the function loss_fn 83 | # defined here satisfies for all x in R^d and all integers 84 | # i in {0, 1, ..., d-1} that 85 | # loss_fn(x, i) = L(A(x), e_{i+1}). 86 | # The function loss_fn also accepts batches of inputs, in which 87 | # case it will return the mean of the corresponding outputs. 88 | loss_fn = nn.CrossEntropyLoss() 89 | 90 | # Define the optimizer. We use the Adam SGD optimization method. 91 | optimizer = optim.Adam(net.parameters(), lr=1e-3) 92 | 93 | # This function computes the average loss of the model over the 94 | # entire test set and the accuracy of the model's predictions.
95 | def compute_test_loss_and_accuracy(): 96 | total_test_loss = 0.0 97 | correct_count = 0 98 | with torch.no_grad(): 99 | # On each iteration the test_loader will yield a 100 | # minibatch of images with corresponding labels 101 | for images, labels in test_loader: 102 | # Move the data to the device 103 | images = images.to(device) 104 | labels = labels.to(device) 105 | # Compute the output of the neural network on the 106 | # current minibatch 107 | output = net(images) 108 | # Compute the mean of the cross-entropy losses 109 | loss = loss_fn(output, labels) 110 | # For the cumulative total_test_loss, we multiply the loss by 111 | # the batch size (usually 64, as specified above, but might be 112 | # less for the final batch). 113 | total_test_loss += loss.item() * images.size(0) 114 | # For each input, the predicted label is the index of 115 | # the maximal component in the output vector. 116 | pred_labels = torch.max(output, dim=1).indices 117 | # pred_labels == labels compares the two vectors 118 | # componentwise and returns a vector of booleans. 119 | # Summing over this vector counts the number of True 120 | # entries. 121 | correct_count += torch.sum( 122 | pred_labels == labels 123 | ).item() 124 | avg_test_loss = total_test_loss / len(mnist_test) 125 | accuracy = correct_count / len(mnist_test) 126 | return (avg_test_loss, accuracy) 127 | 128 | 129 | # Initialize a list that holds the computed loss on every 130 | # batch during training 131 | train_losses = [] 132 | 133 | # Every 10 batches, we will compute the loss on the entire test 134 | # set as well as the accuracy of the model's predictions on the 135 | # entire test set. We do this for the purpose of illustrating in 136 | # the produced plot the generalization capability of the ANN. 137 | # Computing these losses and accuracies so frequently with such a 138 | # relatively large set of datapoints (compared to a single 139 | # training minibatch) is extremely computationally expensive, 140 | # however (most of the training runtime will be spent computing 141 | # these values), and so is not advisable during normal neural network training. 142 | # Usually, the test set is only used at the very end to judge the 143 | # performance of the final trained network. Often, a third set of 144 | # datapoints, called the validation set (not used to train the 145 | # network directly nor to evaluate it at the end), is used to 146 | # judge overfitting or to tune hyperparameters. 147 | test_interval = 10 148 | test_losses = [] 149 | accuracies = [] 150 | 151 | # We run the training for 5 epochs, i.e., 5 full iterations 152 | # through the training set. 153 | i = 0 154 | for e in range(5): 155 | for images, labels in train_loader: 156 | # Move the data to the device 157 | images = images.to(device) 158 | labels = labels.to(device) 159 | 160 | # Zero out the gradients 161 | optimizer.zero_grad() 162 | # Compute the output of the neural network on the current 163 | # minibatch 164 | output = net(images) 165 | # Compute the cross-entropy loss 166 | loss = loss_fn(output, labels) 167 | # Compute the gradients 168 | loss.backward() 169 | # Update the parameters of the neural network 170 | optimizer.step() 171 | 172 | # Append the current loss to the list of training losses. 173 | # Note that tracking the training loss comes at 174 | # essentially no computational cost (since we have to 175 | # compute these values anyway) and so is typically done 176 | # during neural network training to gauge the training 177 | # progress.
178 | train_losses.append(loss.item()) 179 | 180 | if (i + 1) % test_interval == 0: 181 | # Compute the average loss on the test set and the 182 | # accuracy of the model and add the values to the 183 | # corresponding list 184 | test_loss, accuracy = compute_test_loss_and_accuracy() 185 | test_losses.append(test_loss) 186 | accuracies.append(accuracy) 187 | 188 | i += 1 189 | 190 | fig, ax1 = plt.subplots(figsize=(12, 8)) 191 | # We plot the training losses, test losses, and accuracies in the 192 | # same plot, but using two different y-axes 193 | ax2 = ax1.twinx() 194 | 195 | # Use a logarithmic scale for the losses 196 | ax1.set_yscale("log") 197 | # Use a logit scale for the accuracies 198 | ax2.set_yscale("logit") 199 | ax2.set_ylim((0.3, 0.99)) 200 | N = len(test_losses) * test_interval 201 | ax2.set_xlim((0, N)) 202 | # Plot the training losses 203 | (training_loss_line,) = ax1.plot( 204 | train_losses, 205 | label="Training loss (left axis)", 206 | ) 207 | # Plot test losses 208 | (test_loss_line,) = ax1.plot( 209 | range(0, N, test_interval), 210 | test_losses, 211 | label="Test loss (left axis)", 212 | ) 213 | # Plot the accuracies 214 | (accuracies_line,) = ax2.plot( 215 | range(0, N, test_interval), 216 | accuracies, 217 | label="Accuracy (right axis)", 218 | color="red", 219 | ) 220 | ax2.yaxis.set_major_formatter(ScalarFormatter()) 221 | ax2.yaxis.set_minor_formatter(NullFormatter()) 222 | 223 | # Put all the labels in a common legend 224 | lines = [training_loss_line, test_loss_line, accuracies_line] 225 | labels = [l.get_label() for l in lines] 226 | ax2.legend(lines, labels) 227 | 228 | plt.tight_layout() 229 | plt.savefig("../plots/mnist.pdf", bbox_inches="tight") 230 | --------------------------------------------------------------------------------
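A note on the loss function used in mnist.py: the comment preceding loss_fn = nn.CrossEntropyLoss() states that this loss, applied to a logit vector x and a class index i, equals the cross-entropy between the softmax of x and the corresponding one-hot basis vector, i.e. the negative logarithm of the softmax probability assigned to class i. The snippet below is not part of the repository; it is a minimal standalone sketch that checks this identity numerically, assuming only a standard PyTorch installation, with the dimension 10 and the class index 3 chosen arbitrarily for illustration.

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()

# An arbitrary logit vector x in R^10 and an arbitrary class index i
x = torch.randn(10)
i = 3

# nn.CrossEntropyLoss expects a batch dimension, so we add one
lhs = loss_fn(x.unsqueeze(0), torch.tensor([i]))

# Negative log of the softmax probability of class i, i.e. the
# cross-entropy of softmax(x) against the one-hot target vector
probs = torch.softmax(x, dim=0)
rhs = -torch.log(probs[i])

# The two values agree up to floating-point error
print(torch.allclose(lhs, rhs))

Running the snippet should print True; nn.CrossEntropyLoss computes the same quantity internally via log-softmax, which is numerically more stable than taking the logarithm of the softmax explicitly.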