├── .gitignore
├── README.md
├── assets
└── training_example.png
├── dataset.py
├── requirements.txt
└── train.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Debugging Challenge
2 |
3 | ## About
4 |
5 | The goal of this challenge is to practice debugging a neural network implementation.
6 | This is a simple ConvNet trained on MNIST classficiation. If you fix the bugs,
7 | it should achieve about 99.5% accuracy on the test set after 10 epochs, which
8 | you should be able to train on your laptop in a few minutes.
9 |
10 | However, there are several bugs present that you will need to fix to get that
11 | performance.
12 |
13 | ## Getting started
14 |
15 | ### 0. Set up a pyenv virtual environment (recommended)
16 |
17 | Note: it is not necessary to use pyenv, but you *do need python 3.6*.
18 |
19 | Follow this tutorial to install pyenv and learn about it:
20 |
21 | ```
22 | https://amaral.northwestern.edu/resources/guides/pyenv-tutorial
23 | ```
24 |
25 | Then create a virtual env for this project:
26 |
27 | ```
28 | pyenv virtualenv 3.6.5 debugging-challenge
29 | pyenv activate 3.6.5/envs/debugging-challenge
30 | ```
31 |
32 | ### 1. Install requirements
33 |
34 | Run the following:
35 |
36 | ```
37 | pip install -r requirements.txt
38 | ```
39 |
40 | ### 2. Try to run training
41 |
42 | Run the following:
43 |
44 | ```
45 | python train.py
46 | ```
47 |
48 | Instead of training on the entire dataset, you can try overfitting a single
49 | batch by running:
50 |
51 | ```
52 | python train.py --overfit-batch --n-epochs 200
53 | ```
54 |
55 | ### 3. Happy bug hunting!
56 |
57 | As a hint, there are (at least) seven bugs total in the codebase.
58 |
59 | You can look at a corrected solution in the git branch called `working`.
60 |
61 |
62 | What should training look like?
63 |
64 | See assets/training_example.png
65 |
66 |
67 |
68 | Bug 1
69 |
70 | You need to pass reuse=True to the layers for the test network.
71 |
72 |
73 | Bug 2
74 |
75 | Reshaping of the output of the pooling layer is incorrect.
76 |
77 |
78 | Bug 3
79 |
80 | Output of the network is incorrect. The softmax cross entropy loss function
81 | requires logits, but we have already taken the softmax. Change the activation
82 | for the last layer to None.
83 |
84 |
85 | Bug 4
86 |
87 | Incorrect input scaling. tf.image.convert_image_dtype already scales the
88 | values to [0, 1), so we are doing it twice.
89 |
90 |
91 | Bug 5
92 |
93 | Over augmentation. Crop value of 0.1 is way too small - it's meant to be 0.9.
94 |
95 |
96 | Bug 6
97 |
98 | Not removing augmentation at test time. augment_example method should only
99 | be used on the training set.
100 |
101 |
102 | Bug 7
103 |
104 | Using regularization at test time. Dropout should be turned off at test time.
105 |
--------------------------------------------------------------------------------
/assets/training_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josh-tobin/debugging-challenge/21177bda401cd0128aefbce1b80e1bed98676ad7/assets/training_example.png
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
1 | from tensorflow.keras.datasets import mnist
2 | import tensorflow as tf
3 | from abc import abstractmethod, ABC
4 |
5 | class MNISTDataset(ABC):
6 | def __init__(self, batch_size, overfit_batch=False):
7 | x, y = self.load_raw_data()
8 | dset = tf.data.Dataset.from_tensor_slices((x, y))
9 | if overfit_batch:
10 | dset = dset.take(batch_size)
11 | dset = dset.map(self.preprocess_example)
12 | if not overfit_batch:
13 | dset = dset.map(self.augment_example)
14 | dset = dset.shuffle(10000)
15 | dset = dset.batch(batch_size)
16 | self.dset = dset
17 | self.iterator = self.dset.make_initializable_iterator()
18 | self.next_batch = self.iterator.get_next()
19 |
20 | def init(self, sess):
21 | sess.run(self.iterator.initializer)
22 |
23 | @abstractmethod
24 | def load_raw_data(self):
25 | pass
26 |
27 | def preprocess_example(self, *example):
28 | # Convert to float
29 | new_image = tf.image.convert_image_dtype(tf.reshape(example[0], [28, 28, 1]),
30 | tf.float32)
31 | # Scale to [0, 1)
32 | new_image = new_image / 255.
33 | new_label = tf.cast(tf.one_hot(example[1], 10), tf.float32)
34 | return new_image, new_label
35 |
36 | def augment_example(self, *example):
37 | new_image = example[0]
38 | rotation = 20
39 | crop = 0.1
40 |
41 | transforms = []
42 | with tf.name_scope('augmentation'):
43 | shp = tf.shape(new_image)
44 | height, width = shp[0], shp[1]
45 | width = tf.cast(width, tf.float32)
46 | height = tf.cast(height, tf.float32)
47 |
48 | identity = tf.constant([1, 0, 0, 0, 1, 0, 0, 0], dtype=tf.float32)
49 |
50 | if rotation > 0:
51 | angle_rad = rotation * 3.141592653589793 / 180.0
52 | angles = tf.random_uniform([], -angle_rad, angle_rad)
53 | f = tf.contrib.image.angles_to_projective_transforms(angles,
54 | height, width)
55 | new_image = tf.contrib.image.transform(new_image, f)
56 |
57 | if crop < 1:
58 | crop_value = tf.random_uniform([], crop, 1.0)
59 | crop_size = tf.floor(28 * crop_value)
60 | cropped = tf.random_crop(new_image, [crop_size, crop_size, 1])
61 | new_image = tf.image.resize_images(tf.expand_dims(cropped, 0), [28, 28])[0]
62 |
63 | return new_image, example[1]
64 |
65 | class MNISTTrain(MNISTDataset):
66 | def load_raw_data(self):
67 | x, y = mnist.load_data()[0]
68 | return x, y
69 |
70 | class MNISTTest(MNISTDataset):
71 | def load_raw_data(self):
72 | x, y = mnist.load_data()[1]
73 | return x, y
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==1.12
2 | click==7.0
3 | ipdb==0.12
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import click
3 | from dataset import MNISTTrain, MNISTTest
4 | import numpy as np
5 |
6 | def lenet(input, scope='lenet'):
7 | with tf.variable_scope(scope):
8 | conv1 = tf.layers.conv2d(inputs=input,
9 | filters=32,
10 | kernel_size=[5, 5],
11 | padding='same',
12 | activation=tf.nn.relu)
13 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
14 |
15 | conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=[5, 5],
16 | padding='same', activation=tf.nn.relu)
17 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
18 |
19 | pool2_flat = tf.reshape(pool2, [-1, 256])
20 | dense = tf.layers.dense(inputs=pool2_flat, units=512, activation=tf.nn.relu)
21 | dropout = tf.layers.dropout(inputs=dense, rate=0.25)
22 | outputs = tf.layers.dense(inputs=dense, units=10, activation=tf.nn.softmax)
23 | return outputs
24 |
25 | def train(n_epochs, learning_rate, batch_size, overfit_batch):
26 | train_dataset = MNISTTrain(batch_size, overfit_batch=overfit_batch)
27 | test_dataset = MNISTTest(batch_size, overfit_batch=overfit_batch)
28 |
29 | x, y = train_dataset.next_batch
30 | y_pred = lenet(x)
31 | train_loss = tf.losses.softmax_cross_entropy(y, y_pred)
32 | train_acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1),
33 | tf.argmax(y_pred, 1)),
34 | tf.float32))
35 |
36 | opt = tf.train.AdamOptimizer(learning_rate)
37 | train_op = opt.minimize(train_loss)
38 |
39 | x_test, y_test = test_dataset.next_batch
40 | y_pred_test = lenet(x_test)
41 | test_loss = tf.losses.softmax_cross_entropy(y_test, y_pred_test)
42 | test_acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_test, 1),
43 | tf.argmax(y_pred_test, 1)),
44 | tf.float32))
45 |
46 | def eval_test(sess):
47 | test_dataset.init(sess)
48 | errs = []
49 | accs = []
50 | try:
51 | while True:
52 | err, acc = sess.run([test_loss, test_acc])
53 | errs.append(err)
54 | accs.append(acc)
55 | except tf.errors.OutOfRangeError:
56 | return np.mean(errs), np.mean(accs)
57 |
58 | with tf.Session() as sess:
59 | sess.run(tf.global_variables_initializer())
60 | for epoch in range(n_epochs):
61 | train_dataset.init(sess)
62 | epoch_err = []
63 | epoch_acc = []
64 | try:
65 | while True:
66 | _, err, acc = sess.run([train_op, train_loss, train_acc])
67 | epoch_err.append(err)
68 | epoch_acc.append(acc)
69 | except tf.errors.OutOfRangeError:
70 | print(f"Epoch {epoch}:")
71 | print(f" - Train err: {np.mean(epoch_err)}")
72 | print(f" - Train acc: {np.mean(epoch_acc)}")
73 | # BUG: Overwrite test_err and test_acc here
74 | epoch_test_err, epoch_test_acc = eval_test(sess)
75 | print(f" - Test err: {epoch_test_err}")
76 | print(f" - Test acc: {epoch_test_acc}")
77 |
78 | @click.command()
79 | @click.option('--n-epochs', type=int, default=5)
80 | @click.option('--lr', type=float, default=3e-4)
81 | @click.option('--batch-size', type=int, default=32)
82 | @click.option('--overfit-batch', is_flag=True, default=False)
83 | def main(n_epochs, lr, batch_size, overfit_batch):
84 | train(n_epochs, lr, batch_size, overfit_batch)
85 |
86 | if __name__ == '__main__':
87 | main()
88 |
--------------------------------------------------------------------------------