├── HW1 ├── HW1P1_Writeup.pdf ├── hw1_p1 │ ├── autograder │ │ └── hw1_autograder │ │ │ ├── data.pkl │ │ │ ├── problems.py │ │ │ ├── runner.py │ │ │ ├── setup.cfg │ │ │ ├── test.py │ │ │ └── tests │ │ │ ├── helpers │ │ │ └── helpers.py │ │ │ └── test_problems.py │ ├── create_tarball.sh │ ├── hw1 │ │ ├── hw1.py │ │ ├── mc.py │ │ ├── train_error.png │ │ ├── train_loss.png │ │ ├── val_error.png │ │ └── val_loss.png │ └── mytorch │ │ ├── activation.py │ │ ├── batchnorm.py │ │ ├── linear.py │ │ └── loss.py └── hw1_p2 │ ├── README.md │ └── main.py ├── HW2 ├── HW2P1_Writeup.pdf ├── HW2P2_Writeup.pdf ├── hw2p1 │ ├── autograder │ │ └── hw2_autograder │ │ │ ├── ref_result │ │ │ ├── res_b.npy │ │ │ └── res_c.npy │ │ │ ├── runner.py │ │ │ └── weights │ │ │ ├── mlp_weights_part_b.npy │ │ │ └── mlp_weights_part_c.npy │ ├── create_tarball.sh │ ├── exclude.txt │ ├── hw2 │ │ ├── hw2.py │ │ ├── mc.py │ │ ├── mlp.py │ │ └── mlp_scan.py │ └── mytorch │ │ ├── activation.py │ │ ├── batchnorm.py │ │ ├── conv.py │ │ ├── linear.py │ │ └── loss.py └── hw2p2 │ ├── README.md │ ├── hw2p2_writeup_submit.pdf │ ├── p2.ipynb │ └── p2.py ├── HW3 ├── HW3P1_Writeup.pdf ├── HW3P2_Writeup.pdf ├── hw3p1 │ ├── autograder │ │ └── hw3_autograder │ │ │ ├── runner.py │ │ │ ├── test.py │ │ │ └── test_rnn.py │ ├── create_tarball.sh │ ├── hw3 │ │ ├── hw3.py │ │ ├── mc.py │ │ └── rnn_classifier.py │ └── mytorch │ │ ├── activation.py │ │ ├── gru_cell.py │ │ ├── linear.py │ │ ├── loss.py │ │ ├── rnn_cell.py │ │ └── search.py └── hw3p2 │ ├── README.md │ ├── notebooks │ ├── CTC.ipynb │ ├── language_model.ipynb │ └── p2.ipynb │ └── p2.py ├── HW4 ├── HW4P1_Writeup.pdf ├── HW4P2_Writeup.pdf ├── Weight_Decay.png ├── hw4p1 │ └── handout │ │ ├── Makefile │ │ ├── fixtures │ │ ├── generation.npy │ │ ├── generation_test.npy │ │ ├── prediction.npz │ │ └── prediction_test.npz │ │ └── hw4 │ │ ├── tests.py │ │ └── training.ipynb └── hw4p2 │ ├── README.md │ ├── dataloader.py │ ├── main.py │ ├── models.py │ ├── p2.ipynb │ ├── 
plot.py │ └── train_test.py └── README.md /HW1/HW1P1_Writeup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW1/HW1P1_Writeup.pdf -------------------------------------------------------------------------------- /HW1/hw1_p1/autograder/hw1_autograder/data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW1/hw1_p1/autograder/hw1_autograder/data.pkl -------------------------------------------------------------------------------- /HW1/hw1_p1/autograder/hw1_autograder/problems.py: -------------------------------------------------------------------------------- 1 | """ 2 | Format of entries is `test name: (autolab problem name, score)` 3 | """ 4 | problems = { 5 | 'test_mcq':('Multiple Choice Questions', 5), 6 | 7 | 'test_sigmoid_forward': ('Sigmoid Non-Linearity (forward)', 2), 8 | 'test_sigmoid_derivative': ('Sigmoid Non-Linearity (derivative)', 2), 9 | 'test_tanh_forward': ('Tanh Non-Linearity (forward)', 2), 10 | 'test_tanh_derivative': ('Tanh Non-Linearity (derivative)', 2), 11 | 'test_relu_forward': ('ReLU Non-Linearity (forward)', 2), 12 | 'test_relu_derivative': ('ReLU Non-Linearity (derivative)', 2), 13 | 14 | 'test_softmax_cross_entropy_forward': ('Softmax Cross Entropy (forward)', 2), 15 | 'test_softmax_cross_entropy_derivative': ('Softmax Cross Entropy (derivative)', 2), 16 | 17 | 'test_batch_norm_train': ('Batch Normalization (training time)', 10), 18 | 'test_batch_norm_inference': ('Batch Normalization (inference time)', 5), 19 | 20 | 'test_linear_layer_forward': ('Linear Layer (Forward)',2), 21 | 'test_linear_layer_backward': ('Linear Layer (Backward)',2), 22 | 23 | 'test_linear_classifier_forward': ('Linear Classifier (forward)', 2), 24 | 
class LoggerPlugin(object):
    """Pytest plugin that records a pass/fail flag for each executed test.

    ``logs`` maps a test name to 1 when the test's ``call`` phase passed and
    to 0 otherwise. The test name is the last ``::``-separated component of
    the pytest node id, so both module-level tests
    (``file.py::test_x``) and class-grouped tests
    (``file.py::TestGroup::test_x``) are keyed by the test function name.
    """

    def __init__(self):
        self.logs = {}

    def pytest_runtest_logreport(self, report):
        """Store the outcome of the ``call`` phase of a test report.

        Args:
            report: object exposing ``when``, ``nodeid`` and ``outcome``
                attributes (pytest passes its test report here).
        """
        # Setup/teardown reports are ignored; only the test body is scored.
        if report.when != 'call':
            return
        # Use the last component: index 1 would be the class name for
        # tests that live inside a test class.
        name = report.nodeid.split('::')[-1]
        self.logs[name] = 1 if report.outcome == 'passed' else 0
os.path.dirname(os.path.abspath(__file__)) 34 | logger = LoggerPlugin() 35 | 36 | start = time.time() 37 | pytest.main(["-s","-qq", test_path], plugins=[logger]) 38 | end = time.time() 39 | print("Run time: ", end - start) 40 | 41 | scores = {} 42 | total = 0 43 | for k, v in six.iteritems(logger.logs): 44 | problem, value = problems[k] 45 | if problem not in scores: 46 | scores[problem] = 0 47 | scores[problem] += value * v 48 | total += value * v 49 | print(f' {"_"*40}{"_"*11}') 50 | print(f'|{"TASK":<40}|{"SCORE":<10}|') 51 | print(f'|{"_"*40}|{"_"*10}|') 52 | for task, score in scores.items(): 53 | print(f'|{task:<40}|{score:<10}|') 54 | print(f'|{"_"*40}|{"_"*10}|') 55 | print(f'|{"TOTAL SCORE":<40}|{total:<10}|') 56 | print(f'|{"_"*40}|{"_"*10}|') 57 | 58 | if __name__ == '__main__': 59 | main(sys.argv) 60 | -------------------------------------------------------------------------------- /HW1/hw1_p1/autograder/hw1_autograder/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | norecursedirs=tests/helpers 3 | pep8maxlinelength = 120 4 | -------------------------------------------------------------------------------- /HW1/hw1_p1/autograder/hw1_autograder/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | test.py 3 | 4 | 5 | Provide training code to test your current implementation on 6 | the MNIST dataset and visualize the training process. 7 | 8 | 9 | We hope this code will accelerate your development and debugging process, 10 | as well as introduce you to the importance of visualizing training statistics 11 | for debugging and tuning purposes. 
def make_one_hot(labels_idx):
    """Return a (len(labels_idx), 10) float array with a 1 at each label index.

    Args:
        labels_idx: 1-D integer array of class indices; used directly as
            column indices into a 10-column matrix.
    """
    labels = np.zeros((labels_idx.shape[0], 10))
    labels[np.arange(labels_idx.shape[0]), labels_idx] = 1
    return labels


def process_dset_partition(dset_partition, normalize=True):
    """Optionally standardize the data of a partition and one-hot its labels.

    Args:
        dset_partition: ``(data, labels_idx)`` pair of numpy arrays.
        normalize: when True, subtract the global mean of ``data`` and divide
            by its global standard deviation; when False, return the data
            values unchanged.

    Returns:
        ``(data, one_hot_labels)`` tuple.
    """
    data, labels_idx = dset_partition
    # The original one-liner
    #   mu, std = data.mean(), data.std() if normalize else (0, 1)
    # applied the conditional to the second tuple element only, so with
    # normalize=False std became the tuple (0, 1) and mu was still the mean.
    if normalize:
        mu, std = data.mean(), data.std()
    else:
        mu, std = 0, 1
    return (data - mu) / std, make_one_hot(labels_idx)
np.random.randn, bias_init, 72 | nn.SoftmaxCrossEntropy(), 73 | 1e-3, momentum=0.856) 74 | visualize_training_statistics(mlp, dset, epochs, batch_size, savepath) 75 | print("Saved output to {}".format(savepath)) 76 | 77 | 78 | def plotline(data, xlabel, ylabel, title, path): 79 | plt.plot(data) 80 | plt.xlabel(xlabel) 81 | plt.ylabel(ylabel) 82 | plt.title(title) 83 | plt.savefig(path) 84 | plt.clf() 85 | 86 | 87 | def visualize_training_statistics(mlp, dset, epochs, batch_size, savepath=None): 88 | print("Starting training") 89 | training_losses, training_errors, validation_losses, validation_errors = nn.get_training_stats( 90 | mlp, dset, epochs, batch_size) 91 | 92 | print(training_errors) 93 | path = os.getcwd() if savepath is None else savepath 94 | plotline(training_losses, "Epoch", "Loss", "Training Loss", 95 | os.path.join(path, "train_loss.png")) 96 | plotline(training_errors, "Epoch", "Error", "Training Error", 97 | os.path.join(path, "train_error.png")) 98 | plotline(validation_losses, "Epoch", "Loss", 99 | "Validation Loss", os.path.join(path, "val_loss.png")) 100 | plotline(validation_errors, "Epoch", "Error", 101 | "Validation Error", os.path.join(path, "val_error.png")) 102 | 103 | 104 | def parse_args(): 105 | parser = argparse.ArgumentParser(description="11-785 HW1P1 Visualizer") 106 | parser.add_argument('--outpath', type=str, default=None, 107 | help='Path to output') 108 | return parser.parse_args() 109 | 110 | 111 | def main(arglist): 112 | visualize(outpath=arglist.outpath) 113 | print("Done :)") 114 | 115 | 116 | if __name__ == "__main__": 117 | arglist = parse_args() 118 | main(arglist) 119 | -------------------------------------------------------------------------------- /HW1/hw1_p1/autograder/hw1_autograder/tests/helpers/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import six 3 | import sys 4 | PICKLE_KWARGS = {'encoding': 'latin1'} if six.PY3 else {} 5 | 6 | rtol = 
def isAllClose(a, b, tol=0.01):
    """Return True when ``a`` and ``b`` agree within absolute tolerance ``tol``.

    If either argument is a list, the two are compared element by element
    with ``np.allclose(..., atol=tol)``; lists of different length are
    reported as not close. Otherwise the arguments are compared directly
    with ``np.allclose``.

    Args:
        a: array-like, or list of array-likes.
        b: array-like, or list of array-likes.
        tol: absolute tolerance forwarded to ``np.allclose`` as ``atol``.

    Returns:
        bool
    """
    if isinstance(a, list) or isinstance(b, list):
        # Explicit length check instead of `assert`: asserts vanish under
        # `python -O`, after which zip() would silently truncate and a
        # shorter list could compare as "close".
        if len(a) != len(b):
            return False
        return all(np.allclose(i, j, atol=tol) for i, j in zip(a, b))
    return np.allclose(a, b, atol=tol)
"data.pkl", 'rb')) 11 | rtol = 1e-4 12 | atol = 1e-04 13 | TOLERANCE = 1e-4 14 | 15 | SEED = 2019 16 | if autolab: 17 | print("We are on Autolab") 18 | TRAINDATAPATH = "/datasets/11785/mnist_train.csv" 19 | TESTDATAPATH = "/datasets/11785/mnist_test.csv" 20 | sys.path.append('handin/') 21 | else: 22 | print("We are on local") 23 | TRAINDATAPATH = base_dir + "tests/data/mnist_train.csv" 24 | TESTDATAPATH = base_dir + "tests/data/mnist_test.csv" 25 | 26 | if os.path.exists(TRAINDATAPATH): 27 | print("Train data exists") 28 | if os.path.exists(TESTDATAPATH): 29 | print("Test data exists") 30 | 31 | sys.path.append('mytorch') 32 | import activation 33 | import loss 34 | import linear 35 | import batchnorm 36 | 37 | sys.path.append('hw1') 38 | import hw1 39 | import mc 40 | 41 | 42 | def raw_mnist(path): 43 | return (cleaned_mnist(path)) 44 | 45 | 46 | def cleaned_mnist(path): 47 | data = np.genfromtxt(path, delimiter=',') 48 | X = data[:, 1:] 49 | Y = data[:, 0] 50 | Y = Y.astype(int) 51 | return X, Y 52 | 53 | 54 | def reset_prng(): 55 | np.random.seed(11785) 56 | 57 | 58 | def weight_init(x, y): 59 | return np.random.randn(x, y) 60 | 61 | 62 | def bias_init(x): 63 | return np.zeros((1, x)) 64 | 65 | 66 | def test_mcq(): 67 | ref = ['b','a','a','a','c'] 68 | ans_1 = mc.question_1() 69 | ans_2 = mc.question_2() 70 | ans_3 = mc.question_3() 71 | ans_4 = mc.question_4() 72 | ans_5 = mc.question_5() 73 | ans = [ans_1,ans_2,ans_3,ans_4,ans_5] 74 | for i in range(len(ref)): 75 | closeness_test(np.array(ord(ans[i])),np.array(ord(ref[i])),"mc.question_%d" %(i+1)) 76 | 77 | 78 | def test_sigmoid_forward(): 79 | data = saved_data[5] 80 | t0 = data[0] 81 | gt = data[1] 82 | student = activation.Sigmoid() 83 | student(t0) 84 | closeness_test(student.state, gt, "sigmoid.state") 85 | 86 | 87 | def test_sigmoid_derivative(): 88 | data = saved_data[6] 89 | t0 = data[0] 90 | gt = data[1] 91 | student = activation.Sigmoid() 92 | student(t0) 93 | closeness_test(student.derivative(), gt, 
"sigmoid.derivative()") 94 | 95 | 96 | def test_tanh_forward(): 97 | data = saved_data[9] 98 | t0 = data[0] 99 | gt = data[1] 100 | student = activation.Tanh() 101 | student(t0) 102 | closeness_test(student.state, gt, "tanh.state") 103 | 104 | 105 | def test_tanh_derivative(): 106 | data = saved_data[10] 107 | t0 = data[0] 108 | gt = data[1] 109 | student = activation.Tanh() 110 | student(t0) 111 | closeness_test(student.derivative(), gt, "tanh.derivative()") 112 | 113 | 114 | def test_relu_forward(): 115 | data = saved_data[7] 116 | t0 = data[0] 117 | gt = data[1] 118 | student = activation.ReLU() 119 | student(t0) 120 | closeness_test(student.state, gt, "relu.state") 121 | 122 | 123 | def test_relu_derivative(): 124 | data = saved_data[8] 125 | t0 = data[0] 126 | gt = data[1] 127 | student = activation.ReLU() 128 | student(t0) 129 | closeness_test(student.derivative(), gt, "relu.derivative()") 130 | 131 | 132 | def test_softmax_cross_entropy_forward(): 133 | data = saved_data[0] 134 | x = data[0] 135 | y = data[1] 136 | sol = data[2] 137 | 138 | ce = loss.SoftmaxCrossEntropy() 139 | closeness_test(ce(x, y), sol, "ce(x, y)") 140 | 141 | 142 | def test_softmax_cross_entropy_derivative(): 143 | data = saved_data[1] 144 | x = data[0] 145 | y = data[1] 146 | sol = data[2] 147 | ce = loss.SoftmaxCrossEntropy() 148 | ce(x, y) 149 | closeness_test(ce.derivative(), sol, "ce.derivative()") 150 | 151 | 152 | 153 | 154 | def test_batch_norm_train(): 155 | data = saved_data[19] 156 | assert len(data) == 10 157 | x = data[0] 158 | y = data[1] 159 | soldW = data[2:5] 160 | soldb = data[5:8] 161 | soldbeta = data[8] 162 | soldgamma = data[9] 163 | 164 | reset_prng() 165 | 166 | mlp = hw1.MLP(784, 10, [64, 32], [activation.Sigmoid(), activation.Sigmoid(), activation.Identity()], 167 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 168 | momentum=0.0, num_bn_layers=1) 169 | 170 | mlp.forward(x) 171 | mlp.backward(y) 172 | 173 | dW = [x.dW for x in mlp.linear_layers] 
174 | db = [x.db for x in mlp.linear_layers] 175 | 176 | for i, (pred, gt) in enumerate(zip(dW, soldW)): 177 | closeness_test(pred, gt, "mlp.dW[%d]" % i) 178 | 179 | for i, (pred, gt) in enumerate(zip(db, soldb)): 180 | closeness_test(pred, gt, "mlp.db[%d]" % i) 181 | 182 | closeness_test(mlp.bn_layers[0].dbeta, soldbeta, "mlp.bn_layers[0].dbeta") 183 | closeness_test(mlp.bn_layers[0].dgamma, soldgamma, "mlp.bn_layers[0].dgamma") 184 | 185 | 186 | def test_batch_norm_inference(): 187 | num_examples = 1000 188 | data = saved_data[20] 189 | assert len(data) == 15 190 | x = data[0] 191 | y = data[1] 192 | soldbeta = data[2] 193 | soldgamma = data[3] 194 | xs = data[4] 195 | solground = data[5:] 196 | reset_prng() 197 | mlp = hw1.MLP(784, 10, [64, 32], [activation.Sigmoid(), activation.Sigmoid(), activation.Identity()], 198 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 199 | momentum=0.0, num_bn_layers=1) 200 | 201 | batch_size = 100 202 | mlp.train() 203 | for b in range(0, 1): 204 | mlp.zero_grads() 205 | mlp.forward(x[b:b + batch_size]) 206 | mlp.backward(y[b:b + batch_size]) 207 | mlp.step() 208 | closeness_test(mlp.bn_layers[0].dbeta, soldbeta, "mlp.bn_layers[0].dbeta") 209 | closeness_test(mlp.bn_layers[0].dgamma, soldgamma, "mlp.bn_layers[0].dgamma") 210 | 211 | for b in range(0, num_examples, batch_size): 212 | mlp.eval() 213 | student = mlp.forward(xs[b:b + batch_size]) 214 | ground = solground[b//batch_size] 215 | closeness_test(student, ground, "mlp.forward(x)") 216 | 217 | 218 | 219 | def test_linear_layer_forward(): 220 | data = saved_data[22] 221 | assert len(data) == 2 222 | x = data[0] 223 | gt = data[1] 224 | 225 | reset_prng() 226 | x = np.random.randn(20, 784) 227 | reset_prng() 228 | linear_layer = linear.Linear(784, 10, weight_init, bias_init) 229 | pred = linear_layer.forward(x) 230 | closeness_test(pred, gt, "linear_layer.forward(x)") 231 | 232 | 233 | def test_linear_layer_backward(): 234 | data = saved_data[23] 235 | assert 
len(data) == 4 236 | x = data[0] 237 | y = data[1] 238 | soldW = data[2] 239 | soldb = data[3] 240 | 241 | reset_prng() 242 | linear_layer = linear.Linear(784, 10, weight_init, bias_init) 243 | linear_layer.forward(x) 244 | linear_layer.backward(y) 245 | 246 | closeness_test(linear_layer.dW, soldW, "linear_layer.dW") 247 | closeness_test(linear_layer.db, soldb, "linear_layer.db") 248 | 249 | 250 | 251 | def test_linear_classifier_forward(): 252 | data = saved_data[2] 253 | x = data[0] 254 | gt = data[1] 255 | reset_prng() 256 | mlp = hw1.MLP(784, 10, [], [activation.Identity()], weight_init, bias_init, 257 | loss.SoftmaxCrossEntropy(), 0.008, momentum=0.0, 258 | num_bn_layers=0) 259 | pred = mlp.forward(x) 260 | closeness_test(pred, gt, "mlp.forward(x)") 261 | 262 | 263 | def test_linear_classifier_backward(): 264 | data = saved_data[3] 265 | x = data[0] 266 | y = data[1] 267 | soldW = data[2] 268 | soldb = data[3] 269 | reset_prng() 270 | mlp = hw1.MLP(784, 10, [], [activation.Identity()], weight_init, bias_init, 271 | loss.SoftmaxCrossEntropy(), 0.008, momentum=0.0, 272 | num_bn_layers=0) 273 | mlp.forward(x) 274 | mlp.backward(y) 275 | 276 | closeness_test(mlp.linear_layers[0].dW, soldW, "mlp.linear_layers[0].dW") 277 | closeness_test(mlp.linear_layers[0].db, soldb, "mlp.linear_layers[0].db") 278 | 279 | 280 | def test_linear_classifier_step(): 281 | data = saved_data[4] 282 | x = data[0] 283 | y = data[1] 284 | solW = data[2] 285 | solb = data[3] 286 | reset_prng() 287 | mlp = hw1.MLP(784, 10, [], [activation.Identity()], weight_init, bias_init, 288 | loss.SoftmaxCrossEntropy(), 0.008, momentum=0.0, 289 | num_bn_layers=0) 290 | num_test_updates = 5 291 | for u in range(num_test_updates): 292 | mlp.zero_grads() 293 | mlp.forward(x) 294 | mlp.backward(y) 295 | mlp.step() 296 | closeness_test(mlp.linear_layers[0].W, solW, "mlp.linear_layers[0].W") 297 | closeness_test(mlp.linear_layers[0].b, solb, "mlp.linear_layers[0].b") 298 | 299 | 300 | def 
test_single_hidden_forward(): 301 | data = saved_data[11] 302 | x = data[0] 303 | gt = data[1] 304 | reset_prng() 305 | mlp = hw1.MLP(784, 10, [32], [activation.Sigmoid(), activation.Identity()], 306 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 307 | momentum=0.0, num_bn_layers=0) 308 | 309 | pred = mlp.forward(x) 310 | closeness_test(pred, gt, "mlp.forward(x)") 311 | 312 | 313 | def test_single_hidden_backward(): 314 | data = saved_data[12] 315 | assert len(data) == 6 316 | x = data[0] 317 | y = data[1] 318 | soldW = data[2:4] 319 | soldb = data[4:] 320 | reset_prng() 321 | mlp = hw1.MLP(784, 10, [32], [activation.Sigmoid(), activation.Identity()], 322 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 323 | momentum=0.0, num_bn_layers=0) 324 | mlp.forward(x) 325 | mlp.backward(y) 326 | 327 | dW = [x.dW for x in mlp.linear_layers] 328 | db = [x.db for x in mlp.linear_layers] 329 | 330 | for i, (pred, gt) in enumerate(zip(dW, soldW)): 331 | closeness_test(pred, gt, "mlp.linear_layers[%d].dW" % i) 332 | 333 | for i, (pred, gt) in enumerate(zip(db, soldb)): 334 | closeness_test(pred, gt, "mlp.linear_layers[%d].db" % i) 335 | 336 | 337 | def test_mystery_hidden_forward1(): 338 | data = saved_data[13] 339 | x = data[0] 340 | gt = data[1] 341 | reset_prng() 342 | mlp = hw1.MLP(784, 10, [64, 32], [activation.Sigmoid(), activation.Sigmoid(), activation.Identity()], 343 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 344 | momentum=0.0, num_bn_layers=0) 345 | 346 | pred = mlp.forward(x) 347 | closeness_test(pred, gt, "mlp.forward(x)") 348 | 349 | 350 | def test_mystery_hidden_forward2(): 351 | data = saved_data[14] 352 | x = data[0] 353 | gt = data[1] 354 | reset_prng() 355 | mlp = hw1.MLP(784, 10, [32, 32, 32, 32, 32], 356 | [activation.Sigmoid(), activation.Sigmoid(), activation.Sigmoid(), activation.Sigmoid(), 357 | activation.Sigmoid(), activation.Identity()], 358 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 359 | 
momentum=0.0, num_bn_layers=0) 360 | 361 | pred = mlp.forward(x) 362 | closeness_test(pred, gt, "mlp.forward(x)") 363 | 364 | 365 | def test_mystery_hidden_forward3(): 366 | data = saved_data[15] 367 | x = data[0] 368 | gt = data[1] 369 | reset_prng() 370 | mlp = hw1.MLP(784, 10, [32], [activation.Sigmoid(), activation.Identity()], 371 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 372 | momentum=0.0, num_bn_layers=0) 373 | 374 | pred = mlp.forward(x) 375 | closeness_test(pred, gt, "mlp.forward(x)") 376 | 377 | 378 | def test_mystery_hidden_backward1(): 379 | data = saved_data[16] 380 | assert len(data) == 8 381 | x = data[0] 382 | y = data[1] 383 | soldW = data[2:5] 384 | soldb = data[5:] 385 | reset_prng() 386 | mlp = hw1.MLP(784, 10, [64, 32], [activation.Sigmoid(), activation.Sigmoid(), activation.Identity()], 387 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 388 | momentum=0.0, num_bn_layers=0) 389 | 390 | mlp.forward(x) 391 | mlp.backward(y) 392 | 393 | dW = [x.dW for x in mlp.linear_layers] 394 | db = [x.db for x in mlp.linear_layers] 395 | 396 | for i, (pred, gt) in enumerate(zip(dW, soldW)): 397 | closeness_test(pred, gt, "mlp.linear_layers[%d].dW" % i) 398 | 399 | for i, (pred, gt) in enumerate(zip(db, soldb)): 400 | closeness_test(pred, gt, "mlp.linear_layers[%d].db" % i) 401 | 402 | 403 | def test_mystery_hidden_backward2(): 404 | data = saved_data[17] 405 | assert len(data) == 14 406 | x = data[0] 407 | y = data[1] 408 | soldW = data[2:8] 409 | soldb = data[8:] 410 | reset_prng() 411 | mlp = hw1.MLP(784, 10, [32, 32, 32, 32, 32], 412 | [activation.Sigmoid(), activation.Sigmoid(), activation.Sigmoid(), activation.Sigmoid(), 413 | activation.Sigmoid(), activation.Identity()], 414 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 415 | momentum=0.0, num_bn_layers=0) 416 | mlp.forward(x) 417 | mlp.backward(y) 418 | 419 | dW = [x.dW for x in mlp.linear_layers] 420 | db = [x.db for x in mlp.linear_layers] 421 | 422 | 
for i, (pred, gt) in enumerate(zip(dW, soldW)): 423 | closeness_test(pred, gt, "mlp.linear_layers[%d].dW" % i) 424 | 425 | for i, (pred, gt) in enumerate(zip(db, soldb)): 426 | closeness_test(pred, gt, "mlp.linear_layers[%d].db" % i) 427 | 428 | 429 | def test_mystery_hidden_backward3(): 430 | data = saved_data[18] 431 | assert len(data) == 6 432 | x = data[0] 433 | y = data[1] 434 | soldW = data[2:4] 435 | soldb = data[4:] 436 | reset_prng() 437 | mlp = hw1.MLP(784, 10, [32], [activation.Sigmoid(), activation.Identity()], 438 | weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 439 | momentum=0.0, num_bn_layers=0) 440 | mlp.forward(x) 441 | mlp.backward(y) 442 | 443 | dW = [x.dW for x in mlp.linear_layers] 444 | db = [x.db for x in mlp.linear_layers] 445 | 446 | for i, (pred, gt) in enumerate(zip(dW, soldW)): 447 | closeness_test(pred, gt, "mlp.linear_layers[%d].dW" % i) 448 | 449 | for i, (pred, gt) in enumerate(zip(db, soldb)): 450 | closeness_test(pred, gt, "mlp.linear_layers[%d].db" % i) 451 | 452 | 453 | def test_momentum(): 454 | data = saved_data[21] 455 | assert len(data) == 8 456 | x = data[0] 457 | y = data[1] 458 | solW = data[2:5] 459 | solb = data[5:] 460 | reset_prng() 461 | mlp = hw1.MLP(784, 10, [64, 32], [activation.Sigmoid(), activation.Sigmoid(), activation.Identity()], weight_init, bias_init, loss.SoftmaxCrossEntropy(), 0.008, 462 | momentum=0.856, num_bn_layers=0) 463 | 464 | num_test_updates = 5 465 | for u in range(num_test_updates): 466 | mlp.zero_grads() 467 | mlp.forward(x) 468 | mlp.backward(y) 469 | mlp.step() 470 | mlp.eval() 471 | 472 | W = [x.W for x in mlp.linear_layers] 473 | b = [x.b for x in mlp.linear_layers] 474 | 475 | for i, (pred, gt) in enumerate(zip(W, solW)): 476 | closeness_test(pred, gt, "mlp.linear_layers[%d].W" % i) 477 | 478 | for i, (pred, gt) in enumerate(zip(b, solb)): 479 | closeness_test(pred, gt, "mlp.linear_layers[%d].b" % i) 480 | 481 | def failed_test_names(names, preds, gts, status): 482 | values = 
[(preds[i], gts[i]) for i, s in enumerate(status) if not s] 483 | names = [n for n, s in zip(names, status) if not s] 484 | return names, values 485 | 486 | 487 | def union(xs, ys): 488 | return [x or y for x, y in zip(xs, ys)] 489 | 490 | 491 | def assert_any_zeros(nparr): 492 | for i in range(len(nparr)): 493 | assert (np.all(nparr[i], 0)) 494 | -------------------------------------------------------------------------------- /HW1/hw1_p1/create_tarball.sh: -------------------------------------------------------------------------------- 1 | tar -cvf handin.tar hw1 mytorch 2 | -------------------------------------------------------------------------------- /HW1/hw1_p1/hw1/hw1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Follow the instructions provided in the writeup to completely 3 | implement the class specifications for a basic MLP, optimizer, . 4 | You will be able to test each section individually by submitting 5 | to autolab after implementing what is required for that section 6 | -- do not worry if some methods required are not implemented yet. 7 | 8 | Notes: 9 | 10 | The __call__ method is a special reserved method in 11 | python that defines the behaviour of an object when it is 12 | used as a function. For example, take the Linear activation 13 | function whose implementation has been provided. 
class MLP(object):

    """
    A simple multilayer perceptron

    Layers are applied in order: linear -> (optional batch norm on the first
    ``num_bn_layers`` layers) -> activation. ``forward`` keeps its result in
    ``self.output`` so that ``error`` and ``total_loss`` can be called
    afterwards.
    """

    def __init__(self, input_size, output_size, hiddens, activations, weight_init_fn,
                 bias_init_fn, criterion, lr, momentum=0.0, num_bn_layers=0):
        """
        Arguments:
            input_size (int): number of input features
            output_size (int): number of outputs
            hiddens (sequence of int): sizes of the hidden layers
            activations (list): one activation object per linear layer
            weight_init_fn, bias_init_fn: initializers forwarded to Linear
            criterion: loss object exposing forward() and derivative()
            lr (float): learning rate used by step()
            momentum (float): momentum coefficient; 0 disables momentum
            num_bn_layers (int): number of leading layers followed by batch norm
        """

        # Don't change this -->
        self.train_mode = True
        self.num_bn_layers = num_bn_layers
        self.bn = num_bn_layers > 0
        self.nlayers = len(hiddens) + 1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.criterion = criterion
        self.lr = lr
        self.momentum = momentum
        # <---------------------

        # Result of the most recent forward pass; read by error()/total_loss().
        self.output = None

        # Don't change the name of the following class attributes,
        # the autograder will check against these attributes.

        # list(...) so that a tuple of hidden sizes is accepted as well.
        self.layersDim = [input_size] + list(hiddens) + [output_size]
        # One Linear per consecutive pair of sizes, created in input-to-output
        # order (the order matters when the initializers draw random numbers).
        self.linear_layers = [
            Linear(in_size, out_size, weight_init_fn, bias_init_fn)
            for in_size, out_size in zip(self.layersDim[:-1], self.layersDim[1:])
        ]

        # Batch norm layers follow the first num_bn_layers linear layers.
        # The list always exists (empty when batch norm is disabled).
        self.bn_layers = [
            BatchNorm(self.layersDim[i + 1]) for i in range(self.num_bn_layers)
        ]

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, input_size)
        Return:
            out (np.array): (batch size, output_size)
        """
        out = x
        for i in range(self.nlayers):
            z = self.linear_layers[i].forward(out)

            # Batch norm forward
            if self.bn and i < self.num_bn_layers:
                if self.train_mode:
                    z = self.bn_layers[i].forward(z)
                else:
                    z = self.bn_layers[i].forward(z, eval=True)

            out = self.activations[i].forward(z)

        # Remember the prediction for error() and total_loss().
        self.output = out
        return out

    def zero_grads(self):
        """Zero the stored gradients of the linear and batchnorm layers in place."""
        for layer in self.linear_layers:
            layer.dW.fill(0.0)
            layer.db.fill(0.0)
        if self.bn:
            # assumes dgamma/dbeta are numpy arrays, as step() uses them
            # arithmetically -- TODO confirm against BatchNorm
            for bn_layer in self.bn_layers:
                bn_layer.dgamma.fill(0.0)
                bn_layer.dbeta.fill(0.0)

    def step(self):
        """Apply one gradient update to linear layers (with optional momentum)
        and a plain gradient update to the batchnorm parameters."""
        for layer in self.linear_layers:
            if self.momentum:
                # update momentum
                layer.momentum_W = self.momentum * layer.momentum_W - self.lr * layer.dW
                layer.momentum_B = self.momentum * layer.momentum_B - self.lr * layer.db
                # update weights and biases
                layer.W += layer.momentum_W
                layer.b += layer.momentum_B
            else:
                layer.W -= (self.lr * layer.dW)
                layer.b -= (self.lr * layer.db)
        # Batchnorm parameters never use momentum.
        if self.bn:
            for bn_layer in self.bn_layers:
                bn_layer.gamma -= (self.lr * bn_layer.dgamma)
                bn_layer.beta -= (self.lr * bn_layer.dbeta)

    def backward(self, labels):
        """Backpropagate from the criterion through every layer, last to first.

        Must be called after forward(): it reads the ``state`` stored on the
        last activation and the derivatives stored on each activation.
        """
        # Evaluate the criterion on the last activation's stored output before
        # asking for its derivative (the criterion itself is defined outside
        # this file; presumably derivative() relies on this call).
        self.criterion.forward(self.activations[self.nlayers - 1].state, labels)
        grad = self.criterion.derivative()

        for idx in reversed(range(self.nlayers)):
            # Through the activation (element-wise derivative).
            grad = np.multiply(grad, self.activations[idx].derivative())
            # Through batch norm, present only on the first num_bn_layers layers.
            if idx < self.num_bn_layers:
                grad = self.bn_layers[idx].backward(grad)
            # Through the linear layer; its return value feeds the layer below.
            grad = self.linear_layers[idx].backward(grad)

    def error(self, labels):
        """Count rows of the last forward output whose argmax differs from the labels'."""
        return (np.argmax(self.output, axis=1) != np.argmax(labels, axis=1)).sum()

    def total_loss(self, labels):
        """Sum the criterion over the last forward output."""
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        return self.forward(x)

    def train(self):
        """Switch to training mode (batch norm is called without eval=True)."""
        self.train_mode = True

    def eval(self):
        """Switch to evaluation mode (batch norm is called with eval=True)."""
        self.train_mode = False
Forward 197 | y_pred_t = mlp.forward(x_train[b:b+batch_size]) 198 | y_true_t = y_train[b:b+batch_size] 199 | # 3. Backward 200 | mlp.backward(y_true_t) 201 | # 4. Update with gradients 202 | mlp.step() 203 | # 5. Calculate training loss 204 | loss = [] 205 | for element in SoftmaxCrossEntropy().forward(y_pred_t, y_true_t): 206 | loss.append(element) 207 | training_losses[e] += sum(loss) 208 | # 6. Calculate training error count 209 | for i in range(y_pred_t.shape[0]): 210 | if np.argmax(y_pred_t[i]) != np.argmax(y_true_t[i]): 211 | training_errors[e] += 1 212 | mlp.eval() 213 | for b in range(0, len(valx), batch_size): 214 | # Validate ... 215 | # 1. Zerofill derivatives after each batch 216 | mlp.zero_grads() 217 | # 2. Forward 218 | y_pred_v = mlp.forward(valx[b:b+batch_size]) 219 | y_true_v = valy[b:b+batch_size] 220 | # 3. Calculate validation loss 221 | loss = [] 222 | for element in SoftmaxCrossEntropy().forward(y_pred_v, y_true_v): 223 | loss.append(element) 224 | validation_losses[e] += sum(loss) 225 | # 4. Calculate validation error count 226 | for i in range(y_pred_v.shape[0]): 227 | if np.argmax(y_pred_v[i]) != np.argmax(y_true_v[i]): 228 | validation_errors[e] += 1 229 | 230 | 231 | # Accumulate data... 232 | training_losses[e] = training_losses[e] / trainx.shape[0] 233 | validation_losses[e] = validation_losses[e] / valx.shape[0] 234 | training_errors[e] = training_errors[e] / trainx.shape[0] 235 | validation_errors[e] = validation_errors[e] / valx.shape[0] 236 | 237 | # Cleanup ... 238 | 239 | # Return results ... 
240 | 241 | return (training_losses, training_errors, validation_losses, validation_errors) 242 | -------------------------------------------------------------------------------- /HW1/hw1_p1/hw1/mc.py: -------------------------------------------------------------------------------- 1 | def question_1(): 2 | ''' 3 | here you return the answer to the multiple choice question in the handout, 4 | your answer should be a character a,b,c or d 5 | for example, if you think option c is the correct answer, 6 | return 'c' 7 | ''' 8 | return 'b' 9 | 10 | def question_2(): 11 | ''' 12 | here you return the answer to the multiple choice question in the handout, 13 | your answer should be a character a,b,c or d 14 | for example, if you think option c is the correct answer, 15 | return 'c' 16 | ''' 17 | return 'a' 18 | 19 | def question_3(): 20 | ''' 21 | here you return the answer to the multiple choice question in the handout, 22 | your answer should be a character a,b,c or d 23 | for example, if you think option c is the correct answer, 24 | return 'c' 25 | ''' 26 | return 'a' 27 | 28 | def question_4(): 29 | ''' 30 | here you return the answer to the multiple choice question in the handout, 31 | your answer should be a character a,b,c or d 32 | for example, if you think option c is the correct answer, 33 | return 'c' 34 | ''' 35 | return 'a' 36 | 37 | def question_5(): 38 | ''' 39 | here you return the answer to the multiple choice question in the handout, 40 | your answer should be a character a,b,c or d 41 | for example, if you think option c is the correct answer, 42 | return 'c' 43 | ''' 44 | return 'c' 45 | -------------------------------------------------------------------------------- /HW1/hw1_p1/hw1/train_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW1/hw1_p1/hw1/train_error.png 
# ---- /HW1/hw1_p1/hw1/train_loss.png ----
# https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW1/hw1_p1/hw1/train_loss.png
# ---- /HW1/hw1_p1/hw1/val_error.png ----
# https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW1/hw1_p1/hw1/val_error.png
# ---- /HW1/hw1_p1/hw1/val_loss.png ----
# https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW1/hw1_p1/hw1/val_loss.png

# ---- /HW1/hw1_p1/mytorch/activation.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import os


class Activation(object):

    """
    Interface for activation functions (non-linearities).

    In all implementations, the state attribute must contain the result,
    i.e. the output of forward (it will be tested).
    """

    # No additional work is needed for this class, as it acts like an
    # abstract base class for the others

    # Note that these activation functions are scalar operations. I.e, they
    # shouldn't change the shape of the input.

    def __init__(self):
        self.state = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # NotImplementedError is the exception; NotImplemented is a
        # comparison sentinel and cannot be raised.
        raise NotImplementedError

    def derivative(self):
        raise NotImplementedError


class Identity(Activation):

    """
    Identity function (already implemented).
    """

    # This class is a gimme as it is already implemented for you as an example

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        self.state = x
        return x

    def derivative(self):
        return 1.0


class Sigmoid(Activation):

    """
    Sigmoid non-linearity
    """

    # Remember do not change the function signatures as those are needed
    # to stay the same for AutoLab.

    def __init__(self):
        super(Sigmoid, self).__init__()

    def forward(self, x):
        # The output is cached because the derivative is expressed through it.
        Sz = 1 / (1 + np.exp(-x))
        self.state = Sz
        return Sz

    def derivative(self):
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
        return self.state * (1 - self.state)


class Tanh(Activation):

    """
    Tanh non-linearity
    """

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        tanhz = np.tanh(x)
        self.state = tanhz
        return tanhz

    def derivative(self):
        # d/dx tanh(x) = 1 - tanh(x)^2
        return 1 - self.state**2


class ReLU(Activation):

    """
    ReLU non-linearity
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        clipped = np.clip(x, a_min=0, a_max=None)
        self.state = clipped
        return clipped

    def derivative(self):
        # 1 where the unit was active, 0 elsewhere (including exactly at 0).
        return np.where(self.state > 0, 1.0, 0.0)


# ---- /HW1/hw1_p1/mytorch/batchnorm.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np


class BatchNorm(object):

    """
    Batch normalization over the batch axis of a (batch_size, in_feature) input.
    """

    def __init__(self, in_feature, alpha=0.9):
        """
        Argument:
            in_feature (int): number of features being normalized
            alpha (float): weight of the previous value in the running statistics
        """

        # You shouldn't need to edit anything in init

        self.alpha = alpha
        self.eps = 1e-8
        self.x = None
        self.norm = None
        self.out = None

        # The following attributes will be tested
        self.var = np.ones((1, in_feature))
        self.mean = np.zeros((1, in_feature))

        self.gamma = np.ones((1, in_feature))
        self.dgamma = np.zeros((1, in_feature))

        self.beta = np.zeros((1, in_feature))
        self.dbeta = np.zeros((1, in_feature))

        # inference parameters
        self.running_mean = np.zeros((1, in_feature))
        self.running_var = np.ones((1, in_feature))

    def __call__(self, x, eval=False):
        return self.forward(x, eval)

    def forward(self, x, eval=False):
        """
        Argument:
            x (np.array): (batch_size, in_feature)
            eval (bool): inference status

        Return:
            out (np.array): (batch_size, in_feature)
        """
        self.x = x

        if eval:  # use running mean and var for testing
            self.norm = (x - self.running_mean) / np.sqrt(self.running_var + self.eps)
        else:
            # keepdims preserves the (1, in_feature) shape set up in __init__
            self.mean = np.mean(x, axis=0, keepdims=True)
            self.var = np.var(x, axis=0, keepdims=True)
            self.norm = (x - self.mean) / np.sqrt(self.var + self.eps)

            # Update running batch statistics
            self.running_mean = self.alpha * self.running_mean + (1 - self.alpha) * self.mean
            self.running_var = self.alpha * self.running_var + (1 - self.alpha) * self.var

        self.out = self.gamma * self.norm + self.beta
        return self.out

    def backward(self, delta):
        """
        Uses the batch statistics stored by the last training-mode forward().

        Argument:
            delta (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, in feature)
        """
        diffX = self.x - self.mean
        dnorm = delta * self.gamma
        self.dbeta = np.sum(delta, axis=0, keepdims=True)
        self.dgamma = np.sum(delta * self.norm, axis=0, keepdims=True)

        sqrtVar = np.sqrt(self.var + self.eps)
        m = delta.shape[0]
        # Gradients w.r.t. the batch variance and batch mean, then chain to x.
        dvar = - np.sum(dnorm * (diffX / (2 * (sqrtVar**3))), axis=0, keepdims=True)
        dmu = (- np.sum(dnorm / sqrtVar, axis=0, keepdims=True)
               - (2 / m) * dvar * np.sum(diffX, axis=0, keepdims=True))
        dx = (dnorm / sqrtVar) + (dvar * (2 / m) * diffX) + (dmu / m)

        return dx


# ---- /HW1/hw1_p1/mytorch/linear.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import math


class Linear():
    """
    Fully connected layer computing x @ W + b.
    """

    def __init__(self, in_feature, out_feature, weight_init_fn, bias_init_fn):

        """
        Argument:
            in_feature (int): width of the layer input
            out_feature (int): width of the layer output
            weight_init_fn (fn): called as weight_init_fn(in_feature, out_feature)
            bias_init_fn (fn): called as bias_init_fn(out_feature)

        Attributes:
            W (np.array): (in feature, out feature)
            dW (np.array): (in feature, out feature)
            momentum_W (np.array): (in feature, out feature)

            b (np.array): (1, out feature)
            db (np.array): (1, out feature)
            momentum_B (np.array): (1, out feature)
        """

        self.W = weight_init_fn(in_feature, out_feature)
        self.b = bias_init_fn(out_feature)

        self.dW = np.zeros((in_feature, out_feature))
        self.db = np.zeros((1, out_feature))

        self.momentum_W = np.zeros((in_feature, out_feature))
        self.momentum_B = np.zeros((1, out_feature))

        # Input of the last forward pass, needed to compute dW in backward().
        self.x = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, out feature)
        """
        yhat = np.dot(x, self.W) + self.b
        self.x = x
        return yhat

    def backward(self, delta):

        """
        Argument:
            delta (np.array): (batch size, out feature)
        Return:
            out (np.array): (batch size, in feature)
        """
        batchSize = delta.shape[0]
        # Parameter gradients are averaged over the batch.
        self.dW = np.dot(self.x.T, delta) / batchSize  # same shape as W
        self.db = np.mean(delta, axis=0, keepdims=True)
        dx = np.dot(delta, self.W.T)
        return dx


# ---- /HW1/hw1_p1/mytorch/loss.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import os

# The following Criterion class will be used again as the basis for a number
# of loss functions (which are in the form of classes so that they can be
# exchanged easily (it's how PyTorch and other ML libraries do it))


class Criterion(object):
    """
    Interface for loss functions.
    """

    # Nothing needs done to this class, it's used by the following Criterion classes

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        return self.forward(x, y)

    def forward(self, x, y):
        raise NotImplementedError

    def derivative(self):
        raise NotImplementedError


class SoftmaxCrossEntropy(Criterion):
    """
    Softmax loss
    """

    def __init__(self):
        super(SoftmaxCrossEntropy, self).__init__()
        self.softmax = None

    def forward(self, x, y):
        """
        Argument:
            x (np.array): (batch size, 10)
            y (np.array): (batch size, 10)
        Return:
            out (np.array): (batch size, )
        """
        self.logits = x
        self.labels = y
        # LogSumExp trick: shift by the row maximum and stay in log space, so a
        # softmax entry that underflows to 0 never reaches np.log (which would
        # turn 0 * -inf into nan).
        shifted = x - np.amax(x, axis=1, keepdims=True)
        logSoftmax = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.softmax = np.exp(logSoftmax)
        self.loss = -np.sum(self.labels * logSoftmax, axis=1)
        return self.loss

    def derivative(self):
        """
        Return:
            out (np.array): (batch size, 10)
        """

        return self.softmax - self.labels


# ---- /HW1/hw1_p2/README.md (beginning; the text continues on the next line of the dump) ----
# File structure:
# - HW1P2
#   - main.py
#   - data
#     - train.npy
#     - train_labels.npy
#     - dev.npy
#     - dev_labels.npy
#     - test.npy
#     - test_labels_v2.npy
#     - test_labels_v2.csv
#   - checkpoint
#     - optim_Epoch_4contextK_v2.txt
#     - optimCont_Epoch_8contextK_v2.txt
#     - optimContDecrease_Epoch_8contextK_v2.txt
#
# To run my model:
# - Have the file structure as above
# - Type "source activate pytorch_p36" in the terminal
# - In the HW1P2 directory, type "python3 main.py" to run the model
# - After the model finishes, find predicted
test_labels_v2.npy and csv files under data folder 22 | Dataloader design: 23 | - Used a python list to store (i, j) to index into the list, where i is the utterance 24 | index and j is the frame index. I used numpy.take to include 12 left and right frames of 25 | the current indexed frame. 26 | 27 | Steps taken to train my model that gets the optimal result: 28 | - First train model with configuration of the following structure 29 | hidden layer dimensions: [2048,1024,1024,1024,1024,512,256] 30 | MLP( 31 | (net): Sequential( 32 | (0): Linear(in_features=1000, out_features=2048, bias=True) 33 | (1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | (2): LeakyReLU(negative_slope=0.01) 35 | (3): Linear(in_features=2048, out_features=1024, bias=True) 36 | (4): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | (5): LeakyReLU(negative_slope=0.01) 38 | (6): Linear(in_features=1024, out_features=1024, bias=True) 39 | (7): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 40 | (8): LeakyReLU(negative_slope=0.01) 41 | (9): Linear(in_features=1024, out_features=1024, bias=True) 42 | (10): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 43 | (11): LeakyReLU(negative_slope=0.01) 44 | (12): Linear(in_features=1024, out_features=1024, bias=True) 45 | (13): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 46 | (14): LeakyReLU(negative_slope=0.01) 47 | (15): Linear(in_features=1024, out_features=512, bias=True) 48 | (16): LeakyReLU(negative_slope=0.01) 49 | (17): Linear(in_features=512, out_features=256, bias=True) 50 | (18): LeakyReLU(negative_slope=0.01) 51 | (19): Linear(in_features=256, out_features=138, bias=True) 52 | ) 53 | ) 54 | kContext size: 12 55 | optimizer: Adam with 0.001 learning rate with learning rate scheduler 56 | learning rate scheduler: ReduceLROnPlateau, mode=min, patience=2 57 | 
# ---- /HW1/hw1_p2/README.md (end; the text starts on earlier lines of the dump) ----
#     batch size: 256
#     number of Epochs: 5
#   Save the model in file optim_Epoch_4contextK_v2.txt
# - Then load the model from optim_Epoch_4contextK_v2.txt and continue training the same except
#     number of Epochs: 9
#   Save the model in file optimCont_Epoch_8contextK_v2.txt
# - Then load the model from file optimCont_Epoch_8contextK_v2.txt and continue training with
#     number of Epochs: 9
#   Save the model in file optimContDecrease_Epoch_8contextK_v2.txt
#
# Reference Links:
# https://florimond.dev/blog/articles/2018/10/reconciling-dataclasses-and-properties-in-python/
# https://medium.com/@Biboswan98/optim-adam-vs-optim-sgd-lets-dive-in-8dbf1890fbdc
# https://pytorch.org/docs/stable/optim.html
# https://stackoverflow.com/questions/41153803/zero-padding-slice-past-end-of-array-in-numpy
# https://dev.to/hardiksondagar/how-to-use-aws-ebs-volume-as-a-swap-memory-5d15
# https://www.google.com/search?client=safari&sxsrf=ACYBGNSjEyEM_L_y5jQjSiKA7LAUMCu38g%3A1581151963058&source=hp&ei=23Y-Xosv5q_K0w_e2IjoAg&q=torch.save%28model.state_dict%28%29&oq=torch.save%28model.state_dict%28%29&gs_l=psy-ab.3..0l3j0i333l3.4254.4254..4482...3.0..0.85.85.1......0....2j1..gws-wiz.yz9xThPrXqE&ved=0ahUKEwjLiN2IysHnAhXml3IEHV4sAi0Q4dUDCAs&uact=5
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
# https://stackoverflow.com/questions/33492260/save-multiple-arrays-to-a-csv-file-with-column-names
# https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.ReduceLROnPlateau

# ---- /HW1/hw1_p2/main.py ----
# Import all the necessary libraries
import numpy as np
import torch
import sys
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import pandas as pd

from torch.utils.data import DataLoader, Dataset, TensorDataset

import time
# matplotlib.pyplot is imported inside _save_curve, the only place that plots,
# so the data/model/training helpers can be imported without a plotting backend.


class MyDataset(Dataset):
    """
    Frame-level dataset that builds the context window only when a frame is accessed.

    Each item is one frame together with its k left and k right neighbours from
    the same utterance, flattened into a single vector.
    """

    def __init__(self, dataset, k):
        """
        dataset : (utterances, labels); labels may be None (unlabelled test data)
        k       : number of context frames taken on each side of the frame
        """
        self.k = k
        self.dataX = dataset[0]
        self.dataY = dataset[1] if len(dataset) == 2 else None
        # Flat index -> (utterance index, frame index)
        self.idxMap = []
        for i, utter in enumerate(self.dataX):
            self.idxMap.extend((i, j) for j in range(utter.shape[0]))

    def __getitem__(self, idx):
        i, j = self.idxMap[idx]
        # mode='clip' repeats the first/last frame when the window runs past
        # either end of the utterance.
        window = range(j - self.k, j + self.k + 1)
        withContext = self.dataX[i].take(window, mode='clip', axis=0).flatten()
        x = torch.Tensor(withContext).float()
        # -1 is a placeholder label for unlabelled data
        y = self.dataY[i][j] if self.dataY is not None else -1
        return x, y

    def __len__(self):
        return len(self.idxMap)


def getLoaders(train, dev, test, datapath, cuda, k, batchSize):
    """
    Build the train/dev/test DataLoaders.

    train, dev, test : (utterances, labels) tuples; per the original notes the
                       utterances are 24500 / 1100 / 361 arrays of shape (l, 40)
                       with varying l
    datapath         : unused, kept for interface compatibility
    cuda             : whether CUDA is available; enables pinned host memory
    k                : context size passed to MyDataset
    batchSize        : batch size shared by all three loaders
    """
    print('*** Create data loader ***')
    # Pinned memory only helps host-to-GPU copies; requesting it without CUDA
    # just produces a warning.
    pin = bool(cuda)

    # Train
    train_loader_args = dict(shuffle=True, batch_size=batchSize, num_workers=8, pin_memory=pin)
    train_loader = DataLoader(MyDataset(train, k), **train_loader_args)

    # Dev: evaluation does not need shuffling, a fixed order keeps it reproducible
    dev_loader_args = dict(shuffle=False, batch_size=batchSize, num_workers=1, pin_memory=pin)
    dev_loader = DataLoader(MyDataset(dev, k), **dev_loader_args)

    # Test: order must be preserved, predictions are written out by position
    test_loader_args = dict(shuffle=False, batch_size=batchSize, num_workers=1, pin_memory=pin)
    test_loader = DataLoader(MyDataset(test, k), **test_loader_args)

    return train_loader, dev_loader, test_loader


class MLP(nn.Module):
    """
    Feed-forward network: Linear -> [BatchNorm1d] -> LeakyReLU blocks and a final Linear.
    """

    def __init__(self, sizeList, numBatchNorm=5):
        """
        sizeList     : layer widths, [input, hidden..., output]
        numBatchNorm : number of leading hidden blocks that include BatchNorm1d
                       (default 5, the value previously hard-coded)
        """
        super(MLP, self).__init__()
        layers = []
        self.sizeList = sizeList
        for i in range(len(sizeList) - 2):
            layers.append(nn.Linear(sizeList[i], sizeList[i+1]))
            if i < numBatchNorm:  # batchnorm
                layers.append(nn.BatchNorm1d(sizeList[i+1]))
            layers.append(nn.LeakyReLU())
        # Last layer
        layers.append(nn.Linear(sizeList[-2], sizeList[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


# A function that will train the network for one epoch
def train_epoch(model, train_loader, criterion, optimizer, device):
    """
    Run one optimisation pass over train_loader and return the mean batch loss.

    The model is expected to already live on `device`; each batch is moved there.
    """
    model.train()
    running_loss = 0.0
    start_time = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)

        outputs = model(data)
        loss = criterion(outputs, target)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print("Finished " + str(batch_idx) + "\t Timestamp: "+ str(time.time() - start_time))

    end_time = time.time()
    running_loss = running_loss / len(train_loader)
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    return running_loss


# A function that will evaluate out network's performance on the test set
def test_model(model, test_loader, criterion, device):
    """
    Evaluate the model on a labelled loader.

    Returns (mean batch loss, accuracy in percent).
    """
    with torch.no_grad():
        model.eval()
        start_time = time.time()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        for batch_idx, (data, target) in enumerate(test_loader):
            data = data.to(device)
            target = target.to(device)

            outputs = model(data)

            _, predicted = torch.max(outputs, 1)
            total_predictions += target.size(0)
            correct_predictions += (predicted == target).sum().item()

            loss = criterion(outputs, target)
            running_loss += loss.item()
            if batch_idx % 1000 == 0:
                print("Finished " + str(batch_idx) + "\t Timestamp: "+ str(time.time() - start_time))

        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Testing Loss: ', running_loss)
        print('Testing Accuracy: ', acc, '%')
        return running_loss, acc


# The name starts with "test_", so pytest would otherwise try to collect this
# evaluation helper as a test case whenever it is imported into a test module.
test_model.__test__ = False


# A function that predicts test set label
def predictLabels(model, test_loader, device):
    """
    Return the predicted class index of every sample in test_loader, in loader order.

    The result is a 1-D float64 array (the dtype the saved label file has always had).
    """
    model.eval()

    # Seeding with an empty float array keeps the float64 result dtype; batches
    # are joined once at the end instead of re-copying the result every batch.
    batches = [np.array([])]
    with torch.no_grad():  # inference only, no autograd graph needed
        for data, _ in test_loader:
            outputs = model(data.to(device))
            _, predicted = torch.max(outputs, dim=1)
            batches.append(predicted.cpu().numpy().reshape(-1))
    return np.concatenate(batches)


def _save_curve(values, title, ylabel, filename):
    # Draw one per-epoch curve on its own figure and write it to filename.
    import matplotlib.pyplot as plt

    plt.figure()  # fresh figure, so curves never pile up on a shared one
    plt.title(title)
    plt.xlabel('Epoch Number')
    plt.ylabel(ylabel)
    plt.plot(values)
    plt.savefig(filename)
    plt.close()


def main(hyper):
    """
    Resume training from a checkpoint, save a checkpoint per epoch, plot the
    curves and write the test predictions.

    hyper : dict with the keys read below (dataPath, weightDirName, nEpochs,
            kContext, batchSize, hiddenDims, checkpointPath, lr, testLabelName,
            testLabelCSVfn)
    """
    datapath = hyper['dataPath']
    weightDirName = hyper["weightDirName"]
    cuda = torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    nEpochs = hyper["nEpochs"]

    print('*** Load raw data ***')
    # NOTE: allow_pickle=True (and torch.load below) unpickle arbitrary objects;
    # only point these at trusted local files.
    train = (np.load(os.path.join(datapath, 'train.npy'), allow_pickle=True),
             np.load(os.path.join(datapath, 'train_labels.npy'), allow_pickle=True))
    dev = (np.load(os.path.join(datapath, 'dev.npy'), allow_pickle=True),
           np.load(os.path.join(datapath, 'dev_labels.npy'), allow_pickle=True))
    test = (np.load(os.path.join(datapath, 'test.npy'), allow_pickle=True), None)

    # Get data loaders
    train_loader, dev_loader, test_loader = getLoaders(train, dev, test, datapath, cuda, hyper["kContext"], hyper['batchSize'])

    # Create the model and define the Loss and Optimizer
    print("*** Create the model and define Loss and Optimizer ***")
    inputSize = (2 * hyper["kContext"] + 1) * 40  # new dim
    outputSize = 138  # possible phoneme states
    model = MLP([inputSize] + hyper["hiddenDims"] + [outputSize])
    # map_location lets a checkpoint saved on GPU be resumed on a CPU-only machine
    checkpoint = torch.load(hyper["checkpointPath"], map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=hyper["lr"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=2)
    print(model)

    # Train the model for N epochs
    print("*** Train the Model for N Epochs ***")
    Train_loss = []
    Test_loss = []
    Test_acc = []
    for i in range(nEpochs):
        print("Train "+ str(i)+" epoch")
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        print("Dev "+ str(i)+" epoch")
        dev_loss, dev_acc = test_model(model, dev_loader, criterion, device)
        scheduler.step(dev_loss)
        Train_loss.append(train_loss)
        Test_loss.append(dev_loss)
        Test_acc.append(dev_acc)
        print('='*20)
        print("*** Saving Checkpoint ***")
        path = "{}optimContDecreaseCont_Epoch_{}contextK_v2.txt".format(weightDirName, str(i))
        torch.save({
            'epoch': i,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'dev_loss':dev_loss,
            'dev_acc': dev_acc
        }, path)

    # Visualize Training and Validation data
    print("*** Visualize Training and Validation Data ***")
    _save_curve(Train_loss, 'Training Loss', 'Loss', "Train_Vis.png")
    _save_curve(Test_acc, 'Dev Accuracy', 'Accuracy (%)', "Dev_Vis.png")

    # Writeout test labels
    labels = predictLabels(model, test_loader, device)
    np.save(hyper["testLabelName"], labels)
    labels = np.array(list(map(int, labels)))
    idxs = np.arange(len(labels))
    df = pd.DataFrame({"id" : idxs, "label" : labels})
    df.to_csv(hyper["testLabelCSVfn"], index=False)


if __name__ == "__main__":
    hyper = {
        "nEpochs":5,
        "lr":0.0001,
        "lr_decayRate":0.0,
        "randomSeed":2,
        "kContext":12,
        "batchSize":256,
        "dataPath":'./data',
        "weightDirName": './checkpoint/',
        "testLabelName" : "./data/test_labels.npy",
        "testLabelCSVfn": "./data/test_labels.csv",
        "hiddenDims": [2048,1024,1024,1024,1024,512,256],
        "checkpointPath":"./checkpoint/optimContDecrease_Epoch_8contextK_v2.txt"
    }
    main(hyper)


# In[ ]:


# ---- /HW2/HW2P1_Writeup.pdf ----
# https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/HW2P1_Writeup.pdf
# ---- /HW2/HW2P2_Writeup.pdf ----
# https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/HW2P2_Writeup.pdf
# ---- /HW2/hw2p1/autograder/hw2_autograder/ref_result/res_b.npy ----
# https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/hw2p1/autograder/hw2_autograder/ref_result/res_b.npy
# ---- /HW2/hw2p1/autograder/hw2_autograder/ref_result/res_c.npy ----
https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/hw2p1/autograder/hw2_autograder/ref_result/res_c.npy -------------------------------------------------------------------------------- /HW2/hw2p1/autograder/hw2_autograder/weights/mlp_weights_part_b.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/hw2p1/autograder/hw2_autograder/weights/mlp_weights_part_b.npy -------------------------------------------------------------------------------- /HW2/hw2p1/autograder/hw2_autograder/weights/mlp_weights_part_c.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/hw2p1/autograder/hw2_autograder/weights/mlp_weights_part_c.npy -------------------------------------------------------------------------------- /HW2/hw2p1/create_tarball.sh: -------------------------------------------------------------------------------- 1 | tar -cvf handin.tar -X exclude.txt hw2 mytorch 2 | -------------------------------------------------------------------------------- /HW2/hw2p1/exclude.txt: -------------------------------------------------------------------------------- 1 | mytorch/activation.pyc 2 | mytorch/batchnorm.pyc 3 | mytorch/linear.pyc 4 | mytorch/loss.pyc -------------------------------------------------------------------------------- /HW2/hw2p1/hw2/hw2.py: -------------------------------------------------------------------------------- 1 | # DO NOT import any additional 3rd party external libraries as they will not 2 | # be available to AutoLab and are not needed (or allowed)​ 3 | 4 | import numpy as np 5 | import os 6 | import sys 7 | 8 | sys.path.append('mytorch') 9 | from loss import * 10 | from activation import * 11 | from linear 
# hw2.py imports continued from the previous line of this listing:
#     from linear import *
#     from conv import *


class CNN(object):

    """
    A simple convolutional neural network

    Here you implement the same architecture described in Section 3.3
    You need to specify the detailed architecture in function "get_cnn_model" below
    The returned model architecture should be same as in Section 3.3 Figure 3
    """

    def __init__(self, input_width, num_input_channels, num_channels, kernel_sizes, strides,
                 num_linear_neurons, activations, conv_weight_init_fn, bias_init_fn,
                 linear_weight_init_fn, criterion, lr):
        """
        input_width           : int    : The width of the input to the first convolutional layer
        num_input_channels    : int    : Number of channels for the input layer
        num_channels          : [int]  : List containing number of (output) channels for each conv layer
        kernel_sizes          : [int]  : List containing kernel width for each conv layer
        strides               : [int]  : List containing stride size for each conv layer
        num_linear_neurons    : int    : Number of neurons in the linear layer
        activations           : [obj]  : List of objects corresponding to the activation fn for each conv layer
        conv_weight_init_fn   : fn     : Function to init each conv layers weights
        bias_init_fn          : fn     : Function to initialize each conv layers AND the linear layers bias to 0
        linear_weight_init_fn : fn     : Function to initialize the linear layers weights
        criterion             : obj    : Object to the criterion (SoftMaxCrossEntropy) to be used
        lr                    : float  : The learning rate for the class

        You can be sure that len(activations) == len(num_channels) == len(kernel_sizes) == len(strides)
        """

        # Don't change this -->
        self.train_mode = True
        self.nlayers = len(num_channels)

        self.activations = activations
        self.criterion = criterion

        self.lr = lr
        # <---------------------

        # Don't change the name of the following class attributes,
        # the autograder will check against these attributes:
        #   self.convolutional_layers (list of Conv1D)
        #   self.flatten              (Flatten)
        #   self.linear_layer         (Linear)

        # Track the channel count and the width as they evolve through the
        # conv stack; their product is the flattened size fed to the linear
        # layer.  Starting `width` at input_width (rather than 0) keeps the
        # product right even when there are no conv layers at all.
        channels = num_input_channels
        width = input_width
        self.convolutional_layers = []
        for i in range(self.nlayers):
            self.convolutional_layers.append(
                Conv1D(in_channel=channels, out_channel=num_channels[i],
                       kernel_size=kernel_sizes[i], stride=strides[i],
                       weight_init_fn=conv_weight_init_fn, bias_init_fn=bias_init_fn))
            channels = num_channels[i]
            # Output width of an unpadded strided 1-D convolution.
            width = (width - kernel_sizes[i]) // strides[i] + 1
        self.flatten = Flatten()
        self.linear_layer = Linear(in_feature=channels * width, out_feature=num_linear_neurons,
                                   weight_init_fn=linear_weight_init_fn, bias_init_fn=bias_init_fn)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch_size, num_input_channels, input_width)
        Return:
            out (np.array): (batch_size, num_linear_neurons)
        """
        out = x
        for i in range(self.nlayers):
            z = self.convolutional_layers[i].forward(out)
            out = self.activations[i].forward(z)
        out = self.flatten.forward(out)

        # Save output (necessary for error and loss)
        self.output = self.linear_layer.forward(out)
        return self.output

    def backward(self, labels):
        """
        Argument:
            labels (np.array): (batch_size, num_linear_neurons)
        Return:
            grad (np.array): (batch size, num_input_channels, input_width)
        """
        self.loss = self.criterion(self.output, labels).sum()
        grad = self.criterion.derivative()

        # Walk the network in reverse: linear -> flatten -> (activation, conv) pairs.
        grad = self.linear_layer.backward(grad)
        grad = self.flatten.backward(grad)
        for i in range(self.nlayers - 1, -1, -1):
            # Chain rule through the element-wise activation, then the conv.
            dz = np.multiply(grad, self.activations[i].derivative())
            grad = self.convolutional_layers[i].backward(dz)
        return grad

    def zero_grads(self):
        """Reset the gradient buffers of every conv layer and the linear layer in place."""
        for i in range(self.nlayers):
            self.convolutional_layers[i].dW.fill(0.0)
            self.convolutional_layers[i].db.fill(0.0)

        self.linear_layer.dW.fill(0.0)
        self.linear_layer.db.fill(0.0)

    def step(self):
        """Apply one plain gradient-descent update (W -= lr * dW, b -= lr * db) to every layer."""
        for i in range(self.nlayers):
            self.convolutional_layers[i].W = (self.convolutional_layers[i].W -
                                              self.lr * self.convolutional_layers[i].dW)
            self.convolutional_layers[i].b = (self.convolutional_layers[i].b -
                                              self.lr * self.convolutional_layers[i].db)

        # The attribute is `linear_layer` (singular); the previous
        # `self.linear_layers` did not exist and raised AttributeError.
        self.linear_layer.W = (self.linear_layer.W - self.lr * self.linear_layer.dW)
        self.linear_layer.b = (self.linear_layer.b - self.lr * self.linear_layer.db)

    def __call__(self, x):
        # Do not modify this method
        return self.forward(x)

    def train(self):
        # Do not modify this method
        self.train_mode = True

    def eval(self):
        # Do not modify this method
        self.train_mode = False


# ---- /HW2/hw2p1/hw2/mc.py ----
# Multiple Choice

# Return the answer to the multiple choice question in the handout.
# If you think option c is the correct answer,
# return 'c'

def question_1():
    """Answer to multiple-choice question 1."""
    return 'b'


def question_2():
    """Answer to multiple-choice question 2."""
    return 'd'


def question_3():
    """Answer to multiple-choice question 3."""
    return 'b'


def question_4():
    """Answer to multiple-choice question 4."""
    return 'a'


def question_5():
    """Answer to multiple-choice question 5."""
    return 'a'


# ---- /HW2/hw2p1/hw2/mlp.py ----
import numpy as np
from layers import *

# This code is only for your reference for Sections 3.3 and 3.4


class MLP():
    """Stack of Linear layers with a ReLU between each pair (none after the last)."""

    def __init__(self, layer_sizes):
        stack = []
        for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            stack.append(Linear(in_size, out_size))
            stack.append(ReLU())
        # Drop the trailing ReLU so the network ends on a Linear layer.
        self.layers = stack[:-1]

    def __call__(self, x):
        return self.forward(x)

    def init_weights(self, weights):
        # Linear layers sit at the even indices; each given matrix is
        # stored transposed.
        for idx, w in enumerate(weights):
            self.layers[idx * 2].W = w.T

    def forward(self, x):
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def backward(self, delta):
        grad = delta
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad


if __name__ == '__main__':
    D = 24  # length of each feature vector
    layer_sizes = [8 * D, 8, 16, 4]
    mlp = MLP([8 * D, 8, 16, 4])


# ---- /HW2/hw2p1/hw2/mlp_scan.py ----
# DO NOT import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import os
import sys

sys.path.append('mytorch')
from loss import *
from activation import *
from linear import *
from conv import *


class CNN_SimpleScanningMLP():
    """Three-layer Conv1D network that reproduces a scanning MLP (non-distributed form)."""

    def __init__(self):
        # First layer scans 8 frames with a hop of 4; the next two act per position.
        self.conv1 = Conv1D(in_channel=24, out_channel=8, kernel_size=8, stride=4)
        self.conv2 = Conv1D(in_channel=8, out_channel=16, kernel_size=1, stride=1)
        self.conv3 = Conv1D(in_channel=16, out_channel=4, kernel_size=1, stride=1)
        self.layers = [self.conv1, ReLU(), self.conv2, ReLU(), self.conv3, Flatten()]

    def __call__(self, x):
        # Do not modify this method
        return self.forward(x)

    def init_weights(self, weights):
        """
        Load the three MLP weight matrices into the conv kernels.

        Per the original notes the matrices are 192 x 8, 8 x 16 and 16 x 4.
        Each one is transposed, reshaped to (out_channel, kernel_width,
        in_channel) and then re-ordered to the (out_channel, in_channel,
        kernel_width) layout that Conv1D.W uses.
        """
        w1, w2, w3 = weights
        targets = (
            (self.conv1, w1, (8, 8, 24)),
            (self.conv2, w2, (16, 1, 8)),
            (self.conv3, w3, (4, 1, 16)),
        )
        for conv, w, shape in targets:
            per_filter = np.reshape(np.transpose(w), shape)
            conv.W = np.transpose(per_filter, (0, 2, 1))

    def forward(self, x):
        """
        Do not modify this method

        Argument:
            x (np.array): (batch size, in channel, in width)
        Return:
            out (np.array): (batch size, out channel , out width)
        """
        result = x
        for layer in self.layers:
            result = layer(result)
        return result

    def backward(self, delta):
        """
        Do not modify this method

        Argument:
            delta (np.array): (batch size, out channel, out width)
        Return:
            dx (np.array): (batch size, in channel, in width)
        """
        grad = delta
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad


class CNN_DistributedScanningMLP():
    """Three-layer Conv1D network that reproduces the distributed scanning MLP."""

    def __init__(self):
        self.conv1 = Conv1D(in_channel=24, out_channel=2, kernel_size=2, stride=2)
        self.conv2 = Conv1D(in_channel=2, out_channel=8, kernel_size=2, stride=2)
        self.conv3 = Conv1D(in_channel=8, out_channel=4, kernel_size=2, stride=1)
        self.layers = [self.conv1, ReLU(), self.conv2, ReLU(), self.conv3, Flatten()]

    def __call__(self, x):
        # Do not modify this method
        return self.forward(x)

    def init_weights(self, weights):
        """
        Load slices of the three MLP weight matrices into the conv kernels.

        Target kernel shapes (out_channel, in_channel, width):
            conv1.W: 2 x 24 x 2
            conv2.W: 8 x 2 x 2
            conv3.W: 4 x 8 x 2
        Only the leading columns of w1 / w2 and the first two kernel
        positions are kept for the first two layers.
        """
        w1, w2, w3 = weights

        first = np.reshape(np.transpose(w1[:, :2]), (2, 8, 24))
        self.conv1.W = np.transpose(first[:, :2, :], (0, 2, 1))

        second = np.reshape(np.transpose(w2[:, :8]), (8, 4, 2))
        self.conv2.W = np.transpose(second[:, :2, :], (0, 2, 1))

        third = np.reshape(np.transpose(w3), (4, 2, 8))
        self.conv3.W = np.transpose(third, (0, 2, 1))

    def forward(self, x):
        """
        Do not modify this method

        Argument:
            x (np.array): (batch size, in channel, in width)
        Return:
            out (np.array): (batch size, out channel , out width)
        """
        result = x
        for layer in self.layers:
            result = layer(result)
        return result

    def backward(self, delta):
        """
        Do not modify this method

        Argument:
            delta (np.array): (batch size, out channel, out width)
        Return:
            dx (np.array): (batch size, in channel, in width)
        """
        grad = delta
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad


# ---- /HW2/hw2p1/mytorch/activation.py ----
#
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import os


class Activation(object):

    """
    Interface for activation functions (non-linearities).

    In all implementations, the state attribute must contain the result,
    i.e. the output of forward (it will be tested).
    """

    # No additional work is needed for this class, as it acts like an
    # abstract base class for the others

    # Note that these activation functions are scalar operations. I.e, they
    # shouldn't change the shape of the input.

    def __init__(self):
        self.state = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # `NotImplemented` is the binary-operator sentinel, not an exception;
        # raising it produces a TypeError.  NotImplementedError is the
        # exception meant for abstract hooks.
        raise NotImplementedError

    def derivative(self):
        raise NotImplementedError


class Identity(Activation):

    """
    Identity function (already implemented).
    """

    # This class is a gimme as it is already implemented for you as an example

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        self.state = x
        return x

    def derivative(self):
        return 1.0


class Sigmoid(Activation):

    """
    Sigmoid non-linearity
    """

    # Remember do not change the function signatures as those are needed
    # to stay the same for AutoLab.

    def __init__(self):
        super(Sigmoid, self).__init__()

    def forward(self, x):
        # The output is cached because the derivative is expressed in terms of it.
        self.state = 1 / (1 + np.exp(-x))
        return self.state

    def derivative(self):
        # sigma'(x) = sigma(x) * (1 - sigma(x))
        return self.state * (1 - self.state)


class Tanh(Activation):

    """
    Tanh non-linearity
    """

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        self.state = np.tanh(x)
        return self.state

    def derivative(self):
        # tanh'(x) = 1 - tanh(x)^2
        return 1 - self.state ** 2


class ReLU(Activation):

    """
    ReLU non-linearity
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        self.state = np.clip(x, a_min=0, a_max=None)
        return self.state

    def derivative(self):
        # 1 where the unit is active, 0 elsewhere.  The inactive branch is an
        # explicit 0 instead of echoing the cached state back.
        return np.where(self.state > 0, 1.0, 0.0)


# ---- /HW2/hw2p1/mytorch/batchnorm.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np


class BatchNorm(object):
    """Batch normalisation over the batch axis with learnable scale (gamma) and shift (beta)."""

    def __init__(self, in_feature, alpha=0.9):

        # You shouldn't need to edit anything in init

        self.alpha = alpha  # weight of the old value in the running averages
        self.eps = 1e-8     # added to the variance before the square root
        self.x = None
        self.norm = None
        self.out = None

        # The following attributes will be tested
        self.var = np.ones((1, in_feature))
        self.mean = np.zeros((1, in_feature))

        self.gamma = np.ones((1, in_feature))
        self.dgamma = np.zeros((1, in_feature))

        self.beta = np.zeros((1, in_feature))
        self.dbeta = np.zeros((1, in_feature))

        # inference parameters
        self.running_mean = np.zeros((1, in_feature))
        self.running_var = np.ones((1, in_feature))

    def __call__(self, x, eval=False):
        return self.forward(x, eval)

    def forward(self, x, eval=False):
        """
        Argument:
            x (np.array): (batch_size, in_feature)
            eval (bool): inference status

        Return:
            out (np.array): (batch_size, in_feature)
        """
        self.x = x

        if eval:  # use running mean and var for testing
            self.norm = (x - self.running_mean) / np.sqrt(self.running_var + self.eps)
        else:
            self.mean = np.mean(x, axis=0)
            self.var = np.var(x, axis=0)
            self.norm = (x - self.mean) / np.sqrt(self.var + self.eps)

            # Update running batch statistics
            self.running_mean = self.alpha * self.running_mean + (1 - self.alpha) * self.mean
            self.running_var = self.alpha * self.running_var + (1 - self.alpha) * self.var

        self.out = self.gamma * self.norm + self.beta
        return self.out

    def backward(self, delta):
        """
        Argument:
            delta (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, in feature)
        """
        diffX = self.x - self.mean
        dnorm = delta * self.gamma
        self.dbeta = np.sum(delta, axis=0, keepdims=True)
        self.dgamma = np.sum(delta * self.norm, axis=0, keepdims=True)

        sqrtVar = np.sqrt(self.var + self.eps)
        m = delta.shape[0]
        # Gradients w.r.t. the batch variance and mean, then w.r.t. the input.
        dvar = - np.sum(dnorm * (diffX / (2 * (sqrtVar ** 3))), axis=0)
        dmu = - np.sum(dnorm / sqrtVar, axis=0) - (2 / m) * dvar * np.sum(diffX, axis=0)
        dx = (dnorm / sqrtVar) + (dvar * (2 / m) * diffX) + (dmu / m)

        return dx


# ---- /HW2/hw2p1/mytorch/conv.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np


class Conv1D():
    """Unpadded, strided 1-D convolution with weights of shape (out_channel, in_channel, kernel_size)."""

    def __init__(self, in_channel, out_channel, kernel_size, stride,
                 weight_init_fn=None, bias_init_fn=None):
        # Do not modify this method
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.kernel_size = kernel_size
        self.stride = stride

        if weight_init_fn is None:
            self.W = np.random.normal(0, 1.0, (out_channel, in_channel, kernel_size))
        else:
            self.W = weight_init_fn(out_channel, in_channel, kernel_size)

        if bias_init_fn is None:
            self.b = np.zeros(out_channel)
        else:
            self.b = bias_init_fn(out_channel)

        self.dW = np.zeros(self.W.shape)
        self.db = np.zeros(self.b.shape)

        self.x = None
        self.input_size = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch_size, in_channel, input_size)
        Return:
            out (np.array): (batch_size, out_channel, output_size)
        """
        batch_size, _, input_size = x.shape
        output_size = (input_size - self.kernel_size) // self.stride + 1
        self.x = x
        self.input_size = input_size

        res = np.zeros([batch_size, self.out_channel, output_size])
        # One tensordot per output position contracts (in_channel, kernel)
        # for every sample and every filter at once, replacing the
        # per-sample / per-filter Python loops.
        for i in range(output_size):
            start = i * self.stride
            window = x[:, :, start:start + self.kernel_size]
            res[:, :, i] = np.tensordot(window, self.W, axes=([1, 2], [1, 2])) + self.b
        return res

    def backward(self, delta):
        """
        Argument:
            delta (np.array): (batch_size, out_channel, output_size)
        Return:
            dx (np.array): (batch_size, in_channel, input_size)
        """
        output_size = delta.shape[2]
        dX = np.zeros(self.x.shape)

        for out in range(output_size):
            start = out * self.stride
            stop = start + self.kernel_size
            d_out = delta[:, :, out]  # (batch_size, out_channel)

            # NOTE: dW is accumulated into the existing buffer (as before),
            # so callers must zero it between independent backward passes;
            # db below is overwritten on every call.
            self.dW += np.tensordot(d_out, self.x[:, :, start:stop], axes=([0], [0]))

            # Scatter each output gradient back onto the window it came from.
            dX[:, :, start:stop] += np.tensordot(d_out, self.W, axes=([1], [0]))

        # Calculate db
        self.db = np.sum(delta, axis=(0, 2))

        return dX


class Flatten():
    """Collapse (batch, channel, width) to (batch, channel * width) and back."""

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch_size, in_channel, in_width)
        Return:
            out (np.array): (batch_size, in_channel * in width)
        """
        # Remember the input shape so backward can restore it.
        self.b, self.c, self.w = x.shape
        return x.reshape(self.b, self.c * self.w)

    def backward(self, delta):
        """
        Argument:
            delta (np.array): (batch size, in channel * in width)
        Return:
            dx (np.array): (batch size, in channel, in width)
        """
        # Flattening only re-arranges elements, so the gradient is the same
        # values in the original shape.
        return np.reshape(delta, (self.b, self.c, self.w))


# ---- /HW2/hw2p1/mytorch/linear.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import math


class Linear():
    def __init__(self, in_feature, out_feature, weight_init_fn, bias_init_fn):

        """
        Argument:
            W (np.array): (in feature, out feature)
            dW (np.array): (in feature, out feature)
            momentum_W (np.array): (in feature, out feature)

            b (np.array): (1, out feature)
            db (np.array): (1, out feature)
            momentum_B (np.array): (1, out feature)
        """

        self.W = weight_init_fn(in_feature, out_feature)
        self.b = bias_init_fn(out_feature)

        self.dW = np.zeros((in_feature, out_feature))
        self.db = np.zeros((1, out_feature))

        self.momentum_W = np.zeros((in_feature, out_feature))
        self.momentum_B = np.zeros((1, out_feature))

        self.x = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, out feature)
        """
        self.x = x  # cached for the weight gradient in backward
        return np.dot(x, self.W) + self.b

    def backward(self, delta):

        """
        Argument:
            delta (np.array): (batch size, out feature)
        Return:
            out (np.array): (batch size, in feature)
        """
        batchSize = delta.shape[0]
        # Parameter gradients are averaged over the batch; the gradient
        # passed to the previous layer is not.
        self.dW = np.dot(self.x.T, delta) / batchSize  # same shape as W
        self.db = np.mean(delta, axis=0, keepdims=True)
        return np.dot(delta, self.W.T)


# ---- /HW2/hw2p1/mytorch/loss.py ----
# Do not import any additional 3rd party external libraries as they will not
# be available to AutoLab and are not needed (or allowed)

import numpy as np
import os

# The following Criterion class will be used again as the basis for a number
# of loss functions (which are in the form of classes so that they can be
# exchanged easily (it's how PyTorch and other ML libraries do it))


class Criterion(object):
    """
    Interface for loss functions.
    """

    # Nothing needs done to this class, it's used by the following Criterion classes

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        return self.forward(x, y)

    def forward(self, x, y):
        # See Activation.forward: NotImplementedError is the exception type.
        raise NotImplementedError

    def derivative(self):
        raise NotImplementedError


class SoftmaxCrossEntropy(Criterion):
    """
    Softmax loss
    """

    def __init__(self):
        super(SoftmaxCrossEntropy, self).__init__()
        self.softmax = None

    def forward(self, x, y):
        """
        Argument:
            x (np.array): (batch size, 10)
            y (np.array): (batch size, 10)
        Return:
            out (np.array): (batch size, )
        """
        self.logits = x
        self.labels = y

        # LogSumExp trick: subtract the row max before exponentiating, and
        # work with log-softmax directly.  Taking log() of an already
        # underflowed softmax gives -inf, which turns the loss into inf
        # (or nan when multiplied by a zero label).
        shifted = x - np.amax(x, axis=1, keepdims=True)
        log_softmax = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

        self.softmax = np.exp(log_softmax)
        self.loss = -np.sum(self.labels * log_softmax, axis=1)
        return self.loss

    def derivative(self):
        """
        Return:
            out (np.array): (batch size, 10)
        """

        return self.softmax - self.labels


# ---- /HW2/hw2p2/README.md ----
# File structure:
# - HW1P2
# - main.py
# - 11-785hw2p2-s20
# - provided file and folders including data from kaggle
# - checkpoint
# - ContContInitWeight_BaselineSGD_StepLR_Epoch4.txt
# - output
# - Saved label test file for classification and verification
#
# To run my model:
# - Have the file structure as above
# - Type "source activate pytorch_p36" in the terminal
# - In the HW2P2 directory, type "python3 main.py" to run the model
# - After the model finishes, find predicted test_class_labels.npy and csv files under output folder
# Network
design: 17 | - Used MobileNetV2 to implement. 18 | - Used a python list to store (i, j) to index into the list, where i is the utterance 19 | index and j is the frame index. I used python.take to include 12 left and right frames of 20 | the current indexed frame. 21 | 22 | Steps taken to train my model that gets the optimal result: 23 | - First train model with configuration of the following structure 24 | hidden layer dimensions: [2048,1024,1024,1024,1024,512,256] 25 | MLP( 26 | (net): Sequential( 27 | (0): Linear(in_features=1000, out_features=2048, bias=True) 28 | (1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 29 | (2): LeakyReLU(negative_slope=0.01) 30 | (3): Linear(in_features=2048, out_features=1024, bias=True) 31 | (4): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 32 | (5): LeakyReLU(negative_slope=0.01) 33 | (6): Linear(in_features=1024, out_features=1024, bias=True) 34 | (7): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 35 | (8): LeakyReLU(negative_slope=0.01) 36 | (9): Linear(in_features=1024, out_features=1024, bias=True) 37 | (10): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 38 | (11): LeakyReLU(negative_slope=0.01) 39 | (12): Linear(in_features=1024, out_features=1024, bias=True) 40 | (13): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 41 | (14): LeakyReLU(negative_slope=0.01) 42 | (15): Linear(in_features=1024, out_features=512, bias=True) 43 | (16): LeakyReLU(negative_slope=0.01) 44 | (17): Linear(in_features=512, out_features=256, bias=True) 45 | (18): LeakyReLU(negative_slope=0.01) 46 | (19): Linear(in_features=256, out_features=138, bias=True) 47 | ) 48 | ) 49 | kContext size: 12 50 | optimizer: Adam with 0.001 learning rate with learning rate scheduler 51 | learning rate scheduler: ReduceLROnPlateau, mode=min, patience=2 52 | batch size: 256 53 | number of
Epochs: 5 54 | Save the model in file optim_Epoch_4contextK_v2.txt 55 | - Then load the model from optim_Epoch_4contextK_v2.txt and continue training with the same settings except 56 | number of Epochs: 9 57 | Save the model in file optimCont_Epoch_8contextK_v2.txt 58 | - Then load the model from file optimCont_Epoch_8contextK_v2.txt and continue training with 59 | number of Epochs: 9 60 | Save the model in file optimContDecrease_Epoch_8contextK_v2.txt 61 | 62 | Reference Links: 63 | https://florimond.dev/blog/articles/2018/10/reconciling-dataclasses-and-properties-in-python/ 64 | https://medium.com/@Biboswan98/optim-adam-vs-optim-sgd-lets-dive-in-8dbf1890fbdc 65 | https://pytorch.org/docs/stable/optim.html 66 | https://stackoverflow.com/questions/41153803/zero-padding-slice-past-end-of-array-in-numpy 67 | https://dev.to/hardiksondagar/how-to-use-aws-ebs-volume-as-a-swap-memory-5d15 68 | https://www.google.com/search?client=safari&sxsrf=ACYBGNSjEyEM_L_y5jQjSiKA7LAUMCu38g%3A1581151963058&source=hp&ei=23Y-Xosv5q_K0w_e2IjoAg&q=torch.save%28model.state_dict%28%29&oq=torch.save%28model.state_dict%28%29&gs_l=psy-ab.3..0l3j0i333l3.4254.4254..4482...3.0..0.85.85.1......0....2j1..gws-wiz.yz9xThPrXqE&ved=0ahUKEwjLiN2IysHnAhXml3IEHV4sAi0Q4dUDCAs&uact=5 69 | https://pytorch.org/tutorials/beginner/saving_loading_models.html 70 | https://stackoverflow.com/questions/33492260/save-multiple-arrays-to-a-csv-file-with-column-names 71 | https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.ReduceLROnPlateau 72 | -------------------------------------------------------------------------------- /HW2/hw2p2/hw2p2_writeup_submit.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW2/hw2p2/hw2p2_writeup_submit.pdf -------------------------------------------------------------------------------- /HW2/hw2p2/p2.py:
def _load_image_tensor(path):
    """Read the image at ``path`` into a tensor and close the file handle.

    ``PIL.Image.open`` keeps the underlying file open until the image is
    closed, so the conversion is done inside a ``with`` block.
    """
    with PIL.Image.open(path) as img:
        return transforms.ToTensor()(img)


class MyDatasetTestClassify(Dataset):
    """
    Dataset instance of classification task, test data.

    ``testFN`` is a text file with one image file name per line;
    ``testImgFolderPath`` is prepended verbatim to each name, so it must end
    with a path separator.
    """

    def __init__(self, testFN, testImgFolderPath):
        self.imgFolderPath = testImgFolderPath
        with open(testFN) as f:
            # Drop blank lines so a trailing newline is not treated as a sample.
            self.fileList = [name for name in (line.rstrip() for line in f) if name]

    def __len__(self):
        return len(self.fileList)

    def __getitem__(self, idx):
        """Return ``(image_tensor, -1)``; -1 is a placeholder label."""
        return _load_image_tensor(self.imgFolderPath + self.fileList[idx]), -1

    def getFileList(self):
        """Return the list of image file names, in file order."""
        return self.fileList


class MyDatasetVerify(Dataset):
    """
    Dataset instance of verification task.

    Each line of ``pairFN`` is ``"<img1> <img2>"`` (test) or
    ``"<img1> <img2> <label>"`` (validation); file names are relative to
    ``imgFolderPath``, which is prepended verbatim.
    """

    def __init__(self, pairFN, imgFolderPath):
        self.imgFolderPath = imgFolderPath
        with open(pairFN) as f:
            # Drop blank lines; they would fail when split into a pair.
            self.pairList = [pair for pair in (line.rstrip() for line in f) if pair]

    def __len__(self):
        return len(self.pairList)

    def __getitem__(self, idx):
        """Return ``(img1, img2, label)``; label is -1 when the line has none."""
        items = self.pairList[idx].split()
        img1 = _load_image_tensor(self.imgFolderPath + items[0])
        img2 = _load_image_tensor(self.imgFolderPath + items[1])
        label = int(items[2]) if len(items) == 3 else -1
        return img1, img2, label

    def getPairList(self):
        """Return the raw pair lines, in file order."""
        return self.pairList


class BottleNeck(nn.Module):
    """
    Bottleneck block for MobileNetV2: 1x1 expansion -> 3x3 depthwise ->
    1x1 projection, each followed by batch norm (ReLU6 after the first two).
    """

    def __init__(self, inChannel, outChannel, stride, expandT):
        super(BottleNeck, self).__init__()
        self.stride = stride
        # A residual connection is only shape-compatible in this case.
        self.shouldSkip = self.stride == 1 and inChannel == outChannel
        hiddenDim = int(inChannel * expandT)
        # Layer order is kept exactly as before so saved state_dict keys match.
        self.conv = nn.Sequential(
            # 1 x 1 expansion layer + bn + ReLU6
            nn.Conv2d(inChannel, hiddenDim, 1, 1, 0, bias=False),
            nn.BatchNorm2d(hiddenDim),
            nn.ReLU6(inplace=True),
            # 3 x 3 depthwise conv + bn + ReLU6
            nn.Conv2d(hiddenDim, hiddenDim, 3, self.stride, 1, groups=hiddenDim, bias=False),
            nn.BatchNorm2d(hiddenDim),
            nn.ReLU6(inplace=True),
            # 1 x 1 projection layer + bn
            nn.Conv2d(hiddenDim, outChannel, 1, 1, 0, bias=False),
            nn.BatchNorm2d(outChannel),
        )

    def forward(self, x):
        out = self.conv(x)
        return x + out if self.shouldSkip else out


def conv2d_3x3_bn_relu(inChannel, outChannel, stride):
    """
    Conv2d layer with 3x3 kernel and customized stride + batchnorm + ReLU6
    """
    return nn.Sequential(
        nn.Conv2d(inChannel, outChannel, 3, stride, 1, bias=False),
        nn.BatchNorm2d(outChannel),
        nn.ReLU6(inplace=True),
    )


def conv2d_1x1_bn_relu(inChannel, outChannel):
    """
    Conv2d layer with 1x1 kernel and stride 1 + batchnorm + ReLU6
    """
    return nn.Sequential(
        nn.Conv2d(inChannel, outChannel, 1, 1, 0, bias=False),
        nn.BatchNorm2d(outChannel),
        nn.ReLU6(inplace=True),
    )


class Network(nn.Module):
    """
    MobileNetV2 implementation.

    Parameters
    ----------
    inputSize : unused; kept so existing callers keep working.
    bottlesSetting : iterable of ``(t, c, n, s)`` rows - expansion factor,
        output channels, number of repeated blocks, stride of the first block.
    numClasses : size of the classification output.
    feat_dim : width of the final 1x1 conv, i.e. of the returned embedding.
        Previously accepted but ignored (the width was hard-coded to 1280);
        the default keeps the old architecture and checkpoint layout.
    """

    def __init__(self, inputSize, bottlesSetting, numClasses, feat_dim=1280):
        super(Network, self).__init__()
        firstChannel = 32
        lastChannel = feat_dim
        blocks = [conv2d_3x3_bn_relu(3, firstChannel, 1)]
        # build MobileNet bottlenecks
        bottleInChannel = firstChannel
        for t, c, n, s in bottlesSetting:
            for i in range(n):
                # only the first block of a group uses the group's stride
                blocks.append(BottleNeck(bottleInChannel, c, s if i == 0 else 1, t))
                bottleInChannel = c
        # build the conv2d 1x1 layer
        blocks.append(conv2d_1x1_bn_relu(bottleInChannel, lastChannel))
        self.net = nn.Sequential(*blocks)

        # build classifier
        self.classifier = nn.Linear(lastChannel, numClasses)

    def forward(self, x):
        """Return ``(embedding, class_logits)`` for a batch of images."""
        x = self.net(x)
        # global average pool, then flatten to (batch, lastChannel)
        embedding_out = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
        classification_out = self.classifier(embedding_out)
        return embedding_out, classification_out


def getLoaders(trainDS, devDS, testDS, batchS):
    """
    Create and return dataloader for train, dev, test dataset.

    Only the training loader shuffles. The dev loader used to shuffle too,
    which changed the composition of the last partial batch from run to run.
    """
    print("*** Create data loader ***")

    # Train
    train_loader = DataLoader(trainDS, shuffle=True, batch_size=batchS,
                              num_workers=8, pin_memory=True)

    # Dev
    dev_loader = DataLoader(devDS, shuffle=False, batch_size=batchS,
                            num_workers=8, pin_memory=True)

    # Test
    test_loader = DataLoader(testDS, shuffle=False, batch_size=100,
                             num_workers=1, pin_memory=True)

    return train_loader, dev_loader, test_loader
185 | print("Epoch: {}\tBatch: {}\tTimestamp: {}".format(epoch, batch_idx, time.time() - start_time)) 186 | 187 | # clear computation cache 188 | torch.cuda.empty_cache() 189 | del data 190 | del target 191 | del loss 192 | end_time = time.time() 193 | running_loss = running_loss / len(data_loader) 194 | return running_loss 195 | 196 | 197 | def testClassify(model, test_loader, epoch): 198 | with torch.no_grad(): 199 | model.eval() 200 | start_time = time.time() 201 | running_loss = 0.0 202 | correct_predictions = 0.0 203 | total_predictions = 0.0 204 | for batch_idx, (data, target) in enumerate(test_loader): 205 | data, target = data.cuda(), target.cuda() 206 | 207 | outputs = model(data.float())[1] 208 | _, predicted = torch.max(outputs.data, 1) 209 | loss = criterion(outputs, target.long()).detach() 210 | total_predictions += target.size(0) 211 | correct_predictions += (predicted==target).sum().item() 212 | running_loss += loss.item() 213 | if batch_idx % 500 == 0: 214 | print("Epoch: {}\tBatch: {}\tTimestamp: {}".format(epoch, batch_idx, time.time()-start_time)) 215 | del data 216 | del target 217 | running_loss /= len(test_loader) 218 | acc = (correct_predictions/total_predictions)*100.0 219 | return running_loss, acc 220 | 221 | 222 | def testVerify(model, vLoader): 223 | similarity = np.array([]) 224 | true = np.array([]) 225 | start_time = time.time() 226 | with torch.no_grad(): 227 | start_time = time.time() 228 | for batch_idx, (imgs1, imgs2, targets) in enumerate(vLoader): 229 | imgs1, imgs2, targets = imgs1.cuda(), imgs2.cuda(), targets.cuda() 230 | # find cos similarity between embeddings 231 | imgs1Embed = model(imgs1.float())[0] 232 | imgs2Embed = model(imgs2.float())[0] 233 | sim = F.cosine_similarity(imgs1Embed, imgs2Embed) 234 | similarity = np.concatenate((similarity, sim.cpu().numpy().reshape(-1))) 235 | true = np.concatenate((true, targets.cpu().numpy().reshape(-1))) 236 | if batch_idx % 100 == 0: 237 | print("Batch: {}\t 
Timestamp:{}".format(batch_idx, time.time()-start_time)) 238 | del imgs1 239 | del imgs2 240 | del targets 241 | return similarity, true 242 | 243 | 244 | def predictLabels(model, test_loader): 245 | with torch.no_grad(): 246 | model.eval() 247 | res = np.array([]) 248 | for batch_idx, (data, target) in enumerate(test_loader): 249 | data, target = data.cuda(), target.cuda() 250 | 251 | outputs = model(data)[1] 252 | _, predicted = torch.max(outputs.data, dim=1) 253 | res = np.concatenate((res, predicted.cpu().numpy().reshape(-1))) 254 | del data 255 | del target 256 | return res 257 | 258 | 259 | 260 | def main(hyper): 261 | dataFolder = hyper['dataPath'] 262 | wegithDirName = hyper['weightDirName'] 263 | cuda = torch.cuda.is_available() 264 | 265 | print("*** Load raw data ***") 266 | train = datasets.ImageFolder(root=dataFolder+"/train_data/medium", transform=transforms.Compose([ 267 | transforms.RandomHorizontalFlip(), transforms.ToTensor()])) 268 | dev = datasets.ImageFolder(root=dataFolder+"/validation_classification/medium", 269 | transform=transforms.ToTensor()) 270 | # Load custom dataset for test since it does not follow ImageFolder structure 271 | test = MyDatasetTestClassify(hyper["classifyTestListPath"], hyper["classifyTestImgFolderPath"]) 272 | print("train data stat: {} images \t {} classes".format(train.__len__(), len(train.classes))) 273 | print("dev data stat: {} images \t {} classes".format(dev.__len__(), len(dev.classes))) 274 | print("test data stat: {} images".format(test.__len__())) 275 | 276 | # Get data loaders 277 | train_loader, dev_loader, test_loader = getLoaders(train, dev, test, hyper["batchSize"]) 278 | 279 | # Create the model and define the Loss and Optimizer 280 | print("*** Create the model and define Loss and Optimizer ***") 281 | inputSize = train.__len__() # number of train input images 282 | outputSize = len(train.classes) # number of unique face classes 283 | model = Network(inputSize, hyper["bottleneckSetting"], outputSize) 
284 | checkpoint = torch.load(hyper["checkpoingPath"]) 285 | model.load_state_dict(checkpoint["model_state_dict"]) 286 | criterion = nn.CrossEntropyLoss() 287 | optimizer = optim.SGD(model.parameters(), lr=hyper["lr"], momentum=0.9, nesterov=True, weight_decay=5e-4) 288 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=1) 289 | device = torch.device("cuda" if cuda else "cpu") 290 | model.cuda() 291 | print(model) 292 | 293 | # Train the model for N epochs 294 | print("*** Train the model for N epochs ***") 295 | Train_loss = [] 296 | Train_acc = [] 297 | Test_loss = [] 298 | Test_acc = [] 299 | for i in range(hyper["nEpochs"]): 300 | for prarm_group in optimizer.param_groups: 301 | print("Current lr: \t{}".format(prarm_group["lr"])) 302 | startTime = time.time() 303 | print("Train\tEpoch: {}".format(i)) 304 | train_loss = train_epoch(model, train_loader, criterion, optimizer, i) 305 | if hyper["task"] == "Classification": 306 | print("Classify Train \tEpoch: {}".format(i)) 307 | train_loss, train_acc = testClassify(model, train_loader, i) 308 | print("Classify Dev \tEpoch: {}".format(i)) 309 | dev_loss, dev_acc = testClassify(model, dev_loader, i) 310 | print('Train Loss: {:.4f}\tTrain Accuracy: {:.4f}\tVal Loss: {:.4f}\tVal Accuracy: {:.4f}'. 
311 | format(train_loss, train_acc, dev_loss, dev_acc)) 312 | else: 313 | print("Verification task") 314 | scheduler.step(dev_loss) 315 | Train_loss.append(train_loss) 316 | Train_acc.append(train_acc) 317 | Test_loss.append(dev_loss) 318 | Test_acc.append(dev_acc) 319 | print("*** Saving Checkpoint ***") 320 | path = "{}ContContContInitWeight_BaselineSGD_StepLR_Epoch{}.txt".format(wegithDirName, i) 321 | torch.save({ 322 | "epoch": i, 323 | 'model_state_dict': model.state_dict(), 324 | 'optimizer_state_dict': optimizer.state_dict(), 325 | 'train_loss': train_loss, 326 | "train_acc": train_acc, 327 | 'dev_loss':dev_loss, 328 | 'dev_acc': dev_acc 329 | }, path) 330 | print("="*20 + " Epoch {} took {}s".format(i, time.time()-startTime) + "="*20) 331 | 332 | 333 | # Writeout test labels for classification task 334 | labels = predictLabels(model, test_loader) 335 | labels = list(map(int, labels)) 336 | idxs = np.array(test.getFileList()) 337 | labels = np.array(labels) 338 | # create mappings of file set labels to true labels 339 | alphabetSorted = sorted([str(x) for x in range(0, 2300)]) 340 | filesetTrueLabelTuple = [(i, int(alphabetSorted[i])) for i in range(len(alphabetSorted))] 341 | mapping = dict(filesetTrueLabelTuple) 342 | labels = np.array(list(map(mapping.get, labels))) 343 | np.save(hyper["testClassLabelName"], labels) 344 | df = pd.DataFrame({"Id" : idxs, "Category" : labels}) 345 | df.to_csv(hyper["testClassLabelCSVfn"], index=False) 346 | 347 | # Read in verification pairs for validation 348 | verifyData_valid = MyDatasetVerify(hyper["verifyPairListPath"], hyper["verifyImgFolderPath"]) 349 | verify_loader_args_valid = dict(shuffle=False, batch_size=200, num_workers=8, pin_memory=True) 350 | verify_loader_valid = DataLoader(verifyData_valid, **verify_loader_args_valid) 351 | # Calculate simliarity score 352 | cosScore_valid, trueScore_valid = testVerify(model, verify_loader_valid) 353 | # Report AUC 354 | auc = roc_auc_score(trueScore_valid, 
cosScore_valid) 355 | print("*** AUC: {} ***".format(auc)) 356 | 357 | # Read in verification pairs for test 358 | verifyData_test = MyDatasetVerify(hyper["verifyTestPairListPath"], hyper["verifyTestImgFolderPath"]) 359 | verify_loader_args_test = dict(shuffle=False, batch_size=300, num_workers=8, pin_memory=True) 360 | verify_loader_test = DataLoader(verifyData_test, **verify_loader_args_test) 361 | # Calculate similarity score 362 | cosScore_test, _ = testVerify(model, verify_loader_test) 363 | 364 | # Save predictied similarity 365 | cosScore_test = np.array(cosScore_test) 366 | np.save(hyper["testVeriLabelName"], cosScore_test) 367 | trial = np.array(verifyData_test.getPairList()) 368 | df = pd.DataFrame({"trial" : trial, "score" : cosScore_test}) 369 | df.to_csv(hyper["testVeriLabelCSVfn"], index=False) 370 | 371 | if __name__ == "__main__": 372 | # Hyperparameters 373 | hyper = { 374 | "task": "Classification", 375 | "bottleneckSetting": [[1, 16, 1, 1], # t, c, n, s 376 | [6, 24, 2, 1], 377 | [6, 32, 3, 2], 378 | [6, 64, 4, 1], 379 | [6, 96, 3, 2], 380 | [6, 160, 3, 1], 381 | [6, 320, 1, 1]], 382 | "nEpochs":50, 383 | "batchSize":256, 384 | "lr":0.001,#1e-3, 385 | "dataPath": "./11-785hw2p2-s20", 386 | "checkpoingPath": "./checkpoint/ContContInitWeight_BaselineSGD_StepLR_Epoch4.txt", 387 | "classifyTestImgFolderPath": "./11-785hw2p2-s20/test_classification/medium/", 388 | "classifyTestListPath": "./11-785hw2p2-s20/test_order_classification.txt", 389 | "verifyImgFolderPath": "./11-785hw2p2-s20/validation_verification/", 390 | "verifyPairListPath": "./11-785hw2p2-s20/validation_trials_verification.txt", 391 | "verifyTestPairListPath": "./11-785hw2p2-s20/test_trials_verification_student.txt", 392 | "verifyTestImgFolderPath": "./11-785hw2p2-s20/test_verification/", 393 | "weightDirName": "./checkpoint/", 394 | "testClassLabelName":"./output/test_class_labels.npy", 395 | "testClassLabelCSVfn":"./output/test_class_labels.csv", 396 | 
"testVeriLabelName":"./output/test_veri_labels.npy", 397 | "testVeriLabelCSVfn":"./output/test_veri_labels.csv", 398 | } 399 | main(hyper) 400 | 401 | 402 | 403 | -------------------------------------------------------------------------------- /HW3/HW3P1_Writeup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW3/HW3P1_Writeup.pdf -------------------------------------------------------------------------------- /HW3/HW3P2_Writeup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW3/HW3P2_Writeup.pdf -------------------------------------------------------------------------------- /HW3/hw3p1/autograder/hw3_autograder/test.py: -------------------------------------------------------------------------------- 1 | # Test object to be used for other homeworks 2 | class Test(object): 3 | def __init__(self): 4 | pass 5 | 6 | def assertions(self, user_vals, expected_vals, test_type, test_name): 7 | if test_type == 'type': 8 | try: 9 | assert type(user_vals) == type(expected_vals) 10 | except Exception as e: 11 | print('Type error, your type doesnt match the expected type.') 12 | print('Wrong type for %s' % test_name) 13 | print('Your type: ', type(user_vals)) 14 | print('Expected type:', type(expected_vals)) 15 | return False 16 | elif test_type == 'shape': 17 | try: 18 | assert user_vals.shape == expected_vals.shape 19 | except Exception as e: 20 | print('Shape error, your shapes doesnt match the expected shape.') 21 | print('Wrong shape for %s' % test_name) 22 | print('Your shape: ', user_vals.shape) 23 | print('Expected shape:', expected_vals.shape) 24 | return False 25 | elif test_type == 'closeness': 26 | try: 27 | assert np.allclose(user_vals, expected_vals) 28 | except Exception as 
e: 29 | print('Closeness error, your values dont match the expected values.') 30 | print('Wrong values for %s' % test_name) 31 | print('Your values: ', user_vals) 32 | print('Expected values:', expected_vals) 33 | return False 34 | return True 35 | 36 | def print_failure(self, cur_test): 37 | print('*'*77) 38 | print('The local autograder will not work if you do not pass %s.' % cur_test) 39 | print('*'*77) 40 | print(' ') 41 | 42 | def print_name(self, cur_question): 43 | print('-'*20) 44 | print(cur_question) 45 | 46 | def print_outcome(self, short, outcome): 47 | print(short + ': ', 'PASS' if outcome else '*** FAIL ***') 48 | print('-'*20) 49 | print() 50 | -------------------------------------------------------------------------------- /HW3/hw3p1/autograder/hw3_autograder/test_rnn.py: -------------------------------------------------------------------------------- 1 | import sys, pdb, os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from collections import OrderedDict 6 | from test import Test 7 | 8 | sys.path.append('mytorch') 9 | from rnn_cell import * 10 | from loss import * 11 | 12 | sys.path.append('hw3') 13 | from rnn_classifier import * 14 | 15 | # Reference Pytorch RNN Model 16 | class Reference_Model(nn.Module): 17 | def __init__(self, input_size, hidden_size, output_size, rnn_layers=2): 18 | super(Reference_Model, self).__init__() 19 | self.rnn = nn.RNN(input_size, hidden_size, num_layers=rnn_layers, bias=True, batch_first=True) 20 | self.output = nn.Linear(hidden_size, output_size) 21 | 22 | def forward(self, x, init_h=None): 23 | out, hidden = self.rnn(x, init_h) 24 | out = self.output(out[:,-1,:]) 25 | return out 26 | 27 | 28 | class RNN_Test(Test): 29 | def __init__(self): 30 | pass 31 | 32 | def test_rnncell_forward(self): 33 | np.random.seed(11785) 34 | torch.manual_seed(11785) 35 | # Using i within this loop to vary the inputs 36 | for i in range(1, 6): 37 | 38 | # Make pytorch rnn cell and get weights 39 | 
pytorch_rnn_cell = nn.RNNCell(i*2, i*3) 40 | state_dict = pytorch_rnn_cell.state_dict() 41 | W_ih, W_hh = state_dict['weight_ih'].numpy(), state_dict['weight_hh'].numpy() 42 | b_ih, b_hh = state_dict['bias_ih'].numpy(), state_dict['bias_hh'].numpy() 43 | 44 | # Set user cell and weights 45 | user_cell = RNN_Cell(i*2, i*3) 46 | user_cell.init_weights(W_ih, W_hh, b_ih, b_hh) 47 | 48 | # Get inputs 49 | time_steps = i*2 50 | inp = torch.randn(time_steps, i*2, i*2) 51 | hx = torch.randn(i*2, i*3) 52 | hx_user = hx 53 | 54 | # Loop through inputs 55 | for t in range(time_steps): 56 | hx = pytorch_rnn_cell(inp[t], hx) 57 | hx_user = user_cell(inp[t], hx_user) 58 | assert(np.allclose(hx.detach().numpy(), hx_user, rtol=1e-03)) 59 | 60 | return True 61 | 62 | def test_rnncell_backward(self): 63 | expected_results = np.load(os.path.join('autograder', 'hw3_autograder', 64 | 'data', 'rnncell_backward.npy'), allow_pickle = True) 65 | dx1_, dh1_, dx2_, dh2_, dW_ih_, dW_hh_, db_ih_, db_hh_ = expected_results 66 | 67 | np.random.seed(11785) 68 | torch.manual_seed(11785) 69 | 70 | batch_size = 3 71 | input_size = 10 72 | hidden_size = 20 73 | user_cell = RNN_Cell(10, 20) 74 | 75 | # Run backward once 76 | delta = np.random.randn(batch_size, hidden_size) 77 | h = np.random.randn(batch_size, hidden_size) 78 | h_prev_l = np.random.randn(batch_size, input_size) 79 | h_prev_t = np.random.randn(batch_size, hidden_size) 80 | dx1, dh1 = user_cell.backward(delta, h, h_prev_l, h_prev_t) 81 | 82 | # Run backward again 83 | delta = np.random.randn(batch_size, hidden_size) 84 | h = np.random.randn(batch_size, hidden_size) 85 | h_prev_l = np.random.randn(batch_size, input_size) 86 | h_prev_t = np.random.randn(batch_size, hidden_size) 87 | dx2, dh2 = user_cell.backward(delta, h, h_prev_l, h_prev_t) 88 | 89 | dW_ih, dW_hh = user_cell.dW_ih, user_cell.dW_hh 90 | db_ih, db_hh = user_cell.db_ih, user_cell.db_hh 91 | 92 | # Verify derivatives 93 | assert(np.allclose(dx1, dx1_, rtol=1e-04)) 94 | 
assert(np.allclose(dx2, dx2_, rtol=1e-04)) 95 | assert(np.allclose(dh1, dh1_, rtol=1e-04)) 96 | assert(np.allclose(dh2, dh2_, rtol=1e-04)) 97 | assert(np.allclose(dW_ih, dW_ih_, rtol=1e-04)) 98 | assert(np.allclose(dW_hh, dW_hh_, rtol=1e-04)) 99 | assert(np.allclose(db_ih, db_ih_, rtol=1e-04)) 100 | assert(np.allclose(db_hh, db_hh_, rtol=1e-04)) 101 | 102 | # Use to save test data for next semester 103 | # results = [dx1, dh1, dx2, dh2, dW_ih, dW_hh, db_ih, db_hh] 104 | # np.save(os.path.join('autograder', 'hw3_autograder', 105 | # 'data', 'rnncell_backward.npy'), results, allow_pickle=True) 106 | return True 107 | 108 | def test_rnn_classifier(self): 109 | rnn_layers = 2 110 | batch_size = 5 111 | seq_len = 10 112 | input_size = 40 113 | hidden_size = 32 # hidden_size > 100 will cause precision error 114 | output_size = 138 115 | 116 | np.random.seed(11785) 117 | torch.manual_seed(11785) 118 | 119 | data_x = np.random.randn(batch_size, seq_len, input_size) 120 | data_y = np.random.randint(0, output_size, batch_size) 121 | 122 | # Initialize 123 | # Reference model 124 | rnn_model = Reference_Model(input_size, hidden_size, output_size, rnn_layers=rnn_layers) 125 | model_state_dict = rnn_model.state_dict() 126 | # My model 127 | my_rnn_model = RNN_Phoneme_Classifier(input_size, hidden_size, output_size, num_layers=rnn_layers) 128 | rnn_weights = [[model_state_dict['rnn.weight_ih_l%d' % l].numpy(), 129 | model_state_dict['rnn.weight_hh_l%d' % l].numpy(), 130 | model_state_dict['rnn.bias_ih_l%d' % l].numpy(), 131 | model_state_dict['rnn.bias_hh_l%d' % l].numpy()] for l in range(rnn_layers)] 132 | fc_weights = [model_state_dict['output.weight'].numpy(), model_state_dict['output.bias'].numpy()] 133 | my_rnn_model.init_weights(rnn_weights, fc_weights) 134 | 135 | # Test forward pass 136 | # Reference model 137 | ref_init_h = nn.Parameter(torch.zeros(rnn_layers, batch_size, hidden_size, dtype=torch.float), requires_grad=True) 138 | ref_out_tensor = 
rnn_model(torch.FloatTensor(data_x), ref_init_h) 139 | ref_out = ref_out_tensor.detach().numpy() 140 | 141 | # My model 142 | my_out = my_rnn_model(data_x) 143 | 144 | # Verify forward outputs 145 | print('Testing RNN Classifier Forward...') 146 | assert(np.allclose(my_out, ref_out, rtol=1e-03)) 147 | # if not self.assertions(my_out, ref_out, 'closeness', 'RNN Classifier Forwrd'): #rtol=1e-03) 148 | # return 'RNN Forward' 149 | print('RNN Classifier Forward: PASS' ) 150 | print('Testing RNN Classifier Backward...') 151 | 152 | # Test backward pass 153 | # Reference model 154 | criterion = nn.CrossEntropyLoss() 155 | loss = criterion(ref_out_tensor, torch.LongTensor(data_y)) 156 | ref_loss = loss.detach().item() 157 | rnn_model.zero_grad() 158 | loss.backward() 159 | grad_dict = {k:v.grad for k, v in zip(rnn_model.state_dict(), rnn_model.parameters())} 160 | dh = ref_init_h.grad 161 | 162 | # My model 163 | my_criterion = SoftmaxCrossEntropy() 164 | my_labels_onehot = np.zeros((batch_size, output_size)) 165 | my_labels_onehot[np.arange(batch_size), data_y] = 1.0 166 | my_loss = my_criterion(my_out, my_labels_onehot).mean() 167 | delta = my_criterion.derivative() 168 | my_dh = my_rnn_model.backward(delta) 169 | 170 | # Verify derivative w.r.t. 
each network parameters 171 | assert(np.allclose(my_dh, dh.detach().numpy(), rtol=1e-04)) 172 | assert(np.allclose(my_rnn_model.output_layer.dW, grad_dict['output.weight'].detach().numpy(), rtol=1e-03)) 173 | assert(np.allclose(my_rnn_model.output_layer.db, grad_dict['output.bias'].detach().numpy())) 174 | for l, rnn_cell in enumerate(my_rnn_model.rnn): 175 | assert(np.allclose(my_rnn_model.rnn[l].dW_ih, grad_dict['rnn.weight_ih_l%d' % l].detach().numpy(), rtol=1e-03)) 176 | assert(np.allclose(my_rnn_model.rnn[l].dW_hh, grad_dict['rnn.weight_hh_l%d' % l].detach().numpy(), rtol=1e-03)) 177 | assert(np.allclose(my_rnn_model.rnn[l].db_ih, grad_dict['rnn.bias_ih_l%d' % l].detach().numpy(), rtol=1e-03)) 178 | assert(np.allclose(my_rnn_model.rnn[l].db_hh, grad_dict['rnn.bias_hh_l%d' % l].detach().numpy(), rtol=1e-03)) 179 | 180 | print('RNN Classifier Backward: PASS' ) 181 | return True 182 | 183 | 184 | 185 | def run_test(self): 186 | # Test forward 187 | self.print_name('Section 3.1 - RNN Forward') 188 | forward_outcome = self.test_rnncell_forward() 189 | self.print_outcome('RNN Forward', forward_outcome) 190 | if forward_outcome == False: 191 | self.print_failure('RNN Forward') 192 | return False 193 | 194 | # Test Backward 195 | self.print_name('Section 3.2 - RNN Backward') 196 | backward_outcome = self.test_rnncell_backward() 197 | self.print_outcome('RNN backward', backward_outcome) 198 | if backward_outcome == False: 199 | self.print_failure('RNN Backward') 200 | return False 201 | 202 | # Test RNN Classifier 203 | self.print_name('Section 3.3 - RNN Classifier') 204 | classifier_outcome = self.test_rnn_classifier() 205 | self.print_outcome('RNN Classifier', classifier_outcome) 206 | if classifier_outcome == False: 207 | self.print_failure(classifier_outcome) 208 | return False 209 | 210 | return True 211 | -------------------------------------------------------------------------------- /HW3/hw3p1/create_tarball.sh: 
# This is the neural net that will run one timestep of the input
# You only need to implement the forward method of this class.
# This is to test that your GRU Cell implementation is correct when used as a GRU.
class CharacterPredictor(object):
    """One GRU cell followed by a linear projection to class logits."""

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # The network consists of a GRU Cell and a linear layer
        self.rnn = GRU_Cell(input_dim, hidden_dim)
        self.projection = Linear(hidden_dim, num_classes)
        self.hiddenDim = hidden_dim

    def init_rnn_weights(self, w_hi, w_hr, w_hn, w_ii, w_ir, w_in):
        # DO NOT MODIFY
        self.rnn.init_weights(w_hi, w_hr, w_hn, w_ii, w_ir, w_in)

    def __call__(self, x, h):
        return self.forward(x, h)

    def forward(self, x, h):
        """Advance one time step.

        Returns ``(logits, hnext)`` where ``hnext`` is the GRU cell output for
        input ``x`` and previous hidden state ``h``, and ``logits`` is the
        projection of ``hnext``.
        """
        hnext = self.rnn.forward(x, h)
        logits = self.projection.forward(hnext)
        return logits, hnext


# An instance of the class defined above runs through a sequence of inputs to
# generate the logits for all the timesteps.
def inference(net, inputs):
    """Run ``net`` over a sequence, starting from an all-zero hidden state.

    input:
      - net: An instance of CharacterPredictor
      - inputs: a sequence of inputs of dimensions [seq_len x feature_dim]
    output:
      - logits: one per time step of input. Dimensions [seq_len x num_classes]
    """
    logits = []
    hnext = np.zeros(net.hiddenDim, dtype=float)
    for x_t in inputs:
        logit, hnext = net.forward(x_t, hnext)
        # copy so later steps cannot alias an earlier row
        logits.append(logit.copy())
    return np.array(logits)


# Multiple-choice answers (mc.py)

def question_1():
    return 'b'


def question_2():
    return 'b'


def question_3():
    return 'b'


def question_4():
    return 'a'
# RNN Phoneme Classifier
class RNN_Phoneme_Classifier(object):
    """Stack of ``RNN_Cell`` layers followed by a ``Linear`` output layer."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # The first layer consumes the input features, deeper layers consume
        # the hidden state of the layer below.
        self.rnn = [RNN_Cell(input_size, hidden_size) if i == 0 else
                    RNN_Cell(hidden_size, hidden_size) for i in range(num_layers)]
        self.output_layer = Linear(hidden_size, output_size)

        # store hidden states at each time step, [(seq_len+1) * (num_layers, batch_size, hidden_size)]
        self.hiddens = []

    def init_weights(self, rnn_weights, linear_weights):
        """
        Initialize weights

        Parameters
        ----------
        rnn_weights:
        [[W_ih_l0, W_hh_l0, b_ih_l0, b_hh_l0],
         [W_ih_l1, W_hh_l1, b_ih_l1, b_hh_l1], ...]

        linear_weights:
        [W, b]
        """
        for rnn_cell, weights in zip(self.rnn, rnn_weights):
            rnn_cell.init_weights(*weights)
        self.output_layer.init_weights(*linear_weights)

    def __call__(self, x, h_0=None):
        return self.forward(x, h_0)

    def forward(self, x, h_0=None):
        """
        RNN forward, multiple layers, multiple time steps

        Parameters
        ----------
        x : (batch_size, seq_len, input_size)
            Input
        h_0 : (num_layers, batch_size, hidden_size)
            Initial hidden states. Defaults to zeros if not specified

        Returns
        -------
        logits : (batch_size, output_size)
            Output logits
        """
        batch_size, seq_len = x.shape[0], x.shape[1]
        if h_0 is None:
            hidden = np.zeros((self.num_layers, batch_size, self.hidden_size), dtype=float)
        else:
            hidden = h_0

        self.x = x
        # Start a fresh history on every call. backward() indexes hiddens from
        # 0, so states left over from an earlier forward pass would otherwise
        # be used for the gradients of this one.
        self.hiddens = [hidden.copy()]

        # Iterate through the sequence
        for t in range(seq_len):
            layer_input = self.x[:, t, :]
            prev_hidden = self.hiddens[-1]
            hidden = []
            # Iterate through the layers; each layer feeds the next one.
            for i, cell in enumerate(self.rnn):
                layer_input = cell.forward(layer_input, prev_hidden[i])
                hidden.append(layer_input)
            self.hiddens.append(hidden)

        # Logits come from the top layer's state at the last time step. Reading
        # it from the history also covers seq_len == 0, where the loop body
        # never runs and the initial state is used.
        logits = self.output_layer.forward(self.hiddens[-1][-1])
        return logits

    def backward(self, delta):
        """
        RNN Back Propagation Through Time (BPTT)

        Parameters
        ----------
        delta : (batch_size, hidden_size)
            gradient w.r.t. the last time step output dY(seq_len-1)

        Returns
        -------
        dh_0 : (num_layers, batch_size, hidden_size)
            gradient w.r.t. the initial hidden states
        """
        batch_size, seq_len = self.x.shape[0], self.x.shape[1]
        dh = np.zeros((self.num_layers, batch_size, self.hidden_size), dtype=float)
        dh[-1] = self.output_layer.backward(delta)

        # Reverse order of time, then reverse order of layers.
        for t in reversed(range(seq_len)):
            for i in reversed(range(self.num_layers)):
                # hiddens has an extra initial entry, so the state produced at
                # time t lives at index t+1 and the previous one at index t.
                h_prev_l = self.hiddens[t+1][i-1] if i != 0 else self.x[:, t, :]
                h_prev_t = self.hiddens[t][i]
                retdx, retdh = self.rnn[i].backward(dh[i], self.hiddens[t+1][i], h_prev_l, h_prev_t)
                dh[i] = retdh
                # The input of layer i is the output of layer i-1 at the same
                # time step, so its gradient is accumulated there.
                if i != 0:
                    dh[i-1] += retdx

        # Normalize dh by batch_size since initial hidden states are also treated
        # as parameters of the network (divide by batch size)
        dh_0 = dh / batch_size
        return dh_0
class Activation(object):

    """
    Interface for activation functions (non-linearities).

    Subclasses store their latest output in ``self.state`` and override
    ``forward`` and ``derivative``.
    """

    def __init__(self):
        self.state = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produced a TypeError instead of the intended error.
        raise NotImplementedError

    def derivative(self):
        raise NotImplementedError


class Sigmoid(Activation):

    """
    Sigmoid activation function
    """

    def __init__(self):
        super(Sigmoid, self).__init__()

    def forward(self, x):
        self.state = (1 / (1 + np.exp(-x)))
        return self.state

    def derivative(self):
        # derivative expressed through the stored output s: s * (1 - s)
        return (self.state) * (1 - self.state)


class Tanh(Activation):

    """
    Modified Tanh to work with BPTT.
    The tanh(x) result has to be stored elsewhere otherwise we will
    have to store results for multiple timesteps in this class for each cell,
    which could be considered bad design.

    Now in the derivative case, we can pass in the stored hidden state and
    compute the derivative for that state instead of the "current" stored state
    which could be anything.
    """

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        self.state = np.tanh(x)
        return self.state

    def derivative(self, state=None):
        """Return ``1 - s**2`` for the given ``state``, or for the stored one."""
        if state is not None:
            return 1 - (state**2)
        else:
            return 1 - (self.state**2)
class GRU_Cell:
    """Single GRU cell operating on one un-batched time step.

    Forward equations, as implemented in ``forward``:

        z       = sigmoid(Wzh . h + Wzx . x)        (update gate)
        r       = sigmoid(Wrh . h + Wrx . x)        (reset gate)
        h_tilda = tanh(Wh . (r * h) + Wx . x)       (candidate state)
        h_t     = (1 - z) * h + z * h_tilda

    Every intermediate value is kept on the instance (``z1`` ... ``z17``)
    so that ``backward`` can reuse it.
    """

    def __init__(self, in_dim, hidden_dim):
        self.d = in_dim
        self.h = hidden_dim
        self.x_t = 0

        hidden_shape = (self.h, self.h)
        input_shape = (self.h, self.d)

        # Hidden-to-hidden weights first, then input-to-hidden weights.
        self.Wzh = np.random.randn(*hidden_shape)
        self.Wrh = np.random.randn(*hidden_shape)
        self.Wh = np.random.randn(*hidden_shape)

        self.Wzx = np.random.randn(*input_shape)
        self.Wrx = np.random.randn(*input_shape)
        self.Wx = np.random.randn(*input_shape)

        # Gradient accumulators; ``backward`` adds into these in place.
        self.dWzh = np.zeros(hidden_shape)
        self.dWrh = np.zeros(hidden_shape)
        self.dWh = np.zeros(hidden_shape)

        self.dWzx = np.zeros(input_shape)
        self.dWrx = np.zeros(input_shape)
        self.dWx = np.zeros(input_shape)

        self.z_act = Sigmoid()
        self.r_act = Sigmoid()
        self.h_act = Tanh()

    def init_weights(self, Wzh, Wrh, Wh, Wzx, Wrx, Wx):
        """Replace all six weight matrices with the supplied ones."""
        self.Wzh, self.Wrh, self.Wh = Wzh, Wrh, Wh
        self.Wzx, self.Wrx, self.Wx = Wzx, Wrx, Wx

    def __call__(self, x, h):
        return self.forward(x, h)

    def forward(self, x, h):
        """Compute the next hidden state.

        Args:
            x: observation at the current time step, shape ``(in_dim,)``.
            h: hidden state from the previous time step, shape
                ``(hidden_dim,)``.

        Returns:
            The hidden state at the current time step, shape
            ``(hidden_dim,)``.
        """
        self.x = x
        self.hidden = h

        # Update gate z.
        self.z1 = np.dot(self.Wzh, h)
        self.z2 = np.dot(self.Wzx, x)
        self.z3 = self.z1 + self.z2
        self.z4 = self.z_act(self.z3)
        self.z = self.z4

        # Reset gate r.
        self.z5 = np.dot(self.Wrh, h)
        self.z6 = np.dot(self.Wrx, x)
        self.z7 = self.z5 + self.z6
        self.z8 = self.r_act(self.z7)
        self.r = self.z8

        # Candidate state h_tilda, computed from the reset-gated hidden state.
        self.z9 = self.z8 * h
        self.z10 = np.dot(self.Wh, self.z9)
        self.z11 = np.dot(self.Wx, x)
        self.z12 = self.z10 + self.z11
        self.z13 = self.h_act(self.z12)
        self.h_tilda = self.z13

        # Blend previous state and candidate with the update gate.
        self.z14 = 1 - self.z4
        self.z15 = self.z14 * h
        self.z16 = self.z4 * self.z13
        self.z17 = self.z15 + self.z16
        h_t = self.z17

        vector_h = (self.h, )
        assert self.x.shape == (self.d, )
        assert self.hidden.shape == vector_h

        assert self.r.shape == vector_h
        assert self.z.shape == vector_h
        assert self.h_tilda.shape == vector_h
        assert h_t.shape == vector_h

        return h_t

    def backward(self, delta):
        """Accumulate weight gradients and return the input gradients.

        Must be called after ``forward``; it reads the values cached there.

        Args:
            delta: gradient of the loss w.r.t. this step's output hidden
                state, as a row vector of shape ``(1, hidden_dim)`` (the
                shape asserts below require 2-D results).

        Returns:
            ``(dx, dh)``: gradients w.r.t. the input ``x`` and the previous
            hidden state ``h``, shapes ``(1, in_dim)`` and
            ``(1, hidden_dim)``.
        """
        x_row = np.reshape(self.x, (1, -1))
        h_row = np.reshape(self.hidden, (1, -1))
        gated_row = np.reshape(self.z9, (1, -1))

        # h_t = (1 - z) * h + z * h_tilda
        grad_cand = delta * self.z4
        grad_z = delta * self.z13
        grad_one_minus_z = delta * self.hidden
        dh = delta * self.z14
        grad_z += -grad_one_minus_z

        # Through tanh: h_tilda = tanh(z12).
        cand = self.h_act(self.z12)
        grad_cand_pre = grad_cand * (1 - cand * cand).T

        self.dWx += np.dot(grad_cand_pre.T, x_row)
        dx_t = np.dot(grad_cand_pre, self.Wx)

        self.dWh += np.dot(grad_cand_pre.T, gated_row)
        grad_gated = np.dot(grad_cand_pre, self.Wh)

        # z9 = r * h feeds both the reset gate and the previous state.
        grad_r = grad_gated * self.hidden
        dh += grad_gated * self.r

        # Through the reset-gate sigmoid.
        r_gate = self.r_act(self.z7)
        grad_r_pre = grad_r * r_gate * (1 - r_gate)

        self.dWrx += np.dot(grad_r_pre.T, x_row)
        dx_t += np.dot(grad_r_pre, self.Wrx)

        self.dWrh += np.dot(grad_r_pre.T, h_row)
        dh += np.dot(grad_r_pre, self.Wrh)

        # Through the update-gate sigmoid.
        z_gate = self.z_act(self.z3)
        grad_z_pre = grad_z * z_gate * (1 - z_gate)

        self.dWzx += np.dot(grad_z_pre.T, x_row)
        dx_t += np.dot(grad_z_pre, self.Wzx)

        self.dWzh += np.dot(grad_z_pre.T, h_row)
        dh += np.dot(grad_z_pre, self.Wzh)

        assert dx_t.shape == (1, self.d)
        assert dh.shape == (1, self.h)

        return dx_t, dh
class Criterion(object):
    """
    Interface for loss functions.

    Subclasses implement ``forward(x, y)`` to compute the loss and
    ``derivative()`` to return its gradient. Calling the instance is
    shorthand for ``forward``.
    """

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        return self.forward(x, y)

    def forward(self, x, y):
        """Compute the loss for inputs ``x`` and labels ``y``.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError instead.
        raise NotImplementedError

    def derivative(self):
        """Return the gradient of the most recent ``forward`` call.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError
18 | self.W_ih = np.random.randn(h, d) 19 | self.W_hh = np.random.randn(h, h) 20 | self.b_ih = np.random.randn(h) 21 | self.b_hh = np.random.randn(h) 22 | 23 | # Gradients 24 | self.dW_ih = np.zeros((h, d)) 25 | self.dW_hh = np.zeros((h, h)) 26 | 27 | self.db_ih = np.zeros(h) 28 | self.db_hh = np.zeros(h) 29 | 30 | def init_weights(self, W_ih, W_hh, b_ih, b_hh): 31 | self.W_ih = W_ih 32 | self.W_hh = W_hh 33 | self.b_ih = b_ih 34 | self.b_hh = b_hh 35 | 36 | def zero_grad(self): 37 | d = self.input_size 38 | h = self.hidden_size 39 | self.dW_ih = np.zeros((h, d)) 40 | self.dW_hh = np.zeros((h, h)) 41 | self.db_ih = np.zeros(h) 42 | self.db_hh = np.zeros(h) 43 | 44 | def __call__(self, x, h): 45 | return self.forward(x, h) 46 | 47 | def forward(self, x, h): 48 | """ 49 | RNN cell forward (single time step) 50 | 51 | Input (see writeup for explanation) 52 | ---------- 53 | x : (batch_size, input_size) 54 | h : (batch_size, hidden_size) 55 | 56 | Returns 57 | ------- 58 | h_prime : (batch_size, hidden_size) 59 | """ 60 | h_prime = self.activation(np.dot(x, self.W_ih.T) + self.b_ih + np.dot(h, self.W_hh.T) + self.b_hh) 61 | return h_prime 62 | 63 | def backward(self, delta, h, h_prev_l, h_prev_t): 64 | """ 65 | RNN cell backward (single time step) 66 | 67 | Input (see writeup for explanation) 68 | ---------- 69 | delta : (batch_size, hidden_size) 70 | h : (batch_size, hidden_size) 71 | h_prev_l: (batch_size, input_size) 72 | h_prev_t: (batch_size, hidden_size) 73 | 74 | Returns 75 | ------- 76 | dx : (batch_size, input_size) 77 | dh : (batch_size, hidden_size) 78 | """ 79 | 80 | batch_size = delta.shape[0] 81 | 82 | # 0) Done! Step backward through the tanh activation function. 83 | # Note, because of BPTT, we had to externally save the tanh state, and 84 | # have modified the tanh activation function to accept an optionally input. 
85 | dz = self.activation.derivative(state=h) * delta # (batch_size, hidden_size) 86 | 87 | # 1) Compute the averaged gradients of the weights and biases 88 | self.dW_ih += np.dot(dz.T, h_prev_l) / batch_size # (hidden_size, input_size) 89 | self.dW_hh += np.dot(dz.T, h_prev_t) / batch_size # (hidden_size, hidden_size) 90 | self.db_ih += np.mean(dz, axis=0) # (hidden_size) 91 | self.db_hh += np.mean(dz, axis=0) # (hidden_size) 92 | 93 | # 2) Compute dx, dh 94 | dx = np.dot(dz, self.W_ih) # (batch_size, input_size) 95 | dh = np.dot(dz, self.W_hh) # (batch_size, hidden_size) 96 | 97 | # 3) Return dx, dh 98 | return dx, dh 99 | -------------------------------------------------------------------------------- /HW3/hw3p1/mytorch/search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | ''' 5 | SymbolSets: A list containing all the symbols (the vocabulary without blank) 6 | 7 | y_probs: Numpy array with shape (# of symbols + 1, Seq_length, batch_size) 8 | Your batch size for part 1 will remain 1, but if you plan to use your 9 | implementation for part 2 you need to incorporate batch_size. 10 | 11 | Return the forward probability of the greedy path (a float) and 12 | the corresponding compressed symbol sequence i.e. without blanks 13 | or repeated symbols (a string). 
14 | ''' 15 | def GreedySearch(SymbolSets, y_probs): 16 | # Follow the pseudocode from lecture to complete greedy search :-) 17 | num_symbols, seq_len, batch_size = y_probs.shape 18 | 19 | # Find the maximum probable path via greedy search 20 | resSymbolPaths = [] 21 | resProbs = [] 22 | resCompressedPaths = [] 23 | for b in range(batch_size): 24 | prob = 1 25 | symbolPath = ["placeHolder"] * seq_len 26 | for t in range(seq_len): 27 | currMax = 0 28 | curr = "placeHolder" 29 | for i in range(num_symbols): 30 | if y_probs[i][t][b] > currMax: 31 | currMax = y_probs[i][t][b] 32 | # Take care of blank symbol 33 | curr = "placeHolder" if i == 0 else SymbolSets[i-1] 34 | symbolPath[t] = curr 35 | prob *= currMax 36 | resSymbolPaths.append(symbolPath) 37 | resProbs.append(prob) 38 | 39 | # Build compressed paths 40 | for b in range(batch_size): 41 | compressedPath = "" 42 | prev = None 43 | for t in range(seq_len): 44 | # redundant symbol 45 | if prev != None and resSymbolPaths[b][t] == prev: 46 | continue 47 | if resSymbolPaths[b][t] == "placeHolder": 48 | prev = None 49 | continue 50 | compressedPath += resSymbolPaths[b][t] 51 | prev = resSymbolPaths[b][t] 52 | resCompressedPaths.append(compressedPath) 53 | 54 | # Return output accordingly wrt batch_size 55 | if batch_size == 1: 56 | return resCompressedPaths[0], resProbs[0] 57 | else: 58 | return resCompressedPaths, resProbs 59 | 60 | 61 | 62 | ############################################################################## 63 | 64 | 65 | def InitializePaths(SymbolSets, y): 66 | InitialBlankPathScore, InitialPathScore = {}, {} 67 | # First push the blank into a path-ending-with-blank stack. 
def ExtendWithBlank(PathsWithTerminalBlank, PathsWithTerminalSymbol, y, BlankPathScore, PathScore):
    """Extend every current path by one blank.

    Appending a blank never changes the symbol sequence, so a path that
    ended in a symbol and a path that already ended in a blank collapse
    onto the same key and their scores are summed.

    Args:
        PathsWithTerminalBlank: paths currently ending in a blank.
        PathsWithTerminalSymbol: paths currently ending in a symbol.
        y: class scores at this time step; ``y[0]`` is the blank score.
        BlankPathScore: score of each path in ``PathsWithTerminalBlank``.
        PathScore: score of each path in ``PathsWithTerminalSymbol``.

    Returns:
        ``(paths, scores)``: the new set of blank-terminated paths and a
        dict mapping each of them to its score.
    """
    extended_scores = {}

    # Blank followed by blank: same sequence, score scaled by the blank.
    for seq in PathsWithTerminalBlank:
        extended_scores[seq] = BlankPathScore[seq] * y[0]

    # Symbol followed by blank: merge into an existing entry if present.
    for seq in PathsWithTerminalSymbol:
        contribution = PathScore[seq] * y[0]
        if seq in extended_scores:
            extended_scores[seq] = extended_scores[seq] + contribution
        else:
            extended_scores[seq] = contribution

    return set(extended_scores), extended_scores
def Prune(PathsWithTerminalBlank, PathsWithTerminalSymbol, BlankPathScore, PathScore, BeamWidth):
    """Keep only the best-scoring paths across both path sets.

    Args:
        PathsWithTerminalBlank: candidate paths ending in a blank.
        PathsWithTerminalSymbol: candidate paths ending in a symbol.
        BlankPathScore: score of each path in ``PathsWithTerminalBlank``.
        PathScore: score of each path in ``PathsWithTerminalSymbol``.
        BeamWidth: maximum number of paths to retain in total.

    Returns:
        ``(blank_paths, symbol_paths, blank_scores, symbol_scores)``
        restricted to the surviving paths. The inputs are not modified.

    Note:
        When there are more candidates than ``BeamWidth``, survivors are
        those scoring strictly above the ``(BeamWidth + 1)``-th best
        score, so candidates tied with that score are dropped and fewer
        than ``BeamWidth`` paths may remain.
    """
    scorelist = [BlankPathScore[p] for p in PathsWithTerminalBlank]
    scorelist.extend(PathScore[p] for p in PathsWithTerminalSymbol)

    # The beam is wide enough for every candidate: nothing to prune.
    # Using the smallest score as a strict cutoff here would wrongly
    # discard the lowest-scoring path (or the only path).
    if BeamWidth >= len(scorelist):
        return (
            set(PathsWithTerminalBlank),
            set(PathsWithTerminalSymbol),
            {p: BlankPathScore[p] for p in PathsWithTerminalBlank},
            {p: PathScore[p] for p in PathsWithTerminalSymbol},
        )

    # Score of the first candidate that falls outside the beam.
    scorelist.sort(reverse=True)
    cutoff = scorelist[BeamWidth]

    PrunedBlankPathScore = {
        p: BlankPathScore[p] for p in PathsWithTerminalBlank if BlankPathScore[p] > cutoff
    }
    PrunedPathScore = {
        p: PathScore[p] for p in PathsWithTerminalSymbol if PathScore[p] > cutoff
    }
    return set(PrunedBlankPathScore), set(PrunedPathScore), PrunedBlankPathScore, PrunedPathScore
def BeamSearch(SymbolSets, y_probs, BeamWidth):
    """Beam-search CTC decoding.

    Args:
        SymbolSets: list of symbols, the vocabulary without blank.
        y_probs: array of shape ``(len(SymbolSets) + 1, seq_len,
            batch_size)``; row 0 holds the blank scores.
        BeamWidth: number of paths kept after each pruning step.

    Returns:
        ``(BestPath, FinalPathScore)``: the symbol sequence with the
        highest merged score, and a dict mapping every final merged path
        to its score.

    Note:
        Each time step is passed on as ``y_probs[:, t, :]``, so the batch
        axis is kept and scores carry a trailing axis of length
        ``batch_size``. Picking the best path compares those scores
        directly, which presumably only works for ``batch_size == 1`` --
        TODO confirm before using with larger batches.
    """
    seq_len = y_probs.shape[1]

    # First time instant: one path per symbol plus the empty blank path.
    (NewPathsWithTerminalBlank, NewPathsWithTerminalSymbol,
     NewBlankPathScore, NewPathScore) = InitializePaths(SymbolSets, y_probs[:, 0, :])

    # Subsequent time steps: prune, then extend by blank and by symbol.
    for t in range(1, seq_len):
        step_probs = y_probs[:, t, :]
        (PathsWithTerminalBlank, PathsWithTerminalSymbol,
         BlankPathScore, PathScore) = Prune(NewPathsWithTerminalBlank,
                                            NewPathsWithTerminalSymbol,
                                            NewBlankPathScore, NewPathScore,
                                            BeamWidth)

        NewPathsWithTerminalBlank, NewBlankPathScore = ExtendWithBlank(
            PathsWithTerminalBlank, PathsWithTerminalSymbol, step_probs,
            BlankPathScore, PathScore)

        NewPathsWithTerminalSymbol, NewPathScore = ExtendWithSymbol(
            PathsWithTerminalBlank, PathsWithTerminalSymbol, SymbolSets,
            step_probs, BlankPathScore, PathScore)

    # Merge identical paths differing only by the final blank. This runs
    # once, after the last time step: MergeIdenticalPaths updates the
    # collections it is given, so they must not be fed back into Prune.
    _, FinalPathScore = MergeIdenticalPaths(NewPathsWithTerminalBlank,
                                            NewPathsWithTerminalSymbol,
                                            NewBlankPathScore, NewPathScore)

    # Pick the path with the best merged score.
    BestPath = max(FinalPathScore, key=FinalPathScore.get)
    return BestPath, FinalPathScore
CNN1_Cont_Epoch6.txt 7 | - data 8 | - Saved predictions of test data 9 | 10 | To run my model: 11 | - Have the file structure as above 12 | - Type “source activate pytorch_p36” in the terminal 13 | - In the hw3p2 top directory, type “python3 p2.py” to run the model 14 | - After the model finishes, find the predicted.npy and predicted.csv files under the data folder 15 | 16 | Design choices: 17 | - Used one conv1d CNN with (input, hiddenSize) = (40, 256) with kernel size=3, padding=1, stride=1, bias=False. Followed by a BatchNorm1d and ReLU. Then followed by 3 stacked BiLSTM layers each of 256 units with dropout rate = 0.2 to avoid overfitting to the training data. Then followed by one linear layer with dim (2*256, 256) and another linear layer with dim (256, 47) 18 | - Used Adam optimizer with lr=1e-3 and weight decay=5e-5 (following the baseline advice) 19 | - Used a ReduceLROnPlateau scheduler with factor = 0.5, patience=1 20 | - Used CTCLoss 21 | - Trained with the above configuration for 30 epochs. 
# Model that takes packed sequences in training
class PackedModel(nn.Module):
    """Conv1d front-end, stacked bidirectional LSTM, two linear layers.

    Args:
        hidden_size: number of channels produced by the convolution and
            hidden units per LSTM direction.
        nlayers: number of stacked LSTM layers.
        out_size: number of output classes (including the CTC blank).
        embed_size: number of input features per frame.
    """

    def __init__(self, hidden_size, nlayers, out_size=47, embed_size=40):
        super(PackedModel, self).__init__()
        self.nlayers = nlayers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.out_size = out_size
        self.cnns = torch.nn.Sequential(
            nn.Conv1d(self.embed_size, self.hidden_size, 3, padding=1, bias=False),
            nn.BatchNorm1d(self.hidden_size),
            nn.ReLU(inplace=True))
        self.rnns = nn.LSTM(input_size=self.hidden_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.nlayers,  # honour the constructor argument
                            bias=True,
                            batch_first=True,
                            # Inter-layer dropout (regularization); it only
                            # applies when there is more than one layer.
                            dropout=0.2 if self.nlayers > 1 else 0.0,
                            bidirectional=True)
        self.hidden2label = torch.nn.Sequential(
            nn.Linear(self.hidden_size * 2, self.hidden_size),
            nn.Linear(self.hidden_size, self.out_size))

    def forward(self, x, xLens):
        """Return per-frame log-probabilities for a padded batch.

        Args:
            x: padded inputs of shape ``(B, T_in, embed_size)``.
            xLens: un-padded length of every sequence in the batch.

        Returns:
            ``(out_prob, xLens)`` where ``out_prob`` has shape
            ``(T, B, out_size)`` (the layout ``nn.CTCLoss`` expects) and
            ``xLens`` is returned unchanged.
        """
        x_cnn_input = x.permute(0, 2, 1)        # (B, C_in, T_in)
        x_post_cnn = self.cnns(x_cnn_input)     # (B, C_out, T_out)
        x_rnn_in = x_post_cnn.permute(2, 0, 1)  # (T, B, C_out)

        # pack_padded_sequence requires the lengths to live on the CPU,
        # whatever device the data is on.
        lengths_cpu = torch.as_tensor(xLens).cpu()
        x_packed = pack_padded_sequence(x_rnn_in, lengths_cpu, enforce_sorted=False)
        out_packed, hidden = self.rnns(x_packed)
        out, out_lens = pad_packed_sequence(out_packed, batch_first=True)  # (B, T, C)

        # Log softmax after output layer is required since nn.CTCLoss expects log prob
        out_prob = self.hidden2label(out).log_softmax(2)  # (B, T, Classes)

        # Permute to fit the input format of CTCLoss
        out_prob = out_prob.permute(1, 0, 2)  # (T, B, C)

        # The convolution uses kernel 3, padding 1 and the default stride of 1,
        # so it keeps the time dimension and the input lengths remain valid.
        return out_prob, xLens
def decode(output_probs, dataLens, beamWidth):
    """Beam-search decode a batch of CTC log-probabilities.

    Args:
        output_probs: model output of shape ``(T, B, C)`` holding
            log-probabilities.
        dataLens: length of every sequence in the batch.
        beamWidth: beam width handed to ``CTCBeamDecoder``.

    Returns:
        ``(decodedListShort, decodedListLong)``: for every batch item, in
        order, the best beam rendered with ``PHONEME_MAP`` and with
        ``PHONEME_LIST``. An item whose best beam is empty yields ``""``
        so both lists always have one entry per batch item.
    """
    decoder = CTCBeamDecoder(labels=PHONEME_MAP, beam_width=beamWidth,
                             num_processes=os.cpu_count(), log_probs_input=True)
    output_probs = torch.transpose(output_probs, 0, 1)  # post transpose: (B, T, C)
    # output dim: (batch, beam width, T); out_seq_len dim: (batch, beam width)
    output, _, _, out_seq_len = decoder.decode(output_probs, dataLens)

    decodedListShort = []
    decodedListLong = []
    for b in range(output_probs.size(0)):
        # Beam 0 is the one used as the prediction; only its first
        # out_seq_len entries are read. A zero length gives an empty slice
        # and therefore an empty string, never a value left over from the
        # previous item.
        best_beam = output[b, 0, :out_seq_len[b][0]]
        decodedListShort.append("".join([PHONEME_MAP[i] for i in best_beam]))
        decodedListLong.append("".join([PHONEME_LIST[i] for i in best_beam]))

    return decodedListShort, decodedListLong
def test_epoch(model, data_loader, epoch, decodeMode=False, criterion=None, beamWidth=None):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: network returning ``(log_probs, lengths)`` for
            ``(data, dataLens)``.
        data_loader: yields ``(data, target, dataLens, targetLens)``.
        epoch: epoch number, used only for progress messages.
        decodeMode: when true, also beam-decode and accumulate the
            Levenshtein distance to the targets.
        criterion: loss callable taking ``(output, target, input_lengths,
            target_lengths)``. Defaults to a fresh ``nn.CTCLoss()``.
        beamWidth: beam width for decoding. Defaults to the module-level
            ``hyper["beamWidth"]``.

    Returns:
        ``(loss, distance)``: the loss averaged over batches and the edit
        distance averaged over samples (``0.0`` when ``decodeMode`` is
        false or the loader is empty).
    """
    if criterion is None:
        # Do not rely on a global name that only exists inside main().
        criterion = nn.CTCLoss()

    model.eval()
    start_time = time.time()
    running_loss = 0.0
    running_charErr = 0.0
    totalSampleCnt = 0
    batchCnt = 0

    with torch.no_grad():
        for batch_idx, (data, target, dataLens, targetLens) in enumerate(data_loader):
            data, target, dataLens, targetLens = data.cuda(), target.cuda(), dataLens.cuda(), targetLens.cuda()
            output, dataLens_new = model(data, dataLens)
            loss = criterion(output,
                             target,
                             dataLens_new,
                             targetLens)

            running_loss += loss.item()
            totalSampleCnt += len(data)
            batchCnt += 1
            if decodeMode:
                width = hyper["beamWidth"] if beamWidth is None else beamWidth
                decodedStringsShort, _ = decode(output, dataLens_new, width)
                # Decode only the real labels, not the padding appended by
                # the collate function.
                targetStrings = [idx2phonemes(seq[:int(n)]) for seq, n in zip(target, targetLens)]
                for decoded, expected in zip(decodedStringsShort, targetStrings):
                    running_charErr += calculateLevScore(decoded, expected)
            if batch_idx % 50 == 0:
                print("Epoch: {}\tBatch: {}\tTimestamp: {}".format(epoch, batch_idx, time.time() - start_time))
            torch.cuda.empty_cache()

    # The criterion already averages within a batch, so average over
    # batches; the edit distance is summed per sample, so average over
    # samples. max(..., 1) keeps an empty loader from dividing by zero.
    loss_per_sample = running_loss / max(batchCnt, 1)
    dist_per_sample = running_charErr / max(totalSampleCnt, 1)
    return loss_per_sample, dist_per_sample
def main(hyper):
    """Train, evaluate, checkpoint and predict with ``PackedModel``.

    Args:
        hyper: dict of settings. Keys read here: ``dataPath``,
            ``batchSize``, ``seed``, ``hiddenSize``, ``nlayers``,
            ``savedCheckpoint``, ``lr``, ``weightDecay``, ``nEpochs``,
            ``checkpointPath``, ``testLabelName``, ``testLabelCSVfn``.

    Side effects:
        Prepends a blank entry to the module-level ``PHONEME_MAP`` and
        ``PHONEME_LIST``, writes one checkpoint per epoch and writes the
        test predictions as ``.npy`` and ``.csv``.
    """
    # These are rebound below; without the declaration the assignment would
    # make them function-local and reading them would raise
    # UnboundLocalError.
    global PHONEME_MAP, PHONEME_LIST

    # Load datasets
    # NOTE(review): the train/test file names have no ".npy" suffix while
    # the dev ones do -- confirm against the files on disk.
    print("*** Load raw data ***")
    train = (np.load(os.path.join(hyper["dataPath"], "wsj0_train"), allow_pickle=True),
             (np.load(os.path.join(hyper["dataPath"], "wsj0_train_merged_labels.npy"), allow_pickle=True)))
    dev = (np.load(os.path.join(hyper["dataPath"], "wsj0_dev.npy"), allow_pickle=True),
           (np.load(os.path.join(hyper["dataPath"], "wsj0_dev_merged_labels.npy"), allow_pickle=True)))
    test = (np.load(os.path.join(hyper["dataPath"], "wsj0_test"), allow_pickle=True), None)

    # Get data loaders
    train_loader, dev_loader, test_loader = getLoaders(train, dev, test, hyper["batchSize"])

    # Set random seed
    np.random.seed(hyper["seed"])
    torch.manual_seed(hyper["seed"])
    torch.cuda.manual_seed(hyper["seed"])

    # Add blank space for phoneme map (labels are shifted by one to make
    # room for the blank at index 0)
    PHONEME_MAP = [" "] + PHONEME_MAP
    PHONEME_LIST = [" "] + PHONEME_LIST

    # Create the model and define the Loss and Optimizer
    print("*** Create the model and define Loss and Optimizer ***")
    model = PackedModel(hidden_size=hyper["hiddenSize"], nlayers=hyper["nlayers"], out_size=47, embed_size=40)
    checkpoint = torch.load(hyper["savedCheckpoint"])
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer = optim.Adam(model.parameters(), lr=hyper["lr"], weight_decay=hyper["weightDecay"])
    criterion = nn.CTCLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1, verbose=True)
    model.cuda()
    print(model)

    # Train the model for N epochs
    for i in range(hyper["nEpochs"]):
        # Print current learning rate
        for param_group in optimizer.param_groups:
            print("Current lr: \t{}".format(param_group["lr"]))

        # Train
        print("Train\tEpoch: {}".format(i))
        startTime = time.time()
        train_epoch(model, train_loader, criterion, optimizer, i)

        # Evaluate
        print("Evaluate Train \tEpoch: {}".format(i))
        train_lossPerSample, train_distPerSample = test_epoch(model, train_loader, i)
        print('Train_LossPerSample: {:.4f}\tTrain_DistPerSample: {:.4f}'.format(
            train_lossPerSample, train_distPerSample))
        print("Evaluate Dev \tEpoch: {}".format(i))
        dev_lossPerSample, dev_distPerSample = test_epoch(model, dev_loader, i)
        print('Dev_LossPerSample: {:.4f}\tDev_DistPerSample: {:.4f}'.format(
            dev_lossPerSample, dev_distPerSample))

        # The scheduler lowers the learning rate when the dev loss plateaus.
        scheduler.step(dev_lossPerSample)

        # Save checkpoint
        print("*** Saving Checkpoint ***")
        path = "{}CNN1_Cont_Epoch{}.txt".format(hyper["checkpointPath"], i)
        torch.save({
            "epoch": i,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, path)
        print("="*20 + " Epoch {} took {}s".format(i, time.time()-startTime) + "="*20)

    # Predict and save
    resShort, resLong = predict(model, test_loader)
    np.save(hyper["testLabelName"], resShort)
    idxs = np.array(list(range(len(resShort))))
    df = pd.DataFrame({"id": idxs, "Predicted": resShort})
    df.to_csv(hyper["testLabelCSVfn"], index=False)
https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW4/Weight_Decay.png -------------------------------------------------------------------------------- /HW4/hw4p1/handout/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | cp hw4/experiments/$(runid)/predictions-test-$(epoch).npy predictions.npy 3 | cp hw4/experiments/$(runid)/generated-$(epoch).txt generated.txt 4 | cp hw4/experiments/$(runid)/generated_logits-test-$(epoch).npy generated_logits.npy 5 | cp hw4/training.ipynb training.ipynb 6 | tar -cvf handin.tar training.ipynb predictions.npy generated.txt generated_logits.npy 7 | rm -f generated.txt predictions.npy training.ipynb generated_logits.npy 8 | -------------------------------------------------------------------------------- /HW4/hw4p1/handout/fixtures/generation.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW4/hw4p1/handout/fixtures/generation.npy -------------------------------------------------------------------------------- /HW4/hw4p1/handout/fixtures/generation_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW4/hw4p1/handout/fixtures/generation_test.npy -------------------------------------------------------------------------------- /HW4/hw4p1/handout/fixtures/prediction.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiranchen/CMU11785-Deep-Learning/9717abd005e9aea9ae0a0d02169cf16f36260729/HW4/hw4p1/handout/fixtures/prediction.npz -------------------------------------------------------------------------------- /HW4/hw4p1/handout/fixtures/prediction_test.npz: 
"""Scoring helpers from HW4/hw4p1/handout/hw4/tests.py."""
import numpy as np


def log_softmax(x, axis):
    """Return the log-softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    ``np.exp`` cannot overflow; the result is mathematically unchanged.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    return shifted - log_norm


def array_to_str(arr, vocab):
    """Look up every index of ``arr`` in ``vocab`` and join the entries with spaces."""
    return " ".join(vocab[a] for a in arr)


def test_prediction(out, targ):
    """Return the mean negative log-likelihood of ``targ`` given scores ``out``.

    ``out`` is normalised with a log-softmax over axis 1; row ``r`` then
    contributes the entry found at column ``targ[r]``.
    """
    log_probs = log_softmax(out, 1)
    picked = log_probs[np.arange(log_probs.shape[0]), targ]
    return -np.mean(picked)


def test_generation(inp, pred, vocab):
    """Return one ``Input | Output #i: ... | ...`` line per row of ``inp``.

    Row ``i`` of ``pred`` is paired with row ``i`` of ``inp``; both rows are
    rendered with :func:`array_to_str`.
    """
    # A single join avoids rebuilding the string on every iteration.
    return u"".join(
        u"Input | Output #{}: {} | {}\n".format(
            i, array_to_str(inp[i], vocab), array_to_str(pred[i], vocab))
        for i in range(inp.shape[0])
    )


# These helpers keep their historical ``test_`` names but are not unit tests;
# the flag asks pytest not to collect them when the module is imported.
test_prediction.__test__ = False
test_generation.__test__ = False
 18 | - Type “source activate pytorch_p36” in the terminal
 19 | - In the top directory, type “python3 main.py” to run the model
 20 | - After the model finishes, find the predictions in predicted_test.csv and predicted_dev.csv in the data folder 21 | 22 | 23 | Design choices: 24 | - Experimented with Gumbel noise and with gradually raising the teacher forcing rate from 0.1 to 0.4, using a ReduceLROnPlateau scheduler 25 | - Used Adam optimizer with lr=0.001 26 | - Used a ReduceLROnPlateau scheduler with factor=0.75, patience=1, threshold=0.01 27 | - First trained the model with the teacher forcing rate at 0.1; after 25 epochs, when the edit 28 | distance stopped improving, changed the teacher forcing rate to 0.2 and trained for another 30 29 | epochs 30 | - Gumbel noise did not help performance: the model stopped improving at a 31 | very early stage. 32 | - Changing the teacher forcing rate worked well for me
 -------------------------------------------------------------------------------- /HW4/hw4p2/dataloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.utils.data import Dataset 4 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence 5 | 6 | LETTER_LIST = ['', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \ 7 | 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ','',''] 8 | 9 | 10 | ''' 11 | Optional, create dictionaries for letter2index and index2letter transformations 12 | ''' 13 | def create_dictionaries(letter_list): 14 | n = len(letter_list) 15 | letter2index = {letter_list[i]:i for i in range(0, n)} 16 | index2letter = {i:letter_list[i] for i in range(0, n)} 17 | return letter2index, index2letter 18 | 19 | letter2index, index2letter = create_dictionaries(LETTER_LIST) 20 | 21 | ''' 22 | Loading all the numpy files containing the utterance information and text information 23 | ''' 24 | def load_data(dataPath): 25 | speech_train = np.load(dataPath+"train_new.npy", allow_pickle=True, encoding='bytes') 26 | speech_dev = np.load(dataPath+"dev_new.npy", allow_pickle=True, encoding='bytes') 27 | speech_test = np.load(dataPath+"test_new.npy", allow_pickle=True, encoding='bytes') 28 | 29 | transcript_train = np.load(dataPath+"train_transcripts.npy", allow_pickle=True, encoding='bytes') 30 | transcript_dev = np.load(dataPath+"dev_transcripts.npy", allow_pickle=True, encoding='bytes') 31 | 32 | return speech_train, speech_dev, speech_test, transcript_train, transcript_dev 33 | 34 | 35 | ''' 36 | Transforms alphabetical input to numerical input, replace each letter by its corresponding 37 | index from letter_list 38 | ''' 39 | def transform_letter_to_index(transcript): 40 | ''' 41 | :param transcript :(N, ) Transcripts are the text input 42 | :param letter_list: Letter list defined above 
43 | :return letter_to_index_list: Returns a list for all the transcript sentence to index 44 | ''' 45 | letter_to_index_list = [] 46 | for sent in transcript: 47 | letters = [letter2index['']] 48 | for word in sent: 49 | # Converte from byte format to string for mapping 50 | s = word.decode('utf-8') 51 | for c in s: 52 | letters.append(letter2index[c]) 53 | # Space between each word 54 | letters.append(letter2index[' ']) 55 | letters.pop() 56 | letters.append(letter2index['']) 57 | letter_to_index_list.append(letters) 58 | return letter_to_index_list 59 | 60 | def transform_index_to_letter(index, stopIdxs): 61 | index_to_letter_list = [] 62 | for r in index: 63 | curr = "" 64 | for i in r: 65 | # Reached the end of the sentence 66 | if i in stopIdxs: 67 | break 68 | else: 69 | curr += index2letter[i] 70 | index_to_letter_list.append(curr) 71 | return index_to_letter_list 72 | 73 | 74 | class Speech2TextDataset(Dataset): 75 | ''' 76 | Dataset class for the speech to text data, this may need some tweaking in the 77 | getitem method as your implementation in the collate function may be different from 78 | ours. 
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset


class Speech2TextDataset(Dataset):
    '''
    Dataset pairing each utterance with its index-encoded transcript.

    When ``text`` is None (test data) items are the utterance tensor alone;
    otherwise items are ``(utterance, transcript)`` tuples.
    '''
    def __init__(self, speech, text):
        self.dataX = speech
        self.dataY = text

    def __len__(self):
        return len(self.dataX)

    def __getitem__(self, index):
        features = torch.tensor(self.dataX[index].astype(np.float32))
        # Identity check: ``== None`` is evaluated element-wise when the
        # transcripts are stored in a numpy array.
        if self.dataY is None:  # test scenario
            return features
        return features, torch.tensor(self.dataY[index])


def collate_train(batch):
    '''
    Pad a list of ``(utterance, transcript)`` items into batch tensors.

    The first index of every transcript is dropped, so the targets are the
    transcript shifted by one position.

    :return: inputs_pad (B, T, C), targets_pad (B, L), inputs_lens (B,), targets_lens (B,)
    '''
    # as_tensor re-uses tensors produced by the dataset instead of copying them.
    inputs = [torch.as_tensor(speech) for speech, _ in batch]
    targets = [torch.as_tensor(text)[1:] for _, text in batch]
    inputs_lens = torch.tensor([len(speech) for speech in inputs])
    targets_lens = torch.tensor([len(text) for text in targets])
    inputs_pad = pad_sequence(inputs, batch_first=True)  # (B, T, C) because batch_first=True
    targets_pad = pad_sequence(targets, batch_first=True)
    return inputs_pad, targets_pad, inputs_lens, targets_lens


def collate_test(batch):
    '''
    Pad a list of utterances into one tensor.

    :return: inputs_pad (B, T, C), inputs_lens (B,)
    '''
    inputs = [torch.as_tensor(speech) for speech in batch]
    inputs_lens = torch.tensor([len(speech) for speech in inputs])
    inputs_pad = pad_sequence(inputs, batch_first=True)
    return inputs_pad, inputs_lens
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import os
import sys
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from dataloader import load_data, transform_letter_to_index, collate_train, collate_test, Speech2TextDataset
from dataloader import LETTER_LIST, letter2index, index2letter
from models import Seq2Seq
# ``inference`` is used by the last two cells and has to be imported as well.
from train_test import train, val, inference

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


# %%
hyper = {
    'dataPath': "./data/",
    'batchSize': 64 if DEVICE == 'cuda' else 3,
    'epochs': 25,
    'encoder_hidden_dim': 256,
    'decoder_hidden_dim': 512,
    'embed_dim': 256,
    'value_size': 128,
    'key_size': 128,
    'isAttended': True,
    'displayBatchFreq': 50,
    'displayPredFreq': 10,
    'checkpointPath': "./checkpoint/",
    "savedCheckpoint": "./checkpoint/init_epoch11.txt",
    'testPredCSVfn': './data/predicted_test.csv',
    'devPredCSVfn': './data/predicted_dev.csv',
    'testPredNpyfn': './data/predicted_test.npy'
}


# %%
# Load datasets
print("*** Load raw data ***")
speech_train, speech_dev, speech_test, transcript_train, transcript_dev = load_data(hyper['dataPath'])


# %%
# Preprocess transcript to char level index
print("*** Process transcript to char level index ***")
character_text_train = transform_letter_to_index(transcript_train)
character_text_dev = transform_letter_to_index(transcript_dev)


# %%
# Get dataloaders
print("*** Get data loaders ***")
train_dataset = Speech2TextDataset(speech_train, character_text_train)
dev_dataset = Speech2TextDataset(speech_dev, character_text_dev)
test_dataset = Speech2TextDataset(speech_test, None)
train_loader = DataLoader(train_dataset, batch_size=hyper['batchSize'], shuffle=True, collate_fn=collate_train)  # 387
dev_loader = DataLoader(dev_dataset, batch_size=hyper['batchSize'], shuffle=False, collate_fn=collate_train)  # 18
test_loader = DataLoader(test_dataset, batch_size=hyper['batchSize'], shuffle=False, collate_fn=collate_test)  # 9


# %%
# Define model and optimizer
print("*** Create the model and define Loss and Optimizer ***")
model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), encoder_hidden_dim=hyper['encoder_hidden_dim'],
                decoder_hidden_dim=hyper['decoder_hidden_dim'],
                embed_dim=hyper['embed_dim'],
                value_size=hyper['value_size'],
                key_size=hyper['key_size'],
                isAttended=hyper['isAttended'])
optimizer = optim.Adam(model.parameters(), lr=0.001)
# reduction='none' keeps one loss value per position so padding can be masked out later.
criterion = nn.CrossEntropyLoss(reduction='none')
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.75, patience=1, verbose=True, threshold=1e-2)


# %%
model.to(DEVICE)
print(model)


# %%
warnings.filterwarnings("ignore", category=UserWarning)

# torch.save does not create missing directories.
os.makedirs(hyper["checkpointPath"], exist_ok=True)

for epoch in range(hyper['epochs']):
    # Print current learning rate
    for param_group in optimizer.param_groups:
        print("Current lr: \t{}".format(param_group["lr"]))

    # Train
    print("Start Train \t{} Epoch".format(epoch))
    startTime = time.time()
    train(model, train_loader, criterion, optimizer, epoch, hyper['displayBatchFreq'])

    # Save checkpoint
    print("*** Saving Checkpoint ***")
    path = "{}init_epoch{}.txt".format(hyper["checkpointPath"], epoch)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()}, path)
    print("="*20 + " Epoch {} took {}s".format(epoch, time.time()-startTime) + "="*20)

    # Evaluate; the scheduler lowers the learning rate when the edit distance plateaus.
    # (The former ``sampleSize`` keyword is dropped: ``val`` does not define it.)
    print("Start Dev \t{} Epoch".format(epoch))
    editDist = val(model, dev_loader, criterion, epoch, displayBatchFreq=50, displayPredFreq=3)
    scheduler.step(editDist)


# %%
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
checkpoint = torch.load(hyper["savedCheckpoint"], map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(DEVICE)


# %%
# valid inference
validInfer = inference(model, dev_loader, hyper, isValid=True)


# %%
# test inference
testInfer = inference(model, test_loader, hyper, isValid=False)
import torch
import torch.nn as nn


class Attention(nn.Module):
    '''
    Dot-product attention between a decoder query and the encoder keys/values:
        energy    = bmm(key, query)
        attention = softmax(energy), with padded time steps masked out
        context   = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query: (N, key_size) query vector, one per batch element
        :param key: (N, T_max, key_size) key projection per encoder time step
        :param value: (N, T_max, value_size) value projection per encoder time step
        :param lens: (N,) number of valid time steps per batch element
        :return output: (N, value_size) attended context
        :return attention: (N, T_max) attention weights that can be plotted
        '''
        # (N, T_max, key_size) x (N, key_size, 1) -> (N, T_max, 1) -> (N, T_max)
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)

        # Build the padding mask on the same device as the scores, so it
        # works wherever ``lens`` happens to live.
        steps = torch.arange(key.size(1), device=energy.device)
        lens = torch.as_tensor(lens, device=energy.device)
        mask = steps.unsqueeze(0) >= lens.unsqueeze(1)  # (1, T_max) >= (N, 1) -> (N, T_max)

        # A large negative score gives padded steps zero weight after the softmax.
        energy = energy.masked_fill(mask, -1e9)
        attention = nn.functional.softmax(energy, dim=1)  # (N, T_max)
        # (N, 1, T_max) x (N, T_max, value_size) -> (N, 1, value_size) -> (N, value_size)
        output = torch.bmm(attention.unsqueeze(1), value).squeeze(1)

        return output, attention
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM.

    Every pair of consecutive time steps is concatenated into one step before
    the bidirectional LSTM runs, so each layer halves the sequence length.
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1,
                             bidirectional=True, batch_first=True)

    def forward(self, x):
        '''
        :param x: PackedSequence whose features have size input_dim / 2
        :return output: PackedSequence with half as many time steps and
                        2 * hidden_dim features
        '''
        x_padded, x_lens = pad_packed_sequence(x, batch_first=True)

        # Drop a trailing odd frame so the time axis can be folded in pairs.
        even_steps = (x_padded.size(1) // 2) * 2
        x_padded = x_padded[:, :even_steps, :]  # (B, T, dim)

        # (B, T, dim) -> (B, T/2, dim*2)
        x_reshaped = x_padded.reshape(x_padded.size(0), even_steps // 2, x_padded.size(2) * 2)

        # The lengths returned by pad_packed_sequence stay on the CPU, which
        # is where pack_padded_sequence expects them.
        x_packed = pack_padded_sequence(x_reshaped, lengths=x_lens // 2,
                                        batch_first=True, enforce_sorted=False)

        out, _ = self.blstm(x_packed)
        return out


class Encoder(nn.Module):
    '''
    Encoder: a BiLSTM followed by three pBLSTM layers, whose output is
    projected linearly into keys and values.
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1,
                            bidirectional=True, batch_first=True)

        # Each pBLSTM receives two concatenated bidirectional outputs:
        # 2 directions * hidden_dim * 2 time steps = hidden_dim * 4 features.
        self.pBLSTMs = nn.Sequential(
            pBLSTM(hidden_dim*4, hidden_dim),
            pBLSTM(hidden_dim*4, hidden_dim),
            pBLSTM(hidden_dim*4, hidden_dim)
        )

        # Keys are projected to key_size and values to value_size (the two
        # sizes used to be swapped, which only worked when they were equal).
        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    def forward(self, x, lens):
        '''
        :param x: (B, T, input_dim) padded utterances
        :param lens: (B,) number of valid frames per utterance
        :return keys: (B, T', key_size)
        :return value: (B, T', value_size)
        :return encoder_lens: (B,) valid lengths after the three halvings
        '''
        # pack_padded_sequence needs its lengths on the CPU.
        lens = torch.as_tensor(lens).cpu()
        rnn_inp = pack_padded_sequence(x, lengths=lens, batch_first=True, enforce_sorted=False)

        outputs, _ = self.lstm(rnn_inp)
        outputs = self.pBLSTMs(outputs)

        linear_input, encoder_lens = pad_packed_sequence(outputs, batch_first=True)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)
        return keys, value, encoder_lens
import random
import time

import torch
import torch.nn as nn

from dataloader import letter2index


class Decoder(nn.Module):
    '''
    Character decoder that unrolls two stacked LSTMCells one step at a time.

    The hidden state of the second cell is the attention query; the attended
    context is concatenated with that hidden state to predict a character and
    with the next character embedding to form the next input.
    '''
    # Number of decoding steps taken when no transcript is supplied.
    MAX_DECODE_LEN = 250

    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, value_size=128, key_size=128, isAttended=False):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=embed_dim + value_size, hidden_size=decoder_hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=decoder_hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)
        self.value_size = value_size
        self.hidden_dim = decoder_hidden_dim

    def _start_embedding(self, batch_size, device):
        '''Embedding fed at step 0: the index stored under the '' key, once per batch element.'''
        # NOTE(review): the lookup key is an empty string in this copy; it is
        # presumably the start-of-sentence token -- confirm against
        # dataloader.LETTER_LIST.
        start_char = torch.full((batch_size,), letter2index[''], dtype=torch.long, device=device)
        return self.embedding(start_char)

    def _save_attention_plot(self, attentionPlot):
        '''Write the collected attention rows of one sample as a heat map under ./attention/.'''
        # Imported here because plotting is the only place that needs them.
        import matplotlib.pyplot as plt
        import seaborn as sns

        attentions = torch.stack(attentionPlot, dim=1)  # (encoder steps, decoder steps)
        plt.clf()
        sns.heatmap(attentions, cmap='GnBu')
        plt.savefig("./attention/heat_{}s.png".format(time.time()))

    def forward(self, key, values, encoder_lens, batch_idx, text=None, isTrain=True, teacherForcingRate = 0.1, isGumbel=False):
        '''
        :param key: (N, T, key_size) output of the encoder key projection
        :param values: (N, T, value_size) output of the encoder value projection
        :param encoder_lens: (N,) valid encoder lengths, used to mask attention
        :param batch_idx: batch number; during training an attention heat map
                          is saved whenever it is a multiple of 64
        :param text: (N, text_len) target indices, required when isTrain is True
        :param isTrain: train or eval mode
        :param teacherForcingRate: probability, per step, of feeding the
                          previous prediction instead of the ground truth
        :param isGumbel: sample the fed-back prediction with Gumbel-softmax
        :return predictions: (N, steps, vocab_size) unnormalised character scores
        '''
        batch_size = key.shape[0]
        device = key.device

        if isTrain:
            max_len = text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = self.MAX_DECODE_LEN

        plot_attention = bool(isTrain) and batch_idx % 64 == 0

        predictions = []
        hidden_states = [None, None]
        prediction = torch.zeros(batch_size, 1, device=device)
        context = values[:, 0, :]  # initial context
        attentionPlot = []
        for i in range(max_len):
            if isTrain and random.random() > teacherForcingRate:
                # Teacher forcing: feed the start symbol, then the ground truth.
                if i == 0:
                    char_embed = self._start_embedding(batch_size, device)
                else:
                    char_embed = embeddings[:, i-1, :]
            elif isTrain:
                # Feed back the previous prediction (all zeros before step 0).
                if i != 0 and isGumbel:
                    char_embed = torch.nn.functional.gumbel_softmax(prediction).mm(self.embedding.weight)
                else:
                    char_embed = self.embedding(prediction.argmax(dim=-1))
            elif i == 0:
                char_embed = self._start_embedding(batch_size, device)
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            # The first cell sees the character embedding and the current context.
            inp = torch.cat([char_embed, context], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])
            hidden_states[1] = self.lstm2(hidden_states[0][0], hidden_states[1])

            # The hidden state of the second cell is the attention query.
            output = hidden_states[1][0]

            if self.isAttended:
                context, attention = self.attention(output, key, values, encoder_lens)
                if plot_attention:
                    # Track the first sample of the batch only.
                    attentionPlot.append(attention[0].detach().cpu())
            elif i < values.size(1):
                context = values[:, i, :]
            else:
                context = torch.zeros(batch_size, self.value_size, device=device)

            prediction = self.character_prob(torch.cat([output, context], dim=1))
            predictions.append(prediction.unsqueeze(1))

        # Nothing is collected when attention is disabled, so skip the plot
        # instead of stacking an empty list.
        if plot_attention and attentionPlot:
            self._save_attention_plot(attentionPlot)

        return torch.cat(predictions, dim=1)


class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper that runs the
    Encoder and hands its keys, values and lengths to the Decoder.
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim=256, decoder_hidden_dim=512, embed_dim=256, value_size=128, key_size=128, isAttended=False):
        super(Seq2Seq, self).__init__()
        # The encoder must project to the same sizes the decoder expects.
        self.encoder = Encoder(input_dim, encoder_hidden_dim, value_size=value_size, key_size=key_size)
        self.decoder = Decoder(vocab_size, decoder_hidden_dim, embed_dim, value_size, key_size, isAttended)

    def forward(self, speech_input, speech_len, batchNum, text_input=None, isTrain=True):
        '''
        :param speech_input: padded utterances passed to the encoder
        :param speech_len: utterance lengths passed to the encoder
        :param batchNum: batch number, forwarded to the decoder
        :param text_input: target indices; only used when isTrain is True
        :return predictions: decoder output
        '''
        key, value, encoder_lens = self.encoder(speech_input, speech_len)
        if isTrain:
            return self.decoder(key, value, encoder_lens, batchNum, text_input)
        return self.decoder(key, value, encoder_lens, batchNum, text=None, isTrain=False)
def _collect_grad_stats(named_parameters):
    '''Return (layers, ave_grads, max_grads) for trainable, non-bias parameters that have a gradient.'''
    layers, ave_grads, max_grads = [], [], []
    for n, p in named_parameters:
        # Parameters that did not take part in the backward pass have no
        # gradient yet and are left out.
        if p.requires_grad and "bias" not in n and p.grad is not None:
            grad = p.grad.detach().abs()
            layers.append(n)
            # .item() yields plain floats, which matplotlib can draw even
            # when the gradients live on a GPU.
            ave_grads.append(grad.mean().item())
            max_grads.append(grad.max().item())
    return layers, ave_grads, max_grads


def plot_grad_flow(named_parameters):
    '''Plots the gradients flowing through different layers in the net during training.
    Can be used for checking for possible gradient vanishing / exploding problems.

    Usage: call after loss.backward() as
    "plot_grad_flow(model.named_parameters())" to visualize the gradient flow.

    :param named_parameters: iterable of (name, parameter) pairs
    :return: the matplotlib.pyplot module holding the drawn figure
    '''
    from matplotlib.lines import Line2D
    import matplotlib.pyplot as plt

    layers, ave_grads, max_grads = _collect_grad_stats(named_parameters)
    positions = list(range(len(max_grads)))

    plt.clf()
    plt.bar(positions, max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(positions, ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k")
    plt.xticks(positions, layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom=-0.001, top=0.02)  # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    return plt
import os
import time

import torch

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def generate_mask(lens):
    '''
    Build a padding mask from sequence lengths.

    :param lens: (B,) lengths as a tensor or a sequence of ints
    :return: int tensor of shape (B, max(lens)) on DEVICE, 1 where the
             position is inside the sequence and 0 where it is padding
    '''
    lens = torch.as_tensor(lens).to(DEVICE)
    positions = torch.arange(int(lens.max()), device=lens.device)
    return (positions.unsqueeze(0) < lens.unsqueeze(1)).int()


def _edit_distance(a, b):
    '''Levenshtein distance of two strings; pure-Python stand-in for Levenshtein.distance.'''
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]


def calc_edit_dist(preds, targets):
    '''
    Sum the edit distances of paired predicted and target strings.

    :return: total distance as a float (0.0 for empty input)
    '''
    try:
        from Levenshtein import distance
    except ImportError:
        # The extension package is optional; fall back to the local version.
        distance = _edit_distance
    return float(sum(distance(pred, target) for pred, target in zip(preds, targets)))


def train(model, train_loader, criterion, optimizer, epoch, displayBatchFreq=50):
    '''
    Run one training epoch.

    :param model: called as model(speech_input=, speech_len=, batchNum=, text_input=, isTrain=True)
    :param train_loader: yields (data, target, dataLens, targetLens) batches
    :param criterion: loss returning one value per position (it is masked
                      and averaged here)
    :param optimizer: optimizer stepped once per batch
    :param epoch: epoch number, used in log lines and plot file names
    :param displayBatchFreq: log and save a gradient-flow plot every this many batches
    '''
    from plot import plot_grad_flow

    model.train()
    model.to(DEVICE)
    start = time.time()
    runningLoss = 0.0
    runningPerplex = 0.0
    trainingLoss = 0.0
    trainingPerplex = 0.0

    for batch_idx, (data, target, dataLens, targetLens) in enumerate(train_loader):
        # Wrap the step in torch.autograd.set_detect_anomaly(True) when
        # hunting for NaN/inf gradients; it is off for normal runs.
        # dataLens stays on the CPU: pack_padded_sequence expects CPU lengths.
        data, target, targetLens = data.to(DEVICE), target.to(DEVICE), targetLens.to(DEVICE)
        optimizer.zero_grad()

        predictions = model(speech_input=data, speech_len=dataLens, batchNum=batch_idx,
                            text_input=target, isTrain=True)

        # Average the per-position loss over real (non-padding) positions only.
        mask = generate_mask(targetLens)
        loss = criterion(predictions.reshape(-1, predictions.size(2)), target.reshape(-1))
        masked_loss = torch.sum(loss * mask.reshape(-1)) / torch.sum(mask)

        # Accumulate running loss and perplexity.
        currLoss = masked_loss.item()
        currPerplex = torch.exp(masked_loss).item()
        runningLoss += currLoss
        runningPerplex += currPerplex
        trainingLoss += currLoss
        trainingPerplex += currPerplex

        masked_loss.backward()
        # clip_grad_norm_ is the in-place replacement of the deprecated clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        if batch_idx % displayBatchFreq == (displayBatchFreq-1):
            # savefig does not create the target directory.
            os.makedirs('./grad_plot', exist_ok=True)
            plt = plot_grad_flow(model.named_parameters())
            plt.savefig('./grad_plot/epoch{}_batch{}'.format(epoch, batch_idx), bbox_inches='tight')
            print("Epoch: {} Batch: {}\tLoss: {:.5f}\tCurrPerplex: {:.5f}\tAvgPreplex:{:.5f}\tTimestamp: {:.5f}".format(
                epoch, batch_idx, runningLoss/displayBatchFreq,
                currPerplex, runningPerplex/displayBatchFreq,
                time.time() - start))
            runningLoss = 0.0
            runningPerplex = 0.0

        # Release the batch before the next one is loaded.
        del data
        del target
        del dataLens
        del targetLens
        torch.cuda.empty_cache()

    end = time.time()
    print("Finished Epoch: {}\tTrainLoss: {:.5f}\tTrainPerplex: {:.5f}\tTimeTook: {:.5f}".format(
        epoch, trainingLoss/len(train_loader), trainingPerplex/len(train_loader), end - start))
import time

import numpy as np
import pandas as pd
import torch

from dataloader import letter2index, transform_index_to_letter


def _decode(indices):
    '''Turn a (B, L) tensor of indices into strings, cutting each row at the stop index.'''
    # NOTE(review): both stop entries are looked up with an empty-string key
    # in this copy; they presumably name two different special tokens --
    # confirm against dataloader.LETTER_LIST.
    stopIdxs = [letter2index[''], letter2index['']]
    return transform_index_to_letter(indices.detach().cpu().numpy(), stopIdxs)


def val(model, test_loader, criterion, epoch, displayBatchFreq=50, displayPredFreq=10, sampleSize=0):
    '''
    Decode every batch greedily and report the mean edit distance.

    :param model: called with isTrain=False, i.e. without the transcript
    :param test_loader: yields (data, target, dataLens, targetLens) batches
    :param criterion: unused; kept so existing callers do not break
    :param epoch: epoch number, used in the summary line
    :param displayBatchFreq: unused; kept so existing callers do not break
    :param displayPredFreq: print one prediction/target pair every this many batches
    :param sampleSize: unused; accepted because the training script passes it
    :return: summed edit distance divided by the number of sequences
    '''
    model.eval()
    model.to(DEVICE)
    start = time.time()
    runningDist = 0.0
    numSeq = 0.0
    print(len(test_loader))
    # Evaluation needs no gradients; skipping them saves memory.
    with torch.no_grad():
        for batch_idx, (data, target, dataLens, targetLens) in enumerate(test_loader):
            data = data.to(DEVICE)

            predictions = model(speech_input=data, speech_len=dataLens, batchNum=batch_idx,
                                text_input=None, isTrain=False)

            predText = _decode(predictions.argmax(-1))
            targetText = _decode(target)

            runningDist += calc_edit_dist(predText, targetText)
            numSeq += len(predText)

            if batch_idx % displayPredFreq == (displayPredFreq-1):
                print("-"*20)
                print("Pred:\n{}\nTarget:\n{}\n".format(predText[0], targetText[0]))
                print("-"*20)

            del data
            del target
            del dataLens
            del targetLens
            torch.cuda.empty_cache()
    end = time.time()
    print("Finished Epoch: {}\tEditDist: {:.5f}\tTimeTook: {:.5f}".format(epoch, runningDist/numSeq, end - start))
    return runningDist/numSeq


def inference(model, data_loader, hyper, isValid=False):
    '''
    Decode a whole loader and write the predictions to disk.

    :param model: called with isTrain=False
    :param data_loader: yields (data, target, dataLens, targetLens) when
                        isValid is True, otherwise (data, dataLens)
    :param hyper: supplies the output paths 'devPredCSVfn' (validation) or
                  'testPredNpyfn' and 'testPredCSVfn' (test)
    :param isValid: also decode the targets and print the mean edit distance
    :return: DataFrame of the predictions that was written to CSV
    '''
    res = []
    with torch.no_grad():
        model.eval()
        model.to(DEVICE)
        if isValid:
            targetRes = []
            runningDist = 0.0
            numSeq = 0.0
            for batch_idx, (data, target, dataLens, targetLens) in enumerate(data_loader):
                data = data.to(DEVICE)
                predictions = model(speech_input=data, speech_len=dataLens, batchNum=batch_idx,
                                    text_input=None, isTrain=False)

                predText = _decode(predictions.argmax(-1))
                targetText = _decode(target)
                res.extend(predText)
                targetRes.extend(targetText)

                runningDist += calc_edit_dist(predText, targetText)
                numSeq += len(predText)

                if batch_idx % 5 == (5-1):
                    print("-"*20)
                    print("Pred:\n{}\nTarget:\n{}\n".format(predText[0], targetText[0]))
                    print("-"*20)
            print("Edit distance for VAL:\t{:.5f}".format(runningDist/numSeq))
            df = pd.DataFrame({"Id": np.arange(len(res)), "Predicted": np.array(res), "Target": np.array(targetRes)})
            df.to_csv(hyper['devPredCSVfn'], index=False)
            return df

        for batch_idx, (data, dataLens) in enumerate(data_loader):
            data = data.to(DEVICE)
            predictions = model(speech_input=data, speech_len=dataLens, batchNum=batch_idx,
                                text_input=None, isTrain=False)
            res.extend(_decode(predictions.argmax(-1)))

        idxs = np.arange(len(res))
        preds = np.array(res)
        np.save(hyper['testPredNpyfn'], preds)
        df = pd.DataFrame({"Id": idxs, "Predicted": preds})
        df.to_csv(hyper['testPredCSVfn'], index=False)
        return df
predTexts = transform_index_to_letter(predictions.argmax(-1).detach().cpu().numpy(),\ 209 | [letter2index[''], letter2index['']]) 210 | res.extend(predTexts) 211 | 212 | idxs = np.array(list(range(len(res)))) 213 | preds = np.array(res) 214 | np.save(hyper['testPredNpyfn'], preds) 215 | df = pd.DataFrame({"Id" : idxs, "Predicted" : preds}) 216 | df.to_csv(hyper['testPredCSVfn'], index=False) 217 | return df 218 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Table of Contents 2 | - [Table of Contents](#table-of-contents) 3 | - [Introduction](#introduction) 4 | - [HW1 MLP | Phoneme Recognition](#hw1-mlp--phoneme-recognition) 5 | - [HW2 CNN | Face Recognition and Verification](#hw2-cnn--face-recognition-and-verification) 6 | - [HW3 RNN - Forward/Backword/CTC Beamsearch | Connectionist Temporal Classification](#hw3-rnn---forwardbackwordctc-beamsearch--connectionist-temporal-classification) 7 | - [HW4 Word-level Neural Language Models using RNNs | Attention Mechanisms and Memory Networks](#hw4-word-level-neural-language-models-using-rnns--attention-mechanisms-and-memory-networks) 8 | 9 | ## Introduction 10 | This repo contains the course projects for [11785 Deep Learning](http://deeplearning.cs.cmu.edu) at CMU. The projects start off with MLPs and progress into more complicated concepts like attention and seq2seq models. Each homework assignment consists of two parts. 11 | Part 1 is the Autolab software engineering component that involves engineering my own version of PyTorch libraries, implementing important algorithms, and developing optimization methods from scratch. 12 | Part 2 is the Kaggle data science component, which works on projects in hot AI topics like speech recognition, face recognition, and neural machine translation.
13 | 14 | 15 | ## HW1 MLP | Phoneme Recognition 16 | 17 | - HW1P1 18 | Implement simple MLP activations, loss, batch normalization. 19 | 20 | - HW1P2 21 | Kaggle challenge: [Frame level classification of speech](https://www.kaggle.com/c/11-785-s20-hw1p2).
Apply knowledge of feedforward neural networks to a speech recognition task. The provided dataset consists of audio recordings (utterances) and their phoneme state (subphoneme) labels. The data comes from articles published in the Wall Street Journal (WSJ) that are read aloud and labelled using the original text. 22 | The job is to identify the phoneme state label for each frame in the test dataset. It is important to note that utterances are of variable length. 23 | 24 | ## HW2 CNN | Face Recognition and Verification 25 | - HW2P1 26 | Implement a NumPy-based Convolutional Neural Network library. 27 | 28 | - HW2P2 29 | Kaggle challenge: [Face Classification](https://www.kaggle.com/c/11-785-s20-hw2p2-classification) & [Verification](https://www.kaggle.com/c/11-785-s20-hw2p2-verification) using Convolutional Neural Networks.
Given an image of a person’s face, the task of classifying the ID of the face is known as face classification. The input to the system will be a face image and the system will have to predict the ID of the face. The ground truth will be present in the training data and the network will be doing an 30 | N-way classification to get the prediction. The system is provided with a validation set for fine-tuning the model. 31 | ## HW3 RNN - Forward/Backword/CTC Beamsearch | Connectionist Temporal Classification 32 | - HW3P1 33 | Implement an RNN and GRU deep learning library modeled on PyTorch. 34 | 35 | - HW3P2 36 | Kaggle challenge: [Utterance to Phoneme Mapping](https://www.kaggle.com/c/11-785-s20-hw3p2).
This challenge works with speech data. The contest uses unaligned labels, which means the correlation between the features and labels is not given explicitly and the model will have to figure this out by itself. Hence the data will have a list of phonemes for each utterance, but not which frames correspond to which phonemes. 37 | The main task for this assignment will be to predict the phonemes contained in utterances in the test set. The training data does not contain aligned phonemes, and it is not a requirement to produce alignment for the test data. 38 | 39 | ## HW4 Word-level Neural Language Models using RNNs | Attention Mechanisms and Memory Networks 40 | - HW4P1 41 | Train a Recurrent Neural Network on the WikiText-2 Language Modeling Dataset. This task uses a recurrent network to model and generate text, and uses various techniques to regularize recurrent networks and improve their performance. 42 | 43 | - HW4P2 44 | Kaggle challenge: [Deep Learning Transcript Generation with Attention](https://www.kaggle.com/c/11-785-s20-hw4p2).
In this challenge, use a combination of Recurrent Neural Networks (RNNs) / Convolutional Neural Networks (CNNs) and Dense Networks to design a system for speech to text transcription. End-to-end, the system should be able to transcribe a given speech utterance to its corresponding transcript. 45 | --------------------------------------------------------------------------------