├── .gitignore
├── notes
│   ├── two_tier.jpg
│   ├── softmax_visualization.mp4
│   └── two_tier.txt
├── lib
│   ├── train.py
│   ├── __init__.py
│   └── ops.py
├── preprocess.py
├── dataset.py
├── vestigial
│   └── variable_length_data.py
├── README.md
├── baseline.py
├── baseline_gaussian.py
├── two_tier.py
├── two_tier_v.py
├── conv.py
├── two_tier_conv.py
├── vrnn.py
├── vrnn_ar.py
└── three_tier.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
--------------------------------------------------------------------------------
/notes/two_tier.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igul222/speech/HEAD/notes/two_tier.jpg
--------------------------------------------------------------------------------
/notes/softmax_visualization.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igul222/speech/HEAD/notes/softmax_visualization.mp4
--------------------------------------------------------------------------------
/lib/train.py:
--------------------------------------------------------------------------------
1 | import lib
2 | import numpy
3 | import theano
4 | import theano.tensor as T
5 | import lasagne
6 | # from theano.compile.nanguardmode import NanGuardMode
7 | 
8 | import math
9 | import time
10 | import locale
11 | 
12 | locale.setlocale(locale.LC_ALL, '')
13 | 
14 | def print_params_info(cost, params):
15 |     """Print information about the parameters in the given param set."""
16 | 
17 |     params = sorted(params, key=lambda p: p.name)
18 |     values = [p.get_value(borrow=True) for p in params]
19 |     shapes = [p.shape for p in values]
20 |     print "Params for cost:"
21 |     for param, value, shape in zip(params, values, shapes):
22 |         print "\t{0} ({1})".format(
23 |             param.name,
24 |             ",".join([str(x) for x in shape])
25 |         )
26 | 
27 |     total_param_count = 0
28 |     for shape in shapes:
29 |         param_count = 1
30 |         for dim in shape:
31 |             param_count *= dim
32 |         total_param_count += param_count
33 |     print "Total parameter count: {0}".format(
34 |         locale.format("%d", total_param_count, grouping=True)
35 |     )
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | 
4 | RAW_DATA_DIR="/media/seagate/blizzard/unsegmented"
5 | OUTPUT_DIR="/media/seagate/blizzard/parts"
6 | 
7 | # Step 1: write all filenames to a list
8 | with open(OUTPUT_DIR+'/preprocess_file_list.txt', 'w') as f:
9 |     for dirpath, dirnames, filenames in os.walk(RAW_DATA_DIR):
10 |         for filename in filenames:
11 |             f.write("file '" + dirpath + '/' + filename + "'\n")
12 | 
13 | # Step 2: concatenate everything into one massive wav file
14 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(OUTPUT_DIR, OUTPUT_DIR))
15 | 
16 | # Get the length (in seconds) of the resulting file
17 | length = float(subprocess.check_output('ffprobe -i {}/preprocess_all_audio.wav -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR), shell=True))
18 | 
19 | # Step 3: split the big file into non-overlapping 8-second chunks
20 | for i in xrange(int(length)//8 - 1):
21 |     os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(i*8, OUTPUT_DIR, OUTPUT_DIR, i))
22 | 
23 | # Step 4: clean up temp files
24 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR))
25 | os.system('rm {}/preprocess_file_list.txt'.format(OUTPUT_DIR))
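# Note: the training scripts (dataset.py, two_tier.py, baseline.py, etc.) expect
# this layout: OUTPUT_DIR containing p0.flac ... p{N-1}.flac, with N passed to
# them as N_FILES. The Blizzard configs use N_FILES = 141703, i.e. roughly 315
# hours of audio in 8-second chunks (141703 * 8 s ~ 315 h).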
--------------------------------------------------------------------------------
/notes/two_tier.txt:
--------------------------------------------------------------------------------
1 | Description of the model implemented in two_tier.py
2 | =========
3 | 
4 | The model operates on scalar-quantized 16 kHz speech waveforms, sample by sample:
5 | each waveform is a sequence x = x_0, x_1, ..., x_t, where each x_i represents
6 | one sample and can take one of 256 discrete values, corresponding to 256 linear
7 | quantization levels.
8 | 
9 | The model (taken as a whole) is purely autoregressive; it factorizes the
10 | distribution P(x) over waveforms as:
11 | P(x) = P(x_0) * P(x_1 | x_0) * P(x_2 | x_0, x_1) * ... * P(x_t | x_0, ..., x_t-1)
12 | 
13 | First I break the sequence into frames of 4 samples each:
14 | 
15 | f_0:4 = [x_0, x_1, x_2, x_3]
16 | f_4:8 = [x_4, x_5, x_6, x_7]
17 | etc...
18 | 
19 | I run an RNN (specifically, a 3-layer 1024-dim GRU) over these frames (first
20 | converting the discrete-valued samples in the frames back into continuous values
21 | so that they can be fed into the RNN).
22 | 
23 | I apply 4 separate 1024->1024 linear projections to the output of the RNN at
24 | each frame (one linear projection per sample). For a frame f_t:t+4, I'll call
25 | the output of these 4 linear projections o_t, o_t+1, o_t+2, and o_t+3.
26 | 
27 | Finally, an MLP predicts (using a softmax) a distribution over x_t conditioned on
28 | x_t-1, x_t-2, x_t-3, x_t-4, and o_t-4. Here, rather than feeding in the
29 | real-valued samples, I find the network performs better if I represent each
30 | sample as a 256-dim one-hot vector and concatenate the vectors for each sample,
31 | along with o_t-4. (In the implementation I use an embedding table for efficiency.)
32 | 
33 | Training details (most of these don't really matter that much):
34 | 
35 | I train on 8-second sequences from the Blizzard dataset, using truncated BPTT.
36 | Each truncated BPTT subsequence contains 256 samples (or 64 frames). Minibatch
37 | size 128.
38 | 
39 | I use Adam (default settings). I apply weight normalization
40 | (Salimans & Kingma 2016) to all weight matrices, which lets me use Adam's
41 | default learning rate of 1e-3. If you don't use weight norm, try lowering the
42 | learning rate to 2e-4.
43 | 
44 | Gradients are clipped elementwise to +/- 1.
45 | 
46 | All weight matrices are initialized from uniform distributions with stdev
47 | 1/sqrt(fan_in) (LeCun 1998), except those which occur before ReLUs; there I use
48 | the initialization from He et al. (2015).
49 | 
50 | For Blizzard, samples usually start to sound okay after ~50K iterations with the
51 | above settings. This takes about 6 hours on a Titan X for a 512-dim model.
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | import ops
2 | import train as _train
3 | 
4 | import numpy
5 | import theano
6 | import theano.tensor as T
7 | 
8 | import cPickle as pickle
9 | 
10 | _params = {}
11 | def param(name, *args, **kwargs):
12 |     """
13 |     A wrapper for `theano.shared` which enables parameter sharing in models.
14 | 
15 |     Creates and returns theano shared variables similarly to `theano.shared`,
16 |     except if you try to create a param with the same name as a
17 |     previously-created one, `param(...)` will just return the old one instead of
18 |     making a new one.
19 | 
20 |     This constructor also adds a `param` attribute to the shared variables it
21 |     creates, so that you can easily search a graph for all params.
22 |     """
23 | 
24 |     if name not in _params:
25 |         kwargs['name'] = name
26 |         param = theano.shared(*args, **kwargs)
27 |         param.param = True
28 |         _params[name] = param
29 |     return _params[name]
30 | 
31 | def delete_params(name):
32 |     to_delete = [p_name for p_name in _params if name in p_name]
33 |     for p_name in to_delete:
34 |         del _params[p_name]
35 | 
36 | def search(node, criterion):
37 |     """
38 |     Traverse the Theano graph starting at `node` and return a list of all nodes
39 |     which match the `criterion` function. When optimizing a cost function, you
40 |     can use this to get a list of all of the trainable params in the graph, like
41 |     so:
42 | 
43 |     `lib.search(cost, lambda x: hasattr(x, "param"))`
44 |     """
45 | 
46 |     def _search(node, criterion, visited):
47 |         if node in visited:
48 |             return []
49 |         visited.add(node)
50 | 
51 |         results = []
52 |         if isinstance(node, T.Apply):
53 |             for inp in node.inputs:
54 |                 results += _search(inp, criterion, visited)
55 |         else: # Variable node
56 |             if criterion(node):
57 |                 results.append(node)
58 |             if node.owner is not None:
59 |                 results += _search(node.owner, criterion, visited)
60 |         return results
61 | 
62 |     return _search(node, criterion, set())
63 | 
64 | def floatX(x):
65 |     """
66 |     Convert `x` to the numpy type specified in `theano.config.floatX`.
67 |     """
68 | 
69 |     if theano.config.floatX == 'float16':
70 |         return numpy.float16(x)
71 |     elif theano.config.floatX == 'float32':
72 |         return numpy.float32(x)
73 |     else: # Theano's default float type is float64
74 |         print "Warning: lib.floatX using float64"
75 |         return numpy.float64(x)
76 | 
77 | def save_params(path):
78 |     param_vals = {}
79 |     for name, param in _params.iteritems():
80 |         param_vals[name] = param.get_value()
81 | 
82 |     with open(path, 'wb') as f:
83 |         pickle.dump(param_vals, f)
84 | 
85 | def load_params(path):
86 |     with open(path, 'rb') as f:
87 |         param_vals = pickle.load(f)
88 | 
89 |     for name, val in param_vals.iteritems():
90 |         _params[name].set_value(val)
91 | 
92 | def clear_all_params():
93 |     to_delete = [p_name for p_name in _params]
94 |     for p_name in to_delete:
95 |         del _params[p_name]
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | 
6 | import numpy
7 | import scipy.io.wavfile
8 | import scikits.audiolab
9 | 
10 | import random
11 | import time
12 | 
13 | random_seed = 123
14 | 
15 | def feed_epoch(data_path, n_files, BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO):
16 |     global random_seed
17 |     """
18 |     Generator that yields training inputs (subbatch, reset). `subbatch` contains
19 |     quantized audio data; `reset` is a boolean indicating the start of a new
20 |     sequence (i.e. you should reset h0 whenever `reset` is True).
21 | 
22 |     Feeds subsequences which overlap by a specified amount, so that the model
23 |     can always have a target for every input in a given subsequence.
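    (For example, two_tier.py passes OVERLAP = FRAME_SIZE: with SEQ_LEN=256 and
    OVERLAP=2, each yielded subbatch is 258 samples wide, so every one of the
    256 training positions has a full frame of context and a target.)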
24 | 
25 |     Loads sequentially-named FLAC files in a directory
26 |     (p0.flac, p1.flac, p2.flac, ..., p[n_files-1].flac)
27 | 
28 |     Assumes all flac files have the same length.
29 | 
30 |     data_path: directory containing the flac files
31 |     n_files: how many FLAC files are in the directory
32 |     (see two_tier.py for a description of the constants)
33 | 
34 |     returns: (subbatch, reset)
35 |     subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP)
36 |     reset: True or False
37 |     """
38 | 
39 |     def round_to(x, y):
40 |         """round x up to the nearest multiple of y"""
41 |         return int(numpy.ceil(x / float(y))) * y
42 | 
43 |     def batch_quantize(data):
44 |         """
45 |         floats in (-1, 1) to ints in [0, Q_LEVELS-1];
46 |         each row (axis 1) is min-max rescaled independently
47 |         """
48 |         eps = numpy.float64(1e-5)
49 | 
50 |         data -= data.min(axis=1)[:, None]
51 | 
52 |         data *= ((Q_LEVELS - eps) / data.max(axis=1)[:, None])
53 |         data += eps/2
54 |         # print "WARNING using zero-dc-offset normalization"
55 |         # data -= data.mean(axis=1)[:, None]
56 |         # data *= (((Q_LEVELS/2.) - eps) / numpy.abs(data).max(axis=1)[:, None])
57 |         # data += Q_LEVELS/2
58 | 
59 |         data = data.astype('int32')
60 | 
61 |         return data
62 | 
63 |     paths = [data_path+'/p{}.flac'.format(i) for i in xrange(n_files)]
64 | 
65 |     random.seed(random_seed)
66 |     random.shuffle(paths)
67 |     random_seed += 1
68 | 
69 |     batches = []
70 |     for i in xrange(len(paths) / BATCH_SIZE):
71 |         batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
72 | 
73 |     random.shuffle(batches)
74 | 
75 |     for batch_paths in batches:
76 |         # batch_seq_len = length of the sequences in this batch (all files are
77 |         # assumed to have the same length), rounded up to the nearest SEQ_LEN.
78 |         batch_seq_len = len(scikits.audiolab.flacread(batch_paths[0])[0])
79 |         batch_seq_len = round_to(batch_seq_len, SEQ_LEN)
80 | 
81 |         batch = numpy.zeros(
82 |             (BATCH_SIZE, batch_seq_len),
83 |             dtype='float64'
84 |         )
85 | 
86 |         for i, path in enumerate(batch_paths):
87 |             data, fs, enc = scikits.audiolab.flacread(path)
88 |             batch[i, :len(data)] = data
89 | 
90 |         if Q_LEVELS is not None:
91 |             batch = batch_quantize(batch)
92 | 
93 |             batch = numpy.concatenate([
94 |                 numpy.full((BATCH_SIZE, OVERLAP), Q_ZERO, dtype='int32'),
95 |                 batch
96 |             ], axis=1)
97 |         else:
98 |             batch = numpy.concatenate([
99 |                 numpy.full((BATCH_SIZE, OVERLAP), 0, dtype='float32'),
100 |                 batch
101 |             ], axis=1)
102 |             batch = batch.astype('float32')
103 | 
104 |             batch -= batch.mean()
105 |             batch /= batch.std()
106 | 
107 |         for i in xrange((batch.shape[1] - OVERLAP) // SEQ_LEN):
108 |             reset = numpy.int32(i==0)
109 |             subbatch = batch[:, i*SEQ_LEN : (i+1)*SEQ_LEN + OVERLAP]
110 |             yield (subbatch, reset)
--------------------------------------------------------------------------------
/vestigial/variable_length_data.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | 
5 | WARNING: I'm pretty sure there's a bug in here somewhere:
6 | I can't get the same training loss that I get with dataset.py's feed_epoch using
7 | load_sequential_flac_files and feed_data.
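(Kept in vestigial/ for reference only; dataset.py is the loader the current
scripts use.)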
8 | """
9 | 
10 | import numpy
11 | import scipy.io.wavfile
12 | import scikits.audiolab
13 | 
14 | import random
15 | import time
16 | 
17 | def load_segmented_blizzard_metadata(data_path, test_set_size):
18 |     """
19 |     data_path: path to the blizzard dataset (should have a subdirectory 'segmented' with a file 'prompts.gui')
20 |     test_set_size: how many files to use for the test set
21 |     """
22 |     with open(data_path+'/prompts.gui') as prompts_file:
23 |         lines = [l[:-1] for l in prompts_file]
24 | 
25 |     filepaths = [data_path + '/wavn/' + fname + '.wav' for fname in lines[::3]]
26 |     transcripts = lines[1::3]
27 | 
28 |     # Clean up the transcripts
29 |     for i in xrange(len(transcripts)):
30 |         t = transcripts[i]
31 |         t = t.replace('@ ', '')
32 |         t = t.replace('# ', '')
33 |         t = t.replace('| ', '')
34 |         t = t.lower()
35 |         transcripts[i] = t
36 | 
37 |     # We use '*' as a null padding character
38 |     charmap = {'*': 0}
39 |     inv_charmap = ['*']
40 |     for t in transcripts:
41 |         for char in t:
42 |             if char not in charmap:
43 |                 charmap[char] = len(charmap)
44 |                 inv_charmap.append(char)
45 | 
46 |     all_data = zip(filepaths, transcripts)
47 |     random.seed(123)
48 |     random.shuffle(all_data)
49 |     train_data = all_data[test_set_size:]
50 |     test_data = all_data[:test_set_size]
51 | 
52 |     return charmap, inv_charmap, train_data, test_data
53 | 
54 | def load_sequential_flac_files(data_path, n_files, test_set_size):
55 |     """
56 |     Load sequentially-named FLAC files in a directory
57 |     (p0.flac, p1.flac, p2.flac, ..., p[n_files-1].flac)
58 | 
59 |     data_path: directory containing the flac files
60 |     n_files: how many FLAC files are in the directory
61 |     test_set_size: how many files to use for the test set
62 |     """
63 |     filepaths = [data_path+'/p{}.flac'.format(i) for i in xrange(n_files)]
64 |     transcripts = ['*' for i in xrange(n_files)]
65 |     charmap = {'*': 0}
66 |     inv_charmap = ['*']
67 |     all_data = zip(filepaths, transcripts)
68 |     random.seed(123)
69 |     random.shuffle(all_data)
70 |     train_data = all_data[test_set_size:]
71 |     test_data = all_data[:test_set_size]
72 |     return charmap, inv_charmap, train_data, test_data
73 | 
74 | def feed_data(data, charmap, shuffle, BATCH_SIZE, BITRATE, Q_LEVELS, Q_ZERO, N_PREV_SAMPLES, SEQ_LEN):
75 |     """
76 |     see the top of two_tier.py for a description of the constants
77 |     """
78 |     def read_audio_file(path):
79 |         if path.endswith('wav'):
80 |             audio = scipy.io.wavfile.read(path)[1].astype('float64')
81 |         elif path.endswith('flac'):
82 |             audio = scikits.audiolab.flacread(path)[0]
83 |         else:
84 |             raise Exception('Unknown filetype')
85 | 
86 |         eps = numpy.float64(1e-5)
87 |         audio -= audio.min()
88 |         audio *= (Q_LEVELS - eps) / audio.max()
89 |         audio += eps/2
90 |         return audio.astype('int32')
91 | 
92 |     _data = list(data)
93 |     if shuffle:
94 |         random.shuffle(_data)
95 | 
96 |     # Make sure the buffer size is longer than the longest sample in the dataset
97 |     buffer = numpy.full((BATCH_SIZE, BITRATE*40), Q_ZERO, dtype='int32')
98 |     head = 0
99 |     transcripts = [None] * BATCH_SIZE
100 | 
101 |     while True:
102 |         # Load new sequences into the buffer if necessary
103 |         resets = numpy.zeros(BATCH_SIZE, dtype='int32')
104 |         for i in xrange(BATCH_SIZE):
105 |             if numpy.array_equiv(buffer[i, head:], Q_ZERO):
106 |                 if len(_data) == 0:
107 |                     return # We've exhausted the dataset.
108 |                 path, transcript = _data.pop()
109 |                 audio = read_audio_file(path)
110 |                 # We add a few samples of Q_ZERO in the beginning to match
111 |                 # generation time (where we generate starting from zeros).
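#
# A sketch of the intended buffer layout after loading (N_PREV_SAMPLES is
# hypothetical here, e.g. 4):
#
#   buffer[i] = [..., Q_ZERO, Q_ZERO, Q_ZERO, Q_ZERO, audio[0], audio[1], ...]
#                     ^ head                          ^ head + N_PREV_SAMPLES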
112 | if len(audio) + N_PREV_SAMPLES > buffer.shape[1] - head: 113 | raise Exception('Audio file too long!') 114 | buffer[i, head+N_PREV_SAMPLES:head+len(audio)+N_PREV_SAMPLES] = audio 115 | transcripts[i] = transcript 116 | resets[i] = 1 117 | 118 | # Make a dense (padded) transcript matrix from transcripts 119 | padded_transcripts = numpy.full( 120 | (BATCH_SIZE, max(len(x) for x in transcripts)), 121 | charmap['*'], 122 | dtype='int32' 123 | ) 124 | for i, t in enumerate(transcripts): 125 | padded_transcripts[i, :len(t)] = [charmap[c] for c in t] 126 | 127 | # Yield the data batch 128 | yield ( 129 | buffer[:, head:head+SEQ_LEN], 130 | padded_transcripts, 131 | resets 132 | ) 133 | 134 | # Advance the head and if needed, roll the buffer 135 | buffer[:, head:head+SEQ_LEN] = Q_ZERO 136 | head += SEQ_LEN 137 | if head > buffer.shape[1] // 100: 138 | buffer = numpy.roll(buffer, -head, axis=1) 139 | head = 0 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | - **5/13**: Does my two-tier model actually learn longer-term dependencies, or does it just train faster? I vary frame size, controlling for sequence length, number of params, number of iters. 2 | - Frame size 4: `twotier_determ_bigrun_qzero_1462749482`, 1.827 iters 0-10K, 1.513 iters 90K-100K. (copied from below) 3 | - Frame size 2: `twotier_fs2_iters_1463123438` 1.775 iters 0-10K, 1.485 iters 90-100K. 4 | - Frame size 1: `twotier_fs1_iters_1463157179` (aborted but got to 70K iters) 5 | - **5/12**: I run the two-tier model with frame_size=2. 6 | - Evaluating by wall-clock time, taking the better of n_frames=64, 128 7 | - `twotier_fs2_nf64_time_1463123320` 1.834 first hour, 1.523 12th hour 8 | - `twotier_fs2_nf128_time_1463123388` 1.883 first hour, 1.504 12th hour 9 | - Interesting: frame size 2 performs (almost) as well as frame size 4. What about fs 1? 10 | - n_frames 64 `twotier_fs1_nf64_time_1463175548` (see spreadsheet) 11 | - n_frames 128 `twotier_fs1_nf128_time_1463175563` (see spreadsheet) 12 | - n_frames 256 `twotier_fs1_nf256_time_1463175585` (see spreadsheet) 13 | - **5/10**: I try overfitting to Kyle's kiwi01.wav. I train for 6 hours, generating samples every hour. 14 | - Both two-tier model and baseline (`baseline_kiwi_1462942688`, `twotier_kiwi_1462942828`) get almost-zero train cost, and generate samples indistinguishable from the original. 15 | - **5/9**: Per Yoshua's suggestion I add a term to the loss function asking the frame-level RNN to predict the next frame, without help of the sample-level MLP. 16 | - Before: `twotier_determ_bigrun_qzero_1462749482`, 1.827 iters 0-10K, 1.513 iters 90K-100K. (copied from below) 17 | - After: `twotier_ipcost_1462871075` 1.928 iters 0-10K, 1.537 iters 90K-100K. Samples are a little different but I'm not sure they're any better or worse. 18 | - I also try weighting the auxiliary cost term by 0.1: `twotier_ipcost_weighted_1462891119` 1.848 iters 0-10K, 1.520 iters 90-100K. Samples indistinguishable from original model. 19 | - Conclusions 20 | - This is basically multi-task learning, which usually works as a regularizer in regimes of limited data. But our data here is unlimited, so it's reasonable that this doesn't help NLL. 21 | - It's still possible that this method might produce better samples in some scenarios (even though it didn't seem to here), so I'll keep trying this in future experiments. 
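    - A minimal sketch of this auxiliary cost (hypothetical variable names, loosely following `two_tier.py`'s conventions):

      ```python
      # frame_rnn_outputs: per-frame RNN output, shape (batch, n_frames, DIM).
      # Project it to a direct prediction of all FRAME_SIZE samples of the next
      # frame (bypassing the sample-level MLP) and add this cross-entropy to
      # the main cost.
      ip_outputs = lib.ops.Linear('FrameLevel.IndepPred', DIM,
                                  FRAME_SIZE * Q_LEVELS, frame_rnn_outputs)
      ip_cost = T.nnet.categorical_crossentropy(
          T.nnet.softmax(ip_outputs.reshape((-1, Q_LEVELS))),
          target_sequences.flatten()
      ).mean()
      cost = cost + lib.floatX(0.1) * ip_cost  # 0.1 for the weighted variant
      ```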
22 | - **5/9**: I try changing my input normalization so that samples have zero DC offset (per Kyle McDonald's suggestion). Unfortunately this is probably going to improve NLL, but in a way that's meaningless. I'll evaluate by listening to samples and checking them in Audacity. 23 | - `twotier_zero_dc_offset_1462873780` 1.792 iters 0-10K, 1.504 iters 90K-100K. Samples seem weirdly broken though: speech still sounds good, but there's a very faint whining noise in the background the whole time. Maybe this is something to come back to if I have more time but for now I'm just going to leave it off. 24 | - **5/9**: I implement a flat, baseline model (`baseline.py`) and evaluate it against the two-tier model. 25 | - Basically a language model: 3 layers of stacked 512-dim GRU, taking as input one sample at a time and predicting the next timestep. 26 | - I try two variants: one feeding values into the GRUs as real values (what I did in two-tier), the other as embeddings of 256 discrete values. 27 | - I report NLLs in bits per sample on the train set (not perfect procedure, but mostly-OK because I never make it through one epoch). 28 | - Controlling for wall-clock time, where each model uses its own reasonable hyperparams (to see which model "wins" overall): 29 | - Two-tier: `twotier_time_benchmark_1462865129` 1.833 first hour, 1.503 12th hour. Samples a little noisy but decent / not broken. ***best model*** 30 | - Flat reals seqlen 64: `speech_baseline_time_reals_seqlen64_1462866948` 2.057 first hour, 1.696 12th hour. Samples clean but "warbly" / guttural sounding? 31 | - Flat reals seqlen 128: `speech_baseline_time_reals_seqlen128_1462867000` 2.143 first hour, 1.612 12th hour ***best baseline model*** 32 | - Flat embeddings seqlen 64: `speech_baseline_time_embed_seqlen64_1462867483` 2.104 first hour, 1.688 12th hour 33 | - Flat embeddings seqlen 128: `speech_baseline_time_embed_seqlen128_1462867499` 2.144 first hour, 1.624 12th hour 34 | - **5/13**: I run even more hyperparam combinations to be thorough. 35 | - Flat reals seqlen 256 512dim 3-layer `baseline_seqlen256_time_1463191213` 36 | - Two-tier 512dim 4-layer `twotier_512d_4layer_1463191505` 37 | - Two-tier 512dim 5-layer `twotier_512d_5layer_1463192292` 38 | - Two-tier 1024dim 3-layer `twotier_1024d_3layer_1463191610` 39 | - Two-tier 1024dim 4-layer `twotier_1024d_4layer_1463192438` 40 | - Two-tier 1024dim 5-layer `twotier_1024d_5layer_1463191722` 41 | - Flat reals seqlen 128 512dim 4-layer `baseline_seqlen128_512d_4layer_1463191559` 42 | - Flat reals seqlen 128 512dim 5-layer `baseline_seqlen128_512d_5layer_1463192296` 43 | - Flat reals seqlen 128 1024dim 3-layer `baseline_seqlen128_1024d_3layer_1463191659` 44 | - Flat reals seqlen 128 1024dim 4-layer `baseline_seqlen128_1024d_4layer_1463192446` 45 | - Flat reals seqlen 128 1024dim 5-layer `baseline_seqlen128_1024d_5layer_1463191875` 46 | - To see what happens if we ignore differences in training speed, I run a trial controlling for number of training steps, where each step sees the same sequence length (256) and batch size (128). 47 | - Two-tier: `twotier_determ_bigrun_qzero_1462749482`, 1.827 iters 0-10K, 1.513 iters 90K-100K. 48 | - Flat reals: `speech_baseline_iters_reals_1462866911` 2.003 iters 0-10K, 1.528 iters 90K-100K. 49 | - Flat embeddings: `speech_baseline_iters_embed_1462867526` 1.961 iters 0-10K, 1.534 iters 90K-100K. 
50 | - Update: I don't think these results are valid experimental procedure since I didn't control for time (giving baseline an advantage) or number of params (giving two-tier an advantage). Probably best to ignore them. Instead see the results for `twotier_fs1_iters_1463157179` above. 51 | - Conclusions 52 | - If you ignore training speed, for the hyperparameters tested, my model slightly outperforms the baseline. 53 | - But I don't think it's fair to ignore training speed. If you control for training speed, for the hyperparameters tested, my model outperforms the baseline by a wider margin. 54 | - **5/8**: To better understand how the model uses its softmax output, I sample from a 1024-dim model trained for 50K iterations and plot the softmax output distribution at each timestep. See `notes/softmax_visualization.mp4` (action starts around 7:00). I find the model learns roughly-Gaussian unimodal distributions. 55 | - **5/8**: I'm worried that the samples don't sound quite as good as the old implementation for some reason, so I make the script deterministic (`numpy.random.seed(123)`) and carefully step through the entire model, making sure its generated samples matched my previous implementation number-for-number. 56 | - **5/7**: Initial release of a cleaned-up (actually mostly rewritten) version of my current best model in `two_tier.py`. Written description in `notes/two_tier.txt` and hastily-drawn model diagram in `notes/two_tier.jpg`. -------------------------------------------------------------------------------- /lib/ops.py: -------------------------------------------------------------------------------- 1 | import lib 2 | import numpy 3 | import theano 4 | import theano.tensor as T 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | 7 | srng = RandomStreams(seed=234) 8 | 9 | def Linear( 10 | name, 11 | input_dims, 12 | output_dim, 13 | inputs, 14 | biases=True, 15 | initialization=None, 16 | weightnorm=True 17 | ): 18 | # print "warning weightnorm off" 19 | 20 | """ 21 | Compute a linear transform of one or more inputs, optionally with a bias. 22 | 23 | input_dims: list of ints, or int (if single input); the dimensionality of 24 | the input(s). 25 | output_dim: the dimensionality of the output. 26 | biases: whether or not to include a bias term. 27 | inputs: a theano variable, or list of variables (if multiple inputs); 28 | the inputs to which to apply the transform. 29 | initialization: one of None, `lecun`, `he`, `orthogonal` 30 | """ 31 | 32 | if not isinstance(input_dims, list): 33 | input_dims = [input_dims] 34 | inputs = [inputs] 35 | 36 | terms = [] 37 | 38 | def uniform(stdev, size): 39 | """uniform distribution with the given stdev and size""" 40 | return numpy.random.uniform( 41 | low=-stdev * numpy.sqrt(3), 42 | high=stdev * numpy.sqrt(3), 43 | size=size 44 | ).astype(theano.config.floatX) 45 | 46 | for i, (inp, inp_dim) in enumerate(zip(inputs, input_dims)): 47 | if initialization == 'lecun' or (initialization == None and inp_dim != output_dim): 48 | weight_values = uniform(numpy.sqrt(1. / inp_dim), (inp_dim, output_dim)) 49 | elif initialization == 'he': 50 | weight_values = uniform(numpy.sqrt(2. 
/ inp_dim), (inp_dim, output_dim)) 51 | elif initialization == 'orthogonal' or (initialization == None and inp_dim == output_dim): 52 | # From lasagne 53 | def sample(shape): 54 | if len(shape) < 2: 55 | raise RuntimeError("Only shapes of length 2 or more are " 56 | "supported.") 57 | flat_shape = (shape[0], numpy.prod(shape[1:])) 58 | # TODO: why normal and not uniform? 59 | a = numpy.random.normal(0.0, 1.0, flat_shape) 60 | u, _, v = numpy.linalg.svd(a, full_matrices=False) 61 | # pick the one with the correct shape 62 | q = u if u.shape == flat_shape else v 63 | q = q.reshape(shape) 64 | return q.astype(theano.config.floatX) 65 | weight_values = sample((inp_dim, output_dim)) 66 | else: 67 | raise Exception("Invalid initialization!") 68 | 69 | weight = lib.param( 70 | name + '.W'+str(i), 71 | weight_values 72 | ) 73 | 74 | if weightnorm: 75 | norm_values = numpy.linalg.norm(weight_values, axis=0) 76 | norms = lib.param( 77 | name + '.g'+str(i), 78 | norm_values 79 | ) 80 | 81 | normed_weight = weight * (norms / weight.norm(2, axis=0)).dimshuffle('x', 0) 82 | terms.append(T.dot(inp, normed_weight)) 83 | else: 84 | terms.append(T.dot(inp, weight)) 85 | 86 | if biases: 87 | terms.append(lib.param( 88 | name + '.b', 89 | numpy.zeros((output_dim,), dtype=theano.config.floatX) 90 | )) 91 | 92 | out = reduce(lambda a,b: a+b, terms) 93 | out.name = name + '.output' 94 | return out 95 | 96 | 97 | def Embedding(name, n_symbols, output_dim, indices): 98 | vectors = lib.param( 99 | name, 100 | numpy.random.randn( 101 | n_symbols, 102 | output_dim 103 | ).astype(theano.config.floatX) 104 | ) 105 | 106 | output_shape = [ 107 | indices.shape[i] 108 | for i in xrange(indices.ndim) 109 | ] + [output_dim] 110 | 111 | return vectors[indices.flatten()].reshape(output_shape) 112 | 113 | def softmax_and_sample(logits): 114 | old_shape = logits.shape 115 | flattened_logits = logits.reshape((-1, logits.shape[logits.ndim-1])) 116 | samples = T.cast( 117 | srng.multinomial(pvals=T.nnet.softmax(flattened_logits)), 118 | theano.config.floatX 119 | ).reshape(old_shape) 120 | return T.argmax(samples, axis=samples.ndim-1) 121 | 122 | def Recurrent(name, hidden_dims, step_fn, inputs, non_sequences=[], h0s=None): 123 | if not isinstance(inputs, list): 124 | inputs = [inputs] 125 | 126 | if not isinstance(hidden_dims, list): 127 | hidden_dims = [hidden_dims] 128 | 129 | if h0s is None: 130 | h0s = [None]*len(hidden_dims) 131 | 132 | for i in xrange(len(hidden_dims)): 133 | if h0s[i] is None: 134 | h0_unbatched = lib.param( 135 | name + '.h0_' + str(i), 136 | numpy.zeros((hidden_dims[i],), dtype=theano.config.floatX) 137 | ) 138 | num_batches = inputs[0].shape[1] 139 | h0s[i] = T.alloc(h0_unbatched, num_batches, hidden_dims[i]) 140 | 141 | h0s[i] = T.patternbroadcast(h0s[i], [False] * h0s[i].ndim) 142 | 143 | outputs, _ = theano.scan( 144 | step_fn, 145 | sequences=inputs, 146 | outputs_info=h0s, 147 | non_sequences=non_sequences 148 | ) 149 | 150 | return outputs 151 | 152 | def GRUStep(name, input_dim, hidden_dim, current_input, last_hidden): 153 | processed_input = lib.ops.Linear( 154 | name+'.Input', 155 | input_dim, 156 | 3 * hidden_dim, 157 | current_input 158 | ) 159 | 160 | gates = T.nnet.sigmoid( 161 | lib.ops.Linear( 162 | name+'.Recurrent_Gates', 163 | hidden_dim, 164 | 2 * hidden_dim, 165 | last_hidden, 166 | biases=False 167 | ) + processed_input[:, :2*hidden_dim] 168 | ) 169 | 170 | update = gates[:, :hidden_dim] 171 | reset = gates[:, hidden_dim:] 172 | 173 | scaled_hidden = reset * last_hidden 174 | 175 | 
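# Candidate state: h_tilde = tanh(W x_t + U (r_t * h_{t-1})); the return value
# below interpolates h_tilde with the previous hidden state via the update
# gate: h_t = u_t * h_tilde + (1 - u_t) * h_{t-1}.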
candidate = T.tanh( 176 | lib.ops.Linear( 177 | name+'.Recurrent_Candidate', 178 | hidden_dim, 179 | hidden_dim, 180 | scaled_hidden, 181 | biases=False, 182 | initialization='orthogonal' 183 | ) + processed_input[:, 2*hidden_dim:] 184 | ) 185 | 186 | one = lib.floatX(1.0) 187 | return (update * candidate) + ((one - update) * last_hidden) 188 | 189 | def LowMemGRU(name, input_dim, hidden_dim, inputs, h0=None): 190 | inputs = inputs.dimshuffle(1,0,2) 191 | 192 | def step(current_input, last_hidden): 193 | return GRUStep( 194 | name+'.Step', 195 | input_dim, 196 | hidden_dim, 197 | current_input, 198 | last_hidden 199 | ) 200 | 201 | if h0 is None: 202 | h0s = None 203 | else: 204 | h0s = [h0] 205 | 206 | out = Recurrent( 207 | name+'.Recurrent', 208 | hidden_dim, 209 | step, 210 | inputs, 211 | h0s=h0s 212 | ) 213 | 214 | out = out.dimshuffle(1,0,2) 215 | out.name = name+'.output' 216 | return out -------------------------------------------------------------------------------- /baseline.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.wait_for_gpu(high_priority=False, debug=True) 11 | except ImportError: 12 | pass 13 | 14 | import numpy 15 | numpy.random.seed(123) 16 | import random 17 | random.seed(123) 18 | 19 | import dataset 20 | 21 | import theano 22 | import theano.tensor as T 23 | import theano.ifelse 24 | import lib 25 | import lasagne 26 | import scipy.io.wavfile 27 | 28 | import time 29 | import functools 30 | import itertools 31 | 32 | # Hyperparams 33 | BATCH_SIZE = 128 34 | SEQ_LEN = 256 # How many audio samples to include in each truncated BPTT pass 35 | SEQ_LEN_ANNEAL_ITERS = 1 36 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 37 | N_GRUS = 4 # How many GRUs to stack in the frame-level model 38 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 39 | GRAD_CLIP = 1 # Elementwise grad clip threshold 40 | 41 | # Dataset 42 | DATA_PATH = '/media/seagate/blizzard/parts' 43 | N_FILES = 141703 44 | # DATA_PATH = '/PersimmonData/kiwi_parts' 45 | # N_FILES = 516 46 | BITRATE = 16000 47 | 48 | # Other constants 49 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 50 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations. 51 | STOP_ITERS = 200*1000 # Stop after this many iterations 52 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 53 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 
54 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 55 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 56 | 57 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, 256, 1, Q_LEVELS, Q_ZERO) 58 | for i in xrange(100*500): 59 | data_feeder.next() 60 | 61 | print "Model settings:" 62 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 63 | all_vars = sorted(all_vars, key=lambda x: x[0]) 64 | for var_name, var_value in all_vars: 65 | print "\t{}: {}".format(var_name, var_value) 66 | 67 | def sample_level_rnn(input_sequences, h0, reset): 68 | """ 69 | input_sequences.shape: (batch size, seq len) 70 | h0.shape: (batch size, N_GRUS, DIM) 71 | reset.shape: () 72 | output.shape: (batch size, seq len, Q_LEVELS) 73 | """ 74 | 75 | learned_h0 = lib.param( 76 | 'SampleLevel.h0', 77 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 78 | ) 79 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 80 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 81 | 82 | # Embedded inputs 83 | ################# 84 | 85 | FRAME_SIZE = Q_LEVELS 86 | frames = lib.ops.Embedding('SampleLevel.Embedding', Q_LEVELS, Q_LEVELS, input_sequences) 87 | 88 | # Real-valued inputs 89 | #################### 90 | 91 | # 'frames' of size 1 92 | # FRAME_SIZE = 1 93 | # frames = input_sequences.reshape(( 94 | # input_sequences.shape[0], 95 | # input_sequences.shape[1], 96 | # 1 97 | # )) 98 | # # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 99 | # # (a reasonable range to pass as inputs to the RNN) 100 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 101 | # frames *= lib.floatX(2) 102 | 103 | gru0 = lib.ops.LowMemGRU('SampleLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0]) 104 | # gru0 = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU0FF', DIM, DIM, gru0, initialization='he')) 105 | grus = [gru0] 106 | for i in xrange(1, N_GRUS): 107 | gru = lib.ops.LowMemGRU('SampleLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 108 | # gru = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU'+str(i)+'FF', DIM, DIM, gru, initialization='he')) 109 | grus.append(gru) 110 | 111 | # We apply the softmax later 112 | output = lib.ops.Linear( 113 | 'Output', 114 | N_GRUS*DIM, 115 | Q_LEVELS, 116 | T.concatenate(grus, axis=2) 117 | ) 118 | # output = lib.ops.Linear( 119 | # 'Output', 120 | # DIM, 121 | # Q_LEVELS, 122 | # grus[-1] 123 | # ) 124 | 125 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 126 | 127 | return (output, last_hidden) 128 | 129 | sequences = T.imatrix('sequences') 130 | h0 = T.tensor3('h0') 131 | reset = T.iscalar('reset') 132 | 133 | input_sequences = sequences[:, :-1] 134 | target_sequences = sequences[:, 1:] 135 | 136 | sample_level_outputs, new_h0 = sample_level_rnn(input_sequences, h0, reset) 137 | 138 | cost = T.nnet.categorical_crossentropy( 139 | T.nnet.softmax(sample_level_outputs.reshape((-1, Q_LEVELS))), 140 | target_sequences.flatten() 141 | ).mean() 142 | 143 | # By default we report cross-entropy cost in bits. 
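# (Theano's categorical_crossentropy returns nats; multiplying by
# 1/ln(2) = 1.44269504089 converts nats to bits.)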
144 | # Switch to nats by commenting out this line: 145 | cost = cost * lib.floatX(1.44269504089) 146 | 147 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 148 | lib._train.print_params_info(cost, params) 149 | 150 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 151 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 152 | 153 | updates = lasagne.updates.adam(grads, params) 154 | 155 | train_fn = theano.function( 156 | [sequences, h0, reset], 157 | [cost, new_h0], 158 | updates=updates, 159 | on_unused_input='warn' 160 | ) 161 | 162 | generate_outputs, generate_new_h0 = sample_level_rnn(sequences, h0, reset) 163 | generate_fn = theano.function( 164 | [sequences, h0, reset], 165 | [lib.ops.softmax_and_sample(generate_outputs), generate_new_h0], 166 | on_unused_input='warn' 167 | ) 168 | 169 | def generate_and_save_samples(tag): 170 | 171 | def write_audio_file(name, data): 172 | data = data.astype('float32') 173 | data -= data.min() 174 | data /= data.max() 175 | data -= 0.5 176 | data *= 0.95 177 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 178 | 179 | # Generate 5 sample files, each 5 seconds long 180 | N_SEQS = 10 181 | LENGTH = 5*BITRATE 182 | 183 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 184 | samples[:, 0] = Q_ZERO 185 | 186 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 187 | 188 | for t in xrange(1, LENGTH): 189 | samples[:, t:t+1], h0 = generate_fn( 190 | samples[:, t-1:t], 191 | h0, 192 | numpy.int32(t == 1) 193 | ) 194 | 195 | for i in xrange(N_SEQS): 196 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 197 | 198 | print "Training!" 199 | total_iters = 0 200 | total_time = 0. 201 | last_print_time = 0. 202 | last_print_iters = 0 203 | curr_seq_len = 2 204 | costs = [] 205 | for epoch in itertools.count(): 206 | 207 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 208 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, curr_seq_len, 1, Q_LEVELS, Q_ZERO) 209 | 210 | for seqs, reset in data_feeder: 211 | start_time = time.time() 212 | cost, h0 = train_fn(seqs, h0, reset) 213 | total_time += time.time() - start_time 214 | total_iters += 1 215 | 216 | costs.append(cost) 217 | 218 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 219 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 220 | 221 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 222 | epoch, 223 | total_iters, 224 | numpy.mean(costs), 225 | total_time, 226 | total_time / total_iters 227 | ) 228 | tag = "iters{}_time{}".format(total_iters, total_time) 229 | 230 | generate_and_save_samples(tag) 231 | lib.save_params('params_{}.pkl'.format(tag)) 232 | 233 | costs = [] 234 | last_print_time += PRINT_TIME 235 | last_print_iters += PRINT_ITERS 236 | 237 | if total_iters % SEQ_LEN_ANNEAL_ITERS == 0: 238 | if curr_seq_len < SEQ_LEN: 239 | print "Doubling curr_seq_len to {}".format(curr_seq_len*2) 240 | curr_seq_len *= 2 241 | break 242 | 243 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 244 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 245 | 246 | print "Done!" 
247 | 248 | try: # This only matters on Ishaan's computer 249 | import experiment_tools 250 | experiment_tools.send_sms("done!") 251 | except ImportError: 252 | pass 253 | 254 | sys.exit() -------------------------------------------------------------------------------- /baseline_gaussian.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.wait_for_gpu(high_priority=False, debug=False) 11 | except ImportError: 12 | pass 13 | 14 | import numpy 15 | numpy.random.seed(123) 16 | import random 17 | random.seed(123) 18 | 19 | import dataset 20 | 21 | import theano 22 | import theano.tensor as T 23 | import theano.ifelse 24 | import lib 25 | import lasagne 26 | import scipy.io.wavfile 27 | 28 | import time 29 | import functools 30 | import itertools 31 | 32 | # Hyperparams 33 | BATCH_SIZE = 128 34 | SEQ_LEN = 256 # How many audio samples to include in each truncated BPTT pass 35 | SEQ_LEN_ANNEAL_ITERS = 1 36 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 37 | N_GRUS = 4 # How many GRUs to stack in the frame-level model 38 | Q_LEVELS = None # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 39 | GRAD_CLIP = 1 # Elementwise grad clip threshold 40 | 41 | # Dataset 42 | # DATA_PATH = '/media/seagate/blizzard/parts' 43 | # N_FILES = 141703 44 | DATA_PATH = '/PersimmonData/kiwi_parts' 45 | N_FILES = 516 46 | BITRATE = 16000 47 | 48 | # Other constants 49 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 50 | GENERATE_AND_SAVE = True 51 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations. 52 | STOP_ITERS = 200*1000 # Stop after this many iterations 53 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 54 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 
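# NOTE: Q_LEVELS = None above keeps the data real-valued: dataset.feed_epoch
# then skips quantization and instead normalizes each batch to zero mean and
# unit variance (see dataset.py).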
55 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
56 | Q_ZERO = None#numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
57 | 
58 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
59 | theano_srng = RandomStreams(seed=234)
60 | 
61 | # data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, 256, 1, Q_LEVELS, Q_ZERO)
62 | # for i in xrange(100*500):
63 | #     data_feeder.next()
64 | 
65 | print "Model settings:"
66 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
67 | all_vars = sorted(all_vars, key=lambda x: x[0])
68 | for var_name, var_value in all_vars:
69 |     print "\t{}: {}".format(var_name, var_value)
70 | 
71 | def gaussian_nll(x, mu, log_sigma):
72 |     sigma_squared = T.exp(2*log_sigma)
73 |     return (
74 |         lib.floatX(0.5*numpy.log(2*numpy.pi)) +
75 |         log_sigma +
76 |         ( ((x-mu)**2) / (2*sigma_squared) )
77 |     )
78 | 
79 | def sample_level_rnn(input_sequences, h0, reset):
80 |     """
81 |     input_sequences.shape: (batch size, seq len)
82 |     h0.shape: (batch size, N_GRUS, DIM)
83 |     reset.shape: ()
84 |     output.shape: (batch size, seq len, 2)
85 |     """
86 | 
87 |     learned_h0 = lib.param(
88 |         'SampleLevel.h0',
89 |         numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
90 |     )
91 |     learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
92 |     h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
93 | 
94 |     # Embedded inputs
95 |     #################
96 | 
97 |     # FRAME_SIZE = Q_LEVELS
98 |     # frames = lib.ops.Embedding('SampleLevel.Embedding', Q_LEVELS, Q_LEVELS, input_sequences)
99 | 
100 |     # Real-valued inputs
101 |     ####################
102 | 
103 |     # 'frames' of size 1
104 |     FRAME_SIZE = 1
105 |     frames = input_sequences.reshape((
106 |         input_sequences.shape[0],
107 |         input_sequences.shape[1],
108 |         1
109 |     ))
110 |     # # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
111 |     # # (a reasonable range to pass as inputs to the RNN)
112 |     # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
113 |     # frames *= lib.floatX(2)
114 | 
115 |     gru0 = lib.ops.LowMemGRU('SampleLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
116 |     # gru0 = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU0FF', DIM, DIM, gru0, initialization='he'))
117 |     grus = [gru0]
118 |     for i in xrange(1, N_GRUS):
119 |         gru = lib.ops.LowMemGRU('SampleLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
120 |         # gru = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU'+str(i)+'FF', DIM, DIM, gru, initialization='he'))
121 |         grus.append(gru)
122 | 
123 |     # Output the Gaussian parameters (mu, log_sigma) for each timestep
124 |     output = lib.ops.Linear(
125 |         'Output',
126 |         N_GRUS*DIM,
127 |         2,
128 |         T.concatenate(grus, axis=2)
129 |     )
130 |     # output = lib.ops.Linear(
131 |     #     'Output',
132 |     #     DIM,
133 |     #     Q_LEVELS,
134 |     #     grus[-1]
135 |     # )
136 | 
137 |     last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
138 | 
139 |     return (output, last_hidden)
140 | 
141 | sequences = T.matrix('sequences')
142 | h0 = T.tensor3('h0')
143 | reset = T.iscalar('reset')
144 | 
145 | input_sequences = sequences[:, :-1]
146 | target_sequences = sequences[:, 1:]
147 | 
148 | sample_level_outputs, new_h0 = sample_level_rnn(input_sequences, h0, reset)
149 | 
150 | cost = T.mean(gaussian_nll(
151 |     target_sequences.flatten(),
152 |     sample_level_outputs.flatten()[::2],
153 |     sample_level_outputs.flatten()[1::2]
154 | ))
155 | # cost = T.nnet.categorical_crossentropy(
156 | #     T.nnet.softmax(sample_level_outputs.reshape((-1, Q_LEVELS))),
157 | #     target_sequences.flatten()
158 | # ).mean()
159 | 
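# The Gaussian NLL above is reported in nats per sample. gaussian_nll computes
# -log N(x; mu, sigma) = 0.5*log(2*pi) + log(sigma) + (x - mu)^2 / (2*sigma^2);
# the interleaved slicing (flatten()[::2], flatten()[1::2]) picks out mu and
# log_sigma from the 2-dim output at each timestep.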
160 | # By default we report cross-entropy cost in bits. 161 | # Switch to nats by commenting out this line: 162 | # cost = cost * lib.floatX(1.44269504089) 163 | 164 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 165 | lib._train.print_params_info(cost, params) 166 | 167 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 168 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 169 | 170 | updates = lasagne.updates.adam(grads, params) 171 | 172 | train_fn = theano.function( 173 | [sequences, h0, reset], 174 | [cost, new_h0], 175 | updates=updates, 176 | on_unused_input='warn' 177 | ) 178 | 179 | generate_outputs, generate_new_h0 = sample_level_rnn(sequences, h0, reset) 180 | g_mu = generate_outputs[:,:,0] 181 | g_log_sigma = generate_outputs[:,:,1] 182 | g_samples = g_mu + (T.exp(g_log_sigma)*theano_srng.normal(g_mu.shape)) 183 | generate_fn = theano.function( 184 | [sequences, h0, reset], 185 | [g_samples, generate_new_h0], 186 | on_unused_input='warn' 187 | ) 188 | 189 | def generate_and_save_samples(tag): 190 | 191 | def write_audio_file(name, data): 192 | # data = data.astype('float32') 193 | # data -= data.min() 194 | # data /= data.max() 195 | # data -= 0.5 196 | # data *= 0.95 197 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 198 | 199 | # Generate 5 sample files, each 5 seconds long 200 | N_SEQS = 10 201 | LENGTH = 5*BITRATE 202 | 203 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='float32') 204 | samples[:, 0] = 0 205 | 206 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 207 | 208 | for t in xrange(1, LENGTH): 209 | samples[:, t:t+1], h0 = generate_fn( 210 | samples[:, t-1:t], 211 | h0, 212 | numpy.int32(t == 1) 213 | ) 214 | 215 | for i in xrange(N_SEQS): 216 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 217 | 218 | print "Training!" 219 | total_iters = 0 220 | total_time = 0. 221 | last_print_time = 0. 222 | last_print_iters = 0 223 | curr_seq_len = 2 224 | costs = [] 225 | for epoch in itertools.count(): 226 | 227 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 228 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, curr_seq_len, 1, Q_LEVELS, Q_ZERO) 229 | 230 | for seqs, reset in data_feeder: 231 | start_time = time.time() 232 | cost, h0 = train_fn(seqs, h0, reset) 233 | total_time += time.time() - start_time 234 | total_iters += 1 235 | 236 | costs.append(cost) 237 | 238 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 239 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 240 | 241 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 242 | epoch, 243 | total_iters, 244 | numpy.mean(costs), 245 | total_time, 246 | total_time / total_iters 247 | ) 248 | tag = "iters{}_time{}".format(total_iters, total_time) 249 | 250 | if GENERATE_AND_SAVE: 251 | generate_and_save_samples(tag) 252 | lib.save_params('params_{}.pkl'.format(tag)) 253 | 254 | costs = [] 255 | last_print_time += PRINT_TIME 256 | last_print_iters += PRINT_ITERS 257 | 258 | if total_iters % SEQ_LEN_ANNEAL_ITERS == 0: 259 | if curr_seq_len < SEQ_LEN: 260 | print "Doubling curr_seq_len to {}".format(curr_seq_len*2) 261 | curr_seq_len *= 2 262 | break 263 | 264 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 265 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 266 | 267 | print "Done!" 
268 | 269 | try: # This only matters on Ishaan's computer 270 | import experiment_tools 271 | experiment_tools.send_sms("done!") 272 | except ImportError: 273 | pass 274 | 275 | sys.exit() -------------------------------------------------------------------------------- /two_tier.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.register_crash_notifier() 11 | experiment_tools.wait_for_gpu(high_priority=False) 12 | except ImportError: 13 | pass 14 | 15 | import numpy 16 | numpy.random.seed(123) 17 | import random 18 | random.seed(123) 19 | 20 | import dataset 21 | 22 | import theano 23 | import theano.tensor as T 24 | import theano.tensor.nnet.neighbours 25 | import theano.ifelse 26 | import lib 27 | import lasagne 28 | import scipy.io.wavfile 29 | 30 | import time 31 | import functools 32 | import itertools 33 | 34 | # Hyperparams 35 | BATCH_SIZE = 128 36 | N_FRAMES = 128 # How many 'frames' to include in each truncated BPTT pass 37 | FRAME_SIZE = 2 # How many samples per frame 38 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 39 | N_GRUS = 1 # How many GRUs to stack in the frame-level model 40 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 41 | GRAD_CLIP = 1 # Elementwise grad clip threshold 42 | 43 | # Dataset 44 | DATA_PATH = '/media/seagate/blizzard/parts' 45 | N_FILES = 141703 46 | # DATA_PATH = '/PersimmonData/kiwi_parts' 47 | # N_FILES = 516 48 | BITRATE = 16000 49 | 50 | # Other constants 51 | TRAIN_MODE = 'time' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 52 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 53 | STOP_ITERS = 100000 # Stop after this many iterations 54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 55 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 
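# NOTE: SEQ_LEN below is derived as N_FRAMES * FRAME_SIZE = 128 * 2 = 256
# samples, so each truncated BPTT pass covers a whole number of frames.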
56 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 57 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 58 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 59 | 60 | print "Model settings:" 61 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 62 | all_vars = sorted(all_vars, key=lambda x: x[0]) 63 | for var_name, var_value in all_vars: 64 | print "\t{}: {}".format(var_name, var_value) 65 | 66 | def frame_level_rnn(input_sequences, h0, reset): 67 | """ 68 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 69 | h0.shape: (batch size, N_GRUS, DIM) 70 | reset.shape: () 71 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 72 | """ 73 | 74 | learned_h0 = lib.param( 75 | 'FrameLevel.h0', 76 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 77 | ) 78 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 79 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 80 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 81 | 82 | frames = input_sequences.reshape(( 83 | input_sequences.shape[0], 84 | input_sequences.shape[1] / FRAME_SIZE, 85 | FRAME_SIZE 86 | )) 87 | 88 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 89 | # (a reasonable range to pass as inputs to the RNN) 90 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 91 | frames *= lib.floatX(2) 92 | 93 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0]) 94 | grus = [gru0] 95 | for i in xrange(1, N_GRUS): 96 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 97 | grus.append(gru) 98 | 99 | output = lib.ops.Linear( 100 | 'FrameLevel.Output', 101 | DIM, 102 | FRAME_SIZE * DIM, 103 | grus[-1], 104 | initialization='he' 105 | ) 106 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 107 | 108 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 109 | 110 | return (output, last_hidden) 111 | 112 | def sample_level_predictor(frame_level_outputs, prev_samples): 113 | """ 114 | frame_level_outputs.shape: (batch size, DIM) 115 | prev_samples.shape: (batch size, FRAME_SIZE) 116 | output.shape: (batch size, Q_LEVELS) 117 | """ 118 | 119 | prev_samples = lib.ops.Embedding( 120 | 'SampleLevel.Embedding', 121 | Q_LEVELS, 122 | Q_LEVELS, 123 | prev_samples 124 | ).reshape((-1, FRAME_SIZE * Q_LEVELS)) 125 | 126 | out = lib.ops.Linear( 127 | 'SampleLevel.L1_PrevSamples', 128 | FRAME_SIZE * Q_LEVELS, 129 | DIM, 130 | prev_samples, 131 | biases=False, 132 | initialization='he' 133 | ) 134 | out += frame_level_outputs 135 | out = T.nnet.relu(out) 136 | 137 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he') 138 | out = T.nnet.relu(out) 139 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he') 140 | out = T.nnet.relu(out) 141 | 142 | # We apply the softmax later 143 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out) 144 | 145 | sequences = T.imatrix('sequences') 146 | h0 = T.tensor3('h0') 147 | reset = T.iscalar('reset') 148 | 149 | input_sequences = sequences[:, :-FRAME_SIZE] 150 | target_sequences = sequences[:, FRAME_SIZE:] 151 | 152 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset) 153 | 154 | prev_samples = sequences[:, :-1] 155 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 156 | prev_samples = 
T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 157 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 158 | 159 | sample_level_outputs = sample_level_predictor( 160 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 161 | prev_samples 162 | ) 163 | 164 | cost = T.nnet.categorical_crossentropy( 165 | T.nnet.softmax(sample_level_outputs), 166 | target_sequences.flatten() 167 | ).mean() 168 | 169 | # By default we report cross-entropy cost in bits. 170 | # Switch to nats by commenting out this line: 171 | cost = cost * lib.floatX(1.44269504089) 172 | 173 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 174 | lib._train.print_params_info(cost, params) 175 | 176 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 177 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 178 | 179 | updates = lasagne.updates.adam(grads, params, learning_rate=1e-3) 180 | 181 | train_fn = theano.function( 182 | [sequences, h0, reset], 183 | [cost, new_h0], 184 | updates=updates, 185 | on_unused_input='warn' 186 | ) 187 | 188 | frame_level_generate_fn = theano.function( 189 | [sequences, h0, reset], 190 | frame_level_rnn(sequences, h0, reset), 191 | on_unused_input='warn' 192 | ) 193 | 194 | frame_level_outputs = T.matrix('frame_level_outputs') 195 | prev_samples = T.imatrix('prev_samples') 196 | sample_level_generate_fn = theano.function( 197 | [frame_level_outputs, prev_samples], 198 | lib.ops.softmax_and_sample( 199 | sample_level_predictor( 200 | frame_level_outputs, 201 | prev_samples 202 | ) 203 | ), 204 | on_unused_input='warn' 205 | ) 206 | 207 | def generate_and_save_samples(tag): 208 | 209 | def write_audio_file(name, data): 210 | data = data.astype('float32') 211 | data -= data.min() 212 | data /= data.max() 213 | data -= 0.5 214 | data *= 0.95 215 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 216 | 217 | # Generate 5 sample files, each 5 seconds long 218 | N_SEQS = 10 219 | LENGTH = 5*BITRATE 220 | 221 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 222 | samples[:, :FRAME_SIZE] = Q_ZERO 223 | 224 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 225 | frame_level_outputs = None 226 | 227 | for t in xrange(FRAME_SIZE, LENGTH): 228 | 229 | if t % FRAME_SIZE == 0: 230 | frame_level_outputs, h0 = frame_level_generate_fn( 231 | samples[:, t-FRAME_SIZE:t], 232 | h0, 233 | numpy.int32(t == FRAME_SIZE) 234 | ) 235 | 236 | samples[:, t] = sample_level_generate_fn( 237 | frame_level_outputs[:, t % FRAME_SIZE], 238 | samples[:, t-FRAME_SIZE:t] 239 | ) 240 | 241 | for i in xrange(N_SEQS): 242 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 243 | 244 | print "Training!" 245 | total_iters = 0 246 | total_time = 0. 247 | last_print_time = 0. 
248 | last_print_iters = 0 249 | for epoch in itertools.count(): 250 | 251 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 252 | costs = [] 253 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO) 254 | 255 | for seqs, reset in data_feeder: 256 | 257 | start_time = time.time() 258 | cost, h0 = train_fn(seqs, h0, reset) 259 | total_time += time.time() - start_time 260 | total_iters += 1 261 | 262 | costs.append(cost) 263 | 264 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 265 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 266 | 267 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 268 | epoch, 269 | total_iters, 270 | numpy.mean(costs), 271 | total_time, 272 | total_time / total_iters 273 | ) 274 | tag = "iters{}_time{}".format(total_iters, total_time) 275 | generate_and_save_samples(tag) 276 | lib.save_params('params_{}.pkl'.format(tag)) 277 | 278 | costs = [] 279 | last_print_time += PRINT_TIME 280 | last_print_iters += PRINT_ITERS 281 | 282 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 283 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 284 | 285 | print "Done!" 286 | 287 | try: # This only matters on Ishaan's computer 288 | import experiment_tools 289 | experiment_tools.send_sms("done!") 290 | except ImportError: 291 | pass 292 | 293 | sys.exit() -------------------------------------------------------------------------------- /two_tier_v.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.register_crash_notifier() 11 | experiment_tools.wait_for_gpu(high_priority=False) 12 | except ImportError: 13 | pass 14 | 15 | import numpy 16 | numpy.random.seed(123) 17 | import random 18 | random.seed(123) 19 | 20 | import dataset 21 | 22 | import theano 23 | import theano.tensor as T 24 | import theano.tensor.nnet.neighbours 25 | import theano.ifelse 26 | import lib 27 | import lasagne 28 | import scipy.io.wavfile 29 | 30 | import time 31 | import functools 32 | import itertools 33 | 34 | # Hyperparams 35 | BATCH_SIZE = 128 36 | N_FRAMES = 128 # How many 'frames' to include in each truncated BPTT pass 37 | FRAME_SIZE = 16 # How many samples per frame 38 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 39 | N_GRUS = 1 # How many GRUs to stack in the frame-level model 40 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 41 | GRAD_CLIP = 1 # Elementwise grad clip threshold 42 | 43 | # Dataset 44 | DATA_PATH = '/media/seagate/blizzard/parts' 45 | N_FILES = 141703 46 | # DATA_PATH = '/PersimmonData/kiwi_parts' 47 | # N_FILES = 516 48 | BITRATE = 16000 49 | 50 | # Other constants 51 | TRAIN_MODE = 'time' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 52 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 53 | STOP_ITERS = 100000 # Stop after this many iterations 54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 
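# Quick arithmetic on the constants here: with N_FRAMES = 128 and
# FRAME_SIZE = 16, SEQ_LEN (defined below) comes out to 128 * 16 = 2048
# samples, i.e. 2048 / 16000 = 0.128 s of audio, so each truncated BPTT pass
# backpropagates through at most ~128 ms of waveform.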
55 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 56 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 57 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 58 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 59 | 60 | print "Model settings:" 61 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 62 | all_vars = sorted(all_vars, key=lambda x: x[0]) 63 | for var_name, var_value in all_vars: 64 | print "\t{}: {}".format(var_name, var_value) 65 | 66 | def frame_level_rnn(input_sequences, h0, reset): 67 | """ 68 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 69 | h0.shape: (batch size, N_GRUS, DIM) 70 | reset.shape: () 71 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 72 | """ 73 | 74 | learned_h0 = lib.param( 75 | 'FrameLevel.h0', 76 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 77 | ) 78 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 79 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 80 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 81 | 82 | frames = input_sequences.reshape(( 83 | input_sequences.shape[0], 84 | input_sequences.shape[1] / FRAME_SIZE, 85 | FRAME_SIZE 86 | )) 87 | 88 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 89 | # (a reasonable range to pass as inputs to the RNN) 90 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 91 | frames *= lib.floatX(2) 92 | 93 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0]) 94 | grus = [gru0] 95 | for i in xrange(1, N_GRUS): 96 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 97 | grus.append(gru) 98 | 99 | output = lib.ops.Linear( 100 | 'FrameLevel.Output', 101 | DIM, 102 | FRAME_SIZE * DIM, 103 | grus[-1], 104 | initialization='he' 105 | ) 106 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 107 | 108 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 109 | 110 | return (output, last_hidden) 111 | 112 | def sample_level_predictor(frame_level_outputs, prev_samples): 113 | """ 114 | frame_level_outputs.shape: (batch size, DIM) 115 | prev_samples.shape: (batch size, FRAME_SIZE) 116 | output.shape: (batch size, Q_LEVELS) 117 | """ 118 | 119 | prev_samples = lib.ops.Embedding( 120 | 'SampleLevel.Embedding', 121 | Q_LEVELS, 122 | Q_LEVELS, 123 | prev_samples 124 | ).reshape((-1, FRAME_SIZE * Q_LEVELS)) 125 | 126 | out = lib.ops.Linear( 127 | 'SampleLevel.L1_PrevSamples', 128 | FRAME_SIZE * Q_LEVELS, 129 | DIM, 130 | prev_samples, 131 | biases=False, 132 | initialization='he' 133 | ) 134 | out += frame_level_outputs 135 | out = T.nnet.relu(out) 136 | 137 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he') 138 | out = T.nnet.relu(out) 139 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he') 140 | out = T.nnet.relu(out) 141 | 142 | # We apply the softmax later 143 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out) 144 | 145 | sequences = T.imatrix('sequences') 146 | h0 = T.tensor3('h0') 147 | reset = T.iscalar('reset') 148 | 149 | input_sequences = sequences[:, :-FRAME_SIZE] 150 | target_sequences = sequences[:, FRAME_SIZE:] 151 | 152 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset) 153 | 154 | prev_samples = sequences[:, :-1] 
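# The next three lines build, for every timestep, the window of the FRAME_SIZE
# samples immediately preceding it. images2neibs treats the batch as a
# (1, BATCH_SIZE, 1, length) "image" and extracts every (1, FRAME_SIZE) patch
# at stride 1, so consecutive rows are overlapping windows shifted by one
# sample. A toy illustration with FRAME_SIZE == 2 (this file uses 16):
#   [a, b, c, d]  ->  [[a, b], [b, c], [c, d]]
# one window per predicted sample, SEQ_LEN of them per sequence.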
155 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 156 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 157 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 158 | 159 | sample_level_outputs = sample_level_predictor( 160 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 161 | prev_samples 162 | ) 163 | 164 | cost = T.nnet.categorical_crossentropy( 165 | T.nnet.softmax(sample_level_outputs), 166 | target_sequences.flatten() 167 | ).mean() 168 | 169 | # By default we report cross-entropy cost in bits. 170 | # Switch to nats by commenting out this line: 171 | cost = cost * lib.floatX(1.44269504089) 172 | 173 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 174 | lib._train.print_params_info(cost, params) 175 | 176 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 177 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 178 | 179 | updates = lasagne.updates.adam(grads, params, learning_rate=1e-3) 180 | 181 | train_fn = theano.function( 182 | [sequences, h0, reset], 183 | [cost, new_h0], 184 | updates=updates, 185 | on_unused_input='warn' 186 | ) 187 | 188 | frame_level_generate_fn = theano.function( 189 | [sequences, h0, reset], 190 | frame_level_rnn(sequences, h0, reset), 191 | on_unused_input='warn' 192 | ) 193 | 194 | frame_level_outputs = T.matrix('frame_level_outputs') 195 | prev_samples = T.imatrix('prev_samples') 196 | sample_level_generate_fn = theano.function( 197 | [frame_level_outputs, prev_samples], 198 | lib.ops.softmax_and_sample( 199 | sample_level_predictor( 200 | frame_level_outputs, 201 | prev_samples 202 | ) 203 | ), 204 | on_unused_input='warn' 205 | ) 206 | 207 | def generate_and_save_samples(tag): 208 | 209 | def write_audio_file(name, data): 210 | data = data.astype('float32') 211 | data -= data.min() 212 | data /= data.max() 213 | data -= 0.5 214 | data *= 0.95 215 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 216 | 217 | # Generate 5 sample files, each 5 seconds long 218 | N_SEQS = 10 219 | LENGTH = 5*BITRATE 220 | 221 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 222 | samples[:, :FRAME_SIZE] = Q_ZERO 223 | 224 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 225 | frame_level_outputs = None 226 | 227 | for t in xrange(FRAME_SIZE, LENGTH): 228 | 229 | if t % FRAME_SIZE == 0: 230 | frame_level_outputs, h0 = frame_level_generate_fn( 231 | samples[:, t-FRAME_SIZE:t], 232 | h0, 233 | numpy.int32(t == FRAME_SIZE) 234 | ) 235 | 236 | samples[:, t] = sample_level_generate_fn( 237 | frame_level_outputs[:, t % FRAME_SIZE], 238 | samples[:, t-FRAME_SIZE:t] 239 | ) 240 | 241 | for i in xrange(N_SEQS): 242 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 243 | 244 | print "Training!" 245 | total_iters = 0 246 | total_time = 0. 247 | last_print_time = 0. 
248 | last_print_iters = 0 249 | for epoch in itertools.count(): 250 | 251 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 252 | costs = [] 253 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO) 254 | 255 | for seqs, reset in data_feeder: 256 | 257 | start_time = time.time() 258 | cost, h0 = train_fn(seqs, h0, reset) 259 | total_time += time.time() - start_time 260 | total_iters += 1 261 | 262 | costs.append(cost) 263 | 264 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 265 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 266 | 267 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 268 | epoch, 269 | total_iters, 270 | numpy.mean(costs), 271 | total_time, 272 | total_time / total_iters 273 | ) 274 | tag = "iters{}_time{}".format(total_iters, total_time) 275 | generate_and_save_samples(tag) 276 | lib.save_params('params_{}.pkl'.format(tag)) 277 | 278 | costs = [] 279 | last_print_time += PRINT_TIME 280 | last_print_iters += PRINT_ITERS 281 | 282 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 283 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 284 | 285 | print "Done!" 286 | 287 | try: # This only matters on Ishaan's computer 288 | import experiment_tools 289 | experiment_tools.send_sms("done!") 290 | except ImportError: 291 | pass 292 | 293 | sys.exit() -------------------------------------------------------------------------------- /conv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convolutional Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.register_crash_notifier() 11 | experiment_tools.wait_for_gpu(high_priority=True) 12 | except ImportError: 13 | pass 14 | 15 | import numpy 16 | numpy.random.seed(123) 17 | import random 18 | random.seed(123) 19 | 20 | import dataset 21 | 22 | import theano 23 | import theano.tensor as T 24 | import theano.ifelse 25 | import lib 26 | import lasagne 27 | import scipy.io.wavfile 28 | 29 | import time 30 | import functools 31 | import itertools 32 | 33 | # Hyperparams 34 | BATCH_SIZE = 128 35 | SEQ_LEN = 256 36 | DIM = 128 37 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 38 | GRAD_CLIP = 1 39 | 40 | LAYERS = 5 41 | FILTER_SIZE = 17 42 | 43 | # Dataset 44 | DATA_PATH = '/media/seagate/blizzard/parts' 45 | N_FILES = 141703 46 | # DATA_PATH = '/PersimmonData/kiwi_parts' 47 | # N_FILES = 516 48 | BITRATE = 16000 49 | 50 | # Other constants 51 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 52 | PRINT_ITERS = 10 # Print cost, generate samples, save model checkpoint every N iterations. 53 | STOP_ITERS = 1000 # Stop after this many iterations 54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 55 | STOP_TIME = 60*60*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 
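# Sanity check on the receptive field these settings imply: the type-'a' input
# conv sees FILTER_SIZE//2 = 8 past samples, and each of the LAYERS-1 = 4
# type-'b' convs below extends that by another 8, so a prediction conditions
# on at most 5 * 8 = 40 previous samples -- just 2.5 ms of audio at 16 kHz,
# which bounds how much temporal structure this baseline can capture.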
56 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 57 | 58 | print "Model settings:" 59 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 60 | all_vars = sorted(all_vars, key=lambda x: x[0]) 61 | for var_name, var_value in all_vars: 62 | print "\t{}: {}".format(var_name, var_value) 63 | 64 | def MaskedConv1D(name, input_dim, output_dim, filter_size, inputs, mask_type=None, he_init=False): 65 | """ 66 | inputs.shape: (batch size, input_dim, 1, width) 67 | mask_type: None, 'a', 'b' 68 | output.shape: (batch size, output_dim, 1, width) 69 | """ 70 | 71 | if mask_type is not None: 72 | mask = numpy.ones( 73 | (output_dim, input_dim, 1, filter_size), 74 | dtype=theano.config.floatX 75 | ) 76 | center = filter_size//2 77 | mask[:,:,0,center+1:] = 0. 78 | if mask_type == 'a': 79 | mask[:,:,0,center] = 0. 80 | 81 | def uniform(stdev, size): 82 | """uniform distribution with the given stdev and size""" 83 | return numpy.random.uniform( 84 | low=-stdev * numpy.sqrt(3), 85 | high=stdev * numpy.sqrt(3), 86 | size=size 87 | ).astype(theano.config.floatX) 88 | 89 | if mask_type=='a': 90 | n_in = filter_size//2 91 | elif mask_type=='b': 92 | n_in = filter_size//2 + 1 93 | else: 94 | n_in = filter_size 95 | n_in *= input_dim 96 | 97 | if he_init: 98 | init_stdev = numpy.sqrt(2./n_in) 99 | else: 100 | init_stdev = numpy.sqrt(1./n_in) 101 | 102 | filters = lib.param( 103 | name+'.Filters', 104 | uniform( 105 | init_stdev, 106 | (output_dim, input_dim, 1, filter_size) 107 | ) 108 | ) 109 | 110 | if mask_type is not None: 111 | filters = filters * mask 112 | 113 | # TODO benchmark against the lasagne 'conv1d' implementations 114 | result = T.nnet.conv2d(inputs, filters, filter_flip=False, border_mode='half') 115 | 116 | if mask_type is not None: 117 | result = result[:, :, :, :inputs.shape[3]] 118 | 119 | biases = lib.param( 120 | name+'.Biases', 121 | numpy.zeros(output_dim, dtype=theano.config.floatX) 122 | ) 123 | result += biases[None, :, None, None] 124 | 125 | return result 126 | 127 | def Conv1D(name, input_dim, output_dim, filter_size, inputs, mask_type=None, he_init=False): 128 | """ 129 | inputs.shape: (batch size, input_dim, 1, width) 130 | mask_type: None, 'a', 'b' 131 | output.shape: (batch size, output_dim, 1, width) 132 | """ 133 | 134 | # if mask_type is not None: 135 | # mask = numpy.ones( 136 | # (output_dim, input_dim, 1, filter_size), 137 | # dtype=theano.config.floatX 138 | # ) 139 | # center = filter_size//2 140 | # mask[:,:,0,center+1:] = 0. 141 | # if mask_type == 'a': 142 | # mask[:,:,0,center] = 0. 
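# Rather than multiplying a full-width filter by the mask sketched above, this
# version allocates only the causal taps and shifts them into place with
# asymmetric left padding: type 'a' keeps the filter_size//2 taps strictly
# before the center, type 'b' keeps filter_size//2 + 1 taps up to and
# including it, and the [..., :inputs.shape[3]] crop below discards the
# right-hand overhang. Both formulations compute the same convolution; this
# one just avoids multiplying by zeros.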
143 | 144 | if mask_type=='a': 145 | filter_size = filter_size//2 146 | elif mask_type=='b': 147 | filter_size = filter_size//2 + 1 148 | 149 | def uniform(stdev, size): 150 | """uniform distribution with the given stdev and size""" 151 | return numpy.random.uniform( 152 | low=-stdev * numpy.sqrt(3), 153 | high=stdev * numpy.sqrt(3), 154 | size=size 155 | ).astype(theano.config.floatX) 156 | 157 | # if mask_type is not None: 158 | # n_in = numpy.sum(mask) 159 | # else: 160 | n_in = input_dim * filter_size 161 | 162 | if he_init: 163 | init_stdev = numpy.sqrt(2./n_in) 164 | else: 165 | init_stdev = numpy.sqrt(1./n_in) 166 | 167 | filters = lib.param( 168 | name+'.Filters', 169 | uniform( 170 | init_stdev, 171 | (output_dim, input_dim, 1, filter_size) 172 | ) 173 | ) 174 | 175 | # if mask_type is not None: 176 | # filters = filters * mask 177 | 178 | if mask_type=='a': 179 | pad = filter_size 180 | elif mask_type=='b': 181 | pad = filter_size-1 182 | else: 183 | # border mode 'half' 184 | pad = filter_size//2 185 | 186 | # TODO benchmark against the lasagne 'conv1d' implementations 187 | result = T.nnet.conv2d(inputs, filters, filter_flip=False, border_mode=(0,pad)) 188 | 189 | if mask_type is not None: 190 | result = result[:, :, :, :inputs.shape[3]] 191 | 192 | biases = lib.param( 193 | name+'.Biases', 194 | numpy.zeros(output_dim, dtype=theano.config.floatX) 195 | ) 196 | result += biases[None, :, None, None] 197 | 198 | return result 199 | 200 | sequences = T.imatrix('sequences') 201 | 202 | INPUT_DIM = Q_LEVELS 203 | inputs = lib.ops.Embedding('Embedding', Q_LEVELS, Q_LEVELS, sequences) 204 | inputs = inputs.dimshuffle(0, 2, 'x', 1) 205 | 206 | # INPUT_DIM = 1 207 | # inputs = lib.floatX(4)*sequences.astype('float32')/lib.floatX(Q_LEVELS) - lib.floatX(2) 208 | # inputs = inputs[:, None, None, :] 209 | 210 | output = MaskedConv1D('InputConv', INPUT_DIM, DIM, FILTER_SIZE, inputs, mask_type='a', he_init=True) 211 | output = T.nnet.relu(output) 212 | 213 | for i in xrange(1,LAYERS): 214 | output = MaskedConv1D('Conv'+str(i), DIM, DIM, FILTER_SIZE, output, mask_type='b', he_init=True) 215 | output = T.nnet.relu(output) 216 | 217 | output = MaskedConv1D('OutputConv', DIM, Q_LEVELS, 1, output, mask_type='b') 218 | 219 | output = output.dimshuffle(0,2,3,1) # Move the Q_LEVELS dim to the end 220 | cost = T.nnet.categorical_crossentropy( 221 | T.nnet.softmax(output.reshape((-1, Q_LEVELS))), 222 | sequences.flatten() 223 | ).mean() 224 | 225 | # By default we report cross-entropy cost in bits. 226 | # Switch to nats by commenting out this line: 227 | cost = cost * lib.floatX(1.44269504089) 228 | 229 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 230 | lib._train.print_params_info(cost, params) 231 | 232 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 233 | # Do people use grad clipping in convnets? 
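# (Probably not: clipping mainly guards against the exploding gradients of
# long recurrences, and Adam's per-parameter scaling already bounds the step
# size, so leaving it disabled for this feedforward stack seems reasonable.)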
234 | # grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 235 | 236 | updates = lasagne.updates.adam(grads, params) 237 | 238 | train_fn = theano.function( 239 | [sequences], 240 | cost, 241 | updates=updates, 242 | on_unused_input='warn' 243 | ) 244 | 245 | # generate_outputs, generate_new_h0 = sample_level_rnn(sequences, h0, reset) 246 | # generate_fn = theano.function( 247 | # [sequences, h0, reset], 248 | # [lib.ops.softmax_and_sample(generate_outputs), generate_new_h0], 249 | # on_unused_input='warn' 250 | # ) 251 | 252 | # def generate_and_save_samples(tag): 253 | 254 | # def write_audio_file(name, data): 255 | # data = data.astype('float32') 256 | # data -= data.min() 257 | # data /= data.max() 258 | # data -= 0.5 259 | # data *= 0.95 260 | # scipy.io.wavfile.write(name+'.wav', BITRATE, data) 261 | 262 | # # Generate 5 sample files, each 5 seconds long 263 | # N_SEQS = 10 264 | # LENGTH = 5*BITRATE 265 | 266 | # samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 267 | # samples[:, 0] = Q_ZERO 268 | 269 | # h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 270 | 271 | # for t in xrange(1, LENGTH): 272 | # samples[:, t:t+1], h0 = generate_fn( 273 | # samples[:, t-1:t], 274 | # h0, 275 | # numpy.int32(t == 1) 276 | # ) 277 | 278 | # for i in xrange(N_SEQS): 279 | # write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 280 | 281 | print "Training!" 282 | total_iters = 0 283 | total_time = 0. 284 | last_print_time = 0. 285 | last_print_iters = 0 286 | for epoch in itertools.count(): 287 | 288 | costs = [] 289 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, 0, Q_LEVELS, Q_ZERO) 290 | 291 | for seqs, reset in data_feeder: 292 | 293 | start_time = time.time() 294 | cost = train_fn(seqs) 295 | total_time += time.time() - start_time 296 | total_iters += 1 297 | 298 | costs.append(cost) 299 | 300 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 301 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 302 | 303 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 304 | epoch, 305 | total_iters, 306 | numpy.mean(costs), 307 | total_time, 308 | total_time / total_iters 309 | ) 310 | tag = "iters{}_time{}".format(total_iters, total_time) 311 | 312 | # generate_and_save_samples(tag) 313 | # lib.save_params('params_{}.pkl'.format(tag)) 314 | 315 | costs = [] 316 | last_print_time += PRINT_TIME 317 | last_print_iters += PRINT_ITERS 318 | 319 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 320 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 321 | 322 | print "Done!" 
323 | 324 | try: # This only matters on Ishaan's computer 325 | import experiment_tools 326 | experiment_tools.send_sms("done!") 327 | except ImportError: 328 | pass 329 | 330 | sys.exit() -------------------------------------------------------------------------------- /two_tier_conv.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.register_crash_notifier() 11 | experiment_tools.wait_for_gpu(high_priority=False) 12 | except ImportError: 13 | pass 14 | 15 | import numpy 16 | numpy.random.seed(123) 17 | import random 18 | random.seed(123) 19 | 20 | import dataset 21 | 22 | import theano 23 | import theano.tensor as T 24 | import theano.tensor.nnet.neighbours 25 | import theano.ifelse 26 | import lib 27 | import lasagne 28 | import scipy.io.wavfile 29 | 30 | import time 31 | import functools 32 | import itertools 33 | 34 | # Hyperparams 35 | BATCH_SIZE = 128 36 | N_FRAMES = 128 # How many 'frames' to include in each truncated BPTT pass 37 | FRAME_SIZE = 2 # How many samples per frame 38 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 39 | N_GRUS = 1 # How many GRUs to stack in the frame-level model 40 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 41 | GRAD_CLIP = 1 # Elementwise grad clip threshold 42 | 43 | # Dataset 44 | DATA_PATH = '/media/seagate/blizzard/parts' 45 | N_FILES = 141703 46 | # DATA_PATH = '/PersimmonData/kiwi_parts' 47 | # N_FILES = 516 48 | BITRATE = 16000 49 | 50 | # Other constants 51 | TRAIN_MODE = 'time' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 52 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 53 | STOP_ITERS = 100000 # Stop after this many iterations 54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 55 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 56 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 57 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 58 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 59 | 60 | print "Model settings:" 61 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 62 | all_vars = sorted(all_vars, key=lambda x: x[0]) 63 | for var_name, var_value in all_vars: 64 | print "\t{}: {}".format(var_name, var_value) 65 | 66 | def MaskedConv1D(name, input_dim, output_dim, filter_size, inputs, mask_type=None, he_init=False): 67 | """ 68 | inputs.shape: (batch size, input_dim, 1, width) 69 | mask_type: None, 'a', 'b' 70 | output.shape: (batch size, output_dim, 1, width) 71 | """ 72 | 73 | if mask_type is not None: 74 | mask = numpy.ones( 75 | (output_dim, input_dim, 1, filter_size), 76 | dtype=theano.config.floatX 77 | ) 78 | center = filter_size//2 79 | mask[:,:,0,center+1:] = 0. 80 | if mask_type == 'a': 81 | mask[:,:,0,center] = 0. 
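# Concretely, with filter_size == 5 (center == 2) each row of the mask is:
#   mask_type 'b': [1, 1, 1, 0, 0]  (current sample and its past)
#   mask_type 'a': [1, 1, 0, 0, 0]  (strictly past samples only)
# i.e. the usual PixelCNN-style 'a'/'b' masking, applied along time rather
# than space.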
82 | 83 | def uniform(stdev, size): 84 | """uniform distribution with the given stdev and size""" 85 | return numpy.random.uniform( 86 | low=-stdev * numpy.sqrt(3), 87 | high=stdev * numpy.sqrt(3), 88 | size=size 89 | ).astype(theano.config.floatX) 90 | 91 | if mask_type=='a': 92 | n_in = filter_size//2 93 | elif mask_type=='b': 94 | n_in = filter_size//2 + 1 95 | else: 96 | n_in = filter_size 97 | n_in *= input_dim 98 | 99 | if he_init: 100 | init_stdev = numpy.sqrt(2./n_in) 101 | else: 102 | init_stdev = numpy.sqrt(1./n_in) 103 | 104 | filters = lib.param( 105 | name+'.Filters', 106 | uniform( 107 | init_stdev, 108 | (output_dim, input_dim, 1, filter_size) 109 | ) 110 | ) 111 | 112 | if mask_type is not None: 113 | filters = filters * mask 114 | 115 | # TODO benchmark against the lasagne 'conv1d' implementations 116 | result = T.nnet.conv2d(inputs, filters, filter_flip=False, border_mode='half') 117 | 118 | if mask_type is not None: 119 | result = result[:, :, :, :inputs.shape[3]] 120 | 121 | biases = lib.param( 122 | name+'.Biases', 123 | numpy.zeros(output_dim, dtype=theano.config.floatX) 124 | ) 125 | result += biases[None, :, None, None] 126 | 127 | return result 128 | 129 | def frame_level_rnn(input_sequences, h0, reset): 130 | """ 131 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 132 | h0.shape: (batch size, N_GRUS, DIM) 133 | reset.shape: () 134 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 135 | """ 136 | 137 | learned_h0 = lib.param( 138 | 'FrameLevel.h0', 139 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 140 | ) 141 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 142 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 143 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 144 | 145 | frames = input_sequences.reshape(( 146 | input_sequences.shape[0], 147 | input_sequences.shape[1] / FRAME_SIZE, 148 | FRAME_SIZE 149 | )) 150 | 151 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 152 | # (a reasonable range to pass as inputs to the RNN) 153 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 154 | frames *= lib.floatX(2) 155 | 156 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0]) 157 | grus = [gru0] 158 | for i in xrange(1, N_GRUS): 159 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 160 | grus.append(gru) 161 | 162 | output = lib.ops.Linear( 163 | 'FrameLevel.Output', 164 | DIM, 165 | FRAME_SIZE * DIM, 166 | grus[-1], 167 | initialization='he' 168 | ) 169 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 170 | 171 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 172 | 173 | return (output, last_hidden) 174 | 175 | def sample_level_predictor(frame_level_outputs, prev_samples): 176 | """ 177 | frame_level_outputs.shape: (batch size, DIM) 178 | prev_samples.shape: (batch size, FRAME_SIZE) 179 | output.shape: (batch size, Q_LEVELS) 180 | """ 181 | 182 | prev_samples = lib.ops.Embedding( 183 | 'SampleLevel.Embedding', 184 | Q_LEVELS, 185 | Q_LEVELS, 186 | prev_samples 187 | ).reshape((-1, FRAME_SIZE * Q_LEVELS)) 188 | 189 | out = lib.ops.Linear( 190 | 'SampleLevel.L1_PrevSamples', 191 | FRAME_SIZE * Q_LEVELS, 192 | DIM, 193 | prev_samples, 194 | biases=False, 195 | initialization='he' 196 | ) 197 | out += frame_level_outputs 198 | out = T.nnet.relu(out) 199 | 200 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he') 201 | out = 
T.nnet.relu(out) 202 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he') 203 | out = T.nnet.relu(out) 204 | 205 | # We apply the softmax later 206 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out) 207 | 208 | sequences = T.imatrix('sequences') 209 | h0 = T.tensor3('h0') 210 | reset = T.iscalar('reset') 211 | 212 | input_sequences = sequences[:, :-FRAME_SIZE] 213 | target_sequences = sequences[:, FRAME_SIZE:] 214 | 215 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset) 216 | 217 | prev_samples = sequences[:, :-1] 218 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 219 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 220 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 221 | 222 | sample_level_outputs = sample_level_predictor( 223 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 224 | prev_samples 225 | ) 226 | 227 | cost = T.nnet.categorical_crossentropy( 228 | T.nnet.softmax(sample_level_outputs), 229 | target_sequences.flatten() 230 | ).mean() 231 | 232 | # By default we report cross-entropy cost in bits. 233 | # Switch to nats by commenting out this line: 234 | cost = cost * lib.floatX(1.44269504089) 235 | 236 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 237 | lib._train.print_params_info(cost, params) 238 | 239 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 240 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 241 | 242 | updates = lasagne.updates.adam(grads, params, learning_rate=1e-3) 243 | 244 | train_fn = theano.function( 245 | [sequences, h0, reset], 246 | [cost, new_h0], 247 | updates=updates, 248 | on_unused_input='warn' 249 | ) 250 | 251 | frame_level_generate_fn = theano.function( 252 | [sequences, h0, reset], 253 | frame_level_rnn(sequences, h0, reset), 254 | on_unused_input='warn' 255 | ) 256 | 257 | frame_level_outputs = T.matrix('frame_level_outputs') 258 | prev_samples = T.imatrix('prev_samples') 259 | sample_level_generate_fn = theano.function( 260 | [frame_level_outputs, prev_samples], 261 | lib.ops.softmax_and_sample( 262 | sample_level_predictor( 263 | frame_level_outputs, 264 | prev_samples 265 | ) 266 | ), 267 | on_unused_input='warn' 268 | ) 269 | 270 | def generate_and_save_samples(tag): 271 | 272 | def write_audio_file(name, data): 273 | data = data.astype('float32') 274 | data -= data.min() 275 | data /= data.max() 276 | data -= 0.5 277 | data *= 0.95 278 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 279 | 280 | # Generate 5 sample files, each 5 seconds long 281 | N_SEQS = 10 282 | LENGTH = 5*BITRATE 283 | 284 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 285 | samples[:, :FRAME_SIZE] = Q_ZERO 286 | 287 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 288 | frame_level_outputs = None 289 | 290 | for t in xrange(FRAME_SIZE, LENGTH): 291 | 292 | if t % FRAME_SIZE == 0: 293 | frame_level_outputs, h0 = frame_level_generate_fn( 294 | samples[:, t-FRAME_SIZE:t], 295 | h0, 296 | numpy.int32(t == FRAME_SIZE) 297 | ) 298 | 299 | samples[:, t] = sample_level_generate_fn( 300 | frame_level_outputs[:, t % FRAME_SIZE], 301 | samples[:, t-FRAME_SIZE:t] 302 | ) 303 | 304 | for i in xrange(N_SEQS): 305 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 306 | 307 | print "Training!" 308 | total_iters = 0 309 | total_time = 0. 310 | last_print_time = 0. 
311 | last_print_iters = 0 312 | for epoch in itertools.count(): 313 | 314 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 315 | costs = [] 316 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO) 317 | 318 | for seqs, reset in data_feeder: 319 | 320 | start_time = time.time() 321 | cost, h0 = train_fn(seqs, h0, reset) 322 | total_time += time.time() - start_time 323 | total_iters += 1 324 | 325 | costs.append(cost) 326 | 327 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 328 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 329 | 330 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 331 | epoch, 332 | total_iters, 333 | numpy.mean(costs), 334 | total_time, 335 | total_time / total_iters 336 | ) 337 | tag = "iters{}_time{}".format(total_iters, total_time) 338 | generate_and_save_samples(tag) 339 | lib.save_params('params_{}.pkl'.format(tag)) 340 | 341 | costs = [] 342 | last_print_time += PRINT_TIME 343 | last_print_iters += PRINT_ITERS 344 | 345 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 346 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 347 | 348 | print "Done!" 349 | 350 | try: # This only matters on Ishaan's computer 351 | import experiment_tools 352 | experiment_tools.send_sms("done!") 353 | except ImportError: 354 | pass 355 | 356 | sys.exit() -------------------------------------------------------------------------------- /vrnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.wait_for_gpu(high_priority=False) 11 | except ImportError: 12 | pass 13 | 14 | import numpy 15 | numpy.random.seed(123) 16 | import random 17 | random.seed(123) 18 | 19 | import dataset 20 | 21 | import theano 22 | import theano.tensor as T 23 | import theano.ifelse 24 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 25 | import lib 26 | import lasagne 27 | import scipy.io.wavfile 28 | 29 | import time 30 | import functools 31 | import itertools 32 | 33 | theano_srng = RandomStreams(seed=234) 34 | 35 | # Hyperparams 36 | BATCH_SIZE = 128 37 | FRAME_SIZE = 16 38 | N_FRAMES = (32*16)/FRAME_SIZE 39 | SEQ_LEN = FRAME_SIZE*N_FRAMES # How many audio samples to include in each truncated BPTT pass 40 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 41 | LATENT_DIM = 128 42 | N_GRUS = 2 43 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 44 | GRAD_CLIP = 1 # Elementwise grad clip threshold 45 | 46 | VANILLA = False 47 | 48 | ALPHA_ITERS = 10000 49 | 50 | # Dataset 51 | DATA_PATH = '/media/seagate/blizzard/parts' 52 | N_FILES = 141703 53 | # DATA_PATH = '/PersimmonData/kiwi_parts' 54 | # N_FILES = 516 55 | BITRATE = 16000 56 | 57 | # Other constants 58 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 59 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations. 
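# ALPHA_ITERS above sets the KL annealing schedule: in the training loop the
# weight on the KL term ramps linearly from 0 to 1 over the first ALPHA_ITERS
# updates, a common trick to keep the posterior from collapsing onto the prior
# early in VAE training. Setting VANILLA = True drops the latent noise and the
# KL term entirely, leaving a deterministic baseline for comparison.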
60 | STOP_ITERS = 100000 # Stop after this many iterations 61 | GENERATE_SAMPLES_AND_SAVE_PARAMS = True 62 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 63 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 64 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 65 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 66 | SAMPLE_LEN = 5*BITRATE 67 | # SAMPLE_LEN = 1024 68 | 69 | print "Model settings:" 70 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 71 | all_vars = sorted(all_vars, key=lambda x: x[0]) 72 | for var_name, var_value in all_vars: 73 | print "\t{}: {}".format(var_name, var_value) 74 | 75 | def Layer(name, n_in, n_out, inputs): 76 | output = lib.ops.Linear(name, n_in, n_out, inputs, initialization='he') 77 | output = T.nnet.relu(output) 78 | return output 79 | 80 | def MLP(name, n_in, n_out, inputs): 81 | output = Layer(name+'.1', n_in, DIM, inputs) 82 | output = Layer(name+'.2', DIM, DIM, output) 83 | output = Layer(name+'.3', DIM, DIM, output) 84 | output = lib.ops.Linear(name+'.Output', DIM, n_out, output) 85 | return output 86 | 87 | def FrameProcessor(frames): 88 | """ 89 | frames.shape: (batch size, n frames, FRAME_SIZE) 90 | output.shape: (batch size, n frames, DIM) 91 | """ 92 | 93 | embedded = lib.ops.Embedding('FrameEmbedding', Q_LEVELS, Q_LEVELS, frames) 94 | embedded = embedded.reshape((frames.shape[0], frames.shape[1], Q_LEVELS * FRAME_SIZE)) 95 | output = MLP('FrameProcessor', FRAME_SIZE*Q_LEVELS, DIM, embedded) 96 | return output 97 | 98 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 99 | # frames *= lib.floatX(2) 100 | # output = MLP('FrameProcessor', FRAME_SIZE, DIM, frames) 101 | # return output 102 | 103 | def LatentsProcessor(latents): 104 | """ 105 | latents.shape: (batch size, n frames, LATENT_DIM) 106 | output.shape: (batch size, n frames, DIM) 107 | """ 108 | return MLP('LatentsProcessor', LATENT_DIM, DIM, latents) 109 | 110 | def Prior(contexts): 111 | """ 112 | contexts.shape: (batch size, n frames, DIM) 113 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM) 114 | """ 115 | mu_and_log_sigma = MLP('Prior', DIM, 2*LATENT_DIM, contexts) 116 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:] 117 | 118 | def Encoder(processed_frames, contexts): 119 | """ 120 | processed_frames.shape: (batch size, n frames, DIM) 121 | contexts.shape: (batch size, n frames, DIM) 122 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM) 123 | """ 124 | inputs = T.concatenate([ 125 | processed_frames, 126 | contexts 127 | ], axis=2) 128 | mu_and_log_sigma = MLP('Encoder', 2*DIM, 2*LATENT_DIM, inputs) 129 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:] 130 | 131 | def Decoder(latents, contexts, prevs): 132 | """ 133 | latents.shape: (batch size, n frames, LATENT_DIM) 134 | contexts.shape: (batch size, n frames, DIM) 135 | prevs.shape: (batch size, n frames * FRAME_SIZE) 136 | outputs: (batch size, n frames, FRAME_SIZE, Q_LEVELS) 137 | """ 138 | inputs = T.concatenate([ 139 | LatentsProcessor(latents), 140 | contexts 141 | ], axis=2) 142 | output = MLP('Decoder', 2*DIM, FRAME_SIZE*Q_LEVELS, inputs) 143 | return output.reshape((output.shape[0], output.shape[1], FRAME_SIZE, Q_LEVELS)) 144 | 145 | def Recurrence(processed_frames, h0, reset): 
146 | """ 147 | processed_frames.shape: (batch size, n frames, DIM) 148 | h0.shape: (batch size, N_GRUS, DIM) 149 | reset.shape: () 150 | output.shape: (batch size, n frames, DIM) 151 | """ 152 | 153 | # print "warning no recurrence" 154 | # return T.zeros_like(processed_frames), h0 155 | 156 | learned_h0 = lib.param( 157 | 'Recurrence.h0', 158 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 159 | ) 160 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 161 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 162 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 163 | 164 | gru0 = lib.ops.LowMemGRU('Recurrence.GRU0', DIM, DIM, processed_frames, h0=h0[:, 0]) 165 | grus = [gru0] 166 | for i in xrange(1, N_GRUS): 167 | gru = lib.ops.LowMemGRU('Recurrence.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 168 | grus.append(gru) 169 | 170 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 171 | 172 | return (grus[-1], last_hidden) 173 | 174 | 175 | sequences = T.imatrix('sequences') 176 | h0 = T.tensor3('h0') 177 | reset = T.iscalar('reset') 178 | 179 | frames = sequences.reshape((sequences.shape[0], -1, FRAME_SIZE)) 180 | processed_frames = FrameProcessor(frames) 181 | 182 | contexts, new_h0 = Recurrence(processed_frames[:,:-1], h0, reset) 183 | 184 | mu_prior, log_sigma_prior = Prior(contexts) 185 | mu_post, log_sigma_post = Encoder(processed_frames[:,1:], contexts) 186 | 187 | # log_sigma_prior = T.log(T.nnet.softplus(log_sigma_prior)) 188 | # log_sigma_post = T.log(T.nnet.softplus(log_sigma_post)) 189 | 190 | eps = theano_srng.normal(mu_post.shape).astype('float32') 191 | latents = mu_post 192 | if not VANILLA: 193 | latents += (T.exp(log_sigma_post) * eps) 194 | else: 195 | print "warning no latent noise" 196 | 197 | reconstructions = Decoder(latents, contexts, sequences[:, FRAME_SIZE-1:-1]) 198 | 199 | reconst_cost = T.nnet.categorical_crossentropy( 200 | T.nnet.softmax(reconstructions.reshape((-1, Q_LEVELS))), 201 | frames[:,1:].flatten() 202 | ).mean() 203 | reconst_cost.name = 'reconst_cost' 204 | 205 | 206 | def KLGaussianGaussian(mu1, sig1, mu2, sig2): 207 | """ 208 | (adapted from https://github.com/jych/cle) 209 | mu1, sig1 = posterior mu and *log* sigma 210 | mu2, sig2 = prior mu and *log* sigma 211 | """ 212 | # 0.5 * (1 + 2*log_sigma - mu**2 - T.exp(2*log_sigma)).mean(axis=0).sum() 213 | kl = 0.5 * (2*sig2 - 2*sig1 + (T.exp(2*sig1) + (mu1 - mu2)**2) / T.exp(2*sig2) - 1) 214 | return kl 215 | 216 | reg_cost = KLGaussianGaussian( 217 | mu_post, 218 | log_sigma_post, 219 | mu_prior, 220 | log_sigma_prior 221 | ) 222 | reg_cost = reg_cost.sum() / T.cast(frames[:,1:].flatten().shape[0], 'float32') 223 | 224 | # By default we report cross-entropy cost in bits. 
# Switch to nats by commenting out this line: 226 | reg_cost = reg_cost * lib.floatX(1.44269504089) 227 | reconst_cost = reconst_cost * lib.floatX(1.44269504089) 228 | 229 | alpha = T.scalar('alpha') 230 | cost = reconst_cost 231 | if not VANILLA: 232 | cost += (alpha * reg_cost) 233 | 234 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 235 | lib._train.print_params_info(cost, params) 236 | 237 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 238 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 239 | 240 | updates = lasagne.updates.adam(grads, params) 241 | 242 | train_fn = theano.function( 243 | [sequences, h0, reset, alpha], 244 | [reg_cost, reconst_cost, cost, new_h0], 245 | updates=updates, 246 | on_unused_input='warn' 247 | ) 248 | 249 | gen_fn_contexts, gen_fn_new_h0 = Recurrence(processed_frames, h0, reset) 250 | gen_recurrence_fn = theano.function( 251 | [sequences, h0, reset], 252 | [gen_fn_contexts, gen_fn_new_h0], 253 | on_unused_input='warn' 254 | ) 255 | 256 | gen_vae_fn = theano.function( 257 | [contexts], 258 | lib.ops.softmax_and_sample( 259 | Decoder( 260 | mu_prior + theano_srng.normal(mu_prior.shape).astype('float32') * T.exp(log_sigma_prior), 261 | contexts, None # Decoder takes (latents, contexts, prevs) but never reads prevs; None fills the slot 262 | ) 263 | ), 264 | on_unused_input='warn' 265 | ) 266 | 267 | def generate_and_save_samples(tag): 268 | 269 | def write_audio_file(name, data): 270 | data = data.astype('float32') 271 | data -= data.min() 272 | data /= data.max() 273 | data -= 0.5 274 | data *= 0.95 275 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 276 | 277 | # Generate 10 sample files, each 5 seconds long 278 | N_SEQS = 10 279 | LENGTH = SAMPLE_LEN - (SAMPLE_LEN%FRAME_SIZE) 280 | 281 | samples = numpy.zeros((N_SEQS, LENGTH/FRAME_SIZE, FRAME_SIZE), dtype='int32') 282 | samples[:, 0] = Q_ZERO 283 | 284 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 285 | contexts, h0 = gen_recurrence_fn(samples[:,0], h0, numpy.int32(1)) 286 | 287 | for frame_i in xrange(1, LENGTH/FRAME_SIZE): 288 | samples[:,frame_i:frame_i+1] = gen_vae_fn(contexts) 289 | contexts, h0 = gen_recurrence_fn(samples[:,frame_i], h0, numpy.int32(0)) 290 | 291 | for i in xrange(N_SEQS): 292 | write_audio_file("sample_{}_{}".format(tag, i), samples[i].reshape((-1))) 293 | 294 | print "Training!" 295 | total_iters = 0 296 | total_time = 0. 297 | last_print_time = 0.
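# Note on generation above: sampling is ancestral at the frame level. Each
# step draws z ~ N(mu_prior, sigma_prior) given the running context, decodes
# all FRAME_SIZE samples of the frame at once, then feeds the sampled frame
# back through the recurrence to produce the next context. Within a frame the
# samples are conditionally independent given (z, context) -- unlike the
# two-tier models, there is no sample-level autoregression here.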
298 | last_print_iters = 0 299 | reg_costs = [] 300 | reconst_costs = [] 301 | costs = [] 302 | for epoch in itertools.count(): 303 | 304 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 305 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO) 306 | 307 | def sigmoid(x): 308 | return 1 / (1 + numpy.exp(-x)) 309 | 310 | for seqs, reset in data_feeder: 311 | 312 | # alpha = lib.floatX(sigmoid((total_iters - ALPHA_B)/float(ALPHA_A))) 313 | # if alpha > 0.99: 314 | # alpha = lib.floatX(1) 315 | # if alpha < 1e-5: 316 | # alpha = lib.floatX(1e-5) 317 | 318 | # alpha = lib.floatX(0) 319 | 320 | alpha = lib.floatX(float(total_iters) / ALPHA_ITERS) 321 | if alpha > 1: 322 | alpha = lib.floatX(1) 323 | 324 | start_time = time.time() 325 | reg_cost, reconst_cost, cost, h0 = train_fn(seqs, h0, reset, alpha) 326 | total_time += time.time() - start_time 327 | total_iters += 1 328 | 329 | reg_costs.append(reg_cost) 330 | reconst_costs.append(reconst_cost) 331 | costs.append(cost) 332 | 333 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 334 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 335 | 336 | print "epoch:{}\ttotal iters:{}\talpha:{}\treg:{}\treconst:{}\tfull:{}\ttotal time:{}\ttime per iter:{}".format( 337 | epoch, 338 | total_iters, 339 | alpha, 340 | numpy.mean(reg_costs), 341 | numpy.mean(reconst_costs), 342 | numpy.mean(costs), 343 | total_time, 344 | total_time / total_iters 345 | ) 346 | tag = "iters{}_time{}".format(total_iters, total_time) 347 | 348 | if GENERATE_SAMPLES_AND_SAVE_PARAMS: 349 | generate_and_save_samples(tag) 350 | lib.save_params('params_{}.pkl'.format(tag)) 351 | 352 | reg_costs = [] 353 | reconst_costs = [] 354 | costs = [] 355 | last_print_time += PRINT_TIME 356 | last_print_iters += PRINT_ITERS 357 | 358 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 359 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 360 | 361 | print "Done!" 362 | 363 | try: # This only matters on Ishaan's computer 364 | import experiment_tools 365 | experiment_tools.send_sms("done!") 366 | except ImportError: 367 | pass 368 | 369 | sys.exit() -------------------------------------------------------------------------------- /vrnn_ar.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | import os, sys 6 | sys.path.append(os.getcwd()) 7 | 8 | try: # This only matters on Ishaan's computer 9 | import experiment_tools 10 | experiment_tools.wait_for_gpu(high_priority=False) 11 | except ImportError: 12 | pass 13 | 14 | import numpy 15 | numpy.random.seed(123) 16 | import random 17 | random.seed(123) 18 | 19 | import dataset 20 | 21 | import theano 22 | import theano.tensor as T 23 | import theano.ifelse 24 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 25 | import lib 26 | import lasagne 27 | import scipy.io.wavfile 28 | 29 | import time 30 | import functools 31 | import itertools 32 | 33 | theano_srng = RandomStreams(seed=234) 34 | 35 | # Hyperparams 36 | BATCH_SIZE = 128 37 | FRAME_SIZE = 16 38 | N_FRAMES = (32*16)/FRAME_SIZE 39 | SEQ_LEN = FRAME_SIZE*N_FRAMES # How many audio samples to include in each truncated BPTT pass 40 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 41 | LATENT_DIM = 128 42 | N_GRUS = 2 43 | Q_LEVELS = 256 # How many levels to use when discretizing samples. 
e.g. 256 = 8-bit scalar quantization 44 | GRAD_CLIP = 1 # Elementwise grad clip threshold 45 | 46 | VANILLA = False 47 | 48 | ALPHA_ITERS = 10000 49 | 50 | # Dataset 51 | DATA_PATH = '/media/seagate/blizzard/parts' 52 | N_FILES = 141703 53 | # DATA_PATH = '/PersimmonData/kiwi_parts' 54 | # N_FILES = 516 55 | BITRATE = 16000 56 | 57 | # Other constants 58 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 59 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations. 60 | STOP_ITERS = 100000 # Stop after this many iterations 61 | GENERATE_SAMPLES_AND_SAVE_PARAMS = True 62 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 63 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 64 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 65 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 66 | SAMPLE_LEN = 5*BITRATE 67 | # SAMPLE_LEN = 1024 68 | 69 | print "Model settings:" 70 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 71 | all_vars = sorted(all_vars, key=lambda x: x[0]) 72 | for var_name, var_value in all_vars: 73 | print "\t{}: {}".format(var_name, var_value) 74 | 75 | def Layer(name, n_in, n_out, inputs): 76 | output = lib.ops.Linear(name, n_in, n_out, inputs, initialization='he') 77 | output = T.nnet.relu(output) 78 | return output 79 | 80 | def MLP(name, n_in, n_out, inputs): 81 | output = Layer(name+'.1', n_in, DIM, inputs) 82 | output = Layer(name+'.2', DIM, DIM, output) 83 | output = Layer(name+'.3', DIM, DIM, output) 84 | output = lib.ops.Linear(name+'.Output', DIM, n_out, output) 85 | return output 86 | 87 | def FrameProcessor(frames): 88 | """ 89 | frames.shape: (batch size, n frames, FRAME_SIZE) 90 | output.shape: (batch size, n frames, DIM) 91 | """ 92 | 93 | embedded = lib.ops.Embedding('FrameEmbedding', Q_LEVELS, Q_LEVELS, frames) 94 | embedded = embedded.reshape((frames.shape[0], frames.shape[1], Q_LEVELS * FRAME_SIZE)) 95 | output = MLP('FrameProcessor', FRAME_SIZE*Q_LEVELS, DIM, embedded) 96 | return output 97 | 98 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 99 | # frames *= lib.floatX(2) 100 | # output = MLP('FrameProcessor', FRAME_SIZE, DIM, frames) 101 | # return output 102 | 103 | def LatentsProcessor(latents): 104 | """ 105 | latents.shape: (batch size, n frames, LATENT_DIM) 106 | output.shape: (batch size, n frames, DIM) 107 | """ 108 | return MLP('LatentsProcessor', LATENT_DIM, DIM, latents) 109 | 110 | def Prior(contexts): 111 | """ 112 | contexts.shape: (batch size, n frames, DIM) 113 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM) 114 | """ 115 | mu_and_log_sigma = MLP('Prior', DIM, 2*LATENT_DIM, contexts) 116 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:] 117 | 118 | def Encoder(processed_frames, contexts): 119 | """ 120 | processed_frames.shape: (batch size, n frames, DIM) 121 | contexts.shape: (batch size, n frames, DIM) 122 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM) 123 | """ 124 | inputs = T.concatenate([ 125 | processed_frames, 126 | contexts 127 | ], axis=2) 128 | mu_and_log_sigma = MLP('Encoder', 2*DIM, 2*LATENT_DIM, inputs) 129 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:] 130 | 131 | def Decoder(latents, 
contexts, prevs): 132 | """ 133 | latents.shape: (batch size, n frames, LATENT_DIM) 134 | contexts.shape: (batch size, n frames, DIM) 135 | prevs.shape: (batch size, n frames * FRAME_SIZE) 136 | outputs: (batch size, n frames, FRAME_SIZE, Q_LEVELS) 137 | """ 138 | inputs = T.concatenate([ 139 | LatentsProcessor(latents), 140 | contexts 141 | ], axis=2) 142 | output = MLP('Decoder', 2*DIM, FRAME_SIZE*Q_LEVELS, inputs) 143 | return output.reshape((output.shape[0], output.shape[1], FRAME_SIZE, Q_LEVELS)) 144 | 145 | def Recurrence(processed_frames, h0, reset): 146 | """ 147 | processed_frames.shape: (batch size, n frames, DIM) 148 | h0.shape: (batch size, N_GRUS, DIM) 149 | reset.shape: () 150 | output.shape: (batch size, n frames, DIM) 151 | """ 152 | 153 | # print "warning no recurrence" 154 | # return T.zeros_like(processed_frames), h0 155 | 156 | learned_h0 = lib.param( 157 | 'Recurrence.h0', 158 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 159 | ) 160 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 161 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 162 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 163 | 164 | gru0 = lib.ops.LowMemGRU('Recurrence.GRU0', DIM, DIM, processed_frames, h0=h0[:, 0]) 165 | grus = [gru0] 166 | for i in xrange(1, N_GRUS): 167 | gru = lib.ops.LowMemGRU('Recurrence.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 168 | grus.append(gru) 169 | 170 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 171 | 172 | return (grus[-1], last_hidden) 173 | 174 | 175 | sequences = T.imatrix('sequences') 176 | h0 = T.tensor3('h0') 177 | reset = T.iscalar('reset') 178 | 179 | frames = sequences.reshape((sequences.shape[0], -1, FRAME_SIZE)) 180 | processed_frames = FrameProcessor(frames) 181 | 182 | contexts, new_h0 = Recurrence(processed_frames[:,:-1], h0, reset) 183 | 184 | mu_prior, log_sigma_prior = Prior(contexts) 185 | mu_post, log_sigma_post = Encoder(processed_frames[:,1:], contexts) 186 | 187 | # log_sigma_prior = T.log(T.nnet.softplus(log_sigma_prior)) 188 | # log_sigma_post = T.log(T.nnet.softplus(log_sigma_post)) 189 | 190 | eps = theano_srng.normal(mu_post.shape).astype('float32') 191 | latents = mu_post 192 | if not VANILLA: 193 | latents += (T.exp(log_sigma_post) * eps) 194 | else: 195 | print "warning no latent noise" 196 | 197 | reconstructions = Decoder(latents, contexts, sequences[:, FRAME_SIZE-1:-1]) 198 | 199 | reconst_cost = T.nnet.categorical_crossentropy( 200 | T.nnet.softmax(reconstructions.reshape((-1, Q_LEVELS))), 201 | frames[:,1:].flatten() 202 | ).mean() 203 | reconst_cost.name = 'reconst_cost' 204 | 205 | 206 | def KLGaussianGaussian(mu1, sig1, mu2, sig2): 207 | """ 208 | (adapted from https://github.com/jych/cle) 209 | mu1, sig1 = posterior mu and *log* sigma 210 | mu2, sig2 = prior mu and *log* sigma 211 | """ 212 | # 0.5 * (1 + 2*log_sigma - mu**2 - T.exp(2*log_sigma)).mean(axis=0).sum() 213 | kl = 0.5 * (2*sig2 - 2*sig1 + (T.exp(2*sig1) + (mu1 - mu2)**2) / T.exp(2*sig2) - 1) 214 | return kl 215 | 216 | reg_cost = KLGaussianGaussian( 217 | mu_post, 218 | log_sigma_post, 219 | mu_prior, 220 | log_sigma_prior 221 | ) 222 | reg_cost = reg_cost.sum() / T.cast(frames[:,1:].flatten().shape[0], 'float32') 223 | 224 | # By default we report cross-entropy cost in bits. 
# Switch to nats by commenting out this line: 226 | reg_cost = reg_cost * lib.floatX(1.44269504089) 227 | reconst_cost = reconst_cost * lib.floatX(1.44269504089) 228 | 229 | alpha = T.scalar('alpha') 230 | cost = reconst_cost 231 | if not VANILLA: 232 | cost += (alpha * reg_cost) 233 | 234 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 235 | lib._train.print_params_info(cost, params) 236 | 237 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 238 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 239 | 240 | updates = lasagne.updates.adam(grads, params) 241 | 242 | train_fn = theano.function( 243 | [sequences, h0, reset, alpha], 244 | [reg_cost, reconst_cost, cost, new_h0], 245 | updates=updates, 246 | on_unused_input='warn' 247 | ) 248 | 249 | gen_fn_contexts, gen_fn_new_h0 = Recurrence(processed_frames, h0, reset) 250 | gen_recurrence_fn = theano.function( 251 | [sequences, h0, reset], 252 | [gen_fn_contexts, gen_fn_new_h0], 253 | on_unused_input='warn' 254 | ) 255 | 256 | gen_vae_fn = theano.function( 257 | [contexts], 258 | lib.ops.softmax_and_sample( 259 | Decoder( 260 | mu_prior + theano_srng.normal(mu_prior.shape).astype('float32') * T.exp(log_sigma_prior), 261 | contexts, None # Decoder takes (latents, contexts, prevs) but never reads prevs; None fills the slot 262 | ) 263 | ), 264 | on_unused_input='warn' 265 | ) 266 | 267 | def generate_and_save_samples(tag): 268 | 269 | def write_audio_file(name, data): 270 | data = data.astype('float32') 271 | data -= data.min() 272 | data /= data.max() 273 | data -= 0.5 274 | data *= 0.95 275 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 276 | 277 | # Generate 10 sample files, each 5 seconds long 278 | N_SEQS = 10 279 | LENGTH = SAMPLE_LEN - (SAMPLE_LEN%FRAME_SIZE) 280 | 281 | samples = numpy.zeros((N_SEQS, LENGTH/FRAME_SIZE, FRAME_SIZE), dtype='int32') 282 | samples[:, 0] = Q_ZERO 283 | 284 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 285 | contexts, h0 = gen_recurrence_fn(samples[:,0], h0, numpy.int32(1)) 286 | 287 | for frame_i in xrange(1, LENGTH/FRAME_SIZE): 288 | samples[:,frame_i:frame_i+1] = gen_vae_fn(contexts) 289 | contexts, h0 = gen_recurrence_fn(samples[:,frame_i], h0, numpy.int32(0)) 290 | 291 | for i in xrange(N_SEQS): 292 | write_audio_file("sample_{}_{}".format(tag, i), samples[i].reshape((-1))) 293 | 294 | print "Training!" 295 | total_iters = 0 296 | total_time = 0. 297 | last_print_time = 0.
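# As in vrnn.py, Decoder never actually reads `prevs`, so despite this file's
# _ar suffix the samples within a frame are still conditionally independent
# given the latent and the context; presumably the sample-level autoregression
# the name promises was meant to enter through `prevs` and was never wired in.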
298 | last_print_iters = 0 299 | reg_costs = [] 300 | reconst_costs = [] 301 | costs = [] 302 | for epoch in itertools.count(): 303 | 304 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 305 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO) 306 | 307 | def sigmoid(x): 308 | return 1 / (1 + numpy.exp(-x)) 309 | 310 | for seqs, reset in data_feeder: 311 | 312 | # alpha = lib.floatX(sigmoid((total_iters - ALPHA_B)/float(ALPHA_A))) 313 | # if alpha > 0.99: 314 | # alpha = lib.floatX(1) 315 | # if alpha < 1e-5: 316 | # alpha = lib.floatX(1e-5) 317 | 318 | # alpha = lib.floatX(0) 319 | 320 | alpha = lib.floatX(float(total_iters) / ALPHA_ITERS) 321 | if alpha > 1: 322 | alpha = lib.floatX(1) 323 | 324 | start_time = time.time() 325 | reg_cost, reconst_cost, cost, h0 = train_fn(seqs, h0, reset, alpha) 326 | total_time += time.time() - start_time 327 | total_iters += 1 328 | 329 | reg_costs.append(reg_cost) 330 | reconst_costs.append(reconst_cost) 331 | costs.append(cost) 332 | 333 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 334 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 335 | 336 | print "epoch:{}\ttotal iters:{}\talpha:{}\treg:{}\treconst:{}\tfull:{}\ttotal time:{}\ttime per iter:{}".format( 337 | epoch, 338 | total_iters, 339 | alpha, 340 | numpy.mean(reg_costs), 341 | numpy.mean(reconst_costs), 342 | numpy.mean(costs), 343 | total_time, 344 | total_time / total_iters 345 | ) 346 | tag = "iters{}_time{}".format(total_iters, total_time) 347 | 348 | if GENERATE_SAMPLES_AND_SAVE_PARAMS: 349 | generate_and_save_samples(tag) 350 | lib.save_params('params_{}.pkl'.format(tag)) 351 | 352 | reg_costs = [] 353 | reconst_costs = [] 354 | costs = [] 355 | last_print_time += PRINT_TIME 356 | last_print_iters += PRINT_ITERS 357 | 358 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 359 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 360 | 361 | print "Done!" 
362 | 363 | try: # This only matters on Ishaan's computer 364 | import experiment_tools 365 | experiment_tools.send_sms("done!") 366 | except ImportError: 367 | pass 368 | 369 | sys.exit() -------------------------------------------------------------------------------- /three_tier.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Speech Generation Model 3 | Ishaan Gulrajani 4 | """ 5 | 6 | import os, sys 7 | sys.path.append(os.getcwd()) 8 | 9 | try: # This only matters on Ishaan's computer 10 | import experiment_tools 11 | experiment_tools.register_crash_notifier() 12 | experiment_tools.wait_for_gpu(high_priority=False, debug=True) 13 | except ImportError: 14 | pass 15 | 16 | import numpy 17 | numpy.random.seed(123) 18 | import random 19 | random.seed(123) 20 | 21 | import dataset 22 | 23 | import theano 24 | import theano.tensor as T 25 | import theano.tensor.nnet.neighbours 26 | import theano.ifelse 27 | import lib 28 | import lasagne 29 | import scipy.io.wavfile 30 | 31 | import time 32 | import functools 33 | import itertools 34 | 35 | # Hyperparams 36 | BATCH_SIZE = 128 37 | SEQ_LEN = 512 # How many samples to include in each truncated BPTT pass 38 | PRE_SEQ_LEN = 1024 39 | FRAME_SIZE = 2 # How many samples per frame 40 | N_GRUS = 1 # How many GRUs to stack in the frame-level model 41 | BIG_FRAME_SIZE = 8 # how many samples per big frame 42 | N_BIG_GRUS = 4 # how many GRUs to stack in the big-frame-level model 43 | assert(SEQ_LEN % BIG_FRAME_SIZE == 0) 44 | assert(BIG_FRAME_SIZE % FRAME_SIZE == 0) 45 | DIM = 1024 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 46 | BIG_DIM = 1024 # dimensionality for the slowest level 47 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 48 | GRAD_CLIP = 1 # Elementwise grad clip threshold 49 | 50 | # Dataset 51 | DATA_PATH = '/media/seagate/blizzard/parts' 52 | N_FILES = 141703 53 | # DATA_PATH = '/PersimmonData/kiwi_parts' 54 | # N_FILES = 516 55 | BITRATE = 16000 56 | 57 | # Other constants 58 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 59 | N_FRAMES = SEQ_LEN / FRAME_SIZE # Number of frames in each truncated BPTT pass 60 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude 61 | 62 | # Pretrain loop 63 | PRE_TRAIN_MODE = 'time' # only time supported right now 64 | PRE_PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds. 65 | PRE_STOP_TIME = 60*60*4 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 66 | PRE_PRINT_ITERS = 0 67 | PRE_STOP_ITERS = 0 68 | 69 | # in between "pretraining" and "fine-tuning" (i.e. end-to-end training) there's a period 70 | # where we only train the bottom levels, so that when we train end-to-end we 71 | # don't screw up the top levels with gradients from the random bottom levels 72 | # if PRE_STOP_TIME > 0: 73 | # TIME_BEFORE_FINETUNE = 60*60*1 74 | TIME_BEFORE_FINETUNE = 0 75 | 76 | # Train loop 77 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME 78 | PRINT_ITERS = 1 # Print cost, generate samples, save model checkpoint every N iterations. 79 | STOP_ITERS = 100000 # Stop after this many iterations 80 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
81 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 82 | 83 | STOP_TIME -= PRE_STOP_TIME 84 | 85 | print "Model settings:" 86 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 87 | all_vars = sorted(all_vars, key=lambda x: x[0]) 88 | for var_name, var_value in all_vars: 89 | print "\t{}: {}".format(var_name, var_value) 90 | 91 | def big_frame_level_rnn(input_sequences, h0, reset): 92 | """ 93 | input_sequences.shape: (batch size, n big frames * BIG_FRAME_SIZE) 94 | h0.shape: (batch size, N_BIG_GRUS, BIG_DIM) 95 | reset.shape: () 96 | output[0].shape: (batch size, n frames, DIM) 97 | output[1].shape: same as h0.shape 98 | output[2].shape: (batch size, seq len, Q_LEVELS) 99 | """ 100 | 101 | learned_h0 = lib.param( 102 | 'BigFrameLevel.h0', 103 | numpy.zeros((N_BIG_GRUS, BIG_DIM), dtype=theano.config.floatX) 104 | ) 105 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_GRUS, BIG_DIM) 106 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 107 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 108 | 109 | frames = input_sequences.reshape(( 110 | input_sequences.shape[0], 111 | input_sequences.shape[1] / BIG_FRAME_SIZE, 112 | BIG_FRAME_SIZE 113 | )) 114 | 115 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 116 | # (a reasonable range to pass as inputs to the RNN) 117 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 118 | frames *= lib.floatX(2) 119 | 120 | gru0 = lib.ops.LowMemGRU('BigFrameLevel.GRU0', BIG_FRAME_SIZE, BIG_DIM, frames, h0=h0[:, 0]) 121 | grus = [gru0] 122 | for i in xrange(1, N_BIG_GRUS): 123 | gru = lib.ops.LowMemGRU('BigFrameLevel.GRU'+str(i), BIG_DIM, BIG_DIM, grus[-1], h0=h0[:, i]) 124 | grus.append(gru) 125 | 126 | output = lib.ops.Linear( 127 | 'BigFrameLevel.Output', 128 | BIG_DIM, 129 | DIM * BIG_FRAME_SIZE / FRAME_SIZE, 130 | grus[-1] 131 | ) 132 | output = output.reshape((output.shape[0], output.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)) 133 | 134 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 135 | 136 | independent_preds = lib.ops.Linear( 137 | 'BigFrameLevel.IndependentPreds', 138 | BIG_DIM, 139 | Q_LEVELS * BIG_FRAME_SIZE, 140 | grus[-1] 141 | ) 142 | independent_preds = independent_preds.reshape((independent_preds.shape[0], independent_preds.shape[1] * BIG_FRAME_SIZE, Q_LEVELS)) 143 | 144 | return (output, last_hidden, independent_preds) 145 | 146 | def frame_level_rnn(input_sequences, other_input, h0, reset): 147 | """ 148 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 149 | other_input.shape: (batch size, n frames, DIM) 150 | h0.shape: (batch size, N_GRUS, DIM) 151 | reset.shape: () 152 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 153 | """ 154 | 155 | learned_h0 = lib.param( 156 | 'FrameLevel.h0', 157 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 158 | ) 159 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 160 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 161 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 162 | 163 | frames = input_sequences.reshape(( 164 | input_sequences.shape[0], 165 | input_sequences.shape[1] / FRAME_SIZE, 166 | FRAME_SIZE 167 | )) 168 | 169 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 170 | # (a reasonable range to pass as inputs to the RNN) 171 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 172 | frames *= 
lib.floatX(2) 173 | 174 | gru_input = lib.ops.Linear('FrameLevel.InputExpand', FRAME_SIZE, DIM, frames) + other_input 175 | 176 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', DIM, DIM, gru_input, h0=h0[:, 0]) 177 | grus = [gru0] 178 | for i in xrange(1, N_GRUS): 179 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) 180 | grus.append(gru) 181 | 182 | output = lib.ops.Linear( 183 | 'FrameLevel.Output', 184 | DIM, 185 | FRAME_SIZE * DIM, 186 | grus[-1], 187 | initialization='he' 188 | ) 189 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 190 | 191 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) 192 | 193 | return (output, last_hidden) 194 | 195 | def sample_level_predictor(frame_level_outputs, prev_samples): 196 | """ 197 | frame_level_outputs.shape: (batch size, DIM) 198 | prev_samples.shape: (batch size, FRAME_SIZE) 199 | output.shape: (batch size, Q_LEVELS) 200 | """ 201 | 202 | prev_samples = lib.ops.Embedding( 203 | 'SampleLevel.Embedding', 204 | Q_LEVELS, 205 | Q_LEVELS, 206 | prev_samples 207 | ).reshape((-1, FRAME_SIZE * Q_LEVELS)) 208 | 209 | out = lib.ops.Linear( 210 | 'SampleLevel.L1_PrevSamples', 211 | FRAME_SIZE * Q_LEVELS, 212 | DIM, 213 | prev_samples, 214 | biases=False, 215 | initialization='he' 216 | ) 217 | out += frame_level_outputs 218 | out = T.nnet.relu(out) 219 | 220 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he') 221 | out = T.nnet.relu(out) 222 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he') 223 | out = T.nnet.relu(out) 224 | 225 | # We apply the softmax later 226 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out) 227 | 228 | sequences = T.imatrix('sequences') 229 | h0 = T.tensor3('h0') 230 | big_h0 = T.tensor3('big_h0') 231 | reset = T.iscalar('reset') 232 | 233 | big_input_sequences = sequences[:, :-BIG_FRAME_SIZE] 234 | input_sequences = sequences[:, BIG_FRAME_SIZE-FRAME_SIZE:-FRAME_SIZE] 235 | target_sequences = sequences[:, BIG_FRAME_SIZE:] 236 | 237 | big_frame_level_outputs, new_big_h0, big_frame_independent_preds = big_frame_level_rnn(big_input_sequences, big_h0, reset) 238 | 239 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset) 240 | 241 | prev_samples = sequences[:, BIG_FRAME_SIZE-FRAME_SIZE:-1] 242 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 243 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 244 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 245 | 246 | sample_level_outputs = sample_level_predictor( 247 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 248 | prev_samples 249 | ) 250 | 251 | cost = T.nnet.categorical_crossentropy( 252 | T.nnet.softmax(sample_level_outputs), 253 | target_sequences.flatten() 254 | ).mean() 255 | 256 | # By default we report cross-entropy cost in bits. 
257 | # Switch to nats by commenting out this line: 258 | cost = cost * lib.floatX(1.44269504089) 259 | 260 | ip_cost = lib.floatX(1.44269504089) * T.nnet.categorical_crossentropy( 261 | T.nnet.softmax(big_frame_independent_preds.reshape((-1, Q_LEVELS))), 262 | target_sequences.flatten() 263 | ).mean() 264 | 265 | all_params = lib.search(cost, lambda x: hasattr(x, 'param')) 266 | ip_params = lib.search(ip_cost, lambda x: hasattr(x, 'param') and 'BigFrameLevel' in x.name) 267 | other_params = [p for p in all_params if p not in ip_params] 268 | all_params = ip_params + other_params 269 | lib._train.print_params_info(ip_cost, ip_params) 270 | lib._train.print_params_info(cost, other_params) 271 | lib._train.print_params_info(cost, all_params) 272 | 273 | ip_grads = T.grad(ip_cost, wrt=ip_params, disconnected_inputs='warn') 274 | ip_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in ip_grads] 275 | 276 | other_grads = T.grad(cost, wrt=other_params, disconnected_inputs='warn') 277 | other_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in other_grads] 278 | 279 | grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn') 280 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 281 | 282 | 283 | ip_updates = lasagne.updates.adam(ip_grads, ip_params) 284 | other_updates = lasagne.updates.adam(other_grads, other_params) 285 | updates = lasagne.updates.adam(grads, all_params) 286 | 287 | ip_train_fn = theano.function( 288 | [sequences, big_h0, reset], 289 | [ip_cost, new_big_h0], 290 | updates=ip_updates, 291 | on_unused_input='warn' 292 | ) 293 | 294 | other_train_fn = theano.function( 295 | [sequences, big_h0, h0, reset], 296 | [cost, new_big_h0, new_h0], 297 | updates=other_updates, 298 | on_unused_input='warn' 299 | ) 300 | 301 | train_fn = theano.function( 302 | [sequences, big_h0, h0, reset], 303 | [cost, new_big_h0, new_h0], 304 | updates=updates, 305 | on_unused_input='warn' 306 | ) 307 | 308 | big_frame_level_generate_fn = theano.function( 309 | [sequences, big_h0, reset], 310 | big_frame_level_rnn(sequences, big_h0, reset)[0:2], 311 | on_unused_input='warn' 312 | ) 313 | 314 | big_frame_level_outputs = T.matrix('big_frame_level_outputs') 315 | frame_level_generate_fn = theano.function( 316 | [sequences, big_frame_level_outputs, h0, reset], 317 | frame_level_rnn(sequences, big_frame_level_outputs.dimshuffle(0,'x',1), h0, reset), 318 | on_unused_input='warn' 319 | ) 320 | 321 | frame_level_outputs = T.matrix('frame_level_outputs') 322 | prev_samples = T.imatrix('prev_samples') 323 | sample_level_generate_fn = theano.function( 324 | [frame_level_outputs, prev_samples], 325 | lib.ops.softmax_and_sample( 326 | sample_level_predictor( 327 | frame_level_outputs, 328 | prev_samples 329 | ) 330 | ), 331 | on_unused_input='warn' 332 | ) 333 | 334 | def generate_and_save_samples(tag): 335 | 336 | def write_audio_file(name, data): 337 | data = data.astype('float32') 338 | data -= data.min() 339 | data /= data.max() 340 | data -= 0.5 341 | data *= 0.95 342 | scipy.io.wavfile.write(name+'.wav', BITRATE, data) 343 | 344 | # Generate 10 sample files, each 5 seconds long 345 | N_SEQS = 10 346 | LENGTH = 5*BITRATE 347 | 348 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 349 | samples[:, :BIG_FRAME_SIZE] = Q_ZERO 350 | 351 | big_h0 = numpy.zeros((N_SEQS, N_BIG_GRUS, BIG_DIM), dtype='float32') 352 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 353 | big_frame_level_outputs = None 354 | 
frame_level_outputs = None 355 | 356 | for t in xrange(BIG_FRAME_SIZE, LENGTH): 357 | 358 | if t % BIG_FRAME_SIZE == 0: 359 | big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( 360 | samples[:, t-BIG_FRAME_SIZE:t], 361 | big_h0, 362 | numpy.int32(t == BIG_FRAME_SIZE) 363 | ) 364 | 365 | if t % FRAME_SIZE == 0: 366 | frame_level_outputs, h0 = frame_level_generate_fn( 367 | samples[:, t-FRAME_SIZE:t], 368 | big_frame_level_outputs[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)], 369 | h0, 370 | numpy.int32(t == BIG_FRAME_SIZE) 371 | ) 372 | 373 | samples[:, t] = sample_level_generate_fn( 374 | frame_level_outputs[:, t % FRAME_SIZE], 375 | samples[:, t-FRAME_SIZE:t] 376 | ) 377 | 378 | for i in xrange(N_SEQS): 379 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 380 | 381 | if PRE_STOP_TIME > 0: 382 | print "Pretraining!" 383 | total_iters = 0 384 | total_time = 0. 385 | last_print_time = 0. 386 | last_print_iters = 0 387 | pretrain_finished = False 388 | 389 | for epoch in itertools.count(): 390 | if pretrain_finished: 391 | break 392 | 393 | big_h0 = numpy.zeros((BATCH_SIZE, N_BIG_GRUS, BIG_DIM), dtype='float32') 394 | costs = [] 395 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, PRE_SEQ_LEN, BIG_FRAME_SIZE, Q_LEVELS, Q_ZERO) 396 | 397 | for seqs, reset in data_feeder: 398 | if pretrain_finished: 399 | break 400 | 401 | start_time = time.time() 402 | cost, big_h0 = ip_train_fn(seqs, big_h0, reset) 403 | total_time += time.time() - start_time 404 | total_iters += 1 405 | 406 | costs.append(cost) 407 | 408 | 409 | 410 | if (PRE_TRAIN_MODE=='iters' and total_iters-last_print_iters == PRE_PRINT_ITERS) or \ 411 | (PRE_TRAIN_MODE=='time' and total_time-last_print_time >= PRE_PRINT_TIME): 412 | 413 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 414 | epoch, 415 | total_iters, 416 | numpy.mean(costs), 417 | total_time, 418 | total_time / total_iters 419 | ) 420 | tag = "iters{}_time{}".format(total_iters, total_time) 421 | lib.save_params('params_pretrain_{}.pkl'.format(tag)) 422 | 423 | costs = [] 424 | last_print_time += PRE_PRINT_TIME 425 | last_print_iters += PRE_PRINT_ITERS 426 | 427 | if (PRE_TRAIN_MODE=='iters' and total_iters == PRE_STOP_ITERS) or \ 428 | (PRE_TRAIN_MODE=='time' and total_time >= PRE_STOP_TIME): 429 | 430 | print "Done!" 431 | 432 | pretrain_finished = True 433 | 434 | print "Training!" 435 | total_iters = 0 436 | total_time = 0. 437 | last_print_time = 0. 438 | last_print_iters = 0 439 | last_eigs = 0. 
440 | finetune = False 441 | for epoch in itertools.count(): 442 | 443 | big_h0 = numpy.zeros((BATCH_SIZE, N_BIG_GRUS, BIG_DIM), dtype='float32') 444 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32') 445 | costs = [] 446 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, BIG_FRAME_SIZE, Q_LEVELS, Q_ZERO) 447 | 448 | for seqs, reset in data_feeder: 449 | 450 | if finetune: 451 | _train_fn = train_fn 452 | else: 453 | _train_fn = other_train_fn 454 | 455 | start_time = time.time() 456 | cost, big_h0, h0 = _train_fn(seqs, big_h0, h0, reset) 457 | total_time += time.time() - start_time 458 | total_iters += 1 459 | 460 | costs.append(cost) 461 | 462 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 463 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME): 464 | 465 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 466 | epoch, 467 | total_iters, 468 | numpy.mean(costs), 469 | total_time, 470 | total_time / total_iters 471 | ) 472 | print "Warning: not generating samples" 473 | # tag = "iters{}_time{}".format(total_iters, total_time) 474 | # generate_and_save_samples(tag) 475 | # lib.save_params('params_{}.pkl'.format(tag)) 476 | 477 | if last_print_time <= TIME_BEFORE_FINETUNE <= last_print_time + PRINT_TIME: 478 | print "Switching to fine-tuning!" 479 | finetune = True 480 | 481 | costs = [] 482 | last_print_time += PRINT_TIME 483 | last_print_iters += PRINT_ITERS 484 | 485 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 486 | (TRAIN_MODE=='time' and total_time >= STOP_TIME): 487 | 488 | print "Done!" 489 | 490 | try: # This only matters on Ishaan's computer 491 | import experiment_tools 492 | experiment_tools.send_sms("done!") 493 | except ImportError: 494 | pass 495 | 496 | sys.exit() --------------------------------------------------------------------------------
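
Implementation notes
=========

All the models here operate on 8-bit linearly quantized audio: ints in [0, Q_LEVELS), with Q_ZERO = Q_LEVELS//2 standing in for zero amplitude, rescaled to floats in [-2, 2] before entering an RNN (see the rescaling inside big_frame_level_rnn and frame_level_rnn above). A minimal numpy sketch of that round trip; the quantize helper is an illustration assuming float input in [-1, 1], not the exact code from dataset.py:

import numpy

Q_LEVELS = 256
Q_ZERO = Q_LEVELS // 2

def quantize(audio):
    # hypothetical helper: floats in [-1, 1] -> ints in [0, Q_LEVELS)
    return numpy.clip(((audio + 1.) * (Q_LEVELS / 2.)).astype('int32'), 0, Q_LEVELS - 1)

def to_rnn_range(frames):
    # ints in [0, Q_LEVELS) -> floats in [-2, 2], mirroring the rescaling
    # in big_frame_level_rnn / frame_level_rnn
    return 2. * ((frames.astype('float32') / (Q_LEVELS / 2.)) - 1.)

silence = numpy.full((8,), Q_ZERO, dtype='int32')
print(to_rnn_range(silence))  # all zeros: Q_ZERO maps to zero amplitude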
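On cost units: the 1.44269504089 constant that appears next to every cost is log2(e). T.nnet.categorical_crossentropy returns nats, and multiplying by log2(e) converts to bits per sample. As a sanity check (a worked example, not code from the repo), a model that predicts a uniform distribution over Q_LEVELS = 256 values should cost exactly 8 bits:

import math

nats = -math.log(1. / 256)     # cross-entropy of a uniform prediction
bits = nats * 1.44269504089    # same conversion factor as in the code
print(bits)                    # 8.0 (up to float error)

Any trained model should sit well below this 8-bit ceiling.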
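The first file above trains a VAE-flavored model whose total cost is reconst_cost + alpha * reg_cost, with alpha annealed linearly from 0 to 1 over ALPHA_ITERS iterations and then held at 1 (a KL warm-up). ALPHA_ITERS is set earlier in that file and isn't shown here, so the value below is only an example; the schedule itself reduces to:

ALPHA_ITERS = 10000  # example value; the real one is defined earlier in that file

def alpha_at(total_iters):
    # linear ramp from 0 to 1, then clamped at 1
    return min(float(total_iters) / ALPHA_ITERS, 1.)

print(alpha_at(0), alpha_at(5000), alpha_at(20000))  # 0.0 0.5 1.0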
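Finally, the generation loop in three_tier.py interleaves the three tiers at different rates: the big-frame GRU stack fires once every BIG_FRAME_SIZE samples, the frame-level GRU once every FRAME_SIZE samples (consuming one of the BIG_FRAME_SIZE/FRAME_SIZE context vectors the tier above emitted), and the sample-level MLP once per sample. Stripped of the compiled Theano functions, the schedule reduces to this sketch, where the prints stand in for the *_generate_fn calls:

BIG_FRAME_SIZE = 8
FRAME_SIZE = 2
LENGTH = 24

for t in range(BIG_FRAME_SIZE, LENGTH):
    if t % BIG_FRAME_SIZE == 0:
        # conditioned on samples[t-BIG_FRAME_SIZE:t]; emits
        # BIG_FRAME_SIZE/FRAME_SIZE contexts for the frame tier
        print('t=%d: big-frame RNN step' % t)
    if t % FRAME_SIZE == 0:
        # conditioned on samples[t-FRAME_SIZE:t] plus one big-frame context
        print('t=%d: frame-level RNN step' % t)
    # one new sample per t, from the last FRAME_SIZE samples
    # plus one frame-level output
    print('t=%d: sample-level MLP -> samples[t]' % t)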