├── .gitignore
├── notes
│   ├── two_tier.jpg
│   ├── softmax_visualization.mp4
│   └── two_tier.txt
├── lib
│   ├── train.py
│   ├── __init__.py
│   └── ops.py
├── preprocess.py
├── dataset.py
├── vestigial
│   └── variable_length_data.py
├── README.md
├── baseline.py
├── baseline_gaussian.py
├── two_tier.py
├── two_tier_v.py
├── conv.py
├── two_tier_conv.py
├── vrnn.py
├── vrnn_ar.py
└── three_tier.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 |
--------------------------------------------------------------------------------
/notes/two_tier.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igul222/speech/HEAD/notes/two_tier.jpg
--------------------------------------------------------------------------------
/notes/softmax_visualization.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igul222/speech/HEAD/notes/softmax_visualization.mp4
--------------------------------------------------------------------------------
/lib/train.py:
--------------------------------------------------------------------------------
1 | import lib
2 | import numpy
3 | import theano
4 | import theano.tensor as T
5 | import lasagne
6 | # from theano.compile.nanguardmode import NanGuardMode
7 |
8 | import math
9 | import time
10 | import locale
11 |
12 |
13 |
14 | locale.setlocale(locale.LC_ALL, '')
15 |
16 | def print_params_info(cost, params):
17 | """Print information about the parameters in the given param set."""
18 |
19 | params = sorted(params, key=lambda p: p.name)
20 | values = [p.get_value(borrow=True) for p in params]
21 | shapes = [p.shape for p in values]
22 | print "Params for cost:"
23 | for param, value, shape in zip(params, values, shapes):
24 | print "\t{0} ({1})".format(
25 | param.name,
26 | ",".join([str(x) for x in shape])
27 | )
28 |
29 | total_param_count = 0
30 | for shape in shapes:
31 | param_count = 1
32 | for dim in shape:
33 | param_count *= dim
34 | total_param_count += param_count
35 | print "Total parameter count: {0}".format(
36 | locale.format("%d", total_param_count, grouping=True)
37 | )
--------------------------------------------------------------------------------
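A minimal usage sketch of `print_params_info` (a hypothetical example, not part of the repo; it mirrors how baseline.py calls it, and assumes lib/ is importable):

    import theano.tensor as T
    import lib

    x = T.matrix('x')
    y = lib.ops.Linear('Demo.Linear', 5, 3, x)  # registers Demo.Linear.W0/.g0/.b via lib.param
    cost = T.sqr(y).mean()

    params = lib.search(cost, lambda v: hasattr(v, 'param'))
    lib._train.print_params_info(cost, params)  # prints each param's shape and the total count
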
/preprocess.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 |
4 | RAW_DATA_DIR="/media/seagate/blizzard/unsegmented"
5 | OUTPUT_DIR="/media/seagate/blizzard/parts"
6 |
7 | # Step 1: write all filenames to a list
8 | with open(OUTPUT_DIR+'/preprocess_file_list.txt', 'w') as f:
9 | for dirpath, dirnames, filenames in os.walk(RAW_DATA_DIR):
10 | for filename in filenames:
11 | f.write("file '" + dirpath + '/'+ filename + "'\n")
12 |
13 | # Step 2: concatenate everything into one massive wav file
14 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(OUTPUT_DIR, OUTPUT_DIR))
15 |
16 | # Get the length of the resulting file (in seconds)
17 | length = float(subprocess.check_output('ffprobe -i {}/preprocess_all_audio.wav -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR), shell=True))
18 |
19 | # Step 3: split the big file into 8-second chunks
20 | for i in xrange(int(length)//8 - 1):
21 |     os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(8*i, OUTPUT_DIR, OUTPUT_DIR, i))
22 |
23 | # Step 4: clean up temp files
24 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR))
25 | os.system('rm {}/preprocess_file_list.txt'.format(OUTPUT_DIR))
--------------------------------------------------------------------------------
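As written, chunk i of the loop above covers seconds [8*i, 8*i + 8) of the concatenated audio, so a recording of `length` seconds yields int(length)//8 - 1 full chunks (the trailing partial chunk is dropped). For example:

    length = 100.0                            # seconds, as reported by ffprobe
    n_chunks = int(length)//8 - 1             # 11 chunks
    starts = [8*i for i in xrange(n_chunks)]  # 0, 8, 16, ..., 80
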
/notes/two_tier.txt:
--------------------------------------------------------------------------------
1 | Description of the model implemented in two_tier.py
2 | =========
3 |
4 | The model operates on scalar-quantized 16kHz speech waveforms, sample-by-sample:
5 | each waveform is a sequence x = x_0, x_1, ..., x_t, where each x_i represents
6 | one sample, and can take one of 256 discrete values, corresponding to 256 linear
7 | quantization levels.
8 |
9 | The model (taken as a whole) is purely autoregressive; it factorizes the
10 | distribution P(x) over length-t waveforms as:
11 | P(x) = P(x_0) * P(x_1 | x_0) * P(x_2 | x_0, x_1) * ... * P(x_t | x_0, ..., x_t-1)
12 |
13 | First I break the sequence into frames of 4 samples each:
14 |
15 | f_0:3 = [x_0, x_1, x_2, x_3]
16 | f_4:7 = [x_4, x_5, x_6, x_7]
17 | etc...
18 |
19 | I run an RNN (specifically, a 3-layer 1024-dim GRU) over these frames (first
20 | converting the discrete-valued samples in the frames back into continuous values
21 | so that they can be fed into the RNN).
22 |
23 | I apply 4 separate 1024->1024 linear projections to the output of the RNN at
24 | each frame (one linear projection per sample). For a frame f_t:t+3, I'll call
25 | the output of these 4 linear projections o_t, o_t+1, o_t+2, and o_t+3.
26 |
27 | Finally, an MLP predicts (using softmax) a distribution over x_t conditioned on
28 | x_t-1, x_t-2, x_t-3, x_t-4, and o_t-4. Here rather than feeding in the
29 | real-valued samples I find the network performs better if I represent each
30 | sample as a 256-dim one-hot vector and concatenate the vectors for each sample,
31 | along with o_t-4. (In the implementation I use an embedding table for efficiency).
32 |
33 | Training details (most of these don't really matter that much):
34 |
35 | I train on 8-second sequences from the Blizzard dataset, using truncated BPTT.
36 | Each truncated BPTT subsequence contains 256 samples (or 64 frames). Minibatch
37 | size 128.
38 |
39 | I use Adam (default settings). I apply weight normalization
40 | (Salimans & Kingma 2016) on all weight matrices, which lets me use Adam's
41 | default learning rate of 1e-3. If you don't use weight norm, try lowering your
42 | learning rate to 2e-4.
43 |
44 | Gradients are clipped elementwise to +/- 1.
45 |
46 | All weight matrices are initialized to uniform distributions with stdev
47 | 1/sqrt(fan_in) (LeCun 1998) except ones which occur before ReLUs; there I use
48 | the initialization from (He et al. 2015).
49 |
50 | For Blizzard, samples usually start to sound okay after ~50K iterations with the
51 | above settings. This takes about 6 hours on a Titan X for a 512-dim model.
--------------------------------------------------------------------------------
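A toy numpy sketch of the forward pass described above (illustration only: the weights here are random stand-ins for the 3-layer GRU and the per-sample projections; see two_tier.py for the real implementation):

    import numpy as np

    FRAME_SIZE, DIM, Q_LEVELS = 4, 8, 256
    x = np.random.randint(0, Q_LEVELS, size=32)             # quantized samples

    frames = x.reshape(-1, FRAME_SIZE)                      # (n_frames, FRAME_SIZE)
    frames_real = frames / (Q_LEVELS / 2.0) - 1.0           # discrete values back to floats

    W_rnn  = 0.1 * np.random.randn(FRAME_SIZE + DIM, DIM)   # stand-in for the frame-level RNN
    W_proj = 0.1 * np.random.randn(DIM, FRAME_SIZE * DIM)   # the 4 per-sample projections

    h = np.zeros(DIM)
    for f in frames_real:
        h = np.tanh(np.dot(np.concatenate([f, h]), W_rnn))  # frame-level RNN step
        o = np.dot(h, W_proj).reshape(FRAME_SIZE, DIM)      # o_t, o_t+1, o_t+2, o_t+3
        # The sample-level MLP then predicts each x_t from the projections of the
        # *previous* frame (o_t-4 in the naming above) plus the 4 preceding samples.
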
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | import ops
2 | import train as _train
3 |
4 | import numpy
5 | import theano
6 | import theano.tensor as T
7 |
8 | import cPickle as pickle
9 |
10 | _params = {}
11 | def param(name, *args, **kwargs):
12 | """
13 | A wrapper for `theano.shared` which enables parameter sharing in models.
14 |
15 | Creates and returns theano shared variables similarly to `theano.shared`,
16 | except if you try to create a param with the same name as a
17 | previously-created one, `param(...)` will just return the old one instead of
18 | making a new one.
19 |
20 | This constructor also adds a `param` attribute to the shared variables it
21 | creates, so that you can easily search a graph for all params.
22 | """
23 |
24 | if name not in _params:
25 | kwargs['name'] = name
26 | param = theano.shared(*args, **kwargs)
27 | param.param = True
28 | _params[name] = param
29 | return _params[name]
30 |
31 | def delete_params(name):
32 | to_delete = [p_name for p_name in _params if name in p_name]
33 | for p_name in to_delete:
34 | del _params[p_name]
35 |
36 | def search(node, criterion):
37 |     """
38 |     Traverse the Theano graph starting at `node` and return a list of all nodes
39 |     which match the `criterion` function. When optimizing a cost function, you
40 |     can use this to get a list of all of the trainable params in the graph, like
41 |     so:
42 | 
43 |     `lib.search(cost, lambda x: hasattr(x, "param"))`
44 |     """
45 | 
46 |     def _search(node, criterion, visited):
47 |         if node in visited:
48 |             return []
49 |         visited.add(node)
50 | 
51 |         results = []
52 |         if isinstance(node, T.Apply):
53 |             for inp in node.inputs:
54 |                 results += _search(inp, criterion, visited)
55 |         else: # Variable node
56 |             if criterion(node):
57 |                 results.append(node)
58 |             if node.owner is not None:
59 |                 results += _search(node.owner, criterion, visited)
60 |         return results
61 | 
62 |     return _search(node, criterion, set())
63 |
64 | def floatX(x):
65 | """
66 | Convert `x` to the numpy type specified in `theano.config.floatX`.
67 | """
68 |
69 | if theano.config.floatX == 'float16':
70 | return numpy.float16(x)
71 | elif theano.config.floatX == 'float32':
72 | return numpy.float32(x)
73 | else: # Theano's default float type is float64
74 | print "Warning: lib.floatX using float64"
75 | return numpy.float64(x)
76 |
77 | def save_params(path):
78 | param_vals = {}
79 | for name, param in _params.iteritems():
80 | param_vals[name] = param.get_value()
81 |
82 | with open(path, 'wb') as f:
83 | pickle.dump(param_vals, f)
84 |
85 | def load_params(path):
86 | with open(path, 'rb') as f:
87 | param_vals = pickle.load(f)
88 |
89 | for name, val in param_vals.iteritems():
90 | _params[name].set_value(val)
91 |
92 | def clear_all_params():
93 | to_delete = [p_name for p_name in _params]
94 | for p_name in to_delete:
95 | del _params[p_name]
--------------------------------------------------------------------------------
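A quick sketch of the parameter-sharing behavior of lib.param ('Demo.W' is a hypothetical name, not a real param in this repo):

    import numpy
    import lib

    w1 = lib.param('Demo.W', numpy.zeros((3, 3), dtype='float32'))
    w2 = lib.param('Demo.W', numpy.ones((3, 3), dtype='float32'))
    assert w1 is w2  # same name -> same shared variable; the second value is ignored

    lib.save_params('demo.pkl')  # pickles {name: value} for every param created so far
    lib.load_params('demo.pkl')  # restores values into the existing shared variables
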
/dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 |
6 | import numpy
7 | import scipy.io.wavfile
8 | import scikits.audiolab
9 |
10 | import random
11 | import time
12 |
13 | random_seed = 123
14 |
15 | def feed_epoch(data_path, n_files, BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO):
16 |     """
17 |     Generator that yields training inputs (subbatch, reset). `subbatch` contains
18 |     quantized audio data; `reset` is a boolean indicating the start of a new
19 |     sequence (i.e. you should reset h0 whenever `reset` is True).
20 | 
21 |     Feeds subsequences which overlap by a specified amount, so that the model
22 |     can always have a target for every input in a given subsequence.
23 | 
24 |     Loads sequentially-named FLAC files in a directory
25 |     (p0.flac, p1.flac, p2.flac, ..., p[n_files-1].flac)
26 | 
27 |     Assumes all flac files have the same length.
28 | 
29 |     data_path: directory containing the flac files
30 |     n_files: how many FLAC files are in the directory
31 |     (see two_tier.py for a description of the constants)
32 | 
33 |     returns: (subbatch, reset)
34 |     subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP)
35 |     reset: True or False
36 |     """
37 |     global random_seed
38 | 
39 | def round_to(x, y):
40 | """round x up to the nearest y"""
41 | return int(numpy.ceil(x / float(y))) * y
42 |
43 | def batch_quantize(data):
44 | """
45 |         maps each row of floats to ints in [0, Q_LEVELS-1],
46 |         normalizing scale per sequence (across axis 1)
47 | """
48 | eps = numpy.float64(1e-5)
49 |
50 | data -= data.min(axis=1)[:, None]
51 |
52 | data *= ((Q_LEVELS - eps) / data.max(axis=1)[:, None])
53 | data += eps/2
54 | # print "WARNING using zero-dc-offset normalization"
55 | # data -= data.mean(axis=1)[:, None]
56 | # data *= (((Q_LEVELS/2.) - eps) / numpy.abs(data).max(axis=1)[:, None])
57 | # data += Q_LEVELS/2
58 |
59 | data = data.astype('int32')
60 |
61 | return data
62 |
63 | paths = [data_path+'/p{}.flac'.format(i) for i in xrange(n_files)]
64 |
65 | random.seed(random_seed)
66 | random.shuffle(paths)
67 | random_seed += 1
68 |
69 | batches = []
70 |     for i in xrange(len(paths) // BATCH_SIZE):
71 | batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
72 |
73 | random.shuffle(batches)
74 |
75 | for batch_paths in batches:
76 | # batch_seq_len = length of longest sequence in the batch, rounded up to
77 | # the nearest SEQ_LEN.
78 | batch_seq_len = len(scikits.audiolab.flacread(batch_paths[0])[0])
79 | batch_seq_len = round_to(batch_seq_len, SEQ_LEN)
80 |
81 | batch = numpy.zeros(
82 | (BATCH_SIZE, batch_seq_len),
83 | dtype='float64'
84 | )
85 |
86 | for i, path in enumerate(batch_paths):
87 | data, fs, enc = scikits.audiolab.flacread(path)
88 | batch[i, :len(data)] = data
89 |
90 |         if Q_LEVELS is not None:
91 | batch = batch_quantize(batch)
92 |
93 | batch = numpy.concatenate([
94 | numpy.full((BATCH_SIZE, OVERLAP), Q_ZERO, dtype='int32'),
95 | batch
96 | ], axis=1)
97 | else:
98 | batch = numpy.concatenate([
99 | numpy.full((BATCH_SIZE, OVERLAP), 0, dtype='float32'),
100 | batch
101 | ], axis=1)
102 | batch = batch.astype('float32')
103 |
104 | batch -= batch.mean()
105 | batch /= batch.std()
106 |
107 | for i in xrange((batch.shape[1] - OVERLAP) // SEQ_LEN):
108 | reset = numpy.int32(i==0)
109 | subbatch = batch[:, i*SEQ_LEN : (i+1)*SEQ_LEN + OVERLAP]
110 | yield (subbatch, reset)
--------------------------------------------------------------------------------
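The training scripts below consume this generator roughly as follows (a sketch; the uppercase constants are defined at the top of each script, and OVERLAP is FRAME_SIZE in two_tier.py):

    h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
    for subbatch, reset in dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE,
                                              SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO):
        # subbatch.shape == (BATCH_SIZE, SEQ_LEN + OVERLAP); the first OVERLAP
        # samples are context, and reset == 1 marks the start of a new batch of files
        cost, h0 = train_fn(subbatch, h0, reset)
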
/vestigial/variable_length_data.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 |
5 | WARNING: I'm pretty sure there's a bug in here somewhere:
6 | I can't get the same training loss that I get with data.py's feed_epoch using
7 | load_sequential_flac_files and feed_data.
8 | """
9 |
10 | import numpy
11 | import scipy.io.wavfile
12 | import scikits.audiolab
13 |
14 | import random
15 | import time
16 |
17 | def load_segmented_blizzard_metadata(data_path, test_set_size):
18 | """
19 | data_path: path to the blizzard dataset (should have a subdirectory 'segmented' with a file 'prompts.gui')
20 | test_set_size: how many files to use for the test set
21 | """
22 |     with open(data_path + '/segmented/prompts.gui') as prompts_file:
23 | lines = [l[:-1] for l in prompts_file]
24 |
25 |     filepaths = [data_path + '/segmented/wavn/' + fname + '.wav' for fname in lines[::3]]
26 | transcripts = lines[1::3]
27 |
28 | # Clean up the transcripts
29 | for i in xrange(len(transcripts)):
30 | t = transcripts[i]
31 | t = t.replace('@ ', '')
32 | t = t.replace('# ', '')
33 | t = t.replace('| ', '')
34 | t = t.lower()
35 | transcripts[i] = t
36 |
37 | # We use '*' as a null padding character
38 | charmap = {'*': 0}
39 | inv_charmap = ['*']
40 | for t in transcripts:
41 | for char in t:
42 | if char not in charmap:
43 | charmap[char] = len(charmap)
44 | inv_charmap.append(char)
45 |
46 | all_data = zip(filepaths, transcripts)
47 | random.seed(123)
48 | random.shuffle(all_data)
49 | train_data = all_data[test_set_size:]
50 | test_data = all_data[:test_set_size]
51 |
52 | return charmap, inv_charmap, train_data, test_data
53 |
54 | def load_sequential_flac_files(data_path, n_files, test_set_size):
55 | """
56 | Load sequentially-named FLAC files in a directory
57 | (p0.flac, p1.flac, p2.flac, ..., p[n_files-1].flac)
58 |
59 | data_path: directory containing the flac files
60 | n_files: how many FLAC files are in the directory
61 | test_set_size: how many files to use for the test set
62 | """
63 | filepaths = [data_path+'/p{}.flac'.format(i) for i in xrange(n_files)]
64 | transcripts = ['*' for i in xrange(n_files)]
65 | charmap = {'*': 0}
66 | inv_charmap = ['*']
67 | all_data = zip(filepaths, transcripts)
68 | random.seed(123)
69 | random.shuffle(all_data)
70 | train_data = all_data[test_set_size:]
71 | test_data = all_data[:test_set_size]
72 | return charmap, inv_charmap, train_data, test_data
73 |
74 | def feed_data(data, charmap, shuffle, BATCH_SIZE, BITRATE, Q_LEVELS, Q_ZERO, N_PREV_SAMPLES, SEQ_LEN):
75 | """
76 |     see the top of two_tier.py for a description of the constants
77 | """
78 | def read_audio_file(path):
79 | if path.endswith('wav'):
80 | audio = scipy.io.wavfile.read(path)[1].astype('float64')
81 | elif path.endswith('flac'):
82 | audio = scikits.audiolab.flacread(path)[0]
83 | else:
84 | raise Exception('Unknown filetype')
85 |
86 | eps = numpy.float64(1e-5)
87 | audio -= audio.min()
88 | audio *= (Q_LEVELS - eps) / audio.max()
89 | audio += eps/2
90 | return audio.astype('int32')
91 |
92 | _data = list(data)
93 | if shuffle:
94 | random.shuffle(_data)
95 |
96 | # Make sure the buffer size is longer than the longest sample in the dataset
97 | buffer = numpy.full((BATCH_SIZE, BITRATE*40), Q_ZERO, dtype='int32')
98 | head = 0
99 | transcripts = [None] * BATCH_SIZE
100 |
101 | while True:
102 | # Load new sequences into the buffer if necessary
103 | resets = numpy.zeros(BATCH_SIZE, dtype='int32')
104 | for i in xrange(BATCH_SIZE):
105 | if numpy.array_equiv(buffer[i, head:], Q_ZERO):
106 | if len(_data) == 0:
107 | return # We've exhausted the dataset.
108 | path, transcript = _data.pop()
109 | audio = read_audio_file(path)
110 | # We add a few samples of Q_ZERO in the beginning to match
111 | # generation time (where we generate starting from zeros).
112 | if len(audio) + N_PREV_SAMPLES > buffer.shape[1] - head:
113 | raise Exception('Audio file too long!')
114 | buffer[i, head+N_PREV_SAMPLES:head+len(audio)+N_PREV_SAMPLES] = audio
115 | transcripts[i] = transcript
116 | resets[i] = 1
117 |
118 | # Make a dense (padded) transcript matrix from transcripts
119 | padded_transcripts = numpy.full(
120 | (BATCH_SIZE, max(len(x) for x in transcripts)),
121 | charmap['*'],
122 | dtype='int32'
123 | )
124 | for i, t in enumerate(transcripts):
125 | padded_transcripts[i, :len(t)] = [charmap[c] for c in t]
126 |
127 | # Yield the data batch
128 | yield (
129 | buffer[:, head:head+SEQ_LEN],
130 | padded_transcripts,
131 | resets
132 | )
133 |
134 | # Advance the head and if needed, roll the buffer
135 | buffer[:, head:head+SEQ_LEN] = Q_ZERO
136 | head += SEQ_LEN
137 | if head > buffer.shape[1] // 100:
138 | buffer = numpy.roll(buffer, -head, axis=1)
139 | head = 0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | - **5/13**: Does my two-tier model actually learn longer-term dependencies, or does it just train faster? I vary frame size, controlling for sequence length, number of params, number of iters.
2 | - Frame size 4: `twotier_determ_bigrun_qzero_1462749482`, 1.827 iters 0-10K, 1.513 iters 90K-100K. (copied from below)
3 | - Frame size 2: `twotier_fs2_iters_1463123438` 1.775 iters 0-10K, 1.485 iters 90-100K.
4 | - Frame size 1: `twotier_fs1_iters_1463157179` (aborted but got to 70K iters)
5 | - **5/12**: I run the two-tier model with frame_size=2.
6 | - Evaluating by wall-clock time, taking the better of n_frames=64, 128
7 | - `twotier_fs2_nf64_time_1463123320` 1.834 first hour, 1.523 12th hour
8 | - `twotier_fs2_nf128_time_1463123388` 1.883 first hour, 1.504 12th hour
9 | - Interesting: frame size 2 performs (almost) as well as frame size 4. What about fs 1?
10 | - n_frames 64 `twotier_fs1_nf64_time_1463175548` (see spreadsheet)
11 | - n_frames 128 `twotier_fs1_nf128_time_1463175563` (see spreadsheet)
12 | - n_frames 256 `twotier_fs1_nf256_time_1463175585` (see spreadsheet)
13 | - **5/10**: I try overfitting to Kyle's kiwi01.wav. I train for 6 hours, generating samples every hour.
14 | - Both two-tier model and baseline (`baseline_kiwi_1462942688`, `twotier_kiwi_1462942828`) get almost-zero train cost, and generate samples indistinguishable from the original.
15 | - **5/9**: Per Yoshua's suggestion I add a term to the loss function asking the frame-level RNN to predict the next frame, without help of the sample-level MLP.
16 | - Before: `twotier_determ_bigrun_qzero_1462749482`, 1.827 iters 0-10K, 1.513 iters 90K-100K. (copied from below)
17 | - After: `twotier_ipcost_1462871075` 1.928 iters 0-10K, 1.537 iters 90K-100K. Samples are a little different but I'm not sure they're any better or worse.
18 | - I also try weighting the auxiliary cost term by 0.1: `twotier_ipcost_weighted_1462891119` 1.848 iters 0-10K, 1.520 iters 90-100K. Samples indistinguishable from original model.
19 | - Conclusions
20 | - This is basically multi-task learning, which usually works as a regularizer in regimes of limited data. But our data here is unlimited, so it's reasonable that this doesn't help NLL.
21 | - It's still possible that this method might produce better samples in some scenarios (even though it didn't seem to here), so I'll keep trying this in future experiments.
22 | - **5/9**: I try changing my input normalization so that samples have zero DC offset (per Kyle McDonald's suggestion). Unfortunately this is probably going to improve NLL, but in a way that's meaningless. I'll evaluate by listening to samples and checking them in Audacity.
23 | - `twotier_zero_dc_offset_1462873780` 1.792 iters 0-10K, 1.504 iters 90K-100K. Samples seem weirdly broken though: speech still sounds good, but there's a very faint whining noise in the background the whole time. Maybe this is something to come back to if I have more time but for now I'm just going to leave it off.
24 | - **5/9**: I implement a flat, baseline model (`baseline.py`) and evaluate it against the two-tier model.
25 | - Basically a language model: 3 layers of stacked 512-dim GRU, taking as input one sample at a time and predicting the next timestep.
26 | - I try two variants: one feeding values into the GRUs as real values (what I did in two-tier), the other as embeddings of 256 discrete values.
27 | - I report NLLs in bits per sample on the train set (not perfect procedure, but mostly-OK because I never make it through one epoch).
28 | - Controlling for wall-clock time, where each model uses its own reasonable hyperparams (to see which model "wins" overall):
29 | - Two-tier: `twotier_time_benchmark_1462865129` 1.833 first hour, 1.503 12th hour. Samples a little noisy but decent / not broken. ***best model***
30 | - Flat reals seqlen 64: `speech_baseline_time_reals_seqlen64_1462866948` 2.057 first hour, 1.696 12th hour. Samples clean but "warbly" / guttural sounding?
31 | - Flat reals seqlen 128: `speech_baseline_time_reals_seqlen128_1462867000` 2.143 first hour, 1.612 12th hour ***best baseline model***
32 | - Flat embeddings seqlen 64: `speech_baseline_time_embed_seqlen64_1462867483` 2.104 first hour, 1.688 12th hour
33 | - Flat embeddings seqlen 128: `speech_baseline_time_embed_seqlen128_1462867499` 2.144 first hour, 1.624 12th hour
34 | - **5/13**: I run even more hyperparam combinations to be thorough.
35 | - Flat reals seqlen 256 512dim 3-layer `baseline_seqlen256_time_1463191213`
36 | - Two-tier 512dim 4-layer `twotier_512d_4layer_1463191505`
37 | - Two-tier 512dim 5-layer `twotier_512d_5layer_1463192292`
38 | - Two-tier 1024dim 3-layer `twotier_1024d_3layer_1463191610`
39 | - Two-tier 1024dim 4-layer `twotier_1024d_4layer_1463192438`
40 | - Two-tier 1024dim 5-layer `twotier_1024d_5layer_1463191722`
41 | - Flat reals seqlen 128 512dim 4-layer `baseline_seqlen128_512d_4layer_1463191559`
42 | - Flat reals seqlen 128 512dim 5-layer `baseline_seqlen128_512d_5layer_1463192296`
43 | - Flat reals seqlen 128 1024dim 3-layer `baseline_seqlen128_1024d_3layer_1463191659`
44 | - Flat reals seqlen 128 1024dim 4-layer `baseline_seqlen128_1024d_4layer_1463192446`
45 | - Flat reals seqlen 128 1024dim 5-layer `baseline_seqlen128_1024d_5layer_1463191875`
46 | - To see what happens if we ignore differences in training speed, I run a trial controlling for number of training steps, where each step sees the same sequence length (256) and batch size (128).
47 | - Two-tier: `twotier_determ_bigrun_qzero_1462749482`, 1.827 iters 0-10K, 1.513 iters 90K-100K.
48 | - Flat reals: `speech_baseline_iters_reals_1462866911` 2.003 iters 0-10K, 1.528 iters 90K-100K.
49 | - Flat embeddings: `speech_baseline_iters_embed_1462867526` 1.961 iters 0-10K, 1.534 iters 90K-100K.
50 | - Update: I don't think these results are valid experimental procedure since I didn't control for time (giving baseline an advantage) or number of params (giving two-tier an advantage). Probably best to ignore them. Instead see the results for `twotier_fs1_iters_1463157179` above.
51 | - Conclusions
52 | - If you ignore training speed, for the hyperparameters tested, my model slightly outperforms the baseline.
53 | - But I don't think it's fair to ignore training speed. If you control for training speed, for the hyperparameters tested, my model outperforms the baseline by a wider margin.
54 | - **5/8**: To better understand how the model uses its softmax output, I sample from a 1024-dim model trained for 50K iterations and plot the softmax output distribution at each timestep. See `notes/softmax_visualization.mp4` (action starts around 7:00). I find the model learns roughly-Gaussian unimodal distributions.
55 | - **5/8**: I'm worried that the samples don't sound quite as good as the old implementation for some reason, so I make the script deterministic (`numpy.random.seed(123)`) and carefully step through the entire model, making sure its generated samples matched my previous implementation number-for-number.
56 | - **5/7**: Initial release of a cleaned-up (actually mostly rewritten) version of my current best model in `two_tier.py`. Written description in `notes/two_tier.txt` and hastily-drawn model diagram in `notes/two_tier.jpg`.
--------------------------------------------------------------------------------
/lib/ops.py:
--------------------------------------------------------------------------------
1 | import lib
2 | import numpy
3 | import theano
4 | import theano.tensor as T
5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
6 |
7 | srng = RandomStreams(seed=234)
8 |
9 | def Linear(
10 | name,
11 | input_dims,
12 | output_dim,
13 | inputs,
14 | biases=True,
15 | initialization=None,
16 | weightnorm=True
17 | ):
18 | # print "warning weightnorm off"
19 |
20 | """
21 | Compute a linear transform of one or more inputs, optionally with a bias.
22 |
23 | input_dims: list of ints, or int (if single input); the dimensionality of
24 | the input(s).
25 | output_dim: the dimensionality of the output.
26 | biases: whether or not to include a bias term.
27 | inputs: a theano variable, or list of variables (if multiple inputs);
28 | the inputs to which to apply the transform.
29 | initialization: one of None, `lecun`, `he`, `orthogonal`
30 | """
31 |
32 | if not isinstance(input_dims, list):
33 | input_dims = [input_dims]
34 | inputs = [inputs]
35 |
36 | terms = []
37 |
38 | def uniform(stdev, size):
39 | """uniform distribution with the given stdev and size"""
40 | return numpy.random.uniform(
41 | low=-stdev * numpy.sqrt(3),
42 | high=stdev * numpy.sqrt(3),
43 | size=size
44 | ).astype(theano.config.floatX)
45 |
46 | for i, (inp, inp_dim) in enumerate(zip(inputs, input_dims)):
47 |         if initialization == 'lecun' or (initialization is None and inp_dim != output_dim):
48 |             weight_values = uniform(numpy.sqrt(1. / inp_dim), (inp_dim, output_dim))
49 |         elif initialization == 'he':
50 |             weight_values = uniform(numpy.sqrt(2. / inp_dim), (inp_dim, output_dim))
51 |         elif initialization == 'orthogonal' or (initialization is None and inp_dim == output_dim):
52 | # From lasagne
53 | def sample(shape):
54 | if len(shape) < 2:
55 | raise RuntimeError("Only shapes of length 2 or more are "
56 | "supported.")
57 | flat_shape = (shape[0], numpy.prod(shape[1:]))
58 | # TODO: why normal and not uniform?
59 | a = numpy.random.normal(0.0, 1.0, flat_shape)
60 | u, _, v = numpy.linalg.svd(a, full_matrices=False)
61 | # pick the one with the correct shape
62 | q = u if u.shape == flat_shape else v
63 | q = q.reshape(shape)
64 | return q.astype(theano.config.floatX)
65 | weight_values = sample((inp_dim, output_dim))
66 | else:
67 | raise Exception("Invalid initialization!")
68 |
69 | weight = lib.param(
70 | name + '.W'+str(i),
71 | weight_values
72 | )
73 |
74 | if weightnorm:
75 | norm_values = numpy.linalg.norm(weight_values, axis=0)
76 | norms = lib.param(
77 | name + '.g'+str(i),
78 | norm_values
79 | )
80 |
81 | normed_weight = weight * (norms / weight.norm(2, axis=0)).dimshuffle('x', 0)
82 | terms.append(T.dot(inp, normed_weight))
83 | else:
84 | terms.append(T.dot(inp, weight))
85 |
86 | if biases:
87 | terms.append(lib.param(
88 | name + '.b',
89 | numpy.zeros((output_dim,), dtype=theano.config.floatX)
90 | ))
91 |
92 | out = reduce(lambda a,b: a+b, terms)
93 | out.name = name + '.output'
94 | return out
95 |
96 |
97 | def Embedding(name, n_symbols, output_dim, indices):
98 | vectors = lib.param(
99 | name,
100 | numpy.random.randn(
101 | n_symbols,
102 | output_dim
103 | ).astype(theano.config.floatX)
104 | )
105 |
106 | output_shape = [
107 | indices.shape[i]
108 | for i in xrange(indices.ndim)
109 | ] + [output_dim]
110 |
111 | return vectors[indices.flatten()].reshape(output_shape)
112 |
113 | def softmax_and_sample(logits):
114 | old_shape = logits.shape
115 | flattened_logits = logits.reshape((-1, logits.shape[logits.ndim-1]))
116 | samples = T.cast(
117 | srng.multinomial(pvals=T.nnet.softmax(flattened_logits)),
118 | theano.config.floatX
119 | ).reshape(old_shape)
120 | return T.argmax(samples, axis=samples.ndim-1)
121 |
122 | def Recurrent(name, hidden_dims, step_fn, inputs, non_sequences=[], h0s=None):
123 | if not isinstance(inputs, list):
124 | inputs = [inputs]
125 |
126 | if not isinstance(hidden_dims, list):
127 | hidden_dims = [hidden_dims]
128 |
129 | if h0s is None:
130 | h0s = [None]*len(hidden_dims)
131 |
132 | for i in xrange(len(hidden_dims)):
133 | if h0s[i] is None:
134 | h0_unbatched = lib.param(
135 | name + '.h0_' + str(i),
136 | numpy.zeros((hidden_dims[i],), dtype=theano.config.floatX)
137 | )
138 | num_batches = inputs[0].shape[1]
139 | h0s[i] = T.alloc(h0_unbatched, num_batches, hidden_dims[i])
140 |
141 | h0s[i] = T.patternbroadcast(h0s[i], [False] * h0s[i].ndim)
142 |
143 | outputs, _ = theano.scan(
144 | step_fn,
145 | sequences=inputs,
146 | outputs_info=h0s,
147 | non_sequences=non_sequences
148 | )
149 |
150 | return outputs
151 |
152 | def GRUStep(name, input_dim, hidden_dim, current_input, last_hidden):
153 | processed_input = lib.ops.Linear(
154 | name+'.Input',
155 | input_dim,
156 | 3 * hidden_dim,
157 | current_input
158 | )
159 |
160 | gates = T.nnet.sigmoid(
161 | lib.ops.Linear(
162 | name+'.Recurrent_Gates',
163 | hidden_dim,
164 | 2 * hidden_dim,
165 | last_hidden,
166 | biases=False
167 | ) + processed_input[:, :2*hidden_dim]
168 | )
169 |
170 | update = gates[:, :hidden_dim]
171 | reset = gates[:, hidden_dim:]
172 |
173 | scaled_hidden = reset * last_hidden
174 |
175 | candidate = T.tanh(
176 | lib.ops.Linear(
177 | name+'.Recurrent_Candidate',
178 | hidden_dim,
179 | hidden_dim,
180 | scaled_hidden,
181 | biases=False,
182 | initialization='orthogonal'
183 | ) + processed_input[:, 2*hidden_dim:]
184 | )
185 |
186 | one = lib.floatX(1.0)
187 | return (update * candidate) + ((one - update) * last_hidden)
188 |
189 | def LowMemGRU(name, input_dim, hidden_dim, inputs, h0=None):
190 | inputs = inputs.dimshuffle(1,0,2)
191 |
192 | def step(current_input, last_hidden):
193 | return GRUStep(
194 | name+'.Step',
195 | input_dim,
196 | hidden_dim,
197 | current_input,
198 | last_hidden
199 | )
200 |
201 | if h0 is None:
202 | h0s = None
203 | else:
204 | h0s = [h0]
205 |
206 | out = Recurrent(
207 | name+'.Recurrent',
208 | hidden_dim,
209 | step,
210 | inputs,
211 | h0s=h0s
212 | )
213 |
214 | out = out.dimshuffle(1,0,2)
215 | out.name = name+'.output'
216 | return out
--------------------------------------------------------------------------------
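The weightnorm branch of Linear implements the reparametrization of Salimans & Kingma (2016): each column of W is rescaled to a learned norm g, with g initialized to the column's initial norm so the first forward pass is unchanged. A numpy sketch of that computation:

    import numpy

    W = numpy.random.randn(16, 8).astype('float32')
    g = numpy.linalg.norm(W, axis=0)    # initialization: the columns' current norms
    W_normed = W * (g / numpy.linalg.norm(W, axis=0))[None, :]
    assert numpy.allclose(W, W_normed)  # identical at init; training then adapts g
                                        # and the direction of W separately
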
/baseline.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.wait_for_gpu(high_priority=False, debug=True)
11 | except ImportError:
12 | pass
13 |
14 | import numpy
15 | numpy.random.seed(123)
16 | import random
17 | random.seed(123)
18 |
19 | import dataset
20 |
21 | import theano
22 | import theano.tensor as T
23 | import theano.ifelse
24 | import lib
25 | import lasagne
26 | import scipy.io.wavfile
27 |
28 | import time
29 | import functools
30 | import itertools
31 |
32 | # Hyperparams
33 | BATCH_SIZE = 128
34 | SEQ_LEN = 256 # How many audio samples to include in each truncated BPTT pass
35 | SEQ_LEN_ANNEAL_ITERS = 1
36 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
37 | N_GRUS = 4 # How many GRUs to stack in the frame-level model
38 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
39 | GRAD_CLIP = 1 # Elementwise grad clip threshold
40 |
41 | # Dataset
42 | DATA_PATH = '/media/seagate/blizzard/parts'
43 | N_FILES = 141703
44 | # DATA_PATH = '/PersimmonData/kiwi_parts'
45 | # N_FILES = 516
46 | BITRATE = 16000
47 |
48 | # Other constants
49 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
50 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations.
51 | STOP_ITERS = 200*1000 # Stop after this many iterations
52 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
53 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
54 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
56 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
56 |
57 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, 256, 1, Q_LEVELS, Q_ZERO)
58 | for i in xrange(100*500):
59 | data_feeder.next()
60 |
61 | print "Model settings:"
62 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
63 | all_vars = sorted(all_vars, key=lambda x: x[0])
64 | for var_name, var_value in all_vars:
65 | print "\t{}: {}".format(var_name, var_value)
66 |
67 | def sample_level_rnn(input_sequences, h0, reset):
68 | """
69 | input_sequences.shape: (batch size, seq len)
70 | h0.shape: (batch size, N_GRUS, DIM)
71 | reset.shape: ()
72 | output.shape: (batch size, seq len, Q_LEVELS)
73 | """
74 |
75 | learned_h0 = lib.param(
76 | 'SampleLevel.h0',
77 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
78 | )
79 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
80 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
81 |
82 | # Embedded inputs
83 | #################
84 |
85 | FRAME_SIZE = Q_LEVELS
86 | frames = lib.ops.Embedding('SampleLevel.Embedding', Q_LEVELS, Q_LEVELS, input_sequences)
87 |
88 | # Real-valued inputs
89 | ####################
90 |
91 | # 'frames' of size 1
92 | # FRAME_SIZE = 1
93 | # frames = input_sequences.reshape((
94 | # input_sequences.shape[0],
95 | # input_sequences.shape[1],
96 | # 1
97 | # ))
98 | # # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
99 | # # (a reasonable range to pass as inputs to the RNN)
100 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
101 | # frames *= lib.floatX(2)
102 |
103 | gru0 = lib.ops.LowMemGRU('SampleLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
104 | # gru0 = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU0FF', DIM, DIM, gru0, initialization='he'))
105 | grus = [gru0]
106 | for i in xrange(1, N_GRUS):
107 | gru = lib.ops.LowMemGRU('SampleLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
108 | # gru = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU'+str(i)+'FF', DIM, DIM, gru, initialization='he'))
109 | grus.append(gru)
110 |
111 | # We apply the softmax later
112 | output = lib.ops.Linear(
113 | 'Output',
114 | N_GRUS*DIM,
115 | Q_LEVELS,
116 | T.concatenate(grus, axis=2)
117 | )
118 | # output = lib.ops.Linear(
119 | # 'Output',
120 | # DIM,
121 | # Q_LEVELS,
122 | # grus[-1]
123 | # )
124 |
125 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
126 |
127 | return (output, last_hidden)
128 |
129 | sequences = T.imatrix('sequences')
130 | h0 = T.tensor3('h0')
131 | reset = T.iscalar('reset')
132 |
133 | input_sequences = sequences[:, :-1]
134 | target_sequences = sequences[:, 1:]
135 |
136 | sample_level_outputs, new_h0 = sample_level_rnn(input_sequences, h0, reset)
137 |
138 | cost = T.nnet.categorical_crossentropy(
139 | T.nnet.softmax(sample_level_outputs.reshape((-1, Q_LEVELS))),
140 | target_sequences.flatten()
141 | ).mean()
142 |
143 | # By default we report cross-entropy cost in bits.
144 | # Switch to nats by commenting out this line:
145 | cost = cost * lib.floatX(1.44269504089)
146 |
147 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
148 | lib._train.print_params_info(cost, params)
149 |
150 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
151 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
152 |
153 | updates = lasagne.updates.adam(grads, params)
154 |
155 | train_fn = theano.function(
156 | [sequences, h0, reset],
157 | [cost, new_h0],
158 | updates=updates,
159 | on_unused_input='warn'
160 | )
161 |
162 | generate_outputs, generate_new_h0 = sample_level_rnn(sequences, h0, reset)
163 | generate_fn = theano.function(
164 | [sequences, h0, reset],
165 | [lib.ops.softmax_and_sample(generate_outputs), generate_new_h0],
166 | on_unused_input='warn'
167 | )
168 |
169 | def generate_and_save_samples(tag):
170 |
171 | def write_audio_file(name, data):
172 | data = data.astype('float32')
173 | data -= data.min()
174 | data /= data.max()
175 | data -= 0.5
176 | data *= 0.95
177 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
178 |
179 |     # Generate 10 sample files, each 5 seconds long
180 | N_SEQS = 10
181 | LENGTH = 5*BITRATE
182 |
183 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
184 | samples[:, 0] = Q_ZERO
185 |
186 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
187 |
188 | for t in xrange(1, LENGTH):
189 | samples[:, t:t+1], h0 = generate_fn(
190 | samples[:, t-1:t],
191 | h0,
192 | numpy.int32(t == 1)
193 | )
194 |
195 | for i in xrange(N_SEQS):
196 | write_audio_file("sample_{}_{}".format(tag, i), samples[i])
197 |
198 | print "Training!"
199 | total_iters = 0
200 | total_time = 0.
201 | last_print_time = 0.
202 | last_print_iters = 0
203 | curr_seq_len = 2
204 | costs = []
205 | for epoch in itertools.count():
206 |
207 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
208 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, curr_seq_len, 1, Q_LEVELS, Q_ZERO)
209 |
210 | for seqs, reset in data_feeder:
211 | start_time = time.time()
212 | cost, h0 = train_fn(seqs, h0, reset)
213 | total_time += time.time() - start_time
214 | total_iters += 1
215 |
216 | costs.append(cost)
217 |
218 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
219 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
220 |
221 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
222 | epoch,
223 | total_iters,
224 | numpy.mean(costs),
225 | total_time,
226 | total_time / total_iters
227 | )
228 | tag = "iters{}_time{}".format(total_iters, total_time)
229 |
230 | generate_and_save_samples(tag)
231 | lib.save_params('params_{}.pkl'.format(tag))
232 |
233 | costs = []
234 | last_print_time += PRINT_TIME
235 | last_print_iters += PRINT_ITERS
236 |
237 | if total_iters % SEQ_LEN_ANNEAL_ITERS == 0:
238 | if curr_seq_len < SEQ_LEN:
239 | print "Doubling curr_seq_len to {}".format(curr_seq_len*2)
240 | curr_seq_len *= 2
241 | break
242 |
243 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
244 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
245 |
246 | print "Done!"
247 |
248 | try: # This only matters on Ishaan's computer
249 | import experiment_tools
250 | experiment_tools.send_sms("done!")
251 | except ImportError:
252 | pass
253 |
254 | sys.exit()
--------------------------------------------------------------------------------
/baseline_gaussian.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.wait_for_gpu(high_priority=False, debug=False)
11 | except ImportError:
12 | pass
13 |
14 | import numpy
15 | numpy.random.seed(123)
16 | import random
17 | random.seed(123)
18 |
19 | import dataset
20 |
21 | import theano
22 | import theano.tensor as T
23 | import theano.ifelse
24 | import lib
25 | import lasagne
26 | import scipy.io.wavfile
27 |
28 | import time
29 | import functools
30 | import itertools
31 |
32 | # Hyperparams
33 | BATCH_SIZE = 128
34 | SEQ_LEN = 256 # How many audio samples to include in each truncated BPTT pass
35 | SEQ_LEN_ANNEAL_ITERS = 1
36 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
37 | N_GRUS = 4 # How many GRUs to stack in the frame-level model
38 | Q_LEVELS = None # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
39 | GRAD_CLIP = 1 # Elementwise grad clip threshold
40 |
41 | # Dataset
42 | # DATA_PATH = '/media/seagate/blizzard/parts'
43 | # N_FILES = 141703
44 | DATA_PATH = '/PersimmonData/kiwi_parts'
45 | N_FILES = 516
46 | BITRATE = 16000
47 |
48 | # Other constants
49 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
50 | GENERATE_AND_SAVE = True
51 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations.
52 | STOP_ITERS = 200*1000 # Stop after this many iterations
53 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
54 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
55 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
56 | Q_ZERO = None # numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
57 |
58 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
59 | theano_srng = RandomStreams(seed=234)
60 |
61 | # data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, 256, 1, Q_LEVELS, Q_ZERO)
62 | # for i in xrange(100*500):
63 | # data_feeder.next()
64 |
65 | print "Model settings:"
66 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
67 | all_vars = sorted(all_vars, key=lambda x: x[0])
68 | for var_name, var_value in all_vars:
69 | print "\t{}: {}".format(var_name, var_value)
70 |
71 | def gaussian_nll(x, mu, log_sigma):
72 | sigma_squared = T.exp(2*log_sigma)
73 | return (
74 | lib.floatX(0.5*numpy.log(2*numpy.pi)) +
75 |         log_sigma +
76 | ( ((x-mu)**2) / (2*sigma_squared) )
77 | )
78 |
79 | def sample_level_rnn(input_sequences, h0, reset):
80 | """
81 | input_sequences.shape: (batch size, seq len)
82 | h0.shape: (batch size, N_GRUS, DIM)
83 | reset.shape: ()
84 |     output.shape: (batch size, seq len, 2) (mu and log_sigma)
85 | """
86 |
87 | learned_h0 = lib.param(
88 | 'SampleLevel.h0',
89 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
90 | )
91 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
92 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
93 |
94 | # Embedded inputs
95 | #################
96 |
97 | # FRAME_SIZE = Q_LEVELS
98 | # frames = lib.ops.Embedding('SampleLevel.Embedding', Q_LEVELS, Q_LEVELS, input_sequences)
99 |
100 | # Real-valued inputs
101 | ####################
102 |
103 | # 'frames' of size 1
104 | FRAME_SIZE = 1
105 | frames = input_sequences.reshape((
106 | input_sequences.shape[0],
107 | input_sequences.shape[1],
108 | 1
109 | ))
110 | # # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
111 | # # (a reasonable range to pass as inputs to the RNN)
112 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
113 | # frames *= lib.floatX(2)
114 |
115 | gru0 = lib.ops.LowMemGRU('SampleLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
116 | # gru0 = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU0FF', DIM, DIM, gru0, initialization='he'))
117 | grus = [gru0]
118 | for i in xrange(1, N_GRUS):
119 | gru = lib.ops.LowMemGRU('SampleLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
120 | # gru = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU'+str(i)+'FF', DIM, DIM, gru, initialization='he'))
121 | grus.append(gru)
122 |
123 | # We apply the softmax later
124 | output = lib.ops.Linear(
125 | 'Output',
126 | N_GRUS*DIM,
127 | 2,
128 | T.concatenate(grus, axis=2)
129 | )
130 | # output = lib.ops.Linear(
131 | # 'Output',
132 | # DIM,
133 | # Q_LEVELS,
134 | # grus[-1]
135 | # )
136 |
137 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
138 |
139 | return (output, last_hidden)
140 |
141 | sequences = T.matrix('sequences')
142 | h0 = T.tensor3('h0')
143 | reset = T.iscalar('reset')
144 |
145 | input_sequences = sequences[:, :-1]
146 | target_sequences = sequences[:, 1:]
147 |
148 | sample_level_outputs, new_h0 = sample_level_rnn(input_sequences, h0, reset)
149 |
150 | cost = T.mean(gaussian_nll(
151 | target_sequences.flatten(),
152 | sample_level_outputs.flatten()[::2],
153 | sample_level_outputs.flatten()[1::2]
154 | ))
155 | # cost = T.nnet.categorical_crossentropy(
156 | # T.nnet.softmax(sample_level_outputs.reshape((-1, Q_LEVELS))),
157 | # target_sequences.flatten()
158 | # ).mean()
159 |
160 | # The Gaussian model's NLL is reported in nats by default.
161 | # Switch to bits by uncommenting this line:
162 | # cost = cost * lib.floatX(1.44269504089)
163 |
164 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
165 | lib._train.print_params_info(cost, params)
166 |
167 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
168 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
169 |
170 | updates = lasagne.updates.adam(grads, params)
171 |
172 | train_fn = theano.function(
173 | [sequences, h0, reset],
174 | [cost, new_h0],
175 | updates=updates,
176 | on_unused_input='warn'
177 | )
178 |
179 | generate_outputs, generate_new_h0 = sample_level_rnn(sequences, h0, reset)
180 | g_mu = generate_outputs[:,:,0]
181 | g_log_sigma = generate_outputs[:,:,1]
182 | g_samples = g_mu + (T.exp(g_log_sigma)*theano_srng.normal(g_mu.shape))
183 | generate_fn = theano.function(
184 | [sequences, h0, reset],
185 | [g_samples, generate_new_h0],
186 | on_unused_input='warn'
187 | )
188 |
189 | def generate_and_save_samples(tag):
190 |
191 | def write_audio_file(name, data):
192 | # data = data.astype('float32')
193 | # data -= data.min()
194 | # data /= data.max()
195 | # data -= 0.5
196 | # data *= 0.95
197 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
198 |
199 |     # Generate 10 sample files, each 5 seconds long
200 | N_SEQS = 10
201 | LENGTH = 5*BITRATE
202 |
203 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='float32')
204 | samples[:, 0] = 0
205 |
206 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
207 |
208 | for t in xrange(1, LENGTH):
209 | samples[:, t:t+1], h0 = generate_fn(
210 | samples[:, t-1:t],
211 | h0,
212 | numpy.int32(t == 1)
213 | )
214 |
215 | for i in xrange(N_SEQS):
216 | write_audio_file("sample_{}_{}".format(tag, i), samples[i])
217 |
218 | print "Training!"
219 | total_iters = 0
220 | total_time = 0.
221 | last_print_time = 0.
222 | last_print_iters = 0
223 | curr_seq_len = 2
224 | costs = []
225 | for epoch in itertools.count():
226 |
227 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
228 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, curr_seq_len, 1, Q_LEVELS, Q_ZERO)
229 |
230 | for seqs, reset in data_feeder:
231 | start_time = time.time()
232 | cost, h0 = train_fn(seqs, h0, reset)
233 | total_time += time.time() - start_time
234 | total_iters += 1
235 |
236 | costs.append(cost)
237 |
238 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
239 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
240 |
241 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
242 | epoch,
243 | total_iters,
244 | numpy.mean(costs),
245 | total_time,
246 | total_time / total_iters
247 | )
248 | tag = "iters{}_time{}".format(total_iters, total_time)
249 |
250 | if GENERATE_AND_SAVE:
251 | generate_and_save_samples(tag)
252 | lib.save_params('params_{}.pkl'.format(tag))
253 |
254 | costs = []
255 | last_print_time += PRINT_TIME
256 | last_print_iters += PRINT_ITERS
257 |
258 | if total_iters % SEQ_LEN_ANNEAL_ITERS == 0:
259 | if curr_seq_len < SEQ_LEN:
260 | print "Doubling curr_seq_len to {}".format(curr_seq_len*2)
261 | curr_seq_len *= 2
262 | break
263 |
264 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
265 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
266 |
267 | print "Done!"
268 |
269 | try: # This only matters on Ishaan's computer
270 | import experiment_tools
271 | experiment_tools.send_sms("done!")
272 | except ImportError:
273 | pass
274 |
275 | sys.exit()
--------------------------------------------------------------------------------
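gaussian_nll above is the pointwise negative log-likelihood of a Gaussian parametrized by its mean and log standard deviation; a quick scipy cross-check of the formula:

    import numpy
    from scipy.stats import norm

    x, mu, log_sigma = 0.3, 0.1, -0.5
    sigma = numpy.exp(log_sigma)
    nll = 0.5*numpy.log(2*numpy.pi) + log_sigma + (x - mu)**2 / (2*sigma**2)
    assert numpy.isclose(nll, -norm.logpdf(x, loc=mu, scale=sigma))
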
/two_tier.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.register_crash_notifier()
11 | experiment_tools.wait_for_gpu(high_priority=False)
12 | except ImportError:
13 | pass
14 |
15 | import numpy
16 | numpy.random.seed(123)
17 | import random
18 | random.seed(123)
19 |
20 | import dataset
21 |
22 | import theano
23 | import theano.tensor as T
24 | import theano.tensor.nnet.neighbours
25 | import theano.ifelse
26 | import lib
27 | import lasagne
28 | import scipy.io.wavfile
29 |
30 | import time
31 | import functools
32 | import itertools
33 |
34 | # Hyperparams
35 | BATCH_SIZE = 128
36 | N_FRAMES = 128 # How many 'frames' to include in each truncated BPTT pass
37 | FRAME_SIZE = 2 # How many samples per frame
38 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
39 | N_GRUS = 1 # How many GRUs to stack in the frame-level model
40 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
41 | GRAD_CLIP = 1 # Elementwise grad clip threshold
42 |
43 | # Dataset
44 | DATA_PATH = '/media/seagate/blizzard/parts'
45 | N_FILES = 141703
46 | # DATA_PATH = '/PersimmonData/kiwi_parts'
47 | # N_FILES = 516
48 | BITRATE = 16000
49 |
50 | # Other constants
51 | TRAIN_MODE = 'time' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
52 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations.
53 | STOP_ITERS = 100000 # Stop after this many iterations
54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
55 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
56 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
57 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence
58 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
59 |
60 | print "Model settings:"
61 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
62 | all_vars = sorted(all_vars, key=lambda x: x[0])
63 | for var_name, var_value in all_vars:
64 | print "\t{}: {}".format(var_name, var_value)
65 |
66 | def frame_level_rnn(input_sequences, h0, reset):
67 | """
68 | input_sequences.shape: (batch size, n frames * FRAME_SIZE)
69 | h0.shape: (batch size, N_GRUS, DIM)
70 | reset.shape: ()
71 | output.shape: (batch size, n frames * FRAME_SIZE, DIM)
72 | """
73 |
74 | learned_h0 = lib.param(
75 | 'FrameLevel.h0',
76 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
77 | )
78 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
79 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
80 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
81 |
82 | frames = input_sequences.reshape((
83 | input_sequences.shape[0],
84 |         input_sequences.shape[1] // FRAME_SIZE,
85 | FRAME_SIZE
86 | ))
87 |
88 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
89 | # (a reasonable range to pass as inputs to the RNN)
90 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
91 | frames *= lib.floatX(2)
92 |
93 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
94 | grus = [gru0]
95 | for i in xrange(1, N_GRUS):
96 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
97 | grus.append(gru)
98 |
99 | output = lib.ops.Linear(
100 | 'FrameLevel.Output',
101 | DIM,
102 | FRAME_SIZE * DIM,
103 | grus[-1],
104 | initialization='he'
105 | )
106 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM))
107 |
108 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
109 |
110 | return (output, last_hidden)
111 |
112 | def sample_level_predictor(frame_level_outputs, prev_samples):
113 | """
114 | frame_level_outputs.shape: (batch size, DIM)
115 | prev_samples.shape: (batch size, FRAME_SIZE)
116 | output.shape: (batch size, Q_LEVELS)
117 | """
118 |
119 | prev_samples = lib.ops.Embedding(
120 | 'SampleLevel.Embedding',
121 | Q_LEVELS,
122 | Q_LEVELS,
123 | prev_samples
124 | ).reshape((-1, FRAME_SIZE * Q_LEVELS))
125 |
126 | out = lib.ops.Linear(
127 | 'SampleLevel.L1_PrevSamples',
128 | FRAME_SIZE * Q_LEVELS,
129 | DIM,
130 | prev_samples,
131 | biases=False,
132 | initialization='he'
133 | )
134 | out += frame_level_outputs
135 | out = T.nnet.relu(out)
136 |
137 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he')
138 | out = T.nnet.relu(out)
139 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he')
140 | out = T.nnet.relu(out)
141 |
142 | # We apply the softmax later
143 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out)
144 |
145 | sequences = T.imatrix('sequences')
146 | h0 = T.tensor3('h0')
147 | reset = T.iscalar('reset')
148 |
149 | input_sequences = sequences[:, :-FRAME_SIZE]
150 | target_sequences = sequences[:, FRAME_SIZE:]
151 |
152 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset)
153 |
154 | prev_samples = sequences[:, :-1]
155 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1))
156 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid')
157 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE))
158 |
159 | sample_level_outputs = sample_level_predictor(
160 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)),
161 | prev_samples
162 | )
163 |
164 | cost = T.nnet.categorical_crossentropy(
165 | T.nnet.softmax(sample_level_outputs),
166 | target_sequences.flatten()
167 | ).mean()
168 |
169 | # By default we report cross-entropy cost in bits.
170 | # Switch to nats by commenting out this line:
171 | cost = cost * lib.floatX(1.44269504089)
172 |
173 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
174 | lib._train.print_params_info(cost, params)
175 |
176 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
177 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
178 |
179 | updates = lasagne.updates.adam(grads, params, learning_rate=1e-3)
180 |
181 | train_fn = theano.function(
182 | [sequences, h0, reset],
183 | [cost, new_h0],
184 | updates=updates,
185 | on_unused_input='warn'
186 | )
187 |
188 | frame_level_generate_fn = theano.function(
189 | [sequences, h0, reset],
190 | frame_level_rnn(sequences, h0, reset),
191 | on_unused_input='warn'
192 | )
193 |
194 | frame_level_outputs = T.matrix('frame_level_outputs')
195 | prev_samples = T.imatrix('prev_samples')
196 | sample_level_generate_fn = theano.function(
197 | [frame_level_outputs, prev_samples],
198 | lib.ops.softmax_and_sample(
199 | sample_level_predictor(
200 | frame_level_outputs,
201 | prev_samples
202 | )
203 | ),
204 | on_unused_input='warn'
205 | )
206 |
207 | def generate_and_save_samples(tag):
208 |
209 | def write_audio_file(name, data):
210 | data = data.astype('float32')
211 | data -= data.min()
212 | data /= data.max()
213 | data -= 0.5
214 | data *= 0.95
215 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
216 |
217 |     # Generate 10 sample files, each 5 seconds long
218 | N_SEQS = 10
219 | LENGTH = 5*BITRATE
220 |
221 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
222 | samples[:, :FRAME_SIZE] = Q_ZERO
223 |
224 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
225 | frame_level_outputs = None
226 |
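    | # Two-tier sampling schedule: the frame-level RNN runs once every FRAME_SIZE
    | # samples to produce FRAME_SIZE conditioning vectors, and the sample-level
    | # predictor then emits one sample per step, indexed by t % FRAME_SIZE.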
227 | for t in xrange(FRAME_SIZE, LENGTH):
228 |
229 | if t % FRAME_SIZE == 0:
230 | frame_level_outputs, h0 = frame_level_generate_fn(
231 | samples[:, t-FRAME_SIZE:t],
232 | h0,
233 | numpy.int32(t == FRAME_SIZE)
234 | )
235 |
236 | samples[:, t] = sample_level_generate_fn(
237 | frame_level_outputs[:, t % FRAME_SIZE],
238 | samples[:, t-FRAME_SIZE:t]
239 | )
240 |
241 | for i in xrange(N_SEQS):
242 | write_audio_file("sample_{}_{}".format(tag, i), samples[i])
243 |
244 | print "Training!"
245 | total_iters = 0
246 | total_time = 0.
247 | last_print_time = 0.
248 | last_print_iters = 0
249 | for epoch in itertools.count():
250 |
251 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
252 | costs = []
253 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)
254 |
255 | for seqs, reset in data_feeder:
256 |
257 | start_time = time.time()
258 | cost, h0 = train_fn(seqs, h0, reset)
259 | total_time += time.time() - start_time
260 | total_iters += 1
261 |
262 | costs.append(cost)
263 |
264 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
265 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
266 |
267 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
268 | epoch,
269 | total_iters,
270 | numpy.mean(costs),
271 | total_time,
272 | total_time / total_iters
273 | )
274 | tag = "iters{}_time{}".format(total_iters, total_time)
275 | generate_and_save_samples(tag)
276 | lib.save_params('params_{}.pkl'.format(tag))
277 |
278 | costs = []
279 | last_print_time += PRINT_TIME
280 | last_print_iters += PRINT_ITERS
281 |
282 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
283 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
284 |
285 | print "Done!"
286 |
287 | try: # This only matters on Ishaan's computer
288 | import experiment_tools
289 | experiment_tools.send_sms("done!")
290 | except ImportError:
291 | pass
292 |
293 | sys.exit()
--------------------------------------------------------------------------------
/two_tier_v.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.register_crash_notifier()
11 | experiment_tools.wait_for_gpu(high_priority=False)
12 | except ImportError:
13 | pass
14 |
15 | import numpy
16 | numpy.random.seed(123)
17 | import random
18 | random.seed(123)
19 |
20 | import dataset
21 |
22 | import theano
23 | import theano.tensor as T
24 | import theano.tensor.nnet.neighbours
25 | import theano.ifelse
26 | import lib
27 | import lasagne
28 | import scipy.io.wavfile
29 |
30 | import time
31 | import functools
32 | import itertools
33 |
34 | # Hyperparams
35 | BATCH_SIZE = 128
36 | N_FRAMES = 128 # How many 'frames' to include in each truncated BPTT pass
37 | FRAME_SIZE = 16 # How many samples per frame
38 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
39 | N_GRUS = 1 # How many GRUs to stack in the frame-level model
40 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
41 | GRAD_CLIP = 1 # Elementwise grad clip threshold
42 |
43 | # Dataset
44 | DATA_PATH = '/media/seagate/blizzard/parts'
45 | N_FILES = 141703
46 | # DATA_PATH = '/PersimmonData/kiwi_parts'
47 | # N_FILES = 516
48 | BITRATE = 16000
49 |
50 | # Other constants
51 | TRAIN_MODE = 'time' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
52 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations.
53 | STOP_ITERS = 100000 # Stop after this many iterations
54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
55 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
56 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
57 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence
58 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
59 |
60 | print "Model settings:"
61 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
62 | all_vars = sorted(all_vars, key=lambda x: x[0])
63 | for var_name, var_value in all_vars:
64 | print "\t{}: {}".format(var_name, var_value)
65 |
66 | def frame_level_rnn(input_sequences, h0, reset):
67 | """
68 | input_sequences.shape: (batch size, n frames * FRAME_SIZE)
69 | h0.shape: (batch size, N_GRUS, DIM)
70 | reset.shape: ()
71 | output.shape: (batch size, n frames * FRAME_SIZE, DIM)
72 | """
73 |
74 | learned_h0 = lib.param(
75 | 'FrameLevel.h0',
76 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
77 | )
78 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
79 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
80 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
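    | # At the start of a new sequence (reset == 1) the carried-over hidden state
    | # is replaced by a learned initial state; otherwise truncated BPTT continues
    | # from the h0 passed in.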
81 |
82 | frames = input_sequences.reshape((
83 | input_sequences.shape[0],
84 | input_sequences.shape[1] / FRAME_SIZE,
85 | FRAME_SIZE
86 | ))
87 |
88 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
89 | # (a reasonable range to pass as inputs to the RNN)
90 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
91 | frames *= lib.floatX(2)
92 |
93 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
94 | grus = [gru0]
95 | for i in xrange(1, N_GRUS):
96 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
97 | grus.append(gru)
98 |
99 | output = lib.ops.Linear(
100 | 'FrameLevel.Output',
101 | DIM,
102 | FRAME_SIZE * DIM,
103 | grus[-1],
104 | initialization='he'
105 | )
106 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM))
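    | # Each frame's DIM-dim GRU output is projected to FRAME_SIZE separate
    | # DIM-dim vectors, one per sample-level timestep (the frame-level tier is
    | # upsampled by FRAME_SIZE).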
107 |
108 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
109 |
110 | return (output, last_hidden)
111 |
112 | def sample_level_predictor(frame_level_outputs, prev_samples):
113 | """
114 | frame_level_outputs.shape: (batch size, DIM)
115 | prev_samples.shape: (batch size, FRAME_SIZE)
116 | output.shape: (batch size, Q_LEVELS)
117 | """
118 |
119 | prev_samples = lib.ops.Embedding(
120 | 'SampleLevel.Embedding',
121 | Q_LEVELS,
122 | Q_LEVELS,
123 | prev_samples
124 | ).reshape((-1, FRAME_SIZE * Q_LEVELS))
125 |
126 | out = lib.ops.Linear(
127 | 'SampleLevel.L1_PrevSamples',
128 | FRAME_SIZE * Q_LEVELS,
129 | DIM,
130 | prev_samples,
131 | biases=False,
132 | initialization='he'
133 | )
134 | out += frame_level_outputs
135 | out = T.nnet.relu(out)
136 |
137 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he')
138 | out = T.nnet.relu(out)
139 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he')
140 | out = T.nnet.relu(out)
141 |
142 | # We apply the softmax later
143 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out)
144 |
145 | sequences = T.imatrix('sequences')
146 | h0 = T.tensor3('h0')
147 | reset = T.iscalar('reset')
148 |
149 | input_sequences = sequences[:, :-FRAME_SIZE]
150 | target_sequences = sequences[:, FRAME_SIZE:]
151 |
152 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset)
153 |
154 | prev_samples = sequences[:, :-1]
155 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1))
156 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid')
157 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE))
158 |
159 | sample_level_outputs = sample_level_predictor(
160 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)),
161 | prev_samples
162 | )
163 |
164 | cost = T.nnet.categorical_crossentropy(
165 | T.nnet.softmax(sample_level_outputs),
166 | target_sequences.flatten()
167 | ).mean()
168 |
169 | # By default we report cross-entropy cost in bits.
170 | # Switch to nats by commenting out this line:
171 | cost = cost * lib.floatX(1.44269504089)
172 |
173 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
174 | lib._train.print_params_info(cost, params)
175 |
176 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
177 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
178 |
179 | updates = lasagne.updates.adam(grads, params, learning_rate=1e-3)
180 |
181 | train_fn = theano.function(
182 | [sequences, h0, reset],
183 | [cost, new_h0],
184 | updates=updates,
185 | on_unused_input='warn'
186 | )
187 |
188 | frame_level_generate_fn = theano.function(
189 | [sequences, h0, reset],
190 | frame_level_rnn(sequences, h0, reset),
191 | on_unused_input='warn'
192 | )
193 |
194 | frame_level_outputs = T.matrix('frame_level_outputs')
195 | prev_samples = T.imatrix('prev_samples')
196 | sample_level_generate_fn = theano.function(
197 | [frame_level_outputs, prev_samples],
198 | lib.ops.softmax_and_sample(
199 | sample_level_predictor(
200 | frame_level_outputs,
201 | prev_samples
202 | )
203 | ),
204 | on_unused_input='warn'
205 | )
206 |
207 | def generate_and_save_samples(tag):
208 |
209 | def write_audio_file(name, data):
210 | data = data.astype('float32')
211 | data -= data.min()
212 | data /= data.max()
213 | data -= 0.5
214 | data *= 0.95
215 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
216 |
217 | # Generate 10 sample files, each 5 seconds long
218 | N_SEQS = 10
219 | LENGTH = 5*BITRATE
220 |
221 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
222 | samples[:, :FRAME_SIZE] = Q_ZERO
223 |
224 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
225 | frame_level_outputs = None
226 |
227 | for t in xrange(FRAME_SIZE, LENGTH):
228 |
229 | if t % FRAME_SIZE == 0:
230 | frame_level_outputs, h0 = frame_level_generate_fn(
231 | samples[:, t-FRAME_SIZE:t],
232 | h0,
233 | numpy.int32(t == FRAME_SIZE)
234 | )
235 |
236 | samples[:, t] = sample_level_generate_fn(
237 | frame_level_outputs[:, t % FRAME_SIZE],
238 | samples[:, t-FRAME_SIZE:t]
239 | )
240 |
241 | for i in xrange(N_SEQS):
242 | write_audio_file("sample_{}_{}".format(tag, i), samples[i])
243 |
244 | print "Training!"
245 | total_iters = 0
246 | total_time = 0.
247 | last_print_time = 0.
248 | last_print_iters = 0
249 | for epoch in itertools.count():
250 |
251 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
252 | costs = []
253 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)
254 |
255 | for seqs, reset in data_feeder:
256 |
257 | start_time = time.time()
258 | cost, h0 = train_fn(seqs, h0, reset)
259 | total_time += time.time() - start_time
260 | total_iters += 1
261 |
262 | costs.append(cost)
263 |
264 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
265 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
266 |
267 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
268 | epoch,
269 | total_iters,
270 | numpy.mean(costs),
271 | total_time,
272 | total_time / total_iters
273 | )
274 | tag = "iters{}_time{}".format(total_iters, total_time)
275 | generate_and_save_samples(tag)
276 | lib.save_params('params_{}.pkl'.format(tag))
277 |
278 | costs = []
279 | last_print_time += PRINT_TIME
280 | last_print_iters += PRINT_ITERS
281 |
282 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
283 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
284 |
285 | print "Done!"
286 |
287 | try: # This only matters on Ishaan's computer
288 | import experiment_tools
289 | experiment_tools.send_sms("done!")
290 | except ImportError:
291 | pass
292 |
293 | sys.exit()
--------------------------------------------------------------------------------
/conv.py:
--------------------------------------------------------------------------------
1 | """
2 | Convolutional Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.register_crash_notifier()
11 | experiment_tools.wait_for_gpu(high_priority=True)
12 | except ImportError:
13 | pass
14 |
15 | import numpy
16 | numpy.random.seed(123)
17 | import random
18 | random.seed(123)
19 |
20 | import dataset
21 |
22 | import theano
23 | import theano.tensor as T
24 | import theano.ifelse
25 | import lib
26 | import lasagne
27 | import scipy.io.wavfile
28 |
29 | import time
30 | import functools
31 | import itertools
32 |
33 | # Hyperparams
34 | BATCH_SIZE = 128
35 | SEQ_LEN = 256
36 | DIM = 128
37 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
38 | GRAD_CLIP = 1
39 |
40 | LAYERS = 5
41 | FILTER_SIZE = 17
42 |
43 | # Dataset
44 | DATA_PATH = '/media/seagate/blizzard/parts'
45 | N_FILES = 141703
46 | # DATA_PATH = '/PersimmonData/kiwi_parts'
47 | # N_FILES = 516
48 | BITRATE = 16000
49 |
50 | # Other constants
51 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
52 | PRINT_ITERS = 10 # Print cost, generate samples, save model checkpoint every N iterations.
53 | STOP_ITERS = 1000 # Stop after this many iterations
54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
55 | STOP_TIME = 60*60*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
56 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
57 |
58 | print "Model settings:"
59 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
60 | all_vars = sorted(all_vars, key=lambda x: x[0])
61 | for var_name, var_value in all_vars:
62 | print "\t{}: {}".format(var_name, var_value)
63 |
64 | def MaskedConv1D(name, input_dim, output_dim, filter_size, inputs, mask_type=None, he_init=False):
65 | """
66 | inputs.shape: (batch size, input_dim, 1, width)
67 | mask_type: None, 'a', 'b'
68 | output.shape: (batch size, output_dim, 1, width)
69 | """
70 |
71 | if mask_type is not None:
72 | mask = numpy.ones(
73 | (output_dim, input_dim, 1, filter_size),
74 | dtype=theano.config.floatX
75 | )
76 | center = filter_size//2
77 | mask[:,:,0,center+1:] = 0.
78 | if mask_type == 'a':
79 | mask[:,:,0,center] = 0.
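    | # PixelCNN-style causal masking: all taps right of center are zeroed, so
    | # output t never sees inputs beyond t. Type 'a' (used for the input layer)
    | # also zeroes the center tap so output t cannot see input t; type 'b' keeps it.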
80 |
81 | def uniform(stdev, size):
82 | """uniform distribution with the given stdev and size"""
83 | return numpy.random.uniform(
84 | low=-stdev * numpy.sqrt(3),
85 | high=stdev * numpy.sqrt(3),
86 | size=size
87 | ).astype(theano.config.floatX)
88 |
89 | if mask_type=='a':
90 | n_in = filter_size//2
91 | elif mask_type=='b':
92 | n_in = filter_size//2 + 1
93 | else:
94 | n_in = filter_size
95 | n_in *= input_dim
96 |
97 | if he_init:
98 | init_stdev = numpy.sqrt(2./n_in)
99 | else:
100 | init_stdev = numpy.sqrt(1./n_in)
101 |
102 | filters = lib.param(
103 | name+'.Filters',
104 | uniform(
105 | init_stdev,
106 | (output_dim, input_dim, 1, filter_size)
107 | )
108 | )
109 |
110 | if mask_type is not None:
111 | filters = filters * mask
112 |
113 | # TODO benchmark against the lasagne 'conv1d' implementations
114 | result = T.nnet.conv2d(inputs, filters, filter_flip=False, border_mode='half')
115 |
116 | if mask_type is not None:
117 | result = result[:, :, :, :inputs.shape[3]]
118 |
119 | biases = lib.param(
120 | name+'.Biases',
121 | numpy.zeros(output_dim, dtype=theano.config.floatX)
122 | )
123 | result += biases[None, :, None, None]
124 |
125 | return result
126 |
127 | def Conv1D(name, input_dim, output_dim, filter_size, inputs, mask_type=None, he_init=False):
128 | """
129 | inputs.shape: (batch size, input_dim, 1, width)
130 | mask_type: None, 'a', 'b'
131 | output.shape: (batch size, output_dim, 1, width)
132 | """
133 |
134 | # if mask_type is not None:
135 | # mask = numpy.ones(
136 | # (output_dim, input_dim, 1, filter_size),
137 | # dtype=theano.config.floatX
138 | # )
139 | # center = filter_size//2
140 | # mask[:,:,0,center+1:] = 0.
141 | # if mask_type == 'a':
142 | # mask[:,:,0,center] = 0.
143 |
144 | if mask_type=='a':
145 | filter_size = filter_size//2
146 | elif mask_type=='b':
147 | filter_size = filter_size//2 + 1
148 |
149 | def uniform(stdev, size):
150 | """uniform distribution with the given stdev and size"""
151 | return numpy.random.uniform(
152 | low=-stdev * numpy.sqrt(3),
153 | high=stdev * numpy.sqrt(3),
154 | size=size
155 | ).astype(theano.config.floatX)
156 |
157 | # if mask_type is not None:
158 | # n_in = numpy.sum(mask)
159 | # else:
160 | n_in = input_dim * filter_size
161 |
162 | if he_init:
163 | init_stdev = numpy.sqrt(2./n_in)
164 | else:
165 | init_stdev = numpy.sqrt(1./n_in)
166 |
167 | filters = lib.param(
168 | name+'.Filters',
169 | uniform(
170 | init_stdev,
171 | (output_dim, input_dim, 1, filter_size)
172 | )
173 | )
174 |
175 | # if mask_type is not None:
176 | # filters = filters * mask
177 |
178 | if mask_type=='a':
179 | pad = filter_size
180 | elif mask_type=='b':
181 | pad = filter_size-1
182 | else:
183 | # border mode 'half'
184 | pad = filter_size//2
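    | # With filter_flip=False and this much left padding, output t covers inputs
    | # [t - filter_size, t - 1] for type 'a' (strictly causal) and
    | # [t - filter_size + 1, t] for type 'b'; the slice below trims the extra
    | # right-hand outputs.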
185 |
186 | # TODO benchmark against the lasagne 'conv1d' implementations
187 | result = T.nnet.conv2d(inputs, filters, filter_flip=False, border_mode=(0,pad))
188 |
189 | if mask_type is not None:
190 | result = result[:, :, :, :inputs.shape[3]]
191 |
192 | biases = lib.param(
193 | name+'.Biases',
194 | numpy.zeros(output_dim, dtype=theano.config.floatX)
195 | )
196 | result += biases[None, :, None, None]
197 |
198 | return result
199 |
200 | sequences = T.imatrix('sequences')
201 |
202 | INPUT_DIM = Q_LEVELS
203 | inputs = lib.ops.Embedding('Embedding', Q_LEVELS, Q_LEVELS, sequences)
204 | inputs = inputs.dimshuffle(0, 2, 'x', 1)
205 |
206 | # INPUT_DIM = 1
207 | # inputs = lib.floatX(4)*sequences.astype('float32')/lib.floatX(Q_LEVELS) - lib.floatX(2)
208 | # inputs = inputs[:, None, None, :]
209 |
210 | output = MaskedConv1D('InputConv', INPUT_DIM, DIM, FILTER_SIZE, inputs, mask_type='a', he_init=True)
211 | output = T.nnet.relu(output)
212 |
213 | for i in xrange(1,LAYERS):
214 | output = MaskedConv1D('Conv'+str(i), DIM, DIM, FILTER_SIZE, output, mask_type='b', he_init=True)
215 | output = T.nnet.relu(output)
216 |
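    | # Receptive field: the type-'a' input layer sees FILTER_SIZE//2 past samples
    | # and each type-'b' layer reaches FILTER_SIZE//2 further back, so with
    | # LAYERS=5 and FILTER_SIZE=17 each output is conditioned on roughly the
    | # previous 40 samples.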
217 | output = MaskedConv1D('OutputConv', DIM, Q_LEVELS, 1, output, mask_type='b')
218 |
219 | output = output.dimshuffle(0,2,3,1) # Move the Q_LEVELS dim to the end
220 | cost = T.nnet.categorical_crossentropy(
221 | T.nnet.softmax(output.reshape((-1, Q_LEVELS))),
222 | sequences.flatten()
223 | ).mean()
224 |
225 | # By default we report cross-entropy cost in bits.
226 | # Switch to nats by commenting out this line:
227 | cost = cost * lib.floatX(1.44269504089)
228 |
229 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
230 | lib._train.print_params_info(cost, params)
231 |
232 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
233 | # Do people use grad clipping in convnets?
234 | # grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
235 |
236 | updates = lasagne.updates.adam(grads, params)
237 |
238 | train_fn = theano.function(
239 | [sequences],
240 | cost,
241 | updates=updates,
242 | on_unused_input='warn'
243 | )
244 |
245 | # generate_outputs, generate_new_h0 = sample_level_rnn(sequences, h0, reset)
246 | # generate_fn = theano.function(
247 | # [sequences, h0, reset],
248 | # [lib.ops.softmax_and_sample(generate_outputs), generate_new_h0],
249 | # on_unused_input='warn'
250 | # )
251 |
252 | # def generate_and_save_samples(tag):
253 |
254 | # def write_audio_file(name, data):
255 | # data = data.astype('float32')
256 | # data -= data.min()
257 | # data /= data.max()
258 | # data -= 0.5
259 | # data *= 0.95
260 | # scipy.io.wavfile.write(name+'.wav', BITRATE, data)
261 |
262 | # # Generate 10 sample files, each 5 seconds long
263 | # N_SEQS = 10
264 | # LENGTH = 5*BITRATE
265 |
266 | # samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
267 | # samples[:, 0] = Q_ZERO
268 |
269 | # h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
270 |
271 | # for t in xrange(1, LENGTH):
272 | # samples[:, t:t+1], h0 = generate_fn(
273 | # samples[:, t-1:t],
274 | # h0,
275 | # numpy.int32(t == 1)
276 | # )
277 |
278 | # for i in xrange(N_SEQS):
279 | # write_audio_file("sample_{}_{}".format(tag, i), samples[i])
280 |
281 | print "Training!"
282 | total_iters = 0
283 | total_time = 0.
284 | last_print_time = 0.
285 | last_print_iters = 0
286 | for epoch in itertools.count():
287 |
288 | costs = []
289 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, 0, Q_LEVELS, Q_ZERO)
290 |
291 | for seqs, reset in data_feeder:
292 |
293 | start_time = time.time()
294 | cost = train_fn(seqs)
295 | total_time += time.time() - start_time
296 | total_iters += 1
297 |
298 | costs.append(cost)
299 |
300 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
301 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
302 |
303 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
304 | epoch,
305 | total_iters,
306 | numpy.mean(costs),
307 | total_time,
308 | total_time / total_iters
309 | )
310 | tag = "iters{}_time{}".format(total_iters, total_time)
311 |
312 | # generate_and_save_samples(tag)
313 | # lib.save_params('params_{}.pkl'.format(tag))
314 |
315 | costs = []
316 | last_print_time += PRINT_TIME
317 | last_print_iters += PRINT_ITERS
318 |
319 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
320 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
321 |
322 | print "Done!"
323 |
324 | try: # This only matters on Ishaan's computer
325 | import experiment_tools
326 | experiment_tools.send_sms("done!")
327 | except ImportError:
328 | pass
329 |
330 | sys.exit()
--------------------------------------------------------------------------------
/two_tier_conv.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.register_crash_notifier()
11 | experiment_tools.wait_for_gpu(high_priority=False)
12 | except ImportError:
13 | pass
14 |
15 | import numpy
16 | numpy.random.seed(123)
17 | import random
18 | random.seed(123)
19 |
20 | import dataset
21 |
22 | import theano
23 | import theano.tensor as T
24 | import theano.tensor.nnet.neighbours
25 | import theano.ifelse
26 | import lib
27 | import lasagne
28 | import scipy.io.wavfile
29 |
30 | import time
31 | import functools
32 | import itertools
33 |
34 | # Hyperparams
35 | BATCH_SIZE = 128
36 | N_FRAMES = 128 # How many 'frames' to include in each truncated BPTT pass
37 | FRAME_SIZE = 2 # How many samples per frame
38 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
39 | N_GRUS = 1 # How many GRUs to stack in the frame-level model
40 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
41 | GRAD_CLIP = 1 # Elementwise grad clip threshold
42 |
43 | # Dataset
44 | DATA_PATH = '/media/seagate/blizzard/parts'
45 | N_FILES = 141703
46 | # DATA_PATH = '/PersimmonData/kiwi_parts'
47 | # N_FILES = 516
48 | BITRATE = 16000
49 |
50 | # Other constants
51 | TRAIN_MODE = 'time' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
52 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations.
53 | STOP_ITERS = 100000 # Stop after this many iterations
54 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
55 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
56 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
57 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence
58 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
59 |
60 | print "Model settings:"
61 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
62 | all_vars = sorted(all_vars, key=lambda x: x[0])
63 | for var_name, var_value in all_vars:
64 | print "\t{}: {}".format(var_name, var_value)
65 |
66 | def MaskedConv1D(name, input_dim, output_dim, filter_size, inputs, mask_type=None, he_init=False):
67 | """
68 | inputs.shape: (batch size, input_dim, 1, width)
69 | mask_type: None, 'a', 'b'
70 | output.shape: (batch size, output_dim, 1, width)
71 | """
72 |
73 | if mask_type is not None:
74 | mask = numpy.ones(
75 | (output_dim, input_dim, 1, filter_size),
76 | dtype=theano.config.floatX
77 | )
78 | center = filter_size//2
79 | mask[:,:,0,center+1:] = 0.
80 | if mask_type == 'a':
81 | mask[:,:,0,center] = 0.
82 |
83 | def uniform(stdev, size):
84 | """uniform distribution with the given stdev and size"""
85 | return numpy.random.uniform(
86 | low=-stdev * numpy.sqrt(3),
87 | high=stdev * numpy.sqrt(3),
88 | size=size
89 | ).astype(theano.config.floatX)
90 |
91 | if mask_type=='a':
92 | n_in = filter_size//2
93 | elif mask_type=='b':
94 | n_in = filter_size//2 + 1
95 | else:
96 | n_in = filter_size
97 | n_in *= input_dim
98 |
99 | if he_init:
100 | init_stdev = numpy.sqrt(2./n_in)
101 | else:
102 | init_stdev = numpy.sqrt(1./n_in)
103 |
104 | filters = lib.param(
105 | name+'.Filters',
106 | uniform(
107 | init_stdev,
108 | (output_dim, input_dim, 1, filter_size)
109 | )
110 | )
111 |
112 | if mask_type is not None:
113 | filters = filters * mask
114 |
115 | # TODO benchmark against the lasagne 'conv1d' implementations
116 | result = T.nnet.conv2d(inputs, filters, filter_flip=False, border_mode='half')
117 |
118 | if mask_type is not None:
119 | result = result[:, :, :, :inputs.shape[3]]
120 |
121 | biases = lib.param(
122 | name+'.Biases',
123 | numpy.zeros(output_dim, dtype=theano.config.floatX)
124 | )
125 | result += biases[None, :, None, None]
126 |
127 | return result
128 |
129 | def frame_level_rnn(input_sequences, h0, reset):
130 | """
131 | input_sequences.shape: (batch size, n frames * FRAME_SIZE)
132 | h0.shape: (batch size, N_GRUS, DIM)
133 | reset.shape: ()
134 | output.shape: (batch size, n frames * FRAME_SIZE, DIM)
135 | """
136 |
137 | learned_h0 = lib.param(
138 | 'FrameLevel.h0',
139 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
140 | )
141 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
142 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
143 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
144 |
145 | frames = input_sequences.reshape((
146 | input_sequences.shape[0],
147 | input_sequences.shape[1] / FRAME_SIZE,
148 | FRAME_SIZE
149 | ))
150 |
151 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
152 | # (a reasonable range to pass as inputs to the RNN)
153 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
154 | frames *= lib.floatX(2)
155 |
156 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
157 | grus = [gru0]
158 | for i in xrange(1, N_GRUS):
159 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
160 | grus.append(gru)
161 |
162 | output = lib.ops.Linear(
163 | 'FrameLevel.Output',
164 | DIM,
165 | FRAME_SIZE * DIM,
166 | grus[-1],
167 | initialization='he'
168 | )
169 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM))
170 |
171 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
172 |
173 | return (output, last_hidden)
174 |
175 | def sample_level_predictor(frame_level_outputs, prev_samples):
176 | """
177 | frame_level_outputs.shape: (batch size, DIM)
178 | prev_samples.shape: (batch size, FRAME_SIZE)
179 | output.shape: (batch size, Q_LEVELS)
180 | """
181 |
182 | prev_samples = lib.ops.Embedding(
183 | 'SampleLevel.Embedding',
184 | Q_LEVELS,
185 | Q_LEVELS,
186 | prev_samples
187 | ).reshape((-1, FRAME_SIZE * Q_LEVELS))
188 |
189 | out = lib.ops.Linear(
190 | 'SampleLevel.L1_PrevSamples',
191 | FRAME_SIZE * Q_LEVELS,
192 | DIM,
193 | prev_samples,
194 | biases=False,
195 | initialization='he'
196 | )
197 | out += frame_level_outputs
198 | out = T.nnet.relu(out)
199 |
200 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he')
201 | out = T.nnet.relu(out)
202 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he')
203 | out = T.nnet.relu(out)
204 |
205 | # We apply the softmax later
206 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out)
207 |
208 | sequences = T.imatrix('sequences')
209 | h0 = T.tensor3('h0')
210 | reset = T.iscalar('reset')
211 |
212 | input_sequences = sequences[:, :-FRAME_SIZE]
213 | target_sequences = sequences[:, FRAME_SIZE:]
214 |
215 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset)
216 |
217 | prev_samples = sequences[:, :-1]
218 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1))
219 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid')
220 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE))
221 |
222 | sample_level_outputs = sample_level_predictor(
223 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)),
224 | prev_samples
225 | )
226 |
227 | cost = T.nnet.categorical_crossentropy(
228 | T.nnet.softmax(sample_level_outputs),
229 | target_sequences.flatten()
230 | ).mean()
231 |
232 | # By default we report cross-entropy cost in bits.
233 | # Switch to nats by commenting out this line:
234 | cost = cost * lib.floatX(1.44269504089)
235 |
236 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
237 | lib._train.print_params_info(cost, params)
238 |
239 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
240 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
241 |
242 | updates = lasagne.updates.adam(grads, params, learning_rate=1e-3)
243 |
244 | train_fn = theano.function(
245 | [sequences, h0, reset],
246 | [cost, new_h0],
247 | updates=updates,
248 | on_unused_input='warn'
249 | )
250 |
251 | frame_level_generate_fn = theano.function(
252 | [sequences, h0, reset],
253 | frame_level_rnn(sequences, h0, reset),
254 | on_unused_input='warn'
255 | )
256 |
257 | frame_level_outputs = T.matrix('frame_level_outputs')
258 | prev_samples = T.imatrix('prev_samples')
259 | sample_level_generate_fn = theano.function(
260 | [frame_level_outputs, prev_samples],
261 | lib.ops.softmax_and_sample(
262 | sample_level_predictor(
263 | frame_level_outputs,
264 | prev_samples
265 | )
266 | ),
267 | on_unused_input='warn'
268 | )
269 |
270 | def generate_and_save_samples(tag):
271 |
272 | def write_audio_file(name, data):
273 | data = data.astype('float32')
274 | data -= data.min()
275 | data /= data.max()
276 | data -= 0.5
277 | data *= 0.95
278 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
279 |
280 | # Generate 10 sample files, each 5 seconds long
281 | N_SEQS = 10
282 | LENGTH = 5*BITRATE
283 |
284 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
285 | samples[:, :FRAME_SIZE] = Q_ZERO
286 |
287 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
288 | frame_level_outputs = None
289 |
290 | for t in xrange(FRAME_SIZE, LENGTH):
291 |
292 | if t % FRAME_SIZE == 0:
293 | frame_level_outputs, h0 = frame_level_generate_fn(
294 | samples[:, t-FRAME_SIZE:t],
295 | h0,
296 | numpy.int32(t == FRAME_SIZE)
297 | )
298 |
299 | samples[:, t] = sample_level_generate_fn(
300 | frame_level_outputs[:, t % FRAME_SIZE],
301 | samples[:, t-FRAME_SIZE:t]
302 | )
303 |
304 | for i in xrange(N_SEQS):
305 | write_audio_file("sample_{}_{}".format(tag, i), samples[i])
306 |
307 | print "Training!"
308 | total_iters = 0
309 | total_time = 0.
310 | last_print_time = 0.
311 | last_print_iters = 0
312 | for epoch in itertools.count():
313 |
314 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
315 | costs = []
316 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)
317 |
318 | for seqs, reset in data_feeder:
319 |
320 | start_time = time.time()
321 | cost, h0 = train_fn(seqs, h0, reset)
322 | total_time += time.time() - start_time
323 | total_iters += 1
324 |
325 | costs.append(cost)
326 |
327 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
328 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
329 |
330 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
331 | epoch,
332 | total_iters,
333 | numpy.mean(costs),
334 | total_time,
335 | total_time / total_iters
336 | )
337 | tag = "iters{}_time{}".format(total_iters, total_time)
338 | generate_and_save_samples(tag)
339 | lib.save_params('params_{}.pkl'.format(tag))
340 |
341 | costs = []
342 | last_print_time += PRINT_TIME
343 | last_print_iters += PRINT_ITERS
344 |
345 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
346 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
347 |
348 | print "Done!"
349 |
350 | try: # This only matters on Ishaan's computer
351 | import experiment_tools
352 | experiment_tools.send_sms("done!")
353 | except ImportError:
354 | pass
355 |
356 | sys.exit()
--------------------------------------------------------------------------------
/vrnn.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.wait_for_gpu(high_priority=False)
11 | except ImportError:
12 | pass
13 |
14 | import numpy
15 | numpy.random.seed(123)
16 | import random
17 | random.seed(123)
18 |
19 | import dataset
20 |
21 | import theano
22 | import theano.tensor as T
23 | import theano.ifelse
24 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
25 | import lib
26 | import lasagne
27 | import scipy.io.wavfile
28 |
29 | import time
30 | import functools
31 | import itertools
32 |
33 | theano_srng = RandomStreams(seed=234)
34 |
35 | # Hyperparams
36 | BATCH_SIZE = 128
37 | FRAME_SIZE = 16
38 | N_FRAMES = (32*16)/FRAME_SIZE
39 | SEQ_LEN = FRAME_SIZE*N_FRAMES # How many audio samples to include in each truncated BPTT pass
40 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
41 | LATENT_DIM = 128
42 | N_GRUS = 2
43 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
44 | GRAD_CLIP = 1 # Elementwise grad clip threshold
45 |
46 | VANILLA = False
47 |
48 | ALPHA_ITERS = 10000
49 |
50 | # Dataset
51 | DATA_PATH = '/media/seagate/blizzard/parts'
52 | N_FILES = 141703
53 | # DATA_PATH = '/PersimmonData/kiwi_parts'
54 | # N_FILES = 516
55 | BITRATE = 16000
56 |
57 | # Other constants
58 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
59 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations.
60 | STOP_ITERS = 100000 # Stop after this many iterations
61 | GENERATE_SAMPLES_AND_SAVE_PARAMS = True
62 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
63 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
64 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
65 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
66 | SAMPLE_LEN = 5*BITRATE
67 | # SAMPLE_LEN = 1024
68 |
69 | print "Model settings:"
70 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
71 | all_vars = sorted(all_vars, key=lambda x: x[0])
72 | for var_name, var_value in all_vars:
73 | print "\t{}: {}".format(var_name, var_value)
74 |
75 | def Layer(name, n_in, n_out, inputs):
76 | output = lib.ops.Linear(name, n_in, n_out, inputs, initialization='he')
77 | output = T.nnet.relu(output)
78 | return output
79 |
80 | def MLP(name, n_in, n_out, inputs):
81 | output = Layer(name+'.1', n_in, DIM, inputs)
82 | output = Layer(name+'.2', DIM, DIM, output)
83 | output = Layer(name+'.3', DIM, DIM, output)
84 | output = lib.ops.Linear(name+'.Output', DIM, n_out, output)
85 | return output
86 |
87 | def FrameProcessor(frames):
88 | """
89 | frames.shape: (batch size, n frames, FRAME_SIZE)
90 | output.shape: (batch size, n frames, DIM)
91 | """
92 |
93 | embedded = lib.ops.Embedding('FrameEmbedding', Q_LEVELS, Q_LEVELS, frames)
94 | embedded = embedded.reshape((frames.shape[0], frames.shape[1], Q_LEVELS * FRAME_SIZE))
95 | output = MLP('FrameProcessor', FRAME_SIZE*Q_LEVELS, DIM, embedded)
96 | return output
97 |
98 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
99 | # frames *= lib.floatX(2)
100 | # output = MLP('FrameProcessor', FRAME_SIZE, DIM, frames)
101 | # return output
102 |
103 | def LatentsProcessor(latents):
104 | """
105 | latents.shape: (batch size, n frames, LATENT_DIM)
106 | output.shape: (batch size, n frames, DIM)
107 | """
108 | return MLP('LatentsProcessor', LATENT_DIM, DIM, latents)
109 |
110 | def Prior(contexts):
111 | """
112 | contexts.shape: (batch size, n frames, DIM)
113 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM)
114 | """
115 | mu_and_log_sigma = MLP('Prior', DIM, 2*LATENT_DIM, contexts)
116 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:]
117 |
118 | def Encoder(processed_frames, contexts):
119 | """
120 | processed_frames.shape: (batch size, n frames, DIM)
121 | contexts.shape: (batch size, n frames, DIM)
122 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM)
123 | """
124 | inputs = T.concatenate([
125 | processed_frames,
126 | contexts
127 | ], axis=2)
128 | mu_and_log_sigma = MLP('Encoder', 2*DIM, 2*LATENT_DIM, inputs)
129 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:]
130 |
131 | def Decoder(latents, contexts, prevs=None):
132 | """
133 | latents.shape: (batch size, n frames, LATENT_DIM)
134 | contexts.shape: (batch size, n frames, DIM)
135 | prevs.shape: (batch size, n frames * FRAME_SIZE); currently unused, so callers may omit it
136 | outputs: (batch size, n frames, FRAME_SIZE, Q_LEVELS)
137 | """
138 | inputs = T.concatenate([
139 | LatentsProcessor(latents),
140 | contexts
141 | ], axis=2)
142 | output = MLP('Decoder', 2*DIM, FRAME_SIZE*Q_LEVELS, inputs)
143 | return output.reshape((output.shape[0], output.shape[1], FRAME_SIZE, Q_LEVELS))
144 |
145 | def Recurrence(processed_frames, h0, reset):
146 | """
147 | processed_frames.shape: (batch size, n frames, DIM)
148 | h0.shape: (batch size, N_GRUS, DIM)
149 | reset.shape: ()
150 | output.shape: (batch size, n frames, DIM)
151 | """
152 |
153 | # print "warning no recurrence"
154 | # return T.zeros_like(processed_frames), h0
155 |
156 | learned_h0 = lib.param(
157 | 'Recurrence.h0',
158 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
159 | )
160 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
161 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
162 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
163 |
164 | gru0 = lib.ops.LowMemGRU('Recurrence.GRU0', DIM, DIM, processed_frames, h0=h0[:, 0])
165 | grus = [gru0]
166 | for i in xrange(1, N_GRUS):
167 | gru = lib.ops.LowMemGRU('Recurrence.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
168 | grus.append(gru)
169 |
170 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
171 |
172 | return (grus[-1], last_hidden)
173 |
174 |
175 | sequences = T.imatrix('sequences')
176 | h0 = T.tensor3('h0')
177 | reset = T.iscalar('reset')
178 |
179 | frames = sequences.reshape((sequences.shape[0], -1, FRAME_SIZE))
180 | processed_frames = FrameProcessor(frames)
181 |
182 | contexts, new_h0 = Recurrence(processed_frames[:,:-1], h0, reset)
183 |
184 | mu_prior, log_sigma_prior = Prior(contexts)
185 | mu_post, log_sigma_post = Encoder(processed_frames[:,1:], contexts)
186 |
187 | # log_sigma_prior = T.log(T.nnet.softplus(log_sigma_prior))
188 | # log_sigma_post = T.log(T.nnet.softplus(log_sigma_post))
189 |
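    | # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I), which
    | # keeps the sampled latents differentiable w.r.t. mu_post and log_sigma_post.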
190 | eps = theano_srng.normal(mu_post.shape).astype('float32')
191 | latents = mu_post
192 | if not VANILLA:
193 | latents += (T.exp(log_sigma_post) * eps)
194 | else:
195 | print "warning no latent noise"
196 |
197 | reconstructions = Decoder(latents, contexts, sequences[:, FRAME_SIZE-1:-1])
198 |
199 | reconst_cost = T.nnet.categorical_crossentropy(
200 | T.nnet.softmax(reconstructions.reshape((-1, Q_LEVELS))),
201 | frames[:,1:].flatten()
202 | ).mean()
203 | reconst_cost.name = 'reconst_cost'
204 |
205 |
206 | def KLGaussianGaussian(mu1, sig1, mu2, sig2):
207 | """
208 | (adapted from https://github.com/jych/cle)
209 | mu1, sig1 = posterior mu and *log* sigma
210 | mu2, sig2 = prior mu and *log* sigma
211 | """
212 | # Elementwise closed form:
    | #   KL(N(mu1, e^{2*sig1}) || N(mu2, e^{2*sig2}))
    | #     = sig2 - sig1 + (e^{2*sig1} + (mu1 - mu2)^2) / (2 * e^{2*sig2}) - 1/2
213 | kl = 0.5 * (2*sig2 - 2*sig1 + (T.exp(2*sig1) + (mu1 - mu2)**2) / T.exp(2*sig2) - 1)
214 | return kl
215 |
216 | reg_cost = KLGaussianGaussian(
217 | mu_post,
218 | log_sigma_post,
219 | mu_prior,
220 | log_sigma_prior
221 | )
222 | reg_cost = reg_cost.sum() / T.cast(frames[:,1:].flatten().shape[0], 'float32')
223 |
224 | # By default we report cross-entropy cost in bits.
225 | # Switch to nats by commenting out this line:
226 | reg_cost = reg_cost * lib.floatX(1.44269504089)
227 | reconst_cost = reconst_cost * lib.floatX(1.44269504089)
228 |
229 | alpha = T.scalar('alpha')
230 | cost = reconst_cost
231 | if not VANILLA:
232 | cost += (alpha * reg_cost)
233 |
234 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
235 | lib._train.print_params_info(cost, params)
236 |
237 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
238 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
239 |
240 | updates = lasagne.updates.adam(grads, params)
241 |
242 | train_fn = theano.function(
243 | [sequences, h0, reset, alpha],
244 | [reg_cost, reconst_cost, cost, new_h0],
245 | updates=updates,
246 | on_unused_input='warn'
247 | )
248 |
249 | gen_fn_contexts, gen_fn_new_h0 = Recurrence(processed_frames, h0, reset)
250 | gen_recurrence_fn = theano.function(
251 | [sequences, h0, reset],
252 | [gen_fn_contexts, gen_fn_new_h0],
253 | on_unused_input='warn'
254 | )
255 |
256 | gen_vae_fn = theano.function(
257 | [contexts],
258 | lib.ops.softmax_and_sample(
259 | Decoder(
260 | mu_prior + theano_srng.normal(mu_prior.shape).astype('float32') * T.exp(log_sigma_prior),
261 | contexts
262 | )
263 | ),
264 | on_unused_input='warn'
265 | )
266 |
267 | def generate_and_save_samples(tag):
268 |
269 | def write_audio_file(name, data):
270 | data = data.astype('float32')
271 | data -= data.min()
272 | data /= data.max()
273 | data -= 0.5
274 | data *= 0.95
275 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
276 |
277 | # Generate 10 sample files, each 5 seconds long
278 | N_SEQS = 10
279 | LENGTH = SAMPLE_LEN - (SAMPLE_LEN%FRAME_SIZE)
280 |
281 | samples = numpy.zeros((N_SEQS, LENGTH/FRAME_SIZE, FRAME_SIZE), dtype='int32')
282 | samples[:, 0] = Q_ZERO
283 |
284 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
285 | contexts, h0 = gen_recurrence_fn(samples[:,0], h0, numpy.int32(1))
286 |
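    | # Ancestral sampling: run the recurrence on the last generated frame to get
    | # the next context, then draw the next frame from the prior-conditioned decoder.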
287 | for frame_i in xrange(1, LENGTH/FRAME_SIZE):
288 | samples[:,frame_i:frame_i+1] = gen_vae_fn(contexts)
289 | contexts, h0 = gen_recurrence_fn(samples[:,frame_i], h0, numpy.int32(0))
290 |
291 | for i in xrange(N_SEQS):
292 | write_audio_file("sample_{}_{}".format(tag, i), samples[i].reshape((-1)))
293 |
294 | print "Training!"
295 | total_iters = 0
296 | total_time = 0.
297 | last_print_time = 0.
298 | last_print_iters = 0
299 | reg_costs = []
300 | reconst_costs = []
301 | costs = []
302 | for epoch in itertools.count():
303 |
304 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
305 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)
306 |
307 | def sigmoid(x):
308 | return 1 / (1 + numpy.exp(-x))
309 |
310 | for seqs, reset in data_feeder:
311 |
312 | # alpha = lib.floatX(sigmoid((total_iters - ALPHA_B)/float(ALPHA_A)))
313 | # if alpha > 0.99:
314 | # alpha = lib.floatX(1)
315 | # if alpha < 1e-5:
316 | # alpha = lib.floatX(1e-5)
317 |
318 | # alpha = lib.floatX(0)
319 |
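    | # KL warm-up: the weight on the KL term ramps linearly from 0 to 1 over the
    | # first ALPHA_ITERS iterations, a common trick to keep the model from
    | # ignoring its latents early in training.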
320 | alpha = lib.floatX(float(total_iters) / ALPHA_ITERS)
321 | if alpha > 1:
322 | alpha = lib.floatX(1)
323 |
324 | start_time = time.time()
325 | reg_cost, reconst_cost, cost, h0 = train_fn(seqs, h0, reset, alpha)
326 | total_time += time.time() - start_time
327 | total_iters += 1
328 |
329 | reg_costs.append(reg_cost)
330 | reconst_costs.append(reconst_cost)
331 | costs.append(cost)
332 |
333 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
334 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
335 |
336 | print "epoch:{}\ttotal iters:{}\talpha:{}\treg:{}\treconst:{}\tfull:{}\ttotal time:{}\ttime per iter:{}".format(
337 | epoch,
338 | total_iters,
339 | alpha,
340 | numpy.mean(reg_costs),
341 | numpy.mean(reconst_costs),
342 | numpy.mean(costs),
343 | total_time,
344 | total_time / total_iters
345 | )
346 | tag = "iters{}_time{}".format(total_iters, total_time)
347 |
348 | if GENERATE_SAMPLES_AND_SAVE_PARAMS:
349 | generate_and_save_samples(tag)
350 | lib.save_params('params_{}.pkl'.format(tag))
351 |
352 | reg_costs = []
353 | reconst_costs = []
354 | costs = []
355 | last_print_time += PRINT_TIME
356 | last_print_iters += PRINT_ITERS
357 |
358 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
359 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
360 |
361 | print "Done!"
362 |
363 | try: # This only matters on Ishaan's computer
364 | import experiment_tools
365 | experiment_tools.send_sms("done!")
366 | except ImportError:
367 | pass
368 |
369 | sys.exit()
--------------------------------------------------------------------------------
/vrnn_ar.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 | import os, sys
6 | sys.path.append(os.getcwd())
7 |
8 | try: # This only matters on Ishaan's computer
9 | import experiment_tools
10 | experiment_tools.wait_for_gpu(high_priority=False)
11 | except ImportError:
12 | pass
13 |
14 | import numpy
15 | numpy.random.seed(123)
16 | import random
17 | random.seed(123)
18 |
19 | import dataset
20 |
21 | import theano
22 | import theano.tensor as T
23 | import theano.ifelse
24 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
25 | import lib
26 | import lasagne
27 | import scipy.io.wavfile
28 |
29 | import time
30 | import functools
31 | import itertools
32 |
33 | theano_srng = RandomStreams(seed=234)
34 |
35 | # Hyperparams
36 | BATCH_SIZE = 128
37 | FRAME_SIZE = 16
38 | N_FRAMES = (32*16)/FRAME_SIZE
39 | SEQ_LEN = FRAME_SIZE*N_FRAMES # How many audio samples to include in each truncated BPTT pass
40 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
41 | LATENT_DIM = 128
42 | N_GRUS = 2
43 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
44 | GRAD_CLIP = 1 # Elementwise grad clip threshold
45 |
46 | VANILLA = False
47 |
48 | ALPHA_ITERS = 10000
49 |
50 | # Dataset
51 | DATA_PATH = '/media/seagate/blizzard/parts'
52 | N_FILES = 141703
53 | # DATA_PATH = '/PersimmonData/kiwi_parts'
54 | # N_FILES = 516
55 | BITRATE = 16000
56 |
57 | # Other constants
58 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
59 | PRINT_ITERS = 1000 # Print cost, generate samples, save model checkpoint every N iterations.
60 | STOP_ITERS = 100000 # Stop after this many iterations
61 | GENERATE_SAMPLES_AND_SAVE_PARAMS = True
62 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
63 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
64 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
65 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
66 | SAMPLE_LEN = 5*BITRATE
67 | # SAMPLE_LEN = 1024
68 |
69 | print "Model settings:"
70 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
71 | all_vars = sorted(all_vars, key=lambda x: x[0])
72 | for var_name, var_value in all_vars:
73 | print "\t{}: {}".format(var_name, var_value)
74 |
75 | def Layer(name, n_in, n_out, inputs):
76 | output = lib.ops.Linear(name, n_in, n_out, inputs, initialization='he')
77 | output = T.nnet.relu(output)
78 | return output
79 |
80 | def MLP(name, n_in, n_out, inputs):
81 | output = Layer(name+'.1', n_in, DIM, inputs)
82 | output = Layer(name+'.2', DIM, DIM, output)
83 | output = Layer(name+'.3', DIM, DIM, output)
84 | output = lib.ops.Linear(name+'.Output', DIM, n_out, output)
85 | return output
86 |
87 | def FrameProcessor(frames):
88 | """
89 | frames.shape: (batch size, n frames, FRAME_SIZE)
90 | output.shape: (batch size, n frames, DIM)
91 | """
92 |
93 | embedded = lib.ops.Embedding('FrameEmbedding', Q_LEVELS, Q_LEVELS, frames)
94 | embedded = embedded.reshape((frames.shape[0], frames.shape[1], Q_LEVELS * FRAME_SIZE))
95 | output = MLP('FrameProcessor', FRAME_SIZE*Q_LEVELS, DIM, embedded)
96 | return output
97 |
98 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
99 | # frames *= lib.floatX(2)
100 | # output = MLP('FrameProcessor', FRAME_SIZE, DIM, frames)
101 | # return output
102 |
103 | def LatentsProcessor(latents):
104 | """
105 | latents.shape: (batch size, n frames, LATENT_DIM)
106 | output.shape: (batch size, n frames, DIM)
107 | """
108 | return MLP('LatentsProcessor', LATENT_DIM, DIM, latents)
109 |
110 | def Prior(contexts):
111 | """
112 | contexts.shape: (batch size, n frames, DIM)
113 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM)
114 | """
115 | mu_and_log_sigma = MLP('Prior', DIM, 2*LATENT_DIM, contexts)
116 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:]
117 |
118 | def Encoder(processed_frames, contexts):
119 | """
120 | processed_frames.shape: (batch size, n frames, DIM)
121 | contexts.shape: (batch size, n frames, DIM)
122 | outputs: (mu, log_sigma), each with shape (batch size, n frames, LATENT_DIM)
123 | """
124 | inputs = T.concatenate([
125 | processed_frames,
126 | contexts
127 | ], axis=2)
128 | mu_and_log_sigma = MLP('Encoder', 2*DIM, 2*LATENT_DIM, inputs)
129 | return mu_and_log_sigma[:,:,:LATENT_DIM], mu_and_log_sigma[:,:,LATENT_DIM:]
130 |
131 | def Decoder(latents, contexts, prevs=None):
132 | """
133 | latents.shape: (batch size, n frames, LATENT_DIM)
134 | contexts.shape: (batch size, n frames, DIM)
135 | prevs.shape: (batch size, n frames * FRAME_SIZE); currently unused, so callers may omit it
136 | outputs: (batch size, n frames, FRAME_SIZE, Q_LEVELS)
137 | """
138 | inputs = T.concatenate([
139 | LatentsProcessor(latents),
140 | contexts
141 | ], axis=2)
142 | output = MLP('Decoder', 2*DIM, FRAME_SIZE*Q_LEVELS, inputs)
143 | return output.reshape((output.shape[0], output.shape[1], FRAME_SIZE, Q_LEVELS))
144 |
145 | def Recurrence(processed_frames, h0, reset):
146 | """
147 | processed_frames.shape: (batch size, n frames, DIM)
148 | h0.shape: (batch size, N_GRUS, DIM)
149 | reset.shape: ()
150 | output.shape: (batch size, n frames, DIM)
151 | """
152 |
153 | # print "warning no recurrence"
154 | # return T.zeros_like(processed_frames), h0
155 |
156 | learned_h0 = lib.param(
157 | 'Recurrence.h0',
158 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
159 | )
160 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
161 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
162 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
163 |
164 | gru0 = lib.ops.LowMemGRU('Recurrence.GRU0', DIM, DIM, processed_frames, h0=h0[:, 0])
165 | grus = [gru0]
166 | for i in xrange(1, N_GRUS):
167 | gru = lib.ops.LowMemGRU('Recurrence.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
168 | grus.append(gru)
169 |
170 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
171 |
172 | return (grus[-1], last_hidden)
173 |
174 |
175 | sequences = T.imatrix('sequences')
176 | h0 = T.tensor3('h0')
177 | reset = T.iscalar('reset')
178 |
179 | frames = sequences.reshape((sequences.shape[0], -1, FRAME_SIZE))
180 | processed_frames = FrameProcessor(frames)
181 |
182 | contexts, new_h0 = Recurrence(processed_frames[:,:-1], h0, reset)
183 |
184 | mu_prior, log_sigma_prior = Prior(contexts)
185 | mu_post, log_sigma_post = Encoder(processed_frames[:,1:], contexts)
186 |
187 | # log_sigma_prior = T.log(T.nnet.softplus(log_sigma_prior))
188 | # log_sigma_post = T.log(T.nnet.softplus(log_sigma_post))
189 |
190 | eps = theano_srng.normal(mu_post.shape).astype('float32')
191 | latents = mu_post
192 | if not VANILLA:
193 | latents += (T.exp(log_sigma_post) * eps)
194 | else:
195 | print "warning no latent noise"
196 |
197 | reconstructions = Decoder(latents, contexts, sequences[:, FRAME_SIZE-1:-1])
198 |
199 | reconst_cost = T.nnet.categorical_crossentropy(
200 | T.nnet.softmax(reconstructions.reshape((-1, Q_LEVELS))),
201 | frames[:,1:].flatten()
202 | ).mean()
203 | reconst_cost.name = 'reconst_cost'
204 |
205 |
206 | def KLGaussianGaussian(mu1, sig1, mu2, sig2):
207 | """
208 | (adapted from https://github.com/jych/cle)
209 | mu1, sig1 = posterior mu and *log* sigma
210 | mu2, sig2 = prior mu and *log* sigma
211 | """
212 | # Elementwise closed form:
    | #   KL(N(mu1, e^{2*sig1}) || N(mu2, e^{2*sig2}))
    | #     = sig2 - sig1 + (e^{2*sig1} + (mu1 - mu2)^2) / (2 * e^{2*sig2}) - 1/2
213 | kl = 0.5 * (2*sig2 - 2*sig1 + (T.exp(2*sig1) + (mu1 - mu2)**2) / T.exp(2*sig2) - 1)
214 | return kl
215 |
216 | reg_cost = KLGaussianGaussian(
217 | mu_post,
218 | log_sigma_post,
219 | mu_prior,
220 | log_sigma_prior
221 | )
222 | reg_cost = reg_cost.sum() / T.cast(frames[:,1:].flatten().shape[0], 'float32')
223 |
224 | # By default we report cross-entropy cost in bits.
225 | # Switch to nats by commenting out this line:
226 | reg_cost = reg_cost * lib.floatX(1.44269504089)
227 | reconst_cost = reconst_cost * lib.floatX(1.44269504089)
228 |
229 | alpha = T.scalar('alpha')
230 | cost = reconst_cost
231 | if not VANILLA:
232 | cost += (alpha * reg_cost)
233 |
234 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
235 | lib._train.print_params_info(cost, params)
236 |
237 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
238 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
239 |
240 | updates = lasagne.updates.adam(grads, params)
241 |
242 | train_fn = theano.function(
243 | [sequences, h0, reset, alpha],
244 | [reg_cost, reconst_cost, cost, new_h0],
245 | updates=updates,
246 | on_unused_input='warn'
247 | )
248 |
249 | gen_fn_contexts, gen_fn_new_h0 = Recurrence(processed_frames, h0, reset)
250 | gen_recurrence_fn = theano.function(
251 | [sequences, h0, reset],
252 | [gen_fn_contexts, gen_fn_new_h0],
253 | on_unused_input='warn'
254 | )
255 |
256 | gen_vae_fn = theano.function(
257 | [contexts],
258 | lib.ops.softmax_and_sample(
259 | Decoder(
260 | mu_prior + theano_srng.normal(mu_prior.shape).astype('float32') * T.exp(log_sigma_prior),
261 | contexts
262 | )
263 | ),
264 | on_unused_input='warn'
265 | )
266 |
267 | def generate_and_save_samples(tag):
268 |
269 | def write_audio_file(name, data):
270 | data = data.astype('float32')
271 | data -= data.min()
272 | data /= data.max()
273 | data -= 0.5
274 | data *= 0.95
275 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
276 |
277 | # Generate 10 sample files, each 5 seconds long
278 | N_SEQS = 10
279 | LENGTH = SAMPLE_LEN - (SAMPLE_LEN%FRAME_SIZE)
280 |
281 | samples = numpy.zeros((N_SEQS, LENGTH/FRAME_SIZE, FRAME_SIZE), dtype='int32')
282 | samples[:, 0] = Q_ZERO
283 |
284 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
285 | contexts, h0 = gen_recurrence_fn(samples[:,0], h0, numpy.int32(1))
286 |
287 | for frame_i in xrange(1, LENGTH/FRAME_SIZE):
288 | samples[:,frame_i:frame_i+1] = gen_vae_fn(contexts)
289 | contexts, h0 = gen_recurrence_fn(samples[:,frame_i], h0, numpy.int32(0))
290 |
291 | for i in xrange(N_SEQS):
292 | write_audio_file("sample_{}_{}".format(tag, i), samples[i].reshape((-1)))
293 |
294 | print "Training!"
295 | total_iters = 0
296 | total_time = 0.
297 | last_print_time = 0.
298 | last_print_iters = 0
299 | reg_costs = []
300 | reconst_costs = []
301 | costs = []
302 | for epoch in itertools.count():
303 |
304 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
305 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)
306 |
307 | def sigmoid(x): # only used by the commented-out alpha schedule below
308 | return 1 / (1 + numpy.exp(-x))
309 |
310 | for seqs, reset in data_feeder:
311 |
312 | # alpha = lib.floatX(sigmoid((total_iters - ALPHA_B)/float(ALPHA_A)))
313 | # if alpha > 0.99:
314 | # alpha = lib.floatX(1)
315 | # if alpha < 1e-5:
316 | # alpha = lib.floatX(1e-5)
317 |
318 | # alpha = lib.floatX(0)
319 |
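    | # Linear KL warm-up: alpha ramps from 0 to 1 over the first ALPHA_ITERS
    | # iterations, then stays at 1. Illustratively, with ALPHA_ITERS = 10000,
    | # iteration 2500 would give alpha = 0.25.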
320 | alpha = lib.floatX(float(total_iters) / ALPHA_ITERS)
321 | if alpha > 1:
322 | alpha = lib.floatX(1)
323 |
324 | start_time = time.time()
325 | reg_cost, reconst_cost, cost, h0 = train_fn(seqs, h0, reset, alpha)
326 | total_time += time.time() - start_time
327 | total_iters += 1
328 |
329 | reg_costs.append(reg_cost)
330 | reconst_costs.append(reconst_cost)
331 | costs.append(cost)
332 |
333 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
334 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
335 |
336 | print "epoch:{}\ttotal iters:{}\talpha:{}\treg:{}\treconst:{}\tfull:{}\ttotal time:{}\ttime per iter:{}".format(
337 | epoch,
338 | total_iters,
339 | alpha,
340 | numpy.mean(reg_costs),
341 | numpy.mean(reconst_costs),
342 | numpy.mean(costs),
343 | total_time,
344 | total_time / total_iters
345 | )
346 | tag = "iters{}_time{}".format(total_iters, total_time)
347 |
348 | if GENERATE_SAMPLES_AND_SAVE_PARAMS:
349 | generate_and_save_samples(tag)
350 | lib.save_params('params_{}.pkl'.format(tag))
351 |
352 | reg_costs = []
353 | reconst_costs = []
354 | costs = []
355 | last_print_time += PRINT_TIME
356 | last_print_iters += PRINT_ITERS
357 |
358 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
359 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
360 |
361 | print "Done!"
362 |
363 | try: # This only matters on Ishaan's computer
364 | import experiment_tools
365 | experiment_tools.send_sms("done!")
366 | except ImportError:
367 | pass
368 |
369 | sys.exit()
--------------------------------------------------------------------------------
/three_tier.py:
--------------------------------------------------------------------------------
1 | """
2 | Three-Tier RNN Speech Generation Model
3 | Ishaan Gulrajani
4 | """
5 |
6 | import os, sys
7 | sys.path.append(os.getcwd())
8 |
9 | try: # This only matters on Ishaan's computer
10 | import experiment_tools
11 | experiment_tools.register_crash_notifier()
12 | experiment_tools.wait_for_gpu(high_priority=False, debug=True)
13 | except ImportError:
14 | pass
15 |
16 | import numpy
17 | numpy.random.seed(123)
18 | import random
19 | random.seed(123)
20 |
21 | import dataset
22 |
23 | import theano
24 | import theano.tensor as T
25 | import theano.tensor.nnet.neighbours
26 | import theano.ifelse
27 | import lib
28 | import lasagne
29 | import scipy.io.wavfile
30 |
31 | import time
32 | import functools
33 | import itertools
34 |
35 | # Hyperparams
36 | BATCH_SIZE = 128
37 | SEQ_LEN = 512 # How many samples to include in each truncated BPTT pass
38 | PRE_SEQ_LEN = 1024 # like SEQ_LEN, but for the pretraining phase
39 | FRAME_SIZE = 2 # How many samples per frame
40 | N_GRUS = 1 # How many GRUs to stack in the frame-level model
41 | BIG_FRAME_SIZE = 8 # how many samples per big frame
42 | N_BIG_GRUS = 4 # how many GRUs to stack in the big-frame-level model
43 | assert(SEQ_LEN % BIG_FRAME_SIZE == 0)
44 | assert(BIG_FRAME_SIZE % FRAME_SIZE == 0)
45 | DIM = 1024 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
46 | BIG_DIM = 1024 # dimensionality for the slowest level
47 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
48 | GRAD_CLIP = 1 # Elementwise grad clip threshold
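   | # With the settings above, one truncated BPTT pass covers
   | # SEQ_LEN / FRAME_SIZE = 256 frames and SEQ_LEN / BIG_FRAME_SIZE = 64 big
   | # frames, so each big frame spans BIG_FRAME_SIZE / FRAME_SIZE = 4 frames.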
49 |
50 | # Dataset
51 | DATA_PATH = '/media/seagate/blizzard/parts'
52 | N_FILES = 141703
53 | # DATA_PATH = '/PersimmonData/kiwi_parts'
54 | # N_FILES = 516
55 | BITRATE = 16000
56 |
57 | # Other constants
58 | TEST_SET_SIZE = 128 # How many audio files to use for the test set
59 | N_FRAMES = SEQ_LEN / FRAME_SIZE # Number of frames in each truncated BPTT pass
60 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
61 |
62 | # Pretrain loop
63 | PRE_TRAIN_MODE = 'time' # only time supported right now
64 | PRE_PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
65 | PRE_STOP_TIME = 60*60*4 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
66 | PRE_PRINT_ITERS = 0 # unused while PRE_TRAIN_MODE == 'time'
67 | PRE_STOP_ITERS = 0 # unused while PRE_TRAIN_MODE == 'time'
68 |
69 | # in between "pretraining" and "fine tuning" (i.e. end-to-end) there's a period
70 | # where we only train the bottom levels, so that when we train end-to-end we
71 | # don't screw up the top levels with gradients from the random bottom levels
72 | # if PRE_STOP_TIME > 0:
73 | # TIME_BEFORE_FINETUNE = 60*60*1
74 | TIME_BEFORE_FINETUNE = 0
75 |
76 | # Train loop
77 | TRAIN_MODE = 'iters' # 'iters' to use PRINT_ITERS and STOP_ITERS, 'time' to use PRINT_TIME and STOP_TIME
78 | PRINT_ITERS = 1 # Print cost, generate samples, save model checkpoint every N iterations.
79 | STOP_ITERS = 100000 # Stop after this many iterations
80 | PRINT_TIME = 60*60 # Print cost, generate samples, save model checkpoint every N seconds.
81 | STOP_TIME = 60*60*12 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
82 |
83 | STOP_TIME -= PRE_STOP_TIME # pretraining time counts against the total training budget
84 |
85 | print "Model settings:"
86 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
87 | all_vars = sorted(all_vars, key=lambda x: x[0])
88 | for var_name, var_value in all_vars:
89 | print "\t{}: {}".format(var_name, var_value)
90 |
91 | def big_frame_level_rnn(input_sequences, h0, reset):
92 | """
93 | input_sequences.shape: (batch size, n big frames * BIG_FRAME_SIZE)
94 | h0.shape: (batch size, N_BIG_GRUS, BIG_DIM)
95 | reset.shape: ()
96 | output[0].shape: (batch size, n frames, DIM)
97 | output[1].shape: same as h0.shape
98 | output[2].shape: (batch size, seq len, Q_LEVELS)
99 | """
100 |
101 | learned_h0 = lib.param(
102 | 'BigFrameLevel.h0',
103 | numpy.zeros((N_BIG_GRUS, BIG_DIM), dtype=theano.config.floatX)
104 | )
105 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_GRUS, BIG_DIM)
106 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
107 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
108 |
109 | frames = input_sequences.reshape((
110 | input_sequences.shape[0],
111 | input_sequences.shape[1] / BIG_FRAME_SIZE,
112 | BIG_FRAME_SIZE
113 | ))
114 |
115 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
116 | # (a reasonable range to pass as inputs to the RNN)
117 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
118 | frames *= lib.floatX(2)
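    | # e.g. with Q_LEVELS = 256: level 0 -> -2.0, level 128 -> 0.0,
    | # level 255 -> +1.984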
119 |
120 | gru0 = lib.ops.LowMemGRU('BigFrameLevel.GRU0', BIG_FRAME_SIZE, BIG_DIM, frames, h0=h0[:, 0])
121 | grus = [gru0]
122 | for i in xrange(1, N_BIG_GRUS):
123 | gru = lib.ops.LowMemGRU('BigFrameLevel.GRU'+str(i), BIG_DIM, BIG_DIM, grus[-1], h0=h0[:, i])
124 | grus.append(gru)
125 |
126 | output = lib.ops.Linear(
127 | 'BigFrameLevel.Output',
128 | BIG_DIM,
129 | DIM * BIG_FRAME_SIZE / FRAME_SIZE,
130 | grus[-1]
131 | )
132 | output = output.reshape((output.shape[0], output.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM))
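    | # Each big-frame hidden state is linearly projected to
    | # BIG_FRAME_SIZE / FRAME_SIZE (= 4 here) conditioning vectors of size
    | # DIM, one for every frame-level step the big frame covers.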
133 |
134 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
135 |
136 | independent_preds = lib.ops.Linear(
137 | 'BigFrameLevel.IndependentPreds',
138 | BIG_DIM,
139 | Q_LEVELS * BIG_FRAME_SIZE,
140 | grus[-1]
141 | )
142 | independent_preds = independent_preds.reshape((independent_preds.shape[0], independent_preds.shape[1] * BIG_FRAME_SIZE, Q_LEVELS))
143 |
144 | return (output, last_hidden, independent_preds)
145 |
146 | def frame_level_rnn(input_sequences, other_input, h0, reset):
147 | """
148 | input_sequences.shape: (batch size, n frames * FRAME_SIZE)
149 | other_input.shape: (batch size, n frames, DIM)
150 | h0.shape: (batch size, N_GRUS, DIM)
151 | reset.shape: ()
152 | output.shape: (batch size, n frames * FRAME_SIZE, DIM)
153 | """
154 |
155 | learned_h0 = lib.param(
156 | 'FrameLevel.h0',
157 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
158 | )
159 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
160 | learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
161 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
162 |
163 | frames = input_sequences.reshape((
164 | input_sequences.shape[0],
165 | input_sequences.shape[1] / FRAME_SIZE,
166 | FRAME_SIZE
167 | ))
168 |
169 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
170 | # (a reasonable range to pass as inputs to the RNN)
171 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
172 | frames *= lib.floatX(2)
173 |
174 | gru_input = lib.ops.Linear('FrameLevel.InputExpand', FRAME_SIZE, DIM, frames) + other_input
175 |
176 | gru0 = lib.ops.LowMemGRU('FrameLevel.GRU0', DIM, DIM, gru_input, h0=h0[:, 0])
177 | grus = [gru0]
178 | for i in xrange(1, N_GRUS):
179 | gru = lib.ops.LowMemGRU('FrameLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
180 | grus.append(gru)
181 |
182 | output = lib.ops.Linear(
183 | 'FrameLevel.Output',
184 | DIM,
185 | FRAME_SIZE * DIM,
186 | grus[-1],
187 | initialization='he'
188 | )
189 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM))
190 |
191 | last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)
192 |
193 | return (output, last_hidden)
194 |
195 | def sample_level_predictor(frame_level_outputs, prev_samples):
196 | """
197 | frame_level_outputs.shape: (batch size, DIM)
198 | prev_samples.shape: (batch size, FRAME_SIZE)
199 | output.shape: (batch size, Q_LEVELS)
200 | """
201 |
202 | prev_samples = lib.ops.Embedding(
203 | 'SampleLevel.Embedding',
204 | Q_LEVELS,
205 | Q_LEVELS,
206 | prev_samples
207 | ).reshape((-1, FRAME_SIZE * Q_LEVELS))
208 |
209 | out = lib.ops.Linear(
210 | 'SampleLevel.L1_PrevSamples',
211 | FRAME_SIZE * Q_LEVELS,
212 | DIM,
213 | prev_samples,
214 | biases=False,
215 | initialization='he'
216 | )
217 | out += frame_level_outputs
218 | out = T.nnet.relu(out)
219 |
220 | out = lib.ops.Linear('SampleLevel.L2', DIM, DIM, out, initialization='he')
221 | out = T.nnet.relu(out)
222 | out = lib.ops.Linear('SampleLevel.L3', DIM, DIM, out, initialization='he')
223 | out = T.nnet.relu(out)
224 |
225 | # We apply the softmax later
226 | return lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out)
227 |
228 | sequences = T.imatrix('sequences')
229 | h0 = T.tensor3('h0')
230 | big_h0 = T.tensor3('big_h0')
231 | reset = T.iscalar('reset')
232 |
233 | big_input_sequences = sequences[:, :-BIG_FRAME_SIZE]
234 | input_sequences = sequences[:, BIG_FRAME_SIZE-FRAME_SIZE:-FRAME_SIZE]
235 | target_sequences = sequences[:, BIG_FRAME_SIZE:]
236 |
237 | big_frame_level_outputs, new_big_h0, big_frame_independent_preds = big_frame_level_rnn(big_input_sequences, big_h0, reset)
238 |
239 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)
240 |
241 | prev_samples = sequences[:, BIG_FRAME_SIZE-FRAME_SIZE:-1]
242 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1))
243 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid')
244 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE))
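    | # images2neibs with a (1, FRAME_SIZE) window and stride 1 extracts every
    | # overlapping run of FRAME_SIZE previous samples, one per predicted
    | # sample: SEQ_LEN windows per sequence, BATCH_SIZE * SEQ_LEN rows total.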
245 |
246 | sample_level_outputs = sample_level_predictor(
247 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)),
248 | prev_samples
249 | )
250 |
251 | cost = T.nnet.categorical_crossentropy(
252 | T.nnet.softmax(sample_level_outputs),
253 | target_sequences.flatten()
254 | ).mean()
255 |
256 | # By default we report cross-entropy cost in bits (1.44269504089 =
257 | # log2(e) converts nats to bits). Switch to nats by commenting out this line:
258 | cost = cost * lib.floatX(1.44269504089)
259 |
260 | ip_cost = lib.floatX(1.44269504089) * T.nnet.categorical_crossentropy(
261 | T.nnet.softmax(big_frame_independent_preds.reshape((-1, Q_LEVELS))),
262 | target_sequences.flatten()
263 | ).mean()
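    | # Auxiliary "independent predictions" cost: the top tier alone predicts
    | # every sample with no help from the lower tiers, which lets it be
    | # pretrained by itself (see the pretraining loop below).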
264 |
265 | all_params = lib.search(cost, lambda x: hasattr(x, 'param'))
266 | ip_params = lib.search(ip_cost, lambda x: hasattr(x, 'param') and 'BigFrameLevel' in x.name)
267 | other_params = [p for p in all_params if p not in ip_params]
268 | all_params = ip_params + other_params
269 | lib._train.print_params_info(ip_cost, ip_params)
270 | lib._train.print_params_info(cost, other_params)
271 | lib._train.print_params_info(cost, all_params)
272 |
273 | ip_grads = T.grad(ip_cost, wrt=ip_params, disconnected_inputs='warn')
274 | ip_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in ip_grads]
275 |
276 | other_grads = T.grad(cost, wrt=other_params, disconnected_inputs='warn')
277 | other_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in other_grads]
278 |
279 | grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn')
280 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
281 |
282 |
283 | ip_updates = lasagne.updates.adam(ip_grads, ip_params)
284 | other_updates = lasagne.updates.adam(other_grads, other_params)
285 | updates = lasagne.updates.adam(grads, all_params)
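    | # Three separate Adam update rules, one per training phase:
    | #   ip_updates    -- pretrain the top tier on ip_cost alone
    | #   other_updates -- train the lower tiers with the top tier fixed
    | #   updates       -- fine-tune everything end-to-end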
286 |
287 | ip_train_fn = theano.function(
288 | [sequences, big_h0, reset],
289 | [ip_cost, new_big_h0],
290 | updates=ip_updates,
291 | on_unused_input='warn'
292 | )
293 |
294 | other_train_fn = theano.function(
295 | [sequences, big_h0, h0, reset],
296 | [cost, new_big_h0, new_h0],
297 | updates=other_updates,
298 | on_unused_input='warn'
299 | )
300 |
301 | train_fn = theano.function(
302 | [sequences, big_h0, h0, reset],
303 | [cost, new_big_h0, new_h0],
304 | updates=updates,
305 | on_unused_input='warn'
306 | )
307 |
308 | big_frame_level_generate_fn = theano.function(
309 | [sequences, big_h0, reset],
310 | big_frame_level_rnn(sequences, big_h0, reset)[0:2],
311 | on_unused_input='warn'
312 | )
313 |
314 | big_frame_level_outputs = T.matrix('big_frame_level_outputs')
315 | frame_level_generate_fn = theano.function(
316 | [sequences, big_frame_level_outputs, h0, reset],
317 | frame_level_rnn(sequences, big_frame_level_outputs.dimshuffle(0,'x',1), h0, reset),
318 | on_unused_input='warn'
319 | )
320 |
321 | frame_level_outputs = T.matrix('frame_level_outputs')
322 | prev_samples = T.imatrix('prev_samples')
323 | sample_level_generate_fn = theano.function(
324 | [frame_level_outputs, prev_samples],
325 | lib.ops.softmax_and_sample(
326 | sample_level_predictor(
327 | frame_level_outputs,
328 | prev_samples
329 | )
330 | ),
331 | on_unused_input='warn'
332 | )
333 |
334 | def generate_and_save_samples(tag):
335 |
336 | def write_audio_file(name, data):
337 | data = data.astype('float32')
338 | data -= data.min()
339 | data /= data.max()
340 | data -= 0.5
341 | data *= 0.95
342 | scipy.io.wavfile.write(name+'.wav', BITRATE, data)
343 |
344 | # Generate 10 sample files, each 5 seconds long
345 | N_SEQS = 10
346 | LENGTH = 5*BITRATE
347 |
348 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
349 | samples[:, :BIG_FRAME_SIZE] = Q_ZERO
350 |
351 | big_h0 = numpy.zeros((N_SEQS, N_BIG_GRUS, BIG_DIM), dtype='float32')
352 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
353 | big_frame_level_outputs = None
354 | frame_level_outputs = None
355 |
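    | # Three clocks run in lockstep: the big tier fires every BIG_FRAME_SIZE
    | # samples, the frame tier every FRAME_SIZE samples, and the sample tier
    | # at every step. The index (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)
    | # selects which of the big tier's conditioning vectors applies to the
    | # current frame.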
356 | for t in xrange(BIG_FRAME_SIZE, LENGTH):
357 |
358 | if t % BIG_FRAME_SIZE == 0:
359 | big_frame_level_outputs, big_h0 = big_frame_level_generate_fn(
360 | samples[:, t-BIG_FRAME_SIZE:t],
361 | big_h0,
362 | numpy.int32(t == BIG_FRAME_SIZE)
363 | )
364 |
365 | if t % FRAME_SIZE == 0:
366 | frame_level_outputs, h0 = frame_level_generate_fn(
367 | samples[:, t-FRAME_SIZE:t],
368 | big_frame_level_outputs[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)],
369 | h0,
370 | numpy.int32(t == BIG_FRAME_SIZE)
371 | )
372 |
373 | samples[:, t] = sample_level_generate_fn(
374 | frame_level_outputs[:, t % FRAME_SIZE],
375 | samples[:, t-FRAME_SIZE:t]
376 | )
377 |
378 | for i in xrange(N_SEQS):
379 | write_audio_file("sample_{}_{}".format(tag, i), samples[i])
380 |
381 | if PRE_STOP_TIME > 0:
382 | print "Pretraining!"
383 | total_iters = 0
384 | total_time = 0.
385 | last_print_time = 0.
386 | last_print_iters = 0
387 | pretrain_finished = False
388 |
389 | for epoch in itertools.count():
390 | if pretrain_finished:
391 | break
392 |
393 | big_h0 = numpy.zeros((BATCH_SIZE, N_BIG_GRUS, BIG_DIM), dtype='float32')
394 | costs = []
395 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, PRE_SEQ_LEN, BIG_FRAME_SIZE, Q_LEVELS, Q_ZERO)
396 |
397 | for seqs, reset in data_feeder:
398 | if pretrain_finished:
399 | break
400 |
401 | start_time = time.time()
402 | cost, big_h0 = ip_train_fn(seqs, big_h0, reset)
403 | total_time += time.time() - start_time
404 | total_iters += 1
405 |
406 | costs.append(cost)
407 |
408 |
409 |
410 | if (PRE_TRAIN_MODE=='iters' and total_iters-last_print_iters == PRE_PRINT_ITERS) or \
411 | (PRE_TRAIN_MODE=='time' and total_time-last_print_time >= PRE_PRINT_TIME):
412 |
413 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
414 | epoch,
415 | total_iters,
416 | numpy.mean(costs),
417 | total_time,
418 | total_time / total_iters
419 | )
420 | tag = "iters{}_time{}".format(total_iters, total_time)
421 | lib.save_params('params_pretrain_{}.pkl'.format(tag))
422 |
423 | costs = []
424 | last_print_time += PRE_PRINT_TIME
425 | last_print_iters += PRE_PRINT_ITERS
426 |
427 | if (PRE_TRAIN_MODE=='iters' and total_iters == PRE_STOP_ITERS) or \
428 | (PRE_TRAIN_MODE=='time' and total_time >= PRE_STOP_TIME):
429 |
430 | print "Done!"
431 |
432 | pretrain_finished = True
433 |
434 | print "Training!"
435 | total_iters = 0
436 | total_time = 0.
437 | last_print_time = 0.
438 | last_print_iters = 0
439 | last_eigs = 0. # NOTE: appears unused
440 | finetune = False
441 | for epoch in itertools.count():
442 |
443 | big_h0 = numpy.zeros((BATCH_SIZE, N_BIG_GRUS, BIG_DIM), dtype='float32')
444 | h0 = numpy.zeros((BATCH_SIZE, N_GRUS, DIM), dtype='float32')
445 | costs = []
446 | data_feeder = dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, BIG_FRAME_SIZE, Q_LEVELS, Q_ZERO)
447 |
448 | for seqs, reset in data_feeder:
449 |
450 | if finetune:
451 | _train_fn = train_fn
452 | else:
453 | _train_fn = other_train_fn
454 |
455 | start_time = time.time()
456 | cost, big_h0, h0 = _train_fn(seqs, big_h0, h0, reset)
457 | total_time += time.time() - start_time
458 | total_iters += 1
459 |
460 | costs.append(cost)
461 |
462 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
463 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME):
464 |
465 | print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
466 | epoch,
467 | total_iters,
468 | numpy.mean(costs),
469 | total_time,
470 | total_time / total_iters
471 | )
472 | print "Warning not generating samples"
473 | # tag = "iters{}_time{}".format(total_iters, total_time)
474 | # generate_and_save_samples(tag)
475 | # lib.save_params('params_{}.pkl'.format(tag))
476 |
477 | if last_print_time <= TIME_BEFORE_FINETUNE <= last_print_time + PRINT_TIME:
478 | print "Switching to fine-tuning!"
479 | finetune = True
480 |
481 | costs = []
482 | last_print_time += PRINT_TIME
483 | last_print_iters += PRINT_ITERS
484 |
485 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
486 | (TRAIN_MODE=='time' and total_time >= STOP_TIME):
487 |
488 | print "Done!"
489 |
490 | try: # This only matters on Ishaan's computer
491 | import experiment_tools
492 | experiment_tools.send_sms("done!")
493 | except ImportError:
494 | pass
495 |
496 | sys.exit()
--------------------------------------------------------------------------------