├── eva
│   ├── __init__.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── shift_down.py
│   │   ├── feed_vertical.py
│   │   ├── padded_convolution2d.py
│   │   ├── residual_block.py
│   │   ├── causal_atrous_convolution1d.py
│   │   ├── gated_block.py
│   │   ├── out_channels.py
│   │   ├── wavenet_block.py
│   │   ├── gated_cnn.py
│   │   └── masked_convolution2d.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── pixelcnn.py
│   │   ├── gated_pixelcnn.py
│   │   └── wavenet.py
│   └── util
│       ├── __init__.py
│       ├── autil.py
│       └── mutil.py
├── keras_tf_atrous_bug.py
├── README.md
├── img_inference.py
├── img_train.py
├── .gitignore
├── audio_train.py
├── audio_inference.py
└── LICENSE

/eva/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/eva/layers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/eva/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/eva/util/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/eva/util/autil.py:
--------------------------------------------------------------------------------
import numpy as np

def to_pcm8(data):
    # Quantize floating-point samples down to 8-bit PCM (one byte per sample).
    return np.round(data).astype('uint8')
--------------------------------------------------------------------------------
/eva/layers/shift_down.py:
--------------------------------------------------------------------------------
from keras.layers import Lambda, ZeroPadding2D
import keras.backend as K


def ShiftDown(model):
    # Shift the feature map down by one row: pad a zero row on top,
    # then crop the bottom row so the output keeps the original height.
    shape = K.int_shape(model)[1]
    model = ZeroPadding2D(padding=(1, 0, 0, 0))(model)
    model = Lambda(lambda x: x[:, :shape, :, :])(model)
    return model
--------------------------------------------------------------------------------
/keras_tf_atrous_bug.py:
--------------------------------------------------------------------------------
# Minimal reproduction of a Keras/TensorFlow failure with large atrous rates.
import numpy as np
from keras.models import Model
from keras.layers import Input, AtrousConvolution1D

inp = Input((100, 1))
M = AtrousConvolution1D(1, 2, atrous_rate=25, border_mode='same')(inp)
M = Model(inp, M)
M.compile('sgd', 'mse')

M.train_on_batch(np.random.rand(1, 100, 1), np.random.rand(1, 100, 1))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
EVA
===========
An AI research project exploring autoregressive image and audio synthesis.

**Models**
- [x] PixelCNN
- [x] Gated PixelCNN
- [x] WaveNet

**Side tasks**
- [ ] Add argparse (it makes version control easier, but the code is less readable).
- [ ] Extract the functionality shared by the image and audio train/inference scripts.
- [ ] Debug the TF atrous-rate bug (see `keras_tf_atrous_bug.py`).
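
**Usage**

A minimal sampling sketch, mirroring `img_inference.py` (assumes trained weights in `model.h5`):

```python
from eva.models.gated_pixelcnn import GatedPixelCNN
from eva.util.mutil import generate

M = GatedPixelCNN((28, 28, 3), 126, 12, 1)  # input shape, filters, depth, latent dim
M.load_weights('model.h5')
batch = generate(M, latent=4, batch=10)     # pixel-by-pixel autoregressive sampling
```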
--------------------------------------------------------------------------------
/eva/layers/feed_vertical.py:
--------------------------------------------------------------------------------
from keras.layers import Convolution2D

from eva.layers.shift_down import ShiftDown


class FeedVertical(object):
    # Projects the vertical stack into the horizontal stack. The one-row shift
    # is what keeps the horizontal stack blind to the current row.
    def __init__(self, filters):
        self.filters = filters

    def __call__(self, model):
        model = ShiftDown(model)
        model = Convolution2D(2*self.filters, 1, 1, border_mode='valid')(model)
        return model
--------------------------------------------------------------------------------
/img_inference.py:
--------------------------------------------------------------------------------
#%% Setup.
from matplotlib import pyplot as plt

from tqdm import tqdm

from eva.models.gated_pixelcnn import GatedPixelCNN

from eva.util.mutil import generate

INPUTS = {
    'cifar10': (32, 32, 3),
    'mnist_rgb': (28, 28, 3),
    'mnist': (28, 28, 1)
}

#%% Model.
TYPE = 'mnist_rgb'
MODEL = GatedPixelCNN
FILTERS = 126
DEPTH = 12

#%% Generation.
BATCH = 10
LATENT = 4
DETERMINISTIC = False

#%% Parse model.
ARGS = (INPUTS[TYPE], FILTERS, DEPTH)
if MODEL == GatedPixelCNN and LATENT is not None:
    ARGS += (1,)

M = MODEL(*ARGS)
M.load_weights('model.h5')


#%% Choice (Probabilistic).
batch = generate(M, LATENT, BATCH, deterministic=DETERMINISTIC)
for pic in tqdm(batch):
    plt.imshow(pic, interpolation='nearest')
    plt.show(block=True)
--------------------------------------------------------------------------------
/eva/layers/padded_convolution2d.py:
--------------------------------------------------------------------------------
from keras.layers import Convolution2D, ZeroPadding2D


class PaddedConvolution2D(object):
    def __init__(self, filters, filter_size, stack, mask='B'):
        self.filters = filters
        self.filter_size = filter_size
        self.stack = stack
        self.mask = mask

    def __call__(self, model):
        if self.stack == 'vertical':
            # Pad the top by half the kernel height so every output row only
            # sees the rows above it, then convolve with the top half of the kernel.
            model = ZeroPadding2D(padding=(self.filter_size[0]//2, 0, self.filter_size[1]//2, self.filter_size[1]//2))(model)
            model = Convolution2D(2*self.filters, self.filter_size[0]//2+1, self.filter_size[1], border_mode='valid')(model)
        elif self.stack == 'horizontal':
            # Pad the left so every output column only sees pixels to its left;
            # mask 'A' excludes the current pixel, mask 'B' includes it.
            model = ZeroPadding2D(padding=(0, 0, self.filter_size[1]//2, 0))(model)
            if self.mask == 'A':
                model = Convolution2D(2*self.filters, 1, self.filter_size[1]//2, border_mode='valid')(model)
            else:
                model = Convolution2D(2*self.filters, 1, self.filter_size[1]//2+1, border_mode='valid')(model)

        return model
--------------------------------------------------------------------------------
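Note: the vertical-to-horizontal feed stays causal only because of the one-row shift. A minimal NumPy sketch of what ShiftDown does (pad one zero row on top, crop the bottom row back off), on a toy single-channel map:

    import numpy as np

    x = np.arange(16).reshape(4, 4)                            # toy 4x4 feature map
    shifted = np.pad(x, ((1, 0), (0, 0)), mode='constant')[:4, :]
    # Row i of the shifted map holds row i-1 of the original, so when it is fed
    # into the horizontal stack, pixel (i, j) never sees information from row i.
    assert (shifted[1:] == x[:-1]).all() and (shifted[0] == 0).all()
    print(shifted)
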
/eva/layers/residual_block.py:
--------------------------------------------------------------------------------
from keras.layers import Merge
from keras.layers.advanced_activations import PReLU

from eva.layers.masked_convolution2d import MaskedConvolution2D

class ResidualBlock(object):
    def __init__(self, filters):
        self.filters = filters

    def __call__(self, model):
        # 2h -> h
        block = PReLU()(model)
        block = MaskedConvolution2D(self.filters//2, 1, 1)(block)

        # h 3x3 -> h
        block = PReLU()(block)
        block = MaskedConvolution2D(self.filters//2, 3, 3, border_mode='same')(block)

        # h -> 2h
        block = PReLU()(block)
        block = MaskedConvolution2D(self.filters, 1, 1)(block)

        return Merge(mode='sum')([model, block])

class ResidualBlockList(object):
    def __init__(self, filters, depth):
        self.filters = filters
        self.depth = depth

    def __call__(self, model):
        for _ in range(self.depth):
            model = ResidualBlock(self.filters)(model)

        return model
--------------------------------------------------------------------------------
/eva/layers/causal_atrous_convolution1d.py:
--------------------------------------------------------------------------------
import keras.backend as K
from keras.layers import AtrousConvolution1D
from keras.utils.np_utils import conv_output_length

class CausalAtrousConvolution1D(AtrousConvolution1D):
    def __init__(self, *args, mask_type='B', **kwargs):
        super().__init__(*args, **kwargs)
        if self.border_mode != 'valid':
            raise ValueError("Causal mode dictates border_mode=valid.")
        # Left-pad by (kernel - 1) * dilation so output t only sees inputs <= t.
        self.length = self.atrous_rate * (self.filter_length - 1)

        # XXX debug.
        # if mask_type == 'A':
        #     self.length += 1

    def get_output_shape_for(self, input_shape):
        length = conv_output_length(input_shape[1] + self.length,
                                    self.filter_length,
                                    self.border_mode,
                                    self.subsample[0],
                                    dilation=self.atrous_rate)

        return (input_shape[0], length, self.nb_filter)

    def call(self, x, mask=None):
        return super().call(K.asymmetric_temporal_padding(x, self.length, 0), mask)
--------------------------------------------------------------------------------
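Note: a minimal NumPy sketch of the causal trick above: left-pad by atrous_rate * (filter_length - 1) and run a 'valid' dilated convolution, so output t only depends on inputs at or before t (toy values, kernel length 2):

    import numpy as np

    def causal_dilated_conv(x, w, rate):
        # Left-pad by rate * (len(w) - 1) zeros, then slide the dilated kernel.
        pad = rate * (len(w) - 1)
        x = np.concatenate([np.zeros(pad), x])
        return np.array([sum(w[k] * x[t + k * rate] for k in range(len(w)))
                         for t in range(len(x) - pad)])

    x = np.arange(1.0, 9.0)                                  # [1..8]
    y = causal_dilated_conv(x, np.array([1.0, 1.0]), rate=2)
    print(y)  # y[t] = x[t] + x[t-2]; the first two outputs see only zero padding
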
/img_train.py:
--------------------------------------------------------------------------------
#%% Setup.
import numpy as np

from keras.datasets import cifar10, mnist
from keras.utils.visualize_util import plot
from keras.callbacks import TensorBoard, ModelCheckpoint

from eva.models.pixelcnn import PixelCNN
from eva.models.gated_pixelcnn import GatedPixelCNN

from eva.util.mutil import clean_data

#%% Data.
DATASET = mnist

DATA, LABELS = clean_data(DATASET.load_data(), rgb=True, latent=True)

#%% Model.
MODEL = GatedPixelCNN
FILTERS = 126
DEPTH = 12

LOAD = False

ARGS = (DATA.shape[1:], FILTERS, DEPTH)
if MODEL == GatedPixelCNN and LABELS is not None:
    ARGS += (1,)

M = MODEL(*ARGS)
if LOAD:
    M.load_weights('model.h5')

M.summary()

plot(M)

#%% Train.
M.fit(DATA if LABELS is None else [DATA, LABELS],
      [(np.expand_dims(DATA[:, :, :, c].reshape(DATA.shape[0], DATA.shape[1]*DATA.shape[2]), -1)*255).astype(int) for c in range(DATA.shape[3])],
      batch_size=32, nb_epoch=200,
      verbose=1, callbacks=[TensorBoard(), ModelCheckpoint('model.h5', save_weights_only=True)])  # Save weights only; full-model serialization chokes on the custom layers.
--------------------------------------------------------------------------------
/eva/layers/gated_block.py:
--------------------------------------------------------------------------------
from keras.layers import Lambda, Dense, Reshape, Merge
import keras.backend as K


class GatedBlock(object):
    def __init__(self, filters, v=None, h=None, crop_right=False):
        self.filters = filters
        self.v = v
        self.h = h
        self.crop_right = crop_right

    def __call__(self, model):
        if self.crop_right:
            model = Lambda(lambda x: x[:, :, :K.int_shape(x)[2]-1, :])(model)

        if self.v is not None:
            model = Merge(mode='sum')([model, self.v])

        if self.h is not None:
            # Broadcast-add the conditioning vector over all spatial positions.
            hV = Dense(output_dim=2*self.filters)(self.h)
            hV = Reshape((1, 1, 2*self.filters))(hV)
            model = Lambda(lambda x: x[0] + x[1])([model, hV])

        # Gated activation: split the 2f feature maps channel-wise into a tanh
        # half and a sigmoid half, then multiply them.
        model_f = Lambda(lambda x: x[:, :, :, :self.filters])(model)
        model_g = Lambda(lambda x: x[:, :, :, self.filters:])(model)

        model_f = Lambda(lambda x: K.tanh(x))(model_f)
        model_g = Lambda(lambda x: K.sigmoid(x))(model_g)

        return Merge(mode='mul')([model_f, model_g])
--------------------------------------------------------------------------------
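Note: a NumPy sketch of the gated activation above, y = tanh(x_f) * sigmoid(x_g), where the 2f feature maps are split channel-wise into halves (toy shapes, f = 2):

    import numpy as np

    f = 2
    x = np.random.randn(1, 4, 4, 2 * f)                # (batch, rows, cols, 2f) feature maps
    x_f, x_g = x[..., :f], x[..., f:]                  # channel-wise split
    y = np.tanh(x_f) * (1.0 / (1.0 + np.exp(-x_g)))    # tanh gate modulated by sigmoid gate
    print(y.shape)                                     # (1, 4, 4, 2)
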
/eva/models/pixelcnn.py:
--------------------------------------------------------------------------------
from keras.models import Model
from keras.layers import Input
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Nadam

from eva.layers.residual_block import ResidualBlockList
from eva.layers.out_channels import OutChannels
from eva.layers.masked_convolution2d import MaskedConvolution2D

def PixelCNN(input_shape, filters, depth, build=True):
    height, width, channels = input_shape
    palette = 256  # TODO: Make the palette size configurable.

    input_img = Input(shape=input_shape, name=str(channels)+'_channels_'+str(palette)+'_palette')

    model = MaskedConvolution2D(filters, 7, 7, mask='A', border_mode='same', name='masked2d_A')(input_img)

    model = ResidualBlockList(filters, depth)(model)
    model = PReLU()(model)

    for _ in range(2):
        model = MaskedConvolution2D(filters, 1, 1, border_mode='valid')(model)
        model = PReLU()(model)

    outs = OutChannels(*input_shape, masked=True, palette=palette)(model)

    if build:
        model = Model(input=input_img, output=outs)
        model.compile(optimizer=Nadam(), loss='binary_crossentropy' if channels == 1 else 'sparse_categorical_crossentropy')

    return model
--------------------------------------------------------------------------------
/eva/layers/out_channels.py:
--------------------------------------------------------------------------------
from keras.layers import Convolution2D, Reshape, Lambda, Activation

from eva.layers.masked_convolution2d import MaskedConvolution2D

class OutChannels(object):
    def __init__(self, height, width, channels, masked=False, palette=256):
        self.height = height
        self.width = width
        self.channels = channels
        self.cxp = MaskedConvolution2D if masked else Convolution2D
        self.palette = palette

    def __call__(self, model):
        if self.channels == 1:
            outs = Convolution2D(1, 1, 1, border_mode='valid')(model)
            outs = Activation('sigmoid')(outs)
        else:
            out = self.cxp(self.palette*self.channels, 1, 1, border_mode='valid', name='channels_mult_palette')(model)
            out = Reshape((self.height, self.width, self.palette, self.channels), name='palette_channels')(out)

            outs = [None] * self.channels
            for i in range(self.channels):
                # i=i pins the loop variable at definition time instead of late-binding it.
                outs[i] = Lambda(lambda x, i=i: x[:, :, :, :, i], name='channel'+str(i)+'_extract')(out)
                outs[i] = Reshape((self.height * self.width, self.palette), name='hw_palette'+str(i))(outs[i])
                outs[i] = Activation('softmax', name='channel'+str(i))(outs[i])

        return outs
--------------------------------------------------------------------------------
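Note: for RGB inputs the head above emits one (height*width, 256) softmax per color channel, trained against sparse integer targets. A NumPy sketch of folding such outputs back into an image (toy 2x2 RGB case, greedy argmax as in the deterministic path of `generate`):

    import numpy as np

    h, w, palette, channels = 2, 2, 256, 3
    outs = [np.random.rand(1, h * w, palette) for _ in range(channels)]  # one softmax per channel
    img = np.zeros((h, w, channels))
    for c, out in enumerate(outs):
        img[:, :, c] = out[0].argmax(axis=-1).reshape(h, w) / 255.0      # back to [0, 1] pixels
    print(img.shape)                                                     # (2, 2, 3)
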
/eva/models/gated_pixelcnn.py:
--------------------------------------------------------------------------------
from keras.models import Model
from keras.layers import Input, Convolution2D
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Nadam

from eva.layers.gated_cnn import GatedCNN, GatedCNNs
from eva.layers.out_channels import OutChannels

def GatedPixelCNN(input_shape, filters, depth, latent=None, build=True):
    height, width, channels = input_shape
    palette = 256  # TODO: Make the palette size configurable.

    input_img = Input(shape=input_shape, name=str(channels)+'_channels_'+str(palette)+'_palette')

    latent_vector = None
    if latent is not None:
        latent_vector = Input(shape=(latent,), name='latent_vector')

    model = GatedCNNs(filters, depth, latent_vector)(*GatedCNN(filters, latent_vector)(input_img))

    for _ in range(2):
        model = Convolution2D(filters, 1, 1, border_mode='valid')(model)
        model = PReLU()(model)

    outs = OutChannels(*input_shape, masked=False, palette=palette)(model)

    if build:
        model = Model(input=[input_img, latent_vector] if latent is not None else input_img, output=outs)
        model.compile(optimizer=Nadam(), loss='binary_crossentropy' if channels == 1 else 'sparse_categorical_crossentropy')

    return model
--------------------------------------------------------------------------------
/eva/layers/wavenet_block.py:
--------------------------------------------------------------------------------
from keras.layers import Activation, Merge, Convolution1D

from eva.layers.causal_atrous_convolution1d import CausalAtrousConvolution1D


class WavenetBlock(object):
    def __init__(self, filters, rate):
        self.filters = filters
        self.rate = rate

    def __call__(self, model):
        original = model

        # Gated activation over a causal dilated convolution.
        tanh_out = CausalAtrousConvolution1D(self.filters, 2, atrous_rate=self.rate, border_mode='valid')(model)
        tanh_out = Activation('tanh')(tanh_out)

        sigm_out = CausalAtrousConvolution1D(self.filters, 2, atrous_rate=self.rate, border_mode='valid')(model)
        sigm_out = Activation('sigmoid')(sigm_out)

        model = Merge(mode='mul')([tanh_out, sigm_out])

        # 1x1 projections: one feeds the skip connections, one the residual path.
        skip_x = Convolution1D(self.filters, 1, border_mode='same')(model)

        res_x = Convolution1D(self.filters, 1, border_mode='same')(model)
        res_x = Merge(mode='sum')([original, res_x])
        return res_x, skip_x

class WavenetBlocks(object):
    def __init__(self, filters, depth, stacks=1, h=None):
        self.filters = filters
        self.depth = depth
        self.stacks = stacks
        self.h = h

    def __call__(self, res):
        skips = [None] * (self.stacks * self.depth)
        for j in range(self.stacks):
            for i in range(self.depth):
                # The dilation rate doubles at every layer within a stack: 1, 2, 4, ..., 2**(depth-1).
                res, skip = WavenetBlock(self.filters, 2**i)(res)
                skips[j*self.depth + i] = skip

        return res, skips
--------------------------------------------------------------------------------
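Note: within a stack the dilation rate doubles per layer, so with kernel length 2 one stack of depth d covers 1 + (1 + 2 + ... + 2**(d-1)) = 2**d samples. A quick sanity check in plain Python:

    rates = [2 ** i for i in range(10)]  # dilation schedule of one depth-10 stack
    print(rates)                         # [1, 2, 4, ..., 512]
    print(1 + sum(rates))                # 1024 == 2 ** 10 samples per stack
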
/eva/layers/gated_cnn.py:
--------------------------------------------------------------------------------
from keras.layers import Convolution2D, Merge

from eva.layers.padded_convolution2d import PaddedConvolution2D
from eva.layers.feed_vertical import FeedVertical
from eva.layers.gated_block import GatedBlock


class GatedCNN(object):
    def __init__(self, filters, h=None):
        self.filters = filters
        self.h = h

    def __call__(self, model1, model2=None):
        # The first layer (model2 is None) runs 7x7 kernels straight off the image;
        # later layers use 3x3 kernels and add a residual connection on the horizontal stack.
        if model2 is None:
            h_model = model1
            filter_size = (7, 7)
        else:
            h_model = model2
            filter_size = (3, 3)

        v_model = PaddedConvolution2D(self.filters, filter_size, 'vertical')(model1)
        feed_vertical = FeedVertical(self.filters)(v_model)
        v_model = GatedBlock(self.filters, h=self.h)(v_model)

        h_model_new = PaddedConvolution2D(self.filters, filter_size, 'horizontal', 'A')(h_model)
        h_model_new = GatedBlock(self.filters, v=feed_vertical, h=self.h, crop_right=True)(h_model_new)
        h_model_new = Convolution2D(self.filters, 1, 1, border_mode='valid')(h_model_new)

        return (v_model, h_model_new if model2 is None else Merge(mode='sum')([h_model_new, h_model]))

class GatedCNNs(object):
    def __init__(self, filters, length, h=None):
        self.filters = filters
        self.length = length
        self.h = h

    def __call__(self, v_model, h_model):
        for _ in range(self.length):
            v_model, h_model = GatedCNN(self.filters, self.h)(v_model, h_model)

        return h_model
--------------------------------------------------------------------------------
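Note: the horizontal stack's causality, reduced to NumPy: pad the row on the left by k//2, apply the width-k//2 'A' kernel (width 1 for k = 3; an identity weight stands in for the learned one here), and crop the extra rightmost column. Output column j then depends only on columns strictly to its left:

    import numpy as np

    x = np.arange(1.0, 6.0)            # one row of 5 pixel values
    xpad = np.concatenate([[0.0], x])  # pad left by k//2 = 1 (k = 3)
    out = 1.0 * xpad[:-1]              # width-1 kernel, then crop_right drops the last column
    print(out)                         # out[j] == x[j-1]: pixel j never sees itself
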
/.gitignore:
--------------------------------------------------------------------------------
# IDE
# VS CODE
/.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# Editor ctags
tags

# Jupyter Kernels
kernel-*

# OS
.directory

# Quiver
tmp

# Logs
logs

# Checkpoints
*.h5
*.png

# Data
data/
*.npy
*.wav
--------------------------------------------------------------------------------
/eva/models/wavenet.py:
--------------------------------------------------------------------------------
from keras.models import Model
from keras.layers import Input, Convolution1D, Activation, Merge, Lambda
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Nadam

from eva.layers.causal_atrous_convolution1d import CausalAtrousConvolution1D
from eva.layers.wavenet_block import WavenetBlocks


def Wavenet(input_shape, filters, depth, stacks, last=0, h=None, build=True):
    # TODO: Soft targets? A float to make targets a gaussian with stdev.
    # TODO: Train only the receptive field. The temporal-first outputs are computed from zero-padding.
    # TODO: Global conditioning?
    # TODO: Local conditioning?

    _, nb_bins = input_shape

    input_audio = Input(input_shape, name='audio_input')

    model = CausalAtrousConvolution1D(filters, 2, mask_type='A', atrous_rate=1, border_mode='valid')(input_audio)

    out, skip_connections = WavenetBlocks(filters, depth, stacks)(model)

    out = Merge(mode='sum', name='merging_skips')(skip_connections)
    out = PReLU()(out)

    out = Convolution1D(nb_bins, 1, border_mode='same')(out)
    out = PReLU()(out)

    out = Convolution1D(nb_bins, 1, border_mode='same')(out)

    # https://storage.googleapis.com/deepmind-live-cms/documents/BlogPost-Fig2-Anim-160908-r01.gif
    if last > 0:
        # Keep only the last `last` timesteps, where the receptive field is fully valid.
        out = Lambda(lambda x: x[:, -last:], output_shape=(last, out._keras_shape[2]), name='last_out')(out)

    out = Activation('softmax')(out)

    if build:
        model = Model(input_audio, out)
        model.compile(Nadam(), 'sparse_categorical_crossentropy')

    return model

def compute_receptive_field(sample_rate, depth, stacks):
    receptive_field = stacks * (2 ** depth) - (stacks - 1)
    receptive_field_ms = (receptive_field * 1000) / sample_rate
    return receptive_field, receptive_field_ms
--------------------------------------------------------------------------------
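Note: a worked example of the receptive field for the configuration `audio_train.py` uses (DEPTH=10, STACKS=5), mirroring `compute_receptive_field` above. The 4 kHz sample rate is only an assumption here; the scripts take RATE from the wav header:

    depth, stacks = 10, 5
    receptive_field = stacks * (2 ** depth) - (stacks - 1)  # consecutive stacks overlap by one sample
    print(receptive_field)                                  # 5 * 1024 - 4 = 5116 samples
    print(receptive_field * 1000 / 4000)                    # 1279.0 ms at an assumed 4 kHz rate
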
/audio_train.py:
--------------------------------------------------------------------------------
#%% Setup.
import signal
import sys

import numpy as np
import scipy.io.wavfile

from keras.utils.visualize_util import plot
from keras.callbacks import ModelCheckpoint

from eva.models.wavenet import Wavenet, compute_receptive_field

#%% Data.
RATE, DATA = scipy.io.wavfile.read('./data/undertale/undertale_001_once_upon_a_time.comp.wav')

#%% Train Config.
EPOCHS = 2000
BATCH = 8

#%% Model Config.
MODEL = Wavenet
FILTERS = 32
DEPTH = 10
STACKS = 5
BINS = 256
LAST = RATE
LENGTH = LAST + compute_receptive_field(RATE, DEPTH, STACKS)[0]

LOAD = False

#%% Model.
INPUT = (LENGTH, BINS)
ARGS = (INPUT, FILTERS, DEPTH, STACKS, LAST)

M = MODEL(*ARGS)
if LOAD:
    M.load_weights('model.h5')

M.summary()

plot(M)

#%% Train.
padded_data = np.zeros(DATA.shape[0]+LENGTH-1)
padded_data[LENGTH-1:] = DATA

def train_gen():
    while True:
        i = np.random.randint(0, DATA.shape[0]-2, size=BATCH, dtype=int)

        data = np.zeros((BATCH, LENGTH))
        y = np.zeros((BATCH, LAST, 1))
        x = np.zeros((BATCH, LENGTH, BINS))
        for s in range(BATCH):
            si = i[s]
            data[s] = padded_data[si:si+LENGTH].astype(int)
            y[s] = np.expand_dims(data[s, -LAST:], -1)
            # One-hot encode the window: x[s, t, bin] = 1 for each timestep's bin.
            x[s, list(range(LENGTH)), data[s].astype(int)] = 1
        yield x, y

def save():
    M.save('sigint_model.h5')

def save_gracefully(signal, frame):
    save()
    sys.exit(0)

signal.signal(signal.SIGINT, save_gracefully)

# Theano's graph compilation recurses deeply on a model this large.
sys.setrecursionlimit(50000)

M.fit_generator(train_gen(), samples_per_epoch=RATE//4, nb_epoch=EPOCHS, callbacks=[ModelCheckpoint('model.h5')])
--------------------------------------------------------------------------------
/eva/util/mutil.py:
--------------------------------------------------------------------------------
import numpy as np

from tqdm import tqdm

def generate(model, latent=None, batch=1, deterministic=False):
    # Predict the distribution of one channel at one pixel, given everything generated so far.
    get_color = lambda model, pixels, cols, row, col, channel, latent=None: model.predict_on_batch(pixels if latent is None else [pixels, latent])[channel][:, row*cols+col]
    normalize = lambda pixel: pixel/255

    shape = model.input_shape
    pixels = np.zeros(shape=(batch,) + (shape if isinstance(shape, tuple) else shape[0])[1:])
    latent_vec = None if latent is None else np.expand_dims(np.ones(batch).astype(int)*latent, -1)
    batch, rows, cols, channels = pixels.shape
    for row in tqdm(range(rows)):
        for col in tqdm(range(cols)):
            for channel in tqdm(range(channels)):
                ps = get_color(model, pixels, cols, row, col, channel, latent_vec)
                if deterministic:
                    pixels[:, row, col, channel] = normalize(np.array([np.argmax(p) for p in ps]))
                    continue
                pixels[:, row, col, channel] = normalize(np.array([np.random.choice(256, p=p) for p in ps]))
    return pixels

def clean_data(data, rgb=True, latent=True):
    (train, train_l), (test, test_l) = data
    data = np.concatenate((train, test), axis=0)
    data = data.astype('float32')
    data /= 255

    assert len(data.shape) == 3 or len(data.shape) == 4

    if len(data.shape) == 3:
        data = np.expand_dims(data, -1)

    if rgb:
        data = np.repeat(data, 1 if data.shape[3] == 3 else 3, 3)

    labels = None
    if latent:
        labels = np.concatenate((train_l, test_l), axis=0)

        if len(labels.shape) == 1:
            labels = labels[np.newaxis].T

        assert len(labels.shape) == 2
        assert labels.shape[0] == data.shape[0]

    return data, labels

def sparse_labels(labels):
    return np.expand_dims(np.argmax(labels, -1), -1)

def sparse_labels_generator(generator):
    while True:
        inputs, labels = next(generator)
        yield (inputs, sparse_labels(labels))
--------------------------------------------------------------------------------
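Note: `generate` above and the audio loop below share the same inner step: predict a categorical distribution over 256 values, sample one, write it back into the input, repeat. In isolation (toy distribution):

    import numpy as np

    p = np.random.rand(256)
    p /= p.sum()                        # a categorical distribution over the 256 bins
    value = np.random.choice(256, p=p)  # stochastic decoding (np.argmax(p) would be greedy)
    print(value)
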
/audio_inference.py:
--------------------------------------------------------------------------------
#%% Setup.
print('Importing.')
import signal
import sys

import numpy as np
import scipy.io.wavfile

from keras.utils.visualize_util import plot

from tqdm import tqdm

from eva.models.wavenet import Wavenet, compute_receptive_field
from eva.util.autil import to_pcm8

#%% Data.
RATE, DATA = scipy.io.wavfile.read('./data/undertale/undertale_001_once_upon_a_time.comp.wav')

#%% Generation Config.
print('Preparing generation.')
UNITS = 356415

#%% Model Config.
print('Preparing the model.')
MODEL = Wavenet
FILTERS = 32
DEPTH = 10
STACKS = 5
BINS = 256
LAST = RATE
LENGTH = LAST + compute_receptive_field(RATE, DEPTH, STACKS)[0]


#%% Model.
print('Loading the model.')
INPUT = (LENGTH, BINS)
ARGS = (INPUT, FILTERS, DEPTH, STACKS)

M = MODEL(*ARGS)
M.load_weights('model.h5')

M.summary()

plot(M)

#%% Generate.
print('Generating.')
def save():
    print('Saving.')
    np.save('samples.npy', samples)
    np.save('audio.npy', audio)

    pcm8 = to_pcm8(audio)

    for i in tqdm(range(audio.shape[0])):
        scipy.io.wavfile.write('audio{}.wav'.format(i), RATE, pcm8[i])

def save_gracefully(signal, frame):
    save()
    sys.exit(0)

signal.signal(signal.SIGINT, save_gracefully)

BATCH_SIZE = 1

samples = np.zeros(shape=(1, UNITS, BINS))
samples[0, list(range(LENGTH-1)), DATA[:LENGTH-1].astype(int)] = 1  # one-hot encode the seed audio
samples = np.repeat(samples, BATCH_SIZE, axis=0)

audio = np.zeros(shape=(1, UNITS))
audio[0, :LENGTH-1] = DATA[:LENGTH-1]
audio = np.repeat(audio, BATCH_SIZE, axis=0)

for i in tqdm(range(UNITS-LENGTH+1)):
    x = i+LENGTH-1
    samples[:, x] = M.predict(samples[:, i:x+1], batch_size=1)[:, -1]
    # Renormalize: float32 predictions can sum to slightly off 1, which np.random.choice rejects.
    samples[:, x, np.argmax(samples[:, x], axis=-1)] += 1-np.sum(samples[:, x], axis=-1)
    audio[:, x] = np.array([np.random.choice(256, p=p) for p in samples[:, x]])
    if i % (RATE//2) == 0:
        print(str(i/RATE) + " Seconds generated!")
        save()

save()
--------------------------------------------------------------------------------
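Note: the renormalization line above, in isolation: float32 predictions can sum to slightly more or less than 1, which `np.random.choice` rejects, so the residual error is dumped onto the argmax bin (toy distribution):

    import numpy as np

    p = np.float32(np.random.rand(256))
    p /= p.sum()
    p[np.argmax(p)] += 1 - p.sum()  # absorb the rounding error into the most likely bin
    np.random.choice(256, p=p)      # now sums to 1 within np.random.choice's tolerance
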
/eva/layers/masked_convolution2d.py:
--------------------------------------------------------------------------------
import math

import numpy as np

from keras import backend as K
from keras.layers import Convolution2D

class MaskedConvolution2D(Convolution2D):
    def __init__(self, *args, mask='B', n_channels=3, mono=False, **kwargs):
        super().__init__(*args, **kwargs)
        self.mask_type = mask
        self.n_channels = n_channels
        self.mono = mono

        # TODO: Define the mask here? The filter size and number of filters predict the
        # weights shape; however, build() is still a safer place for it.
        self.mask = None

    def build(self, input_shape):
        super().build(input_shape)

        self.mask = np.ones(self.W_shape)

        assert self.mask.shape[0] == self.mask.shape[1]
        filter_size = self.mask.shape[0]
        filter_center = filter_size / 2

        # Zero all rows below the center, and everything right of center on the center row.
        self.mask[math.ceil(filter_center):] = 0
        self.mask[math.floor(filter_center):, math.ceil(filter_center):] = 0

        if self.mono:
            if self.mask_type == 'A':
                self.mask[math.floor(filter_center), math.floor(filter_center)] = 0
        else:
            # At the center tap, mask channel pairs: 'A' also blocks same-channel connections.
            op = np.greater_equal if self.mask_type == 'A' else np.greater
            for i in range(self.n_channels):
                for j in range(self.n_channels):
                    if op(i, j):
                        self.mask[math.floor(filter_center), math.floor(filter_center), i::self.n_channels, j::self.n_channels] = 0

        self.mask = K.variable(self.mask)

    def call(self, x, mask=None):
        # TODO: Learn what the `mask` argument is for and how to use it.
        output = K.conv2d(x, self.W * self.mask, strides=self.subsample,
                          border_mode=self.border_mode,
                          dim_ordering=self.dim_ordering,
                          filter_shape=self.W_shape)
        if self.bias:
            if self.dim_ordering == 'th':
                output += K.reshape(self.b, (1, self.nb_filter, 1, 1))
            elif self.dim_ordering == 'tf':
                output += K.reshape(self.b, (1, 1, 1, self.nb_filter))
            else:
                raise ValueError('Invalid dim_ordering:', self.dim_ordering)
        output = self.activation(output)
        return output

    def get_config(self):
        return dict(list(super().get_config().items()) + list({'mask': self.mask_type, 'n_channels': self.n_channels, 'mono': self.mono}.items()))
--------------------------------------------------------------------------------
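Note: a NumPy sketch of the spatial mask build() constructs for a 3x3 kernel in the mono case; mask 'B' keeps the center tap, mask 'A' would zero it as well:

    import math
    import numpy as np

    mask = np.ones((3, 3))
    center = 3 / 2                                      # 1.5
    mask[math.ceil(center):] = 0                        # rows below the center row
    mask[math.floor(center):, math.ceil(center):] = 0   # right of center on the center row
    print(mask)
    # [[1. 1. 1.]
    #  [1. 1. 0.]   <- mask 'B': the center (current pixel) stays connected
    #  [0. 0. 0.]]
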
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------