├── .gitignore ├── .ipynb_checkpoints ├── MAGNet-New-checkpoint.ipynb └── train-for-javascript-checkpoint.ipynb ├── LICENSE ├── PyTorch ├── generate.py ├── model.py └── train.py ├── README.md ├── legacy ├── MAGNet-New.ipynb ├── train-for-javascript.ipynb └── train-for-python.ipynb └── utils ├── audio_dataset_generator.py ├── load_and_convert.py ├── random_search.py ├── sequence_stfts_test.ipynb ├── stft_net_with_hop.ipynb └── stft_test.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.npy 3 | local_assets/ 4 | assets/ -------------------------------------------------------------------------------- /.ipynb_checkpoints/train-for-javascript-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MAGnet \n", 8 | "### Train your own models to generate audio in python or convert to use in the browser on mimicproject.com" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "import tensorflow as tf\n", 19 | "import pywt\n", 20 | "from utils.audio_dataset_generator import AudioDatasetGenerator\n", 21 | "import numpy as np\n", 22 | "import tensorflowjs as tfjs" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Set up variables\n", 30 | "Including the path to your audio" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "# Model\n", 42 | "load_model = False\n", 43 | "\n", 44 | "# Dataset\n", 45 | "sequence_length = 40\n", 46 | "audio_data_path = \"assets/grime/\"\n", 47 | "force_new_dataset = True\n", 48 | "\n", 49 | "# Feature Extraction and Audio Genreation\n", 50 | "sample_rate = 44100\n", 51 | "fft_settings = [2048, 1024, 512]\n", 52 | "fft_size = fft_settings[0]\n", 53 | "window_size = fft_settings[1]\n", 54 | "hop_size = fft_settings[2]\n", 55 | "\n", 56 | "# General Network\n", 57 | "learning_rate = 0.001\n", 58 | "amount_epochs = 100\n", 59 | "batch_size = 64\n", 60 | "loss_type = \"mse\"\n", 61 | "weight_decay = 0.0001\n", 62 | "\n", 63 | "# Recurrent Neural Network\n", 64 | "rnn_type = \"lstm\"\n", 65 | "number_rnn_layers = 2\n", 66 | "rnn_number_units = 256" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Make the dataset from the audio" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Make your dataset\n", 83 | "\n", 84 | "dataset = AudioDatasetGenerator(fft_size, window_size, hop_size,\n", 85 | " sequence_length, sample_rate)\n", 86 | "\n", 87 | "dataset.load(audio_data_path, force_new_dataset)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# Set up the model\n", 99 | "\n", 100 | "model = tf.keras.Sequential()\n", 101 | "\n", 102 | "model.add(tf.keras.layers.BatchNormalization(input_shape=[dataset.x_frames.shape[1], dataset.x_frames.shape[2]]))\n", 103 | "\n", 104 | "for layer in range(number_rnn_layers):\n", 105 | " return_sequence = False if layer == (number_rnn_layers - 1) else True\n", 106 | " model.add(tf.keras.layers.LSTM(rnn_number_units, 
return_sequences= return_sequence))\n", 107 | " \n", 108 | "model.add(tf.keras.layers.Dense(dataset.y_frames.shape[1]))\n", 109 | "\n", 110 | "model.add(tf.keras.layers.Activation('linear'))\n", 111 | "opt = tf.keras.optimizers.Adam(learning_rate)\n", 112 | "model.compile(optimizer=opt, loss=loss_type)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "### Train your model" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "scrolled": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# Train\n", 131 | "\n", 132 | "model.fit(dataset.x_frames, dataset.y_frames, batch_size=batch_size, epochs=amount_epochs)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "### Save your model as a keras model" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# Save your model\n", 151 | "\n", 152 | "model.save(\".h5\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Convert to use online with tensorflow.js. \n", 160 | "Find example code at https://mimicproject.com/code/b530ba9e-dfd9-0440-8358-86b6420b210d\n", 161 | "Upload the \n", 162 | "* .json file\n", 163 | "* the shards\n", 164 | "* your audio\n", 165 | "Update the MODEL_URLS and SAMPLE_URLS in dataset-paths.js" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "tfjs.converters.save_keras_model(model, \".json\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Or generate samples in python" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 84, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "100% audio generation complete. 
\r" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# Or generate samples in python\n", 201 | "\n", 202 | "amount_samples = 1\n", 203 | "sequence_length_max = 500\n", 204 | "impulse_scale = 1.0\n", 205 | "griffin_iterations = 60\n", 206 | "random_chance = 0.05\n", 207 | "random_strength = 0.0\n", 208 | "\n", 209 | "dimension1 = dataset.x_frames.shape[1]\n", 210 | "dimension2 = dataset.x_frames.shape[2]\n", 211 | "shape = (1, dimension1, dimension2, 1) if use_cnn else (1, dimension1, dimension2)\n", 212 | "\n", 213 | "audio = []\n", 214 | "\n", 215 | "if use_wavelets:\n", 216 | " temp_audio = np.array(0)\n", 217 | "for i in range(amount_samples): \n", 218 | " \n", 219 | " random_index = np.random.randint(0, (len(dataset.x_frames) - 1)) \n", 220 | " \n", 221 | " impulse = np.array(dataset.x_frames[random_index]) * impulse_scale\n", 222 | " predicted_magnitudes = impulse\n", 223 | " \n", 224 | " if use_wavelets:\n", 225 | " for seq in range (impulse.shape[0]):\n", 226 | " coeffs = pywt.array_to_coeffs(impulse[seq], dataset.coeff_slices)\n", 227 | " recon = (pywt.waverecn(coeffs, wavelet=wavelet))\n", 228 | " temp_audio = np.append(temp_audio, recon)\n", 229 | " for j in range(sequence_length_max):\n", 230 | " prediction = model.predict(impulse.reshape(shape))\n", 231 | " #Wavelet audio\n", 232 | " if use_wavelets:\n", 233 | " coeffs = pywt.array_to_coeffs(prediction[0], dataset.coeff_slices)\n", 234 | " recon = (pywt.waverecn(coeffs, wavelet=wavelet))\n", 235 | " temp_audio = np.append(temp_audio, recon)\n", 236 | " \n", 237 | " if use_cnn:\n", 238 | " prediction = prediction.reshape(1, dataset.y_frames.shape[1], 1)\n", 239 | " \n", 240 | " predicted_magnitudes = np.vstack((predicted_magnitudes, prediction)) \n", 241 | " impulse = predicted_magnitudes[-sequence_length:]\n", 242 | " \n", 243 | " if (np.random.random_sample() < random_chance) :\n", 244 | " idx = np.random.randint(0, dataset.sequence_length)\n", 245 | " impulse[idx] = impulse[idx] + np.random.random_sample(impulse[idx].shape) * random_strength\n", 246 | " \n", 247 | " done = int(float(i * sequence_length_max + j) / float(amount_samples * sequence_length_max) * 100.0) + 1\n", 248 | " sys.stdout.write('{}% audio generation complete. 
\\r'.format(done))\n", 249 | " sys.stdout.flush()\n", 250 | " \n", 251 | " if use_wavelets: \n", 252 | " audio += [temp_audio]\n", 253 | " else:\n", 254 | " predicted_magnitudes = np.array(predicted_magnitudes).reshape(-1, int(window_size)+1) \n", 255 | " audio += [dataset.griffin_lim(predicted_magnitudes.T, griffin_iterations)]\n", 256 | "audio = np.array(audio)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 72, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stderr", 266 | "output_type": "stream", 267 | "text": [ 268 | "IOPub data rate exceeded.\n", 269 | "The notebook server will temporarily stop sending output\n", 270 | "to the client in order to avoid crashing it.\n", 271 | "To change this limit, set the config variable\n", 272 | "`--NotebookApp.iopub_data_rate_limit`.\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "# Play them back\n", 278 | "\n", 279 | "from IPython.display import Audio\n", 280 | "i = 0\n", 281 | "Audio(audio[i], rate=sample_rate)" 282 | ] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Python 3", 288 | "language": "python", 289 | "name": "python3" 290 | }, 291 | "language_info": { 292 | "codemirror_mode": { 293 | "name": "ipython", 294 | "version": 3 295 | }, 296 | "file_extension": ".py", 297 | "mimetype": "text/x-python", 298 | "name": "python", 299 | "nbconvert_exporter": "python", 300 | "pygments_lexer": "ipython3", 301 | "version": "3.8.5" 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 2 306 | } 307 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Mick Grierson 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /PyTorch/generate.py: -------------------------------------------------------------------------------- 1 | import soundfile as sf 2 | import numpy as np 3 | import librosa 4 | from model import RNNModel, SpectrogramDataset, preprocess_data 5 | import torch 6 | from datetime import datetime 7 | 8 | n_fft=2048 9 | hop_length=512 10 | win_length=2048 11 | sequence_length = 20 12 | file_name = "../assets/Wiley.wav" 13 | x_frames, y_frames = preprocess_data(file_name, n_fft=n_fft, 14 | hop_length=hop_length, win_length=win_length, 15 | sequence_length=sequence_length) 16 | spectrogram_dataset = SpectrogramDataset(x_frames, y_frames) 17 | 18 | 19 | points = [0.0, 0.5, 0.2, 0.7] 20 | lengths = [200, 200, 200, 200] 21 | random_strength = 0.2 22 | 23 | model = RNNModel(input_size=1025, hidden_size=128, num_layers=2, output_size=1025) # Example model initialization 24 | checkpoint = "model_weights_26-Feb-2024-17-10-36.pth" 25 | model.load_state_dict(torch.load(checkpoint)) 26 | model.eval() 27 | 28 | output_sequence_length = np.array(lengths).sum() 29 | dimension1 = x_frames.shape[1] 30 | dimension2 = x_frames.shape[2] 31 | shape = (1, dimension1, dimension2) 32 | ctr = 0 33 | change_at = lengths[ctr] 34 | 35 | audio = [] 36 | index = int(points[ctr] * len(x_frames)) 37 | impulse = x_frames[index] 38 | predicted_magnitudes = impulse 39 | random_chance = 0.05 40 | print(x_frames.shape, impulse.shape) 41 | 42 | for j in range(output_sequence_length): 43 | prediction = model(impulse.unsqueeze(0)) 44 | predicted_magnitudes = torch.cat((predicted_magnitudes, prediction.transpose(0,1)), dim=1) 45 | impulse = predicted_magnitudes[:,-sequence_length:] 46 | if (np.random.random_sample() < random_chance) : 47 | np.random.seed() 48 | random_index = np.random.randint(0, (len(x_frames) - 1)) 49 | impulse = x_frames[random_index] 50 | if j > change_at: 51 | print(ctr, j, change_at, index) 52 | ctr = ctr + 1 53 | index = int(points[ctr] * len(x_frames)) 54 | impulse = x_frames[index] 55 | change_at = change_at + lengths[ctr] 56 | 57 | predicted_magnitudes = predicted_magnitudes.detach().numpy() 58 | audio = librosa.griffinlim(predicted_magnitudes, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 59 | print(predicted_magnitudes.shape, len(audio)) 60 | timestampStr = datetime.now().strftime("%d-%b-%Y-%H-%M-%S") 61 | # # WRITE AUDIO 62 | output_name = "wiley" 63 | sf.write(f"{output_name}_{timestampStr}.wav", audio, 44100) -------------------------------------------------------------------------------- /PyTorch/model.py: -------------------------------------------------------------------------------- 1 | # Recurrent Neural Network 2 | from torch.utils.data import Dataset 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | import librosa 7 | import sys 8 | from os.path import isdir, exists 9 | from os import listdir 10 | 11 | 12 | def preprocess_data(path, n_fft=2048,hop_length=512, win_length=2048, sequence_length = 40, sr = 44100): 13 | cached_x_path = path + '_x_frames.npy' 14 | cached_y_path = path + '_y_frames.npy' 15 | if exists(cached_x_path) and exists(cached_y_path): 16 | x_frames = np.load(cached_x_path) 17 | y_frames = np.load(cached_y_path) 18 | print("loading cached data") 19 | return torch.tensor(x_frames), torch.tensor(y_frames) 20 | 21 | x = [0] 22 | if not isdir(path): 23 | x, sr = librosa.load(path, sr=sr) 24 | else: 25 | files = listdir(path) 26 | x = np.array([0]) 27 | for file in 
files: 28 | if not ".DS" in file: 29 | audio, sr, = librosa.load(path + file, sr = 44100) 30 | x = np.concatenate((x, audio)) 31 | x = np.array(x, dtype=np.float32) 32 | data_tf = torch.tensor(x) 33 | # Compute STFT 34 | n = torch.stft(data_tf, n_fft=n_fft, hop_length=hop_length, win_length=win_length, 35 | window=torch.hann_window(win_length), center=True, normalized=False, onesided=True, return_complex=True) 36 | 37 | magnitude_spectrograms = torch.abs(n) 38 | print(data_tf.shape, n.shape, magnitude_spectrograms.shape) 39 | 40 | start = 0 41 | end = magnitude_spectrograms.shape[1] - sequence_length - 1 42 | step = 1 43 | x_frames = [] 44 | y_frames = [] 45 | 46 | for i in range(start, end, step): 47 | done = int((float(i) / float(end)) * 100.0) 48 | sys.stdout.write('{}% data generation complete. \r'.format(done)) 49 | sys.stdout.flush() 50 | x = magnitude_spectrograms[:, i:i + sequence_length] 51 | y = magnitude_spectrograms[:, i + sequence_length] 52 | x_frames.append(x) 53 | y_frames.append(y) 54 | 55 | x_frames = torch.stack(x_frames) 56 | y_frames = torch.stack(y_frames) 57 | print(x_frames.shape, y_frames.shape) 58 | np.save(cached_x_path, x_frames) 59 | np.save(cached_y_path, y_frames) 60 | return x_frames, y_frames 61 | 62 | class SpectrogramDataset(Dataset): 63 | def __init__(self, x_frames, y_frames): 64 | self.x_frames = x_frames 65 | self.y_frames = y_frames 66 | 67 | def __len__(self): 68 | return self.x_frames.shape[0] # Number of frames 69 | 70 | def __getitem__(self, idx): 71 | return self.x_frames[idx], self.y_frames[idx] 72 | 73 | class RNNModel(nn.Module): 74 | def __init__(self, input_size, hidden_size, num_layers, output_size): 75 | super(RNNModel, self).__init__() 76 | 77 | self.batch_norm = nn.BatchNorm1d(input_size) 78 | print(input_size, hidden_size, num_layers) 79 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True) 80 | self.fc = nn.Linear(hidden_size, output_size) 81 | 82 | def forward(self, x): 83 | x = self.batch_norm(x) # BatchNorm expects [batch, features, seq_len] 84 | x, _ = self.lstm(x.transpose(1, 2)) # lstm expects [batch, seq_len, features] 85 | x = self.fc(x[:, -1, :]) 86 | return x 87 | 88 | -------------------------------------------------------------------------------- /PyTorch/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.optim import Adam 4 | from torch.utils.data import DataLoader 5 | from model import RNNModel, SpectrogramDataset, preprocess_data 6 | from datetime import datetime 7 | 8 | n_fft = 2048 9 | hop_length = 512 10 | win_length = 2048 11 | sequence_length = 40 12 | file_name = "../assets/Wiley.wav" 13 | x_frames, y_frames = preprocess_data(file_name, n_fft=n_fft, 14 | hop_length=hop_length, win_length=win_length, 15 | sequence_length=sequence_length, sr = 44100) 16 | # Create an instance of the dataset 17 | spectrogram_dataset = SpectrogramDataset(x_frames, y_frames) 18 | 19 | # Create a DataLoader 20 | batch_size = 64 # Define your batch size 21 | shuffle = True # Shuffle the data every epoch 22 | 23 | dataloader = DataLoader(spectrogram_dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True) 24 | 25 | # # Model parameters 26 | learning_rate = 0.001 27 | amount_epochs = 200 28 | batch_size = 64 29 | loss_type = nn.MSELoss() 30 | weight_decay = 0.0001 31 | 32 | model = RNNModel(input_size=n_fft//2+1, hidden_size=128, num_layers=2, output_size=n_fft//2+1) 33 | 34 | # checkpoint = 
'model_weights_26-Feb-2024-16-58-29.pth' 35 | # model.load_state_dict(torch.load(checkpoint)) 36 | # model.eval() 37 | 38 | opt = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) 39 | 40 | for epoch in range(amount_epochs): 41 | running_loss = 0.0 42 | for inputs, targets in dataloader: 43 | opt.zero_grad() 44 | outputs = model(inputs) 45 | loss = loss_type(outputs, targets) 46 | loss.backward() 47 | opt.step() 48 | running_loss += loss.item() 49 | print(f'Epoch [{epoch+1}/{amount_epochs}], Loss: {running_loss/len(dataloader):.4f}') 50 | running_loss = 0.0 51 | 52 | timestampStr = datetime.now().strftime("%d-%b-%Y-%H-%M-%S") 53 | torch.save(model.state_dict(), f"model_weights_{timestampStr}.pth") 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAGNet 2 | 3 | ## Colab 4 | 5 | A Colab notebook for training and generating audio using Keras 6 | 7 | [![colab_badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CgXT8xsmoPA1MWH4tifY6sTAnutsANRv?usp=sharing) 8 | 9 | ## Train / Generate in [PyTorch](PyTorch/) 10 | 11 | This is the recommended approach, and it is required for realtime use with Dorothy 12 | 13 | ## Realtime implementation 14 | 15 | Implemented as part of the [Dorothy](https://github.com/lmccallum/dorothy) creative coding library 16 | 17 | ## Description 18 | 19 | This repo demonstrates an LSTM audio generation process using MAGNet, a spectral approach to audio analysis and generation with neural networks. The techniques included here were used in the Mezzanine Vs. MAGNet project featured in the Barbican's AI: More than Human exhibition. 20 | 21 | It represents ongoing work from researchers at The Creative Computing Institute, UAL and Goldsmiths, University of London. MAGNet trains on the magnitude spectra of acoustic audio signals, and produces entirely new magnitude spectra that can be turned back into sound using phase reconstruction - it's very high quality in terms of audio fidelity. 22 | 23 | This repo provides a chance for people to train their own models on their own source audio and generate new sounds. Both of the given projects are designed to be simple to understand and easy to run. 24 | 25 | ## Legacy versions 26 | 27 | train-for-python contains a walkthrough of how to use tflearn to do this entirely in Python. Phase reconstruction is done using Griffin-Lim. 28 | 29 | train-for-javascript contains a walkthrough of how to use Keras to train a model, which can then be converted for use in JavaScript projects in the browser with tensorflow.js. Example code for this is on the MIMIC platform: https://mimicproject.com/code/b530ba9e-dfd9-0440-8358-86b6420b210d. Phase reconstruction is done using a port of LWS, developed at Goldsmiths for this purpose. 30 | 31 | Contributions have been made by Mick Grierson, Leon Fedden, Sam Park-Wolfe, Jakub Fiala and Louis McCallum.
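## PyTorch workflow at a glance

The two PyTorch scripts boil down to the loop below: frame an audio file's magnitude spectrogram into (sequence, next-frame) pairs, train the LSTM to predict the next frame, then generate frame by frame and resynthesise with Griffin-Lim. This is only a minimal sketch of how the pieces fit together - the audio path, epoch count and output length are illustrative placeholders, and the full versions live in `PyTorch/train.py` and `PyTorch/generate.py`.

```python
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import librosa
import soundfile as sf
from model import RNNModel, SpectrogramDataset, preprocess_data

n_fft, hop_length, win_length, sequence_length = 2048, 512, 2048, 40

# Frame the magnitude spectrogram into (sequence -> next frame) training pairs.
x_frames, y_frames = preprocess_data("../assets/my_audio.wav",  # illustrative path
                                     n_fft=n_fft, hop_length=hop_length,
                                     win_length=win_length,
                                     sequence_length=sequence_length)
loader = DataLoader(SpectrogramDataset(x_frames, y_frames),
                    batch_size=64, shuffle=True, drop_last=True)

model = RNNModel(input_size=n_fft // 2 + 1, hidden_size=128,
                 num_layers=2, output_size=n_fft // 2 + 1)
opt = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
loss_fn = nn.MSELoss()

for epoch in range(10):  # train.py uses 200 epochs
    for inputs, targets in loader:
        opt.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        opt.step()

# Seed with a real window of frames, then predict one new frame at a time.
model.eval()
impulse = x_frames[0]                     # shape [n_fft//2 + 1, sequence_length]
predicted = impulse
with torch.no_grad():
    for _ in range(500):                  # number of generated frames
        next_frame = model(impulse.unsqueeze(0))               # [1, n_fft//2 + 1]
        predicted = torch.cat((predicted, next_frame.transpose(0, 1)), dim=1)
        impulse = predicted[:, -sequence_length:]

# Rebuild phase from the predicted magnitudes and write the result to disk.
audio = librosa.griffinlim(predicted.numpy(), n_fft=n_fft,
                           hop_length=hop_length, win_length=win_length)
sf.write("generated.wav", audio, 44100)
```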
32 | -------------------------------------------------------------------------------- /legacy/MAGNet-New.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "import librosa\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import librosa.display\n", 13 | "import IPython.display as ipd\n", 14 | "import os\n", 15 | "import tensorflow as tf\n", 16 | "import numpy as np" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stderr", 26 | "output_type": "stream", 27 | "text": [ 28 | "2024-02-26 13:13:13.612659: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1\n", 29 | "2024-02-26 13:13:13.612680: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB\n", 30 | "2024-02-26 13:13:13.612683: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB\n", 31 | "2024-02-26 13:13:13.612732: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.\n", 32 | "2024-02-26 13:13:13.613041: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: )\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "a = tf.convert_to_tensor(([0,1,2,3,4,5]), np.float32)\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "" 49 | ] 50 | }, 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "a" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "2.0" 69 | ] 70 | }, 71 | "execution_count": 7, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "a.numpy()[2]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | ":228: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. 
Expected 56 from C header, got 64 from PyObject\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "path = \"assets/\"\n", 95 | "\n", 96 | "files = os.listdir(path)\n", 97 | "x = np.array([0])\n", 98 | "for file in files:\n", 99 | " if not \".DS\" in file:\n", 100 | " audio, sr, = librosa.load(path + file)\n", 101 | " x = np.concatenate((x, audio))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "(3583747,)" 113 | ] 114 | }, 115 | "execution_count": 4, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "x.shape" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stderr", 131 | "output_type": "stream", 132 | "text": [ 133 | "2024-02-26 12:53:05.121663: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1\n", 134 | "2024-02-26 12:53:05.121695: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB\n", 135 | "2024-02-26 12:53:05.121698: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB\n", 136 | "2024-02-26 12:53:05.121932: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.\n", 137 | "2024-02-26 12:53:05.122124: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: )\n" 138 | ] 139 | }, 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "TensorShape([3583747])" 144 | ] 145 | }, 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "data_tf = tf.convert_to_tensor(x, np.float32)\n", 153 | "data_tf.shape" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "ename": "", 163 | "evalue": "", 164 | "output_type": "error", 165 | "traceback": [ 166 | "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", 167 | "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", 168 | "\u001b[1;31mClick here for more info. \n", 169 | "\u001b[1;31mView Jupyter log for further details." 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "n = tf.signal.stft(data_tf,2048,512)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "magnitude_spectrograms = tf.abs(n)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "#we need to get all the fft frames and organise them into sequence batches\n", 193 | "start = 0\n", 194 | "sequence_length = 40\n", 195 | "end = magnitude_spectrograms.shape[0] - sequence_length - 1\n", 196 | "step = 1\n", 197 | "x_frames = []\n", 198 | "y_frames = []\n", 199 | "for i in range(start, end, step):\n", 200 | " done = int(float(i) / float(end) * 100.0)\n", 201 | " sys.stdout.write('{}% data generation complete. 
\\r'.format(done))\n", 202 | " sys.stdout.flush()\n", 203 | " x = magnitude_spectrograms[i:i+sequence_length]\n", 204 | " y = magnitude_spectrograms[i+sequence_length]\n", 205 | " x_frames.append(x)\n", 206 | " y_frames.append(y)\n", 207 | "x_frames = np.array(x_frames)\n", 208 | "y_frames = np.array(y_frames)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "learning_rate = 0.001\n", 218 | "amount_epochs = 500\n", 219 | "batch_size = 64\n", 220 | "loss_type = \"mse\"\n", 221 | "weight_decay = 0.0001\n", 222 | "\n", 223 | "\n", 224 | "\n", 225 | "# Recurrent Neural Network\n", 226 | "rnn_type = \"lstm\"\n", 227 | "number_rnn_layers = 3\n", 228 | "rnn_number_units = 128\n", 229 | "model = tf.keras.Sequential()\n", 230 | "\n", 231 | "model.add(tf.keras.layers.BatchNormalization(input_shape=[x_frames.shape[1], x_frames.shape[2]]))\n", 232 | "\n", 233 | "for layer in range(number_rnn_layers):\n", 234 | " return_sequence = False if layer == (number_rnn_layers - 1) else True\n", 235 | " model.add(tf.keras.layers.LSTM(rnn_number_units, return_sequences= return_sequence))\n", 236 | " \n", 237 | "model.add(tf.keras.layers.Dense(y_frames.shape[1]))\n", 238 | "\n", 239 | "model.add(tf.keras.layers.Activation('linear'))\n", 240 | "opt = tf.keras.optimizers.Adam(learning_rate)\n", 241 | "model.compile(optimizer=opt, loss=loss_type)\n", 242 | "\n", 243 | "# this model trains much much faster than the prior models " 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "scrolled": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "model.fit(x_frames, y_frames, batch_size=batch_size, epochs=amount_epochs)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Save your model\n", 264 | "model.save(\"myModel.h5\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# Load your model\n", 274 | "model = tf.keras.models.load_model(\"myModel.h5\")" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "amount_samples = 1\n", 284 | "sequence_length_max = 1000\n", 285 | "impulse_scale = 1.0\n", 286 | "random_chance = 0.1\n", 287 | "random_strength = 1.0\n", 288 | "window_size = 1024\n", 289 | "\n", 290 | "dimension1 = x_frames.shape[1]\n", 291 | "dimension2 = x_frames.shape[2]\n", 292 | "shape = (1, dimension1, dimension2)\n", 293 | "\n", 294 | "audio = []\n", 295 | "\n", 296 | "for i in range(amount_samples): \n", 297 | " \n", 298 | " random_index = np.random.randint(0, (len(x_frames) - 1)) \n", 299 | " impulse = np.array(x_frames[random_index]) * impulse_scale\n", 300 | " predicted_magnitudes = impulse\n", 301 | " \n", 302 | " for j in range(sequence_length_max):\n", 303 | " prediction = model.predict(impulse.reshape(shape))\n", 304 | " predicted_magnitudes = np.vstack((predicted_magnitudes, prediction))\n", 305 | " impulse = predicted_magnitudes[-sequence_length:]\n", 306 | " \n", 307 | " if (np.random.random_sample() < random_chance) :\n", 308 | " random_index = np.random.randint(0, (len(x_frames) - 1)) \n", 309 | " impulse = np.array(x_frames[random_index]) * impulse_scale * random_strength\n", 310 | " #predicted_magnitudes = impulse\n", 311 | " \n", 312 | 
" done = int(float(i * sequence_length_max + j) / float(amount_samples * sequence_length_max) * 100.0) + 1\n", 313 | " sys.stdout.write('{}% audio generation complete. \\r'.format(done))\n", 314 | " sys.stdout.flush()\n", 315 | " \n", 316 | " #predicted_magnitudes = np.array(predicted_magnitudes).reshape(-1, window_size+1) \n", 317 | " predicted_magnitudes = np.array(predicted_magnitudes).reshape(-1, window_size+1) \n", 318 | " #audio += [librosa.griffinlim(predicted_magnitudes.T)]\n", 319 | " #audio+=[predicted_magnitudes.T]\n", 320 | " new_sample = [librosa.griffinlim(predicted_magnitudes.T)]\n", 321 | " audio.append(new_sample)\n", 322 | "audio = np.array(audio)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "from IPython.display import Audio" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "i = 0\n", 341 | "Audio(audio[0], rate=sr)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3 (ipykernel)", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.9.13" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 2 394 | } 395 | -------------------------------------------------------------------------------- /legacy/train-for-javascript.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MAGnet \n", 8 | "### Train your own models to generate audio in python or convert to use in the browser on mimicproject.com" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 2, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "import tensorflow as tf\n", 19 | "import pywt\n", 20 | "from utils.audio_dataset_generator import AudioDatasetGenerator\n", 21 | "import numpy as np\n", 22 | "import tensorflowjs as tfjs" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Set up variables\n", 30 | "Including the path to your audio" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "# Model\n", 42 | "load_model = False\n", 43 | "\n", 44 | "# Dataset\n", 45 | "sequence_length = 40\n", 46 | "audio_data_path = \"assets/grime/\"\n", 47 | "force_new_dataset = True\n", 48 | "\n", 49 | "# Feature Extraction and Audio Genreation\n", 50 | "sample_rate = 44100\n", 51 | "fft_settings = [2048, 
1024, 512]\n", 52 | "fft_size = fft_settings[0]\n", 53 | "window_size = fft_settings[1]\n", 54 | "hop_size = fft_settings[2]\n", 55 | "\n", 56 | "# General Network\n", 57 | "learning_rate = 0.001\n", 58 | "amount_epochs = 100\n", 59 | "batch_size = 64\n", 60 | "loss_type = \"mse\"\n", 61 | "weight_decay = 0.0001\n", 62 | "\n", 63 | "# Recurrent Neural Network\n", 64 | "rnn_type = \"lstm\"\n", 65 | "number_rnn_layers = 2\n", 66 | "rnn_number_units = 256" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Make the dataset from the audio" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Make your dataset\n", 83 | "\n", 84 | "dataset = AudioDatasetGenerator(fft_size, window_size, hop_size,\n", 85 | " sequence_length, sample_rate)\n", 86 | "\n", 87 | "dataset.load(audio_data_path, force_new_dataset)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# Set up the model\n", 99 | "\n", 100 | "model = tf.keras.Sequential()\n", 101 | "\n", 102 | "model.add(tf.keras.layers.BatchNormalization(input_shape=[dataset.x_frames.shape[1], dataset.x_frames.shape[2]]))\n", 103 | "\n", 104 | "for layer in range(number_rnn_layers):\n", 105 | " return_sequence = False if layer == (number_rnn_layers - 1) else True\n", 106 | " model.add(tf.keras.layers.LSTM(rnn_number_units, return_sequences= return_sequence))\n", 107 | " \n", 108 | "model.add(tf.keras.layers.Dense(dataset.y_frames.shape[1]))\n", 109 | "\n", 110 | "model.add(tf.keras.layers.Activation('linear'))\n", 111 | "opt = tf.keras.optimizers.Adam(learning_rate)\n", 112 | "model.compile(optimizer=opt, loss=loss_type)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "### Train your model" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "scrolled": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# Train\n", 131 | "\n", 132 | "model.fit(dataset.x_frames, dataset.y_frames, batch_size=batch_size, epochs=amount_epochs)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "### Save your model as a keras model" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# Save your model\n", 151 | "\n", 152 | "model.save(\".h5\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Convert to use online with tensorflow.js. 
\n", 160 | "Find example code at https://mimicproject.com/code/b530ba9e-dfd9-0440-8358-86b6420b210d\n", 161 | "Upload the \n", 162 | "* .json file\n", 163 | "* the shards\n", 164 | "* your audio\n", 165 | "Update the MODEL_URLS and SAMPLE_URLS in dataset-paths.js" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# load your model\n", 175 | "\n", 176 | "model = tf.keras.models.load_model(\"../../models/linn_22050_2048_512_lstm_2_128.h5\")\n", 177 | "\n", 178 | "tfjs.converters.save_keras_model(model, \"linn_22050.json\")" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### Or generate samples in python" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 84, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "100% audio generation complete. \r" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "# Or generate samples in python\n", 203 | "\n", 204 | "amount_samples = 1\n", 205 | "sequence_length_max = 500\n", 206 | "impulse_scale = 1.0\n", 207 | "griffin_iterations = 60\n", 208 | "random_chance = 0.05\n", 209 | "random_strength = 0.0\n", 210 | "\n", 211 | "dimension1 = dataset.x_frames.shape[1]\n", 212 | "dimension2 = dataset.x_frames.shape[2]\n", 213 | "shape = (1, dimension1, dimension2, 1) if use_cnn else (1, dimension1, dimension2)\n", 214 | "\n", 215 | "audio = []\n", 216 | "\n", 217 | "if use_wavelets:\n", 218 | " temp_audio = np.array(0)\n", 219 | "for i in range(amount_samples): \n", 220 | " \n", 221 | " random_index = np.random.randint(0, (len(dataset.x_frames) - 1)) \n", 222 | " \n", 223 | " impulse = np.array(dataset.x_frames[random_index]) * impulse_scale\n", 224 | " predicted_magnitudes = impulse\n", 225 | " \n", 226 | " if use_wavelets:\n", 227 | " for seq in range (impulse.shape[0]):\n", 228 | " coeffs = pywt.array_to_coeffs(impulse[seq], dataset.coeff_slices)\n", 229 | " recon = (pywt.waverecn(coeffs, wavelet=wavelet))\n", 230 | " temp_audio = np.append(temp_audio, recon)\n", 231 | " for j in range(sequence_length_max):\n", 232 | " prediction = model.predict(impulse.reshape(shape))\n", 233 | " #Wavelet audio\n", 234 | " if use_wavelets:\n", 235 | " coeffs = pywt.array_to_coeffs(prediction[0], dataset.coeff_slices)\n", 236 | " recon = (pywt.waverecn(coeffs, wavelet=wavelet))\n", 237 | " temp_audio = np.append(temp_audio, recon)\n", 238 | " \n", 239 | " if use_cnn:\n", 240 | " prediction = prediction.reshape(1, dataset.y_frames.shape[1], 1)\n", 241 | " \n", 242 | " predicted_magnitudes = np.vstack((predicted_magnitudes, prediction)) \n", 243 | " impulse = predicted_magnitudes[-sequence_length:]\n", 244 | " \n", 245 | " if (np.random.random_sample() < random_chance) :\n", 246 | " idx = np.random.randint(0, dataset.sequence_length)\n", 247 | " impulse[idx] = impulse[idx] + np.random.random_sample(impulse[idx].shape) * random_strength\n", 248 | " \n", 249 | " done = int(float(i * sequence_length_max + j) / float(amount_samples * sequence_length_max) * 100.0) + 1\n", 250 | " sys.stdout.write('{}% audio generation complete. 
\\r'.format(done))\n", 251 | " sys.stdout.flush()\n", 252 | " \n", 253 | " if use_wavelets: \n", 254 | " audio += [temp_audio]\n", 255 | " else:\n", 256 | " predicted_magnitudes = np.array(predicted_magnitudes).reshape(-1, int(window_size)+1) \n", 257 | " audio += [dataset.griffin_lim(predicted_magnitudes.T, griffin_iterations)]\n", 258 | "audio = np.array(audio)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 72, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stderr", 268 | "output_type": "stream", 269 | "text": [ 270 | "IOPub data rate exceeded.\n", 271 | "The notebook server will temporarily stop sending output\n", 272 | "to the client in order to avoid crashing it.\n", 273 | "To change this limit, set the config variable\n", 274 | "`--NotebookApp.iopub_data_rate_limit`.\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "# Play them back\n", 280 | "\n", 281 | "from IPython.display import Audio\n", 282 | "i = 0\n", 283 | "Audio(audio[i], rate=sample_rate)" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "Python 3", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.8.5" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 2 308 | } 309 | -------------------------------------------------------------------------------- /utils/audio_dataset_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import librosa 5 | import pywt 6 | import math 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | class AudioDatasetGenerator: 12 | """ 13 | Class to manage the dataset for audio generation. 14 | """ 15 | 16 | def __init__(self, fft_size=2048, window_size=1024, hop_size=512, 17 | sequence_length=16, sample_rate=44100): 18 | """Inits the class. Set the fft values to have a significant effect on 19 | the training of the neural network.""" 20 | self.counter = 0 21 | self.epoch_count = 0 22 | self.previous_epoch = -1 23 | self.x_frames = [] 24 | self.y_frames = [] 25 | self.fft_size = fft_size 26 | self.window_size = window_size 27 | self.hop_size = hop_size 28 | self.sequence_length = sequence_length 29 | self.sample_rate = sample_rate 30 | 31 | def load(self, data_path, force=False): 32 | """Loads the dataset from either the binary numpy file, or generates 33 | from a folder of wav files specified at the data_path.""" 34 | x_frames_name = os.path.join(data_path, "x_frames.npy") 35 | y_frames_name = os.path.join(data_path, "y_frames.npy") 36 | if os.path.isfile(x_frames_name) and os.path.isfile(y_frames_name) and force == False: 37 | self.x_frames = np.load(x_frames_name) 38 | self.y_frames = np.load(y_frames_name) 39 | elif os.path.exists(data_path): 40 | self._generate_data(data_path) 41 | self.x_frames = np.array(self.x_frames) 42 | self.y_frames = np.array(self.y_frames) 43 | self.x_frames, self.y_frames = self.unison_shuffled_copies(self.x_frames, 44 | self.y_frames) 45 | np.save(x_frames_name, self.x_frames) 46 | np.save(y_frames_name, self.y_frames) 47 | else: 48 | raise ValueError("Couldn't load files from the supplied path.") 49 | 50 | def get_next_batch(self, batch_size): 51 | """Gets a new batch. 
Reshuffles the dataset at the end of the epoch.""" 52 | if self.counter + batch_size > len(self.y_frames): 53 | self.counter = 0 54 | self.epoch_count += 1 55 | self.x_frames, self.y_frames = self.unison_shuffled_copies(self.x_frames, 56 | self.y_frames) 57 | return_x = self.x_frames[self.counter:self.counter + batch_size] 58 | return_y = self.y_frames[self.counter:self.counter + batch_size] 59 | self.counter += batch_size 60 | return return_x, return_y 61 | 62 | def is_new_epoch(self): 63 | """Returns true if there has been a new epoch.""" 64 | if self.epoch_count != self.previous_epoch: 65 | self.previous_epoch = self.epoch_count 66 | return True 67 | return False 68 | 69 | def get_epoch(self): 70 | """Returns the current epoch.""" 71 | return self.epoch_count 72 | 73 | def reset_epoch(self): 74 | """Resets the current epoch.""" 75 | self.epoch_count = 0 76 | self.previous_epoch -1 77 | 78 | def get_x_shape(self): 79 | """Gets the shame for the x frames. Useful for placeholders.""" 80 | return [None, self.x_frames.shape[1], self.x_frames.shape[2]] 81 | 82 | def get_y_shape(self): 83 | """Gets the shame for the y frames. Useful for placeholders.""" 84 | return [None, self.y_frames.shape[1]] 85 | 86 | def completed_all_epochs(self, desired_epochs): 87 | """Returns true once the get next batch method has been called enough 88 | to have run through desired_epochs amount of epochs.""" 89 | return self.epoch_count >= desired_epochs 90 | 91 | def unison_shuffled_copies(self, a, b): 92 | """Shuffle NumPy arrays in unison.""" 93 | assert len(a) == len(b) 94 | p = np.random.permutation(len(a)) 95 | return a[p], b[p] 96 | 97 | def griffin_lim(self, stftm_matrix, max_iter=100): 98 | """"Iterative method to 'build' phases for magnitudes.""" 99 | stft_matrix = np.random.random(stftm_matrix.shape) 100 | y = librosa.core.istft(stft_matrix, self.hop_size, self.window_size) 101 | for i in range(max_iter): 102 | stft_matrix = librosa.core.stft(y, self.fft_size, self.hop_size, self.window_size) 103 | stft_matrix = stftm_matrix * stft_matrix / np.abs(stft_matrix) 104 | y = librosa.core.istft(stft_matrix, self.hop_size, self.window_size) 105 | return y 106 | 107 | def generate_samples(self, prediction_tensor, x, training, keep_prob, 108 | amount_samples=5, sequence_max_length=2000, 109 | impulse_scale=666, griffin_iterations=100): 110 | """Generates samples in the supplied folder path.""" 111 | all_audio = [] 112 | with tf.Session() as sess: 113 | sess.run(tf.global_variables_initializer()) 114 | for i in range(amount_samples): 115 | random_index = random.randint(0, (len(self.x_frames) - 1)) 116 | 117 | impulse_shape = np.array(self.x_frames[random_index]).shape 118 | #impulse = np.random.random_sample(size=impulse_shape) * impulse_scale 119 | impulse = self.x_frames[random_index] 120 | predicted_magnitudes = impulse 121 | for j in range(sequence_max_length): 122 | impulse = np.array(impulse).reshape(1,self.x_frames.shape[1], self.x_frames.shape[2]) 123 | 124 | prediction = sess.run(prediction_tensor, 125 | feed_dict={x: impulse, training: False, keep_prob: 1.0}) 126 | prediction = prediction.reshape(1, prediction.shape[1]) 127 | predicted_magnitudes = np.vstack((predicted_magnitudes, prediction)) 128 | impulse = predicted_magnitudes[-self.sequence_length:] 129 | 130 | predicted_magnitudes = np.array(predicted_magnitudes) 131 | all_audio += [self.griffin_lim(predicted_magnitudes.T, griffin_iterations)] 132 | return np.array(all_audio) 133 | 134 | def _generate_data(self, data_path): 135 | """Create some data 
from a folder of wav files. 136 | NOTE: the augmentation process should be parameterised.""" 137 | file_names = os.listdir(data_path) 138 | fft_frames = [] 139 | self.x_frames = [] 140 | self.y_frames = [] 141 | for file in file_names: 142 | if file.endswith('.wav'): 143 | file = os.path.join(data_path, file) 144 | data, sample_rate = librosa.load(file, sr=self.sample_rate, 145 | mono=True) 146 | data = np.append(np.zeros(self.window_size * self.sequence_length), data) 147 | mags_phases = librosa.stft(data, n_fft=self.fft_size, 148 | win_length=self.window_size, 149 | hop_length=self.hop_size) 150 | magnitudes, phases = librosa.magphase(mags_phases) 151 | for magnitude_bins in magnitudes.T: 152 | fft_frames += [magnitude_bins] 153 | 154 | start = 0 155 | end = len(fft_frames) - self.sequence_length - 1 156 | step = 1 157 | for i in range(start, end, step): 158 | done = int(float(i) / float(end) * 100.0) 159 | sys.stdout.write('{}% data generation complete. \r'.format(done)) 160 | sys.stdout.flush() 161 | 162 | x = fft_frames[i:i + self.sequence_length] 163 | y = fft_frames[i + self.sequence_length] 164 | self.x_frames.append(x) 165 | self.y_frames.append(y) 166 | 167 | sys.stdout.write('100% data generation complete.') 168 | sys.stdout.flush() 169 | 170 | 171 | class AudioWaveletDatasetGenerator: 172 | """ 173 | Class for wavelets 174 | """ 175 | 176 | def __init__(self, window_size=1024, sequence_length=16, sample_rate=44100, wavelet='db10'): 177 | """Inits the class. Set the fft values to have a significant effect on 178 | the training of the neural network.""" 179 | self.x_frames = [] 180 | self.y_frames = [] 181 | self.window_size = window_size 182 | print(self.window_size) 183 | self.sample_rate = sample_rate 184 | self.sequence_length = sequence_length 185 | self.coeff_slices = [] 186 | self.wavelet = wavelet 187 | 188 | def load(self, data_path, force=False): 189 | """Loads the dataset from either the binary numpy file, or generates 190 | from a folder of wav files specified at the data_path.""" 191 | x_frames_name = os.path.join(data_path, "x_frames.npy") 192 | y_frames_name = os.path.join(data_path, "y_frames.npy") 193 | if os.path.isfile(x_frames_name) and os.path.isfile(y_frames_name) and force == False: 194 | self.x_frames = np.load(x_frames_name) 195 | self.y_frames = np.load(y_frames_name) 196 | elif os.path.exists(data_path): 197 | self._generate_data(data_path) 198 | self.x_frames = np.array(self.x_frames) 199 | self.y_frames = np.array(self.y_frames) 200 | self.x_frames, self.y_frames = self.unison_shuffled_copies(self.x_frames, 201 | self.y_frames) 202 | np.save(x_frames_name, self.x_frames) 203 | np.save(y_frames_name, self.y_frames) 204 | else: 205 | raise ValueError("Couldn't load files from the supplied path.") 206 | 207 | def unison_shuffled_copies(self, a, b): 208 | """Shuffle NumPy arrays in unison.""" 209 | assert len(a) == len(b) 210 | p = np.random.permutation(len(a)) 211 | return a[p], b[p] 212 | 213 | def _generate_data(self, data_path): 214 | """Create some data from a folder of wav files. 
215 | NOTE: the augmentation process should be parameterised.""" 216 | file_names = os.listdir(data_path) 217 | 218 | self.x_frames = [] 219 | self.y_frames = [] 220 | ws = self.window_size 221 | for file in file_names: 222 | if file.endswith('.wav'): 223 | file = os.path.join(data_path, file) 224 | data, sample_rate = librosa.load(file, sr=self.sample_rate, mono=True) 225 | data = np.append(np.zeros(self.window_size * self.sequence_length), data) 226 | for offset in range(0, 1): 227 | wavelet_frames = [] 228 | sys.stdout.write('{} offset \r'.format(offset)) 229 | sys.stdout.flush() 230 | for i in range (0, math.floor((len(data)-offset)/float(ws))): 231 | coeffs = pywt.wavedec(data[(i*ws)+offset:(i*ws+ws)+offset], self.wavelet) 232 | coeff_arr, self.coeff_slices = pywt.coeffs_to_array(coeffs) #slices to flat array 233 | wavelet_frames.append(coeff_arr) 234 | 235 | start = 0 236 | end = len(wavelet_frames) - self.sequence_length 237 | assert end > 0 238 | step = 1 239 | for i in range(start, end, step): 240 | x = wavelet_frames[i:i + self.sequence_length] 241 | y = wavelet_frames[i + self.sequence_length] 242 | self.x_frames.append(x) 243 | self.y_frames.append(y) 244 | done = int(float(offset) / float(1020) * 100.0) 245 | sys.stdout.write('{}% data generation complete. \r'.format(done)) 246 | sys.stdout.flush() 247 | sys.stdout.write('100% data generation complete.') 248 | sys.stdout.flush() 249 | -------------------------------------------------------------------------------- /utils/load_and_convert.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflowjs as tfjs 3 | 4 | model = tf.keras.models.model_from_json('{"class_name": "Sequential", "config": {"name": "sequential_9", "layers": [{"class_name": "BatchNormalization", "config": {"name": "batch_normalization_4", "trainable": true, "batch_input_shape": [null, 40, 1025], "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "LSTM", "config": {"name": "lstm_13", "trainable": true, "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 256, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "LSTM", "config": {"name": "lstm_14", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 256, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, 
"kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "Dense", "config": {"name": "dense_13", "trainable": true, "dtype": "float32", "units": 1025, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_5", "trainable": true, "dtype": "float32", "activation": "linear"}}]}, "keras_version": "2.2.4-tf", "backend": "tensorflow"}') 5 | model.load_weights('/Users/louismccallum/Documents/programming/MIMIC/Examples/rnn-audio/python/wiley2layer_adam_nodropout_155_44100/wiley2layer_adam_nodropout_155_44100.h5') 6 | tfjs.converters.save_keras_model(model, "/Users/louismccallum/Documents/programming/MIMIC/Examples/rnn-audio/wileyModel") 7 | -------------------------------------------------------------------------------- /utils/random_search.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | import tflearn 4 | from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell, GRUCell 5 | from tflearn.layers.core import dropout 6 | from tflearn.layers.conv import conv_2d, max_pool_2d 7 | from audio_dataset_generator import AudioDatasetGenerator 8 | import random 9 | import numpy as np 10 | import json 11 | 12 | 13 | def conv_net(net, filters, kernels, non_linearity): 14 | """ 15 | A quick function to build a conv net. 16 | At the end it reshapes the network to be 3d to work with recurrent units. 17 | """ 18 | assert len(filters) == len(kernels) 19 | 20 | for i in range(len(filters)): 21 | net = conv_2d(net, filters[i], kernels[i], activation=non_linearity) 22 | net = max_pool_2d(net, 2) 23 | 24 | dim1 = net.get_shape().as_list()[1] 25 | dim2 = net.get_shape().as_list()[2] 26 | dim3 = net.get_shape().as_list()[3] 27 | return tf.reshape(net, [-1, dim1 * dim3, dim2]) 28 | 29 | 30 | def recurrent_net(net, rec_type, rec_size, return_sequence): 31 | """ 32 | A quick if else block to build a recurrent layer, based on the type specified 33 | by the user. 34 | """ 35 | if rec_type == 'lstm': 36 | net = tflearn.layers.recurrent.lstm(net, rec_size, return_seq=return_sequence) 37 | elif rec_type == 'gru': 38 | net = tflearn.layers.recurrent.gru(net, rec_size, return_seq=return_sequence) 39 | elif rec_type == 'bi_lstm': 40 | net = bidirectional_rnn(net, 41 | BasicLSTMCell(rec_size), 42 | BasicLSTMCell(rec_size), 43 | return_seq=return_sequence) 44 | elif rec_type == 'bi_gru': 45 | net = bidirectional_rnn(net, 46 | GRUCell(rec_size), 47 | GRUCell(rec_size), 48 | return_seq=return_sequence) 49 | else: 50 | raise ValueError('Incorrect rnn type passed. 
Try lstm, gru, bi_lstm or bi_gru.') 51 | return net 52 | 53 | 54 | def create_random_parameters(): 55 | hyperparameters = dict() 56 | 57 | # Dataset 58 | hyperparameters['sequence_length'] = random.choice([40, 50, 60, 70, 80]) 59 | 60 | # Feature Extraction and Audio Genreation 61 | hyperparameters['sample_rate'] = 22050 62 | hyperparameters['fft_size'] = 2048 63 | hyperparameters['window_size'] = 1024 64 | hyperparameters['hop_size'] = 512 65 | 66 | # General Network 67 | hyperparameters['learning_rate'] = random.choice([1e-2, 1e-3, 1e-4]) 68 | hyperparameters['amount_epochs'] = 700 69 | hyperparameters['batch_size'] = random.choice([32, 64, 128, 256]) 70 | hyperparameters['keep_prob'] = random.choice([0.1, 0.2, 0.3, 0.5, 0.75, 1.0]) 71 | hyperparameters['activation'] = random.choice(['sigmoid', 'tanh', 'relu', 'leaky_relu', 'selu']) 72 | hyperparameters['optimiser'] = random.choice(['adam', 'rmsprop']) 73 | hyperparameters['fully_connected_dim'] = random.choice([512, 1024, 2048]) 74 | 75 | # Recurrent Neural Network 76 | hyperparameters['rnn_type'] = random.choice(["lstm", "gru", "bi_lstm", "bi_gru"]) 77 | hyperparameters['number_rnn_layers'] = random.choice([1, 2, 3, 4]) 78 | hyperparameters['rnn_number_units'] = random.choice([256, 512, 1024]) 79 | 80 | # Convolutional Neural Network 81 | hyperparameters['use_cnn'] = random.choice([True, False]) 82 | cnn_int = random.randint(0, 3) 83 | hyperparameters['number_filters'] = [[32], [64], [32, 64], [64, 32]][cnn_int] 84 | hyperparameters['filter_sizes'] = [[1], [3], [1, 5], [1, 3]][cnn_int] 85 | 86 | hyperparameters['fitness'] = 0.0 87 | 88 | return hyperparameters 89 | 90 | 91 | epoch = 0 92 | 93 | for model_no in range(100): 94 | try: 95 | hyperparameters = create_random_parameters() 96 | 97 | paths = ["assets/electronic_piano/", "assets/other", "assets/test_samples/"] 98 | 99 | for audio_data_path in paths: 100 | 101 | tf.reset_default_graph() 102 | 103 | dataset = AudioDatasetGenerator(hyperparameters['fft_size'], 104 | hyperparameters['window_size'], 105 | hyperparameters['hop_size'], 106 | hyperparameters['sequence_length'], 107 | hyperparameters['sample_rate']) 108 | 109 | dataset.load(audio_data_path, True) 110 | 111 | if hyperparameters['use_cnn']: 112 | dataset.x_frames = dataset.x_frames.reshape(dataset.x_frames.shape[0], 113 | dataset.x_frames.shape[1], 114 | dataset.x_frames.shape[2], 115 | 1) 116 | if hyperparameters['use_cnn']: 117 | net = tflearn.input_data([None, 118 | dataset.x_frames.shape[1], 119 | dataset.x_frames.shape[2], 120 | dataset.x_frames.shape[3]], 121 | name="input_data0") 122 | net = conv_net(net, 123 | hyperparameters['number_filters'], 124 | hyperparameters['filter_sizes'], 125 | hyperparameters['activation']) 126 | else: 127 | net = tflearn.input_data([None, 128 | dataset.x_frames.shape[1], 129 | dataset.x_frames.shape[2]], 130 | name="input_data0") 131 | 132 | # Batch Norm 133 | net = tflearn.batch_normalization(net, name="batch_norm0") 134 | 135 | # Recurrent 136 | for layer in range(hyperparameters['number_rnn_layers']): 137 | return_sequence = not layer == (hyperparameters['number_rnn_layers'] - 1) 138 | net = recurrent_net(net, 139 | hyperparameters['rnn_type'], 140 | hyperparameters['rnn_number_units'], 141 | return_sequence) 142 | if hyperparameters['keep_prob'] < 1.0: 143 | net = dropout(net, 1.0 - hyperparameters['keep_prob']) 144 | 145 | # Dense + MLP Out 146 | net = tflearn.fully_connected(net, 147 | dataset.y_frames.shape[1], 148 | activation=hyperparameters['activation'], 149 | 
regularizer='L2', 150 | weight_decay=0.001) 151 | 152 | net = tflearn.fully_connected(net, 153 | dataset.y_frames.shape[1], 154 | activation='linear') 155 | 156 | net = tflearn.regression(net, 157 | optimizer=hyperparameters['optimiser'], 158 | learning_rate=hyperparameters['learning_rate'], 159 | loss="mean_square") 160 | 161 | model = tflearn.DNN(net, tensorboard_verbose=1) 162 | 163 | model.fit(dataset.x_frames, 164 | dataset.y_frames, 165 | show_metric=True, 166 | batch_size=hyperparameters['batch_size'], 167 | n_epoch=hyperparameters['amount_epochs']) 168 | 169 | model_name = '{}_{}'.format(epoch, model_no) 170 | with open(model_name + '.json', 'w') as fp: 171 | json.dump(hyperparameters, fp) 172 | 173 | amount_samples = 1 174 | sequence_length_max = 1000 175 | impulse_scale = 1.0 176 | griffin_iterations = 60 177 | random_chance = 0.0 178 | random_strength = 0.0 179 | 180 | dimension1 = dataset.x_frames.shape[1] 181 | dimension2 = dataset.x_frames.shape[2] 182 | shape = (1, dimension1, dimension2, 1) if hyperparameters['use_cnn'] else (1, dimension1, dimension2) 183 | 184 | audio = [] 185 | 186 | for i in range(amount_samples): 187 | 188 | random_index = 5 189 | 190 | impulse = np.array(dataset.x_frames[random_index]) * impulse_scale 191 | predicted_magnitudes = impulse 192 | 193 | for j in range(sequence_length_max): 194 | 195 | prediction = model.predict(impulse.reshape(shape)) 196 | 197 | if hyperparameters['use_cnn']: 198 | prediction = prediction.reshape(1, dataset.y_frames.shape[1], 1) 199 | 200 | predicted_magnitudes = np.vstack((predicted_magnitudes, prediction)) 201 | impulse = predicted_magnitudes[-hyperparameters['sequence_length']:] 202 | 203 | if (np.random.random_sample() < random_chance) : 204 | idx = np.random.randint(0, dataset.sequence_length) 205 | impulse[idx] = impulse[idx] + np.random.random_sample(impulse[idx].shape) * random_strength 206 | 207 | predicted_magnitudes = np.array(predicted_magnitudes).reshape(-1, hyperparameters['window_size'] + 1) 208 | audio = np.array(dataset.griffin_lim(predicted_magnitudes.T, griffin_iterations)) 209 | filepath = model_name + '_{}_{}.wav'.format(i, audio_data_path) 210 | librosa.output.write_wav(filepath, 211 | audio, 212 | hyperparameters['sample_rate']) 213 | 214 | except (KeyboardInterrupt, SystemExit): 215 | raise 216 | except Exception as ex: 217 | print(ex) 218 | pass 219 | --------------------------------------------------------------------------------
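A note on phase reconstruction: every generator in this repo ends the same way - the network only predicts magnitude spectra, and Griffin-Lim is used to estimate a phase that fits them so the frames can be inverted back to a waveform (hand-rolled in `utils/audio_dataset_generator.py`, via `librosa.griffinlim` in the PyTorch scripts). The sketch below shows that step in isolation; the input file name is an illustrative placeholder, and a real STFT magnitude spectrogram stands in for model output.

```python
import numpy as np
import librosa
import soundfile as sf

n_fft, hop_length, win_length = 2048, 512, 1024

# Stand-in for the model's output: the magnitude spectrogram of a real file.
y, sr = librosa.load("example.wav", sr=44100, mono=True)    # illustrative path
magnitudes = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                                 win_length=win_length))    # [n_fft//2 + 1, frames]

# Griffin-Lim: iteratively refine a phase estimate consistent with the magnitudes,
# then invert back to a time-domain signal.
reconstructed = librosa.griffinlim(magnitudes, n_iter=60, n_fft=n_fft,
                                   hop_length=hop_length, win_length=win_length)
sf.write("reconstructed.wav", reconstructed, sr)
```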