├── Input
│   └── __init__.py
├── Models
│   ├── __init__.py
│   ├── OutputLayer.py
│   ├── InterpolationLayer.py
│   ├── UnetSpectrogramSeparator.py
│   └── UnetAudioSeparator.py
├── waveunet.png
├── audio_examples
│   ├── Cristina Vane - So Easy
│   │   ├── mix.mp3
│   │   ├── vocals_true.mp3
│   │   ├── vocals_estimate.mp3
│   │   ├── accompaniment_true.mp3
│   │   └── accompaniment_estimate.mp3
│   ├── Triviul feat. The Fiend - Widow
│   │   ├── mix.mp3
│   │   ├── vocals_true.mp3
│   │   ├── vocals_estimate.mp3
│   │   ├── accompaniment_true.mp3
│   │   └── accompaniment_estimate.mp3
│   └── The Mountaineering Club - Mallory
│       ├── mix.mp3
│       ├── vocals_true.mp3
│       ├── vocals_estimate.mp3
│       ├── accompaniment_true.mp3
│       └── accompaniment_estimate.mp3
├── checkpoints
│   └── README.md
├── requirements.txt
├── data
│   └── README.md
├── musb_005_angela thomas wade_audio_model_without_context_cut_28234samples_61002samples_93770samples_126538.wav
├── Predict.py
├── LICENSE
├── Plot.py
├── Test.py
├── Training.py
├── Utils.py
├── Config.py
├── README.md
├── Evaluate.py
├── Datasets.py
└── CCMixter.xml
/Input/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/waveunet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/waveunet.png
--------------------------------------------------------------------------------
/audio_examples/Cristina Vane - So Easy/mix.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Cristina Vane - So Easy/mix.mp3
--------------------------------------------------------------------------------
/audio_examples/Cristina Vane - So Easy/vocals_true.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Cristina Vane - So Easy/vocals_true.mp3
--------------------------------------------------------------------------------
/audio_examples/Triviul feat. The Fiend - Widow/mix.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Triviul feat. The Fiend - Widow/mix.mp3
--------------------------------------------------------------------------------
/audio_examples/The Mountaineering Club - Mallory/mix.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/The Mountaineering Club - Mallory/mix.mp3
--------------------------------------------------------------------------------
/audio_examples/Cristina Vane - So Easy/vocals_estimate.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Cristina Vane - So Easy/vocals_estimate.mp3
--------------------------------------------------------------------------------
/audio_examples/Cristina Vane - So Easy/accompaniment_true.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Cristina Vane - So Easy/accompaniment_true.mp3
--------------------------------------------------------------------------------
/audio_examples/The Mountaineering Club - Mallory/vocals_true.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/The Mountaineering Club - Mallory/vocals_true.mp3
--------------------------------------------------------------------------------
/audio_examples/Triviul feat. The Fiend - Widow/vocals_true.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Triviul feat. The Fiend - Widow/vocals_true.mp3
--------------------------------------------------------------------------------
/checkpoints/README.md:
--------------------------------------------------------------------------------
1 | Unzip the pretrained models archive file here to use the models we provide!
2 |
3 | Your own models will also be saved here under a randomly assigned ID.
--------------------------------------------------------------------------------
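
A minimal Python sketch of unpacking such an archive into this folder from the repository root (the archive file name below is a placeholder, not a file shipped with the repository):

```
import zipfile

# Extract the downloaded pretrained-model archive into checkpoints/
with zipfile.ZipFile("pretrained_models.zip") as archive:
    archive.extractall("checkpoints")
```

After extraction, Predict.py (below) expects checkpoint paths such as checkpoints/full_44KHz/full_44KHz-236118.
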
/audio_examples/Cristina Vane - So Easy/accompaniment_estimate.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Cristina Vane - So Easy/accompaniment_estimate.mp3
--------------------------------------------------------------------------------
/audio_examples/Triviul feat. The Fiend - Widow/vocals_estimate.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Triviul feat. The Fiend - Widow/vocals_estimate.mp3
--------------------------------------------------------------------------------
/audio_examples/The Mountaineering Club - Mallory/vocals_estimate.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/The Mountaineering Club - Mallory/vocals_estimate.mp3
--------------------------------------------------------------------------------
/audio_examples/Triviul feat. The Fiend - Widow/accompaniment_true.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Triviul feat. The Fiend - Widow/accompaniment_true.mp3
--------------------------------------------------------------------------------
/audio_examples/The Mountaineering Club - Mallory/accompaniment_true.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/The Mountaineering Club - Mallory/accompaniment_true.mp3
--------------------------------------------------------------------------------
/audio_examples/Triviul feat. The Fiend - Widow/accompaniment_estimate.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/Triviul feat. The Fiend - Widow/accompaniment_estimate.mp3
--------------------------------------------------------------------------------
/audio_examples/The Mountaineering Club - Mallory/accompaniment_estimate.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/audio_examples/The Mountaineering Club - Mallory/accompaniment_estimate.mp3
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.15.4
2 | sacred==0.7.3
3 | tensorflow-gpu==1.8.0
4 | librosa==0.6.2
5 | soundfile==0.10.2
6 | lxml==4.2.1
7 | musdb==0.2.3
8 | museval==0.2.0
9 | google==2.0.1
10 | protobuf==3.4.0
11 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Pre-processed datasets will be saved here to speed up training!
2 | Different configurations require different datasets (one subfolder will be created for each one), so you may want to delete some of them manually from time to time to save space!
--------------------------------------------------------------------------------
/musb_005_angela thomas wade_audio_model_without_context_cut_28234samples_61002samples_93770samples_126538.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/f90/Wave-U-Net/HEAD/musb_005_angela thomas wade_audio_model_without_context_cut_28234samples_61002samples_93770samples_126538.wav
--------------------------------------------------------------------------------
/Predict.py:
--------------------------------------------------------------------------------
1 | from sacred import Experiment
2 | from Config import config_ingredient
3 | import Evaluate
4 | import os
5 |
6 | ex = Experiment('Waveunet Prediction', ingredients=[config_ingredient])
7 |
8 | @ex.config
9 | def cfg():
10 | model_path = os.path.join("checkpoints", "full_44KHz", "full_44KHz-236118") # Load stereo vocal model by default
11 | input_path = os.path.join("audio_examples", "The Mountaineering Club - Mallory", "mix.mp3") # Which audio file to separate
12 | output_path = None # Where to save results. Default: Same location as input.
13 |
14 | @ex.automain
15 | def main(cfg, model_path, input_path, output_path):
16 | model_config = cfg["model_config"]
17 | Evaluate.produce_source_estimates(model_config, model_path, input_path, output_path)
--------------------------------------------------------------------------------
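
As a hedged usage sketch (not part of the repository), the experiment above can also be started from Python via Sacred's run API; this should be equivalent to a command-line call such as `python Predict.py with cfg.full_44KHz input_path=<file>`. The named config is assumed to match the pretrained model being loaded, and the input path is just one of the bundled audio examples:

```
from Predict import ex

ex.run(named_configs=["cfg.full_44KHz"],
       config_updates={"input_path": "audio_examples/Cristina Vane - So Easy/mix.mp3"})
```
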
/Models/OutputLayer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | import Utils
4 |
5 | def independent_outputs(featuremap, source_names, num_channels, filter_width, padding, activation):
6 | outputs = dict()
7 | for name in source_names:
8 | outputs[name] = tf.layers.conv1d(featuremap, num_channels, filter_width, activation=activation, padding=padding)
9 | return outputs
10 |
11 | def difference_output(input_mix, featuremap, source_names, num_channels, filter_width, padding, activation, training):
12 | outputs = dict()
13 | sum_source = 0
14 | for name in source_names[:-1]:
15 | out = tf.layers.conv1d(featuremap, num_channels, filter_width, activation=activation, padding=padding)
16 | outputs[name] = out
17 | sum_source = sum_source + out
18 |
19 | # Compute last source based on the others
20 | last_source = Utils.crop(input_mix, sum_source.get_shape().as_list()) - sum_source
21 | last_source = Utils.AudioClip(last_source, training)
22 | outputs[source_names[-1]] = last_source
23 | return outputs
--------------------------------------------------------------------------------
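
To make the "difference output" used above concrete, here is a small illustrative NumPy sketch (not repository code): only K-1 sources are estimated directly, and the last source is reconstructed as the mixture minus the sum of the others, so the estimates always add up to the mixture:

```
import numpy as np

np.random.seed(0)
vocals = np.random.uniform(-0.5, 0.5, 1000)           # toy ground-truth sources
accompaniment = np.random.uniform(-0.5, 0.5, 1000)
mix = vocals + accompaniment                          # the mixture is the sum of its sources

estimates = {"vocals": vocals}                        # direct estimates for K-1 sources
estimates["accompaniment"] = mix - sum(estimates.values())  # difference output

assert np.allclose(estimates["accompaniment"], accompaniment)
assert np.allclose(sum(estimates.values()), mix)      # estimates sum to the mixture by construction
```
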
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Daniel Stoller
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Models/InterpolationLayer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def learned_interpolation_layer(input, padding, level):
5 | '''
6 | Implements a trainable upsampling layer that interpolates by a factor of two, from N samples to N*2 - 1.
7 | Interpolation of intermediate feature vectors v_1 and v_2 (of dimensionality F) is performed by
8 | w \cdot v_1 + (1-w) \cdot v_2, where \cdot is point-wise multiplication, and w an F-dimensional weight vector constrained to [0,1]
9 | :param input: Input features of shape [batch_size, 1, width, F]
10 | :param padding:
11 | :param level:
12 | :return:
13 | '''
14 | assert(padding == "valid" or padding == "same")
15 | features = input.get_shape().as_list()[3]
16 |
17 | # Construct 2FxF weight matrix, where F is the number of feature channels in the feature map.
18 | # Matrix is constrained, made up of two diagonal FxF matrices with diagonal weights w and 1-w. w is constrained to be in [0,1] via a sigmoid
19 | weights = tf.get_variable("interp_" + str(level), shape=[features], dtype=tf.float32)
20 | weights_scaled = tf.nn.sigmoid(weights) # Constrain weights to [0,1]
21 | counter_weights = 1.0 - weights_scaled # Mirrored weights for the features from the other time step
22 | conv_weights = tf.expand_dims(tf.concat([tf.expand_dims(tf.diag(weights_scaled), axis=0), tf.expand_dims(tf.diag(counter_weights), axis=0)], axis=0), axis=0)
23 | intermediate_vals = tf.nn.conv2d(input, conv_weights, strides=[1,1,1,1], padding=padding.upper())
24 |
25 | intermediate_vals = tf.transpose(intermediate_vals, [2, 0, 1, 3])
26 | out = tf.transpose(input, [2, 0, 1, 3])
27 | num_entries = out.get_shape().as_list()[0]
28 | out = tf.concat([out, intermediate_vals], axis=0)
29 | indices = list()
30 |
31 | # Interleave interpolated features with original ones, starting with the first original one
32 | num_outputs = (2*num_entries - 1) if padding == "valid" else 2*num_entries
33 | for idx in range(num_outputs):
34 | if idx % 2 == 0:
35 | indices.append(idx // 2)
36 | else:
37 | indices.append(num_entries + idx//2)
38 | out = tf.gather(out, indices)
39 | current_layer = tf.transpose(out, [1, 2, 0, 3])
40 | return current_layer
--------------------------------------------------------------------------------
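
The convolution-based implementation above can be hard to read, so here is an illustrative NumPy sketch (not repository code) of the same idea for the "valid" case: each per-feature weight is squashed to [0,1] with a sigmoid, neighbouring time steps are blended with w and 1-w, and the blended frames are interleaved between the originals, turning N frames into 2N-1:

```
import numpy as np

def toy_learned_interpolation(x, raw_w):
    # x: [time, features], raw_w: unconstrained per-feature weights
    w = 1.0 / (1.0 + np.exp(-raw_w))             # sigmoid keeps w in [0, 1]
    mids = w * x[:-1] + (1.0 - w) * x[1:]        # blend neighbouring time steps
    out = np.empty((2 * x.shape[0] - 1, x.shape[1]))
    out[0::2] = x                                # originals at even positions
    out[1::2] = mids                             # interpolated frames in between
    return out

x = np.arange(8, dtype=float).reshape(4, 2)      # 4 time steps, 2 features
print(toy_learned_interpolation(x, np.zeros(2))) # w = 0.5 reduces to plain averaging
```
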
/Plot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import matplotlib.pyplot as plt
4 | import matplotlib.ticker as ticker
5 | from mpl_toolkits.axes_grid1 import make_axes_locatable
6 |
7 | import Utils
8 | from Evaluate import compute_mean_metrics
9 |
10 |
11 | def draw_violin_sdr(json_folder):
12 | acc, voc = compute_mean_metrics(json_folder, compute_averages=False)
13 | acc = acc[~np.isnan(acc)]
14 | voc = voc[~np.isnan(voc)]
15 | data = [acc, voc]
16 | inds = [1,2]
17 |
18 | fig, ax = plt.subplots()
19 | ax.violinplot(data, showmeans=True, showmedians=False, showextrema=False, vert=False)
20 | ax.scatter(np.percentile(data, 50, axis=1),inds, marker="o", color="black")
21 | ax.set_title("Segment-wise SDR distribution")
22 | ax.vlines([np.min(acc), np.min(voc), np.max(acc), np.max(voc)], [0.8, 1.8, 0.8, 1.8], [1.2, 2.2, 1.2, 2.2], color="blue")
23 | ax.hlines(inds, [np.min(acc), np.min(voc)], [np.max(acc), np.max(voc)], color='black', linestyle='--', lw=1, alpha=0.5)
24 |
25 | ax.set_yticks([1,2])
26 | ax.set_yticklabels(["Accompaniment", "Vocals"])
27 |
28 | fig.set_size_inches(8, 3.)
29 | fig.savefig("sdr_histogram.pdf", bbox_inches='tight')
30 |
31 | def draw_spectrogram(example_wav="musb_005_angela thomas wade_audio_model_without_context_cut_28234samples_61002samples_93770samples_126538.wav"):
32 | y, sr = Utils.load(example_wav, sr=None)
33 | spec = np.abs(librosa.stft(y, 512, 256, 512))
34 | norm_spec = librosa.power_to_db(spec**2)
35 | black_time_frames = np.array([28234, 61002, 93770, 126538]) / 256.0
36 |
37 | fig, ax = plt.subplots()
38 | img = ax.imshow(norm_spec)
39 | plt.vlines(black_time_frames, [0, 0, 0, 0], [10, 10, 10, 10], colors="red", lw=2, alpha=0.5)
40 | plt.vlines(black_time_frames, [256, 256, 256, 256], [246, 246, 246, 246], colors="red", lw=2, alpha=0.5)
41 |
42 | divider = make_axes_locatable(ax)
43 | cax = divider.append_axes("right", size="5%", pad=0.1)
44 | plt.colorbar(img, cax=cax)
45 |
46 | ax.xaxis.set_label_position("bottom")
47 | #ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * 256.0 / sr))
48 | #ax.xaxis.set_major_formatter(ticks_x)
49 | ax.xaxis.set_major_locator(ticker.FixedLocator(([i * sr / 256. for i in range(len(y)//sr + 1)])))
50 | ax.xaxis.set_major_formatter(ticker.FixedFormatter(([str(i) for i in range(len(y)//sr + 1)])))
51 |
52 | ax.yaxis.set_major_locator(ticker.FixedLocator(([float(i) * 2000.0 / (sr/2.0) * 256. for i in range(6)])))
53 | ax.yaxis.set_major_formatter(ticker.FixedFormatter([str(i*2) for i in range(6)]))
54 |
55 | ax.set_xlabel("t (s)")
56 | ax.set_ylabel('f (KHz)')
57 |
58 | fig.set_size_inches(7., 3.)
59 | fig.savefig("spectrogram_example.pdf", bbox_inches='tight')
60 |
61 |
--------------------------------------------------------------------------------
/Test.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.contrib.signal.python.ops import window_ops
3 | import numpy as np
4 | import os
5 |
6 | import Datasets
7 | import Models.UnetSpectrogramSeparator
8 | import Models.UnetAudioSeparator
9 | import functools
10 |
11 | def test(model_config, partition, model_folder, load_model):
12 | # Determine input and output shapes
13 | disc_input_shape = [model_config["batch_size"], model_config["num_frames"], 0] # Shape of discriminator input
14 | if model_config["network"] == "unet":
15 | separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(model_config)
16 | elif model_config["network"] == "unet_spectrogram":
17 | separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(model_config)
18 | else:
19 | raise NotImplementedError
20 |
21 | sep_input_shape, sep_output_shape = separator_class.get_padding(np.array(disc_input_shape))
22 | separator_func = separator_class.get_output
23 |
24 | # Creating the batch generators
25 | assert ((sep_input_shape[1] - sep_output_shape[1]) % 2 == 0)
26 | dataset = Datasets.get_dataset(model_config, sep_input_shape, sep_output_shape, partition=partition)
27 | iterator = dataset.make_one_shot_iterator()
28 | batch = iterator.get_next()
29 |
30 | print("Testing...")
31 |
32 | # BUILD MODELS
33 | # Separator
34 | separator_sources = separator_func(batch["mix"], False, not model_config["raw_audio_loss"], reuse=False) # Sources are output in order [acc, voice] for voice separation, [bass, drums, other, vocals] for multi-instrument separation
35 |
36 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False, dtype=tf.int64)
37 |
38 | # Start session and queue input threads
39 | sess = tf.Session()
40 | sess.run(tf.global_variables_initializer())
41 | writer = tf.summary.FileWriter(model_config["log_dir"] + os.path.sep + model_folder, graph=sess.graph)
42 |
43 | # CHECKPOINTING
44 | # Load pretrained model to test
45 | restorer = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
46 | print("Num of variables" + str(len(tf.global_variables())))
47 | restorer.restore(sess, load_model)
48 | print('Pre-trained model restored for testing')
49 |
50 | # Start testing loop
51 | _global_step = sess.run(global_step)
52 | print("Starting!")
53 |
54 | total_loss = 0.0
55 | batch_num = 1
56 |
57 | # Supervised objective: MSE for raw audio, MAE for magnitude space (Jansson U-Net)
58 | separator_loss = 0
59 | for key in model_config["source_names"]:
60 | real_source = batch[key]
61 | sep_source = separator_sources[key]
62 |
63 | if model_config["network"] == "unet_spectrogram" and not model_config["raw_audio_loss"]:
64 | window = functools.partial(window_ops.hann_window, periodic=True)
65 | stfts = tf.contrib.signal.stft(tf.squeeze(real_source, 2), frame_length=1024, frame_step=768,
66 | fft_length=1024, window_fn=window)
67 | real_mag = tf.abs(stfts)
68 | separator_loss += tf.reduce_mean(tf.abs(real_mag - sep_source))
69 | else:
70 | separator_loss += tf.reduce_mean(tf.square(real_source - sep_source))
71 | separator_loss = separator_loss / float(model_config["num_sources"]) # Normalise by number of sources
72 |
73 | while True:
74 | try:
75 | curr_loss = sess.run(separator_loss)
76 | total_loss = total_loss + (1.0 / float(batch_num)) * (curr_loss - total_loss)
77 | batch_num += 1
78 | except tf.errors.OutOfRangeError as e:
79 | break
80 |
81 | summary = tf.Summary(value=[tf.Summary.Value(tag="test_loss", simple_value=total_loss)])
82 | writer.add_summary(summary, global_step=_global_step)
83 |
84 | writer.flush()
85 | writer.close()
86 |
87 | print("Finished testing - Mean MSE: " + str(total_loss))
88 |
89 | # Close session, clear computational graph
90 | sess.close()
91 | tf.reset_default_graph()
92 |
93 | return total_loss
--------------------------------------------------------------------------------
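
A side note on the evaluation loop above: `total_loss = total_loss + (1.0 / float(batch_num)) * (curr_loss - total_loss)` is an incremental running mean, so the per-batch losses never need to be stored. A quick standalone check of that identity:

```
losses = [0.4, 0.1, 0.7, 0.2]
running, n = 0.0, 1
for curr in losses:
    running = running + (1.0 / n) * (curr - running)   # same update as in test()
    n += 1
assert abs(running - sum(losses) / len(losses)) < 1e-12
```
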
/Models/UnetSpectrogramSeparator.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from Utils import LeakyReLU
4 | import functools
5 | from tensorflow.contrib.signal.python.ops import window_ops
6 |
7 | class UnetSpectrogramSeparator:
8 | '''
9 | U-Net separator network for singing voice separation.
10 | Takes in the mixture magnitude spectrogram and returns estimates of the accompaniment and voice magnitude spectrograms.
11 | Uses "same" convolutions as in the original paper
12 | '''
13 |
14 | def __init__(self, model_config):
15 | '''
16 | Initialize U-net
17 | :param num_layers: Number of down- and upscaling layers in the network
18 | '''
19 | self.num_layers = model_config["num_layers"]
20 | self.num_initial_filters = model_config["num_initial_filters"]
21 | self.mono = model_config["mono_downmix"]
22 | self.source_names = model_config["source_names"]
23 |
24 | assert(len(self.source_names) == 2) # Only use for acc/voice separation for now, since model gets too big otherwise
25 | assert(self.mono) # Only mono
26 |
27 | # Spectrogram settings
28 | self.frame_len = 1024
29 | self.hop = 768
30 |
31 | def get_padding(self, shape):
32 | '''
33 | Calculates the required amounts of padding along each axis of the input and output, so that the Unet works and has the given shape as output shape
34 | :param shape: Desired output shape
35 | :return: Padding along each axis (total): (Input frequency, input time)
36 | '''
37 |
38 | return [shape[0], shape[1], 1], [shape[0], shape[1], 1]
39 |
40 | def get_output(self, input, training, return_spectrogram=False, reuse=True):
41 | '''
42 | Creates symbolic computation graph of the U-Net for a given input batch
43 | :param input: Input batch of mixtures, 3D tensor [batch_size, num_samples, 1], mono raw audio
44 | :param reuse: Whether to create new parameter variables or reuse existing ones
45 | :param return_spectrogram: Whether to output the spectrogram estimate or convert it to raw audio and return that
46 | :return: U-Net output: If return_spectrogram: dictionary with the accompaniment and voice magnitude estimates. Otherwise a dictionary of two 3D tensors containing the raw audio estimates
47 | '''
48 | # Setup STFT computation
49 | window = functools.partial(window_ops.hann_window, periodic=True)
50 | inv_window = tf.contrib.signal.inverse_stft_window_fn(self.hop, forward_window_fn=window)
51 | with tf.variable_scope("separator", reuse=reuse):
52 | # Compute spectrogram
53 | assert(input.get_shape().as_list()[2] == 1) # Model works ONLY on mono
54 | stfts = tf.contrib.signal.stft(tf.squeeze(input, 2), frame_length=self.frame_len, frame_step=self.hop, fft_length=self.frame_len, window_fn=window)
55 | mix_mag = tf.abs(stfts)
56 | mix_angle = tf.angle(stfts)
57 |
58 | # Input for network
59 | mix_mag_norm = tf.log1p(tf.expand_dims(mix_mag, 3))
60 | mix_mag_norm = mix_mag_norm[:,:,:-1,:] # Cut off last frequency bin to make number of frequency bins divisible by 2
61 |
62 | mags = dict()
63 | for name in self.source_names: # One U-Net for each source as per Jansson et al
64 | enc_outputs = list()
65 | current_layer = mix_mag_norm
66 |
67 | # Down-convolution: Repeat pool-conv
68 | for i in range(self.num_layers):
69 | assert(current_layer.get_shape().as_list()[1] % 2 == 0 and current_layer.get_shape().as_list()[2] % 2 == 0)
70 | current_layer = tf.layers.conv2d(current_layer, self.num_initial_filters*(2**i), [5, 5], strides=[2,2], activation=None, padding='same')
71 | current_layer = tf.contrib.layers.batch_norm(current_layer, activation_fn=LeakyReLU, is_training=training)
72 |
73 | if i < self.num_layers - 1:
74 | enc_outputs.append(current_layer)
75 |
76 | # Upconvolution
77 | for i in range(self.num_layers - 1):
78 | # Repeat: Up-convolution (transposed conv with stride), copy-and-crop feature map from down-ward path, convolution to combine both feature maps
79 | current_layer = tf.layers.conv2d_transpose(current_layer, self.num_initial_filters*(2**(self.num_layers-i-2)), [5, 5], strides=[2,2], activation=None, padding="same") # *2
80 | current_layer = tf.contrib.layers.batch_norm(current_layer, is_training=training, activation_fn=tf.nn.relu)
81 | current_layer = tf.concat([enc_outputs[-i-1], current_layer], axis=3) #tf.concat([enc_outputs[-i - 1], current_layer], axis=3)
82 | if i < 3:
83 | current_layer = tf.layers.dropout(current_layer, training=training)
84 |
85 | # Compute mask
86 | mask = tf.layers.conv2d_transpose(current_layer, 1, [5,5], strides=[2,2], activation=tf.nn.sigmoid, padding="same")
87 | mask = tf.pad(mask, [(0,0), (0,0), (0, 1), (0,0)], mode="CONSTANT", constant_values=0.5) # Pad last frequency bin of mask that is missing since we removed it in the input
88 | mask = tf.squeeze(mask, 3)
89 |
90 | # Compute source magnitudes
91 | source_mag = tf.multiply(mix_mag, mask)
92 | mags[name] = source_mag
93 |
94 | if return_spectrogram:
95 | return mags
96 | else:
97 | audio_out = dict()
98 | # Reconstruct audio
99 | for source_name in list(mags.keys()):
100 | stft = tf.multiply(tf.complex(mags[source_name], 0.0), tf.exp(tf.complex(0.0, mix_angle)))
101 | audio = tf.contrib.signal.inverse_stft(stft, self.frame_len, self.hop, self.frame_len, window_fn=inv_window)
102 |
103 | # Reshape to [batch_size, samples, 1]
104 | audio = tf.expand_dims(audio, 2)
105 |
106 | audio_out[source_name] = audio
107 |
108 | return audio_out
109 |
--------------------------------------------------------------------------------
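
For intuition, a hedged librosa/NumPy sketch (not repository code) of the masking scheme used above: the estimated mask scales the mixture magnitude, and the mixture phase is reused for the inverse STFT. The input file name and the constant mask (standing in for the U-Net output) are placeholders:

```
import numpy as np
import librosa

mix, sr = librosa.load("mix.wav", sr=None, mono=True)   # placeholder input file
stft = librosa.stft(mix, n_fft=1024, hop_length=768)    # same frame/hop sizes as the model
mag, phase = np.abs(stft), np.angle(stft)

mask = np.full_like(mag, 0.5)                           # stand-in for the predicted mask
source_mag = mag * mask                                 # masked source magnitude
source = librosa.istft(source_mag * np.exp(1j * phase), hop_length=768)
```
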
/Training.py:
--------------------------------------------------------------------------------
1 | from sacred import Experiment
2 | from Config import config_ingredient
3 | import tensorflow as tf
4 | import numpy as np
5 | import os
6 |
7 | import Datasets
8 | import Utils
9 | import Models.UnetSpectrogramSeparator
10 | import Models.UnetAudioSeparator
11 | import Test
12 | import Evaluate
13 |
14 | import functools
15 | from tensorflow.contrib.signal.python.ops import window_ops
16 |
17 | ex = Experiment('Waveunet Training', ingredients=[config_ingredient])
18 |
19 | @ex.config
20 | # Executed for training; adds the seed value to the Sacred config so that Sacred fixes the Python and Numpy RNGs to the same state every time.
21 | def set_seed():
22 | seed = 1337
23 |
24 | @config_ingredient.capture
25 | def train(model_config, experiment_id, load_model=None):
26 | # Determine input and output shapes
27 | disc_input_shape = [model_config["batch_size"], model_config["num_frames"], 0] # Shape of input
28 | if model_config["network"] == "unet":
29 | separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(model_config)
30 | elif model_config["network"] == "unet_spectrogram":
31 | separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(model_config)
32 | else:
33 | raise NotImplementedError
34 |
35 | sep_input_shape, sep_output_shape = separator_class.get_padding(np.array(disc_input_shape))
36 | separator_func = separator_class.get_output
37 |
38 | # Placeholders and input normalisation
39 | dataset = Datasets.get_dataset(model_config, sep_input_shape, sep_output_shape, partition="train")
40 | iterator = dataset.make_one_shot_iterator()
41 | batch = iterator.get_next()
42 |
43 | print("Training...")
44 |
45 | # BUILD MODELS
46 | # Separator
47 | separator_sources = separator_func(batch["mix"], True, not model_config["raw_audio_loss"], reuse=False) # Sources are output in order [acc, voice] for voice separation, [bass, drums, other, vocals] for multi-instrument separation
48 |
49 | # Supervised objective: MSE for raw audio, MAE for magnitude space (Jansson U-Net)
50 | separator_loss = 0
51 | for key in model_config["source_names"]:
52 | real_source = batch[key]
53 | sep_source = separator_sources[key]
54 |
55 | if model_config["network"] == "unet_spectrogram" and not model_config["raw_audio_loss"]:
56 | window = functools.partial(window_ops.hann_window, periodic=True)
57 | stfts = tf.contrib.signal.stft(tf.squeeze(real_source, 2), frame_length=1024, frame_step=768,
58 | fft_length=1024, window_fn=window)
59 | real_mag = tf.abs(stfts)
60 | separator_loss += tf.reduce_mean(tf.abs(real_mag - sep_source))
61 | else:
62 | separator_loss += tf.reduce_mean(tf.square(real_source - sep_source))
63 | separator_loss = separator_loss / float(model_config["num_sources"]) # Normalise by number of sources
64 |
65 | # TRAINING CONTROL VARIABLES
66 | global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False, dtype=tf.int64)
67 | increment_global_step = tf.assign(global_step, global_step + 1)
68 |
69 | # Set up optimizers
70 | separator_vars = Utils.getTrainableVariables("separator")
71 | print("Sep_Vars: " + str(Utils.getNumParams(separator_vars)))
72 | print("Num of variables" + str(len(tf.global_variables())))
73 |
74 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
75 | with tf.control_dependencies(update_ops):
76 | with tf.variable_scope("separator_solver"):
77 | separator_solver = tf.train.AdamOptimizer(learning_rate=model_config["init_sup_sep_lr"]).minimize(separator_loss, var_list=separator_vars)
78 |
79 | # SUMMARIES
80 | tf.summary.scalar("sep_loss", separator_loss, collections=["sup"])
81 | sup_summaries = tf.summary.merge_all(key='sup')
82 |
83 | # Start session and queue input threads
84 | config = tf.ConfigProto()
85 | config.gpu_options.allow_growth=True
86 | sess = tf.Session(config=config)
87 | sess.run(tf.global_variables_initializer())
88 | writer = tf.summary.FileWriter(model_config["log_dir"] + os.path.sep + str(experiment_id),graph=sess.graph)
89 |
90 | # CHECKPOINTING
91 | # Load pretrained model to continue training, if we are supposed to
92 | if load_model != None:
93 | restorer = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
94 | print("Num of variables" + str(len(tf.global_variables())))
95 | restorer.restore(sess, load_model)
96 | print('Pre-trained model restored from file ' + load_model)
97 |
98 | saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
99 |
100 | # Start training loop
101 | _global_step = sess.run(global_step)
102 | _init_step = _global_step
103 | for _ in range(model_config["epoch_it"]):
104 | # TRAIN SEPARATOR
105 | _, _sup_summaries = sess.run([separator_solver, sup_summaries])
106 | writer.add_summary(_sup_summaries, global_step=_global_step)
107 |
108 | # Increment step counter, check if maximum iterations per epoch is achieved and stop in that case
109 | _global_step = sess.run(increment_global_step)
110 |
111 | # Epoch finished - Save model
112 | print("Finished epoch!")
113 | save_path = saver.save(sess, model_config["model_base_dir"] + os.path.sep + str(experiment_id) + os.path.sep + str(experiment_id), global_step=int(_global_step))
114 |
115 | # Close session, clear computational graph
116 | writer.flush()
117 | writer.close()
118 | sess.close()
119 | tf.reset_default_graph()
120 |
121 | return save_path
122 |
123 | @config_ingredient.capture
124 | def optimise(model_config, experiment_id):
125 | epoch = 0
126 | best_loss = 10000
127 | model_path = None
128 | best_model_path = None
129 | for i in range(2):
130 | worse_epochs = 0
131 | if i==1:
132 | print("Finished first round of training, now entering fine-tuning stage")
133 | model_config["batch_size"] *= 2
134 | model_config["init_sup_sep_lr"] = 1e-5
135 | while worse_epochs < model_config["worse_epochs"]: # Early stopping on validation set after a few epochs
136 | print("EPOCH: " + str(epoch))
137 | model_path = train(load_model=model_path)
138 | curr_loss = Test.test(model_config, model_folder=str(experiment_id), partition="valid", load_model=model_path)
139 | epoch += 1
140 | if curr_loss < best_loss:
141 | worse_epochs = 0
142 | print("Performance on validation set improved from " + str(best_loss) + " to " + str(curr_loss))
143 | best_model_path = model_path
144 | best_loss = curr_loss
145 | else:
146 | worse_epochs += 1
147 | print("Performance on validation set worsened to " + str(curr_loss))
148 | print("TRAINING FINISHED - TESTING WITH BEST MODEL " + best_model_path)
149 | test_loss = Test.test(model_config, model_folder=str(experiment_id), partition="test", load_model=best_model_path)
150 | return best_model_path, test_loss
151 |
152 | @ex.automain
153 | def run(cfg):
154 | model_config = cfg["model_config"]
155 | print("SCRIPT START")
156 | # Create subfolders if they do not exist to save results
157 | for dir in [model_config["model_base_dir"], model_config["log_dir"]]:
158 | if not os.path.exists(dir):
159 | os.makedirs(dir)
160 |
161 | # Optimize in a supervised fashion until validation loss worsens
162 | sup_model_path, sup_loss = optimise()
163 | print("Supervised training finished! Saved model at " + sup_model_path + ". Performance: " + str(sup_loss))
164 |
165 | # Evaluate trained model on MUSDB
166 | Evaluate.produce_musdb_source_estimates(model_config, sup_model_path, model_config["musdb_path"], model_config["estimates_path"])
--------------------------------------------------------------------------------
/Utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import librosa
4 |
5 | def getTrainableVariables(tag=""):
6 | return [v for v in tf.trainable_variables() if tag in v.name]
7 |
8 | def getNumParams(tensors):
9 | return np.sum([np.prod(t.get_shape().as_list()) for t in tensors])
10 |
11 | def crop_and_concat(x1,x2, match_feature_dim=True):
12 | '''
13 | Copy-and-crop operation for two feature maps of different size.
14 | Crops the first input x1 equally along its borders so that its shape is equal to
15 | the shape of the second input x2, then concatenates them along the feature channel axis.
16 | :param x1: First input that is cropped and combined with the second input
17 | :param x2: Second input
18 | :return: Combined feature map
19 | '''
20 | if x2 is None:
21 | return x1
22 |
23 | x1 = crop(x1,x2.get_shape().as_list(), match_feature_dim)
24 | return tf.concat([x1, x2], axis=2)
25 |
26 | def random_amplify(sample):
27 | '''
28 | Randomly amplifies or attenuates the input signal
29 | :return: Amplified signal
30 | '''
31 | for key, val in list(sample.items()):
32 | if key != "mix":
33 | sample[key] = tf.random_uniform([], 0.7, 1.0) * val
34 |
35 | sample["mix"] = tf.add_n([val for key, val in list(sample.items()) if key != "mix"])
36 | return sample
37 |
38 | def crop_sample(sample, crop_frames):
39 | for key, val in list(sample.items()):
40 | if key != "mix" and crop_frames > 0:
41 | sample[key] = val[crop_frames:-crop_frames,:]
42 | return sample
43 |
44 | def pad_freqs(tensor, target_shape):
45 | '''
46 | Pads the frequency axis of a 4D tensor of shape [batch_size, freqs, timeframes, channels] or 2D tensor [freqs, timeframes] with zeros
47 | so that it reaches the target shape. If the number of frequencies to pad is uneven, the rows are appended at the end.
48 | :param tensor: Input tensor to pad with zeros along the frequency axis
49 | :param target_shape: Shape of tensor after zero-padding
50 | :return: Padded tensor
51 | '''
52 | target_freqs = (target_shape[1] if len(target_shape) == 4 else target_shape[0]) #TODO
53 | if isinstance(tensor, tf.Tensor):
54 | input_shape = tensor.get_shape().as_list()
55 | else:
56 | input_shape = tensor.shape
57 |
58 | if len(input_shape) == 2:
59 | input_freqs = input_shape[0]
60 | else:
61 | input_freqs = input_shape[1]
62 |
63 | diff = target_freqs - input_freqs
64 | if diff % 2 == 0:
65 | pad = [(diff//2, diff//2)]
66 | else:
67 | pad = [(diff//2, diff//2 + 1)] # Add extra frequency bin at the end
68 |
69 | if len(target_shape) == 2:
70 | pad = pad + [(0,0)]
71 | else:
72 | pad = [(0,0)] + pad + [(0,0), (0,0)]
73 |
74 | if isinstance(tensor, tf.Tensor):
75 | return tf.pad(tensor, pad, mode='constant', constant_values=0.0)
76 | else:
77 | return np.pad(tensor, pad, mode='constant', constant_values=0.0)
78 |
79 | def LeakyReLU(x, alpha=0.2):
80 | return tf.maximum(alpha*x, x)
81 |
82 | def AudioClip(x, training):
83 | '''
84 | Simply returns the input if training is set to True, otherwise clips the input to [-1,1]
85 | :param x: Input tensor (coming from last layer of neural network)
86 | :param training: Whether model is in training (True) or testing mode (False)
87 | :return: Output tensor (potentially clipped)
88 | '''
89 | if training:
90 | return x
91 | else:
92 | return tf.maximum(tf.minimum(x, 1.0), -1.0)
93 |
94 | def resample(audio, orig_sr, new_sr):
95 | return librosa.resample(audio.T, orig_sr, new_sr).T
96 |
97 | def load(path, sr=22050, mono=True, offset=0.0, duration=None, dtype=np.float32):
98 | # ALWAYS output (n_frames, n_channels) audio
99 | y, orig_sr = librosa.load(path, sr, mono, offset, duration, dtype)
100 | if len(y.shape) == 1:
101 | y = np.expand_dims(y, axis=0)
102 | return y.T, orig_sr
103 |
104 | def crop(tensor, target_shape, match_feature_dim=True):
105 | '''
106 | Crops a 3D tensor [batch_size, width, channels] along the width axis to a target shape.
107 | Performs a centre crop. If the size difference is uneven, the extra entry is cropped off at the end.
108 | :param tensor: 3D tensor [batch_size, width, channels] that should be cropped.
109 | :param target_shape: Target shape (3D) that the tensor should be cropped to
110 | :return: Cropped tensor
111 | '''
112 | shape = np.array(tensor.get_shape().as_list())
113 | diff = shape - np.array(target_shape)
114 | assert(diff[0] == 0 and (diff[2] == 0 or not match_feature_dim))# Only width axis can differ
115 | if (diff[1] % 2 != 0):
116 | print("WARNING: Cropping with uneven number of extra entries on one side")
117 | assert diff[1] >= 0 # Only positive difference allowed
118 | if diff[1] == 0:
119 | return tensor
120 | crop_start = diff // 2
121 | crop_end = diff - crop_start
122 |
123 | return tensor[:,crop_start[1]:-crop_end[1],:]
124 |
125 | def spectrogramToAudioFile(magnitude, fftWindowSize, hopSize, phaseIterations=10, phase=None, length=None):
126 | '''
127 | Computes an audio signal from the given magnitude spectrogram, and optionally an initial phase.
128 | Griffin-Lim is executed to recover/refine the phase from the magnitude spectrogram.
129 | :param magnitude: Magnitudes to be converted to audio
130 | :param fftWindowSize: Size of FFT window used to create magnitudes
131 | :param hopSize: Hop size in frames used to create magnitudes
132 | :param phaseIterations: Number of Griffin-Lim iterations to recover phase
133 | :param phase: If given, starts ISTFT with this particular phase matrix
134 | :param length: If given, audio signal is clipped/padded to this number of frames
135 | :return:
136 | '''
137 | if phase is not None:
138 | if phaseIterations > 0:
139 | # Refine audio given initial phase with a number of iterations
140 | return reconPhase(magnitude, fftWindowSize, hopSize, phaseIterations, phase, length)
141 | # reconstructing the new complex matrix
142 | stftMatrix = magnitude * np.exp(phase * 1j) # magnitude * e^(j*phase)
143 | audio = librosa.istft(stftMatrix, hop_length=hopSize, length=length)
144 | else:
145 | audio = reconPhase(magnitude, fftWindowSize, hopSize, phaseIterations)
146 | return audio
147 |
148 | def reconPhase(magnitude, fftWindowSize, hopSize, phaseIterations=10, initPhase=None, length=None):
149 | '''
150 | Griffin-Lim algorithm for reconstructing the phase for a given magnitude spectrogram, optionally with a given
151 | initial phase.
152 | :param magnitude: Magnitudes to be converted to audio
153 | :param fftWindowSize: Size of FFT window used to create magnitudes
154 | :param hopSize: Hop size in frames used to create magnitudes
155 | :param phaseIterations: Number of Griffin-Lim iterations to recover phase
156 | :param initPhase: If given, starts reconstruction with this particular phase matrix
157 | :param length: If given, audio signal is clipped/padded to this number of frames
158 | :return:
159 | '''
160 | for i in range(phaseIterations):
161 | if i == 0:
162 | if initPhase is None:
163 | reconstruction = np.random.random_sample(magnitude.shape) + 1j * (2 * np.pi * np.random.random_sample(magnitude.shape) - np.pi)
164 | else:
165 | reconstruction = np.exp(initPhase * 1j) # e^(j*phase), so that angle => phase
166 | else:
167 | reconstruction = librosa.stft(audio, fftWindowSize, hopSize)
168 | spectrum = magnitude * np.exp(1j * np.angle(reconstruction))
169 | if i == phaseIterations - 1:
170 | audio = librosa.istft(spectrum, hopSize, length=length)
171 | else:
172 | audio = librosa.istft(spectrum, hopSize)
173 | return audio
--------------------------------------------------------------------------------
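
A hedged usage sketch of the Griffin-Lim helpers above (illustrative only; the random magnitudes are a stand-in for a real spectrogram produced with a 1024-sample FFT and hop size 768, the settings used by the spectrogram models in this repository):

```
import numpy as np
import Utils

mag = np.abs(np.random.randn(513, 200))   # [freq_bins, time_frames] for a 1024-point FFT
audio = Utils.spectrogramToAudioFile(mag, fftWindowSize=1024, hopSize=768,
                                     phaseIterations=10)
print(audio.shape)                        # 1D waveform reconstructed by Griffin-Lim
```
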
/Models/UnetAudioSeparator.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | import Models.InterpolationLayer
4 | import Utils
5 | from Utils import LeakyReLU
6 | import numpy as np
7 | import Models.OutputLayer
8 |
9 | class UnetAudioSeparator:
10 | '''
11 | U-Net separator network for singing voice separation.
12 | Uses valid convolutions, so it predicts for the centre part of the input - only certain input and output shapes are therefore possible (see get_padding function)
13 | '''
14 |
15 | def __init__(self, model_config):
16 | '''
17 | Initialize U-net
18 | :param num_layers: Number of down- and upscaling layers in the network
19 | '''
20 | self.num_layers = model_config["num_layers"]
21 | self.num_initial_filters = model_config["num_initial_filters"]
22 | self.filter_size = model_config["filter_size"]
23 | self.merge_filter_size = model_config["merge_filter_size"]
24 | self.input_filter_size = model_config["input_filter_size"]
25 | self.output_filter_size = model_config["output_filter_size"]
26 | self.upsampling = model_config["upsampling"]
27 | self.output_type = model_config["output_type"]
28 | self.context = model_config["context"]
29 | self.padding = "valid" if model_config["context"] else "same"
30 | self.source_names = model_config["source_names"]
31 | self.num_channels = 1 if model_config["mono_downmix"] else 2
32 | self.output_activation = model_config["output_activation"]
33 |
34 | def get_padding(self, shape):
35 | '''
36 | Calculates the required amounts of padding along each axis of the input and output, so that the Unet works and has the given shape as output shape
37 | :param shape: Desired output shape
38 | :return: Input_shape, output_shape, where each is a list [batch_size, time_steps, channels]
39 | '''
40 |
41 | if self.context:
42 | # Check if desired shape is possible as output shape - go from output shape towards lowest-res feature map
43 | rem = float(shape[1]) # Cut off batch size number and channel
44 |
45 | # Output filter size
46 | rem = rem - self.output_filter_size + 1
47 |
48 | # Upsampling blocks
49 | for i in range(self.num_layers):
50 | rem = rem + self.merge_filter_size - 1
51 | rem = (rem + 1.) / 2. # out = in + in - 1 <=> in = (out+1)/2
52 |
53 | # Round resulting feature map dimensions up to nearest integer
54 | x = np.asarray(np.ceil(rem),dtype=np.int64)
55 | assert(x >= 2)
56 |
57 | # Compute input and output shapes based on lowest-res feature map
58 | output_shape = x
59 | input_shape = x
60 |
61 | # Extra conv
62 | input_shape = input_shape + self.filter_size - 1
63 |
64 | # Go from centre feature map through up- and downsampling blocks
65 | for i in range(self.num_layers):
66 | output_shape = 2*output_shape - 1 #Upsampling
67 | output_shape = output_shape - self.merge_filter_size + 1 # Conv
68 |
69 | input_shape = 2*input_shape - 1 # Decimation
70 | if i < self.num_layers - 1:
71 | input_shape = input_shape + self.filter_size - 1 # Conv
72 | else:
73 | input_shape = input_shape + self.input_filter_size - 1
74 |
75 | # Output filters
76 | output_shape = output_shape - self.output_filter_size + 1
77 |
78 | input_shape = np.concatenate([[shape[0]], [input_shape], [self.num_channels]])
79 | output_shape = np.concatenate([[shape[0]], [output_shape], [self.num_channels]])
80 |
81 | return input_shape, output_shape
82 | else:
83 | return [shape[0], shape[1], self.num_channels], [shape[0], shape[1], self.num_channels]
84 |
85 | def get_output(self, input, training, return_spectrogram=False, reuse=True):
86 | '''
87 | Creates symbolic computation graph of the U-Net for a given input batch
88 | :param input: Input batch of mixtures, 3D tensor [batch_size, num_samples, num_channels]
89 | :param reuse: Whether to create new parameter variables or reuse existing ones
90 | :return: U-Net output: List of source estimates. Each item is a 3D tensor [batch_size, num_out_samples, num_channels]
91 | '''
92 | with tf.variable_scope("separator", reuse=reuse):
93 | enc_outputs = list()
94 | current_layer = input
95 |
96 | # Down-convolution: Repeat strided conv
97 | for i in range(self.num_layers):
98 | current_layer = tf.layers.conv1d(current_layer, self.num_initial_filters + (self.num_initial_filters * i), self.filter_size, strides=1, activation=LeakyReLU, padding=self.padding) # out = in - filter + 1
99 | enc_outputs.append(current_layer)
100 | current_layer = current_layer[:,::2,:] # Decimate by factor of 2 # out = (in-1)/2 + 1
101 |
102 | current_layer = tf.layers.conv1d(current_layer, self.num_initial_filters + (self.num_initial_filters * self.num_layers),self.filter_size,activation=LeakyReLU,padding=self.padding) # One more conv here since we need to compute features after last decimation
103 |
104 | # Feature map here shall be X along one dimension
105 |
106 | # Upconvolution
107 | for i in range(self.num_layers):
108 | #UPSAMPLING
109 | current_layer = tf.expand_dims(current_layer, axis=1)
110 | if self.upsampling == 'learned':
111 | # Learned interpolation between two neighbouring time positions by using a convolution filter of width 2, and inserting the responses in the middle of the two respective inputs
112 | current_layer = Models.InterpolationLayer.learned_interpolation_layer(current_layer, self.padding, i)
113 | else:
114 | if self.context:
115 | current_layer = tf.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2] * 2 - 1], align_corners=True)
116 | else:
117 | current_layer = tf.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2]*2]) # out = in + in - 1
118 | current_layer = tf.squeeze(current_layer, axis=1)
119 | # UPSAMPLING FINISHED
120 |
121 | assert(enc_outputs[-i-1].get_shape().as_list()[1] == current_layer.get_shape().as_list()[1] or self.context) #No cropping should be necessary unless we are using context
122 | current_layer = Utils.crop_and_concat(enc_outputs[-i-1], current_layer, match_feature_dim=False)
123 | current_layer = tf.layers.conv1d(current_layer, self.num_initial_filters + (self.num_initial_filters * (self.num_layers - i - 1)), self.merge_filter_size,
124 | activation=LeakyReLU,
125 | padding=self.padding) # out = in - filter + 1
126 |
127 | current_layer = Utils.crop_and_concat(input, current_layer, match_feature_dim=False)
128 |
129 | # Output layer
130 | # Determine output activation function
131 | if self.output_activation == "tanh":
132 | out_activation = tf.tanh
133 | elif self.output_activation == "linear":
134 | out_activation = lambda x: Utils.AudioClip(x, training)
135 | else:
136 | raise NotImplementedError
137 |
138 | if self.output_type == "direct":
139 | return Models.OutputLayer.independent_outputs(current_layer, self.source_names, self.num_channels, self.output_filter_size, self.padding, out_activation)
140 | elif self.output_type == "difference":
141 | cropped_input = Utils.crop(input,current_layer.get_shape().as_list(), match_feature_dim=False)
142 | return Models.OutputLayer.difference_output(cropped_input, current_layer, self.source_names, self.num_channels, self.output_filter_size, self.padding, out_activation, training)
143 | else:
144 | raise NotImplementedError
--------------------------------------------------------------------------------
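
To see which sizes the "context" (valid convolution) variant of get_padding() above actually produces, here is a standalone re-derivation of the same arithmetic in plain Python (no TensorFlow required), using the default hyperparameters from Config.py; with 12 layers and 16384 desired output frames it should yield an input of 147443 samples and an output of 16389 samples per channel:

```
import math

def valid_conv_shapes(num_frames=16384, num_layers=12, filter_size=15,
                      merge_filter_size=5, input_filter_size=15, output_filter_size=1):
    # Walk from the desired output size down to the bottleneck feature map...
    rem = float(num_frames) - output_filter_size + 1
    for _ in range(num_layers):
        rem = (rem + merge_filter_size) / 2.0            # undo merge conv and upsampling
    bottleneck = int(math.ceil(rem))
    # ...then walk back out to get feasible input/output lengths.
    out_len, in_len = bottleneck, bottleneck + filter_size - 1
    for i in range(num_layers):
        out_len = 2 * out_len - 1 - (merge_filter_size - 1)     # upsample, then conv
        in_len = 2 * in_len - 1 + (filter_size - 1 if i < num_layers - 1
                                   else input_filter_size - 1)  # decimation and conv, reversed
    out_len -= output_filter_size - 1
    return in_len, out_len

print(valid_conv_shapes())   # expected: (147443, 16389)
```
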
/Config.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sacred import Ingredient
3 |
4 | config_ingredient = Ingredient("cfg")
5 |
6 | @config_ingredient.config
7 | def cfg():
8 | # Base configuration
9 | model_config = {"musdb_path" : "/mnt/windaten/Datasets/MUSDB18/", # SET MUSDB PATH HERE, AND SET CCMIXTER PATH IN CCMixter.xml
10 | "estimates_path" : "/mnt/windaten/Source_Estimates", # SET THIS PATH TO WHERE YOU WANT SOURCE ESTIMATES PRODUCED BY THE TRAINED MODEL TO BE SAVED. Folder itself must exist!
11 | "data_path" : "data", # Set this to where the preprocessed dataset should be saved
12 |
13 | "model_base_dir" : "checkpoints", # Base folder for model checkpoints
14 | "log_dir" : "logs", # Base folder for logs files
15 | "batch_size" : 16, # Batch size
16 | "init_sup_sep_lr" : 1e-4, # Supervised separator learning rate
17 | "epoch_it" : 2000, # Number of supervised separator steps per epoch
18 | 'cache_size': 4000, # Number of audio snippets buffered in the random shuffle queue. Larger is better, since workers put multiple examples of one song into this queue. The number of different songs that is sampled from with each batch equals cache_size / num_snippets_per_track. Set as high as your RAM allows.
19 | 'num_workers' : 4, # Number of processes used for each TF map operation used when loading the dataset
20 | "num_snippets_per_track" : 100, # Number of snippets that should be extracted from each song at a time after loading it. Higher values make data loading faster, but can reduce the batches song diversity
21 | 'num_layers' : 12, # How many U-Net layers
22 | 'filter_size' : 15, # For Wave-U-Net: Filter size of conv in downsampling block
23 | 'merge_filter_size' : 5, # For Wave-U-Net: Filter size of conv in upsampling block
24 | 'input_filter_size' : 15, # For Wave-U-Net: Filter size of first convolution in first downsampling block
25 | 'output_filter_size': 1, # For Wave-U-Net: Filter size of convolution in the output layer
26 | 'num_initial_filters' : 24, # Number of filters for convolution in first layer of network
27 | "num_frames": 16384, # DESIRED number of time frames in the output waveform per samples (could be changed when using valid padding)
28 | 'expected_sr': 22050, # Downsample all audio input to this sampling rate
29 | 'mono_downmix': True, # Whether to downmix the audio input to mono
30 | 'output_type' : 'direct', # Type of output layer, either "direct" or "difference". Direct output: Each source is result of tanh activation and independent. Difference: Last source output is equal to mixture input - sum(all other sources)
31 | 'output_activation' : 'tanh', # Activation function for output layer. "tanh" or "linear". Linear output involves clipping to [-1,1] at test time, and might be more stable than tanh
32 | 'context' : False, # Type of padding for convolutions in separator. If False, convolutions are padded with zeros ("same" padding) and feature maps keep their size, halving/doubling only at the resampling steps. If True, convolution is only performed on the available mixture input, thus the output is smaller than the input
33 | 'network' : 'unet', # Type of network architecture, either unet (our model) or unet_spectrogram (Jansson et al 2017 model)
34 | 'upsampling' : 'linear', # Type of technique used for upsampling the feature maps in a unet architecture, either 'linear' interpolation or 'learned' filling in of extra samples
35 | 'task' : 'voice', # Type of separation task. 'voice' : Separate music into voice and accompaniment. 'multi_instrument': Separate music into bass, drums, vocals and other (SiSec)
36 | 'augmentation' : True, # Random attenuation of source signals to improve generalisation performance (data augmentation)
37 | 'raw_audio_loss' : True, # Only active for unet_spectrogram network. True: L2 loss on raw audio. False: L1 loss on spectrogram magnitudes (used for the training, validation and test losses)
38 | 'worse_epochs' : 20, # Patience for early stopping on validation set
39 | }
40 | experiment_id = np.random.randint(0,1000000)
41 |
42 | # Set output sources
43 | if model_config["task"] == "multi_instrument":
44 | model_config["source_names"] = ["bass", "drums", "other", "vocals"]
45 | elif model_config["task"] == "voice":
46 | model_config["source_names"] = ["accompaniment", "vocals"]
47 | else:
48 | raise NotImplementedError
49 | model_config["num_sources"] = len(model_config["source_names"])
50 | model_config["num_channels"] = 1 if model_config["mono_downmix"] else 2
51 |
52 | @config_ingredient.named_config
53 | def baseline():
54 | print("Training baseline model")
55 |
56 | @config_ingredient.named_config
57 | def baseline_diff():
58 | print("Training baseline model with difference output")
59 | model_config = {
60 | "output_type" : "difference"
61 | }
62 |
63 | @config_ingredient.named_config
64 | def baseline_context():
65 | print("Training baseline model with difference output and input context (valid convolutions)")
66 | model_config = {
67 | "output_type" : "difference",
68 | "context" : True
69 | }
70 |
71 | @config_ingredient.named_config
72 | def baseline_stereo():
73 | print("Training baseline model with difference output and input context (valid convolutions) and stereo input/output")
74 | model_config = {
75 | "output_type" : "difference",
76 | "context" : True,
77 | "mono_downmix" : False
78 | }
79 |
80 | @config_ingredient.named_config
81 | def full():
82 | print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer")
83 | model_config = {
84 | "output_type" : "difference",
85 | "context" : True,
86 | "upsampling": "learned",
87 | "mono_downmix" : False
88 | }
89 |
90 | @config_ingredient.named_config
91 | def full_44KHz():
92 | print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer, and 44.1 KHz sampling rate")
93 | model_config = {
94 | "output_type" : "difference",
95 | "context" : True,
96 | "upsampling": "learned",
97 | "mono_downmix" : False,
98 | "expected_sr" : 44100
99 | }
100 |
101 | @config_ingredient.named_config
102 | def baseline_context_smallfilter_deep():
103 | model_config = {
104 | "output_type": "difference",
105 | "context": True,
106 | "num_layers" : 14,
107 | "duration" : 7,
108 | "filter_size" : 5,
109 | "merge_filter_size" : 1
110 | }
111 |
112 | @config_ingredient.named_config
113 | def full_multi_instrument():
114 | print("Training multi-instrument separation with best model")
115 | model_config = {
116 | "output_type": "difference",
117 | "context": True,
118 | "upsampling": "linear",
119 | "mono_downmix": False,
120 | "task" : "multi_instrument"
121 | }
122 |
123 | @config_ingredient.named_config
124 | def baseline_comparison():
125 | model_config = {
126 | "batch_size": 4, # Less output since model is so big. Doesn't matter since the model's output is not dependent on its output or input size (only convolutions)
127 |
128 | "output_type": "difference",
129 | "context": True,
130 | "num_frames" : 768*127 + 1024,
131 | "duration" : 13,
132 | "expected_sr" : 8192,
133 | "num_initial_filters" : 34
134 | }
135 |
136 | @config_ingredient.named_config
137 | def unet_spectrogram():
138 | model_config = {
139 | "batch_size": 4, # Less output since model is so big.
140 |
141 | "network" : "unet_spectrogram",
142 | "num_layers" : 6,
143 | "expected_sr" : 8192,
144 | "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
145 | "duration" : 13,
146 | "num_initial_filters" : 16
147 | }
148 |
149 | @config_ingredient.named_config
150 | def unet_spectrogram_l1():
151 | model_config = {
152 |         "batch_size": 4, # Smaller batch size since the model is so big.
153 |
154 | "network" : "unet_spectrogram",
155 | "num_layers" : 6,
156 | "expected_sr" : 8192,
157 | "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
158 | "duration" : 13,
159 | "num_initial_filters" : 16,
160 | "raw_audio_loss" : False
161 | }
162 |
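# Illustrative only (not part of the original configuration set): a new variant can be added as
# another named config that overrides just the model_config keys it changes. A hypothetical
# mono 22.05 kHz variant could look like this:
#
# @config_ingredient.named_config
# def full_22KHz_mono():
#     print("Training full model with 22.05 KHz sampling rate and mono input/output")
#     model_config = {
#         "output_type" : "difference",
#         "context" : True,
#         "upsampling": "learned",
#         "expected_sr" : 22050
#     }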
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Wave-U-Net
2 | Implementation of the [Wave-U-Net](https://arxiv.org/abs/1806.03185) for audio source separation.
3 |
4 | For the (improved) Pytorch version, click [here](https://github.com/f90/Wave-U-Net-Pytorch).
5 |
6 | For a third-party implementation in Tensorflow 2/Keras (not by me), click [here](https://github.com/satvik-venkatesh/Wave-U-net-TF2).
7 |
8 | ## Listening examples
9 |
10 | Listen to vocal separation results [here](https://sisec18.unmix.app/#/unmix/Side%20Effects%20Project%20-%20Sing%20With%20Me/STL1) and to multi-instrument separation results [here](https://sisec18.unmix.app/#/unmix/Side%20Effects%20Project%20-%20Sing%20With%20Me/STL2).
11 |
12 | ## What is the Wave-U-Net?
13 | The Wave-U-Net is a convolutional neural network for audio source separation that works directly on the raw audio waveform. It was presented in [this paper](https://arxiv.org/abs/1806.03185).
14 |
15 | The Wave-U-Net is an adaptation of the U-Net architecture to the one-dimensional time domain to perform end-to-end audio source separation. Through a series of downsampling and upsampling blocks, which involve 1D convolutions combined with a down-/upsampling process, features are computed on multiple scales/levels of abstraction and time resolution, and combined to make a prediction.
16 |
17 | See the diagram below for a summary of the network architecture.
18 |
19 | ![Wave-U-Net architecture diagram](waveunet.png)
20 |
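For intuition, here is a minimal, self-contained sketch of this multi-scale scheme in Tensorflow 1.x. It is illustrative only and not the code used in this repository (see ``Models/UnetAudioSeparator.py`` for the actual model); it omits input context, the difference output layer, learned upsampling and the other refinements discussed below, and assumes the input length is divisible by 2**num_layers.

```
import tensorflow as tf

def tiny_wave_u_net(mix, num_layers=3, num_filters=24, filter_size=15, merge_filter_size=5):
    # mix: [batch, num_samples, num_channels] waveform tensor.
    def upsample_nn(x):
        # Nearest-neighbour upsampling along time: duplicate every frame.
        channels = x.get_shape().as_list()[-1]
        x = tf.stack([x, x], axis=2)                          # [batch, time, 2, channels]
        return tf.reshape(x, [tf.shape(x)[0], -1, channels])  # [batch, 2*time, channels]

    current, skips = mix, []
    for i in range(num_layers):                               # downsampling blocks
        current = tf.layers.conv1d(current, num_filters * (i + 1), filter_size,
                                   activation=tf.nn.leaky_relu, padding="same")
        skips.append(current)                                 # keep features for the skip connection
        current = current[:, ::2, :]                          # decimate: keep every second frame
    for i in reversed(range(num_layers)):                     # upsampling blocks
        current = upsample_nn(current)
        current = tf.concat([current, skips[i]], axis=2)      # concatenate same-scale features
        current = tf.layers.conv1d(current, num_filters * (i + 1), merge_filter_size,
                                   activation=tf.nn.leaky_relu, padding="same")
    # One source estimate with the same number of channels as the input mixture.
    return tf.layers.conv1d(current, mix.get_shape().as_list()[-1], 1, activation=tf.nn.tanh)
```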
21 | ### Participation in the SiSec separation competition
22 |
23 | The Wave-U-Net also participated in the [SiSec separation campaign](https://sisec.inria.fr/) as submissions [STL1](https://github.com/sigsep/sigsep-mus-2018/blob/master/submissions/STL1/description.md) and [STL2](https://github.com/sigsep/sigsep-mus-2018/blob/master/submissions/STL2/description.md) and achieved good performance, especially considering the limited amount of training data we used compared to many other submissions, despite our end-to-end approach being more data-hungry (the frequency decomposition front-end has to be learned from data as well).
24 |
25 | # Installation
26 |
27 | ## Requirements
28 |
29 | A GPU is strongly recommended to avoid very long training times.
30 |
31 | The project is based on Python 3.6.8 and requires [libsndfile](http://mega-nerd.com/libsndfile/) and CUDA 9 to be installed.
32 |
33 | Then, the following Python packages need to be installed:
34 |
35 | ```
36 | numpy==1.15.4
37 | sacred==0.7.3
38 | tensorflow-gpu==1.8.0
39 | librosa==0.6.2
40 | soundfile==0.10.2
41 | lxml==4.2.1
42 | musdb==0.2.3
43 | museval==0.2.0
44 | google==2.0.1
45 | protobuf==3.4.0
46 | ```
47 |
48 | Instead of ``tensorflow-gpu``, the CPU version of Tensorflow, ``tensorflow``, can be used if no GPU is available.
49 | All the above packages are also listed in the file ``requirements.txt`` in this repository, so you can clone the repository and then run the following from the repository's root folder to install all required packages at once:
50 |
51 | ``pip install -r requirements.txt``
52 |
53 | To recreate the figures from the paper, use functions in ``Plot.py``. The ``matplotlib<3.0`` package needs to be installed as well in that case.
54 |
55 | ### Download datasets
56 |
57 | If you only want to use the pre-trained models we provide to separate your own songs, skip directly to the [last section](#test), since the datasets are not needed in that case.
58 |
59 | To reproduce the experiments in the paper (train all the models), you need to download the datasets below. You can of course use your own datasets for training, but for this you would need to modify the code manually, which will not be discussed here.
60 |
61 | #### MUSDB18
62 |
63 | Download the [full MUSDB18 dataset](https://sigsep.github.io/datasets/musdb.html) and extract it into a folder of your choice. It should have two subfolders: "test" and "train" as well as a README.md file.
64 |
65 | #### CCMixter (only required for vocal separation experiments)
66 |
67 | If you want to replicate the vocal separation experiments and not only the multi-instrument experiments, you also need to download the CCMixter vocal separation database from https://members.loria.fr/ALiutkus/kam/. Extract this dataset into a folder of your choice. Its main folder should contain one subfolder for each song.
68 |
69 | ### Set-up filepaths
70 |
71 | Now you need to set up the correct file paths for the datasets and the location where source estimates should be saved.
72 |
73 | Open the ``Config.py`` file, and set the ``musdb_path`` entry of the ``model_config`` dictionary to the location of the main folder of the MUSDB18 dataset.
74 | Also set the ``estimates_path`` entry of the same ``model_config`` dictionary to the path pointing to an empty folder where you want the final source estimates of the model to be saved into.
75 |
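The two entries are plain string paths inside the ``model_config`` dictionary in ``Config.py``. As a minimal sketch (the paths shown here are placeholders, and all other entries of the dictionary are omitted):

```
model_config = {
    "musdb_path": "/path/to/MUSDB18",        # main MUSDB18 folder containing the "train" and "test" subfolders
    "estimates_path": "/path/to/estimates",  # empty folder where the final source estimates will be written
}
```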
76 | If you use CCMixter, open the ``CCMixter.xml`` in the main repository folder, and replace the given file path tagged as ``databaseFolderPath`` with your path to the main folder of CCMixter.
77 |
78 | ## Training the models / model overview
79 |
80 | Since the paper investigates many variants of the Wave-U-Net and also trains, for comparison, the [U-Net proposed for vocal separation](https://ismir2017.smcnus.org/wp-content/uploads/2017/10/171_Paper.pdf), which achieved state-of-the-art performance, the table below lists the model variants along with the command needed to start training each of them:
81 |
82 | | Model name (from paper) | Description | Separate vocals or multi-instrument? | Command for training |
83 | |-------------------------|---------------------------------------------------------|--------------------------------------|-----------------------------------------------|
84 | | M1 | Baseline Wave-U-Net model | Vocals | ``python Training.py`` |
85 | | M2 | M1 + difference output layer | Vocals | ``python Training.py with cfg.baseline_diff`` |
86 | | M3 | M2 + proper input context | Vocals | ``python Training.py with cfg.baseline_context`` |
87 | | M4 | BEST-PERFORMING: M3 + Stereo I/O | Vocals | ``python Training.py with cfg.baseline_stereo`` |
88 | | M5 | M4 + Learned upsampling layer | Vocals | ``python Training.py with cfg.full`` |
89 | | M6 | M4 applied to multi-instrument sep. | Multi-instrument | ``python Training.py with cfg.full_multi_instrument`` |
90 | | M7 | Wave-U-Net model to compare with SotA models U7,U7a | Vocals | ``python Training.py with cfg.baseline_comparison`` |
91 | | U7 | U-Net replication from prior work, audio-based MSE loss | Vocals | ``python Training.py with cfg.unet_spectrogram`` |
92 | | U7a | Like U7, but with L1 magnitude loss | Vocals | ``python Training.py with cfg.unet_spectrogram_l1`` |
93 |
94 | **NEW:**
95 |
96 | We also include the following models not part of the paper (also with pre-trained weights for download!):
97 |
98 | | Model name (not in paper)| Description | Separate vocals or multi-instrument? | Command for training |
99 | |-------------------------|---------------------------------------------------------|--------------------------------------|-----------------------------------------------|
100 | | M5-HighSR | M5 with 44.1 KHz sampling rate | Vocals | ``python Training.py with cfg.full_44KHz`` |
101 |
102 | M5-HighSR is our best vocal separator, reaching a median (mean) vocal/acc SDR of 4.95 (1.01) and 11.16 (12.87), respectively.
103 |
104 | # Test trained models on songs!
105 |
106 | We provide pretrained versions of models M4, M6 and M5-HighSR so you can separate any of your songs right away.
107 |
108 | ## Downloading our pretrained models
109 |
110 | Download our pretrained models [here](https://www.dropbox.com/s/oq0woy3cmf5s8y7/models.zip?dl=1).
111 | Unzip the archive into the ``checkpoints`` subfolder in this repository, so that you have one subfolder for each model (e.g. ``REPO/checkpoints/baseline_stereo``).
112 |
113 | ## Run pretrained models
114 |
115 | For a quick demo on an example song with our pre-trained best vocal separation model (M5-HighSR), one can simply execute
116 |
117 | `` python Predict.py with cfg.full_44KHz ``
118 |
119 | to separate the song "Mallory" included in this repository's ``audio_examples`` subfolder into vocals and accompaniment. The output will be saved next to the input file.
120 |
121 | To apply our pretrained model to any of your own songs, simply point to its audio file path using the ``input_path`` parameter:
122 |
123 | `` python Predict.py with cfg.full_44KHz input_path="/mnt/medien/Daniel/Music/Dark Passion Play/Nightwish - Bye Bye Beautiful.mp3"``
124 |
125 | If you want to save the predictions to a custom folder instead of where the input song is, just add the ``output_path`` parameter:
126 |
127 | `` python Predict.py with cfg.full_44KHz input_path="/mnt/medien/Daniel/Music/Dark Passion Play/Nightwish - Bye Bye Beautiful.mp3" output_path="/home/daniel" ``
128 |
129 | If you want to use other pre-trained models we provide (such as our multi-instrument separator) or your own, point to the location of the Tensorflow checkpoint file using the ``model_path`` parameter, making sure that the model configuration (here: ``full_multi_instrument``) matches the model saved in the checkpoint. As an example for our pre-packaged multi-instrument model:
130 |
131 | `` python Predict.py with cfg.full_multi_instrument model_path="checkpoints/full_multi_instrument/full_multi_instrument-134067" input_path="/mnt/medien/Daniel/Music/Dark Passion Play/Nightwish - Bye Bye Beautiful.mp3" output_path="/home/daniel" ``
132 |
133 | # Known issues / Troubleshooting
134 |
135 | MacOS: If matplotlib gives errors upon being imported, see [this issue](https://github.com/f90/Wave-U-Net/issues/15) and [that issue](https://github.com/f90/Wave-U-Net/issues/8) for solutions.
136 |
137 | During the preparation of the MUSDB dataset, the conversion to WAV can sometimes halt because an ffmpeg process freezes. This process is used within the musdb python package to identify the dataset's mp4 audio streams; the error seems to occur in the subprocess.Popen() call used deep within the stempeg library. Due to its random nature, it is not currently known how to fix this. I suggest regenerating the dataset if this error occurs.
138 |
--------------------------------------------------------------------------------
/Evaluate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import librosa
4 |
5 | import os
6 | import json
7 | import glob
8 |
9 | import Models.UnetAudioSeparator
10 | import Models.UnetSpectrogramSeparator
11 |
12 | import musdb
13 | import museval
14 | import Utils
15 |
16 | def predict(track, model_config, load_model, results_dir=None):
17 | '''
18 |     Function in accordance with the MUSDB evaluation API. Takes a MUSDB track object, computes the corresponding source estimates and calls the evaluation script.
19 | Model has to be saved beforehand into a pickle file containing model configuration dictionary and checkpoint path!
20 | :param track: Track object
21 | :param results_dir: Directory where SDR etc. values should be saved
22 | :return: Source estimates dictionary
23 | '''
24 |
25 | # Determine input and output shapes, if we use U-net as separator
26 | disc_input_shape = [model_config["batch_size"], model_config["num_frames"], 0] # Shape of discriminator input
27 | if model_config["network"] == "unet":
28 | separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(model_config)
29 | elif model_config["network"] == "unet_spectrogram":
30 | separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(model_config)
31 | else:
32 | raise NotImplementedError
33 |
34 | sep_input_shape, sep_output_shape = separator_class.get_padding(np.array(disc_input_shape))
35 | separator_func = separator_class.get_output
36 |
37 | # Batch size of 1
38 | sep_input_shape[0] = 1
39 | sep_output_shape[0] = 1
40 |
41 | mix_ph = tf.placeholder(tf.float32, sep_input_shape)
42 |
43 | print("Testing...")
44 |
45 | # BUILD MODELS
46 | # Separator
47 | separator_sources = separator_func(mix_ph, training=False, return_spectrogram=False, reuse=False)
48 |
49 | # Start session and queue input threads
50 | sess = tf.Session()
51 | sess.run(tf.global_variables_initializer())
52 |
53 | # Load model
54 |     # Restore the pretrained model weights from the given checkpoint for prediction
55 | restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
56 |     print("Num of variables: " + str(len(tf.global_variables())))
57 | restorer.restore(sess, load_model)
58 | print('Pre-trained model restored for song prediction')
59 |
60 | mix_audio, orig_sr, mix_channels = track.audio, track.rate, track.audio.shape[1] # Audio has (n_samples, n_channels) shape
61 | separator_preds = predict_track(model_config, sess, mix_audio, orig_sr, sep_input_shape, sep_output_shape, separator_sources, mix_ph)
62 |
63 | # Upsample predicted source audio and convert to stereo. Make sure to resample back to the exact number of samples in the original input (with fractional orig_sr/new_sr this causes issues otherwise)
64 | pred_audio = {name : Utils.resample(separator_preds[name], model_config["expected_sr"], orig_sr)[:mix_audio.shape[0],:] for name in model_config["source_names"]}
65 |
66 | if model_config["mono_downmix"] and mix_channels > 1: # Convert to multichannel if mixture input was multichannel by duplicating mono estimate
67 | pred_audio = {name : np.tile(pred_audio[name], [1, mix_channels]) for name in list(pred_audio.keys())}
68 |
69 | # Evaluate using museval, if we are currently evaluating MUSDB
70 | if results_dir is not None:
71 | scores = museval.eval_mus_track(track, pred_audio, output_dir=results_dir)
72 |
73 | # print nicely formatted mean scores
74 | print(scores)
75 |
76 | # Close session, clear computational graph
77 | sess.close()
78 | tf.reset_default_graph()
79 |
80 | return pred_audio
81 |
82 | def predict_track(model_config, sess, mix_audio, mix_sr, sep_input_shape, sep_output_shape, separator_sources, mix_context):
83 | '''
84 | Outputs source estimates for a given input mixture signal mix_audio [n_frames, n_channels] and a given Tensorflow session and placeholders belonging to the prediction network.
85 | It iterates through the track, collecting segment-wise predictions to form the output.
86 | :param model_config: Model configuration dictionary
87 | :param sess: Tensorflow session used to run the network inference
88 | :param mix_audio: [n_frames, n_channels] audio signal (numpy array). Can have higher sampling rate or channels than the model supports, will be downsampled correspondingly.
89 | :param mix_sr: Sampling rate of mix_audio
90 | :param sep_input_shape: Input shape of separator ([batch_size, num_samples, num_channels])
91 |     :param sep_output_shape: Output shape of separator ([batch_size, num_samples, num_channels])
92 | :param separator_sources: List of Tensorflow tensors that represent the output of the separator network
93 | :param mix_context: Input tensor of the network
94 | :return:
95 | '''
96 |     # Convert the mixture to mono (or duplicate channels) as required by the model, then resample it
97 | assert(len(mix_audio.shape) == 2)
98 | if model_config["mono_downmix"]:
99 | mix_audio = np.mean(mix_audio, axis=1, keepdims=True)
100 | else:
101 | if mix_audio.shape[1] == 1:# Duplicate channels if input is mono but model is stereo
102 | mix_audio = np.tile(mix_audio, [1, 2])
103 |
104 | mix_audio = Utils.resample(mix_audio, mix_sr, model_config["expected_sr"])
105 |
106 |     # Append zeros to the mixture if it is shorter than the input size of the network - this will be cut off at the end again
107 | if mix_audio.shape[0] < sep_input_shape[1]:
108 | extra_pad = sep_input_shape[1] - mix_audio.shape[0]
109 | mix_audio = np.pad(mix_audio, [(0, extra_pad), (0,0)], mode="constant", constant_values=0.0)
110 | else:
111 | extra_pad = 0
112 |
113 | # Preallocate source predictions (same shape as input mixture)
114 | source_time_frames = mix_audio.shape[0]
115 | source_preds = {name : np.zeros(mix_audio.shape, np.float32) for name in model_config["source_names"]}
116 |
117 | input_time_frames = sep_input_shape[1]
118 | output_time_frames = sep_output_shape[1]
119 |
120 | # Pad mixture across time at beginning and end so that neural network can make prediction at the beginning and end of signal
121 | pad_time_frames = (input_time_frames - output_time_frames) // 2
122 | mix_audio_padded = np.pad(mix_audio, [(pad_time_frames, pad_time_frames), (0,0)], mode="constant", constant_values=0.0)
123 |
124 |     # Iterate over the mixture in segments, fetching the network prediction for each segment
125 | for source_pos in range(0, source_time_frames, output_time_frames):
126 |         # If this output patch would reach over the end of the signal, shift it back so we predict the very end of the output, then stop
127 | if source_pos + output_time_frames > source_time_frames:
128 | source_pos = source_time_frames - output_time_frames
129 |
130 | # Prepare mixture excerpt by selecting time interval
131 | mix_part = mix_audio_padded[source_pos:source_pos + input_time_frames,:]
132 | mix_part = np.expand_dims(mix_part, axis=0)
133 |
134 | source_parts = sess.run(separator_sources, feed_dict={mix_context: mix_part})
135 |
136 | # Save predictions
137 |         # source_parts[name] has shape [1, output_time_frames, num_channels]
138 | for name in model_config["source_names"]:
139 | source_preds[name][source_pos:source_pos + output_time_frames] = source_parts[name][0, :, :]
140 |
141 | # In case we had to pad the mixture at the end, remove those samples from source prediction now
142 | if extra_pad > 0:
143 | source_preds = {name : source_preds[name][:-extra_pad,:] for name in list(source_preds.keys())}
144 |
145 | return source_preds
146 |
147 | def produce_musdb_source_estimates(model_config, load_model, musdb_path, output_path, subsets=None):
148 | '''
149 |     Predicts source estimates for MUSDB for a given model checkpoint and configuration, and evaluates them.
150 | :param model_config: Model configuration of the model to be evaluated
151 | :param load_model: Model checkpoint path
152 | :return:
153 | '''
154 | print("Evaluating trained model saved at " + str(load_model)+ " on MUSDB and saving source estimate audio to " + str(output_path))
155 |
156 | mus = musdb.DB(root_dir=musdb_path)
157 | predict_fun = lambda track : predict(track, model_config, load_model, output_path)
158 | assert(mus.test(predict_fun))
159 | mus.run(predict_fun, estimates_dir=output_path, subsets=subsets)
160 |
161 | def produce_source_estimates(model_config, load_model, input_path, output_path=None):
162 | '''
163 | For a given input mixture file, saves source predictions made by a given model.
164 | :param model_config: Model configuration
165 | :param load_model: Model checkpoint path
166 | :param input_path: Path to input mixture audio file
167 | :param output_path: Output directory where estimated sources should be saved. Defaults to the same folder as the input file, if not given
168 | :return: Dictionary of source estimates containing the source signals as numpy arrays
169 | '''
170 | print("Producing source estimates for input mixture file " + input_path)
171 | # Prepare input audio as track object (in the MUSDB sense), so we can use the MUSDB-compatible prediction function
172 | audio, sr = Utils.load(input_path, sr=None, mono=False)
173 | # Create something that looks sufficiently like a track object to our MUSDB function
174 | class TrackLike(object):
175 | def __init__(self, audio, rate, shape):
176 | self.audio = audio
177 | self.rate = rate
178 | self.shape = shape
179 | track = TrackLike(audio, sr, audio.shape)
180 |
181 | sources_pred = predict(track, model_config, load_model) # Input track to prediction function, get source estimates
182 |
183 |     # Save source estimates as audio files into the output directory
184 | input_folder, input_filename = os.path.split(input_path)
185 | if output_path is None:
186 | # By default, set it to the input_path folder
187 | output_path = input_folder
188 | if not os.path.exists(output_path):
189 | print("WARNING: Given output path " + output_path + " does not exist. Trying to create it...")
190 | os.makedirs(output_path)
191 | assert(os.path.exists(output_path))
192 | for source_name, source_audio in list(sources_pred.items()):
193 | librosa.output.write_wav(os.path.join(output_path, input_filename) + "_" + source_name + ".wav", source_audio, sr)
194 |
195 | def compute_mean_metrics(json_folder, compute_averages=True, metric="SDR"):
196 | '''
197 | Computes averages or collects evaluation metrics produced from MUSDB evaluation of a separator
198 | (see "produce_musdb_source_estimates" function), namely the mean, standard deviation, median, and median absolute
199 | deviation (MAD). Function is used to produce the results in the paper.
200 | Averaging ignores NaN values arising from parts where a source is silent
201 | :param json_folder: Path to the folder in which a collection of json files was written by the MUSDB evaluation library, one for each song.
202 |     This is the output of the "produce_musdb_source_estimates" function. (By default, this is model_config["estimates_path"] + "test" or "train")
203 | :param compute_averages: Whether to compute the average over all song segments (to get final evaluation measures) or to return the full list of segments
204 | :param metric: Which metric to evaluate (either "SDR", "SIR", "SAR" or "ISR")
205 |     :return: If compute_averages is True, returns a list with length equal to the number of separated sources, with each list element a tuple of (median, MAD, mean, SD).
206 |     If it is False, returns the same list, but each element is a numpy vector containing all segment-wise performance values
207 | '''
208 | files = glob.glob(os.path.join(json_folder, "*.json"))
209 | inst_list = None
210 | print("Found " + str(len(files)) + " JSON files to evaluate...")
211 | for path in files:
212 | #print(path)
213 | if path.__contains__("test.json"):
214 | print("Found test JSON, skipping...")
215 | continue
216 |
217 | with open(path, "r") as f:
218 | js = json.load(f)
219 |
220 | if inst_list is None:
221 | inst_list = [list() for _ in range(len(js["targets"]))]
222 |
223 | for i in range(len(js["targets"])):
224 | inst_list[i].extend([np.float(f['metrics'][metric]) for f in js["targets"][i]["frames"]])
225 |
226 | #return np.array(sdr_acc), np.array(sdr_voc)
227 | inst_list = [np.array(perf) for perf in inst_list]
228 |
229 | if compute_averages:
230 | return [(np.nanmedian(perf), np.nanmedian(np.abs(perf - np.nanmedian(perf))), np.nanmean(perf), np.nanstd(perf)) for perf in inst_list]
231 | else:
232 | return inst_list
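# Example usage (hypothetical output folder; assumes produce_musdb_source_estimates has already
# written the museval JSON files there):
#   stats = compute_mean_metrics("/path/to/estimates_path/test", compute_averages=True, metric="SDR")
#   for median, mad, mean, sd in stats:
#       print(median, mad, mean, sd)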
--------------------------------------------------------------------------------
/Datasets.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os.path
3 | import random
4 | from multiprocessing import Process
5 |
6 | import Utils
7 |
8 | import numpy as np
9 | from lxml import etree
10 | import librosa
11 | import soundfile
12 | import os
13 | import tensorflow as tf
14 | import musdb
15 |
16 | def take_random_snippets(sample, keys, input_shape, num_samples):
17 | # Take a sample (collection of audio files) and extract snippets from it at a number of random positions
18 | start_pos = tf.random_uniform([num_samples], 0, maxval=sample["length"] - input_shape[0], dtype=tf.int64)
19 | return take_snippets_at_pos(sample, keys, start_pos, input_shape, num_samples)
20 |
21 | def take_all_snippets(sample, keys, input_shape, output_shape):
22 | # Take a sample and extract snippets from the audio signals, using a hop size equal to the output size of the network
23 | start_pos = tf.range(0, sample["length"] - input_shape[0], delta=output_shape[0], dtype=tf.int64)
24 | num_samples = start_pos.shape[0]
25 | return take_snippets_at_pos(sample, keys, start_pos, input_shape, num_samples)
26 |
27 | def take_snippets_at_pos(sample, keys, start_pos, input_shape, num_samples):
28 | # Take a sample and extract snippets from the audio signals at the given start positions with the given number of samples width
29 | batch = dict()
30 | for key in keys:
31 | batch[key] = tf.map_fn(lambda pos: sample[key][pos:pos + input_shape[0], :], start_pos, dtype=tf.float32)
32 | batch[key].set_shape([num_samples, input_shape[0], input_shape[1]])
33 |
34 | return tf.data.Dataset.from_tensor_slices(batch)
35 |
36 | def _floats_feature(value):
37 | return tf.train.Feature(float_list=tf.train.FloatList(value=value.reshape(-1)))
38 |
39 | def _int64_feature(value):
40 | """Returns an int64_list from a bool / enum / int / uint."""
41 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
42 |
43 | def write_records(sample_list, model_config, input_shape, output_shape, records_path):
44 | # Writes samples in the given list as TFrecords into a given path, using the current model config and in/output shapes
45 |
46 | # Compute padding
47 | if (input_shape[1] - output_shape[1]) % 2 != 0:
48 |         print("WARNING: Required amount of padding (" + str(input_shape[1] - output_shape[1]) + ") is uneven!")
49 | pad_frames = (input_shape[1] - output_shape[1]) // 2
50 |
51 | # Set up writers
52 | num_writers = 1
53 | writers = [tf.python_io.TFRecordWriter(records_path + str(i) + ".tfrecords") for i in range(num_writers)]
54 |
55 | # Go through songs and write them to TFRecords
56 | all_keys = model_config["source_names"] + ["mix"]
57 | for sample in sample_list:
58 | print("Reading song")
59 | try:
60 | audio_tracks = dict()
61 |
62 | for key in all_keys:
63 | audio, _ = Utils.load(sample[key], sr=model_config["expected_sr"], mono=model_config["mono_downmix"])
64 |
65 | if not model_config["mono_downmix"] and audio.shape[1] == 1:
66 | print("WARNING: Had to duplicate mono track to generate stereo")
67 | audio = np.tile(audio, [1, 2])
68 |
69 | audio_tracks[key] = audio
70 | except Exception as e:
71 | print(e)
72 | print("ERROR occurred during loading file " + str(sample) + ". Skipping")
73 | continue
74 |
75 | # Pad at beginning and end with zeros
76 | audio_tracks = {key : np.pad(audio_tracks[key], [(pad_frames, pad_frames), (0, 0)], mode="constant", constant_values=0.0) for key in list(audio_tracks.keys())}
77 |
78 | # All audio tracks must be exactly same length and channels
79 | length = audio_tracks["mix"].shape[0]
80 | channels = audio_tracks["mix"].shape[1]
81 | for audio in list(audio_tracks.values()):
82 | assert(audio.shape[0] == length)
83 | assert (audio.shape[1] == channels)
84 |
85 | # Write to TFrecords the flattened version
86 | feature = {key: _floats_feature(audio_tracks[key]) for key in all_keys}
87 | feature["length"] = _int64_feature(length)
88 | feature["channels"] = _int64_feature(channels)
89 | sample = tf.train.Example(features=tf.train.Features(feature=feature))
90 | writers[np.random.randint(0, num_writers)].write(sample.SerializeToString())
91 |
92 | for writer in writers:
93 | writer.close()
94 |
95 | def parse_record(example_proto, source_names, shape):
96 | # Parse record from TFRecord file
97 |
98 | all_names = source_names + ["mix"]
99 |
100 | features = {key : tf.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32) for key in all_names}
101 | features["length"] = tf.FixedLenFeature([], tf.int64)
102 | features["channels"] = tf.FixedLenFeature([], tf.int64)
103 |
104 | parsed_features = tf.parse_single_example(example_proto, features)
105 |
106 | # Reshape
107 | length = tf.cast(parsed_features["length"], tf.int64)
108 | channels = tf.constant(shape[-1], tf.int64) #tf.cast(parsed_features["channels"], tf.int64)
109 | sample = dict()
110 | for key in all_names:
111 | sample[key] = tf.reshape(parsed_features[key], tf.stack([length, channels]))
112 | sample["length"] = length
113 | sample["channels"] = channels
114 |
115 | return sample
116 |
117 | def get_dataset(model_config, input_shape, output_shape, partition):
118 | '''
119 | For a model configuration and input/output shapes of the network, get the corresponding dataset for a given partition
120 | :param model_config: Model config
121 | :param input_shape: Input shape of network
122 | :param output_shape: Output shape of network
123 | :param partition: "train", "valid", or "test" partition
124 | :return: Tensorflow dataset object
125 | '''
126 |
127 |
128 | # Check if pre-processed dataset is already available for this model config and partition
129 | dataset_name = "task_" + model_config["task"] + "_" + \
130 | "sr_" + str(model_config["expected_sr"]) + "_" + \
131 | "mono_" + str(model_config["mono_downmix"])
132 | main_folder = os.path.join(model_config["data_path"], dataset_name)
133 |
134 | if not os.path.exists(main_folder):
135 | # We have to prepare the MUSDB dataset
136 | print("Preparing MUSDB dataset! This could take a while...")
137 | dsd_train, dsd_test = getMUSDB(model_config["musdb_path"]) # List of (mix, acc, bass, drums, other, vocal) tuples
138 |
139 | # Pick 25 random songs for validation from MUSDB train set (this is always the same selection each time since we fix the random seed!)
140 | val_idx = np.random.choice(len(dsd_train), size=25, replace=False)
141 | train_idx = [i for i in range(len(dsd_train)) if i not in val_idx]
142 | print("Validation with MUSDB training songs no. " + str(val_idx))
143 |
144 | # Draw randomly from datasets
145 | dataset = dict()
146 | dataset["train"] = [dsd_train[i] for i in train_idx]
147 | dataset["valid"] = [dsd_train[i] for i in val_idx]
148 | dataset["test"] = dsd_test
149 |
150 |         # MUSDB base dataset is loaded; now create the task-specific dataset based on it
151 | if model_config["task"] == "voice":
152 | # Prepare CCMixter
153 | print("Preparing CCMixter dataset!")
154 | ccm = getCCMixter("CCMixter.xml")
155 | dataset["train"].extend(ccm)
156 |
157 | # Convert audio files into TFRecords now
158 |
159 | # The dataset structure is a dictionary with "train", "valid", "test" keys, whose entries are lists, where each element represents a song.
160 |         # Each song is represented as a dictionary of audio file paths, containing at least the keys in source_names plus "mix" (e.g. mix, accompaniment, vocals for the voice task; mix, bass, drums, other, vocals for multi-instrument).
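        # For example (voice task; file paths are hypothetical), one element of dataset["train"] could look like:
        #   {"mix": "/data/song_mix.wav", "accompaniment": "/data/song_accompaniment.wav", "vocals": "/data/song_vocals.wav"}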
161 |
162 | num_cores = 8
163 |
164 | for curr_partition in ["train", "valid", "test"]:
165 | print("Writing " + curr_partition + " partition...")
166 |
167 | # Shuffle sample order
168 | sample_list = dataset[curr_partition]
169 | random.shuffle(sample_list)
170 |
171 | # Create folder
172 | partition_folder = os.path.join(main_folder, curr_partition)
173 | os.makedirs(partition_folder)
174 |
175 |         part_entries = int(np.ceil(float(len(sample_list)) / float(num_cores)))
176 | processes = list()
177 | for core in range(num_cores):
178 | train_filename = os.path.join(partition_folder, str(core) + "_") # address to save the TFRecords file
179 | sample_list_subset = sample_list[core * part_entries:min((core + 1) * part_entries, len(sample_list))]
180 | proc = Process(target=write_records,
181 | args=(sample_list_subset, model_config, input_shape, output_shape, train_filename))
182 | proc.start()
183 | processes.append(proc)
184 | for p in processes:
185 | p.join()
186 |
187 | print("Dataset ready!")
188 | # Finally, load TFRecords dataset based on the desired partition
189 | dataset_folder = os.path.join(main_folder, partition)
190 | records_files = glob.glob(os.path.join(dataset_folder, "*.tfrecords"))
191 | random.shuffle(records_files)
192 | dataset = tf.data.TFRecordDataset(records_files)
193 | dataset = dataset.map(lambda x : parse_record(x, model_config["source_names"], input_shape[1:]), num_parallel_calls=model_config["num_workers"])
194 | dataset = dataset.prefetch(10)
195 |
196 | # Take random samples from each song
197 | if partition == "train":
198 | dataset = dataset.flat_map(lambda x : take_random_snippets(x, model_config["source_names"] + ["mix"], input_shape[1:], model_config["num_snippets_per_track"]))
199 | else:
200 | dataset = dataset.flat_map(lambda x : take_all_snippets(x, model_config["source_names"] + ["mix"], input_shape[1:], output_shape[1:]))
201 | dataset = dataset.prefetch(100)
202 |
203 |     if partition == "train" and model_config["augmentation"]: # If it's the train partition, activate data augmentation if desired
204 | dataset = dataset.map(Utils.random_amplify, num_parallel_calls=model_config["num_workers"]).prefetch(100)
205 |
206 | # Cut source outputs to centre part
207 | dataset = dataset.map(lambda x : Utils.crop_sample(x, (input_shape[1] - output_shape[1])//2)).prefetch(100)
208 |
209 | if partition == "train": # Repeat endlessly and shuffle when training
210 | dataset = dataset.repeat()
211 | dataset = dataset.shuffle(buffer_size=model_config["cache_size"])
212 |
213 | dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(model_config["batch_size"]))
214 | dataset = dataset.prefetch(1)
215 |
216 | return dataset
217 |
218 | def get_path(db_path, instrument_node):
219 | return db_path + os.path.sep + instrument_node.xpath("./relativeFilepath")[0].text
220 |
221 | def getMUSDB(database_path):
222 | mus = musdb.DB(root_dir=database_path, is_wav=False)
223 |
224 | subsets = list()
225 |
226 | for subset in ["train", "test"]:
227 | tracks = mus.load_mus_tracks(subset)
228 | samples = list()
229 |
230 | # Go through tracks
231 | for track in tracks:
232 | # Skip track if mixture is already written, assuming this track is done already
233 | track_path = track.path[:-4]
234 | mix_path = track_path + "_mix.wav"
235 | acc_path = track_path + "_accompaniment.wav"
236 | if os.path.exists(mix_path):
237 | print("WARNING: Skipping track " + mix_path + " since it exists already")
238 |
239 | # Add paths and then skip
240 | paths = {"mix" : mix_path, "accompaniment" : acc_path}
241 | paths.update({key : track_path + "_" + key + ".wav" for key in ["bass", "drums", "other", "vocals"]})
242 |
243 | samples.append(paths)
244 |
245 | continue
246 |
247 | rate = track.rate
248 |
249 | # Go through each instrument
250 | paths = dict()
251 | stem_audio = dict()
252 | for stem in ["bass", "drums", "other", "vocals"]:
253 | path = track_path + "_" + stem + ".wav"
254 | audio = track.targets[stem].audio
255 | soundfile.write(path, audio, rate, "PCM_16")
256 | stem_audio[stem] = audio
257 | paths[stem] = path
258 |
259 | # Add other instruments to form accompaniment
260 | acc_audio = np.clip(sum([stem_audio[key] for key in list(stem_audio.keys()) if key != "vocals"]), -1.0, 1.0)
261 | soundfile.write(acc_path, acc_audio, rate, "PCM_16")
262 | paths["accompaniment"] = acc_path
263 |
264 | # Create mixture
265 | mix_audio = track.audio
266 | soundfile.write(mix_path, mix_audio, rate, "PCM_16")
267 | paths["mix"] = mix_path
268 |
269 | diff_signal = np.abs(mix_audio - acc_audio - stem_audio["vocals"])
270 | print("Maximum absolute deviation from source additivity constraint: " + str(np.max(diff_signal)))# Check if acc+vocals=mix
271 | print("Mean absolute deviation from source additivity constraint: " + str(np.mean(diff_signal)))
272 |
273 | samples.append(paths)
274 |
275 | subsets.append(samples)
276 |
277 | return subsets
278 |
279 | def getCCMixter(xml_path):
280 | tree = etree.parse(xml_path)
281 | root = tree.getroot()
282 | db_path = root.find("./databaseFolderPath").text
283 | tracks = root.findall(".//track")
284 |
285 | samples = list()
286 |
287 | for track in tracks:
288 | # Get mix and vocal instruments
289 | voice = get_path(db_path, track.xpath(".//instrument[instrumentName='Voice']")[0])
290 | mix = get_path(db_path, track.xpath(".//instrument[instrumentName='Mix']")[0])
291 | acc = get_path(db_path, track.xpath(".//instrument[instrumentName='Instrumental']")[0])
292 |
293 | samples.append({"mix" : mix, "accompaniment" : acc, "vocals" : voice})
294 |
295 | return samples
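# For reference, getCCMixter expects CCMixter.xml to follow roughly this layout (element names
# inferred from the XPath queries above; the root element name and any attributes may differ):
#   <database>
#     <databaseFolderPath>/path/to/ccmixter_corpus</databaseFolderPath>
#     <track>
#       <instrument><instrumentName>Mix</instrumentName><relativeFilepath>song/mix.wav</relativeFilepath></instrument>
#       <instrument><instrumentName>Instrumental</instrumentName><relativeFilepath>song/instrumental.wav</relativeFilepath></instrument>
#       <instrument><instrumentName>Voice</instrumentName><relativeFilepath>song/voice.wav</relativeFilepath></instrument>
#     </track>
#   </database>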
--------------------------------------------------------------------------------
/CCMixter.xml:
--------------------------------------------------------------------------------
1 | [CCMixter.xml: the XML markup is not preserved in this dump; only the text nodes below survive.]
2 | CCMixter
3 | /home/daniel/Datasets/ccmixter_corpus
4 | [Remainder of the file: one track entry per song, each listing its Mix, Instrumental and Voice instruments with relative file paths, as parsed by getCCMixter() in Datasets.py.]
--------------------------------------------------------------------------------