Reading module to load stems into numpy tensors.
26 |
27 |
28 | Expand source code
29 | Browse git
30 |
31 | # flake8: noqa
32 | """
33 | Reading module to load stems into numpy tensors.
34 |
35 |
36 | """
37 | from stempeg.write import FilesWriter
38 | import numpy as np
39 | import warnings
40 | import ffmpeg
41 | import pprint
42 | from multiprocessing import Pool
43 | import atexit
44 | from functools import partial
45 | import datetime as dt
46 |
class Reader(object):
    """Common parent of the reader configuration classes.

    A reader object carries the options that tell `read_stems` how stems
    are laid out inside a file. The base class itself stores nothing.
    """

    def __init__(self):
        # no options to keep on the base reader
        return None
55 |
56 |
class StreamsReader(Reader):
    """Reader configuration for files that store each stem as its own stream.

    This is the default reader used by `read_stems`; it has no options,
    so no state is kept on the instance.
    """

    def __init__(self):
        # nothing to configure for stream-based reading
        return None
65 |
66 |
class ChannelsReader(Reader):
    """Reader configuration for stems multiplexed into channels.

    Stems are extracted from groups of channels of a single multichannel
    stream, e.g. with the default of two channels per stem, 8 channels
    are converted to 4 stereo pairs.

    Args:
        nb_channels (int): number of channels that make up one stem,
            defaults to `2`.
    """

    def __init__(self, nb_channels=2):
        # read back by `read_stems` to split the channel axis into stems
        self.nb_channels = nb_channels
81 |
82 |
def _read_ffmpeg(
    filename,
    sample_rate,
    channels,
    start,
    duration,
    dtype,
    ffmpeg_format,
    stem_idx
):
    """Decode one stream of a file with ffmpeg and return it as numpy array.

    Args:
        filename (str): filename path
        sample_rate (int): sample rate requested from ffmpeg (`-ar`)
        channels (int): number of channels of the stream; used to reshape
            the flat decoded buffer into `(samples, channels)`
        start (float): start position in seconds, `None` to read from
            the beginning
        duration (float): duration in seconds, `None` to read until the end
        dtype (numpy.dtype): Type of audio array to be casted into
        ffmpeg_format (str): ffmpeg intermediate format encoding, one of
            "f64le", "f32le" or "s16le".
            Choose "f32le" for best compatibility
        stem_idx (int): stream id, mapped as `0:<stem_idx>`

    Returns:
        (array_like): numpy audio array of shape `(samples, channels)`

    Raises:
        NotImplementedError: if `ffmpeg_format` is not one of the
            supported intermediate formats.
    """
    # Resolve the raw PCM layout *before* spawning ffmpeg, so that an
    # unsupported format fails immediately instead of after a full decode.
    pcm_dtypes = {
        "f64le": '<f8',  # PCM 64 bit float
        "f32le": '<f4',  # PCM 32 bit float
        "s16le": '<i2',  # PCM 16 bit signed int
    }
    numpy_dtype = pcm_dtypes.get(ffmpeg_format)
    if numpy_dtype is None:
        raise NotImplementedError("ffmpeg format is not supported")

    output_kwargs = {'format': ffmpeg_format, 'ar': sample_rate}
    if duration is not None:
        output_kwargs['t'] = str(dt.timedelta(seconds=duration))
    if start is not None:
        output_kwargs['ss'] = str(dt.timedelta(seconds=start))

    output_kwargs['map'] = '0:' + str(stem_idx)
    process = (
        ffmpeg
        .input(filename)
        .output('pipe:', **output_kwargs)
        .run_async(pipe_stdout=True, pipe_stderr=True))
    # NOTE: stderr is drained but discarded and the exit status is not
    # checked here
    buffer, _ = process.communicate()

    # interpret the raw pcm bytes and split the interleaved channels
    waveform = np.frombuffer(buffer, dtype=numpy_dtype).reshape(-1, channels)

    if not waveform.dtype == np.dtype(dtype):
        # cast to target/output dtype
        waveform = waveform.astype(dtype, order='C')
        # when coming from integer, apply normalization to [-1.0, 1.0]
        # (divides by max + 1, i.e. 32768 for s16le)
        if np.issubdtype(numpy_dtype, np.integer):
            waveform = waveform / (np.iinfo(numpy_dtype).max + 1.0)
    return waveform
146 |
def read_stems(
    filename,
    start=None,
    duration=None,
    stem_id=None,
    always_3d=False,
    dtype=np.float64,
    ffmpeg_format="f32le",
    info=None,
    sample_rate=None,
    reader=StreamsReader(),
    multiprocess=False
):
    """Read stems into numpy tensor

    This function can read both, multi-stream and single stream audio files.
    If used for reading normal audio, the output is a 1d or 2d (mono/stereo)
    array. When multiple streams are read, the output is a 3d array.

    By default `read_stems` assumes that multiple substreams were used to
    save the stem file (`reader=stempeg.StreamsReader()`). To support
    multistream files on audio formats that do not support multiple streams
    (e.g. WAV), streams can be mapped to multiple pairs of channels. In that
    case, `stempeg.ChannelsReader()`, can be passed. Also see:
    `stempeg.write.ChannelsWriter`.

    Args:
        filename (str): filename of the audio file to load data from.
            Non-`str` values are converted with `.decode()`.
        start (float): Start offset to load from in seconds.
        duration (float): Duration to load in seconds.
        stem_id (int or list of int, optional): substream id(s),
            defaults to `None` (all substreams are loaded).
            Ignored when a `ChannelsReader` is used.
        always_3d (bool, optional): By default, reading a
            single-stream audio file will return a
            two-dimensional array. With ``always_3d=True``, audio data is
            always returned as a three-dimensional array, even if the audio
            file has only one stream.
        dtype (np.dtype, optional): Numpy data type to use,
            defaults to `np.float64`.
        ffmpeg_format (str, optional): ffmpeg intermediate PCM format used
            for decoding, defaults to `"f32le"`.
        info (Info, Optional): Pass ffmpeg `Info` object so that the
            metadata does not have to be collected again.
            This can be used e.g. the sample rate and length of a track is
            already known in advance. Useful for ML training where the
            info objects can be pre-processed.
        sample_rate (float, optional): Sample rate of returned audio.
            Defaults to `None` which results in
            the sample rate of the first audio stream.
        reader (Reader): Holds parameters for the reading method.
            One of the following:
                `StreamsReader(...)`
                    Read from a single multistream audio (default).
                `ChannelsReader(...)`
                    Read/demultiplexed from multiple channels.
        multiprocess (bool): Applies multi-processing for reading
            substreams in parallel to speed up reading.
            Defaults to `False`.

    Returns:
        stems (array_like):
            stems tensor of `shape=(stem x samples x channels)`
        rate (float):
            sample rate

    Raises:
        Warning: (raised as an exception) if ffprobe fails, if no audio
            stream is found, or if the file does not match the layout
            expected by `ChannelsReader`.
        RuntimeError: if the substreams differ in their number of channels.

    Shape:
        - Output: `[S, T, C]`, with `S` stems, `T` samples and
            `C` channels. Unless `always_3d` is set, axes of length one
            are squeezed out.

    >>> audio, sample_rate = stempeg.read_stems("test.stem.mp4")
    >>> audio.shape
    (5, 220500, 2)
    >>> sample_rate
    44100
    """
    if not isinstance(filename, str):
        filename = filename.decode()

    # use ffprobe to get info object (samplerate, lengths)
    try:
        if info is None:
            # Info() runs ffprobe itself, so no second probe is needed
            metadata = Info(filename)
        else:
            metadata = info
            # a caller-supplied info does not prove `filename` is readable,
            # so keep probing the file to surface problems early
            ffmpeg.probe(filename)
    except ffmpeg.Error as e:
        raise Warning(
            'An error occurs with ffprobe (see ffprobe output below)\n\n{}'
            .format(e.stderr.decode()))

    # check number of audio streams in file
    if 'streams' not in metadata.info or metadata.nb_audio_streams == 0:
        raise Warning('No audio stream found.')

    # using ChannelReader would ignore substreams
    if isinstance(reader, ChannelsReader):
        if metadata.nb_audio_streams != 1:
            raise Warning(
                'stempeg.ChannelsReader() only processes the first substream.'
            )
        if metadata.audio_streams[0]['channels'] % reader.nb_channels != 0:
            raise Warning('Stems should be encoded as multi-channel.')
        substreams = 0
    elif stem_id is not None:
        substreams = stem_id
    else:
        substreams = metadata.audio_stream_idx()

    if not isinstance(substreams, list):
        substreams = [substreams]

    # if not, get sample rate from mixture
    if sample_rate is None:
        sample_rate = metadata.sample_rate(0)

    _chans = metadata.channels_streams
    # check if all substreams have the same number of channels
    if len(set(_chans)) == 1:
        channels = min(_chans)
    else:
        raise RuntimeError("Stems do not have the same number of channels per substream")

    # everything but the stream index is shared between substreams
    decode = partial(
        _read_ffmpeg,
        filename,
        sample_rate,
        channels,
        start,
        duration,
        dtype,
        ffmpeg_format
    )

    if multiprocess:
        # create the pool only once all validation passed and always tear
        # it down again, even when a worker fails
        _pool = Pool()
        try:
            # map() keeps the substream order and re-raises worker errors
            stems = _pool.map(decode, substreams)
        finally:
            _pool.terminate()
    else:
        stems = [decode(stem_idx) for stem_idx in substreams]

    stem_durations = np.array([t.shape[0] for t in stems])
    if not (stem_durations == stem_durations[0]).all():
        warnings.warn("Stems differ in length and were shortend")
        min_length = np.min(stem_durations)
        stems = [t[:min_length, :] for t in stems]

    # aggregate list of stems to numpy tensor
    stems = np.array(stems)

    # If ChannelsReader is used, demultiplex from channels
    if isinstance(reader, (ChannelsReader)) and stems.shape[-1] > 1:
        # (1, T, C) -> (T, 1, C) -> (T, 1, S, nb_channels) -> (S, T, nb_channels)
        stems = stems.transpose(1, 0, 2)
        stems = stems.reshape(
            stems.shape[0], stems.shape[1], -1, reader.nb_channels
        )
        stems = stems.transpose(2, 0, 3, 1)[..., 0]

    if not always_3d:
        stems = np.squeeze(stems)
    return stems, sample_rate
338 |
339 |
class Info(object):
    """Audio properties that hold a number of metadata.

    The object is created when `read_stems` is called without `info`.
    It can also be created up front and passed to `read_stems` so the
    metadata does not have to be collected again.

    Attributes are filled from the result of `ffmpeg.probe(filename)`:
    `info` is the full probe result, `audio_streams` the list of its
    entries in `streams` whose `codec_type` is `audio`.
    """

    def __init__(self, filename):
        super(Info, self).__init__()
        self.info = ffmpeg.probe(filename)
        self.audio_streams = [
            stream for stream in self.info['streams']
            if stream['codec_type'] == 'audio'
        ]

    @property
    def nb_audio_streams(self):
        """Returns the number of audio substreams"""
        return len(self.audio_streams)

    @property
    def nb_samples_streams(self):
        """Returns a list of number of samples for each substream"""
        return [self.samples(k) for k in range(self.nb_audio_streams)]

    @property
    def channels_streams(self):
        """Returns the number of channels per substream"""
        return [self.channels(k) for k in range(self.nb_audio_streams)]

    @property
    def duration_streams(self):
        """Returns a list of durations (in s) for all substreams"""
        return [self.duration(k) for k in range(self.nb_audio_streams)]

    @property
    def title_streams(self):
        """Returns stream titles for all substreams

        The entry is `None` for streams without a `handler_name` tag.
        """
        return [self.title(k) for k in range(self.nb_audio_streams)]

    def audio_stream_idx(self):
        """Returns audio substream indices"""
        return [s['index'] for s in self.audio_streams]

    def samples(self, idx):
        """Returns the number of samples for a stream index

        The value is read from the `duration_ts` field of the stream.
        """
        return int(self.audio_streams[idx]['duration_ts'])

    def duration(self, idx):
        """Returns the duration (in seconds) for a stream index"""
        return float(self.audio_streams[idx]['duration'])

    def title(self, idx):
        """Return the `handler_name` metadata for a given stream index

        Returns `None` if the stream has no `tags` entry or no
        `handler_name` tag, consistent with `title_streams`.
        """
        # a stream entry without tags must not break metadata access
        return self.audio_streams[idx].get('tags', {}).get('handler_name')

    def rate(self, idx):
        """Alias of `sample_rate`, kept for older stempeg versions"""
        # deprecated from older stempeg version
        return self.sample_rate(idx)

    def sample_rate(self, idx):
        """Return sample rate for a given substream"""
        return int(self.audio_streams[idx]['sample_rate'])

    def channels(self, idx):
        """Returns the number of channels for a given substream"""
        return int(self.audio_streams[idx]['channels'])

    def __repr__(self):
        """Print stream information"""
        return pprint.pformat(self.audio_streams)
418 |
419 |