├── Makefile ├── README.md ├── decode_audio.py ├── decode_audio_ffmpeg.c └── dlpack.h /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | 3 | LIBS_FFMPEG = -lavformat -lavcodec -lavfilter -lavutil 4 | 5 | SHAREDFLAGS = -shared -fPIC 6 | 7 | #CFLAGS = -g 8 | CFLAGS = -O3 9 | 10 | ffmpeg: avio_reading decode_audio_ffmpeg decode_audio_ffmpeg.so 11 | 12 | avio_reading: avio_reading.c 13 | $(CC) -o $@ $< $(LIBS_FFMPEG) $(CFLAGS) 14 | 15 | 16 | decode_audio_ffmpeg: decode_audio_ffmpeg.c 17 | $(CC) -o $@ $< $(LIBS_FFMPEG) $(CFLAGS) 18 | 19 | decode_audio_ffmpeg.so: decode_audio_ffmpeg.c 20 | $(CC) -o $@ $(SHAREDFLAGS) $< $(LIBS_FFMPEG) $(CFLAGS) 21 | 22 | clean: 23 | rm -f decode_audio_ffmpeg decode_audio_ffmpeg.so 24 | 25 | .PHONY: clean ffmpeg 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Work In Progress 2 | 3 | This repo is a primer in reading audio (via ffmpeg) into NumPy/PyTorch arrays without copying data or process launching. Interfacing with FFmpeg is done in pure C code in [decode_audio.c](./decode_audio.c). Python wrapper is implemented in [decode_audio.py](./decode_audio.py) using a standard library module ctypes. C code returns a plain C structure [Audio](./decode_audio.c#L12-L20). This structure is then interpeted and wrapped by NumPy or PyTorch without copy. 4 | 5 | At the bottom is an example of alternative solution using process launching. The first solution is preferable if you must load huge amounts of audio in various formats (for reading `*.wav` files, there exists a standard Python [`wave`](https://docs.python.org/3/library/wave.html) module and [`scipy.io.wavfile.read`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.read.html)). 
6 | 7 | It is also a simple primer on FFmpeg audio decoding loop and basic ctypes usage for interfacing C code and NumPy/PyTorch (without creating a full-blown PyTorch C++ extension). 8 | 9 | ### Usage 10 | ```shell 11 | # install dependencies: ffmpeg executables and shared libraries on ubuntu 12 | apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavfilter-dev 13 | ``` 14 | 15 | ```shell 16 | # create sample audio test.wav 17 | ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -c:a pcm_s16le -ar 8000 test.wav 18 | 19 | # convert audio to raw format 20 | ffmpeg -i test.wav -f s16le -acodec pcm_s16le golden.raw 21 | 22 | # play a raw file 23 | ffplay -f s16le -ac 1 -ar 8000 golden.raw 24 | 25 | # compile executable for testing 26 | make decode_audio_ffmpeg 27 | 28 | # convert audio to raw format and compare to golden 29 | ./decode_audio_ffmpeg test.wav bin.raw 30 | diff golden.raw bin.raw 31 | 32 | # compile a shared library for interfacing with NumPy and PyTorch 33 | make decode_audio_ffmpeg.so 34 | 35 | # convert audio to raw format (NumPy) and compare to golden 36 | python3 decode_audio.py -i test.wav -o numpy.raw 37 | diff golden.raw numpy.raw 38 | 39 | # convert audio to raw format (PyTorch) and compare to golden 40 | python3 decode_audio.py -i test.wav -o torch.raw 41 | diff golden.raw torch.raw 42 | 43 | # convert audio to raw format (PyTorch / DLPack) and compare to golden 44 | python3 decode_audio.py -i test.wav -o dlpack.raw 45 | diff golden.raw dlpack.raw 46 | ``` 47 | 48 | ```python 49 | # read audio using subprocess 50 | # python3 decode_audio_subprocess.py test.wav 51 | 52 | import sys 53 | import subprocess 54 | import struct 55 | 56 | format_ffmpeg, format_struct = [('s16le', 'h'), ('f32le', 'f'), ('u8', 'B'), ('s8', 'b')][0] 57 | sample_rate = 8_000 # resample 58 | num_channels = 1 # force mono 59 | 60 | audio = memoryview(subprocess.check_output(['ffmpeg', '-nostdin', '-hide_banner', '-nostats', '-loglevel', 'quiet', '-i', sys.argv[1], 
'-f', format_ffmpeg, '-ar', str(sample_rate), '-ac', str(num_channels), '-'])) 61 | audio = audio.cast(format_struct, shape = [len(audio) // num_channels // struct.calcsize(format_struct), num_channels]) 62 | 63 | print('shape', audio.shape, 'itemsize', audio.itemsize, 'format', audio.format) 64 | # shape (40000, 1) itemsize 2 format h 65 | ``` 66 | 67 | ### TODO 68 | - SOX backend ( https://github.com/pytorch/audio/blob/master/torchaudio/torch_sox.cpp) 69 | - ffmpeg audio filter graph 70 | - decode from a buffer 71 | - non-allocating version that keeps allocations in Python for simpler memory management 72 | - probe function 73 | -------------------------------------------------------------------------------- /decode_audio.py: -------------------------------------------------------------------------------- 1 | # support raw data input 2 | # redo API, maybe add explicit duration, maybe remove DLPack, keep only NumPy 3 | # specify output channel_layour? 4 | 5 | # https://bugs.python.org/issue11429 6 | # https://bugs.python.org/issue12836 7 | # https://stackoverflow.com/questions/20439640/ffmpeg-audio-transcoding-using-libav-libraries 8 | # https://stackoverflow.com/questions/45549285/resampling-audio-using-libswresample-from-48000-to-44100 9 | # https://www.codetd.com/en/article/6791150 10 | # https://gist.github.com/jimjibone/6569303 11 | # https://gavv.github.io/articles/decode-play/ 12 | 13 | import os 14 | import sys 15 | import ctypes 16 | 17 | class DLDeviceType(ctypes.c_int): 18 | kDLCPU = 1 19 | kDLGPU = 2 20 | kDLCPUPinned = 3 21 | kDLOpenCL = 4 22 | kDLVulkan = 7 23 | kDLMetal = 8 24 | kDLVPI = 9 25 | kDLROCM = 10 26 | kDLExtDev = 12 27 | 28 | class DLDataTypeCode(ctypes.c_uint8): 29 | kDLInt = 0 30 | kDLUInt = 1 31 | kDLFloat = 2 32 | kDLBfloat = 4 33 | 34 | def __str__(self): 35 | return {self.kDLInt : 'int', self.kDLUInt : 'uint', self.kDLFloat : 'float', self.kDLBfloat : 'bfloat'}[self.value] 36 | 37 | class DLDataType(ctypes.Structure): 38 | _fields_ 
class DLTensor(ctypes.Structure):
    """ctypes mirror of the DLPack ``DLTensor`` struct (see dlpack.h).

    Plain C tensor descriptor: does not own or manage its memory.
    """

    _fields_ = [
        ('data', ctypes.c_void_p),
        ('ctx', DLContext),
        ('ndim', ctypes.c_int),
        ('dtype', DLDataType),
        ('shape', ctypes.POINTER(ctypes.c_int64)),
        ('strides', ctypes.POINTER(ctypes.c_int64)),
        ('byte_offset', ctypes.c_uint64)
    ]

    @property
    def size(self):
        # Total number of elements: product over all dimensions.
        prod = 1
        for i in range(self.ndim):
            prod *= self.shape[i]
        return prod

    @property
    def itemsize(self):
        # Bytes per element; `bits` is per lane.
        return self.dtype.lanes * self.dtype.bits // 8

    @property
    def nbytes(self):
        return self.size * self.itemsize

    @property
    def __array_interface__(self):
        # NumPy array-interface protocol, version 3, enabling zero-copy
        # wrapping via numpy.asarray().
        shape = tuple(self.shape[dim] for dim in range(self.ndim))
        # DLPack allows strides == NULL meaning "compact, row-major"; the
        # array interface expresses the same with strides = None.  A NULL
        # ctypes pointer is falsy, so this also fixes a crash on compact tensors.
        strides = tuple(self.strides[dim] * self.itemsize for dim in range(self.ndim)) if self.strides else None
        typestr = '|' + str(self.dtype.type_code)[0] + str(self.itemsize)
        return dict(version = 3, shape = shape, strides = strides, data = (self.data, True), offset = self.byte_offset, typestr = typestr)

    def __str__(self):
        shape = tuple(self.shape[i] for i in range(self.ndim))
        strides = tuple(self.strides[i] for i in range(self.ndim)) if self.strides else None
        return 'dtype={dtype}, ndim={ndim}, shape={shape}, strides={strides}, byte_offset={byte_offset}'.format(dtype = self.dtype, ndim = self.ndim, shape = shape, strides = strides, byte_offset = self.byte_offset)
class DecodeAudio(ctypes.Structure):
    """Mirror of ``struct DecodeAudio`` from decode_audio_ffmpeg.c, doubling as
    the ctypes wrapper used to call the shared library.

    The same structure type serves as input/output options for the C side and
    as the returned decode result.
    """

    _fields_ = [
        ('error', ctypes.c_char * 128),
        ('fmt', ctypes.c_char * 8),
        ('sample_rate', ctypes.c_ulonglong),
        ('num_channels', ctypes.c_ulonglong),
        ('num_samples', ctypes.c_ulonglong),
        ('duration', ctypes.c_double),
        ('itemsize', ctypes.c_ulonglong),
        ('data', DLManagedTensor)
    ]

    def __init__(self, lib_path = os.path.abspath('decode_audio_ffmpeg.so')):
        # Load the shared library and declare decode_audio()'s signature once.
        self.lib = ctypes.CDLL(lib_path)
        self.lib.decode_audio.argtypes = [ctypes.c_char_p, DecodeAudio, DecodeAudio, ctypes.c_char_p, ctypes.c_int, ctypes.c_int]
        self.lib.decode_audio.restype = DecodeAudio

    def __str__(self):
        return f'num_samples={self.num_samples}, num_channels={self.num_channels}, sample_fmt={self.fmt.decode()}, {self.data.dl_tensor}'

    def __call__(self, input_path = None, input_buffer = None, output_buffer = None, filter_string = '', sample_rate = None, probe = False, verbose = False):
        """Decode audio from a file path or from an in-memory buffer.

        Exactly one of input_path / input_buffer should be provided.  If
        output_buffer (a writable buffer, e.g. bytearray) is given, decoding
        writes into it instead of allocating on the C side.

        Raises Exception carrying the C-side error message on failure.
        """
        # Bug fix: the struct field is 'type_code', not 'code'.  ctypes silently
        # accepts unknown keywords as plain attributes, so the original left
        # type_code at 0 (kDLInt) instead of kDLUInt.
        uint8 = DLDataType(type_code = DLDataTypeCode.kDLUInt, bits = 8, lanes = 1)
        input_options = DecodeAudio()
        output_options = DecodeAudio()

        if input_buffer is not None:
            # Zero-copy: hand the C side the raw address of the numpy buffer.
            input_options.data.dl_tensor.data = ctypes.c_void_p(input_buffer.__array_interface__['data'][0])
            input_options.data.dl_tensor.shape = (ctypes.c_int64 * 1)(len(input_buffer))
            input_options.data.dl_tensor.ndim = 1
            input_options.data.dl_tensor.dtype = uint8

        if output_buffer is not None:
            # Bug fix: size the ctypes view by the *output* buffer (the original
            # used len(input_buffer), which mis-sized the view and crashed when
            # output_buffer was passed without input_buffer).
            output_options.data.dl_tensor.data = ctypes.cast((ctypes.c_char * len(output_buffer)).from_buffer(output_buffer), ctypes.c_void_p)
            output_options.data.dl_tensor.shape = (ctypes.c_int64 * 1)(len(output_buffer))
            output_options.data.dl_tensor.ndim = 1
            output_options.data.dl_tensor.dtype = uint8

        if sample_rate is not None:
            output_options.sample_rate = sample_rate

        audio = self.lib.decode_audio(input_path.encode() if input_path else None, input_options, output_options, filter_string.encode() if filter_string else None, probe, verbose)
        if audio.error:
            raise Exception(audio.error.decode())
        return audio

    def to_dlpack(self):
        # Wrap the embedded DLManagedTensor in a PyCapsule named 'dltensor',
        # the name DLPack consumers (torch.utils.dlpack.from_dlpack) expect.
        byte_order = 'little' if b'le' in self.fmt else 'big' if b'be' in self.fmt else 'native'
        assert byte_order == 'native' or byte_order == sys.byteorder
        return PyCapsule_New(ctypes.byref(self.data), b'dltensor', None)
def measure(k, f, audio_path, K = 100, timer = time.process_time, **kwargs):
    """Run f(audio_path, **kwargs) K times, print the mean per-call process
    time in microseconds under label k, and return the last result."""
    started = timer()
    for _ in range(K):
        result = f(audio_path, **kwargs)
    elapsed = timer() - started
    print(k, elapsed * 1e6 / K, 'microsec')
    return result
del array 214 | del dlpack_tensor 215 | -------------------------------------------------------------------------------- /decode_audio_ffmpeg.c: -------------------------------------------------------------------------------- 1 | // based on https://github.com/FFmpeg/FFmpeg/blob/master/doc/examples/decode_audio.c and https://github.com/FFmpeg/FFmpeg/blob/master/doc/examples/demuxing_decoding.c and https://github.com/FFmpeg/FFmpeg/blob/master/doc/examples/filtering_audio.c 2 | 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h 19 | #include "dlpack.h" 20 | 21 | void deleter(struct DLManagedTensor* self) 22 | { 23 | if(self->dl_tensor.data) 24 | { 25 | free(self->dl_tensor.data); 26 | self->dl_tensor.data = NULL; 27 | } 28 | 29 | if(self->dl_tensor.shape) 30 | { 31 | free(self->dl_tensor.shape); 32 | self->dl_tensor.shape = NULL; 33 | } 34 | 35 | if(self->dl_tensor.strides) 36 | { 37 | free(self->dl_tensor.strides); 38 | self->dl_tensor.strides = NULL; 39 | } 40 | } 41 | 42 | void __attribute__ ((constructor)) onload() 43 | { 44 | //needed before ffmpeg 4.0, deprecated in ffmpeg 4.0 45 | av_register_all(); 46 | avfilter_register_all(); 47 | } 48 | 49 | struct DecodeAudio 50 | { 51 | char error[128]; 52 | char fmt[8]; 53 | uint64_t sample_rate; 54 | uint64_t num_channels; 55 | uint64_t num_samples; 56 | uint64_t itemsize; 57 | double duration; 58 | DLManagedTensor data; 59 | }; 60 | 61 | void process_output_frame(uint8_t** data, AVFrame* frame, int num_samples, int num_channels, uint64_t* data_len, int itemsize) 62 | { 63 | if(num_channels == 1) 64 | data = memcpy(*data, frame->data, itemsize * frame->nb_samples) + itemsize * frame->nb_samples; 65 | else 66 | { 67 | for (int i = 0; i < num_samples; i++) 68 | { 69 | for (int c = 0; c < num_channels; c++) 70 | { 71 | 
if(*data_len >= itemsize) 72 | { 73 | *data = memcpy(*data, frame->data[c] + itemsize * i, itemsize) + itemsize; 74 | *data_len -= itemsize; 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 81 | int decode_packet(AVCodecContext *av_ctx, AVFilterContext* buffersrc_ctx, AVFilterContext* buffersink_ctx, AVPacket *pkt, uint8_t** data, uint64_t* data_len, int itemsize) 82 | { 83 | AVFrame *frame = av_frame_alloc(); 84 | AVFrame *filt_frame = av_frame_alloc(); 85 | 86 | int ret = avcodec_send_packet(av_ctx, pkt); 87 | 88 | int filtering = buffersrc_ctx != NULL && buffersink_ctx != NULL; 89 | while (ret >= 0) 90 | { 91 | ret = avcodec_receive_frame(av_ctx, frame); 92 | if (ret == 0) 93 | { 94 | if(filtering) 95 | { 96 | ret = av_buffersrc_add_frame_flags(buffersrc_ctx, frame, AV_BUFFERSRC_FLAG_KEEP_REF); 97 | if(ret < 0) 98 | goto end; 99 | } 100 | 101 | while (filtering) 102 | { 103 | ret = av_buffersink_get_frame(buffersink_ctx, filt_frame); 104 | if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) 105 | break; 106 | if (ret < 0) 107 | goto end; 108 | process_output_frame(data, filt_frame, filt_frame->nb_samples, av_ctx->channels, data_len, itemsize); 109 | av_frame_unref(filt_frame); 110 | } 111 | 112 | if(!filtering) 113 | { 114 | process_output_frame(data, frame, frame->nb_samples, av_ctx->channels, data_len, itemsize); 115 | } 116 | //av_frame_unref(frame); 117 | } 118 | } 119 | 120 | end: 121 | if (ret == AVERROR(EAGAIN)) 122 | ret = 0; 123 | 124 | av_frame_free(&frame); 125 | av_frame_free(&filt_frame); 126 | return ret; 127 | } 128 | 129 | struct buffer_cursor 130 | { 131 | uint8_t *base; 132 | size_t size; 133 | uint8_t *ptr; 134 | size_t left; 135 | }; 136 | 137 | static int buffer_read(void *opaque, uint8_t *buf, int buf_size) 138 | { 139 | struct buffer_cursor *cursor = (struct buffer_cursor *)opaque; 140 | buf_size = FFMIN(buf_size, cursor->left); 141 | 142 | if (!buf_size) 143 | return AVERROR_EOF; 144 | 145 | memcpy(buf, cursor->ptr, buf_size); 146 | cursor->ptr += 
buf_size; 147 | cursor->left -= buf_size; 148 | return buf_size; 149 | } 150 | 151 | static int64_t buffer_seek(void* opaque, int64_t offset, int whence) 152 | { 153 | struct buffer_cursor *cursor = (struct buffer_cursor *)opaque; 154 | if(whence == AVSEEK_SIZE) 155 | return cursor->size; 156 | 157 | cursor->ptr = cursor->base + offset; 158 | cursor->left = cursor->size - offset; 159 | return offset; 160 | } 161 | 162 | size_t nbytes(struct DecodeAudio* audio) 163 | { 164 | size_t itemsize = audio->data.dl_tensor.dtype.lanes * audio->data.dl_tensor.dtype.bits / 8; 165 | size_t size = 1; 166 | for(size_t i = 0; i < audio->data.dl_tensor.ndim; i++) 167 | size *= audio->data.dl_tensor.shape[i]; 168 | return size * itemsize; 169 | } 170 | 171 | struct DecodeAudio decode_audio(const char* input_path, struct DecodeAudio input_options, struct DecodeAudio output_options, const char* filter_string, int probe, int verbose) 172 | { 173 | av_log_set_level(verbose ? AV_LOG_DEBUG : AV_LOG_FATAL); 174 | 175 | clock_t tic = clock(); 176 | 177 | struct DecodeAudio audio = { 0 }; 178 | 179 | AVIOContext* io_ctx = NULL; 180 | AVFormatContext* fmt_ctx = avformat_alloc_context(); 181 | AVCodecContext* dec_ctx = NULL; 182 | AVPacket* pkt = NULL; 183 | char filter_args[1024]; 184 | AVFilterGraph *graph = NULL; 185 | AVFilterInOut *gis = avfilter_inout_alloc(); 186 | AVFilterInOut *gos = avfilter_inout_alloc(); 187 | AVFilterContext *buffersrc_ctx = NULL; 188 | AVFilterContext *buffersink_ctx = NULL; 189 | AVFilter *buffersrc = avfilter_get_by_name("abuffer"); 190 | AVFilter *buffersink = avfilter_get_by_name("abuffersink"); 191 | assert(buffersrc != NULL && buffersink != NULL); 192 | uint8_t* avio_ctx_buffer = NULL; 193 | struct buffer_cursor cursor = { 0 }; 194 | int buffer_multiple = 1; 195 | 196 | if(filter_string != NULL && strlen(filter_string) > 512) 197 | { 198 | strcpy(audio.error, "Too long filter string"); 199 | goto end; 200 | } 201 | 202 | if(input_path == NULL) 203 | { 204 | 
size_t avio_ctx_buffer_size = 4096 * buffer_multiple; 205 | avio_ctx_buffer = av_malloc(avio_ctx_buffer_size); 206 | assert(avio_ctx_buffer); 207 | 208 | cursor.base = cursor.ptr = (uint8_t*)input_options.data.dl_tensor.data; 209 | cursor.size = cursor.left = nbytes(&input_options); 210 | io_ctx = avio_alloc_context(avio_ctx_buffer, avio_ctx_buffer_size, 0, &cursor, &buffer_read, NULL, &buffer_seek); 211 | if(!io_ctx) 212 | { 213 | strcpy(audio.error, "Cannot allocate IO context"); 214 | goto end; 215 | } 216 | 217 | fmt_ctx->pb = io_ctx; 218 | } 219 | 220 | if(verbose) printf("decode_audio_BEFORE__: %.2f microsec\n", (float)(clock() - tic) * 1000000 / CLOCKS_PER_SEC); 221 | 222 | fmt_ctx->format_probesize = 2048; 223 | AVInputFormat* input_format = av_find_input_format("wav"); 224 | if (avformat_open_input(&fmt_ctx, input_path, input_format, NULL) != 0) 225 | { 226 | strcpy(audio.error, "Cannot open file"); 227 | goto end; 228 | } 229 | if(probe) return audio; 230 | fmt_ctx->streams[0]->probe_packets = 1; 231 | //fmt_ctx->streams[0]->probesize = 2048; 232 | if(verbose) printf("decode_audio_BEFORE: %.2f microsec\n", (float)(clock() - tic) * 1000000 / CLOCKS_PER_SEC); 233 | 234 | //if (avformat_find_stream_info(fmt_ctx, NULL) < 0) 235 | //{ 236 | // strcpy(audio.error, "Cannot open find stream information"); 237 | // goto end; 238 | //} 239 | if(verbose) printf("decode_audio_AFTER: %.2f microsec\n", (float)(clock() - tic) * 1000000 / CLOCKS_PER_SEC); 240 | 241 | 242 | int stream_index = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, NULL, 0); 243 | if (stream_index < 0) 244 | { 245 | strcpy(audio.error, "Cannot find audio stream"); 246 | goto end; 247 | } 248 | AVStream *stream = fmt_ctx->streams[stream_index]; 249 | //stream->codecpar->block_align = 4096 * buffer_multiple; 250 | 251 | AVCodec *codec = avcodec_find_decoder(stream->codecpar->codec_id); 252 | if (!codec) 253 | { 254 | strcpy(audio.error, "Codec not found"); 255 | goto end; 256 | } 257 | 258 
| dec_ctx = avcodec_alloc_context3(codec); 259 | if (!dec_ctx) 260 | { 261 | strcpy(audio.error, "Cannot allocate audio codec context"); 262 | goto end; 263 | } 264 | 265 | if (avcodec_parameters_to_context(dec_ctx, stream->codecpar) < 0) 266 | { 267 | strcpy(audio.error, "Failed to copy audio codec parameters to decoder context"); 268 | goto end; 269 | } 270 | 271 | if (avcodec_open2(dec_ctx, codec, NULL) < 0) 272 | { 273 | strcpy(audio.error, "Cannot open codec"); 274 | goto end; 275 | } 276 | 277 | enum AVSampleFormat sample_fmt = dec_ctx->sample_fmt; 278 | if (av_sample_fmt_is_planar(sample_fmt)) 279 | { 280 | const char *packed = av_get_sample_fmt_name(sample_fmt); 281 | printf("Warning: the sample format the decoder produced is planar (%s). This example will output the first channel only.\n", packed ? packed : "?"); 282 | sample_fmt = av_get_packed_sample_fmt(dec_ctx->sample_fmt); 283 | } 284 | static struct sample_fmt_entry {enum AVSampleFormat sample_fmt; const char *fmt_be, *fmt_le; DLDataType dtype;} supported_sample_fmt_entries[] = 285 | { 286 | { AV_SAMPLE_FMT_U8, "u8" , "u8" , { kDLUInt , 8 , 1 }}, 287 | { AV_SAMPLE_FMT_S16, "s16be", "s16le" , { kDLInt , 16, 1 }}, 288 | { AV_SAMPLE_FMT_S32, "s32be", "s32le" , { kDLInt , 32, 1 }}, 289 | { AV_SAMPLE_FMT_FLT, "f32be", "f32le" , { kDLFloat , 32, 1 }}, 290 | { AV_SAMPLE_FMT_DBL, "f64be", "f64le" , { kDLFloat , 64, 1 }}, 291 | }; 292 | 293 | double in_duration = stream->time_base.num * (int)stream->duration / stream->time_base.den; 294 | //double in_duration = fmt_ctx->duration / (float) AV_TIME_BASE; assert(in_duration > 0); 295 | double out_duration = in_duration; 296 | int in_sample_rate = dec_ctx->sample_rate; 297 | int out_sample_rate = output_options.sample_rate > 0 ? 
output_options.sample_rate : in_sample_rate; 298 | uint64_t out_num_samples = out_duration * out_sample_rate; 299 | int out_num_channels = dec_ctx->channels; 300 | 301 | DLDataType in_dtype, out_dtype; 302 | enum AVSampleFormat in_sample_fmt = AV_SAMPLE_FMT_NONE, out_sample_fmt = AV_SAMPLE_FMT_NONE; 303 | for (int k = 0; k < FF_ARRAY_ELEMS(supported_sample_fmt_entries); k++) 304 | { 305 | struct sample_fmt_entry* entry = &supported_sample_fmt_entries[k]; 306 | 307 | if (sample_fmt == entry->sample_fmt) 308 | { 309 | in_dtype = entry->dtype; 310 | in_sample_fmt = entry->sample_fmt; 311 | strcpy(audio.fmt, AV_NE(entry->fmt_be, entry->fmt_le)); 312 | } 313 | 314 | if (strcmp(output_options.fmt, entry->fmt_le) == 0 || strcmp(output_options.fmt, entry->fmt_be) == 0) 315 | { 316 | out_dtype = entry->dtype; 317 | out_sample_fmt = entry->sample_fmt; 318 | } 319 | } 320 | if (in_sample_fmt == AV_SAMPLE_FMT_NONE) 321 | { 322 | strcpy(audio.error, "Cannot deduce format"); 323 | goto end; 324 | } 325 | if (out_sample_fmt == AV_SAMPLE_FMT_NONE) 326 | { 327 | out_sample_fmt = in_sample_fmt; 328 | out_dtype = in_dtype; 329 | } 330 | 331 | if (!dec_ctx->channel_layout) 332 | dec_ctx->channel_layout = av_get_default_channel_layout(dec_ctx->channels); 333 | uint64_t channel_layout = dec_ctx->channel_layout; 334 | 335 | audio.duration = out_duration; 336 | audio.sample_rate = out_sample_rate; 337 | audio.num_channels = out_num_channels; 338 | audio.num_samples = out_num_samples; 339 | audio.data.dl_tensor.ctx.device_type = kDLCPU; 340 | audio.data.dl_tensor.ndim = 2; 341 | audio.data.dl_tensor.dtype = out_dtype; 342 | audio.data.dl_tensor.shape = malloc(audio.data.dl_tensor.ndim * sizeof(int64_t)); 343 | audio.data.dl_tensor.shape[0] = audio.num_samples; 344 | audio.data.dl_tensor.shape[1] = audio.num_channels; 345 | audio.data.dl_tensor.strides = malloc(audio.data.dl_tensor.ndim * sizeof(int64_t)); 346 | audio.data.dl_tensor.strides[0] = audio.data.dl_tensor.shape[1]; 347 | 
audio.data.dl_tensor.strides[1] = 1; 348 | audio.itemsize = audio.data.dl_tensor.dtype.lanes * audio.data.dl_tensor.dtype.bits / 8; 349 | 350 | if(probe) 351 | goto end; 352 | 353 | bool need_filter = filter_string != NULL && strlen(filter_string) > 0; 354 | bool need_resample = out_sample_rate != in_sample_rate || out_sample_fmt != in_sample_fmt; 355 | if(need_filter || need_resample) 356 | { 357 | graph = avfilter_graph_alloc(); 358 | if(!graph) 359 | { 360 | strcpy(audio.error, "Cannot allocate filter graph"); 361 | goto end; 362 | } 363 | 364 | sprintf(filter_args, "sample_rate=%d:sample_fmt=%s:channel_layout=0x%"PRIx64":time_base=%d/%d", in_sample_rate, av_get_sample_fmt_name(in_sample_fmt), channel_layout, dec_ctx->time_base.num, dec_ctx->time_base.den); 365 | 366 | if (avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in", filter_args, NULL, graph) < 0) 367 | { 368 | strcpy(audio.error, "Cannot create buffer source"); 369 | goto end; 370 | } 371 | int ret; 372 | if ((ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out", NULL, NULL, graph)) < 0) 373 | { 374 | strcpy(audio.error, "Cannot create buffer sink"); 375 | goto end; 376 | } 377 | const enum AVSampleFormat out_sample_fmts[] = { out_sample_fmt, -1 }; 378 | if (av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1, AV_OPT_SEARCH_CHILDREN) < 0) 379 | { 380 | strcpy(audio.error, "Cannot set output sample format"); 381 | goto end; 382 | } 383 | const int64_t out_channel_layouts[] = { channel_layout , -1 }; 384 | if (av_opt_set_int_list(buffersink_ctx, "channel_layouts", out_channel_layouts, -1, AV_OPT_SEARCH_CHILDREN) < 0) 385 | { 386 | strcpy(audio.error, "Cannot set output channel layout"); 387 | goto end; 388 | } 389 | const int out_sample_rates[] = { out_sample_rate, -1 }; 390 | if (av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1, AV_OPT_SEARCH_CHILDREN) < 0) 391 | { 392 | strcpy(audio.error, "Cannot set output sample rate"); 393 | goto 
end; 394 | } 395 | 396 | const char* out_sample_fmt_name = av_get_sample_fmt_name(out_sample_fmt); 397 | if(need_resample) 398 | { 399 | sprintf(filter_args, "%s%saresample=out_sample_rate=%d:out_sample_fmt=%s,aformat=sample_rates=%d:sample_fmts=%s:channel_layouts=0x%"PRIu64, need_filter ? filter_string : "", need_filter ? "," : "", out_sample_rate, out_sample_fmt_name, out_sample_rate, out_sample_fmt_name, channel_layout); 400 | } 401 | else 402 | { 403 | sprintf(filter_args, "%s%saformat=sample_rates=%d:sample_fmts=%s:channel_layouts=0x%"PRIu64, need_filter ? filter_string : "", need_filter ? "," : "", out_sample_rate, out_sample_fmt_name, channel_layout); 404 | } 405 | 406 | gis->name = av_strdup("out"); 407 | gis->filter_ctx = buffersink_ctx; 408 | gis->pad_idx = 0; 409 | gis->next = NULL; 410 | 411 | gos->name = av_strdup("in"); 412 | gos->filter_ctx = buffersrc_ctx; 413 | gos->pad_idx = 0; 414 | gos->next = NULL; 415 | 416 | if(avfilter_graph_parse_ptr(graph, filter_args, &gis, &gos, NULL) < 0) 417 | { 418 | strcpy(audio.error, "Cannot parse graph"); 419 | goto end; 420 | } 421 | 422 | if(avfilter_graph_config(graph, NULL) < 0) 423 | { 424 | strcpy(audio.error, "Cannot configure graph."); 425 | goto end; 426 | } 427 | } 428 | 429 | uint64_t data_len = 0; 430 | if(output_options.data.dl_tensor.data) 431 | { 432 | data_len = nbytes(&output_options); 433 | audio.data.dl_tensor.data = output_options.data.dl_tensor.data; 434 | } 435 | else 436 | { 437 | audio.data.deleter = deleter; 438 | data_len = audio.num_samples * audio.num_channels * audio.itemsize; 439 | audio.data.dl_tensor.data = calloc(data_len, 1); 440 | } 441 | 442 | uint8_t* data_ptr = audio.data.dl_tensor.data; 443 | pkt = av_packet_alloc(); 444 | while (av_read_frame(fmt_ctx, pkt) >= 0) 445 | { 446 | //if (pkt->stream_index == stream_index && decode_packet(dec_ctx, buffersrc_ctx, buffersink_ctx, pkt, &data_ptr, &data_len, audio.itemsize) < 0) 447 | // break; 448 | //av_packet_unref(pkt); 449 | } 450 
| return audio; 451 | 452 | pkt->data = NULL; 453 | pkt->size = 0; 454 | //decode_packet(dec_ctx, buffersrc_ctx, buffersink_ctx, pkt, &data_ptr, &data_len, audio.itemsize); 455 | 456 | end: 457 | if(graph) 458 | avfilter_graph_free(&graph); 459 | if(dec_ctx) 460 | avcodec_free_context(&dec_ctx); 461 | if(fmt_ctx) 462 | avformat_close_input(&fmt_ctx); 463 | if(pkt) 464 | av_packet_free(&pkt); 465 | if(gis) 466 | avfilter_inout_free(&gis); 467 | if(gos) 468 | avfilter_inout_free(&gos); 469 | if(io_ctx) 470 | av_free(io_ctx); 471 | 472 | //fprintf(stderr, "Error occurred: %s\n", av_err2str(ret)); 473 | return audio; 474 | } 475 | 476 | int main(int argc, char **argv) 477 | { 478 | if (argc <= 2) 479 | { 480 | printf("Usage: %s \n", argv[0]); 481 | return 1; 482 | } 483 | 484 | struct DecodeAudio input_options = { 0 }, output_options = { 0 }; 485 | 486 | //struct DecodeAudio audio = decode_audio(argv[1], false, input_options, output_options, argc == 4 ? argv[3] : NULL); 487 | 488 | char buf[100000]; 489 | int64_t read = fread(buf, 1, sizeof(buf), fopen(argv[1], "r")); 490 | input_options.data.dl_tensor.data = &buf; 491 | input_options.data.dl_tensor.ndim = 1; 492 | input_options.data.dl_tensor.shape = &read; 493 | input_options.data.dl_tensor.dtype.lanes = 1; 494 | input_options.data.dl_tensor.dtype.bits = 8; 495 | input_options.data.dl_tensor.dtype.code = kDLUInt; 496 | 497 | clock_t tic = clock(); 498 | struct DecodeAudio audio = decode_audio(NULL, input_options, output_options, argc == 4 ? 
argv[3] : NULL, false, true); 499 | printf("decode_audio: %.2f microsec\n", (float)(clock() - tic) * 1000000 / CLOCKS_PER_SEC); 500 | 501 | //printf("Error: [%s]\n", audio.error); 502 | //printf("ffplay -i %s\n", argv[1]); 503 | //printf("ffplay -f %s -ac %d -ar %d -i %s # num samples: %d\n", audio.fmt, (int)audio.num_channels, (int)audio.sample_rate, argv[2], (int)audio.num_samples); 504 | //fwrite(audio.data.dl_tensor.data, audio.itemsize, audio.num_samples * audio.num_channels, fopen(argv[2], "wb")); 505 | return 0; 506 | } 507 | -------------------------------------------------------------------------------- /dlpack.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 by Contributors 3 | * \file dlpack.h 4 | * \brief The common header of DLPack. 5 | */ 6 | #ifndef DLPACK_DLPACK_H_ 7 | #define DLPACK_DLPACK_H_ 8 | 9 | #ifdef __cplusplus 10 | #define DLPACK_EXTERN_C extern "C" 11 | #else 12 | #define DLPACK_EXTERN_C 13 | #endif 14 | 15 | /*! \brief The current version of dlpack */ 16 | #define DLPACK_VERSION 020 17 | 18 | /*! \brief DLPACK_DLL prefix for windows */ 19 | #ifdef _WIN32 20 | #ifdef DLPACK_EXPORTS 21 | #define DLPACK_DLL __declspec(dllexport) 22 | #else 23 | #define DLPACK_DLL __declspec(dllimport) 24 | #endif 25 | #else 26 | #define DLPACK_DLL 27 | #endif 28 | 29 | #include 30 | #include 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | /*! 36 | * \brief The device type in DLContext. 37 | */ 38 | typedef enum { 39 | /*! \brief CPU device */ 40 | kDLCPU = 1, 41 | /*! \brief CUDA GPU device */ 42 | kDLGPU = 2, 43 | /*! 44 | * \brief Pinned CUDA GPU device by cudaMallocHost 45 | * \note kDLCPUPinned = kDLCPU | kDLGPU 46 | */ 47 | kDLCPUPinned = 3, 48 | /*! \brief OpenCL devices. */ 49 | kDLOpenCL = 4, 50 | /*! \brief Vulkan buffer for next generation graphics. */ 51 | kDLVulkan = 7, 52 | /*! \brief Metal for Apple GPU. */ 53 | kDLMetal = 8, 54 | /*! 
\brief Verilog simulator buffer */ 55 | kDLVPI = 9, 56 | /*! \brief ROCm GPUs for AMD GPUs */ 57 | kDLROCM = 10, 58 | /*! 59 | * \brief Reserved extension device type, 60 | * used for quickly test extension device 61 | * The semantics can differ depending on the implementation. 62 | */ 63 | kDLExtDev = 12, 64 | } DLDeviceType; 65 | 66 | /*! 67 | * \brief A Device context for Tensor and operator. 68 | */ 69 | typedef struct { 70 | /*! \brief The device type used in the device. */ 71 | DLDeviceType device_type; 72 | /*! \brief The device index */ 73 | int device_id; 74 | } DLContext; 75 | 76 | /*! 77 | * \brief The type code options DLDataType. 78 | */ 79 | typedef enum { 80 | kDLInt = 0U, 81 | kDLUInt = 1U, 82 | kDLFloat = 2U, 83 | kDLBfloat = 4U, 84 | } DLDataTypeCode; 85 | 86 | /*! 87 | * \brief The data type the tensor can hold. 88 | * 89 | * Examples 90 | * - float: type_code = 2, bits = 32, lanes=1 91 | * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 92 | * - int8: type_code = 0, bits = 8, lanes=1 93 | */ 94 | typedef struct { 95 | /*! 96 | * \brief Type code of base types. 97 | * We keep it uint8_t instead of DLDataTypeCode for minimal memory 98 | * footprint, but the value should be one of DLDataTypeCode enum values. 99 | * */ 100 | uint8_t code; 101 | /*! 102 | * \brief Number of bits, common choices are 8, 16, 32. 103 | */ 104 | uint8_t bits; 105 | /*! \brief Number of lanes in the type, used for vector types. */ 106 | uint16_t lanes; 107 | } DLDataType; 108 | 109 | /*! 110 | * \brief Plain C Tensor object, does not manage memory. 111 | */ 112 | typedef struct { 113 | /*! 114 | * \brief The opaque data pointer points to the allocated data. This will be 115 | * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always 116 | * aligned to 256 bytes as in CUDA. 
 * NOTE(review): the 256-byte-alignment sentence above is a device-memory convention; in this repo main() in decode_audio_ffmpeg.c also points `data` at a plain stack buffer, which carries no such alignment guarantee.
117 | * 118 | * For given DLTensor, the size of memory required to store the contents of 119 | * data is calculated as follows: 120 | * 121 | * \code{.c} 122 | * static inline size_t GetDataSize(const DLTensor* t) { 123 | * size_t size = 1; 124 | * for (tvm_index_t i = 0; i < t->ndim; ++i) { 125 | * size *= t->shape[i]; 126 | * } 127 | * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; 128 | * return size; 129 | * } 130 | * \endcode 131 | */ 132 | void* data; 133 | /*! \brief The device context of the tensor */ 134 | DLContext ctx; 135 | /*! \brief Number of dimensions */ 136 | int ndim; 137 | /*! \brief The data type of the pointer*/ 138 | DLDataType dtype; 139 | /*! \brief The shape of the tensor */ 140 | int64_t* shape; 141 | /*! 142 | * \brief strides of the tensor (in number of elements, not bytes) 143 | * can be NULL, indicating tensor is compact and row-majored. 144 | */ 145 | int64_t* strides; 146 | /*! \brief The offset in bytes to the beginning pointer to data */ 147 | uint64_t byte_offset; 148 | } DLTensor; 149 | 150 | /*! 151 | * \brief C Tensor object, manage memory of DLTensor. This data structure is 152 | * intended to facilitate the borrowing of DLTensor by another framework. It is 153 | * not meant to transfer the tensor. When the borrowing framework doesn't need 154 | * the tensor, it should call the deleter to notify the host that the resource 155 | * is no longer needed. 156 | */ 157 | typedef struct DLManagedTensor { 158 | /*! \brief DLTensor which is being memory managed */ 159 | DLTensor dl_tensor; 160 | /*! \brief the context of the original host framework of DLManagedTensor in 161 | * which DLManagedTensor is used in the framework. It can also be NULL. 162 | */ 163 | void * manager_ctx; 164 | /*! \brief Destructor signature void (*)(void*) - this should be called 165 | * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL 166 | * if there is no way for the caller to provide a reasonable destructor. 
167 | * The destructor deletes the argument self as well. 168 | */ 169 | void (*deleter)(struct DLManagedTensor * self); 170 | } DLManagedTensor; 171 | #ifdef __cplusplus 172 | } // DLPACK_EXTERN_C 173 | #endif 174 | #endif // DLPACK_DLPACK_H_ 175 | --------------------------------------------------------------------------------