├── .gitignore ├── CMakeLists.txt ├── Copyright.txt ├── LICENSE ├── README.md ├── audio-0.1-0.rockspec ├── audio.c ├── generic ├── audio.c └── sox.c ├── init.lua ├── sox.c ├── test ├── load_and_save_example.lua ├── spectrogram.lua └── test_decompress.lua └── voice.mp3 /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR) 2 | CMAKE_POLICY(VERSION 2.8) 3 | 4 | FIND_PACKAGE(Torch REQUIRED) 5 | 6 | # libsox headers and library. REQUIRED is only recognised as a FIND_LIBRARY keyword from CMake 3.18 on, 7 | # so the results are checked explicitly to fail early with a clear message on every CMake version. 8 | FIND_PATH(SOX_INCLUDE_DIR sox.h 9 | "/usr/include/sox/") 10 | FIND_LIBRARY(SOX_LIBRARIES sox) 11 | message ("SOX_INCLUDE_DIR: ${SOX_INCLUDE_DIR}") 12 | message ("SOX_LIBRARIES: ${SOX_LIBRARIES}") 13 | IF(NOT SOX_INCLUDE_DIR OR NOT SOX_LIBRARIES) 14 | MESSAGE(FATAL_ERROR "libsox not found (sox.h / libsox); install the libsox development package") 15 | ENDIF() 16 | 17 | # FFTW3 headers and library, checked the same way. 18 | FIND_PATH(FFTW_INCLUDE_DIR fftw3.h) 19 | FIND_LIBRARY(FFTW_LIBRARIES fftw3) 20 | message ("FFTW_INCLUDE_DIR: ${FFTW_INCLUDE_DIR}") 21 | message ("FFTW_LIBRARIES: ${FFTW_LIBRARIES}") 22 | IF(NOT FFTW_INCLUDE_DIR OR NOT FFTW_LIBRARIES) 23 | MESSAGE(FATAL_ERROR "FFTW3 not found (fftw3.h / libfftw3); install the fftw3 development package") 24 | ENDIF() 25 | 26 | # "sox" Torch package built from sox.c; luasrc is not set by this file before this point. 27 | SET(src sox.c) 28 | include_directories (${SOX_INCLUDE_DIR}) 29 | ADD_TORCH_PACKAGE(sox "${src}" "${luasrc}" "Audio Processing") 30 | TARGET_LINK_LIBRARIES(sox luaT TH ${SOX_LIBRARIES}) 31 | 32 | # "audio" Torch package built from audio.c, shipped with init.lua and the sample voice.mp3. 33 | include_directories (${FFTW_INCLUDE_DIR}) 34 | SET(src audio.c) 35 | SET(luasrc init.lua voice.mp3) 36 | ADD_TORCH_PACKAGE(audio "${src}" "${luasrc}" "Audio Processing") 37 | TARGET_LINK_LIBRARIES(audio luaT TH ${SOX_LIBRARIES} ${FFTW_LIBRARIES}) 38 | -------------------------------------------------------------------------------- /Copyright.txt: -------------------------------------------------------------------------------- 1 | =============================================================================== 2 | 3 | lua---audio -- http://github.com/soumith/lua---audio 4 | 5 | Copyright (c) 2012 Soumith Chintala 6 | 7 | All rights reserved. 
8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | 1. Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | 15 | 2. Redistributions in binary form must reproduce the above copyright 16 | notice, this list of conditions and the following disclaimer in the 17 | documentation and/or other materials provided with the distribution. 18 | 19 | 3. The names of the software's contributors may not be used to endorse or 20 | promote products derived from this software without specific prior 21 | written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 27 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 | POSSIBILITY OF SUCH DAMAGE. 34 | 35 | =============================================================================== 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Romantic WTF public license. 2 | -------------------------------- 3 | a.k.a. 
version "<3" or simply v3 4 | 5 | Dear user, 6 | 7 | The Romantic WTF Public License a.k.a version "<3" 8 | 9 | \ 10 | '.,__ 11 | \ / 12 | '/,__ 13 | / 14 | / 15 | / 16 | has been / released 17 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 18 | under the Romantic WTF Public License. 19 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~`,´ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 20 | I hereby grant you an irrevocable license to 21 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 22 | do what the gentle caress you want to 23 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 24 | with this lovely 25 | ~ ~ ~ ~ ~ ~ ~ ~ 26 | / license. 27 | / ~ ~ ~ ~ 28 | / Love, 29 | # / '.' 30 | ####### · 31 | ##### 32 | ### 33 | # 34 | 35 | -- Soumith Chintala. 36 | 37 | 38 | P.S.: Even though I poured my heart into this work, 39 | I _cannot_ provide any warranty regarding 40 | its fitness for _any_ purpose. You 41 | acknowledge that I will not be held liable 42 | for any damage its use could incur. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Audio Library for Torch 2 | ======================= 3 | 4 | Audio library for Torch-7 5 | * Support audio I/O (Load files, save files) 6 | * Common audio operations (Short-time Fourier transforms, Spectrograms) 7 | 8 | Load the following formats into a torch Tensor 9 | * mp3, wav, aac, ogg, flac, avr, cdda, cvs/vms, 10 | * aiff, au, amr, mp2, mp4, ac3, avi, wmv, 11 | * mpeg, ircam and any other format supported by libsox. 
12 | 13 | Calculate Short-time Fourier transforms with 14 | * window types - rectangular, hamming, hann, bartlett 15 | 16 | Generate spectrograms 17 | 18 | Dependencies 19 | ------------ 20 | * libsox v14.3.2 or above 21 | * libfftw3 22 | 23 | Quick install on 24 | OSX (Homebrew): 25 | ```bash 26 | $ brew install sox 27 | $ brew install fftw 28 | ``` 29 | Linux (Ubuntu): 30 | ```bash 31 | $ sudo apt-get install libfftw3-dev 32 | $ sudo apt-get install sox libsox-dev libsox-fmt-all 33 | ``` 34 | 35 | Installation 36 | ------------ 37 | This project can be installed with `luarocks` like this: 38 | 39 | ```bash 40 | $ luarocks install https://raw.githubusercontent.com/soumith/lua---audio/master/audio-0.1-0.rockspec 41 | ``` 42 | 43 | On Ubuntu 13.04 64-bit, I had to modify the command slightly because of new library directory structures not picked up by luarocks. 44 | ```bash 45 | $ sudo luarocks install https://raw.githubusercontent.com/soumith/lua---audio/master/audio-0.1-0.rockspec LIBSOX_LIBDIR=/usr/lib/x86_64-linux-gnu/ LIBFFTW3_LIBDIR=/usr/lib/x86_64-linux-gnu 46 | ``` 47 | 48 | Or, if you have downloaded this repository on your machine, and 49 | you are in its directory: 50 | 51 | ```bash 52 | $ luarocks make 53 | ``` 54 | 55 | Usage 56 | ===== 57 | audio.load 58 | ``` 59 | loads an audio file into a torch.Tensor 60 | usage: 61 | audio.load( 62 | string -- path to file 63 | ) 64 | 65 | returns torch.Tensor of size NSamples x NChannels, sample_rate 66 | ``` 67 | 68 | audio.save 69 | ``` 70 | saves a tensor into an audio file. The extension of the given path is used as the saving format. 71 | usage: 72 | audio.save( 73 | string -- path to file 74 | tensor -- NSamples x NChannels 2D tensor 75 | number -- sample_rate of the audio to be saved as 76 | ) 77 | ``` 78 | 79 | audio.compress 80 | ``` 81 | Compresses a tensor in-memory and returns a CharTensor. The given extension (third argument) is used as the compression format. 
This can be decompressed using the "decompress" method 82 | usage: 83 | audio.compress( 84 | tensor -- NSamples x NChannels 2D tensor 85 | number -- sample_rate of the audio to be saved as 86 | extension -- format of audio to compress in. Example: mp3, ogg, flac, sox etc. 87 | ) 88 | ``` 89 | 90 | audio.decompress 91 | ``` 92 | Decompresses a tensor in-memory and returns raw audio. The given extension is used as the loading format. 93 | usage: 94 | audio.decompress( 95 | CharTensor -- 1D CharTensor that was returned by .compress 96 | extension -- format of audio used to compress. Example: mp3, ogg, flac, sox etc. 97 | ) 98 | ``` 99 | 100 | audio.stft 101 | ``` 102 | calculate the stft of an audio. returns a 3D tensor, with number_of_windows x window_size/2+1 x 2 (complex number with real and imaginary parts) 103 | usage: 104 | audio.stft( 105 | torch.Tensor -- input single-channel audio 106 | number -- window size 107 | string -- window type: rect, hamming, hann, bartlett 108 | number -- stride 109 | ) 110 | ``` 111 | 112 | audio.spectrogram 113 | ``` 114 | generate the spectrogram of an audio. 
returns a 2D tensor, with window_size/2+1 x number_of_windows, each value representing the magnitude of each frequency in dB 115 | usage: 116 | audio.spectrogram( 117 | torch.Tensor -- input single-channel audio 118 | number -- window size 119 | string -- window type: rect, hamming, hann, bartlett 120 | number -- stride 121 | ) 122 | ``` 123 | 124 | Example Usage 125 | ------------- 126 | Generate a spectrogram 127 | ```lua 128 | require 'audio' 129 | require 'image' -- to display the spectrogram 130 | voice = audio.samplevoice() 131 | spect = audio.spectrogram(voice, 8192, 'hann', 512) 132 | image.display(spect) 133 | ``` 134 | -------------------------------------------------------------------------------- /audio-0.1-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "audio" 2 | version = "0.1-0" 3 | 4 | source = { 5 | url = "git://github.com/soumith/lua---audio", 6 | tag = "master" 7 | } 8 | 9 | description = { 10 | summary = "Audio library for Torch-7", 11 | detailed = [[ 12 | Support audio I/O (Load files) 13 | Common audio operations (Short-time Fourier transforms, Spectrograms) 14 | ]], 15 | homepage = "https://github.com/soumith/lua---audio", 16 | license = "RWTFPL" 17 | } 18 | 19 | dependencies = { 20 | "torch >= 7.0", 21 | "sys >= 1.0", 22 | "xlua >= 1.0" 23 | } 24 | 25 | build = { 26 | type = "command", 27 | build_command = [[ 28 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 29 | ]], 30 | install_command = "cd build && $(MAKE) install" 31 | } 32 | -------------------------------------------------------------------------------- /audio.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #if LUA_VERSION_NUM >= 503 12 | #define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n))) 13 | #define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n))) 14 | #endif 15 | 16 | void abort_(const char * s, ...) 17 | { 18 | va_list args; 19 | va_start(args, s); 20 | vfprintf(stderr, s, args); 21 | fprintf(stderr, "\n"); 22 | va_end(args); 23 | abort(); 24 | } 25 | 26 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 27 | #define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor) 28 | #define audio_(NAME) TH_CONCAT_3(audio_, Real, NAME) 29 | 30 | #include "generic/audio.c" 31 | #include "THGenerateAllTypes.h" 32 | 33 | DLL_EXPORT int luaopen_libaudio(lua_State *L) 34 | { 35 | audio_ByteMain_init(L); 36 | audio_CharMain_init(L); 37 | audio_ShortMain_init(L); 38 | audio_IntMain_init(L); 39 | audio_LongMain_init(L); 40 | audio_FloatMain_init(L); 41 | audio_DoubleMain_init(L); 42 | 43 | lua_newtable(L); 44 | lua_pushvalue(L, -1); 45 | lua_setglobal(L, "audio"); 46 | 47 | lua_newtable(L); 48 | luaT_setfuncs(L, audio_DoubleMain__, 0); 49 | lua_setfield(L, -2, "double"); 50 | 51 | lua_newtable(L); 52 | luaT_setfuncs(L, audio_FloatMain__, 0); 53 | lua_setfield(L, -2, "float"); 54 | 55 | lua_newtable(L); 56 | luaT_setfuncs(L, audio_ByteMain__, 0); 57 | lua_setfield(L, -2, "byte"); 58 | 59 | lua_newtable(L); 60 | luaT_setfuncs(L, audio_CharMain__, 0); 61 | lua_setfield(L, -2, "char"); 62 | 63 | lua_newtable(L); 64 | luaT_setfuncs(L, audio_ShortMain__, 0); 65 | lua_setfield(L, -2, "short"); 66 | 67 | lua_newtable(L); 68 | luaT_setfuncs(L, audio_IntMain__, 0); 69 | 
lua_setfield(L, -2, "int"); 70 | 71 | lua_newtable(L); 72 | luaT_setfuncs(L, audio_LongMain__, 0); 73 | lua_setfield(L, -2, "long"); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /generic/audio.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/audio.c" 3 | #else 4 | 5 | #undef TAPI 6 | #define TAPI __declspec(dllimport) 7 | 8 | /* ---------------------------------------------------------------------- */ 9 | /* -- */ 10 | /* -- Copyright (c) 2012 Soumith Chintala */ 11 | /* -- */ 12 | /* -- Permission is hereby granted, free of charge, to any person obtaining */ 13 | /* -- a copy of this software and associated documentation files (the */ 14 | /* -- "Software"), to deal in the Software without restriction, including */ 15 | /* -- without limitation the rights to use, copy, modify, merge, publish, */ 16 | /* -- distribute, sublicense, and/or sell copies of the Software, and to */ 17 | /* -- permit persons to whom the Software is furnished to do so, subject to */ 18 | /* -- the following conditions: */ 19 | /* -- */ 20 | /* -- The above copyright notice and this permission notice shall be */ 21 | /* -- included in all copies or substantial portions of the Software. */ 22 | /* -- */ 23 | /* -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ 24 | /* -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ 25 | /* -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ 26 | /* -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE */ 27 | /* -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION */ 28 | /* -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */ 29 | /* -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ 30 | /* -- */ 31 | /* ---------------------------------------------------------------------- */ 32 | /* -- description: */ 33 | /* -- audio.c - general purpose audio transforms for Torch-7 */ 34 | /* -- */ 35 | /* -- history: */ 36 | /* -- May 26th, 2012, 5:46PM - wrote stft and channel 37 | flattening functions - Soumith Chintala */ 38 | /* ---------------------------------------------------------------------- */ 39 | 40 | 41 | // write audio.toMono() which converts a multi-channel audio to single channel 42 | 43 | #ifndef M_PI 44 | #define M_PI 3.14159265358979323846264338327950288 45 | #endif 46 | 47 | // Apply an edge-smoothing function to make the signal periodic. 48 | // Different window types are provided 49 | static inline void audio_(apply_window)(double *input, 50 | long window_size, int window_type) { 51 | long i, m = window_size -1; 52 | switch (window_type) { 53 | case 1: // Rectangular Window, do nothing 54 | break; 55 | case 2: // Hamming Window 56 | for (i = 0; i < window_size; ++i) 57 | input[i] *= .53836 - .46164 * cos(2 * M_PI * i / m); 58 | break; 59 | case 3: // Hann Window 60 | for (i = 0; i < window_size; ++i) 61 | input[i] *= .5 - .5 * cos(2 * M_PI * i / m); 62 | break; 63 | case 4: // Bartlett Window 64 | for (i = 0; i < window_size; ++i) 65 | input[i] *= 2. / m * (m / 2. 
- fabs(i - m / 2.)); 66 | break; 67 | default: 68 | THError("[stft_generic] Unknown window_type"); 69 | break; 70 | } 71 | } 72 | 73 | //////////////////////////////////////////////////////////////////////////// 74 | // generic short-time fourier transform function that supports multiple window types 75 | // arguments [tensor, window-size, window-type, hop-size/stride] 76 | // window_type [1, 2, 3, 4] for [rectangular, hamming, hann, bartlett] 77 | static THTensor * audio_(stft_generic)(THTensor *input, 78 | long window_size, int window_type, 79 | long stride) 80 | { 81 | const long length = input->size[0]; 82 | long nChannels = 1; 83 | 84 | if (THTensor_(nDimension)(input) > 1) 85 | nChannels = input->size[1]; 86 | 87 | if (nChannels > 1) 88 | THError("[stft_generic] Multi-channel stft not supported"); 89 | 90 | real *input_data = THTensor_(data)(input); 91 | const long nwindows = ((length - window_size)/stride) + 1; 92 | const long noutput = window_size/2 + 1; 93 | THTensor *output = THTensor_(newWithSize3d)(nwindows, noutput, 2); 94 | real *output_data = THTensor_(data)(output); 95 | double *buffer = malloc(sizeof(double) * window_size); 96 | fftw_complex *fbuffer = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*noutput); 97 | long index, k, outindex=0; 98 | 99 | fftw_plan plan = fftw_plan_dft_r2c_1d(window_size, buffer, fbuffer, FFTW_ESTIMATE); 100 | 101 | // loop over the input. get a buffer. apply window. call stft. repeat with stride. 
102 | for (index = 0; index + window_size <= length; index = index + stride) { 103 | for (k=0; ksignal.channels; 40 | long buffer_size = fd->signal.length; 41 | if (buffer_size == 0) { 42 | if (nsamples != -1) { 43 | buffer_size = nsamples; 44 | } else { 45 | THError("[read_audio] Unknown length"); 46 | } 47 | } 48 | *sample_rate = (int) fd->signal.rate; 49 | int32_t *buffer = (int32_t *)malloc(sizeof(int32_t) * buffer_size); 50 | size_t samples_read = sox_read(fd, buffer, buffer_size); 51 | if (samples_read == 0) 52 | THError("[read_audio] Empty file or read failed in sox_read"); 53 | // alloc tensor 54 | THTensor_(resize2d)(tensor, samples_read / nchannels, nchannels ); 55 | real *tensor_data = THTensor_(data)(tensor); 56 | // convert audio to dest tensor 57 | int x,k; 58 | for (x=0; xsize[1]; 98 | long nsamples = src->size[0]; 99 | real* data = THTensor_(data)(src); 100 | 101 | // convert audio to dest tensor 102 | int x,k; 103 | for (x=0; xsize[1]; 120 | long nsamples = src->size[0]; 121 | 122 | sox_format_t *fd; 123 | 124 | // Create sox objects and write into int32_t buffer 125 | sox_signalinfo_t sinfo; 126 | sinfo.rate = sample_rate; 127 | sinfo.channels = nchannels; 128 | sinfo.length = nsamples * nchannels; 129 | sinfo.precision = sizeof(int32_t) * 8; /* precision in bits */ 130 | #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0 131 | sinfo.mult = NULL; 132 | #endif 133 | fd = sox_open_write(file_name, &sinfo, NULL, extension, NULL, NULL); 134 | if (fd == NULL) 135 | THError("[write_audio_file] Failure to open file for writing"); 136 | 137 | libsox_(write_audio)(fd, src, extension, sample_rate); 138 | 139 | // free buffer and sox structures 140 | sox_close(fd); 141 | 142 | return; 143 | } 144 | 145 | void libsox_(write_audio_memory)(THCharTensor* out, THTensor* src, 146 | const char *extension, int sample_rate) 147 | { 148 | if (THTensor_(isContiguous)(src) == 0) 149 | THError("[write_audio_file] Input should be contiguous tensors"); 150 | 151 | long 
nchannels = src->size[1]; 152 | long nsamples = src->size[0]; 153 | 154 | sox_format_t *fd; 155 | char *buffer = NULL; 156 | size_t buffer_size = -1; 157 | 158 | // Create sox objects and write into int32_t buffer 159 | sox_signalinfo_t sinfo; 160 | sinfo.rate = sample_rate; 161 | sinfo.channels = nchannels; 162 | sinfo.length = nsamples * nchannels; 163 | sinfo.precision = sizeof(int32_t) * 8; /* precision in bits */ 164 | #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0 165 | sinfo.mult = NULL; 166 | #endif 167 | fd = sox_open_memstream_write(&buffer, &buffer_size, &sinfo, NULL, extension, NULL); 168 | if (fd == NULL) 169 | THError("[write_audio_memory] Failure to open sox object for writing"); 170 | 171 | libsox_(write_audio)(fd, src, extension, sample_rate); 172 | 173 | // free sox structures 174 | sox_close(fd); 175 | 176 | // write the number of samples as well. to get around a SOX bug for certain formats. 177 | int64_t olength = nsamples * nchannels; 178 | size_t out_size = buffer_size + 8; 179 | char * out_data = (char*) malloc(out_size); 180 | 181 | /* TODO: investigate why if I create a storage and memcpy over, it's segfaulting */ 182 | // THCharStorage* out_storage = THCharStorage_newWithSize1(out_size); 183 | // char* out_data = THCharStorage_data(out_storage); 184 | 185 | // write the actual data after an offset of int64_t 186 | memcpy(out_data + 8, buffer, buffer_size); 187 | memcpy(out_data, &olength, 8); 188 | 189 | THCharStorage* out_storage = THCharStorage_newWithData(out_data, out_size); 190 | 191 | THCharTensor_setStorage1d(out, out_storage, 0, out_size, 1); 192 | 193 | // free buffers and stuff 194 | free(buffer); 195 | 196 | return; 197 | } 198 | 199 | static int libsox_(Main_load)(lua_State *L) { 200 | const char *filename = luaL_checkstring(L, 1); 201 | THTensor *tensor = THTensor_(new)(); 202 | int sample_rate = 0; 203 | libsox_(read_audio_file)(filename, tensor, &sample_rate); 204 | luaT_pushudata(L, tensor, torch_Tensor); 205 | 
lua_pushnumber(L, (double) sample_rate); 206 | return 2; 207 | } 208 | 209 | static int libsox_(Main_decompress)(lua_State *L) { 210 | THCharTensor *inp = luaT_checkudata(L, 1, "torch.CharTensor"); 211 | const char *extension = luaL_checkstring(L, 2); 212 | THTensor *tensor = THTensor_(new)(); 213 | int sample_rate = 0; 214 | libsox_(read_audio_memory)(inp, tensor, &sample_rate, extension); 215 | luaT_pushudata(L, tensor, torch_Tensor); 216 | lua_pushnumber(L, (double) sample_rate); 217 | return 2; 218 | } 219 | 220 | static int libsox_(Main_save)(lua_State *L) { /* Lua args: filename, tensor, extension, sample_rate; returns nothing */ 221 | const char *filename = luaL_checkstring(L, 1); 222 | THTensor *tensor = luaT_checkudata(L, 2, torch_Tensor); 223 | const char *extension = luaL_checkstring(L, 3); 224 | int sample_rate = luaL_checkint(L, 4); 225 | libsox_(write_audio_file)(filename, tensor, extension, sample_rate); 226 | return 0; /* no result is pushed; returning 1 handed the last argument back to Lua */ 227 | } 228 | 229 | static int libsox_(Main_compress)(lua_State *L) { /* Lua args: out CharTensor, src tensor, extension, sample_rate; result is written into out */ 230 | THCharTensor *out = luaT_checkudata(L, 1, "torch.CharTensor"); 231 | THTensor *src = luaT_checkudata(L, 2, torch_Tensor); 232 | const char *extension = luaL_checkstring(L, 3); 233 | int sample_rate = luaL_checkint(L, 4); 234 | libsox_(write_audio_memory)(out, src, extension, sample_rate); 235 | return 0; /* no result is pushed; the caller reads the compressed bytes from out */ 236 | } 237 | 238 | static const luaL_Reg libsox_(Main__)[] = 239 | { 240 | {"load", libsox_(Main_load)}, 241 | {"save", libsox_(Main_save)}, 242 | {"compress", libsox_(Main_compress)}, 243 | {"decompress", libsox_(Main_decompress)}, 244 | {NULL, NULL} 245 | }; 246 | 247 | DLL_EXPORT int libsox_(Main_init)(lua_State *L) 248 | { 249 | luaT_pushmetatable(L, torch_Tensor); 250 | luaT_registeratname(L, libsox_(Main__), "libsox"); 251 | // Initialize sox library 252 | sox_format_init(); 253 | return 1; 254 | } 255 | 256 | #endif 257 | -------------------------------------------------------------------------------- /init.lua: -------------------------------------------------------------------------------- 1 | 
---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2012 Soumith Chintala 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- audio - an audio toolBox, for Torch 27 | -- 28 | -- history: 29 | -- May 24th, 2012, 7:28PM - wrote sox wrappers - Soumith Chintala 30 | ---------------------------------------------------------------------- 31 | 32 | require 'torch' 33 | require 'sys' 34 | require 'xlua' 35 | require 'dok' 36 | require 'paths' 37 | require 'libaudio' 38 | 39 | ---------------------------------------------------------------------- 40 | -- load from multiple formats 41 | -- 42 | local function load(filename) 43 | if not filename then 44 | print(dok.usage('audio.load', 45 | 'loads an audio file into a torch.Tensor', nil, 46 | {type='string', help='path to file', req=true})) 47 | dok.error('missing file name', 'audio.load') 48 | end 49 | if not paths.filep(filename) then 50 | dok.error('Specified filename: ' .. filename .. ' not found', 'audio.load') 51 | end 52 | local tensor 53 | if not xlua.require 'libsox' then 54 | dok.error('libsox package not found, please install libsox','audio.load') 55 | end 56 | local a, sample_rate = torch.Tensor().libsox.load(filename) 57 | return a, sample_rate 58 | end 59 | rawset(audio, 'load', load) 60 | -------------------------------------------------------------------------- 61 | -- save to multiple formats 62 | local function save(filename, src, sample_rate) 63 | if not filename or not src then 64 | error('filename or src tensor missing') 65 | end 66 | if not xlua.require 'libsox' then 67 | dok.error('libsox package not found, please install libsox','audio.save') 68 | end 69 | local extension = paths.extname(filename) 70 | assert(extension, 'did not find extension (like .wav or .mp3) in filename. ' 71 | .. 
'Give a filename with an extension, for example: hello.wav') 72 | assert(sample_rate and type(sample_rate) == 'number', 73 | 'provide a sample rate (a number) such as 22050') 74 | src.libsox.save(filename, src, extension, sample_rate) 75 | end 76 | rawset(audio, 'save', save) 77 | -------------------------------------------------------------------------- 78 | -- compress 79 | -- save to multiple formats 80 | function audio.compress(src, sample_rate, extension) 81 | if not src then 82 | error('src tensor missing') 83 | end 84 | assert(sample_rate and type(sample_rate) == 'number', 85 | 'provide a sample rate (a number) such as 22050') 86 | if not xlua.require 'libsox' then 87 | dok.error('libsox package not found, please install libsox','audio.compress') 88 | end 89 | local out = torch.CharTensor() 90 | src.libsox.compress(out, src, extension, sample_rate) 91 | return out 92 | end 93 | 94 | -- decompress 95 | function audio.decompress(src, extension) 96 | if not src then 97 | error('src tensor missing') 98 | end 99 | if not extension then 100 | error('extension string missing') 101 | end 102 | if not xlua.require 'libsox' then 103 | dok.error('libsox package not found, please install libsox','audio.decompress') 104 | end 105 | local a, sample_rate = torch.Tensor().libsox.decompress(src, extension) 106 | return a, sample_rate 107 | end 108 | 109 | -- compressMP3 110 | function audio.compressMP3(src, sample_rate) 111 | return audio.compress(src, sample_rate, 'mp3') 112 | end 113 | function audio.compressOGG(src, sample_rate) 114 | return audio.compress(src, sample_rate, 'ogg') 115 | end 116 | 117 | function audio.decompressMP3(src) 118 | return audio.decompress(src, 'mp3') 119 | end 120 | function audio.decompressOGG(src) 121 | return audio.decompress(src, 'ogg') 122 | end 123 | ---------------------------------------------------------------------- 124 | -- spectrogram 125 | -- 126 | local function spectrogram(...) 
127 | local output, input, window_size, window_type, stride 128 | local args = {...} 129 | if select('#',...) == 4 then 130 | input = args[1] 131 | window_size = args[2] 132 | window_type = args[3] 133 | stride = args[4] 134 | else 135 | print(dok.usage('audio.spectrogram', 136 | 'generate the spectrogram of an audio. ' 137 | .. 'returns a 2D tensor, with ' 138 | .. 'number_of_windows x window_size/2+1, ' 139 | .. 'each value representing the magnitude of ' 140 | .. 'each frequency in dB', nil, 141 | {type='torch.Tensor', 142 | help='input single-channel audio', req=true}, 143 | {type='number', help='window size', req=true}, 144 | {type='string', 145 | help='window type: rect, hamming, hann, bartlett' , req=true}, 146 | {type='number', help='stride', req=true})) 147 | dok.error('incorrect arguments', 'audio.spectrogram') 148 | end 149 | 150 | -- calculate stft 151 | local stftout = audio.stft(input, window_size, window_type, stride) 152 | 153 | -- calculate magnitude of signal and convert to dB to make it look prettier 154 | local stftout_r = stftout:select(3,1) 155 | local stftout_c = stftout:select(3,2) 156 | stftout_r:pow(2) 157 | stftout_c:pow(2) 158 | local stftout_magnitude = stftout_r + stftout_c 159 | stftout_magnitude = stftout_magnitude + 0.01 -- adding constant to avoid log(0) 160 | output = stftout_magnitude:log() * 10 161 | return output:transpose(1,2) 162 | end 163 | rawset(audio, 'spectrogram', spectrogram) 164 | 165 | local function stft(...) 166 | local output, input, window_size, window_type, stride 167 | local args = {...} 168 | if select('#',...) == 4 then 169 | input = args[1] 170 | window_size = args[2] 171 | window_type = args[3] 172 | stride = args[4] 173 | else 174 | print(dok.usage('audio.stft', 175 | 'calculate the stft of an audio. ' 176 | .. 'returns a 3D tensor, with ' 177 | .. 'number_of_windows x window_size/2+1 x 2 ' 178 | .. 
' (complex number with real and complex parts)', nil, 179 | {type='torch.Tensor', 180 | help='input single-channel audio', req=true}, 181 | {type='number', help='window size', req=true}, 182 | {type='string', 183 | help='window type: rect, hamming, hann, bartlett' , req=true}, 184 | {type='number', help='stride', req=true})) 185 | dok.error('incorrect arguments', 'audio.stft') 186 | end 187 | local window_type_id; 188 | if window_type == 'rect' then 189 | window_type_id = 1 190 | elseif window_type == 'hamming' then 191 | window_type_id = 2 192 | elseif window_type == 'hann' then 193 | window_type_id = 3 194 | elseif window_type == 'bartlett' then 195 | window_type_id = 4 196 | end 197 | -- calculate stft 198 | output = torch.Tensor().audio.stft(input, window_size, window_type_id, stride) 199 | return output 200 | end 201 | rawset(audio, 'stft', stft) 202 | 203 | local function cqt(...) 204 | local output, input, fmin, fmax, bins_per_octave, sample_rate 205 | local args = {...} 206 | if select('#',...) == 5 then -- positional order: input, fmin, fmax, bins_per_octave, sample_rate 207 | input = args[1] 208 | fmin = args[2] 209 | fmax = args[3] 210 | bins_per_octave = args[4] 211 | sample_rate = args[5] 212 | else 213 | print(dok.usage('audio.cqt', 214 | 'calculate the constant-Q transformed audio signal. 
returns a [TODO: fill this description]', nil, 215 | {type='torch.Tensor', help='input single-channel audio', req=true}, 216 | {type='number', help='lowest frequency of interest', req=true}, 217 | {type='number', help='highest frequency of interest', req=true}, 218 | {type='number', help='frequency bins per octave', req=true}, 219 | {type='number', help='sampling rate of the input', req=true})) 220 | dok.error('incorrect arguments', 'audio.cqt') 221 | end 222 | -- calculate cqt 223 | output = torch.Tensor().audio.cqt(input, fmin, fmax, bins_per_octave, sample_rate) 224 | return output 225 | end 226 | rawset(audio, 'cqt', cqt) 227 | 228 | 229 | ---------------------------------------------------------------------- 230 | -- loads voice.mp3 that is included with the repo 231 | local function samplevoice() 232 | local fname = 'voice.mp3' 233 | local voice = audio.load(sys.concat(sys.fpath(), fname)) 234 | return voice 235 | end 236 | rawset(audio, 'samplevoice', samplevoice) 237 | 238 | return audio 239 | -------------------------------------------------------------------------------- /sox.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #if LUA_VERSION_NUM >= 503 12 | #define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n))) 13 | #define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n))) 14 | #endif 15 | 16 | void abort_(const char * s, ...) 
17 | { 18 | va_list args; 19 | va_start(args, s); 20 | vfprintf(stderr, s, args); 21 | fprintf(stderr, "\n"); 22 | va_end(args); 23 | abort(); 24 | } 25 | 26 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 27 | #define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor) 28 | #define libsox_(NAME) TH_CONCAT_3(libsox_, Real, NAME) 29 | 30 | #include "generic/sox.c" 31 | #include "THGenerateAllTypes.h" 32 | 33 | DLL_EXPORT int luaopen_libsox(lua_State *L) 34 | { 35 | libsox_ByteMain_init(L); 36 | libsox_CharMain_init(L); 37 | libsox_ShortMain_init(L); 38 | libsox_IntMain_init(L); 39 | libsox_LongMain_init(L); 40 | libsox_FloatMain_init(L); 41 | libsox_DoubleMain_init(L); 42 | 43 | lua_newtable(L); 44 | lua_pushvalue(L, -1); 45 | lua_setglobal(L, "libsox"); 46 | 47 | lua_newtable(L); 48 | luaT_setfuncs(L, libsox_DoubleMain__, 0); 49 | lua_setfield(L, -2, "double"); 50 | 51 | lua_newtable(L); 52 | luaT_setfuncs(L, libsox_FloatMain__, 0); 53 | lua_setfield(L, -2, "float"); 54 | 55 | lua_newtable(L); 56 | luaT_setfuncs(L, libsox_ByteMain__, 0); 57 | lua_setfield(L, -2, "byte"); 58 | 59 | lua_newtable(L); 60 | luaT_setfuncs(L, libsox_CharMain__, 0); 61 | lua_setfield(L, -2, "char"); 62 | 63 | lua_newtable(L); 64 | luaT_setfuncs(L, libsox_ShortMain__, 0); 65 | lua_setfield(L, -2, "short"); 66 | 67 | lua_newtable(L); 68 | luaT_setfuncs(L, libsox_IntMain__, 0); 69 | lua_setfield(L, -2, "int"); 70 | 71 | lua_newtable(L); 72 | luaT_setfuncs(L, libsox_LongMain__, 0); 73 | lua_setfield(L, -2, "long"); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /test/load_and_save_example.lua: -------------------------------------------------------------------------------- 1 | require 'audio' 2 | 3 | t, sample_rate = audio.load('voice.mp3') 4 | print(#t) 5 | print(sample_rate) 6 | audio.save('test.wav', t, sample_rate) 7 | 8 | -------------------------------------------------------------------------------- 
/test/spectrogram.lua: -------------------------------------------------------------------------------- 1 | require 'audio' 2 | require 'image' -- to display the spectrogram 3 | voice = audio.samplevoice() 4 | spect = audio.spectrogram(voice, 8192, 'hann', 512) 5 | image.display(spect) 6 | -------------------------------------------------------------------------------- /test/test_decompress.lua: -------------------------------------------------------------------------------- 1 | require 'audio' 2 | m=audio.samplevoice() 3 | print(m:nElement()) 4 | print('ok') 5 | o = audio.compress(m, 22050, 'ogg') 6 | print(torch.type(o)) 7 | print(o:nElement()) 8 | print('compressed') 9 | -- outf = torch.DiskFile("www.ogg", "w"):binary() 10 | -- outf:writeChar(o:storage()) 11 | -- outf:close() 12 | m2 = audio.decompress(o, 'ogg') 13 | -------------------------------------------------------------------------------- /voice.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soumith/lua---audio/d61eab4ffd9b3ef8218d367689236566e44e8f82/voice.mp3 --------------------------------------------------------------------------------