├── .gitignore
├── CMakeLists.txt
├── Copyright.txt
├── LICENSE
├── README.md
├── audio-0.1-0.rockspec
├── audio.c
├── generic
├── audio.c
└── sox.c
├── init.lua
├── sox.c
├── test
├── load_and_save_example.lua
├── spectrogram.lua
└── test_decompress.lua
└── voice.mp3
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
2 | CMAKE_POLICY(VERSION 2.8)
3 |
4 | FIND_PACKAGE(Torch REQUIRED)
5 |
6 | # libsox is linked into both packages below.
7 | # find_library()/find_path() only accept REQUIRED as a keyword since CMake 3.18;
8 | # this file allows CMake 2.8, so missing dependencies are reported explicitly
9 | # instead of relying on that keyword.
10 | FIND_PATH(SOX_INCLUDE_DIR sox.h
11 |   "/usr/include/sox/")
12 | FIND_LIBRARY(SOX_LIBRARIES sox)
13 | message ("SOX_INCLUDE_DIR: ${SOX_INCLUDE_DIR}")
14 | message ("SOX_LIBRARIES: ${SOX_LIBRARIES}")
15 | IF(NOT SOX_INCLUDE_DIR OR NOT SOX_LIBRARIES)
16 |   MESSAGE(FATAL_ERROR "libsox (sox.h / libsox) was not found; install the libsox development package")
17 | ENDIF()
18 |
19 | # FFTW3 is only needed by the audio package.
20 | FIND_PATH(FFTW_INCLUDE_DIR fftw3.h)
21 | FIND_LIBRARY(FFTW_LIBRARIES fftw3)
22 | message ("FFTW_INCLUDE_DIR: ${FFTW_INCLUDE_DIR}")
23 | message ("FFTW_LIBRARIES: ${FFTW_LIBRARIES}")
24 | IF(NOT FFTW_INCLUDE_DIR OR NOT FFTW_LIBRARIES)
25 |   MESSAGE(FATAL_ERROR "FFTW3 (fftw3.h / libfftw3) was not found; install the FFTW3 development package")
26 | ENDIF()
27 |
28 | # sox package: C sources only. luasrc is set to the empty string explicitly so
29 | # that no value left in scope by an including project is picked up.
30 | SET(src sox.c)
31 | SET(luasrc "")
32 | include_directories (${SOX_INCLUDE_DIR})
33 | ADD_TORCH_PACKAGE(sox "${src}" "${luasrc}" "Audio Processing")
34 | TARGET_LINK_LIBRARIES(sox luaT TH ${SOX_LIBRARIES})
35 |
36 | # audio package: C sources plus the Lua front end and the bundled sample file.
37 | include_directories (${FFTW_INCLUDE_DIR})
38 | SET(src audio.c)
39 | SET(luasrc init.lua voice.mp3)
40 | ADD_TORCH_PACKAGE(audio "${src}" "${luasrc}" "Audio Processing")
41 | TARGET_LINK_LIBRARIES(audio luaT TH ${SOX_LIBRARIES} ${FFTW_LIBRARIES})
42 |
--------------------------------------------------------------------------------
/Copyright.txt:
--------------------------------------------------------------------------------
1 | ===============================================================================
2 |
3 | lua---audio -- http://github.com/soumith/lua---audio
4 |
5 | Copyright (c) 2012 Soumith Chintala
6 |
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 |
12 | 1. Redistributions of source code must retain the above copyright
13 | notice, this list of conditions and the following disclaimer.
14 |
15 | 2. Redistributions in binary form must reproduce the above copyright
16 | notice, this list of conditions and the following disclaimer in the
17 | documentation and/or other materials provided with the distribution.
18 |
19 | 3. The names of the software's contributors may not be used to endorse or
20 | promote products derived from this software without specific prior
21 | written permission.
22 |
23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 | POSSIBILITY OF SUCH DAMAGE.
34 |
35 | ===============================================================================
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The Romantic WTF public license.
2 | --------------------------------
3 | a.k.a. version "<3" or simply v3
4 |
5 | Dear user,
6 |
7 | The Romantic WTF Public License a.k.a version "<3"
8 |
9 | \
10 | '.,__
11 | \ /
12 | '/,__
13 | /
14 | /
15 | /
16 | has been / released
17 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
18 | under the Romantic WTF Public License.
19 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~`,´ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
20 | I hereby grant you an irrevocable license to
21 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
22 | do what the gentle caress you want to
23 | ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
24 | with this lovely
25 | ~ ~ ~ ~ ~ ~ ~ ~
26 | / license.
27 | / ~ ~ ~ ~
28 | / Love,
29 | # / '.'
30 | ####### ·
31 | #####
32 | ###
33 | #
34 |
35 | -- Soumith Chintala.
36 |
37 |
38 | P.S.: Even though I poured my heart into this work,
39 | I _cannot_ provide any warranty regarding
40 | its fitness for _any_ purpose. You
41 | acknowledge that I will not be held liable
42 | for any damage its use could incur.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Audio Library for Torch
2 | =======================
3 |
4 | Audio library for Torch-7
5 | * Support audio I/O (Load files, save files)
6 | * Common audio operations (Short-time Fourier transforms, Spectrograms)
7 |
8 | Load the following formats into a torch Tensor
9 | * mp3, wav, aac, ogg, flac, avr, cdda, cvs/vms,
10 | * aiff, au, amr, mp2, mp4, ac3, avi, wmv,
11 | * mpeg, ircam and any other format supported by libsox.
12 |
13 | Calculate Short-time Fourier transforms with
14 | * window types - rectangular, hamming, hann, bartlett
15 |
16 | Generate spectrograms
17 |
18 | Dependencies
19 | ------------
20 | * libsox v14.3.2 or above
21 | * libfftw3
22 |
23 | Quick install on
24 | OSX (Homebrew):
25 | ```bash
26 | $ brew install sox
27 | $ brew install fftw
28 | ```
29 | Linux (Ubuntu):
30 | ```bash
31 | $ sudo apt-get install libfftw3-dev
32 | $ sudo apt-get install sox libsox-dev libsox-fmt-all
33 | ```
34 |
35 | Installation
36 | ------------
37 | This project can be installed with `luarocks` like this:
38 |
39 | ```bash
40 | $ luarocks install https://raw.githubusercontent.com/soumith/lua---audio/master/audio-0.1-0.rockspec
41 | ```
42 |
43 | On Ubuntu 13.04 64-bit, I had to modify the command slightly because of new library directory structures not picked up by luarocks.
44 | ```bash
45 | $ sudo luarocks install https://raw.githubusercontent.com/soumith/lua---audio/master/audio-0.1-0.rockspec LIBSOX_LIBDIR=/usr/lib/x86_64-linux-gnu/ LIBFFTW3_LIBDIR=/usr/lib/x86_64-linux-gnu
46 | ```
47 |
48 | Or, if you have downloaded this repository on your machine, and
49 | you are in its directory:
50 |
51 | ```bash
52 | $ luarocks make
53 | ```
54 |
55 | Usage
56 | =====
57 | audio.load
58 | ```
59 | loads an audio file into a torch.Tensor
60 | usage:
61 | audio.load(
62 | string -- path to file
63 | )
64 |
65 | returns torch.Tensor of size NSamples x NChannels, sample_rate
66 | ```
67 |
68 | audio.save
69 | ```
70 | saves a tensor into an audio file. The extension of the given path is used as the saving format.
71 | usage:
72 | audio.save(
73 | string -- path to file
74 | tensor -- NSamples x NChannels 2D tensor
75 | number -- sample_rate of the audio to be saved as
76 | )
77 | ```
78 |
79 | audio.compress
80 | ```
81 | Compresses a tensor in-memory and returns a CharTensor. The given extension selects the compression format. The result can be decompressed using the "decompress" method.
82 | usage:
83 | audio.compress(
84 | tensor -- NSamples x NChannels 2D tensor
85 | number -- sample_rate of the audio to be saved as
86 | extension -- format of audio to compress in. Example: mp3, ogg, flac, sox etc.
87 | )
88 | ```
89 |
90 | audio.decompress
91 | ```
92 | Decompresses a CharTensor in-memory and returns the raw audio tensor and its sample rate. The given extension selects the format to decode.
93 | usage:
94 | audio.decompress(
95 | CharTensor -- 1D CharTensor that was returned by .compress
96 | extension -- format of audio used to compress. Example: mp3, ogg, flac, sox etc.
97 | )
98 | ```
99 |
100 | audio.stft
101 | ```
102 | Calculates the short-time Fourier transform (STFT) of an audio signal. Returns a 3D tensor of size number_of_windows x window_size/2+1 x 2 (the last dimension holds the real and imaginary parts of each complex value).
103 | usage:
104 | audio.stft(
105 | torch.Tensor -- input single-channel audio
106 | number -- window size
107 | string -- window type: rect, hamming, hann, bartlett
108 | number -- stride
109 | )
110 | ```
111 |
112 | audio.spectrogram
113 | ```
114 | Generates the spectrogram of an audio signal. Returns a 2D tensor of size window_size/2+1 x number_of_windows (the result is transposed before being returned), each value representing the magnitude of each frequency in dB.
115 | usage:
116 | audio.spectrogram(
117 | torch.Tensor -- input single-channel audio
118 | number -- window size
119 | string -- window type: rect, hamming, hann, bartlett
120 | number -- stride
121 | )
122 | ```
123 |
124 | Example Usage
125 | -------------
126 | Generate a spectrogram
127 | ```lua
128 | require 'audio'
129 | require 'image' -- to display the spectrogram
130 | voice = audio.samplevoice()
131 | spect = audio.spectrogram(voice, 8192, 'hann', 512)
132 | image.display(spect)
133 | ```
134 |
--------------------------------------------------------------------------------
/audio-0.1-0.rockspec:
--------------------------------------------------------------------------------
1 | package = "audio"
2 | version = "0.1-0"
3 |
4 | source = {
5 | url = "git://github.com/soumith/lua---audio",
6 | tag = "master"
7 | }
8 |
9 | description = {
10 | summary = "Audio library for Torch-7",
11 | detailed = [[
12 | Support audio I/O (Load files)
13 | Common audio operations (Short-time Fourier transforms, Spectrograms)
14 | ]],
15 | homepage = "https://github.com/soumith/lua---audio",
16 | license = "RWTFPL"
17 | }
18 |
19 | dependencies = {
20 | "torch >= 7.0",
21 | "sys >= 1.0",
22 | "xlua >= 1.0"
23 | }
24 |
25 | build = {
26 | type = "command",
27 | build_command = [[
28 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
29 | ]],
30 | install_command = "cd build && $(MAKE) install"
31 | }
32 |
--------------------------------------------------------------------------------
/audio.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | #if LUA_VERSION_NUM >= 503
12 | #define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n)))
13 | #define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n)))
14 | #endif
15 | /* Print a printf-style formatted message followed by a newline to stderr, then call abort(). */
16 | void abort_(const char * s, ...)
17 | {
18 | va_list args;
19 | va_start(args, s);
20 | vfprintf(stderr, s, args);
21 | fprintf(stderr, "\n");
22 | va_end(args);
23 | abort();
24 | }
25 |
26 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
27 | #define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
28 | #define audio_(NAME) TH_CONCAT_3(audio_, Real, NAME)
29 |
30 | #include "generic/audio.c"
31 | #include "THGenerateAllTypes.h"
32 |
33 | DLL_EXPORT int luaopen_libaudio(lua_State *L)
34 | {
35 |   /* Sub-table key -> function list, in the order the sub-tables are created. */
36 |   static const struct { const char *key; const luaL_Reg *funcs; } per_type[] = {
37 |     {"double", audio_DoubleMain__},
38 |     {"float",  audio_FloatMain__},
39 |     {"byte",   audio_ByteMain__},
40 |     {"char",   audio_CharMain__},
41 |     {"short",  audio_ShortMain__},
42 |     {"int",    audio_IntMain__},
43 |     {"long",   audio_LongMain__}
44 |   };
45 |   unsigned t;
46 |
47 |   /* Run the init routine of every tensor type. */
48 |   audio_ByteMain_init(L);
49 |   audio_CharMain_init(L);
50 |   audio_ShortMain_init(L);
51 |   audio_IntMain_init(L);
52 |   audio_LongMain_init(L);
53 |   audio_FloatMain_init(L);
54 |   audio_DoubleMain_init(L);
55 |
56 |   /* Create the module table; a copy of the reference becomes the global `audio`. */
57 |   lua_newtable(L);
58 |   lua_pushvalue(L, -1);
59 |   lua_setglobal(L, "audio");
60 |
61 |   /* Attach one function sub-table per tensor type to the module table. */
62 |   for (t = 0; t < sizeof(per_type) / sizeof(per_type[0]); ++t) {
63 |     lua_newtable(L);
64 |     luaT_setfuncs(L, per_type[t].funcs, 0);
65 |     lua_setfield(L, -2, per_type[t].key);
66 |   }
67 |
68 |   /* The module table is on top of the stack and is the single return value. */
69 |   return 1;
70 | }
71 |
--------------------------------------------------------------------------------
/generic/audio.c:
--------------------------------------------------------------------------------
1 | #ifndef TH_GENERIC_FILE
2 | #define TH_GENERIC_FILE "generic/audio.c"
3 | #else
4 |
5 | #undef TAPI
6 | #define TAPI __declspec(dllimport)
7 |
8 | /* ---------------------------------------------------------------------- */
9 | /* -- */
10 | /* -- Copyright (c) 2012 Soumith Chintala */
11 | /* -- */
12 | /* -- Permission is hereby granted, free of charge, to any person obtaining */
13 | /* -- a copy of this software and associated documentation files (the */
14 | /* -- "Software"), to deal in the Software without restriction, including */
15 | /* -- without limitation the rights to use, copy, modify, merge, publish, */
16 | /* -- distribute, sublicense, and/or sell copies of the Software, and to */
17 | /* -- permit persons to whom the Software is furnished to do so, subject to */
18 | /* -- the following conditions: */
19 | /* -- */
20 | /* -- The above copyright notice and this permission notice shall be */
21 | /* -- included in all copies or substantial portions of the Software. */
22 | /* -- */
23 | /* -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
24 | /* -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
25 | /* -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
26 | /* -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE */
27 | /* -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION */
28 | /* -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
29 | /* -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
30 | /* -- */
31 | /* ---------------------------------------------------------------------- */
32 | /* -- description: */
33 | /* -- audio.c - general purpose audio transforms for Torch-7 */
34 | /* -- */
35 | /* -- history: */
36 | /* -- May 26th, 2012, 5:46PM - wrote stft and channel
37 | flattening functions - Soumith Chintala */
38 | /* ---------------------------------------------------------------------- */
39 |
40 |
41 | // write audio.toMono() which converts a multi-channel audio to single channel
42 |
43 | #ifndef M_PI
44 | #define M_PI 3.14159265358979323846264338327950288
45 | #endif
46 |
47 | // Scale input[0..window_size) in place by the selected window function
48 | // (edge smoothing applied to each buffer before the FFT).
49 | // window_type: 1 rectangular, 2 Hamming, 3 Hann, 4 Bartlett; others raise THError.
50 | static inline void audio_(apply_window)(double *input,
51 |                                         long window_size, int window_type) {
52 |   const long m = window_size - 1;
53 |   long i;
54 |   if (window_type < 1 || window_type > 4)
55 |     THError("[stft_generic] Unknown window_type");
56 |   // A rectangular window is the identity. With window_size <= 1, m is <= 0 and
57 |   // the formulas below would divide by zero (NaN output), so a one-sample
58 |   // window is treated as a weight of 1, i.e. the input is left untouched.
59 |   if (window_type == 1 || m <= 0)
60 |     return;
61 |   for (i = 0; i < window_size; ++i) {
62 |     double weight;
63 |     if (window_type == 2)        // Hamming
64 |       weight = .53836 - .46164 * cos(2 * M_PI * i / m);
65 |     else if (window_type == 3)   // Hann
66 |       weight = .5 - .5 * cos(2 * M_PI * i / m);
67 |     else                         // Bartlett
68 |       weight = 2. / m * (m / 2. - fabs(i - m / 2.));
69 |     input[i] *= weight;
70 |   }
71 | }
72 |
73 | ////////////////////////////////////////////////////////////////////////////
74 | // generic short-time fourier transform function that supports multiple window types
75 | // arguments [tensor, window-size, window-type, hop-size/stride]
76 | // window_type [1, 2, 3, 4] for [rectangular, hamming, hann, bartlett]
77 | static THTensor * audio_(stft_generic)(THTensor *input,
78 | long window_size, int window_type,
79 | long stride)
80 | {
81 | const long length = input->size[0];
82 | long nChannels = 1;
83 |
84 | if (THTensor_(nDimension)(input) > 1)
85 | nChannels = input->size[1];
86 |
87 | if (nChannels > 1)
88 | THError("[stft_generic] Multi-channel stft not supported");
89 |
90 | real *input_data = THTensor_(data)(input);
91 | const long nwindows = ((length - window_size)/stride) + 1;
92 | const long noutput = window_size/2 + 1;
93 | THTensor *output = THTensor_(newWithSize3d)(nwindows, noutput, 2);
94 | real *output_data = THTensor_(data)(output);
95 | double *buffer = malloc(sizeof(double) * window_size);
96 | fftw_complex *fbuffer = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*noutput);
97 | long index, k, outindex=0;
98 |
99 | fftw_plan plan = fftw_plan_dft_r2c_1d(window_size, buffer, fbuffer, FFTW_ESTIMATE);
100 |
101 | // loop over the input. get a buffer. apply window. call stft. repeat with stride.
102 | for (index = 0; index + window_size <= length; index = index + stride) {
103 | for (k=0; ksignal.channels;
40 | long buffer_size = fd->signal.length;
41 | if (buffer_size == 0) {
42 | if (nsamples != -1) {
43 | buffer_size = nsamples;
44 | } else {
45 | THError("[read_audio] Unknown length");
46 | }
47 | }
48 | *sample_rate = (int) fd->signal.rate;
49 | int32_t *buffer = (int32_t *)malloc(sizeof(int32_t) * buffer_size);
50 | size_t samples_read = sox_read(fd, buffer, buffer_size);
51 | if (samples_read == 0)
52 | THError("[read_audio] Empty file or read failed in sox_read");
53 | // alloc tensor
54 | THTensor_(resize2d)(tensor, samples_read / nchannels, nchannels );
55 | real *tensor_data = THTensor_(data)(tensor);
56 | // convert audio to dest tensor
57 | int x,k;
58 | for (x=0; xsize[1];
98 | long nsamples = src->size[0];
99 | real* data = THTensor_(data)(src);
100 |
101 | // convert audio to dest tensor
102 | int x,k;
103 | for (x=0; xsize[1];
120 | long nsamples = src->size[0];
121 |
122 | sox_format_t *fd;
123 |
124 | // Create sox objects and write into int32_t buffer
125 | sox_signalinfo_t sinfo;
126 | sinfo.rate = sample_rate;
127 | sinfo.channels = nchannels;
128 | sinfo.length = nsamples * nchannels;
129 | sinfo.precision = sizeof(int32_t) * 8; /* precision in bits */
130 | #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
131 | sinfo.mult = NULL;
132 | #endif
133 | fd = sox_open_write(file_name, &sinfo, NULL, extension, NULL, NULL);
134 | if (fd == NULL)
135 | THError("[write_audio_file] Failure to open file for writing");
136 |
137 | libsox_(write_audio)(fd, src, extension, sample_rate);
138 |
139 | // free buffer and sox structures
140 | sox_close(fd);
141 |
142 | return;
143 | }
144 |
145 | void libsox_(write_audio_memory)(THCharTensor* out, THTensor* src,
146 |                                  const char *extension, int sample_rate)
147 | {
148 |   // Encodes src (NSamples x NChannels) through libsox into memory and points out
149 |   // at a new buffer laid out as [int64 total sample count][encoded bytes].
150 |   if (THTensor_(nDimension)(src) != 2)
151 |     THError("[write_audio_memory] Input should be a 2D tensor (NSamples x NChannels)");
152 |   if (THTensor_(isContiguous)(src) == 0)
153 |     THError("[write_audio_memory] Input should be contiguous tensors");
154 |   long nchannels = src->size[1];
155 |   long nsamples = src->size[0];
156 |
157 |   sox_format_t *fd;
158 |   char *buffer = NULL;
159 |   size_t buffer_size = -1;
160 |
161 |   // Create sox objects and write into int32_t buffer
162 |   sox_signalinfo_t sinfo;
163 |   sinfo.rate = sample_rate;
164 |   sinfo.channels = nchannels;
165 |   sinfo.length = nsamples * nchannels;
166 |   sinfo.precision = sizeof(int32_t) * 8; /* precision in bits */
167 | #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
168 |   sinfo.mult = NULL;
169 | #endif
170 |   fd = sox_open_memstream_write(&buffer, &buffer_size, &sinfo, NULL, extension, NULL);
171 |   if (fd == NULL)
172 |     THError("[write_audio_memory] Failure to open sox object for writing");
173 |
174 |   libsox_(write_audio)(fd, src, extension, sample_rate);
175 |
176 |   // free sox structures
177 |   sox_close(fd);
178 |
179 |   // write the number of samples as well. to get around a SOX bug for certain formats.
180 |   int64_t olength = nsamples * nchannels;
181 |   size_t out_size = buffer_size + sizeof(olength);
182 |   char * out_data = (char*) malloc(out_size);
183 |   if (out_data == NULL) {
184 |     free(buffer);
185 |     THError("[write_audio_memory] Out of memory");
186 |   }
187 |   // sample-count header first, then the encoded data right after it
188 |   memcpy(out_data, &olength, sizeof(olength));
189 |   memcpy(out_data + sizeof(olength), buffer, buffer_size);
190 |
191 |   // The storage takes over out_data. setStorage1d retains the storage, so our own
192 |   // reference is dropped afterwards; otherwise the storage is never released.
193 |   THCharStorage* out_storage = THCharStorage_newWithData(out_data, out_size);
194 |   THCharTensor_setStorage1d(out, out_storage, 0, out_size, 1);
195 |   THCharStorage_free(out_storage);
196 |   free(buffer); // the encoded bytes were copied into out_data above
197 | }
198 | // Lua binding: load(filename) -> tensor, sample_rate
199 | static int libsox_(Main_load)(lua_State *L) {
200 |   const char *filename = luaL_checkstring(L, 1);
201 |   THTensor *tensor = THTensor_(new)();
202 |   int sample_rate = 0;
203 |   luaT_pushudata(L, tensor, torch_Tensor); // pushed before reading: Lua owns the tensor, so it is not leaked if the read raises
204 |   libsox_(read_audio_file)(filename, tensor, &sample_rate);
205 |   lua_pushnumber(L, (double) sample_rate);
206 |   return 2;
207 | }
208 | // Lua binding: decompress(char_tensor, extension) -> tensor, sample_rate
209 | static int libsox_(Main_decompress)(lua_State *L) {
210 |   THCharTensor *inp = luaT_checkudata(L, 1, "torch.CharTensor");
211 |   const char *extension = luaL_checkstring(L, 2);
212 |   THTensor *tensor = THTensor_(new)();
213 |   int sample_rate = 0;
214 |   luaT_pushudata(L, tensor, torch_Tensor); // pushed before reading: Lua owns the tensor, so it is not leaked if the read raises
215 |   libsox_(read_audio_memory)(inp, tensor, &sample_rate, extension);
216 |   lua_pushnumber(L, (double) sample_rate);
217 |   return 2;
218 | }
219 | // Lua binding: save(filename, tensor, extension, sample_rate); returns nothing to Lua
220 | static int libsox_(Main_save)(lua_State *L) {
221 |   const char *filename = luaL_checkstring(L, 1);
222 |   THTensor *tensor = luaT_checkudata(L, 2, torch_Tensor);
223 |   const char *extension = luaL_checkstring(L, 3);
224 |   int sample_rate = luaL_checkint(L, 4);
225 |   libsox_(write_audio_file)(filename, tensor, extension, sample_rate);
226 |   return 0; // nothing is pushed, so report zero results instead of echoing the last argument
227 | }
228 | // Lua binding: compress(out_char_tensor, src, extension, sample_rate); fills `out`, returns nothing to Lua
229 | static int libsox_(Main_compress)(lua_State *L) {
230 |   THCharTensor *out = luaT_checkudata(L, 1, "torch.CharTensor");
231 |   THTensor *src = luaT_checkudata(L, 2, torch_Tensor);
232 |   const char *extension = luaL_checkstring(L, 3);
233 |   int sample_rate = luaL_checkint(L, 4);
234 |   libsox_(write_audio_memory)(out, src, extension, sample_rate);
235 |   return 0; // nothing is pushed, so report zero results instead of echoing the last argument
236 | }
237 |
238 | static const luaL_Reg libsox_(Main__)[] =
239 | {
240 | {"load", libsox_(Main_load)},
241 | {"save", libsox_(Main_save)},
242 | {"compress", libsox_(Main_compress)},
243 | {"decompress", libsox_(Main_decompress)},
244 | {NULL, NULL}
245 | };
246 | // Registers this tensor type's function list under the name "libsox" on the type's metatable.
247 | DLL_EXPORT int libsox_(Main_init)(lua_State *L)
248 | {
249 | luaT_pushmetatable(L, torch_Tensor);
250 | luaT_registeratname(L, libsox_(Main__), "libsox");
251 | // Initialize sox library (this runs once per tensor type, since luaopen_libsox calls every <Type>Main_init)
252 | sox_format_init();
253 | return 1;
254 | }
255 |
256 | #endif
257 |
--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------------------------
2 | --
3 | -- Copyright (c) 2012 Soumith Chintala
4 | --
5 | -- Permission is hereby granted, free of charge, to any person obtaining
6 | -- a copy of this software and associated documentation files (the
7 | -- "Software"), to deal in the Software without restriction, including
8 | -- without limitation the rights to use, copy, modify, merge, publish,
9 | -- distribute, sublicense, and/or sell copies of the Software, and to
10 | -- permit persons to whom the Software is furnished to do so, subject to
11 | -- the following conditions:
12 | --
13 | -- The above copyright notice and this permission notice shall be
14 | -- included in all copies or substantial portions of the Software.
15 | --
16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | --
24 | ----------------------------------------------------------------------
25 | -- description:
26 | -- audio - an audio toolBox, for Torch
27 | --
28 | -- history:
29 | -- May 24th, 2012, 7:28PM - wrote sox wrappers - Soumith Chintala
30 | ----------------------------------------------------------------------
31 |
32 | require 'torch'
33 | require 'sys'
34 | require 'xlua'
35 | require 'dok'
36 | require 'paths'
37 | require 'libaudio'
38 |
39 | ----------------------------------------------------------------------
40 | -- load: read an audio file into a tensor
41 | -- returns the tensor and the sample rate reported by the libsox binding
42 | local function load(filename)
43 |    if not filename then
44 |       print(dok.usage('audio.load',
45 |                       'loads an audio file into a torch.Tensor', nil,
46 |                       {type='string', help='path to file', req=true}))
47 |       dok.error('missing file name', 'audio.load')
48 |    end
49 |    local file_exists = paths.filep(filename)
50 |    if not file_exists then
51 |       dok.error('Specified filename: ' .. filename .. ' not found', 'audio.load')
52 |    end
53 |    local sox_loaded = xlua.require 'libsox'
54 |    if not sox_loaded then
55 |       dok.error('libsox package not found, please install libsox','audio.load')
56 |    end
57 |    return torch.Tensor().libsox.load(filename) -- tensor, sample_rate
58 | end
59 | rawset(audio, 'load', load)
60 | --------------------------------------------------------------------------
61 | -- save: write a tensor to an audio file; the format comes from the file extension
62 | local function save(filename, src, sample_rate)
63 |    if not (filename and src) then
64 |       error('filename or src tensor missing')
65 |    end
66 |    local sox_loaded = xlua.require 'libsox'
67 |    if not sox_loaded then
68 |       dok.error('libsox package not found, please install libsox','audio.save')
69 |    end
70 |    -- the extension of the target path is passed on as the output format
71 |    local extension = paths.extname(filename)
72 |    assert(extension, 'did not find extension (like .wav or .mp3) in filename. Give a filename with an extension, for example: hello.wav')
73 |    assert(type(sample_rate) == 'number', 'provide a sample rate (a number) such as 22050')
74 |    src.libsox.save(filename, src, extension, sample_rate)
75 | end
76 | rawset(audio, 'save', save)
77 | --------------------------------------------------------------------------
78 | -- compress: encode a tensor in memory in the given format and return the bytes as a CharTensor
79 | function audio.compress(src, sample_rate, extension)
80 |    if not src then
81 |       error('src tensor missing')
82 |    end
83 |    assert(sample_rate and type(sample_rate) == 'number', 'provide a sample rate (a number) such as 22050')
84 |    -- same check as audio.decompress; without it a missing format is only caught by the C binding
85 |    if not extension then error('extension string missing') end
86 |    if not xlua.require 'libsox' then
87 |       dok.error('libsox package not found, please install libsox','audio.compress')
88 |    end
89 |    local out = torch.CharTensor()
90 |    src.libsox.compress(out, src, extension, sample_rate)
91 |    return out
92 | end
93 |
94 | -- decompress
95 | function audio.decompress(src, extension)
96 |    -- both arguments are required; src is checked before extension
97 |    if not src then error('src tensor missing') end
98 |    if not extension then error('extension string missing') end
99 |
100 |    local sox_loaded = xlua.require 'libsox'
101 |    if not sox_loaded then
102 |       dok.error('libsox package not found, please install libsox','audio.decompress')
103 |    end
104 |
105 |    -- decode through the default tensor type's libsox binding: tensor, sample_rate
106 |    return torch.Tensor().libsox.decompress(src, extension)
107 | end
108 |
109 | -- Convenience wrappers: audio.compress / audio.decompress with the format fixed to 'mp3' or 'ogg'
110 | function audio.compressMP3(src, sample_rate)
111 | return audio.compress(src, sample_rate, 'mp3')
112 | end
113 | function audio.compressOGG(src, sample_rate)
114 | return audio.compress(src, sample_rate, 'ogg')
115 | end
116 |
117 | function audio.decompressMP3(src)
118 | return audio.decompress(src, 'mp3')
119 | end
120 | function audio.decompressOGG(src)
121 | return audio.decompress(src, 'ogg')
122 | end
123 | ----------------------------------------------------------------------
124 | -- spectrogram: log-scaled squared magnitude of audio.stft; requires exactly 4 arguments
125 | --
126 | local function spectrogram(...)
127 | local output, input, window_size, window_type, stride
128 | local args = {...}
129 | if select('#',...) == 4 then
130 | input = args[1]
131 | window_size = args[2]
132 | window_type = args[3]
133 | stride = args[4]
134 | else
135 | print(dok.usage('audio.spectrogram',
136 | 'generate the spectrogram of an audio. '
137 | .. 'returns a 2D tensor, with '
138 | .. 'number_of_windows x window_size/2+1, '
139 | .. 'each value representing the magnitude of '
140 | .. 'each frequency in dB', nil,
141 | {type='torch.Tensor',
142 | help='input single-channel audio', req=true},
143 | {type='number', help='window size', req=true},
144 | {type='string',
145 | help='window type: rect, hamming, hann, bartlett' , req=true},
146 | {type='number', help='stride', req=true}))
147 | dok.error('incorrect arguments', 'audio.spectrogram')
148 | end
149 |
150 | -- calculate stft (the two slices of the last dimension are squared and summed below)
151 | local stftout = audio.stft(input, window_size, window_type, stride)
152 |
153 | -- squared magnitude, computed in place on the stft result, then log-scaled to make it look prettier. NOTE(review): :log() is presumably the natural logarithm, so this is 10*ln(x) rather than 10*log10(x) dB - confirm
154 | local stftout_r = stftout:select(3,1)
155 | local stftout_c = stftout:select(3,2)
156 | stftout_r:pow(2)
157 | stftout_c:pow(2)
158 | local stftout_magnitude = stftout_r + stftout_c
159 | stftout_magnitude = stftout_magnitude + 0.01 -- adding constant to avoid log(0)
160 | output = stftout_magnitude:log() * 10
161 | return output:transpose(1,2) -- the two dimensions are swapped before returning
162 | end
163 | rawset(audio, 'spectrogram', spectrogram)
164 | -- stft: validate the arguments, translate the window name to its numeric id, call the C stft
165 | local function stft(...)
166 |    local output, input, window_size, window_type, stride
167 |    local args = {...}
168 |    if select('#',...) == 4 then
169 |       input = args[1]
170 |       window_size = args[2]
171 |       window_type = args[3]
172 |       stride = args[4]
173 |    else
174 |       print(dok.usage('audio.stft',
175 |                       'calculate the stft of an audio. '
176 |                          .. 'returns a 3D tensor, with '
177 |                          .. 'number_of_windows x window_size/2+1 x 2 '
178 |                          .. ' (complex number with real and complex parts)', nil,
179 |                       {type='torch.Tensor',
180 |                        help='input single-channel audio', req=true},
181 |                       {type='number', help='window size', req=true},
182 |                       {type='string',
183 |                        help='window type: rect, hamming, hann, bartlett' , req=true},
184 |                       {type='number', help='stride', req=true}))
185 |       dok.error('incorrect arguments', 'audio.stft')
186 |    end
187 |    -- map the window name onto the numeric id expected by the C routine
188 |    local window_type_ids = {rect = 1, hamming = 2, hann = 3, bartlett = 4}
189 |    local window_type_id = window_type_ids[window_type]
190 |    if not window_type_id then
191 |       -- an unknown name used to pass nil on to the C routine; report it here,
192 |       -- naming the accepted values
193 |       dok.error('unknown window type: ' .. tostring(window_type)
194 |                    .. ' (expected rect, hamming, hann or bartlett)', 'audio.stft')
195 |    end
196 |
197 |    -- calculate stft
198 |    output = torch.Tensor().audio.stft(input, window_size, window_type_id, stride)
199 |    return output
200 | end
201 | rawset(audio, 'stft', stft)
202 | -- cqt: validate the 5 arguments and forward them to the audio.cqt binding of the default tensor type
203 | local function cqt(...)
204 |    local output, input, fmin, fmax, bins_per_octave, sample_rate
205 |    local args = {...}
206 |    if select('#',...) == 5 then
207 |       input = args[1]
208 |       fmin = args[2]
209 |       fmax = args[3]
210 |       bins_per_octave = args[4] -- was args[3], which duplicated fmax
211 |       sample_rate = args[5]     -- was args[4], which is bins_per_octave
212 |    else
213 |       print(dok.usage('audio.cqt',
214 |                       'calculate the constant-Q transformed audio signal. returns a [TODO: fill this description]', nil,
215 |                       {type='torch.Tensor', help='input single-channel audio', req=true},
216 |                       {type='number', help='lowest frequency of interest', req=true},
217 |                       {type='number', help='highest frequency of interest', req=true},
218 |                       {type='number', help='frequency bins per octave', req=true},
219 |                       {type='number', help='sampling rate of the input', req=true}))
220 |       dok.error('incorrect arguments', 'audio.cqt')
221 |    end
222 |    -- calculate cqt
223 |    output = torch.Tensor().audio.cqt(input, fmin, fmax, bins_per_octave, sample_rate)
224 |    return output
225 | end
226 | rawset(audio, 'cqt', cqt)
227 |
228 |
229 | ----------------------------------------------------------------------
230 | -- loads voice.mp3 that is included with the repo
231 | local function samplevoice()
232 |    -- voice.mp3 is looked up in the directory returned by sys.fpath()
233 |    local sample_path = sys.concat(sys.fpath(), 'voice.mp3')
234 |    return (audio.load(sample_path)) -- parentheses keep only the tensor, dropping the sample rate
235 | end
236 | rawset(audio, 'samplevoice', samplevoice)
237 |
238 | return audio
239 |
--------------------------------------------------------------------------------
/sox.c:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | #if LUA_VERSION_NUM >= 503
12 | #define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n)))
13 | #define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n)))
14 | #endif
15 | /* Print a printf-style formatted message followed by a newline to stderr, then call abort(). */
16 | void abort_(const char * s, ...)
17 | {
18 | va_list args;
19 | va_start(args, s);
20 | vfprintf(stderr, s, args);
21 | fprintf(stderr, "\n");
22 | va_end(args);
23 | abort();
24 | }
25 |
26 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
27 | #define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
28 | #define libsox_(NAME) TH_CONCAT_3(libsox_, Real, NAME)
29 |
30 | #include "generic/sox.c"
31 | #include "THGenerateAllTypes.h"
32 |
33 | DLL_EXPORT int luaopen_libsox(lua_State *L)
34 | {
35 |   /* Sub-table key -> function list, in the order the sub-tables are created. */
36 |   static const struct { const char *key; const luaL_Reg *funcs; } per_type[] = {
37 |     {"double", libsox_DoubleMain__},
38 |     {"float",  libsox_FloatMain__},
39 |     {"byte",   libsox_ByteMain__},
40 |     {"char",   libsox_CharMain__},
41 |     {"short",  libsox_ShortMain__},
42 |     {"int",    libsox_IntMain__},
43 |     {"long",   libsox_LongMain__}
44 |   };
45 |   unsigned t;
46 |
47 |   /* Run the init routine of every tensor type. */
48 |   libsox_ByteMain_init(L);
49 |   libsox_CharMain_init(L);
50 |   libsox_ShortMain_init(L);
51 |   libsox_IntMain_init(L);
52 |   libsox_LongMain_init(L);
53 |   libsox_FloatMain_init(L);
54 |   libsox_DoubleMain_init(L);
55 |
56 |   /* Create the module table; a copy of the reference becomes the global `libsox`. */
57 |   lua_newtable(L);
58 |   lua_pushvalue(L, -1);
59 |   lua_setglobal(L, "libsox");
60 |
61 |   /* Attach one function sub-table per tensor type to the module table. */
62 |   for (t = 0; t < sizeof(per_type) / sizeof(per_type[0]); ++t) {
63 |     lua_newtable(L);
64 |     luaT_setfuncs(L, per_type[t].funcs, 0);
65 |     lua_setfield(L, -2, per_type[t].key);
66 |   }
67 |
68 |   /* The module table is on top of the stack and is the single return value. */
69 |   return 1;
70 | }
71 |
--------------------------------------------------------------------------------
/test/load_and_save_example.lua:
--------------------------------------------------------------------------------
1 | require 'audio'
2 |
3 | t, sample_rate = audio.load('voice.mp3')
4 | print(#t)
5 | print(sample_rate)
6 | audio.save('test.wav', t, sample_rate)
7 |
8 |
--------------------------------------------------------------------------------
/test/spectrogram.lua:
--------------------------------------------------------------------------------
1 | require 'audio'
2 | require 'image' -- to display the spectrogram
3 | voice = audio.samplevoice()
4 | spect = audio.spectrogram(voice, 8192, 'hann', 512)
5 | image.display(spect)
6 |
--------------------------------------------------------------------------------
/test/test_decompress.lua:
--------------------------------------------------------------------------------
1 | require 'audio'
2 | m=audio.samplevoice()
3 | print(m:nElement())
4 | print('ok')
5 | o = audio.compress(m, 22050, 'ogg')
6 | print(torch.type(o))
7 | print(o:nElement())
8 | print('compressed')
9 | -- outf = torch.DiskFile("www.ogg", "w"):binary()
10 | -- outf:writeChar(o:storage())
11 | -- outf:close()
12 | m2 = audio.decompress(o, 'ogg')
13 |
--------------------------------------------------------------------------------
/voice.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soumith/lua---audio/d61eab4ffd9b3ef8218d367689236566e44e8f82/voice.mp3
--------------------------------------------------------------------------------