├── .gitignore
├── CMakeLists.txt
├── Copyright.txt
├── LICENSE
├── README.md
├── audio-0.1-0.rockspec
├── audio.c
├── generic
    ├── audio.c
    └── sox.c
├── init.lua
├── sox.c
├── test
    ├── load_and_save_example.lua
    ├── spectrogram.lua
    └── test_decompress.lua
└── voice.mp3


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
 2 | cmake_policy(VERSION 2.8)
 3 | 
 4 | find_package(Torch REQUIRED)
 5 | # libsox: header (also looked for under /usr/include/sox/) and library.
 6 | find_path(SOX_INCLUDE_DIR sox.h
 7 |   "/usr/include/sox/")
 8 | find_library(SOX_LIBRARIES sox REQUIRED)
 9 | message("SOX_INCLUDE_DIR: ${SOX_INCLUDE_DIR}")
10 | message("SOX_LIBRARIES: ${SOX_LIBRARIES}")
11 | # FFTW3: header and library.
12 | find_path(FFTW_INCLUDE_DIR fftw3.h)
13 | find_library(FFTW_LIBRARIES fftw3 REQUIRED)
14 | message("FFTW_INCLUDE_DIR: ${FFTW_INCLUDE_DIR}")
15 | message("FFTW_LIBRARIES: ${FFTW_LIBRARIES}")
16 | # Package "sox", built from sox.c. luasrc has not been set in this file yet, so it expands to nothing here.
17 | set(src sox.c)
18 | include_directories(${SOX_INCLUDE_DIR})
19 | add_torch_package(sox "${src}" "${luasrc}" "Audio Processing")
20 | target_link_libraries(sox luaT TH ${SOX_LIBRARIES})
21 | # Package "audio", built from audio.c; init.lua and voice.mp3 are passed as its Lua-side files.
22 | include_directories(${FFTW_INCLUDE_DIR})
23 | set(src audio.c)
24 | set(luasrc init.lua voice.mp3)
25 | add_torch_package(audio "${src}" "${luasrc}" "Audio Processing")
26 | target_link_libraries(audio luaT TH ${SOX_LIBRARIES} ${FFTW_LIBRARIES})
27 | 


--------------------------------------------------------------------------------
/Copyright.txt:
--------------------------------------------------------------------------------
 1 | ===============================================================================
 2 | 
 3 | lua---audio -- http://github.com/soumith/lua---audio
 4 | 
 5 | Copyright (c) 2012 Soumith Chintala
 6 | 
 7 | All rights reserved.
 8 | 
 9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 | 
12 | 1. Redistributions of source code must retain the above copyright
13 | notice, this list of conditions and the following disclaimer.
14 | 
15 | 2. Redistributions in binary form must reproduce the above copyright
16 | notice, this list of conditions and the following disclaimer in the
17 | documentation and/or other materials provided with the distribution.
18 | 
19 | 3. The names of the software's contributors may not be used to endorse or
20 | promote products derived from this software without specific prior
21 | written permission.
22 | 
23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 | POSSIBILITY OF SUCH DAMAGE.
34 | 
35 | ===============================================================================
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |                    The Romantic WTF public license.
 2 |                    --------------------------------
 3 |                    a.k.a. version "<3" or simply v3
 4 | 
 5 |            Dear user,
 6 | 
 7 |            The Romantic WTF Public License a.k.a version "<3"
 8 | 
 9 |                                             \ 
10 |                                              '.,__
11 |                                           \  /
12 |                                            '/,__
13 |                                            /
14 |                                           /
15 |                                          /
16 |                       has been          / released
17 |                  ~ ~ ~ ~ ~ ~ ~ ~       ~ ~ ~ ~ ~ ~ ~ ~ 
18 |                under  the  Romantic   WTF Public License.
19 |               ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~`,´ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
20 |               I hereby grant you an irrevocable license to
21 |                ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
22 |                  do what the gentle caress you want to
23 |                       ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~  
24 |                           with   this   lovely
25 |                              ~ ~ ~ ~ ~ ~ ~ ~ 
26 |                               / license.
27 |                              /  ~ ~ ~ ~
28 |                             /    Love,
29 |                       #    /      '.'
30 |                       #######      ·
31 |                       #####
32 |                       ###
33 |                       #
34 | 
35 |            -- Soumith Chintala.
36 | 
37 | 
38 |            P.S.: Even though I poured my heart into this work, 
39 |                  I _cannot_ provide any warranty regarding 
40 |                  its fitness for _any_ purpose. You
41 |                  acknowledge that I will not be held liable
42 |                  for any damage its use could incur.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Audio Library for Torch
  2 | =======================
  3 | 
  4 | Audio library for Torch-7
  5 |  * Support audio I/O (Load files, save files)
  6 |  * Common audio operations (Short-time Fourier transforms, Spectrograms)
  7 | 
  8 | Load the following formats into a torch Tensor
  9 |  * mp3, wav, aac, ogg, flac, avr, cdda, cvs/vms,
 10 |  * aiff, au, amr, mp2, mp4, ac3, avi, wmv,
 11 |  * mpeg, ircam and any other format supported by libsox.
 12 | 
 13 | Calculate Short-time Fourier transforms with
 14 |  * window types - rectangular, hamming, hann, bartlett
 15 | 
 16 | Generate spectrograms
 17 | 
 18 | Dependencies
 19 | ------------
 20 | * libsox v14.3.2 or above
 21 | * libfftw3
 22 | 
 23 | Quick install on
 24 | OSX (Homebrew):
 25 | ```bash
 26 | $ brew install sox
 27 | $ brew install fftw
 28 | ```
 29 | Linux (Ubuntu):
 30 | ```bash
 31 | $ sudo apt-get install libfftw3-dev
 32 | $ sudo apt-get install sox libsox-dev libsox-fmt-all
 33 | ```
 34 | 
 35 | Installation
 36 | ------------
 37 | This project can be installed with `luarocks` like this:
 38 | 
 39 | ```bash
 40 | $ luarocks install https://raw.githubusercontent.com/soumith/lua---audio/master/audio-0.1-0.rockspec
 41 | ```
 42 | 
 43 | On Ubuntu 13.04 64-bit, I had to modify the command slightly because of new library directory structures not picked up by luarocks.
 44 | ```bash
 45 | $ sudo luarocks install https://raw.githubusercontent.com/soumith/lua---audio/master/audio-0.1-0.rockspec LIBSOX_LIBDIR=/usr/lib/x86_64-linux-gnu/ LIBFFTW3_LIBDIR=/usr/lib/x86_64-linux-gnu
 46 | ```
 47 | 
 48 | Or, if you have downloaded this repository on your machine, and
 49 | you are in its directory:
 50 | 
 51 | ```bash
 52 | $ luarocks make
 53 | ```
 54 | 
 55 | Usage
 56 | =====
 57 | audio.load
 58 | ```
 59 |  loads an audio file into a torch.Tensor
 60 |  usage:
 61 |  audio.load(
 62 |      string                              -- path to file
 63 |  )
 64 | 
 65 | returns torch.Tensor of size NSamples x NChannels, sample_rate
 66 | ```
 67 | 
 68 | audio.save
 69 | ```
 70 |  saves a tensor into an audio file. The extension of the given path is used as the saving format.
 71 |  usage:
 72 |  audio.save(
 73 |      string                              -- path to file
 74 | 	 tensor                              -- NSamples x NChannels 2D tensor
 75 | 	 number                              -- sample_rate of the audio to be saved as
 76 |  )
 77 | ```
 78 | 
 79 | audio.compress
 80 | ```
 81 |  Compresses a tensor in-memory and returns a CharTensor. The given extension selects the encoding format. The result can be decompressed using the "decompress" method.
 82 |  usage:
 83 |  audio.compress(
 84 | 	 tensor                              -- NSamples x NChannels 2D tensor
 85 | 	 number                              -- sample_rate of the audio to be saved as
 86 |      extension                           -- format of audio to compress in. Example: mp3, ogg, flac, sox etc.
 87 |  )
 88 | ```
 89 | 
 90 | audio.decompress
 91 | ```
 92 |  Decompresses a CharTensor in-memory and returns raw audio. The given extension selects the decoding format.
 93 |  usage:
 94 |  audio.decompress(
 95 | 	 CharTensor                          -- 1D CharTensor that was returned by .compress
 96 |      extension                           -- format of audio used to compress. Example: mp3, ogg, flac, sox etc.
 97 |  )
 98 | ```
 99 | 
100 | audio.stft
101 | ```
102 | Calculates the STFT of an audio signal. Returns a 3D tensor of size number_of_windows x (window_size/2+1) x 2, where the last dimension holds the real and imaginary parts of each complex value.
103 | usage:
104 | audio.stft(
105 |     torch.Tensor                        -- input single-channel audio
106 |     number                              -- window size
107 |     string                              -- window type: rect, hamming, hann, bartlett
108 |     number                              -- stride
109 | )
110 | ```
111 | 
112 | audio.spectrogram
113 | ```
114 | Generates the spectrogram of an audio signal. Returns a 2D tensor of size number_of_windows x (window_size/2+1), each value representing the magnitude of a frequency in dB.
115 | usage:
116 | audio.spectrogram(
117 |     torch.Tensor                        -- input single-channel audio
118 |     number                              -- window size
119 |     string                              -- window type: rect, hamming, hann, bartlett
120 |     number                              -- stride
121 | )
122 | ```
123 | 
124 | Example Usage
125 | -------------
126 | Generate a spectrogram
127 | ```lua
128 | require 'audio'
129 | require 'image' -- to display the spectrogram
130 | voice = audio.samplevoice()
131 | spect = audio.spectrogram(voice, 8192, 'hann', 512)
132 | image.display(spect)
133 | ```
134 | 


--------------------------------------------------------------------------------
/audio-0.1-0.rockspec:
--------------------------------------------------------------------------------
 1 | package = "audio"
 2 | version = "0.1-0"
 3 | -- Where LuaRocks fetches the sources from.
 4 | source = {
 5 |    url = "git+https://github.com/soumith/lua---audio.git", -- GitHub no longer serves the unauthenticated git:// protocol
 6 |    tag = "master"
 7 | }
 8 | 
 9 | description = {
10 |    summary  = "Audio library for Torch-7",
11 |    detailed = [[
12 |    	    Support audio I/O (Load files)
13 | 	    Common audio operations (Short-time Fourier transforms, Spectrograms)
14 |    ]],
15 |    homepage = "https://github.com/soumith/lua---audio",
16 |    license  = "RWTFPL"
17 | }
18 | -- Rocks that must be installed before this one.
19 | dependencies = {
20 |    "torch >= 7.0",
21 |    "sys >= 1.0",
22 |    "xlua >= 1.0"
23 | }
24 | -- Out-of-source CMake build in ./build; `make install` puts the result under $(PREFIX).
25 | build = {
26 |    type = "command",
27 |    build_command = [[
28 |    cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
29 |    ]],
30 |    install_command = "cd build && $(MAKE) install"
31 | }
32 | 


--------------------------------------------------------------------------------
/audio.c:
--------------------------------------------------------------------------------
 1 | #include <TH.h>
 2 | #include <luaT.h>
 3 | #include <unistd.h>
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <string.h>
 7 | #include <math.h>
 8 | 
 9 | #include <fftw3.h>
10 | 
11 | #if LUA_VERSION_NUM >= 503 // from Lua 5.3 on, define the integer helpers used by generic/audio.c via luaL_checkinteger
12 | #define luaL_checklong(L,n)     ((long)luaL_checkinteger(L, (n)))
13 | #define luaL_checkint(L,n)      ((int)luaL_checkinteger(L, (n)))
14 | #endif
15 | 
16 | void abort_(const char * s, ...)
17 | { /* Print the printf-style message `s` and a newline to stderr, then abort(). */
18 |   va_list ap;
19 |   va_start(ap, s);
20 |   vfprintf(stderr, s, ap);
21 |   va_end(ap);
22 |   fputc('\n', stderr);
23 |   abort();
24 | }
25 | 
26 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
27 | #define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
28 | #define audio_(NAME) TH_CONCAT_3(audio_, Real, NAME) // per-type names, e.g. audio_ByteMain_init as called in luaopen_libaudio
29 | // NOTE(review): THGenerateAllTypes.h presumably re-includes generic/audio.c once per tensor type - confirm against TH.
30 | #include "generic/audio.c"
31 | #include "THGenerateAllTypes.h"
32 | 
33 | DLL_EXPORT int luaopen_libaudio(lua_State *L)
34 | {
35 |   /* Function table of each tensor type, paired with the field of the
36 |      `audio` table it is stored under. Entries are processed in order. */
37 |   static const struct {
38 |     const char *field;
39 |     const luaL_Reg *funcs;
40 |   } per_type[] = {
41 |     {"double", audio_DoubleMain__},
42 |     {"float",  audio_FloatMain__},
43 |     {"byte",   audio_ByteMain__},
44 |     {"char",   audio_CharMain__},
45 |     {"short",  audio_ShortMain__},
46 |     {"int",    audio_IntMain__},
47 |     {"long",   audio_LongMain__},
48 |   };
49 |   const int ntypes = (int)(sizeof(per_type) / sizeof(per_type[0]));
50 |   int t;
51 | 
52 |   /* Run the initialiser of every tensor type. */
53 |   audio_ByteMain_init(L);
54 |   audio_CharMain_init(L);
55 |   audio_ShortMain_init(L);
56 |   audio_IntMain_init(L);
57 |   audio_LongMain_init(L);
58 |   audio_FloatMain_init(L);
59 |   audio_DoubleMain_init(L);
60 | 
61 |   /* Create the module table; a copy of the reference becomes the global
62 |      `audio`, the original stays on the stack as the return value. */
63 |   lua_newtable(L);
64 |   lua_pushvalue(L, -1);
65 |   lua_setglobal(L, "audio");
66 | 
67 |   /* audio.<field> = fresh table filled with that type's functions */
68 |   for (t = 0; t < ntypes; ++t) {
69 |     lua_newtable(L);
70 |     luaT_setfuncs(L, per_type[t].funcs, 0);
71 |     lua_setfield(L, -2, per_type[t].field);
72 |   }
73 | 
74 |   /* One result: the module table left on top of the stack. */
75 |   return 1;
76 | }
77 | 


--------------------------------------------------------------------------------
/generic/audio.c:
--------------------------------------------------------------------------------
  1 | #ifndef TH_GENERIC_FILE
  2 | #define TH_GENERIC_FILE "generic/audio.c"
  3 | #else
  4 | 
  5 | #undef TAPI
  6 | #define TAPI __declspec(dllimport)
  7 | 
  8 | /* ---------------------------------------------------------------------- */
  9 | /* -- */
 10 | /* -- Copyright (c) 2012 Soumith Chintala */
 11 | /* --  */
 12 | /* -- Permission is hereby granted, free of charge, to any person obtaining */
 13 | /* -- a copy of this software and associated documentation files (the */
 14 | /* -- "Software"), to deal in the Software without restriction, including */
 15 | /* -- without limitation the rights to use, copy, modify, merge, publish, */
 16 | /* -- distribute, sublicense, and/or sell copies of the Software, and to */
 17 | /* -- permit persons to whom the Software is furnished to do so, subject to */
 18 | /* -- the following conditions: */
 19 | /* --  */
 20 | /* -- The above copyright notice and this permission notice shall be */
 21 | /* -- included in all copies or substantial portions of the Software. */
 22 | /* --  */
 23 | /* -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
 24 | /* -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
 25 | /* -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
 26 | /* -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE */
 27 | /* -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION */
 28 | /* -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
 29 | /* -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
 30 | /* --  */
 31 | /* ---------------------------------------------------------------------- */
 32 | /* -- description: */
 33 | /* --     audio.c - general purpose audio transforms for Torch-7 */
 34 | /* -- */
 35 | /* -- history:  */
 36 | /* --     May 26th, 2012, 5:46PM - wrote stft and channel 
 37 |                                   flattening functions - Soumith Chintala */
 38 | /* ---------------------------------------------------------------------- */
 39 | 
 40 | 
 41 | // TODO: write audio.toMono(), which converts multi-channel audio to a single channel
 42 | 
 43 | #ifndef M_PI // fallback used when M_PI has not been defined by the headers included so far
 44 | #define M_PI 3.14159265358979323846264338327950288
 45 | #endif
 46 | 
 47 | // Multiply the window_size samples of `input` in place by a tapering window.
 48 | // window_type: 1 rectangular, 2 hamming, 3 hann, 4 bartlett; other values raise a THError.
 49 | static inline void audio_(apply_window)(double *input,
 50 |                                        long window_size, int window_type) {
 51 |   long i, m = window_size - 1; // m: index of the last sample
 52 |   if (window_type < 1 || window_type > 4)
 53 |     THError("[stft_generic] Unknown window_type");
 54 |   // A rectangular window leaves the samples untouched. A one-sample window
 55 |   // (m == 0) has weight 1; the formulas below would divide by zero and yield NaN.
 56 |   if (window_type == 1 || m <= 0)
 57 |     return;
 58 |   switch (window_type) {
 59 |   case 2: // Hamming Window
 60 |     for (i = 0; i < window_size; ++i)
 61 |       input[i] *= .53836 - .46164 * cos(2 * M_PI * i / m);
 62 |     break;
 63 |   case 3: // Hann Window
 64 |     for (i = 0; i < window_size; ++i)
 65 |       input[i] *= .5 - .5 * cos(2 * M_PI * i / m);
 66 |     break;
 67 |   default: // Bartlett Window (window_type == 4, the only value left)
 68 |     for (i = 0; i < window_size; ++i)
 69 |       input[i] *= 2. / m * (m / 2. - fabs(i - m / 2.));
 70 |   }
 71 | }
 72 | 
 73 | ////////////////////////////////////////////////////////////////////////////
 74 | // Short-time Fourier transform of a single-channel signal.
 75 | // arguments [tensor, window-size, window-type (1 rect, 2 hamming, 3 hann, 4 bartlett), hop-size/stride]
 76 | // returns a new nwindows x (window_size/2 + 1) x 2 tensor of (real, imaginary) pairs; invalid arguments raise a THError
 77 | static THTensor * audio_(stft_generic)(THTensor *input,
 78 |                                        long window_size, int window_type,
 79 |                                        long stride)
 80 | {
 81 |   if (THTensor_(nDimension)(input) < 1)
 82 |     THError("[stft_generic] Input tensor is empty");
 83 |   if (THTensor_(nDimension)(input) > 1 && input->size[1] > 1)
 84 |     THError("[stft_generic] Multi-channel stft not supported");
 85 |   if (window_type < 1 || window_type > 4) // checked here, before anything is allocated
 86 |     THError("[stft_generic] Unknown window_type");
 87 |   const long length = input->size[0];
 88 |   if (window_size < 1 || stride < 1 || window_size > length)
 89 |     THError("[stft_generic] Need 1 <= window size <= input length and stride >= 1");
 90 | 
 91 |   // Use a contiguous version of the input so the samples can be indexed linearly.
 92 |   THTensor *src = THTensor_(newContiguous)(input);
 93 |   real *input_data = THTensor_(data)(src);
 94 |   const long nwindows = ((length - window_size)/stride) + 1;
 95 |   const long noutput = window_size/2 + 1;
 96 |   THTensor *output = THTensor_(newWithSize3d)(nwindows, noutput, 2);
 97 |   real *output_data = THTensor_(data)(output);
 98 |   double *buffer = malloc(sizeof(double) * window_size);
 99 |   fftw_complex *fbuffer = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*noutput);
100 |   fftw_plan plan = fftw_plan_dft_r2c_1d(window_size, buffer, fbuffer, FFTW_ESTIMATE);
101 |   long index, k, outindex=0;
102 | 
103 |   // For each frame: copy it, apply the window, run the FFT, store the bins in reverse index order.
104 |   for (index = 0; index + window_size <= length; index = index + stride) {
105 |     for (k=0; k<window_size; k++)
106 |       buffer[k] = (double)input_data[index+k];
107 |     audio_(apply_window)(buffer, window_size, window_type);
108 |     fftw_execute(plan);
109 |     for (k=0; k < noutput; k++) {
110 |       output_data[outindex + k * 2] = (real) fbuffer[noutput - k - 1][0];
111 |       output_data[outindex + k * 2 + 1] = (real) fbuffer[noutput - k - 1][1];
112 |     }
113 |     outindex += noutput *2;
114 |   }
115 | 
116 |   fftw_destroy_plan(plan);
117 |   fftw_free(fbuffer);
118 |   free(buffer);
119 |   THTensor_(free)(src); // drop the reference taken by newContiguous
120 |   return output;
121 | }
122 | 
123 | static int audio_(Main_stft)(lua_State *L) {
124 |   // Lua arguments: 1 tensor, 2 window size, 3 window type code, 4 stride.
125 |   THTensor *signal = luaT_checkudata(L, 1, torch_Tensor);
126 |   long win_len = luaL_checklong(L, 2);
127 |   int win_kind = luaL_checkint(L, 3);
128 |   long hop = luaL_checklong(L, 4);
129 |   luaT_pushudata(L, audio_(stft_generic)(signal, win_len, win_kind, hop), torch_Tensor);
130 |   return 1; // one result: the tensor returned by stft_generic
131 | }
132 | // End of STFT section
133 | ////////////////////////////////////////////////////////////////////////////
134 | 
135 | ////////////////////////////////////////////////////////////////////////////
136 | // fast Constant-Q transform as proposed in this paper:
137 | // http://www.elec.qmul.ac.uk/people/anssik/cqt/smc2010.pdf
138 | // Inspired the matlab implementation here:
139 | // http://www.eecs.qmul.ac.uk/~anssik/cqt/
140 | 
141 | static THTensor * audio_(genCQTkernel)(double fmax, int bins,
142 |                                        double fs, double q, double atomHopFactor,
143 | 				       double thres)
144 | {
145 |   // TODO: write this. No kernel is built yet and none of the arguments are used.
146 |   THTensor *output = NULL; // a defined value instead of an uninitialised pointer
147 |   return output;
148 | }
149 | 
150 | // arguments [tensor, minimum-frequency, maximum-frequency, bins-per-octave, sample-rate]
151 | // The transform is unfinished (see the TODOs), so every call currently ends in a THError.
152 | static THTensor * audio_(cqt_generic)(THTensor *input,
153 |                                        double fmin, double fmax, int bins,
154 |                                        double fs)
155 | {
156 |   // TODO: check the input to be 1-D
157 |   // input parameters
158 |   double q = 1; // default value
159 |   double atomHopFactor = 0.25; // default value
160 |   double thresh = 0.0005; // default value
161 |   THTensor* output = NULL; // nothing is computed yet; never hand out an uninitialised pointer
162 |   // winFlag = 'sqrt_blackmanharris'; // This window scheme
163 |   
164 |   // define
165 |   double octaveNr = ceil(log2(fmax/fmin));
166 |   fmin = (fmax/exp2(octaveNr)) * exp2(1/(double)bins); // set fmin to actual value
167 |   // xlen_init = length(x); // not needed for now
168 | 
169 |   // design lowpass filter
170 |   /* TODO:
171 |     if ~exist('B','var') || ~exist('A','var')
172 |     LPorder = 6; %order of the anti-aliasing filter
173 |     cutoff = 0.5;
174 |     [B A] = butter(LPorder,cutoff,'low'); %design f_nyquist/2-lowpass filter
175 |     end
176 |   */
177 |   
178 |   // design kernel for one octave
179 |   // TODO: Fill this function, right now it's empty
180 |   THTensor* cqtKernel = audio_(genCQTkernel)(fmax, bins,fs,q,atomHopFactor,thresh);
181 | 
182 |   // calculate CQT
183 |   /* TODO: 
184 |      cellCQT = cell(1,octaveNr);
185 |      maxBlock = cqtKernel.fftLEN * 2^(octaveNr-1); // largest FFT Block (virtual)
186 |      suffixZeros = maxBlock;
187 |      prefixZeros = maxBlock;
188 |      x = [zeros(prefixZeros,1); x; zeros(suffixZeros,1)]; // zeropadding
189 |      OVRLP = cqtKernel.fftLEN - cqtKernel.fftHOP;
190 |      K = cqtKernel.fKernel'; // conjugate spectral kernel for cqt transformation  
191 |   */
192 |   /* 
193 |   for (long i=1; i < octaveNr; ++i) {
194 |     xx = buffer(x,cqtKernel.fftLEN, OVRLP,'nodelay'); // generating FFT blocks
195 |     XX = fft(xx); // applying fft to each column (each FFT frame)
196 |     cellCQT{i} = K*XX; // calculating cqt coefficients for all FFT frames for this octave
197 |     if (i !=octaveNr) {
198 |       x = filtfilt(B,A,x); // anti aliasing filter
199 |       x = x(1:2:end); // drop samplerate by 2
200 |     }
201 |   }
202 |   */
203 |   
204 |   // map to sparse matrix representation
205 |   /* TODO: this should be optional, unless we want a sparse representation to save memory
206 |   spCQT = cell2sparse(cellCQT,octaveNr,bins,cqtKernel.firstcenter,cqtKernel.atomHOP,cqtKernel.atomNr);
207 |   */
208 |   
209 |   // return
210 |   /* TODO: No need to created this structure. Just returning transformed x (tensor) should be sufficient
211 |   intParam = struct('sufZeros',suffixZeros,'preZeros',prefixZeros,'xlen_init',xlen_init,'fftLEN',cqtKernel.fftLEN,'fftHOP',cqtKernel.fftHOP,'q',q,'filtCoeffA',A,'filtCoeffB',B,'firstcenter',cqtKernel.firstcenter,'atomHOP',cqtKernel.atomHOP,'atomNr',cqtKernel.atomNr,'Nk_max',cqtKernel.Nk_max,'Q',cqtKernel.Q,'rast',0);
212 |   Xcqt = struct('spCQT',spCQT,'fKernel',cqtKernel.fKernel,'fmax',fmax,'fmin',fmin,'octaveNr',octaveNr,'bins',cqtKernel.bins,'intParams',intParam);
213 |   */
214 |   THError("[cqt_generic] Constant-Q transform is not implemented yet");
215 |   return output;
216 | }
217 | 
218 | static int audio_(Main_cqt)(lua_State *L) {
219 |   // Lua arguments: 1 tensor, 2 fmin, 3 fmax, 4 bins, 5 sample rate.
220 |   THTensor *signal = luaT_checkudata(L, 1, torch_Tensor);
221 |   double f_lo = luaL_checknumber(L, 2);
222 |   double f_hi = luaL_checknumber(L, 3);
223 |   int nbins = luaL_checkint(L, 4);
224 |   long rate = luaL_checknumber(L, 5); // held in a long, so a fractional part is dropped
225 |   luaT_pushudata(L, audio_(cqt_generic)(signal, f_lo, f_hi, nbins, rate), torch_Tensor);
226 |   return 1; // one result: the tensor returned by cqt_generic
227 | }
228 | // End of CQT section
229 | ////////////////////////////////////////////////////////////////////////////
230 | 
231 | static const struct luaL_Reg audio_(Main__) [] = { // Lua name -> C function, one table per tensor type
232 |   {"stft", audio_(Main_stft)},
233 |   {"cqt", audio_(Main_cqt)},
234 |   {NULL, NULL} // sentinel entry ending the list
235 | };
236 | 
237 | void audio_(Main_init)(lua_State *L)
238 | { // Pushes the metatable of this tensor type, then registers the Main__ functions under the name "audio".
239 |   luaT_pushmetatable(L, torch_Tensor);
240 |   luaT_registeratname(L, audio_(Main__), "audio"); // NOTE(review): presumably stores them in a field "audio" of that metatable - confirm against luaT
241 | }
242 | 
243 | #endif
244 | 


--------------------------------------------------------------------------------
/generic/sox.c:
--------------------------------------------------------------------------------
  1 | #ifndef TH_GENERIC_FILE
  2 | #define TH_GENERIC_FILE "generic/sox.c"
  3 | #else
  4 | 
  5 | /* ---------------------------------------------------------------------- */
  6 | /* -- */
  7 | /* -- Copyright (c) 2012 Soumith Chintala */
  8 | /* --  */
  9 | /* -- Permission is hereby granted, free of charge, to any person obtaining */
 10 | /* -- a copy of this software and associated documentation files (the */
 11 | /* -- "Software"), to deal in the Software without restriction, including */
 12 | /* -- without limitation the rights to use, copy, modify, merge, publish, */
 13 | /* -- distribute, sublicense, and/or sell copies of the Software, and to */
 14 | /* -- permit persons to whom the Software is furnished to do so, subject to */
 15 | /* -- the following conditions: */
 16 | /* --  */
 17 | /* -- The above copyright notice and this permission notice shall be */
 18 | /* -- included in all copies or substantial portions of the Software. */
 19 | /* --  */
 20 | /* -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
 21 | /* -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
 22 | /* -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
 23 | /* -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE */
 24 | /* -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION */
 25 | /* -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
 26 | /* -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
 27 | /* --  */
 28 | /* ---------------------------------------------------------------------- */
 29 | /* -- description: */
 30 | /* --     sox.c - a wrapper from libSox to Torch-7 */
 31 | /* -- */
 32 | /* -- history:  */
 33 | /* --     May 24th, 2012, 8:38PM - wrote load function - Soumith Chintala */
 34 | /* ---------------------------------------------------------------------- */
 35 | 
 36 | void libsox_(read_audio)(sox_format_t *fd, THTensor* tensor,
 37 |                          int* sample_rate, size_t nsamples)
 38 | {
 39 |   size_t nchannels = fd->signal.channels;
 40 |   size_t buffer_size = fd->signal.length;
 41 |   if (buffer_size == 0) { // the stream reports no length: fall back to the caller's count
 42 |     if (nsamples == (size_t)-1) // callers pass -1 when they have no count either
 43 |       THError("[read_audio] Unknown length");
 44 |     buffer_size = nsamples;
 45 |   }
 46 |   if (nchannels == 0) // nchannels is used as a divisor below
 47 |     THError("[read_audio] Invalid number of channels");
 48 |   *sample_rate = (int) fd->signal.rate;
 49 |   int32_t *buffer = (int32_t *)malloc(sizeof(int32_t) * buffer_size);
 50 |   if (buffer == NULL && buffer_size > 0)
 51 |     THError("[read_audio] Failed to allocate the sample buffer");
 52 |   size_t samples_read = sox_read(fd, buffer, buffer_size);
 53 |   if (samples_read == 0) {
 54 |     free(buffer); // released first, since execution does not continue past THError
 55 |     THError("[read_audio] Empty file or read failed in sox_read");
 56 |   }
 57 |   // resize the destination to (frames x channels)
 58 |   THTensor_(resize2d)(tensor, samples_read / nchannels, nchannels );
 59 |   real *tensor_data = THTensor_(data)(tensor);
 60 |   // copy the interleaved samples; a trailing partial frame is dropped
 61 |   size_t i, nkept = (samples_read / nchannels) * nchannels;
 62 |   for (i = 0; i < nkept; i++)
 63 |     tensor_data[i] = (real)buffer[i];
 64 |   free(buffer);
 65 | }
 66 | 
 67 | void libsox_(read_audio_file)(const char *file_name, THTensor* tensor, int* sample_rate)
 68 | {
 69 |   // Open the file with libsox (no signal, encoding or file-type hints are given),
 70 |   // decode it into `tensor` via read_audio, then close the handle.
 71 |   sox_format_t *handle = sox_open_read(file_name, NULL, NULL, NULL);
 72 |   if (!handle)
 73 |     THError("[read_audio_file] Failure to read file");
 74 |   libsox_(read_audio)(handle, tensor, sample_rate, -1);
 75 |   sox_close(handle);
 76 | }
 77 | 
 78 | void libsox_(read_audio_memory)(THCharTensor *inp, THTensor* tensor,
 79 |                                 int* sample_rate, const char* extension)
 80 | {
 81 |   // Layout of `inp` (see write_audio_memory): 8-byte sample count, then the encoded audio.
 82 |   char* buffer = THCharTensor_data(inp);
 83 |   size_t buffer_size = THCharTensor_size(inp, 0);
 84 |   int64_t length;
 85 |   if (buffer_size <= 8) THError("[read_audio_memory] Input buffer is too small");
 86 |   memcpy(&length, buffer, 8);
 87 |   // Only the payload after the 8-byte prefix is handed to sox; its size excludes the prefix.
 88 |   sox_format_t *fd = sox_open_mem_read(buffer + 8, buffer_size - 8, NULL, NULL, extension);
 89 |   if (fd == NULL)
 90 |     THError("[read_audio_memory] Failure to read input buffer");
 91 |   libsox_(read_audio)(fd, tensor, sample_rate, length);
 92 |   sox_close(fd);
 93 | }
 93 | 
 94 | void libsox_(write_audio)(sox_format_t *fd, THTensor* src,
 95 | 			  const char *extension, int sample_rate)
 96 | {
 97 |   // Writes every element of the 2D tensor `src` to the open stream `fd` as a
 98 |   // 32-bit sample. `extension` and `sample_rate` are not used by this function.
 99 |   const long total = src->size[0] * src->size[1]; // nsamples * nchannels
100 |   real* data = THTensor_(data)(src);
101 |   long i; // long rather than int, so counts above INT_MAX do not overflow the index
102 | 
103 |   // Elements are taken in storage order and handed to sox one at a time;
104 |   // the first failed write raises a THError.
105 |   for (i = 0; i < total; i++) {
106 |     int32_t sample = (int32_t)(data[i]);
107 |     size_t samples_written = sox_write(fd, &sample, 1);
108 |     if (samples_written != 1)
109 |       THError("[write_audio] write failed in sox_write");
110 |   }
111 | }
112 | 
113 | void libsox_(write_audio_file)(const char *file_name, THTensor* src,
114 | 			       const char *extension, int sample_rate)
115 | {
116 |   // size[1] is read below, so anything other than a 2D tensor is rejected first.
117 |   if (THTensor_(nDimension)(src) != 2)
118 |     THError("[write_audio_file] Input should be a 2D tensor (NSamples x NChannels)");
119 |   if (THTensor_(isContiguous)(src) == 0)
120 |     THError("[write_audio_file] Input should be contiguous tensors");
121 | 
122 |   long nchannels = src->size[1];
123 |   long nsamples = src->size[0];
124 | 
125 |   // Describe the signal for libsox: rate, channel count, total sample count
126 |   // and 32-bit precision.
127 |   sox_signalinfo_t sinfo;
128 |   sinfo.rate = sample_rate;
129 |   sinfo.channels = nchannels;
130 |   sinfo.length = nsamples * nchannels;
131 |   sinfo.precision = sizeof(int32_t) * 8; /* precision in bits */
132 | #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
133 |   sinfo.mult = NULL;
134 | #endif
135 |   sox_format_t *fd = sox_open_write(file_name, &sinfo, NULL, extension, NULL, NULL);
136 |   if (fd == NULL)
137 |     THError("[write_audio_file] Failure to open file for writing");
138 | 
139 |   libsox_(write_audio)(fd, src, extension, sample_rate);
140 | 
141 |   // close the sox handle
142 |   sox_close(fd);
143 | }
144 | 
145 | void libsox_(write_audio_memory)(THCharTensor* out, THTensor* src,
146 | 				 const char *extension, int sample_rate)
147 | {
148 |   // size[1] is read below, so anything other than a 2D tensor is rejected first.
149 |   if (THTensor_(nDimension)(src) != 2)
150 |     THError("[write_audio_memory] Input should be a 2D tensor (NSamples x NChannels)");
151 |   if (THTensor_(isContiguous)(src) == 0)
152 |     THError("[write_audio_memory] Input should be contiguous tensors");
153 | 
154 |   long nchannels = src->size[1];
155 |   long nsamples = src->size[0];
156 | 
157 |   sox_format_t *fd;
158 |   char *buffer = NULL;
159 |   size_t buffer_size = -1;
160 | 
161 |   // Describe the signal for libsox: rate, channel count, total sample count and 32-bit precision.
162 |   sox_signalinfo_t sinfo;
163 |   sinfo.rate = sample_rate;
164 |   sinfo.channels = nchannels;
165 |   sinfo.length = nsamples * nchannels;
166 |   sinfo.precision = sizeof(int32_t) * 8; /* precision in bits */
167 | #if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
168 |   sinfo.mult = NULL;
169 | #endif
170 |   fd = sox_open_memstream_write(&buffer, &buffer_size, &sinfo, NULL, extension, NULL);
171 |   if (fd == NULL)
172 |     THError("[write_audio_memory] Failure to open sox object for writing");
173 | 
174 |   libsox_(write_audio)(fd, src, extension, sample_rate);
175 | 
176 |   // close the stream before `buffer` and `buffer_size` are read
177 |   sox_close(fd);
178 | 
179 |   // Prefix the encoded bytes with the number of samples, to get around a SOX bug for certain formats.
180 |   int64_t olength = nsamples * nchannels;
181 |   size_t  out_size = buffer_size + 8;
182 |   char *  out_data = (char*) malloc(out_size);
183 |   if (out_data == NULL) {
184 |     free(buffer);
185 |     THError("[write_audio_memory] Out of memory");
186 |   }
187 |   memcpy(out_data, &olength, 8);
188 |   memcpy(out_data + 8, buffer, buffer_size);
189 | 
190 |   // The storage wraps out_data. setStorage1d keeps its own reference to the
191 |   // storage, so the reference created here is dropped afterwards.
192 |   THCharStorage* out_storage = THCharStorage_newWithData(out_data, out_size);
193 |   THCharTensor_setStorage1d(out, out_storage, 0, out_size, 1);
194 |   THCharStorage_free(out_storage);
195 | 
196 |   free(buffer);
197 | }
198 | 
199 | static int libsox_(Main_load)(lua_State *L) { /* Lua: load(filename) -> tensor, sample_rate */
200 |   const char *filename = luaL_checkstring(L, 1);
201 |   int sample_rate = 0;
202 |   THTensor *tensor = THTensor_(new)();
203 |   luaT_pushudata(L, tensor, torch_Tensor); /* pushed before reading so Lua's GC owns it if reading raises */
204 |   libsox_(read_audio_file)(filename, tensor, &sample_rate);
205 |   lua_pushnumber(L, (double) sample_rate);
206 |   return 2;
207 | }
208 | 
209 | static int libsox_(Main_decompress)(lua_State *L) { /* Lua: decompress(chartensor, extension) -> tensor, sample_rate */
210 |   THCharTensor *inp = luaT_checkudata(L, 1, "torch.CharTensor");
211 |   const char *extension = luaL_checkstring(L, 2);
212 |   int sample_rate = 0;
213 |   THTensor *tensor = THTensor_(new)();
214 |   luaT_pushudata(L, tensor, torch_Tensor); /* pushed before decoding so Lua's GC owns it if decoding raises */
215 |   libsox_(read_audio_memory)(inp, tensor, &sample_rate, extension);
216 |   lua_pushnumber(L, (double) sample_rate);
217 |   return 2;
218 | }
219 | 
220 | static int libsox_(Main_save)(lua_State *L) { /* Lua: save(filename, tensor, extension, sample_rate); no results */
221 |   const char *filename = luaL_checkstring(L, 1);
222 |   THTensor *tensor = luaT_checkudata(L, 2, torch_Tensor);
223 |   const char *extension = luaL_checkstring(L, 3);
224 |   int sample_rate = luaL_checkint(L, 4);
225 |   libsox_(write_audio_file)(filename, tensor, extension, sample_rate);
226 |   return 0; /* nothing is pushed; returning 1 handed the sample_rate argument back to Lua */
227 | }
228 | 
229 | static int libsox_(Main_compress)(lua_State *L) { /* Lua: compress(out, src, extension, sample_rate); fills out, no results */
230 |   THCharTensor *out = luaT_checkudata(L, 1, "torch.CharTensor");
231 |   THTensor *src = luaT_checkudata(L, 2, torch_Tensor);
232 |   const char *extension = luaL_checkstring(L, 3);
233 |   int sample_rate = luaL_checkint(L, 4);
234 |   libsox_(write_audio_memory)(out, src, extension, sample_rate);
235 |   return 0; /* the result lives in `out`; nothing is pushed on the Lua stack */
236 | }
237 | 
238 | static const luaL_Reg libsox_(Main__)[] = /* Lua-visible name -> C function for the current Real type */
239 | {
240 |   {"load", libsox_(Main_load)},
241 |   {"save", libsox_(Main_save)},
242 |   {"compress", libsox_(Main_compress)},
243 |   {"decompress", libsox_(Main_decompress)},
244 |   {NULL, NULL} /* sentinel terminating the list */
245 | };
246 | 
247 | DLL_EXPORT int libsox_(Main_init)(lua_State *L) /* attaches Main__ under the name "libsox" for this tensor type */
248 | {
249 |   luaT_pushmetatable(L, torch_Tensor); /* metatable of the current tensor type; left on the stack */
250 |   luaT_registeratname(L, libsox_(Main__), "libsox");
251 |   // Initialize sox library (called on every Main_init, i.e. once per tensor type)
252 |   sox_format_init();
253 |   return 1;
254 | }
255 | 
256 | #endif
257 | 


--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
  1 | ----------------------------------------------------------------------
  2 | --
  3 | -- Copyright (c) 2012 Soumith Chintala
  4 | --
  5 | -- Permission is hereby granted, free of charge, to any person obtaining
  6 | -- a copy of this software and associated documentation files (the
  7 | -- "Software"), to deal in the Software without restriction, including
  8 | -- without limitation the rights to use, copy, modify, merge, publish,
  9 | -- distribute, sublicense, and/or sell copies of the Software, and to
 10 | -- permit persons to whom the Software is furnished to do so, subject to
 11 | -- the following conditions:
 12 | --
 13 | -- The above copyright notice and this permission notice shall be
 14 | -- included in all copies or substantial portions of the Software.
 15 | --
 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 23 | --
 24 | ----------------------------------------------------------------------
 25 | -- description:
 26 | --     audio - an audio toolBox, for Torch
 27 | --
 28 | -- history:
 29 | --     May 24th, 2012, 7:28PM - wrote sox wrappers - Soumith Chintala
 30 | ----------------------------------------------------------------------
 31 | 
 32 | require 'torch'
 33 | require 'sys'
 34 | require 'xlua'
 35 | require 'dok'
 36 | require 'paths'
 37 | require 'libaudio'
 38 | 
 39 | ----------------------------------------------------------------------
 40 | -- load from multiple formats
 41 | --
 42 | local function load(filename)
 43 |    -- Reads the audio file at `filename` through the libsox bindings.
 44 |    -- Returns what libsox.load yields: the tensor, then the sample rate.
 45 |    if not filename then
 46 |       print(dok.usage('audio.load',
 47 |                       'loads an audio file into a torch.Tensor', nil,
 48 |                       {type='string', help='path to file', req=true}))
 49 |       dok.error('missing file name', 'audio.load')
 50 |    end
 51 |    if not paths.filep(filename) then
 52 |       dok.error('Specified filename: ' .. filename .. ' not found', 'audio.load')
 53 |    end
 54 |    local have_sox = xlua.require 'libsox'
 55 |    if not have_sox then dok.error('libsox package not found, please install libsox','audio.load') end
 56 |    local a, sample_rate = torch.Tensor().libsox.load(filename)
 57 |    return a, sample_rate
 58 | end
 59 | rawset(audio, 'load', load)
 60 | --------------------------------------------------------------------------
 61 | -- save to multiple formats
 62 | local function save(filename, src, sample_rate)
 63 |    if not (filename and src) then
 64 |       error('filename or src tensor missing')
 65 |    end
 66 |    local have_sox = xlua.require 'libsox'
 67 |    if not have_sox then dok.error('libsox package not found, please install libsox','audio.save') end
 68 |    -- the file name's extension is passed on to libsox.save as the format name
 69 |    local extension = paths.extname(filename)
 70 |    assert(extension, 'did not find extension (like .wav or .mp3) in filename. '
 71 |              .. 'Give a filename with an extension, for example: hello.wav')
 72 |    assert(type(sample_rate) == 'number',
 73 |           'provide a sample rate (a number) such as 22050')
 74 |    src.libsox.save(filename, src, extension, sample_rate)
 75 | end
 76 | rawset(audio, 'save', save)
 77 | --------------------------------------------------------------------------
 78 | -- compress: encode `src` in the format named by `extension` (e.g. 'mp3', 'ogg')
 79 | -- and return the encoded bytes as a torch.CharTensor; inverse of audio.decompress
 80 | function audio.compress(src, sample_rate, extension)
 81 |    if not src then
 82 |       error('src tensor missing')
 83 |    end
 84 |    assert(sample_rate and type(sample_rate) == 'number', 'provide a sample rate (a number) such as 22050')
 85 |    if not extension then error('extension string missing') end -- same check as audio.decompress
 86 |    if not xlua.require 'libsox' then
 87 |       dok.error('libsox package not found, please install libsox','audio.compress')
 88 |    end
 89 |    local out = torch.CharTensor()
 90 |    src.libsox.compress(out, src, extension, sample_rate)
 91 |    return out
 92 | end
 93 | 
 94 | -- decompress: decode `src` with the format named by `extension` via libsox
 95 | function audio.decompress(src, extension)
 96 |    if not src then
 97 |       error('src tensor missing')
 98 |    end
 99 |    if not extension then
100 |       error('extension string missing')
101 |    end
102 |    local have_sox = xlua.require 'libsox'
103 |    if not have_sox then dok.error('libsox package not found, please install libsox','audio.decompress') end
104 |    -- decoding goes through the libsox table reached from a default-type tensor
105 |    local decode = torch.Tensor().libsox.decompress
106 |    return decode(src, extension)
107 | end
108 | 
109 | -- format-specific shortcuts: audio.compress / audio.decompress with a fixed extension
110 | function audio.compressMP3(src, sample_rate)
111 |    return audio.compress(src, sample_rate, 'mp3')
112 | end
113 | function audio.compressOGG(src, sample_rate)
114 |    return audio.compress(src, sample_rate, 'ogg')
115 | end
116 | 
117 | function audio.decompressMP3(src)
118 |    return audio.decompress(src, 'mp3')
119 | end
120 | function audio.decompressOGG(src)
121 |    return audio.decompress(src, 'ogg')
122 | end
123 | ----------------------------------------------------------------------
124 | -- spectrogram
125 | --
126 | local function spectrogram(...)
127 |    -- Takes exactly four arguments (input, window_size, window_type, stride) and
128 |    -- returns 10 * log(re^2 + im^2 + 0.01) of audio.stft's output, transposed.
129 |    -- NOTE(review): Tensor:log() is presumably the natural log, so the values may
130 |    -- not be true dB as the usage text says -- confirm before relying on the unit.
131 |    local input, window_size, window_type, stride
132 |    if select('#',...) == 4 then
133 |       input, window_size, window_type, stride = ...
134 |    else
135 |       print(dok.usage('audio.spectrogram',
136 |                       'generate the spectrogram of an audio. '
137 |                          .. 'returns a 2D tensor, with '
138 |                          .. 'number_of_windows x window_size/2+1, '
139 |                          .. 'each value representing the magnitude of '
140 |                          .. 'each frequency in dB', nil,
141 |                       {type='torch.Tensor',
142 |                        help='input single-channel audio', req=true},
143 |                       {type='number', help='window size', req=true},
144 |                       {type='string',
145 |                        help='window type: rect, hamming, hann, bartlett' , req=true},
146 |                       {type='number', help='stride', req=true}))
147 |       dok.error('incorrect arguments', 'audio.spectrogram')
148 |    end
149 | 
150 |    -- short-time Fourier transform of the input
151 |    local spectrum = audio.stft(input, window_size, window_type, stride)
152 | 
153 |    -- square the two slices of dimension 3 in place, sum them, offset to avoid log(0)
154 |    local re = spectrum:select(3,1)
155 |    local im = spectrum:select(3,2)
156 |    re:pow(2)
157 |    im:pow(2)
158 |    local power = re + im
159 |    power = power + 0.01
160 |    local scaled = power:log() * 10
161 |    return scaled:transpose(1,2)
162 | end
163 | rawset(audio, 'spectrogram', spectrogram)
164 | 
165 | local function stft(...)
166 |     -- Short-time Fourier transform. Takes exactly four arguments: input,
167 |     -- window_size, window_type, stride. Maps the window name to the numeric id
168 |     -- passed to the C binding and returns that binding's result unchanged.
169 |     local input, window_size, window_type, stride
170 |     if select('#',...) == 4 then
171 |         input, window_size, window_type, stride = ...
172 |     else
173 |         print(dok.usage('audio.stft',
174 |                         'calculate the stft of an audio. '
175 |                             .. 'returns a 3D tensor, with '
176 |                             .. 'number_of_windows x window_size/2+1 x 2 '
177 |                             .. ' (complex number with real and imaginary parts)', nil,
178 |                         {type='torch.Tensor',
179 |                          help='input single-channel audio', req=true},
180 |                         {type='number', help='window size', req=true},
181 |                         {type='string',
182 |                          help='window type: rect, hamming, hann, bartlett' , req=true},
183 |                         {type='number', help='stride', req=true}))
184 |         dok.error('incorrect arguments', 'audio.stft')
185 |     end
186 |     -- numeric id handed to the C binding for each supported window name
187 |     local window_ids = {
188 |         rect = 1,
189 |         hamming = 2,
190 |         hann = 3,
191 |         bartlett = 4,
192 |     }
193 |     local window_type_id = window_ids[window_type]
194 |     if not window_type_id then
195 |         dok.error('unknown window type: ' .. tostring(window_type)
196 |                       .. ' (expected rect, hamming, hann or bartlett)', 'audio.stft')
197 |     end
198 |     -- calculate stft
199 |     return torch.Tensor().audio.stft(input, window_size, window_type_id, stride)
200 | end
201 | rawset(audio, 'stft', stft)
202 | 
203 | local function cqt(...)
204 |    -- Constant-Q transform wrapper. Takes exactly five arguments, in order:
205 |    -- input, fmin, fmax, bins_per_octave, sample_rate; forwards them to the
206 |    -- C binding torch.Tensor().audio.cqt and returns its result.
207 |    local input, fmin, fmax, bins_per_octave, sample_rate
208 |    if select('#',...) == 5 then
209 |       -- strictly positional: bins_per_octave is the 4th argument and
210 |       -- sample_rate the 5th, matching the order in the usage text below
211 |       input, fmin, fmax, bins_per_octave, sample_rate = ...
212 |    else
213 |       print(dok.usage('audio.cqt',
214 |                       'calculate the constant-Q transformed audio signal. returns a [TODO: fill this description]', nil,
215 |                       {type='torch.Tensor', help='input single-channel audio', req=true},
216 |                       {type='number', help='lowest frequency of interest', req=true},
217 |                       {type='number', help='highest frequency of interest', req=true},
218 |                       {type='number', help='frequency bins per octave', req=true},
219 |                       {type='number', help='sampling rate of the input', req=true}))
220 |       dok.error('incorrect arguments', 'audio.cqt')
221 |    end
222 |    -- calculate cqt
223 |    local output = torch.Tensor().audio.cqt(input, fmin, fmax, bins_per_octave, sample_rate)
224 |    return output
225 | end
226 | rawset(audio, 'cqt', cqt)
227 | 
228 | 
229 | ----------------------------------------------------------------------
230 | -- loads voice.mp3 that is included with the repo
231 | local function samplevoice()
232 |    -- loads voice.mp3 from the directory given by sys.fpath(); the parentheses
233 |    -- keep only the first value of audio.load, so the sample rate is dropped
234 |    return (audio.load(sys.concat(sys.fpath(), 'voice.mp3')))
235 | end
236 | rawset(audio, 'samplevoice', samplevoice)
237 | 
238 | return audio
239 | 


--------------------------------------------------------------------------------
/sox.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <TH.h>
 3 | #include <luaT.h>
 4 | #include <unistd.h>
 5 | #include <stdlib.h>
 6 | #include <stdio.h>
 7 | #include <string.h>
 8 | 
 9 | #include <sox.h>
10 | 
11 | #if LUA_VERSION_NUM >= 503
12 | #define luaL_checklong(L,n)     ((long)luaL_checkinteger(L, (n)))
13 | #define luaL_checkint(L,n)      ((int)luaL_checkinteger(L, (n)))
14 | #endif
15 | 
16 | void abort_(const char * s, ...)
17 | {
18 |   va_list ap; /* printf-style message to stderr, then a newline, then abort() */
19 |   va_start(ap, s);
20 |   vfprintf(stderr, s, ap);
21 |   va_end(ap);
22 |   fputc('\n', stderr);
23 |   abort();
24 | }
25 | 
26 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
27 | #define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
28 | #define libsox_(NAME) TH_CONCAT_3(libsox_, Real, NAME)
29 | 
30 | #include "generic/sox.c"
31 | #include "THGenerateAllTypes.h"
32 | 
33 | DLL_EXPORT int luaopen_libsox(lua_State *L)
34 | {
35 |   /* Module entry point: runs every per-type Main_init, then builds the
36 |    * global table `libsox` with one sub-table of functions per tensor type
37 |    * and returns that table. */
38 |   static const struct {
39 |     const char *name;
40 |     const luaL_Reg *funcs;
41 |   } per_type[] = {
42 |     {"double", libsox_DoubleMain__},
43 |     {"float",  libsox_FloatMain__},
44 |     {"byte",   libsox_ByteMain__},
45 |     {"char",   libsox_CharMain__},
46 |     {"short",  libsox_ShortMain__},
47 |     {"int",    libsox_IntMain__},
48 |     {"long",   libsox_LongMain__},
49 |   };
50 |   size_t i;
51 | 
52 |   /* per-type registration (each call also runs sox_format_init) */
53 |   libsox_ByteMain_init(L);
54 |   libsox_CharMain_init(L);
55 |   libsox_ShortMain_init(L);
56 |   libsox_IntMain_init(L);
57 |   libsox_LongMain_init(L);
58 |   libsox_FloatMain_init(L);
59 |   libsox_DoubleMain_init(L);
60 | 
61 |   /* new table, published as the global `libsox`; the duplicate reference
62 |    * stays on the stack and is what this function returns */
63 |   lua_newtable(L);
64 |   lua_pushvalue(L, -1);
65 |   lua_setglobal(L, "libsox");
66 | 
67 |   /* libsox.<name> = table filled from that type's function list */
68 |   for (i = 0; i < sizeof(per_type) / sizeof(per_type[0]); i++) {
69 |     lua_newtable(L);
70 |     luaT_setfuncs(L, per_type[i].funcs, 0);
71 |     lua_setfield(L, -2, per_type[i].name);
72 |   }
73 | 
74 |   /* the libsox table is on top of the stack */
75 |   return 1;
76 | }
77 | 


--------------------------------------------------------------------------------
/test/load_and_save_example.lua:
--------------------------------------------------------------------------------
1 | require 'audio'
2 | 
3 | t, sample_rate = audio.load('voice.mp3') -- relative path: run from a directory containing voice.mp3
4 | print(#t)
5 | print(sample_rate)
6 | audio.save('test.wav', t, sample_rate) -- audio.save derives the format from the file name's extension
7 | 
8 | 


--------------------------------------------------------------------------------
/test/spectrogram.lua:
--------------------------------------------------------------------------------
1 | require 'audio'
2 | require 'image' -- to display the spectrogram
3 | voice = audio.samplevoice() -- the voice.mp3 sample loaded by audio.samplevoice
4 | spect = audio.spectrogram(voice, 8192, 'hann', 512) -- window size 8192, hann window, stride 512
5 | image.display(spect)
6 | 


--------------------------------------------------------------------------------
/test/test_decompress.lua:
--------------------------------------------------------------------------------
 1 | require 'audio'
 2 | m=audio.samplevoice()
 3 | print(m:nElement())
 4 | print('ok')
 5 | o = audio.compress(m, 22050, 'ogg') -- NOTE(review): 22050 is hard-coded, not read from the file -- confirm it matches voice.mp3
 6 | print(torch.type(o))
 7 | print(o:nElement())
 8 | print('compressed')
 9 | -- outf = torch.DiskFile("www.ogg", "w"):binary()
10 | -- outf:writeChar(o:storage())
11 | -- outf:close()
12 | m2 = audio.decompress(o, 'ogg') -- round trip: decode the bytes produced above
13 | 


--------------------------------------------------------------------------------
/voice.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soumith/lua---audio/d61eab4ffd9b3ef8218d367689236566e44e8f82/voice.mp3


--------------------------------------------------------------------------------