├── .gitignore
├── Makefile
├── README.md
├── __init__.py
├── audio_batch.py
├── batching-proof.ipynb
├── conv.py
├── conv_no_mid.ipynb
├── conv_same_filter_vars.ipynb
├── convert_data.py
├── data
│   ├── lowpass
│   │   ├── post
│   │   │   ├── beethoven_opus10_1.npz
│   │   │   ├── plysaw.npz
│   │   │   ├── sin.npz
│   │   │   └── square.npz
│   │   └── pre
│   │       ├── beethoven_opus10_1.npz
│   │       ├── plysaw.npz
│   │       ├── sin.npz
│   │       └── square.npz
│   └── unknown
│       ├── post
│       │   └── beethoven_opus10_1.npz
│       └── pre
│           └── beethoven_opus10_1.npz
├── linear.py
├── lowpass.py
├── modules
│   ├── __init__.py
│   └── wavio.py
├── non_linear.py
├── playground.ipynb
├── resources
│   ├── chello_amplitute.svg
│   ├── chello_frequency.png
│   ├── chello_frequency.svg
│   ├── frequency_time_data.png
│   ├── microcontrollers_fft_example.png
│   ├── play.png
│   ├── sample-rate.png
│   └── spectrogram.jpg
├── runner.py
├── sound_files
│   ├── README.md
│   ├── UNMODIFIED - beethoven_opus10_1.mp3
│   ├── beethoven_opus10_1_format0.mid
│   ├── lowpass - post - beethoven_opus10_1.wav
│   ├── lowpass - post - beethoven_opus10_1_mono.wav
│   ├── lowpass - post - plysaw.wav
│   ├── lowpass - post - sin.wav
│   ├── lowpass - post - square.wav
│   ├── lowpass - pre - beethoven_opus10_1.wav
│   ├── lowpass - pre - beethoven_opus10_1_mono.wav
│   ├── lowpass - pre - plysaw.wav
│   ├── lowpass - pre - sin.wav
│   ├── lowpass - pre - square.wav
│   ├── out
│   │   ├── conv_diff_filter_vars_hw50_fs50_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=0_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=10_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=15_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=200_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=20_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=30_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=40_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=50_beethoven_opus10_generated.wav
│   │   ├── conv_epoch=5_beethoven_opus10_generated.wav
│   │   ├── conv_no_mid_hw50_fs50_beethoven_opus10_generated.wav
│   │   ├── conv_same_filter_vars_hw50_fs50_beethoven_opus10_generated.wav
│   │   ├── linear_epoch=20_beethoven_opus10_generated.wav
│   │   ├── linear_epoch=4000_beethoven_opus10_generated.wav
│   │   ├── non_lin_epoch=20_beethoven_opus10_generated.wav
│   │   └── non_lin_epoch=4000_beethoven_opus10_generated.wav
│   ├── pedals - post - beethoven_opus10_1.wav
│   ├── pedals - post - plysaw.wav
│   ├── pedals - post - sin.wav
│   ├── pedals - post - square.wav
│   ├── pedals - pre - beethoven_opus10_1.wav
│   ├── pedals - pre - plysaw.wav
│   ├── pedals - pre - sin.wav
│   ├── pedals - pre - square.wav
│   ├── unknown - post - beethoven_opus10_1.wav
│   └── unknown - pre - beethoven_opus10_1.wav
├── test.py
└── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | .vscode/
92 |
93 | fuckery/
94 | tmp/
95 |
96 | README-printing.pdf
97 | README.pdf
98 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | all: readme printing
2 |
3 | readme:
4 | pandoc -f markdown_github -o README.pdf README.md --variable=geometry:"margin=0.75in" --highlight-style=zenburn --variable=colorlinks:true --variable=papersize:"letter" --variable=fontsize:"10pt"
5 |
6 | printing:
7 | pandoc -f markdown_github -o README-printing.pdf README.md --variable=geometry:"margin=0.75in" --highlight-style=zenburn --variable=colorlinks:true --variable=papersize:"letter" --variable=fontsize:"10pt" --variable=links-as-notes:true
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TensorFlow Signal Processing
2 | by Julie Shapiro
3 |
4 | # UPDATE (2018/03/31)
5 | I've received some questions about this paper recently that make me think people are mistaking it for a functional project. I should make it clear that this is merely a paper describing problem complexity, and as such it makes some choices that I would not recommend to anyone looking to do this kind of stuff in an actual tool. The notes at the end, especially about MSE and error measurement in general, combined with the fact that I'm not using an FFT setup, really limit what this can do at the moment.
6 |
7 | That said, I'm currently working on something pretty cool that tries to do effect emulation in a much better way. My new TitanXP is just waiting for me to finish my RNN reading :)
8 |
9 | # Problem Overview
10 |
11 | The objective of this project is to evaluate the effectiveness of audio effect emulation using deep learning. For audio, there are two main classifications of tools: generators and effects. A generator is something which takes non-audio input, either physical or MIDI, and creates audio out of it. This includes instruments, synthesizers, and drums: basically anything that stands out as being traditionally musical. The other category, effects, covers elements which take audio as input and transform it into some other audio to output. This can range from a simple filter to more complex effects such as distortion or reverb; even the echo of a room or the quality drop over a phone call is an effect. The idea behind this project is to see if we can train a network to emulate one of these effects using deep learning.
12 | Audio is an interesting medium for machine learning. Like image data, the output can be judged both quantitatively and qualitatively. On top of this, audio itself has a complex structure: the additive property of waves can cause some unforeseen outcomes. Beyond that, digital audio data is inherently convoluted: it is stored as a time series of points which are sampled from the audio signal itself. These points are fast Fourier transformed back into the signal whenever the audio is ready to be output. Because of this, a lot of the information which is affected by effects is hidden behind this signal processing problem.*
13 | In the past, doing signal processing in machine learning involved some manual decomposition of the input in order to abstract away the signal processing [1]. Often audio would be rendered into images of its spectrogram, which show the frequency distribution of the audio. While this works fine for classification problems, it seems unnecessary for an end-to-end problem like the one this paper is focused on. For that, we need to do actual signal processing in order to detect the features that matter.
14 |
15 | *Note - A lot of effects can still be applied as transformations to the untransformed wave, but it's often the case that the effect is significantly easier to implement in the frequency space.
16 |
17 | The current progress on this project is available at [github.com/jshap70/TensorFlow-Signal-Processing](http://github.com/jshap70/TensorFlow-Signal-Processing)
18 |
19 |
20 |
21 | ## Sample Types
22 |
23 | Previously I mentioned that audio is a conceptually complex structure. This is because audio data is a time series of the amplitude of the audio; however, almost all of the information that we think of as being "stored" in sound lives in its frequency space. The relationship between the two is extracted by using a Fourier transform. An example can be seen below, where the time series data on the left would produce the frequency chart on the right.
24 |
25 | [2]
26 |
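To make that relationship concrete, here's a minimal sketch (my own illustration, not part of the project code) that uses numpy to recover the dominant frequency of a pure tone from its time series samples:

```python
import numpy as np

fs = 44100                          # sample rate (Hz), same as the project's wav files
t = np.arange(fs) / fs              # one second of sample times
wave = np.sin(2 * np.pi * 440 * t)  # a 440 Hz sine wave (concert A)

# the Fourier transform moves us from the time domain to the frequency domain
spectrum = np.abs(np.fft.rfft(wave))
freqs = np.fft.rfftfreq(len(wave), d=1 / fs)

print(freqs[spectrum.argmax()])     # -> 440.0
```
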
27 | However, this is an oversimplification. In reality, the frequency chart adds a dimension to the data, so representing it in 2D space means that the frequency chart above is only valid for a small time cross section of the audio. A real frequency distribution of the sound would look like the following.
28 |
29 | [3]
30 |
31 | And in fact this is what most machine learning on audio trains on, except that instead of having a height in the amplitude dimension, the image's color channels and color intensity represent it. This type of representation is called a spectrogram. Spectrograms actually store three-dimensional data, with frequency shown in the vertical direction, amplitude shown as color intensity, and time shown along the horizontal axis. You can see an example below.
32 |
33 | [4]
34 |
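For reference, a spectrogram like this is built by taking Fourier transforms over a sliding window; a minimal sketch using `scipy.signal.spectrogram` (my own illustration; this project deliberately skips this kind of preprocessing):

```python
import numpy as np
from scipy import signal

fs = 44100
t = np.arange(fs * 2) / fs
# a tone that sweeps upward in pitch, so the spectrogram has structure over time
wave = np.sin(2 * np.pi * (220 + 110 * t) * t)

# Sxx holds the signal power at each (frequency bin, time slice) pair
freqs, times, Sxx = signal.spectrogram(wave, fs=fs)
print(Sxx.shape)  # (frequency bins, time slices); power is the third dimension
```
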
35 | That is why the goal of this project is to attempt to have the network learn the frequency-amplitude relationship on its own, so that we can skip the step which manually extracts the important features.
36 | Digital audio data is stored as sampled points from the amplitude vs time graph, which is to be expected given that it's the direct form (albeit with a Fourier transform) that the output needs to be. A basic example can be seen below.
37 |
38 | [5]
39 |
40 | The audio used in this project has a uniform sample rate, which allows us to batch it more easily.
41 |
42 |
43 | # The Network
44 |
45 | The plan to teach the network how to interpret the audio data needed to address two main concerns: first, it needed to be able to look at the audio and extract frequency data from it, and second, it needed to be able to "undo" this operation so that the data could be converted back into audio.
46 | As far as the first problem is concerned, it's possible for us to add time as a dimension to the audio data, similar to the frequency spectrogram concept above. In that model, time is represented as part of the image by being one of its axes. In this way, the 2 dimensional instantaneous frequency plot becomes a 3 dimensional image. For our data, we have one dimension of data: amplitude. By adding time as a dimension, batching the data in contiguous time chunks, we can attempt to expose the network to patterns in the data. Or at least that's the idea.
47 | The second major issue deals with making the system end-to-end. We are looking to be able to take the output of the network, write it to a file, and play it back without having to take any extra steps. For a linear or nonlinear network, this isn't really an issue: at any point they should natively be able to transform the result into a readable format. However, for a convolutional network, which introduces extra depth in the network, it's necessary to have a convolutional transpose layer. This type of layer is sometimes referred to as a 'deconvolutional' layer, however it's important to note that this is actually a misnomer, as deconvolution is a completely different process which is used in computer vision. Regardless of the terminology, a convolutional transpose layer takes layers which have been convolved and attempts to transform the data back into more meaningful data. In our case, it changes the output back into the amplitude graph. The cool thing about convolutional transpose layers is that we can reuse the exact same filter variables from our original convolutional layer in the transpose layer. This significantly lessens the training difficulty of the network.
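Here's a condensed sketch of that convolve-then-transpose structure, following the same `tf.nn.conv2d` / `tf.nn.conv2d_transpose` calls used in this repo's `conv.py` (the shapes are example values; note the transpose layer reuses `w0` instead of declaring a new filter variable):

```python
import tensorflow as tf

filter_size, layer_width = 50, 50

# batches of mono audio samples; conv2d wants 4-D tensors, so add a height-1 dim
x = tf.placeholder(tf.float32, shape=[None, 44100, 1], name='x')
x_4 = tf.expand_dims(x, 1)

# convolution: slide `layer_width` filters, each `filter_size` samples wide
w0 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=0.1))
hidden = tf.nn.relu(tf.nn.conv2d(x_4, w0, strides=[1, 1, 1, 1], padding='SAME'))

# transpose: reuse w0 to map the hidden channels back to one amplitude channel
batch_size = tf.shape(x_4)[0]
out = tf.nn.conv2d_transpose(
    hidden, w0,
    output_shape=tf.pack([batch_size, 1, 44100, 1]),  # tf.stack in newer TensorFlow
    strides=[1, 1, 1, 1],
    padding='SAME')
```
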
48 | With this in mind, we'll move on to the main design.
49 |
50 | ## Layer Design
51 | Intuitively, it would make sense that a linear network would most likely not be able to properly model this problem; the data is probably too complex for a linear model to interpret. However, I still wanted to form a baseline to see just what kind of benefit we would achieve by moving to a more advanced network.
52 | So to start, I used a standard fully connected linear regression neural network, varying the depth of the hidden layer to find something that seemed reasonable to train. The goal of this network was to try to overfit the training data to show that it can at least be brute forced. With the standard training set I was using, these networks were taking upwards of 4,000 epochs to train.
53 | Moving past the basic networks, it seems somewhat intuitive that this problem would be decently well represented by a convolutional network, because of its ability to train filters on sections of the data. If these filters are large enough to detect full oscillations, it may be able to extract some relevant frequency data. As mentioned previously, any time we use a convolutional layer we will have to use a convolutional transpose layer on the output, and we can reuse the exact same filter variables from the original convolutional layer in the transpose layer, which significantly lessens the training difficulty of the network.
54 | So far we've built up a system which should be able to look at the data in a fashion which is potentially more true to the frequency space of the data. Now, all we need to do to finish off this basic setup is to place a fully connected layer in between the convolution and convolutional transpose layers.
55 |
56 | ## Sampling and Batching
57 |
58 | Looking at the data itself, the wav files are stereo 16-bit integer PCM files. To begin with, I converted the data to 32-bit float wav files and normalized the audio to fit within that standard. I split each file apart into mono tracks because it allows us to experiment with different network designs a lot faster. However, there are filters which have different effects across stereo channels, so we lose the ability to train on those for now.
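The integer-to-float conversion itself is just an offset and a scale by the integer type's range; this repo's `convert_data.py` does essentially the following (helper name mine):

```python
import numpy as np

def pcm_to_float(data):
    """Normalize 8/16/24-bit integer PCM samples into [-1.0, 1.0) float32."""
    i = np.iinfo(data.dtype)
    abs_max = 2 ** (i.bits - 1)  # e.g. 32768 for 16-bit audio
    offset = i.min + abs_max     # 0 for signed types; recenters unsigned ones
    return (data.astype(np.float32) - offset) / abs_max
```
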
59 | The audio we are training on is a set of sine, square, and saw waves which vary through a range of frequencies. Although these waves are very basic, the idea is that these simpler audio samples might help the network learn frequency analysis more easily. The validation data is split off from the training dataset, but the testing data is entirely different. Although the testing data uses the same filter, it tests how the network performs when given a much more difficult problem: a piano. The idea is that a piano has a much more complex wave shape, so it will be a better test of how well the network understands the problem.
60 |
61 | Because it is time series data, the batching process is a bit trickier. Although the data needs to be kept in contiguous chunks, we can still extract smaller sections of it to train on independently, to ensure the network is trained uniformly. To do this, I implemented a batching system that does a scrolling window selection of the audio for discrete time periods, and then I shuffle those batches for every epoch. If the offset between adjacent windows is smaller than the window length, the windows overlap, which further increases the number of available batches.
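A condensed version of that windowing logic, mirroring the loop in `make_batch` from this repo's `audio_batch.py` (function name mine):

```python
import numpy as np

def sliding_windows(audio, window_len, window_offset):
    """Slice overlapping, fixed-length windows out of a mono audio track."""
    n = (audio.shape[0] - window_len) // window_offset
    windows = np.zeros([n, window_len])
    for i in range(n):
        start = i * window_offset
        windows[i] = audio[start:start + window_len]
    return windows

# e.g. 1-second windows every 0.1 s at 44.1kHz: each sample lands in ~10 windows
batches = sliding_windows(np.random.randn(44100 * 10), 44100, 4410)
```
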
62 |
63 | Side note - It might seem at first that we would want to take sections of the data at small enough intervals to only allow for a handful of oscillations in the data. This might ensure that the network would get an idea of the instantaneous frequency data. But in reality this will not work. The issue is that the length of an oscillation is determined directly by the pitch, so if the pitch changes, the window might cut off parts which are needed to extract the data. This is another reason why we must rely on the convolutional filters to slice the data for us.
64 |
65 | ## Training data
66 |
67 | The training data for this project is mostly composed of simple, generated audio samples which cover varying types of sound and pitches. On the simpler side, we have sine, square, and saw waves that move through a frequency range. Starting off I just use a lowpass effect (which cuts off high frequencies) as the filter, but later I used a more complex effect made with some pedal effects. Unfortunately I have not had enough time to do very in-depth testing with the latter yet, so this paper will not be able to cover it.
68 | An example of the lowpass training data can be heard here:
69 | input: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/lowpass%20-%20pre%20-%20square.wav)
70 | output: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/lowpass%20-%20post%20-%20square.wav)
71 |
72 |
73 | # Results
74 |
75 | Before we look at the various networks themselves, let's look at the expected input and output.
76 | input: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/lowpass%20-%20pre%20-%20beethoven_opus10_1_mono.wav)
77 | expected output: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/lowpass%20-%20post%20-%20beethoven_opus10_1_mono.wav)
78 |
79 | Note that the generated testing outputs will all have a slight 'tick' to them about every half second. This is a result of my hackish batching system for testing. Essentially it's just the padding on the convolution resetting the audio data to 0.0 at the edges of the batches, so the audio clicks as the output snaps abruptly to this value. Given time, I could have written one that used a sliding window system similar to the training data and eliminated this noise.
80 |
81 | ## Linear Regression
82 |
83 | First up, the results of the linear network.
84 |
85 | Based on trial and error, I found the linear network converged best when the hidden layer had around 1000 nodes; however, a network of this size is entirely unrealistic. It took almost 5 hours to train this network, and that was on a very powerful machine.
86 | Regardless, after setting up the network, I ran it and got the following output after training it a little bit.
87 |
88 | Predicted Output of Linear Network @ 20 epochs: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/linear_epoch=20_beethoven_opus10_generated.wav)
89 |
90 | Strangely it's just white noise; maybe we just need to train it more.
91 |
92 | Predicted Output of Linear Network @ 4000 epochs: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/linear_epoch=4000_beethoven_opus10_generated.wav)
93 |
94 | Well, it at least sounds like audio now. However, it only really sounds like the training data; nothing from the testing data is really retained at all. Let's look at some of the numbers behind it and see if they tell us why it's as bad as it is.
95 |
96 | ```python
97 | x, y, P, MSE, sess = run_lin(1000, 4000)
98 | run_test(x, y, P, MSE, sess, run_name='best_linear')
99 |
100 | mse rmse std
101 | epoch training validation training validation training validation reference
102 | 4000 0.00342 0.00327 0.05847 0.05722 0.05885 0.05723 0.10412
103 | test mse: 0.00119 test rmse: 0.03446 test std: 0.03447
104 | ```
105 |
106 | Surprisingly, the training and validation MSEs are higher than the testing ones. Given how much the testing output sounds like the training set, you would expect the testing error to be larger than the training error because of overfitting. This is one of the first indications that MSE may not be the best judge of accuracy for this problem, but more on that later.
107 |
108 | ## Nonlinear Regression
109 |
110 | It's obvious linear regression isn't going to cut it, so let's move on to nonlinear regression. Similar to the linear network, I found that the nonlinear networks also only converged when their hidden layers had 1000 nodes.
111 |
112 | Predicted Output of Nonlinear Network @ 20 epochs: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/non_lin_epoch=20_beethoven_opus10_generated.wav)
113 |
114 | It's somehow worse than the linear one. However, generally nonlinear networks are more difficult to train, so let's try that again but with more epochs.
115 |
116 | Predicted Output of Nonlinear Network @ 4000 epochs: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/non_lin_epoch=4000_beethoven_opus10_generated.wav)
117 |
118 | It still sounds just about the same as the linear output. The numbers tell a very similar story to the audio.
119 |
120 | ```python
121 | x, y, P, MSE, sess = run_nonlin(1000, 4000)
122 | run_test(x, y, P, MSE, sess, run_name='non_lin_epoch=%d' % 4000)
123 |
124 | mse rmse std
125 | epoch training validation training validation training validation reference
126 | 4000 0.00471 0.00363 0.06866 0.06024 0.06897 0.06025 0.10219
127 | test mse: 0.00144 test rmse: 0.03792 test std: 0.03792
128 | ```
129 |
130 | Overall, the nonlinear output fits pretty well with what we expected: the data is too complex for it to figure out the underlying model.
131 |
132 | ## Convolution
133 |
134 | This brings us to our last network, convolution. One of the side benefits of using this convolutional network is that the middle hidden layer can be significantly smaller than it was on the linear and nonlinear networks. Where the previous networks needed 1000 inner nodes, this network only needs 50. This drastically cuts back on the time needed to train the network.
135 | I generated testing results as I trained the network so we could see what effect the increased training had on it.
136 |
137 | ```python
138 | mse rmse std
139 | epoch training validation training validation training validation reference
140 | 0 0.01625 0.01399 0.12748 0.11827 0.08833 0.07587 0.11131
141 | test mse: 0.00829 test rmse: 0.09110 test std: 0.02744
142 | 5 0.00202 0.00542 0.04490 0.07361 0.04497 0.07355 0.09906
143 | test mse: 0.00021 test rmse: 0.01464 test std: 0.01371
144 | 10 0.00253 0.00120 0.05026 0.03461 0.05052 0.03447 0.10856
145 | test mse: 0.00014 test rmse: 0.01181 test std: 0.01087
146 | 15 0.00194 0.00235 0.04408 0.04844 0.04439 0.04842 0.10380
147 | test mse: 0.00010 test rmse: 0.01023 test std: 0.01024
148 | 20 0.00168 0.00280 0.04100 0.05291 0.04124 0.05290 0.10097
149 | test mse: 0.00011 test rmse: 0.01061 test std: 0.00992
150 | 30 0.00180 0.00212 0.04244 0.04603 0.04261 0.04604 0.10356
151 | test mse: 9.1680e-05 test rmse: 0.00957 test std: 0.00951
152 | 40 0.00179 0.00208 0.04229 0.04556 0.04245 0.04556 0.10453
153 | test mse: 8.9273e-05 test rmse: 0.00945 test std: 0.00945
154 | 50 0.00153 0.00280 0.03918 0.05287 0.03933 0.05289 0.10946
155 | test mse: 9.0900e-05 test rmse: 0.00953 test std: 0.00950
156 | ```
157 | And here are the predicted outputs from the convolutional network:
158 | * `epoch = 00` : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_epoch=0_beethoven_opus10_generated.wav)
159 | * `epoch = 05` : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_epoch=5_beethoven_opus10_generated.wav)
160 | * `epoch = 10` : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_epoch=10_beethoven_opus10_generated.wav)
161 | * `epoch = 20` : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_epoch=20_beethoven_opus10_generated.wav)
162 | * `epoch = 50` : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_epoch=50_beethoven_opus10_generated.wav)
163 | * `epoch = 200` : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_epoch=200_beethoven_opus10_generated.wav)
164 |
165 | First of all, this is the first time we have actually had audio properly output from the network, so we're off to a good start. Next, it's clear that the effect is not only being emulated, the network is actually doing a fairly good job of it as the number of epochs increases. The cooler thing is just how good the emulation actually is: although it's not anything amazing, and it tends to have some crackle, it does a fairly decent job of capturing the effect.
166 |
167 | After getting some good results, I did some experimenting with the network to see if I couldn't back up some of my original conjectures. The first sample below uses the convolutional network from above; the rest use networks modified in the ways listed.
168 |
169 | * Unmodified baseline: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_same_filter_vars_hw50_fs50_beethoven_opus10_generated.wav)
170 |
171 |
172 | * No hidden layer : [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_no_mid_hw50_fs50_beethoven_opus10_generated.wav)
173 |
174 | Firstly, I tried to see what would happen if I removed the hidden middle layer. Although the network still seems to understand the audio format, as shown by the fact that it's not outputting white noise like the previous examples, it is significantly worse at applying the filter's effect. This seems fairly logical, as for this network the filtering effect would have to be trained into the convolutional layer itself.
175 |
176 | * Different filters for convolution and the convolutional transpose: [](https://github.com/jshap70/TensorFlow-Signal-Processing/blob/master/sound_files/out/conv_diff_filter_vars_hw50_fs50_beethoven_opus10_generated.wav)
177 |
178 | As expected, when the network was set up with different filters it seemed significantly more resistant to training. This seems logical, given that it now has almost double the variables. Furthermore, whenever it makes an adjustment to any of the variables in either filter, it then has to spend time training the other filter to follow suit. This fighting between variables can dramatically increase the time needed to train the network.
179 |
180 |
181 | # Final Thoughts
182 |
183 | ### MSE
184 | One thing I'm not quite sure about is why the MSE is such a bad judge of output quality. Taking a look at the MSE values in the [convolution](#convolution) output, we can see that, similar to the values in the [linear](#linear-regression) and [nonlinear](#nonlinear-regression) networks, the testing error is quite significantly lower than the training error. Furthermore, given that the outputs of the first two networks were completely incorrect and the convolutional network's was correct, you would expect a drastic decrease in error on that output. Yet the difference is actually pretty small.
185 | My guess as to why this value is not a good estimate of error is related to how the audio is constantly making small oscillations. I believe it's possible for an output to be very close to the correct audio yet produce a larger error than one which causes the audio to fall apart. I wonder if the loss used to train the network could actually do raw frequency analysis of its own in order to guide the training, though that may be seen as "cheating" given the scope of the problem.
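As a toy illustration of that suspicion (my own example, not from the project): a quarter-period phase shift of a pure tone is indistinguishable by ear, yet it scores a far worse MSE than halving the volume, which is immediately audible:

```python
import numpy as np

fs = 44100
t = np.arange(fs) / fs
tone = np.sin(2 * np.pi * 440 * t)

shifted = np.sin(2 * np.pi * 440 * t + np.pi / 2)  # inaudible change for a steady tone
quieter = 0.5 * tone                               # clearly audible change

print(np.mean((tone - shifted) ** 2))  # ~1.0
print(np.mean((tone - quieter) ** 2))  # ~0.125
```
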
186 |
187 | ### Network Complexity
188 | Given more time, I would have worked on a system which used different convolutional layers with different filter sizes in order to give the fully connected layer even more information when applying its effect.
189 | After building up a sufficiently well trained convolutional network to extract the audio features, I would then try to extract those layers from the network and see if we couldn't train a new middle hidden layer significantly more easily. This would allow us to emulate effects that we don't have a lot of training information on, such as short sound clips.
190 |
191 | ## End
192 | Overall I think this project was a decent success. Going forward I want to continue to experiment with trying to build a more complex convolutional system, but the time scale of this project simply didn't allow for that.
193 |
194 | Thanks for reading!
195 |
196 | #### Notes / Sources
197 |
198 | [1] At least this is true for most practical applications. An example can be seen here: [github.com/markostam/audio-deepdream-tf](https://github.com/markostam/audio-deepdream-tf)
199 |
200 | [2] Image showing the relationship between time series and frequency data. Source: [learn.adafruit.com/fft-fun-with-fourier-transforms/background](https://learn.adafruit.com/fft-fun-with-fourier-transforms/background)
201 |
202 | [3] This image is heavily modified from the source, but still it originally came from: [processing.org/tutorials/sound/](https://processing.org/tutorials/sound/)
203 |
204 | [4] Spectrogram image from: [dwutygodnik.com/artykul/673-uwaznosc-fraktale-spektra-modele.html](http://www.dwutygodnik.com/artykul/673-uwaznosc-fraktale-spektra-modele.html)
205 |
206 | [5] Image showing how digital audio data is stored. WARNING: THIS SOURCE COULD BE DANGEROUS! [Google Chrome now blocks this site for suspected phishing attacks, proceed at your own risk!](https://transparencyreport.google.com/safe-browsing/search?url=http:%2F%2Fprogulator.com%2Fdigital-audio%2Fsampling-and-bit-depth%2F) Source: [progulator(.)com/digital-audio/sampling-and-bit-depth/](#link_removed) Also, note that there are some very large errors in this article. Most importantly, it fails to cover how Fourier transforms are used to go from the digital point sampling back to the analog signal, and it makes the common mistake of assuming the data is directly interpreted via an averaging operation.
207 |
208 |
209 | [misc]
210 | some audio midi from - http://www.piano-midi.de/brahms.htm
211 | wavio - https://github.com/mgeier/python-audio/ - by Warren Weckesser
212 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/__init__.py
--------------------------------------------------------------------------------
/audio_batch.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from util import min_batch
4 |
5 | def make_batch(audio_in, audio_out, n, batch_length, sample_offset):
6 | """This is a tad more complicated than normal b/c batches can overlap"""
7 | if n * sample_offset > audio_in.shape[0] - batch_length:
8 | raise ValueError('too many batches %d %d' % (n * sample_offset, audio_in.shape[0] - batch_length))
9 |
10 | if audio_out.shape[0] != audio_in.shape[0]:
11 | raise ValueError('audio in and audio out are not the same length')
12 |
13 | # perm = np.arange(n)
14 | input_set = np.zeros([n, batch_length])
15 | output_set = np.zeros([n, batch_length])
16 |
17 | offset = 0
18 | for i in range(n):
19 | input_set[i] = audio_in[offset:offset + batch_length]
20 | output_set[i] = audio_out[offset:offset + batch_length]
21 | offset += sample_offset
22 |
23 | return input_set, output_set
24 |
25 |
26 | def batch_audio(audio_in, audio_out, seconds, offset=None):
27 | """Automatically batch the audio into sections of length `seconds`
28 |
29 | returns batch set and validation set"""
30 |
31 | ### basic time calculations
32 | # assume 44.1khz wav file
33 | sample_length = int(44100 * seconds)
34 |
35 | if offset is None:
36 | # give it some arbitrary separation
37 | # an offset of 0.1 with 1 second means each value is used in 10 batches
38 | offset = min(seconds, 0.1)
39 | print('using an offset of', offset, 'seconds')
40 | sample_offset = int(44100 * offset)
41 |
42 |
43 | ### calculate number of slices
44 | n = int((audio_in.shape[0] - sample_length) / sample_offset)
45 |
46 | return make_batch(audio_in, audio_out, n, sample_length, sample_offset)
47 |
48 |
49 |
50 | def get_valid(audio_in, audio_out, seconds, valid_percent):
51 | """Extracts a validation set from the audio data"""
52 | # assume 44.1khz wav file
53 | sample_length = int(44100 * seconds)
54 |
55 | input_set = np.copy(audio_in)
56 | output_set = np.copy(audio_out)
57 |
58 | # ### Cut out long periods of blank
59 | # FIXME: this has bugs b/c `input_set[:start]` doesn't fix the offset of what's removed
60 | # start = None
61 | # # seems space efficient /s
62 | # for i in range(audio_in.shape[1]):
63 | # if (audio_in[0,i] == 0) and (audio_in[1,i] == 0) and (audio_out[0,i] == 0) and (audio_out[1,i] == 0):
64 | # if start is None:
65 | # start = i
66 | # else:
67 | # if start is not None and start - i > 1:
68 | # input_set = np.append(input_set[:start], input_set[i:])
69 | # output_set = np.append(output_set[:start], output_set[i:])
70 |
71 | ### split off testing set
72 | num_discrete = int((input_set.shape[0] - sample_length) / sample_length)
73 | if valid_percent >= 1:
74 | raise ValueError('Invalid sample validation percentage')
75 | num_valid = num_discrete * valid_percent
76 | if num_valid <= 0:
77 | raise ValueError('Invalid sample validation percentage for given audio sample')
78 |
79 | validation_pos = int((input_set.shape[0] - sample_length * num_valid) / sample_length)
80 | ix = np.random.permutation(validation_pos)
81 |
82 | valid_set = np.asarray(
83 | [ input_set[int(ix[0] * sample_length) : int(ix[0] * sample_length + num_valid * sample_length)],
84 | output_set[int(ix[0] * sample_length) : int(ix[0] * sample_length + num_valid * sample_length)] ])
85 | input_set = np.concatenate(
86 | [ input_set[:int(ix[0] * sample_length)],
87 | input_set[int(ix[0] * sample_length + num_valid * sample_length):] ])
88 | output_set = np.concatenate(
89 | [ output_set[:int(ix[0] * sample_length)],
90 | output_set[int(ix[0] * sample_length + num_valid * sample_length):] ])
91 |
92 | return input_set, output_set, valid_set[0], valid_set[1]
93 |
94 |
--------------------------------------------------------------------------------
/batching-proof.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 39,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "\n",
13 | "from audio_batch import get_valid"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "--------------"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 4,
26 | "metadata": {
27 | "collapsed": false,
28 | "scrolled": false
29 | },
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "..................................................\n",
36 | "valid_in is 0.013269% empty in the worst case\n"
37 | ]
38 | }
39 | ],
40 | "source": [
41 | "sin_pre = np.load('./data/lowpass/pre/sin.npz')['data']\n",
42 | "sqr_pre = np.load('./data/lowpass/pre/square.npz')['data']\n",
43 | "saw_pre = np.load('./data/lowpass/pre/plysaw.npz')['data']\n",
44 | "\n",
45 | "sin_post = np.load('./data/lowpass/post/sin.npz')['data']\n",
46 | "sqr_post = np.load('./data/lowpass/post/square.npz')['data']\n",
47 | "saw_post = np.load('./data/lowpass/post/plysaw.npz')['data']\n",
48 | "\n",
49 | "set_in = np.concatenate([sin_pre[0], sin_pre[1], sqr_pre[0], sqr_pre[1], saw_pre[0], saw_pre[1]])\n",
50 | "set_out = np.concatenate([sin_post[0], sin_post[1], sqr_post[0], sqr_post[1], saw_post[0], saw_post[1]])\n",
51 | "\n",
52 | "\n",
53 | "sets = []\n",
54 | "for j in range(50):\n",
55 | " train_in, train_out, valid_in, valid_out = get_valid(set_in, set_out, 1, .25)\n",
56 | "\n",
57 | " blank = []\n",
58 | " for i in range(len(valid_in)):\n",
59 | " if valid_in[i] == 0.0:\n",
60 | " blank.append(i)\n",
61 | "\n",
62 | " sets.append(len(blank) / len(valid_in))\n",
63 | " print('.', end=\"\", flush=True)\n",
64 | " \n",
65 | "print()\n",
66 | "print('valid_in is %f%% empty in the worst case' % (np.asarray(sets).max()))"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 8,
72 | "metadata": {
73 | "collapsed": false
74 | },
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/html": [
79 | ""
80 | ],
81 | "text/plain": [
82 | ""
83 | ]
84 | },
85 | "metadata": {},
86 | "output_type": "display_data"
87 | }
88 | ],
89 | "source": [
90 | "from IPython.core.display import display, HTML\n",
91 | "display(HTML(\"\"))"
92 | ]
93 | }
94 | ],
95 | "metadata": {
96 | "anaconda-cloud": {},
97 | "kernelspec": {
98 | "display_name": "Python [Root]",
99 | "language": "python",
100 | "name": "Python [Root]"
101 | },
102 | "language_info": {
103 | "codemirror_mode": {
104 | "name": "ipython",
105 | "version": 3
106 | },
107 | "file_extension": ".py",
108 | "mimetype": "text/x-python",
109 | "name": "python",
110 | "nbconvert_exporter": "python",
111 | "pygments_lexer": "ipython3",
112 | "version": "3.5.2"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 0
117 | }
118 |
--------------------------------------------------------------------------------
/conv.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from lowpass import lowpass
4 | from runner import run
5 |
6 | def gen_conv(layer_width, filter_size):
7 | std = 0.1
8 | alpha = 0.00001
9 |
10 | input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = lowpass()
11 |
12 | # reshape with channels
13 | input_set = input_set.reshape(-1, input_set.shape[1], 1)
14 | output_set = output_set.reshape(-1, output_set.shape[1], 1)
15 | valid_in_batches = valid_in_batches.reshape(-1, valid_in_batches.shape[1], 1)
16 | valid_out_batches = valid_out_batches.reshape(-1, valid_out_batches.shape[1], 1)
17 |
18 |
19 | ### GEN LAYERS
20 | x = tf.placeholder(tf.float32, shape=[None, input_set.shape[1], 1], name='x')
21 | x_4 = tf.expand_dims(x, 1)
22 | y = tf.placeholder(tf.float32, shape=[None, output_set.shape[1], 1], name='y')
23 | y_4 = tf.expand_dims(y, 1)
24 |
25 | w0 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=std), name='w0')
26 | b0 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='b0')
27 | conv_0 = tf.nn.conv2d(
28 | x_4,
29 | w0,
30 | strides=[1,1,1,1],
31 | padding='SAME')
32 | lay0 = conv_0 + b0
33 | lay0 = tf.nn.relu(lay0)
34 |
35 | w1 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='w1')
36 | b1 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='b1')
37 | lay1 = lay0 * w1 + b1
38 | lay1 = tf.nn.relu(lay1)
39 |
40 | # required b/c conv2d_transpose does not infer None sized object's sizes at runtime, but we can cheat like this
41 | dyn_input_shape = tf.shape(x_4)
42 | batch_size = dyn_input_shape[0]
43 |
44 | # w2 = w0 (because of transpose)
45 | # w2 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=std), name='w2')
46 | b2 = tf.Variable(tf.truncated_normal([1, 1], stddev=std), name='b2')
47 | conv_2 = tf.nn.conv2d_transpose(
48 | lay1,
49 | w0,
50 | output_shape=tf.pack([batch_size, 1, output_set.shape[1], 1]),
51 | strides=[1,1,1,1],
52 | padding='SAME')
53 | lay2 = conv_2 + b2
54 |
55 |
56 | P = tf.squeeze(lay2) # drop size 1 dim (channels)
57 |
58 | MSE = tf.reduce_mean(tf.square(lay2 - y_4))
59 | L2 = alpha * (tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1))
60 |
61 | optimizer = tf.train.AdamOptimizer().minimize(MSE + L2)
62 |
63 | global_step = tf.Variable(0, name='global_step', trainable=False)
64 | run_time = tf.Variable(0, name='run_time', trainable=False)
65 |
66 | saver = tf.train.Saver(
67 | { "w0": w0,
68 | "b0": b0,
69 | "w1": w1,
70 | "b1": b1,
71 | "b2": b2,
72 | "global_step": global_step,
73 | "run_time": run_time })
74 |
75 | return x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std
76 |
77 |
78 | def run_conv(hidden_width, filter_size, epochs, batch_size=50, save_dist=None):
79 | # oh god what have I done
80 | x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = gen_conv(hidden_width, filter_size)
81 | sess = tf.Session()
82 | sess.run(tf.initialize_all_variables())
83 | run(sess, x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, 'lowpass', 'convolution', hidden_width, epochs, batch_size=batch_size, extra=filter_size, check_dist=save_dist)
84 | return x, y, P, MSE, sess
85 |
--------------------------------------------------------------------------------
/conv_no_mid.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "\n",
13 | "from lowpass import lowpass\n",
14 | "from runner import run\n",
15 | "from test import run_test"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 7,
21 | "metadata": {
22 | "collapsed": false
23 | },
24 | "outputs": [],
25 | "source": [
26 | "def gen_conv_no_mid(layer_width, filter_size):\n",
27 | " std = 0.1\n",
28 | " alpha = 0.00001\n",
29 | "\n",
30 | " input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = lowpass()\n",
31 | "\n",
32 | " # reshape with channels\n",
33 | " input_set = input_set.reshape(-1, input_set.shape[1], 1)\n",
34 | " output_set = output_set.reshape(-1, output_set.shape[1], 1)\n",
35 | " valid_in_batches = valid_in_batches.reshape(-1, valid_in_batches.shape[1], 1)\n",
36 | " valid_out_batches = valid_out_batches.reshape(-1, valid_out_batches.shape[1], 1)\n",
37 | "\n",
38 | "\n",
39 | " ### GEN LAYERS\n",
40 | " x = tf.placeholder(tf.float32, shape=[None, input_set.shape[1], 1], name='x')\n",
41 | " x_4 = tf.expand_dims(x, 1)\n",
42 | " y = tf.placeholder(tf.float32, shape=[None, output_set.shape[1], 1], name='y')\n",
43 | " y_4 = tf.expand_dims(y, 1)\n",
44 | "\n",
45 | " w0 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=std), name='w0')\n",
46 | " b0 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='b0')\n",
47 | " conv_0 = tf.nn.conv2d(\n",
48 | " x_4,\n",
49 | " w0,\n",
50 | " strides=[1,1,1,1],\n",
51 | " padding='SAME')\n",
52 | " lay0 = conv_0 + b0\n",
53 | " lay0 = tf.nn.relu(lay0)\n",
54 | "\n",
55 | " # required b/c conv2d_transpose does not infer None sized object's sizes at runtime, but we can cheat like this\n",
56 | " dyn_input_shape = tf.shape(x_4)\n",
57 | " batch_size = dyn_input_shape[0]\n",
58 | "\n",
59 | " w2 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=std), name='w2')\n",
60 | " b2 = tf.Variable(tf.truncated_normal([1, 1], stddev=std), name='b2')\n",
61 | " conv_2 = tf.nn.conv2d_transpose(\n",
62 | " lay0,\n",
63 | " w2,\n",
64 | " output_shape=tf.pack([batch_size, 1, output_set.shape[1], 1]),\n",
65 | " strides=[1,1,1,1],\n",
66 | " padding='SAME')\n",
67 | " lay2 = conv_2 + b2\n",
68 | "\n",
69 | "\n",
70 | " P = tf.squeeze(lay2) # drop size 1 dim (channels)\n",
71 | "\n",
72 | " MSE = tf.reduce_mean(tf.square(lay2 - y_4))\n",
73 | " L2 = alpha * (tf.nn.l2_loss(w0) + tf.nn.l2_loss(w2))\n",
74 | "\n",
75 | " optimizer = tf.train.AdamOptimizer().minimize(MSE + L2)\n",
76 | "\n",
77 | " global_step = tf.Variable(0, name='global_step', trainable=False)\n",
78 | "\n",
79 | " saver = tf.train.Saver(\n",
80 | " { \"w0\": w0,\n",
81 | " \"b0\": b0,\n",
82 | " \"w2\": w2,\n",
83 | " \"b2\": b2,\n",
84 | " \"global_step\": global_step})\n",
85 | "\n",
86 | " return x, y, MSE, P, optimizer, global_step, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 11,
92 | "metadata": {
93 | "collapsed": true
94 | },
95 | "outputs": [],
96 | "source": [
97 | "def run_conv(hidden_width, filter_size, epochs, batch_size=50, save_dist=None):\n",
98 | " # oh god what have I done\n",
99 | " x, y, MSE, P, optimizer, global_step, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = gen_conv_no_mid(hidden_width, filter_size)\n",
100 | " sess = tf.Session()\n",
101 | " sess.run(tf.initialize_all_variables())\n",
102 | " run(sess, x, y, MSE, P, optimizer, global_step, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, 'lowpass', 'convolution_no_middle', hidden_width, epochs, batch_size=batch_size, extra=filter_size, check_dist=save_dist)\n",
103 | " return x, y, P, MSE, sess"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 12,
109 | "metadata": {
110 | "collapsed": false,
111 | "scrolled": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "starting from epoch: 0\n",
119 | "\t mse rmse std \n",
120 | "\t training validation training validation training validation reference runtime\n",
121 | "..\n",
122 | "epoch: 2 0.00293 0.00375 0.05411 0.06125 0.05414 0.06109 0.11379 1.3 ..\n",
123 | "epoch: 4 0.00114 0.00181 0.03374 0.04250 0.03388 0.04251 0.11379 2.5 ..\n",
124 | "epoch: 6 0.00081 0.00137 0.02847 0.03707 0.02851 0.03703 0.11379 3.8 ..\n",
125 | "epoch: 8 0.00066 0.00116 0.02570 0.03408 0.02576 0.03408 0.11379 5.1 ..\n",
126 | "epoch: 10 0.00056 0.00101 0.02364 0.03172 0.02371 0.03172 0.11379 6.9 ..\n",
127 | "epoch: 12 0.00048 0.00088 0.02196 0.02969 0.02201 0.02968 0.11379 8.5 ..\n",
128 | "epoch: 14 0.00042 0.00078 0.02059 0.02798 0.02065 0.02798 0.11379 9.9 ..\n",
129 | "epoch: 16 0.00038 0.00070 0.01943 0.02647 0.01949 0.02648 0.11379 11.1 ..\n",
130 | "epoch: 18 0.00034 0.00064 0.01843 0.02523 0.01848 0.02524 0.11379 12.3 ..\n",
131 | "epoch: 20 0.00031 0.00058 0.01759 0.02406 0.01763 0.02406 0.11379 13.4 ..\n",
132 | "epoch: 22 0.00028 0.00053 0.01686 0.02313 0.01689 0.02313 0.11379 14.5 Interrupted\n",
133 | " test mse: 9.32496e-05\n",
134 | "test rmse: 0.00965659\n",
135 | " test std: 0.00965639720152\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "x, y, P, MSE, sess = run_conv(hidden_width=50, filter_size=50, epochs=40, batch_size=100, save_dist=2)\n",
141 | "run_test(x, y, P, MSE, sess, run_name='conv_no_mid_hw50_fs50')"
142 | ]
143 | }
144 | ],
145 | "metadata": {
146 | "anaconda-cloud": {},
147 | "kernelspec": {
148 | "display_name": "Python [Root]",
149 | "language": "python",
150 | "name": "Python [Root]"
151 | },
152 | "language_info": {
153 | "codemirror_mode": {
154 | "name": "ipython",
155 | "version": 3
156 | },
157 | "file_extension": ".py",
158 | "mimetype": "text/x-python",
159 | "name": "python",
160 | "nbconvert_exporter": "python",
161 | "pygments_lexer": "ipython3",
162 | "version": "3.5.2"
163 | }
164 | },
165 | "nbformat": 4,
166 | "nbformat_minor": 0
167 | }
168 |
--------------------------------------------------------------------------------
/conv_same_filter_vars.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "\n",
13 | "from lowpass import lowpass\n",
14 | "from runner import run\n",
15 | "from test import run_test"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 17,
21 | "metadata": {
22 | "collapsed": false
23 | },
24 | "outputs": [
25 | {
26 | "data": {
27 | "text/html": [
28 | ""
29 | ],
30 | "text/plain": [
31 | ""
32 | ]
33 | },
34 | "metadata": {},
35 | "output_type": "display_data"
36 | }
37 | ],
38 | "source": [
39 | "from IPython.core.display import display, HTML\n",
40 | "display(HTML(\"\"))"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 15,
46 | "metadata": {
47 | "collapsed": true
48 | },
49 | "outputs": [],
50 | "source": [
51 | "def gen_conv_same_filter_vars(layer_width, filter_size):\n",
52 | " std = 0.1\n",
53 | " alpha = 0.00001\n",
54 | "\n",
55 | " input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = lowpass()\n",
56 | "\n",
57 | " # reshape with channels\n",
58 | " input_set = input_set.reshape(-1, input_set.shape[1], 1)\n",
59 | " output_set = output_set.reshape(-1, output_set.shape[1], 1)\n",
60 | " valid_in_batches = valid_in_batches.reshape(-1, valid_in_batches.shape[1], 1)\n",
61 | " valid_out_batches = valid_out_batches.reshape(-1, valid_out_batches.shape[1], 1)\n",
62 | "\n",
63 | "\n",
64 | " ### GEN LAYERS\n",
65 | " x = tf.placeholder(tf.float32, shape=[None, input_set.shape[1], 1], name='x')\n",
66 | " x_4 = tf.expand_dims(x, 1)\n",
67 | " y = tf.placeholder(tf.float32, shape=[None, output_set.shape[1], 1], name='y')\n",
68 | " y_4 = tf.expand_dims(y, 1)\n",
69 | "\n",
70 | " w0 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=std), name='w0')\n",
71 | " b0 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='b0')\n",
72 | " conv_0 = tf.nn.conv2d(\n",
73 | " x_4,\n",
74 | " w0,\n",
75 | " strides=[1,1,1,1],\n",
76 | " padding='SAME')\n",
77 | " lay0 = conv_0 + b0\n",
78 | " lay0 = tf.nn.relu(lay0)\n",
79 | "\n",
80 | " w1 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='w1')\n",
81 | " b1 = tf.Variable(tf.truncated_normal([layer_width], stddev=std), name='b1')\n",
82 | " lay1 = lay0 * w1 + b1\n",
83 | " lay1 = tf.nn.relu(lay1)\n",
84 | "\n",
85 | " # required b/c conv2d_transpose does not infer None sized object's sizes at runtime, but we can cheat like this\n",
86 | " dyn_input_shape = tf.shape(x_4)\n",
87 | " batch_size = dyn_input_shape[0]\n",
88 | "\n",
89 | "# w2 = tf.Variable(tf.truncated_normal([1, filter_size, 1, layer_width], stddev=std), name='w2')\n",
90 | " b2 = tf.Variable(tf.truncated_normal([1, 1], stddev=std), name='b2')\n",
91 | " conv_2 = tf.nn.conv2d_transpose(\n",
92 | " lay1,\n",
93 | " w0,\n",
94 | " output_shape=tf.pack([batch_size, 1, output_set.shape[1], 1]),\n",
95 | " strides=[1,1,1,1],\n",
96 | " padding='SAME')\n",
97 | " lay2 = conv_2 + b2\n",
98 | "\n",
99 | "\n",
100 | " P = tf.squeeze(lay2) # drop size 1 dim (channels)\n",
101 | "\n",
102 | " MSE = tf.reduce_mean(tf.square(lay2 - y_4))\n",
103 | " L2 = alpha * (tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1))\n",
104 | "\n",
105 | " optimizer = tf.train.AdamOptimizer().minimize(MSE + L2)\n",
106 | "\n",
107 | " global_step = tf.Variable(0, name='global_step', trainable=False)\n",
108 | "\n",
109 | " saver = tf.train.Saver(\n",
110 | " { \"w0\": w0,\n",
111 | " \"b0\": b0,\n",
112 | " \"w1\": w1,\n",
113 | " \"b1\": b1,\n",
114 | "# \"w2\": w2,\n",
115 | "# \"b2\": b2,\n",
116 | " \"global_step\": global_step})\n",
117 | "\n",
118 | " return x, y, MSE, P, optimizer, global_step, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std\n",
119 | "\n",
120 | "\n",
121 | "def run_conv(hidden_width, filter_size, epochs, batch_size=50, save_dist=None):\n",
122 | " # oh god what have I done\n",
123 | " x, y, MSE, P, optimizer, global_step, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = gen_conv_same_filter_vars(hidden_width, filter_size)\n",
124 | " sess = tf.Session()\n",
125 | " sess.run(tf.initialize_all_variables())\n",
126 | " run(sess, x, y, MSE, P, optimizer, global_step, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, 'lowpass', 'convolution_same_filter_vars', hidden_width, epochs, batch_size=batch_size, extra=filter_size, check_dist=save_dist)\n",
127 | " return x, y, P, MSE, sess"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 19,
133 | "metadata": {
134 | "collapsed": false,
135 | "scrolled": true
136 | },
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "starting from epoch: 0\n",
143 | "\t mse rmse std \n",
144 | "\t training validation training validation training validation reference runtime\n",
145 | ".....\n",
146 | "epoch: 5 0.00228 0.00587 0.04778 0.07661 0.04800 0.07662 0.09688 4.0 .....\n",
147 | "epoch: 10 0.00187 0.00506 0.04327 0.07112 0.04350 0.07114 0.09688 8.1 .....\n",
148 | "epoch: 15 0.00161 0.00437 0.04012 0.06612 0.04038 0.06613 0.09688 12.1 .....\n",
149 | "epoch: 20 0.00149 0.00403 0.03863 0.06350 0.03889 0.06351 0.09688 15.1 \n",
150 | "\t mse rmse std \n",
151 | "\t training validation training validation training validation reference runtime\n",
152 | "epoch: 20 0.00149 0.00403 0.03863 0.06350 0.03889 0.06351 0.09688 15.1\n",
153 | " test mse: 0.000125857\n",
154 | "test rmse: 0.0112186\n",
155 | " test std: 0.0112017653747\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "x, y, P, MSE, sess = run_conv(hidden_width=50, filter_size=50, epochs=20, batch_size=100, save_dist=5)\n",
161 | "run_test(x, y, P, MSE, sess, run_name='conv_same_filter_vars_hw50_fs50')"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "collapsed": true
169 | },
170 | "outputs": [],
171 | "source": []
172 | }
173 | ],
174 | "metadata": {
175 | "anaconda-cloud": {},
176 | "kernelspec": {
177 | "display_name": "Python [Root]",
178 | "language": "python",
179 | "name": "Python [Root]"
180 | },
181 | "language_info": {
182 | "codemirror_mode": {
183 | "name": "ipython",
184 | "version": 3
185 | },
186 | "file_extension": ".py",
187 | "mimetype": "text/x-python",
188 | "name": "python",
189 | "nbconvert_exporter": "python",
190 | "pygments_lexer": "ipython3",
191 | "version": "3.5.2"
192 | }
193 | },
194 | "nbformat": 4,
195 | "nbformat_minor": 0
196 | }
197 |
--------------------------------------------------------------------------------
/convert_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """This script converts wav files from 8, 16, or 24 bit PCM(Integer) to np.float32 np.arrays
3 | and saves them to the right locations"""
4 |
5 | import os
6 |
7 | import numpy as np
8 |
9 | import modules.wavio as wavio
10 |
11 |
12 | def export_data(directory, filename):
13 | """Converts wav files from 8, 16, or 24 bit PCM(Integer) to np.float32 np.arrays and saves them to the right locations based on filename.
14 |
15 | Args:
16 | directory (str): relative directory the file is in.
17 | filename (str): must be of the format "<filter> - <pre/post> - <name>.wav".
18 |
19 | Returns:
20 | nothing
21 | """
22 |
23 | OUT_DIR = './data'
24 |
25 | inn = wavio.read(os.path.join(directory, filename))
26 |
27 | # convert to float and normalize if needed
28 | # help from https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
29 | if inn.data.dtype.kind in 'iu':
30 | i = np.iinfo(inn.data.dtype)
31 | abs_max = 2 ** (i.bits - 1)
32 | offset = i.min + abs_max
33 | floats = (inn.data.astype(np.float32) - offset) / abs_max
34 |
35 | split = filename.split(" - ")
36 |
37 | filter_dir = split[0]
38 | pre_post = split[1]
39 | save_file_name = split[2].split(".")[0]
40 |
41 | if not os.path.exists(os.path.join(OUT_DIR, filter_dir)):
42 | os.makedirs(os.path.join(OUT_DIR, filter_dir))
43 | if not os.path.exists(os.path.join(OUT_DIR, filter_dir, pre_post)):
44 | os.makedirs(os.path.join(OUT_DIR, filter_dir, pre_post))
45 |
46 | np.savez_compressed(os.path.join(OUT_DIR, filter_dir, pre_post, save_file_name), data=np.swapaxes(floats,0,1))
47 |
48 |
49 | def export_filtered_audio(directory):
50 | """Exports audio used as our data"""
51 | # DIRECTORY = './sound_files'
52 |
53 | for file in os.listdir(directory):
54 | if file.endswith(".wav"):
55 | export_data(directory, file)
56 |
57 |
58 | def write_output(output, filename):
59 | OUT_DIR = './sound_files/out'
60 | wavio.write(OUT_DIR + '/' + filename, output, 44100, sampwidth=2)
61 |
--------------------------------------------------------------------------------
/data/lowpass/post/beethoven_opus10_1.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/post/beethoven_opus10_1.npz
--------------------------------------------------------------------------------
/data/lowpass/post/plysaw.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/post/plysaw.npz
--------------------------------------------------------------------------------
/data/lowpass/post/sin.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/post/sin.npz
--------------------------------------------------------------------------------
/data/lowpass/post/square.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/post/square.npz
--------------------------------------------------------------------------------
/data/lowpass/pre/beethoven_opus10_1.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/pre/beethoven_opus10_1.npz
--------------------------------------------------------------------------------
/data/lowpass/pre/plysaw.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/pre/plysaw.npz
--------------------------------------------------------------------------------
/data/lowpass/pre/sin.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/pre/sin.npz
--------------------------------------------------------------------------------
/data/lowpass/pre/square.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/lowpass/pre/square.npz
--------------------------------------------------------------------------------
/data/unknown/post/beethoven_opus10_1.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/unknown/post/beethoven_opus10_1.npz
--------------------------------------------------------------------------------
/data/unknown/pre/beethoven_opus10_1.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/data/unknown/pre/beethoven_opus10_1.npz
--------------------------------------------------------------------------------
/linear.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from lowpass import lowpass
4 | from runner import run
5 |
6 | def gen_lin(layer_width):
7 | std = 0.1
8 | alpha = 0.00001
9 |
10 | input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = lowpass()
11 |
12 | ### GEN LAYERS
13 | x = tf.placeholder(tf.float32, shape=[None, input_set.shape[1]], name='x')
14 | y = tf.placeholder(tf.float32, shape=[None, output_set.shape[1]], name='y')
15 |
16 | w0 = tf.Variable(tf.truncated_normal([input_set.shape[1], layer_width], stddev=std), name='w0')
17 | b0 = tf.Variable(tf.truncated_normal([1,layer_width], stddev=std), name='b0')
18 | lay0 = tf.matmul(x,w0) + b0
19 |
20 | w1 = tf.Variable(tf.truncated_normal([layer_width, layer_width], stddev=std), name='w1')
21 | b1 = tf.Variable(tf.truncated_normal([1,layer_width], stddev=std), name='b1')
22 | lay1 = tf.matmul(lay0,w1) + b1
23 |
24 | w2 = tf.Variable(tf.truncated_normal([layer_width, output_set.shape[1]], stddev=std), name='w2')
25 | b2 = tf.Variable(tf.truncated_normal([1,output_set.shape[1]], stddev=std), name='b2')
26 | lay2 = tf.matmul(lay1,w2) + b2
27 |
28 | P = lay2
29 |
30 | MSE = tf.reduce_mean(tf.square(P - y))
31 | L2 = alpha * (tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
32 |
33 | optimizer = tf.train.AdamOptimizer().minimize(MSE + L2)
34 |
35 | global_step = tf.Variable(0, name='global_step', trainable=False)
36 | run_time = tf.Variable(0, name='run_time', trainable=False)
37 |
38 | saver = tf.train.Saver(
39 | { "w0": w0,
40 | "b0": b0,
41 | "w1": w1,
42 | "b1": b1,
43 | "w2": w2,
44 | "b2": b2,
45 | "global_step": global_step,
46 | "run_time": run_time })
47 |
48 | return x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std
49 |
50 |
51 | def run_lin(hidden_width, epochs, check_dist=None):
52 | # oh god what have I done
53 | x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = gen_lin(hidden_width)
54 | sess = tf.Session()
55 | sess.run(tf.initialize_all_variables())
56 | run(sess, x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, 'lowpass', 'linear', hidden_width, epochs, check_dist=check_dist)
57 | return x, y, P, MSE, sess
58 |
--------------------------------------------------------------------------------
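For reference, run_lin is driven exactly as in playground.ipynb further below; a minimal sketch:

    from linear import run_lin
    from test import run_test

    # Trains the 3-layer linear net for 20 epochs, reporting every 5,
    # then writes the generated test audio to
    # ./sound_files/out/linear_epoch=20_beethoven_opus10_generated.wav.
    x, y, P, MSE, sess = run_lin(1000, 20, check_dist=5)
    run_test(x, y, P, MSE, sess, run_name='linear_epoch=20')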
/lowpass.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from audio_batch import batch_audio, get_valid
4 |
5 |
6 | def lowpass():
7 | sin_pre = np.load('./data/lowpass/pre/sin.npz')['data']
8 | sqr_pre = np.load('./data/lowpass/pre/square.npz')['data']
9 | saw_pre = np.load('./data/lowpass/pre/plysaw.npz')['data']
10 |
11 | sin_post = np.load('./data/lowpass/post/sin.npz')['data']
12 | sqr_post = np.load('./data/lowpass/post/square.npz')['data']
13 | saw_post = np.load('./data/lowpass/post/plysaw.npz')['data']
14 |
15 |
16 | set_in = np.concatenate([sin_pre[0], sin_pre[1], sqr_pre[0], sqr_pre[1], saw_pre[0], saw_pre[1]])
17 | set_out = np.concatenate([sin_post[0], sin_post[1], sqr_post[0], sqr_post[1], saw_post[0], saw_post[1]])
18 |
19 | train_in, train_out, valid_in, valid_out = get_valid(set_in, set_out, 1, .25)
20 |
21 | if not train_in.shape[0] + valid_in.shape[0] == train_out.shape[0] + valid_out.shape[0] == set_in.shape[0] == set_out.shape[0]:
22 | raise ValueError('audio shapes don\'t match up')
23 |
24 | input_set, output_set = batch_audio(train_in, train_out, .5, offset=.1)
25 | valid_in_batches = valid_in.reshape(int(valid_in.shape[0] / input_set.shape[1]), input_set.shape[1])
26 | valid_out_batches = valid_out.reshape(int(valid_out.shape[0] / output_set.shape[1]), output_set.shape[1])
27 |
28 | train_ref_std = output_set.std()
29 |
30 | # input_set = input_set.reshape(-1, input_set.shape[1], 1)
31 | # output_set = output_set.reshape(-1, output_set.shape[1], 1)
32 | # valid_in_batches = valid_in_batches.reshape(-1, valid_in_batches.shape[1], 1)
33 | # valid_out_batches = valid_out_batches.reshape(-1, valid_out_batches.shape[1], 1)
34 | return input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std
35 |
--------------------------------------------------------------------------------
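audio_batch.py is not shown in this dump, so the following is a hedged sketch only: batch_audio(train_in, train_out, .5, offset=.1) is read here as cutting half-second windows every tenth of a second at the repo's 44100 Hz sample rate. A hypothetical stand-in illustrating that windowing:

    import numpy as np

    def sliding_windows(signal, width_s=0.5, offset_s=0.1, rate=44100):
        """Hypothetical stand-in for audio_batch.batch_audio (assumption:
        width and offset are seconds at a 44100 Hz sample rate)."""
        width = int(width_s * rate)
        hop = int(offset_s * rate)
        starts = range(0, len(signal) - width + 1, hop)
        # Overlapping windows, stacked into shape (n_windows, width).
        return np.stack([signal[s:s + width] for s in starts])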
/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/modules/__init__.py
--------------------------------------------------------------------------------
/modules/wavio.py:
--------------------------------------------------------------------------------
1 | """
2 | The wavio module defines the functions:
3 |
4 | read(file)
5 | Read a WAV file and return a `wavio.Wav` object, with attributes
6 | `data`, `rate` and `sampwidth`.
7 |
8 | write(filename, data, rate, scale=None, sampwidth=None)
9 | Write a numpy array to a WAV file.
10 |
11 |
12 | -----
13 | Author: Warren Weckesser
14 | License: BSD 2-Clause:
15 | Copyright (c) 2015, Warren Weckesser
16 | All rights reserved.
17 |
18 | Redistribution and use in source and binary forms, with or without
19 | modification, are permitted provided that the following conditions are met:
20 |
21 | 1. Redistributions of source code must retain the above copyright notice,
22 | this list of conditions and the following disclaimer.
23 |
24 | 2. Redistributions in binary form must reproduce the above copyright notice,
25 | this list of conditions and the following disclaimer in the documentation
26 | and/or other materials provided with the distribution.
27 |
28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
32 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 | POSSIBILITY OF SUCH DAMAGE.
39 | """
40 |
41 | from __future__ import division as _division
42 |
43 | import wave as _wave
44 | import numpy as _np
45 |
46 |
47 | __version__ = "0.0.4.dev1"
48 |
49 |
50 | def _wav2array(nchannels, sampwidth, data):
51 | """data must be the string containing the bytes from the wav file."""
52 | num_samples, remainder = divmod(len(data), sampwidth * nchannels)
53 | if remainder > 0:
54 | raise ValueError('The length of data is not a multiple of '
55 | 'sampwidth * num_channels.')
56 | if sampwidth > 4:
57 | raise ValueError("sampwidth must not be greater than 4.")
58 |
59 | if sampwidth == 3:
60 | a = _np.empty((num_samples, nchannels, 4), dtype=_np.uint8)
61 | raw_bytes = _np.fromstring(data, dtype=_np.uint8)
62 | a[:, :, :sampwidth] = raw_bytes.reshape(-1, nchannels, sampwidth)
63 | a[:, :, sampwidth:] = (a[:, :, sampwidth - 1:sampwidth] >> 7) * 255
64 |         result = a.view('<i4').reshape(a.shape[:-1])
65 |     else:
66 |         # 8 bit samples are stored as unsigned ints;
67 |         # others as signed ints.
68 |         dt_char = 'u' if sampwidth == 1 else 'i'
69 |         a = _np.fromstring(data, dtype='<%s%d' % (dt_char, sampwidth))
70 |         result = a.reshape(-1, nchannels)
71 |     return result
72 | 
73 | 
74 | def _array2wav(a, sampwidth):
75 |     """
76 |     Convert the input array `a` to a string of WAV data.
77 | 
78 |     a.dtype must be one of uint8, int16 or int32.  Allowed sampwidth
79 |     values are:
80 | 
81 |         dtype    sampwidth
82 |         uint8        1
83 |         int16        2
84 |         int32      3 or 4
85 | 
86 |     The array must be 2D.  If a.dtype is uint8, sampwidth must be 1;
87 |     if int16, sampwidth must be 2; if int32, sampwidth may be 3 or 4
88 |     (with 3, only the low three bytes of each value are written).
89 |     """
90 |     if sampwidth == 3:
91 |         # `a` must have dtype int32; write it as 24-bit little-endian
92 |         # by shifting 0, 8 and 16 bits and keeping the low byte of each.
93 |         a8 = (a.reshape(a.shape + (1,)) >> _np.array([0, 8, 16])) & 255
94 | wavdata = a8.astype(_np.uint8).tostring()
95 | else:
96 | # Make sure the array is little-endian, and then convert using
97 | # tostring()
98 | a = a.astype('<' + a.dtype.str[1:], copy=False)
99 | wavdata = a.tostring()
100 | return wavdata
101 |
102 |
103 | class Wav(object):
104 | """
105 | Object returned by `wavio.read`. Attributes are:
106 |
107 | data : numpy array
108 | The array of data read from the WAV file.
109 | rate : float
110 | The sample rate of the WAV file.
111 | sampwidth : int
112 | The sample width (i.e. number of bytes per sample) of the WAV file.
113 | For example, `sampwidth == 3` is a 24 bit WAV file.
114 |
115 | """
116 |
117 | def __init__(self, data, rate, sampwidth):
118 | self.data = data
119 | self.rate = rate
120 | self.sampwidth = sampwidth
121 |
122 | def __repr__(self):
123 | s = ("Wav(data.shape=%s, data.dtype=%s, rate=%r, sampwidth=%r)" %
124 | (self.data.shape, self.data.dtype, self.rate, self.sampwidth))
125 | return s
126 |
127 |
128 | def read(file):
129 | """
130 | Read a WAV file.
131 |
132 | Parameters
133 | ----------
134 | file : string or file object
135 | Either the name of a file or an open file pointer.
136 |
137 | Returns
138 | -------
139 | wav : wavio.Wav() instance
140 | The return value is an instance of the class `wavio.Wav`,
141 | with the following attributes:
142 |
143 | data : numpy array
144 | The array containing the data. The shape of the array
145 | is (num_samples, num_channels). num_channels is the
146 | number of audio channels (1 for mono, 2 for stereo).
147 | rate : float
148 | The sampling frequency (i.e. frame rate)
149 |         sampwidth : int
150 | The sample width, in bytes. E.g. for a 24 bit WAV file,
151 | sampwidth is 3.
152 |
153 | Notes
154 | -----
155 |     This function uses the `wave` module of the Python standard library
156 | to read the WAV file, so it has the same limitations as that library.
157 | In particular, the function does not read compressed WAV files, and
158 | it does not read files with floating point data.
159 |
160 |     The array returned by `wavio.read` is always two-dimensional. If the
161 | WAV data is mono, the array will have shape (num_samples, 1).
162 |
163 | `wavio.read()` does not scale or normalize the data. The data in the
164 | array `wav.data` is the data that was in the file. When the file
165 | contains 24 bit samples, the resulting numpy array is 32 bit integers,
166 | with values that have been sign-extended.
167 | """
168 | wav = _wave.open(file)
169 | rate = wav.getframerate()
170 | nchannels = wav.getnchannels()
171 | sampwidth = wav.getsampwidth()
172 | nframes = wav.getnframes()
173 | data = wav.readframes(nframes)
174 | wav.close()
175 | array = _wav2array(nchannels, sampwidth, data)
176 | w = Wav(data=array, rate=rate, sampwidth=sampwidth)
177 | return w
178 |
179 |
180 | _sampwidth_dtypes = {1: _np.uint8,
181 | 2: _np.int16,
182 | 3: _np.int32,
183 | 4: _np.int32}
184 | _sampwidth_ranges = {1: (0, 256),
185 | 2: (-2**15, 2**15),
186 | 3: (-2**23, 2**23),
187 | 4: (-2**31, 2**31)}
188 |
189 |
190 | def _scale_to_sampwidth(data, sampwidth, vmin, vmax):
191 | # Scale and translate the values to fit the range of the data type
192 | # associated with the given sampwidth.
193 |
194 | data = data.clip(vmin, vmax)
195 |
196 | dt = _sampwidth_dtypes[sampwidth]
197 | if vmax == vmin:
198 | data = _np.zeros(data.shape, dtype=dt)
199 | else:
200 | outmin, outmax = _sampwidth_ranges[sampwidth]
201 | if outmin != vmin or outmax != vmax:
202 | data = ((float(outmax - outmin)) * (data - vmin) /
203 | (vmax - vmin)).astype(_np.int64) + outmin
204 | data[data == outmax] = outmax - 1
205 | data = data.astype(dt)
206 |
207 | return data
208 |
209 |
210 | def write(file, data, rate, scale=None, sampwidth=None):
211 | """
212 | Write the numpy array `data` to a WAV file.
213 |
214 | The Python standard library "wave" is used to write the data
215 | to the file, so this function has the same limitations as that
216 | module. In particular, the Python library does not support
217 | floating point data. When given a floating point array, this
218 | function converts the values to integers. See below for the
219 | conversion rules.
220 |
221 | Parameters
222 | ----------
223 | file : string, or file object open for writing in binary mode
224 | Either the name of a file or an open file pointer.
225 | data : numpy array, 1- or 2-dimensional, integer or floating point
226 | If it is 2-d, the rows are the frames (i.e. samples) and the
227 | columns are the channels.
228 | rate : float
229 | The sampling frequency (i.e. frame rate) of the data.
230 | sampwidth : int, optional
231 | The sample width, in bytes, of the output file.
232 | If `sampwidth` is not given, it is inferred (if possible) from
233 | the data type of `data`, as follows::
234 |
235 | data.dtype sampwidth
236 | ---------- ---------
237 | uint8, int8 1
238 | uint16, int16 2
239 | uint32, int32 4
240 |
241 | For any other data types, or to write a 24 bit file, `sampwidth`
242 | must be given.
243 | scale : tuple or str, optional
244 | By default, the data written to the file is scaled up or down to
245 | occupy the full range of the output data type. So, for example,
246 | the unsigned 8 bit data [0, 1, 2, 15] would be written to the file
247 | as [0, 17, 30, 255]. More generally, the default behavior is
248 | (roughly)::
249 |
250 | vmin = data.min()
251 | vmax = data.max()
252 |             outmin = <minimum integer of the output dtype>
253 |             outmax = <maximum integer of the output dtype>
254 | outdata = (outmax - outmin)*(data - vmin)/(vmax - vmin) + outmin
255 |
256 | The `scale` argument allows the scaling of the output data to be
257 | changed. `scale` can be a tuple of the form `(vmin, vmax)`, in which
258 | case the given values override the use of `data.min()` and
259 | `data.max()` for `vmin` and `vmax` shown above. (If either value
260 | is `None`, the value shown above is used.) Data outside the
261 | range (vmin, vmax) is clipped. If `vmin == vmax`, the output is
262 | all zeros.
263 |
264 | If `scale` is the string "none", then `vmin` and `vmax` are set to
265 | `outmin` and `outmax`, respectively. This means the data is written
266 |         to the file with no scaling. (Note: `scale="none"` is not the same
267 |         as `scale=None`. The latter means "use the default behavior",
268 | which is to scale by the data minimum and maximum.)
269 |
270 | If `scale` is the string "dtype-limits", then `vmin` and `vmax`
271 | are set to the minimum and maximum integers of `data.dtype`.
272 | The string "dtype-limits" is not allowed when the `data` is a
273 | floating point array.
274 |
275 | If using `scale` results in values that exceed the limits of the
276 | output sample width, the data is clipped. For example, the
277 | following code::
278 |
279 | >>> x = np.array([-100, 0, 100, 200, 300, 325])
280 | >>> wavio.write('foo.wav', x, 8000, scale='none', sampwidth=1)
281 |
282 | will write the values [0, 0, 100, 200, 255, 255] to the file.
283 |
284 | Example
285 | -------
286 | Create a 3 second 440 Hz sine wave, and save it in a 24-bit WAV file.
287 |
288 | >>> import numpy as np
289 | >>> import wavio
290 | >>> rate = 22050 # samples per second
291 | >>> T = 3 # sample duration (seconds)
292 | >>> f = 440.0 # sound frequency (Hz)
293 | >>> t = np.linspace(0, T, T*rate, endpoint=False)
294 | >>> x = np.sin(2*np.pi * f * t)
295 | >>> wavio.write("sine24.wav", x, rate, sampwidth=3)
296 |
297 | Create a file that contains the 16 bit integer values -10000 and 10000
298 | repeated 100 times. Don't automatically scale the values. Use a sample
299 | rate 8000.
300 |
301 | >>> x = np.empty(200, dtype=np.int16)
302 | >>> x[::2] = -10000
303 | >>> x[1::2] = 10000
304 | >>> wavio.write("foo.wav", x, 8000, scale='none')
305 |
306 | Check that the file contains what we expect.
307 |
308 | >>> w = wavio.read("foo.wav")
309 | >>> np.all(w.data[:, 0] == x)
310 | True
311 |
312 | In the following, the values -10000 and 10000 (from within the 16 bit
313 | range [-2**15, 2**15-1]) are mapped to the corresponding values 88 and
314 | 168 (in the range [0, 2**8-1]).
315 |
316 | >>> wavio.write("foo.wav", x, 8000, sampwidth=1, scale='dtype-limits')
317 | >>> w = wavio.read("foo.wav")
318 | >>> w.data[:4, 0]
319 | array([ 88, 168, 88, 168], dtype=uint8)
320 |
321 | """
322 |
323 | if sampwidth is None:
324 | if not _np.issubdtype(data.dtype, _np.integer) or data.itemsize > 4:
325 | raise ValueError('when data.dtype is not an 8-, 16-, or 32-bit '
326 | 'integer type, sampwidth must be specified.')
327 | sampwidth = data.itemsize
328 | else:
329 | if sampwidth not in [1, 2, 3, 4]:
330 | raise ValueError('sampwidth must be 1, 2, 3 or 4.')
331 |
332 | outdtype = _sampwidth_dtypes[sampwidth]
333 | outmin, outmax = _sampwidth_ranges[sampwidth]
334 |
335 | if scale == "none":
336 | data = data.clip(outmin, outmax-1).astype(outdtype)
337 | elif scale == "dtype-limits":
338 | if not _np.issubdtype(data.dtype, _np.integer):
339 | raise ValueError("scale cannot be 'dtype-limits' with "
340 | "non-integer data.")
341 |         # Easy transforms that just change the signedness of the data.
342 | if sampwidth == 1 and data.dtype == _np.int8:
343 | data = (data.astype(_np.int16) + 128).astype(_np.uint8)
344 | elif sampwidth == 2 and data.dtype == _np.uint16:
345 | data = (data.astype(_np.int32) - 32768).astype(_np.int16)
346 | elif sampwidth == 4 and data.dtype == _np.uint32:
347 | data = (data.astype(_np.int64) - 2**31).astype(_np.int32)
348 | elif data.itemsize != sampwidth:
349 | # Integer input, but rescaling is needed to adjust the
350 | # input range to the output sample width.
351 | ii = _np.iinfo(data.dtype)
352 | vmin = ii.min
353 | vmax = ii.max
354 | data = _scale_to_sampwidth(data, sampwidth, vmin, vmax)
355 | else:
356 | if scale is None:
357 | vmin = data.min()
358 | vmax = data.max()
359 | else:
360 | # scale must be a tuple of the form (vmin, vmax)
361 | vmin, vmax = scale
362 | if vmin is None:
363 | vmin = data.min()
364 | if vmax is None:
365 | vmax = data.max()
366 |
367 | data = _scale_to_sampwidth(data, sampwidth, vmin, vmax)
368 |
369 | # At this point, `data` has been converted to have one of the following:
370 | # sampwidth dtype
371 | # --------- -----
372 | # 1 uint8
373 | # 2 int16
374 | # 3 int32
375 | # 4 int32
376 | # The values in `data` are in the form in which they will be saved;
377 | # no more scaling will take place.
378 |
379 | if data.ndim == 1:
380 | data = data.reshape(-1, 1)
381 |
382 | wavdata = _array2wav(data, sampwidth)
383 |
384 | w = _wave.open(file, 'wb')
385 | w.setnchannels(data.shape[1])
386 | w.setsampwidth(sampwidth)
387 | w.setframerate(rate)
388 | w.writeframes(wavdata)
389 | w.close()
390 |
--------------------------------------------------------------------------------
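Tying wavio.read to this repo's preprocessing, a small sketch of the integer-to-float normalization that convert_data.py applies (the path is one of the repo's own files; the file is assumed to be integer PCM):

    import numpy as np
    import modules.wavio as wavio

    w = wavio.read('./sound_files/lowpass - pre - sin.wav')
    # For 16-bit PCM, w.data is int16 in [-32768, 32767]; map it to
    # float32 in [-1, 1) the same way export_data does.
    i = np.iinfo(w.data.dtype)
    abs_max = 2 ** (i.bits - 1)
    floats = (w.data.astype(np.float32) - (i.min + abs_max)) / abs_max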
/non_linear.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from lowpass import lowpass
4 | from runner import run
5 |
6 | def gen_nonlin(layer_width):
7 | std = 0.1
8 | alpha = 0.00001
9 |
10 | input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = lowpass()
11 |
12 | ### GEN LAYERS
13 | x = tf.placeholder(tf.float32, shape=[None, input_set.shape[1]], name='x')
14 | y = tf.placeholder(tf.float32, shape=[None, output_set.shape[1]], name='y')
15 |
16 | w0 = tf.Variable(tf.truncated_normal([input_set.shape[1], layer_width], stddev=std), name='w0')
17 | b0 = tf.Variable(tf.truncated_normal([1,layer_width], stddev=std), name='b0')
18 | lay0 = tf.matmul(x,w0) + b0
19 | lay0 = tf.nn.relu(lay0)
20 |
21 | w1 = tf.Variable(tf.truncated_normal([layer_width, layer_width], stddev=std), name='w1')
22 | b1 = tf.Variable(tf.truncated_normal([1,layer_width], stddev=std), name='b1')
23 | lay1 = tf.matmul(lay0,w1) + b1
24 | lay1 = tf.nn.relu(lay1)
25 |
26 | w2 = tf.Variable(tf.truncated_normal([layer_width, output_set.shape[1]], stddev=std), name='w2')
27 | b2 = tf.Variable(tf.truncated_normal([1,output_set.shape[1]], stddev=std), name='b2')
28 | lay2 = tf.matmul(lay1,w2) + b2
29 |
30 | P = lay2
31 |
32 | MSE = tf.reduce_mean(tf.square(P - y))
33 | L2 = alpha * (tf.nn.l2_loss(w0) + tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
34 |
35 | optimizer = tf.train.AdamOptimizer().minimize(MSE + L2)
36 |
37 | global_step = tf.Variable(0, name='global_step', trainable=False)
38 | run_time = tf.Variable(0, name='run_time', trainable=False)
39 |
40 | saver = tf.train.Saver(
41 | { "w0": w0,
42 | "b0": b0,
43 | "w1": w1,
44 | "b1": b1,
45 | "w2": w2,
46 | "b2": b2,
47 | "global_step": global_step,
48 | "run_time": run_time })
49 |
50 | return x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std
51 |
52 |
53 | def run_nonlin(hidden_width, epochs, check_dist=None):
54 | # oh god what have I done
55 | x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std = gen_nonlin(hidden_width)
56 | sess = tf.Session()
57 | sess.run(tf.initialize_all_variables())
58 | run(sess, x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, 'lowpass', 'nonlinear', hidden_width, epochs, check_dist=check_dist)
59 | return x, y, P, MSE, sess
60 |
--------------------------------------------------------------------------------
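Because runner.run checkpoints to ./tmp/lowpass/nonlinear/<hidden_width>/, calling run_nonlin again with a larger epoch target resumes training from the saved state, as the playground.ipynb output below shows; a sketch:

    from non_linear import run_nonlin
    from test import run_test

    # The first call trains epochs 0-20; the second restores the
    # checkpoint and continues from epoch 20 toward 4000.
    x, y, P, MSE, sess = run_nonlin(1000, 20, check_dist=5)
    x, y, P, MSE, sess = run_nonlin(1000, 4000)
    run_test(x, y, P, MSE, sess, run_name='non_lin_epoch=4000')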
/playground.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false,
8 | "scrolled": true
9 | },
10 | "outputs": [],
11 | "source": [
12 | "from linear import run_lin\n",
13 | "from non_linear import run_nonlin\n",
14 | "from conv import run_conv\n",
15 | "\n",
16 | "from test import run_test"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## linear"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {
30 | "collapsed": false,
31 | "scrolled": false
32 | },
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | "starting from epoch: 0\n",
39 | "\t mse rmse std \n",
40 | "\t training validation training validation training validation reference runtime .....\n",
41 | "epoch: 5 14.33614 61.93803 3.78631 7.87007 3.80303 7.87069 0.10154 0.1 .....\n",
42 | "epoch: 10 4.65010 48.70928 2.15641 6.97920 2.16399 6.98004 0.10154 0.3 .....\n",
43 | "epoch: 15 2.14808 43.46877 1.46563 6.59309 1.47088 6.59405 0.10154 0.5 .....\n",
44 | "epoch: 20 1.24355 40.80497 1.11515 6.38788 1.11887 6.38890 0.10154 0.7 \n",
45 | "\t mse rmse std \n",
46 | "\t training validation training validation training validation reference runtime \n",
47 | "epoch: 20 1.24355 40.80497 1.11515 6.38788 1.11887 6.38890 0.10154 0.7\n",
48 | " test mse: 4.18774\n",
49 | "test rmse: 2.0464\n",
50 | " test std: 2.04668323103\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "epochs = 20\n",
56 | "x, y, P, MSE, sess = run_lin(1000, epochs, check_dist=5)\n",
57 | "run_test(x, y, P, MSE, sess, run_name='linear_epoch=%d' % epochs)"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "--------------"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "## non-linear"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 2,
77 | "metadata": {
78 | "collapsed": false,
79 | "scrolled": true
80 | },
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "restoring network from: ./tmp/lowpass/nonlinear/1000/model.ckpt\n",
87 | "starting from epoch: 2\n",
88 | "\t mse rmse std \n",
89 | "\t training validation training validation training validation reference runtime ...\n",
90 | "epoch: 5 0.01884 0.02149 0.13727 0.14661 0.13772 0.14662 0.10946 0.5 .....\n",
91 | "epoch: 10 0.01724 0.01626 0.13131 0.12753 0.13178 0.12754 0.10946 1.6 .....\n",
92 | "epoch: 15 0.01592 0.01481 0.12618 0.12170 0.12668 0.12172 0.10946 2.7 .....\n",
93 | "epoch: 20 0.01489 0.01375 0.12202 0.11725 0.12254 0.11727 0.10946 3.8 \n",
94 | "\t mse rmse std \n",
95 | "\t training validation training validation training validation reference runtime \n",
96 | "epoch: 20 0.01489 0.01375 0.12202 0.11725 0.12254 0.11727 0.10946 3.8\n",
97 | " test mse: 0.00488164\n",
98 | "test rmse: 0.0698687\n",
99 | " test std: 0.0698763733389\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "epochs = 20\n",
105 | "x, y, P, MSE, sess = run_nonlin(1000, epochs, check_dist=5)\n",
106 | "run_test(x, y, P, MSE, sess, run_name='non_lin_epoch=%d' % epochs)"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 7,
112 | "metadata": {
113 | "collapsed": false
114 | },
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "restoring network from: ./tmp/lowpass/nonlinear/1000/model.ckpt\n",
121 | "starting from epoch: 4000\n",
122 | "\n",
123 | "\t mse rmse std \n",
124 | "\t training validation training validation training validation reference runtime \n",
125 | "epoch: 4000 0.00471 0.00363 0.06866 0.06024 0.06897 0.06025 0.10219 -1.0\n",
126 | " test mse: 0.00143764\n",
127 | "test rmse: 0.0379162\n",
128 | " test std: 0.0379208421829\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "epochs = 4000\n",
134 | "x, y, P, MSE, sess = run_nonlin(1000, epochs)\n",
135 | "run_test(x, y, P, MSE, sess, run_name='non_lin_epoch=%d' % epochs)"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "----"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "## convolution"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 3,
155 | "metadata": {
156 | "collapsed": false,
157 | "scrolled": false
158 | },
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "starting from epoch: 0\n",
165 | "\n",
166 | "\t mse rmse std \n",
167 | "\t training validation training validation training validation reference runtime \n",
168 | "epoch: 0 0.01625 0.01399 0.12748 0.11827 0.08833 0.07587 0.11131 0.0\n",
169 | " test mse: 0.00829977\n",
170 | "test rmse: 0.0911031\n",
171 | " test std: 0.027445545689\n",
172 | "starting from epoch: 0\n",
173 | ".....\n",
174 | "\t mse rmse std \n",
175 | "\t training validation training validation training validation reference runtime \n",
176 | "epoch: 5 0.00202 0.00542 0.04490 0.07361 0.04497 0.07355 0.09906 4.0\n",
177 | " test mse: 0.000214517\n",
178 | "test rmse: 0.0146464\n",
179 | " test std: 0.0137053161762\n",
180 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
181 | "starting from epoch: 5\n",
182 | ".....\n",
183 | "\t mse rmse std \n",
184 | "\t training validation training validation training validation reference runtime \n",
185 | "epoch: 10 0.00253 0.00120 0.05026 0.03461 0.05052 0.03447 0.10856 7.9\n",
186 | " test mse: 0.000139492\n",
187 | "test rmse: 0.0118107\n",
188 | " test std: 0.0108668058817\n",
189 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
190 | "starting from epoch: 10\n",
191 | ".....\n",
192 | "\t mse rmse std \n",
193 | "\t training validation training validation training validation reference runtime \n",
194 | "epoch: 15 0.00194 0.00235 0.04408 0.04844 0.04439 0.04842 0.10380 10.9\n",
195 | " test mse: 0.000104715\n",
196 | "test rmse: 0.010233\n",
197 | " test std: 0.0102382822478\n",
198 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
199 | "starting from epoch: 15\n",
200 | ".....\n",
201 | "\t mse rmse std \n",
202 | "\t training validation training validation training validation reference runtime \n",
203 | "epoch: 20 0.00168 0.00280 0.04100 0.05291 0.04124 0.05290 0.10097 13.9\n",
204 | " test mse: 0.000112734\n",
205 | "test rmse: 0.0106176\n",
206 | " test std: 0.00992121479961\n",
207 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
208 | "starting from epoch: 20\n",
209 | "\t mse rmse std \n",
210 | "\t training validation training validation training validation reference runtime .....\n",
211 | "epoch: 25 0.00183 0.00216 0.04276 0.04644 0.04295 0.04646 0.10356 16.9 .....\n",
212 | "epoch: 30 0.00180 0.00212 0.04244 0.04603 0.04261 0.04604 0.10356 21.2 \n",
213 | "\t mse rmse std \n",
214 | "\t training validation training validation training validation reference runtime \n",
215 | "epoch: 30 0.00180 0.00212 0.04244 0.04603 0.04261 0.04604 0.10356 21.2\n",
216 | " test mse: 9.16805e-05\n",
217 | "test rmse: 0.00957499\n",
218 | " test std: 0.00950527533185\n",
219 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
220 | "starting from epoch: 30\n",
221 | "\t mse rmse std \n",
222 | "\t training validation training validation training validation reference runtime .....\n",
223 | "epoch: 35 0.00180 0.00209 0.04240 0.04574 0.04253 0.04573 0.10453 24.4 .....\n",
224 | "epoch: 40 0.00179 0.00208 0.04229 0.04556 0.04245 0.04556 0.10453 28.4 \n",
225 | "\t mse rmse std \n",
226 | "\t training validation training validation training validation reference runtime \n",
227 | "epoch: 40 0.00179 0.00208 0.04229 0.04556 0.04245 0.04556 0.10453 28.4\n",
228 | " test mse: 8.92737e-05\n",
229 | "test rmse: 0.00944848\n",
230 | " test std: 0.00944516642297\n",
231 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
232 | "starting from epoch: 40\n",
233 | "\t mse rmse std \n",
234 | "\t training validation training validation training validation reference runtime .....\n",
235 | "epoch: 45 0.00154 0.00281 0.03925 0.05296 0.03940 0.05298 0.10946 31.6 .....\n",
236 | "epoch: 50 0.00153 0.00280 0.03918 0.05287 0.03933 0.05289 0.10946 35.7 \n",
237 | "\t mse rmse std \n",
238 | "\t training validation training validation training validation reference runtime \n",
239 | "epoch: 50 0.00153 0.00280 0.03918 0.05287 0.03933 0.05289 0.10946 35.7\n",
240 | " test mse: 9.09006e-05\n",
241 | "test rmse: 0.00953418\n",
242 | " test std: 0.00949598775392\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "for max_epoch in range(0, 20, 5):\n",
248 | " x, y, P, MSE, sess = run_conv(hidden_width=50, filter_size=50, epochs=max_epoch, batch_size=100, save_dist=5)\n",
249 | " run_test(x, y, P, MSE, sess, run_name='conv_epoch=%d' % max_epoch)\n",
250 | "\n",
251 | "for max_epoch in range(20, 51, 10):\n",
252 | " x, y, P, MSE, sess = run_conv(hidden_width=50, filter_size=50, epochs=max_epoch, batch_size=100, save_dist=5)\n",
253 | " run_test(x, y, P, MSE, sess, run_name='conv_epoch=%d' % max_epoch)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 4,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt\n",
268 | "starting from epoch: 50\n",
269 | "\t mse rmse std \n",
270 | "\t training validation training validation training validation reference runtime ..........\n",
271 | "epoch: 60 0.00159 0.00256 0.03989 0.05064 0.04005 0.05066 0.11154 40.4 ....................\n",
272 | "epoch: 80 0.00155 0.00250 0.03938 0.04995 0.03953 0.04997 0.11154 51.4 ....................\n",
273 | "epoch: 100 0.00150 0.00238 0.03873 0.04881 0.03887 0.04882 0.11154 64.8 ....................\n",
274 | "epoch: 120 0.00144 0.00225 0.03797 0.04746 0.03810 0.04747 0.11154 82.0 ....................\n",
275 | "epoch: 140 0.00138 0.00214 0.03719 0.04622 0.03733 0.04623 0.11154 101.9 ....................\n",
276 | "epoch: 160 0.00132 0.00200 0.03632 0.04475 0.03648 0.04476 0.11154 121.7 ....................\n",
277 | "epoch: 180 0.00126 0.00186 0.03548 0.04308 0.03564 0.04307 0.11154 141.6 ....................\n",
278 | "epoch: 200 0.00122 0.00175 0.03491 0.04178 0.03506 0.04179 0.11154 161.5 \n",
279 | "\t mse rmse std \n",
280 | "\t training validation training validation training validation reference runtime \n",
281 | "epoch: 200 0.00122 0.00175 0.03491 0.04178 0.03506 0.04179 0.11154 161.5\n",
282 | " test mse: 9.88527e-05\n",
283 | "test rmse: 0.00994247\n",
284 | " test std: 0.00988773205481\n"
285 | ]
286 | }
287 | ],
288 | "source": [
289 | "epochs = 200\n",
290 | "x, y, P, MSE, sess = run_conv(hidden_width=50, filter_size=50, epochs=epochs, batch_size=100, save_dist=20)\n",
291 | "run_test(x, y, P, MSE, sess, run_name='conv_epoch=%d' % epochs)"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "------"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 8,
304 | "metadata": {
305 | "collapsed": false
306 | },
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/html": [
311 | ""
312 | ],
313 | "text/plain": [
314 | ""
315 | ]
316 | },
317 | "metadata": {},
318 | "output_type": "display_data"
319 | }
320 | ],
321 | "source": [
322 | "from IPython.core.display import display, HTML\n",
323 | "display(HTML(\"\"))"
324 | ]
325 | }
326 | ],
327 | "metadata": {
328 | "anaconda-cloud": {},
329 | "kernelspec": {
330 | "display_name": "Python [Root]",
331 | "language": "python",
332 | "name": "Python [Root]"
333 | },
334 | "language_info": {
335 | "codemirror_mode": {
336 | "name": "ipython",
337 | "version": 3
338 | },
339 | "file_extension": ".py",
340 | "mimetype": "text/x-python",
341 | "name": "python",
342 | "nbconvert_exporter": "python",
343 | "pygments_lexer": "ipython3",
344 | "version": "3.5.2"
345 | }
346 | },
347 | "nbformat": 4,
348 | "nbformat_minor": 0
349 | }
350 |
--------------------------------------------------------------------------------
/resources/chello_amplitute.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/chello_amplitute.svg
--------------------------------------------------------------------------------
/resources/chello_frequency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/chello_frequency.png
--------------------------------------------------------------------------------
/resources/chello_frequency.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/chello_frequency.svg
--------------------------------------------------------------------------------
/resources/frequency_time_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/frequency_time_data.png
--------------------------------------------------------------------------------
/resources/microcontrollers_fft_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/microcontrollers_fft_example.png
--------------------------------------------------------------------------------
/resources/play.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/play.png
--------------------------------------------------------------------------------
/resources/sample-rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/sample-rate.png
--------------------------------------------------------------------------------
/resources/spectrogram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/resources/spectrogram.jpg
--------------------------------------------------------------------------------
/runner.py:
--------------------------------------------------------------------------------
1 | import time, os, math
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 | from util import header
7 |
8 | def run(sess, x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, dataset, net_type, hidden_width, epochs, batch_size=500, extra=None, check_dist=None):
9 | try:
10 | actually_run(sess, x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, dataset, net_type, hidden_width, epochs, batch_size=batch_size, extra=extra, check_dist=check_dist)
11 | except KeyboardInterrupt:
12 | print('Interrupted')
13 |
14 |
15 | def actually_run(sess, x, y, MSE, P, optimizer, global_step, run_time, saver, input_set, output_set, valid_in_batches, valid_out_batches, train_ref_std, dataset, net_type, hidden_width, epochs, batch_size=500, extra=None, check_dist=None):
16 | ckpt_dir = "./tmp/%s/%s/%d/" % (dataset, net_type, hidden_width)
17 | if extra is not None:
18 | ckpt_dir += '%d/' % (extra)
19 |
20 | if not os.path.exists(ckpt_dir):
21 | os.makedirs(ckpt_dir)
22 | else:
23 | ckpt = tf.train.get_checkpoint_state(ckpt_dir)
24 | if ckpt and ckpt.model_checkpoint_path:
25 | print('restoring network from:',ckpt.model_checkpoint_path)
26 | saver.restore(sess, ckpt.model_checkpoint_path)
27 |
28 | epoch = sess.run(global_step)
29 | total_time = sess.run(run_time)
30 | run_start = time.time()
31 |
32 | if check_dist is None:
33 |         check_dist = max(1, epochs // 100)  # never 0: used as a modulus below
34 |
35 | print("starting from epoch:", epoch)
36 |
37 |     # only print the header now if at least one progress report will follow
38 | printing = False
39 | if epoch + check_dist < epochs:
40 | header(newLine=False)
41 | printing = True
42 |
43 | while epoch < epochs:
44 | perm = np.random.permutation(input_set.shape[0])
45 |
46 | start = 0
47 | for _ in range( math.ceil( input_set.shape[0] / batch_size ) ):
48 | batch = perm[ start:start + batch_size ]
49 | sess.run([optimizer],feed_dict={x:input_set[batch],y:output_set[batch]})
50 | start += batch_size
51 |
52 | print('.', end="", flush=True)
53 |
54 | epoch+=1
55 | sess.run(global_step.assign(epoch))
56 |
57 | if epoch % check_dist == 0 or epoch == epochs:
58 | curr_time = time.time()
59 | total_time += (curr_time - run_start)/60
60 | sess.run(run_time.assign(total_time))
61 | run_start = curr_time
62 |
63 | saver.save(sess, ckpt_dir + 'model.ckpt')
64 | (mse_train, p_train) = sess.run([MSE, P],feed_dict={x:input_set,y:output_set})
65 | (mse_valid, p_valid) = sess.run([MSE, P],feed_dict={x:valid_in_batches,y:valid_out_batches})
66 | train_std = (np.squeeze(output_set) - p_train).std()
67 | valid_std = (np.squeeze(valid_out_batches) - p_valid).std()
68 |
69 | if printing:
70 | print()
71 | print('epoch:%5d %12.5f%12.5f%12.5f%12.5f%12.5f%12.5f%12.5f%12.1f' % (epoch, mse_train, mse_valid, np.sqrt(mse_train), np.sqrt(mse_valid), train_std, valid_std, train_ref_std, total_time), end=" ")
72 |
73 |
74 | # compute final results (and ensure computed if we're already done)
75 | (mse_train, p_train) = sess.run([MSE, P],feed_dict={x:input_set,y:output_set})
76 | (mse_valid, p_valid) = sess.run([MSE, P],feed_dict={x:valid_in_batches,y:valid_out_batches})
77 | train_std = (np.squeeze(output_set) - p_train).std()
78 | valid_std = (np.squeeze(valid_out_batches) - p_valid).std()
79 |
80 | print()
81 | header()
82 | print('epoch:%5d %12.5f%12.5f%12.5f%12.5f%12.5f%12.5f%12.5f%12.1f' % (epoch, mse_train, mse_valid, np.sqrt(mse_train), np.sqrt(mse_valid), train_std, valid_std, train_ref_std, total_time))
83 |
84 |
--------------------------------------------------------------------------------
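The checkpoint path actually_run builds, worked through for the convolution runs seen in playground.ipynb (extra presumably carries the filter size there):

    # dataset='lowpass', net_type='convolution', hidden_width=50, extra=50
    ckpt_dir = "./tmp/%s/%s/%d/" % ('lowpass', 'convolution', 50)
    ckpt_dir += '%d/' % 50
    assert ckpt_dir == './tmp/lowpass/convolution/50/50/'
    # matches "restoring network from: ./tmp/lowpass/convolution/50/50/model.ckpt"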
/sound_files/README.md:
--------------------------------------------------------------------------------
1 | MIDI from http://www.piano-midi.de/brahms.htm
2 |
--------------------------------------------------------------------------------
/sound_files/UNMODIFIED - beethoven_opus10_1.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/UNMODIFIED - beethoven_opus10_1.mp3
--------------------------------------------------------------------------------
/sound_files/beethoven_opus10_1_format0.mid:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/beethoven_opus10_1_format0.mid
--------------------------------------------------------------------------------
/sound_files/lowpass - post - beethoven_opus10_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - post - beethoven_opus10_1.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - post - beethoven_opus10_1_mono.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - post - beethoven_opus10_1_mono.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - post - plysaw.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - post - plysaw.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - post - sin.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - post - sin.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - post - square.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - post - square.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - pre - beethoven_opus10_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - pre - beethoven_opus10_1.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - pre - beethoven_opus10_1_mono.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - pre - beethoven_opus10_1_mono.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - pre - plysaw.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - pre - plysaw.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - pre - sin.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - pre - sin.wav
--------------------------------------------------------------------------------
/sound_files/lowpass - pre - square.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/lowpass - pre - square.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_diff_filter_vars_hw50_fs50_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_diff_filter_vars_hw50_fs50_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=0_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=0_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=10_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=10_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=15_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=15_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=200_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=200_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=20_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=20_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=30_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=30_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=40_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=40_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=50_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=50_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_epoch=5_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_epoch=5_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_no_mid_hw50_fs50_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_no_mid_hw50_fs50_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/conv_same_filter_vars_hw50_fs50_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/conv_same_filter_vars_hw50_fs50_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/linear_epoch=20_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/linear_epoch=20_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/linear_epoch=4000_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/linear_epoch=4000_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/non_lin_epoch=20_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/non_lin_epoch=20_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/out/non_lin_epoch=4000_beethoven_opus10_generated.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/out/non_lin_epoch=4000_beethoven_opus10_generated.wav
--------------------------------------------------------------------------------
/sound_files/pedals - post - beethoven_opus10_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - post - beethoven_opus10_1.wav
--------------------------------------------------------------------------------
/sound_files/pedals - post - plysaw.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - post - plysaw.wav
--------------------------------------------------------------------------------
/sound_files/pedals - post - sin.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - post - sin.wav
--------------------------------------------------------------------------------
/sound_files/pedals - post - square.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - post - square.wav
--------------------------------------------------------------------------------
/sound_files/pedals - pre - beethoven_opus10_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - pre - beethoven_opus10_1.wav
--------------------------------------------------------------------------------
/sound_files/pedals - pre - plysaw.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - pre - plysaw.wav
--------------------------------------------------------------------------------
/sound_files/pedals - pre - sin.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - pre - sin.wav
--------------------------------------------------------------------------------
/sound_files/pedals - pre - square.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/pedals - pre - square.wav
--------------------------------------------------------------------------------
/sound_files/unknown - post - beethoven_opus10_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/unknown - post - beethoven_opus10_1.wav
--------------------------------------------------------------------------------
/sound_files/unknown - pre - beethoven_opus10_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julie-is-late/TensorFlow-Signal-Processing/63c4c04a49b60fa720f9276053091bb6079d1e50/sound_files/unknown - pre - beethoven_opus10_1.wav
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from audio_batch import batch_audio
4 | from convert_data import write_output
5 |
6 | def get_test():
7 | beet_pre = np.load('./data/lowpass/pre/beethoven_opus10_1.npz')['data']
8 | beet_post = np.load('./data/lowpass/post/beethoven_opus10_1.npz')['data']
9 |
10 | set_in = np.concatenate([beet_pre[0], beet_pre[1]])
11 | set_out = np.concatenate([beet_post[0], beet_post[1]])
12 |
13 | input_set, output_set = batch_audio(set_in, set_out, .5, offset=.5)
14 |
15 | return input_set, output_set
16 |
17 |
18 | def run_test(x, y, P, MSE, sess, run_name=None):
19 | test_input_batched, test_output_batched = get_test()
20 |
21 | if len(x.get_shape()) == 3:
22 | (test_p, mse) = sess.run([P, MSE],feed_dict={x:test_input_batched.reshape(-1, test_input_batched.shape[1], 1), y:test_output_batched.reshape(-1, test_output_batched.shape[1], 1)})
23 | else:
24 | (test_p, mse) = sess.run([P, MSE],feed_dict={x:test_input_batched, y:test_output_batched})
25 |
26 | p = np.squeeze(test_p)
27 |
28 | std = (p - test_output_batched[:,:p.shape[1]]).std()
29 | print(' test mse:', mse)
30 | print('test rmse:', np.sqrt(mse))
31 | print(' test std:', std)
32 |
33 | p = p.reshape(p.shape[0] * p.shape[1])
34 |
35 | filename = 'beethoven_opus10_generated.wav'
36 |
37 | if run_name is not None:
38 | filename = run_name + '_' + filename
39 |
40 | write_output(p, filename)
41 |
--------------------------------------------------------------------------------
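run_test's side effect is a rendered wav; a sketch matching the files under sound_files/out/ (run_conv is defined in conv.py, earlier in this repo):

    from conv import run_conv
    from test import run_test

    x, y, P, MSE, sess = run_conv(hidden_width=50, filter_size=50,
                                  epochs=20, batch_size=100, save_dist=5)
    # Prints test mse/rmse/std, then writes
    # ./sound_files/out/conv_epoch=20_beethoven_opus10_generated.wav
    run_test(x, y, P, MSE, sess, run_name='conv_epoch=20')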
/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | import tensorflow as tf
4 |
5 |
6 | def min_batch(batch_size, n):
7 | """min_batch generates a permutation of n elements with a width of batch_size"""
8 | ix = np.random.permutation(n)
9 | k = np.empty([math.ceil(float(n) / batch_size)], dtype=object)
10 | for y in range(0, math.ceil(n / batch_size)):
11 | k[y] = np.array([], dtype=int)
12 | for z in range(0, batch_size):
13 | if y * batch_size + z > n - 1:
14 | break
15 | k[y] = np.append(k[y], ix[y * batch_size + z])
16 | return k
17 |
18 |
19 | def weight_variable(shape, std=0.1):
20 | initial = tf.truncated_normal(shape, stddev=std)
21 | return tf.Variable(initial)
22 |
23 | def header(newLine=True):
24 | print('\t mse rmse std ')
25 | print('\t training validation training validation training validation reference runtime ', end="", flush=True)
26 | if newLine:
27 | print()
28 |
--------------------------------------------------------------------------------
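A quick demonstration of min_batch's output (seeded so the permutation is reproducible; the batch lengths don't depend on the seed):

    import numpy as np
    from util import min_batch

    np.random.seed(0)
    batches = min_batch(batch_size=4, n=10)
    # Three arrays: two holding 4 indices and a final partial one with 2;
    # concatenated they form a permutation of range(10).
    print([len(b) for b in batches])   # [4, 4, 2]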