├── _config.yml ├── .gitignore ├── utils ├── __init__.py ├── test │ ├── __init__.py │ ├── 098569.mp3 │ └── ftest_stftForTheInpaintingSetting.py ├── legacy │ ├── __init__.py │ ├── notebooks │ │ ├── __init__.py │ │ ├── train.ipynb │ │ ├── try.ipynb │ │ └── test.ipynb │ ├── simulations │ │ ├── __init__.py │ │ ├── nets.ods │ │ ├── simple.py │ │ ├── stft_istft_tfReconstructionTest.py │ │ ├── runNatBigger.py │ │ ├── runNatNatBigger.py │ │ ├── runNatStftTest.py │ │ ├── runNatStftMagnitudeTest.py │ │ ├── runNatStftRealImagTest.py │ │ ├── runNatStftSeventh.py │ │ ├── runNatStftEigth.py │ │ ├── runNatStftSixth.py │ │ ├── runNatMagPhaseGapTest.py │ │ ├── runNatStftSec.py │ │ ├── runNatStftGapTest.py │ │ ├── runNatStftGapToMagTest.py │ │ ├── runNatStftThird.py │ │ ├── runNatStftFifth.py │ │ ├── runNatStftGapOneOneTest.py │ │ ├── runNatStftGapBIGTest.py │ │ ├── runNat.py │ │ ├── runNatSkip.py │ │ └── runNatBig.py │ ├── plotSummary.py │ ├── stftPhaseContextEncoder.py │ ├── evaluationWriter.py │ ├── timeLiner.py │ ├── stftGapContextEncoder.py │ └── stftRealImagContextEncoder.py ├── logdir │ └── readme.md ├── saved_models │ └── readme.md ├── strechableNumpyArray.py ├── colorize.py ├── tfReader.py └── saveParameters.py ├── network ├── __init__.py └── emptyTFGraph.py ├── system ├── __init__.py ├── dnnSystem.py ├── preAndPostProcessor.py ├── magPreAndPostProcessor.py └── contextEncoderSystem.py ├── architecture ├── __init__.py ├── parameters │ ├── __init__.py │ ├── fullyLayerParams.py │ ├── convNetworkParams.py │ └── contextEncoderParameters.py ├── architecture.py ├── channelWiseContextEncoderArchitecture.py └── contextEncoderArchitecture.py ├── datasetGenerator ├── __init__.py ├── fmaTFRecordGenerator.py ├── nSynthTFRecordGenerator.py ├── nSynthDownloader.py ├── fmaDownloader.py ├── fakeTFRecordGenerator.py ├── downloader.py ├── exampleProcessor.py └── tfRecordGenerator.py ├── images ├── Nsynth_2.png ├── Nsynth_3.png ├── Nsynth_6.png ├── Nsynth_7.png ├── Nsynth_12.png ├── Nsynth_13.png ├── Nsynth_17.png ├── Nsynth_67.png ├── decoder-signal.jpg ├── encoder-signal.jpg └── good-spectrogram.png ├── requirements.txt ├── complex_network_parameters.pkl ├── magnitude_network_parameters.pkl ├── audio_examples ├── faded │ ├── nsynth_17_or.mp3 │ ├── nsynth_20_or.mp3 │ ├── nsynth_3_or.mp3 │ ├── nsynth_3_rec.mp3 │ ├── nsynth_17_rec.mp3 │ ├── nsynth_20_rec.mp3 │ ├── nsynth_17_complex_rec.mp3 │ ├── nsynth_20_complex_rec.mp3 │ └── nsynth_3_complex_rec.mp3 ├── good │ ├── nsynth_14_or.mp3 │ ├── nsynth_14_rec.mp3 │ ├── nsynth_15_or.mp3 │ ├── nsynth_15_rec.mp3 │ ├── nsynth_16_or.mp3 │ ├── nsynth_16_rec.mp3 │ ├── nsynth_2_or.mp3 │ ├── nsynth_2_rec.mp3 │ ├── nsynth_4_or.mp3 │ ├── nsynth_4_rec.mp3 │ ├── nsynth_5_or.mp3 │ ├── nsynth_5_rec.mp3 │ ├── nsynth_67_or.mp3 │ ├── nsynth_67_rec.mp3 │ ├── nsynth_6_or.mp3 │ ├── nsynth_6_rec.mp3 │ ├── nsynth_7_or.mp3 │ ├── nsynth_7_rec.mp3 │ ├── nsynth_8_or.mp3 │ ├── nsynth_8_rec.mp3 │ ├── nsynth_2_complex_rec.mp3 │ ├── nsynth_4_complex_rec.mp3 │ ├── nsynth_5_complex_rec.mp3 │ ├── nsynth_6_complex_rec.mp3 │ ├── nsynth_7_complex_rec.mp3 │ ├── nsynth_8_complex_rec.mp3 │ ├── nsynth_14_complex_rec.mp3 │ ├── nsynth_15_complex_rec.mp3 │ ├── nsynth_16_complex_rec.mp3 │ └── nsynth_67_complex_rec.mp3 └── noisy │ ├── nsynth_12_or.mp3 │ ├── nsynth_13_or.mp3 │ ├── nsynth_18_or.mp3 │ ├── nsynth_12_rec.mp3 │ ├── nsynth_13_rec.mp3 │ ├── nsynth_18_rec.mp3 │ ├── nsynth_12_complex_rec.mp3 │ ├── nsynth_13_complex_rec.mp3 │ └── nsynth_18_complex_rec.mp3 ├── LPC-based extrapolation ├── mySNR.m ├── lpcPaper.m └── 
lpcInFolder.m ├── make_fakedataset.py ├── trainComplexNetwork.py ├── make_nsynthdataset.py ├── trainMagnitudeNetwork.py ├── make_fmadataset.py ├── README.md └── SpecDivExperimentMag.m /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /network/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /system/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /utils/test/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /architecture/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /utils/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /datasetGenerator/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /architecture/parameters/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /utils/legacy/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /utils/legacy/simulations/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | -------------------------------------------------------------------------------- /utils/logdir/readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the logs that are saved in the project -------------------------------------------------------------------------------- /utils/saved_models/readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the models that are saved in the project -------------------------------------------------------------------------------- /images/Nsynth_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_2.png 
-------------------------------------------------------------------------------- /images/Nsynth_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_3.png -------------------------------------------------------------------------------- /images/Nsynth_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_6.png -------------------------------------------------------------------------------- /images/Nsynth_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_7.png -------------------------------------------------------------------------------- /images/Nsynth_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_12.png -------------------------------------------------------------------------------- /images/Nsynth_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_13.png -------------------------------------------------------------------------------- /images/Nsynth_17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_17.png -------------------------------------------------------------------------------- /images/Nsynth_67.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/Nsynth_67.png -------------------------------------------------------------------------------- /utils/test/098569.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/utils/test/098569.mp3 -------------------------------------------------------------------------------- /images/decoder-signal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/decoder-signal.jpg -------------------------------------------------------------------------------- /images/encoder-signal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/encoder-signal.jpg -------------------------------------------------------------------------------- /images/good-spectrogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/images/good-spectrogram.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow_gpu==1.4.0 2 | librosa==0.5.1 3 | audioread==2.1.5 4 | matplotlib==2.1.0 5 | numpy==1.14.1 6 | 7 | -------------------------------------------------------------------------------- /complex_network_parameters.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/complex_network_parameters.pkl -------------------------------------------------------------------------------- /magnitude_network_parameters.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/magnitude_network_parameters.pkl -------------------------------------------------------------------------------- /utils/legacy/simulations/nets.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/utils/legacy/simulations/nets.ods -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_17_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_17_or.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_20_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_20_or.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_3_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_3_or.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_3_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_3_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_14_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_14_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_14_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_14_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_15_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_15_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_15_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_15_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_16_or.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_16_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_16_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_16_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_2_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_2_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_2_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_2_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_4_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_4_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_4_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_4_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_5_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_5_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_5_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_5_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_67_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_67_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_67_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_67_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_6_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_6_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_6_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_6_rec.mp3 
-------------------------------------------------------------------------------- /audio_examples/good/nsynth_7_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_7_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_7_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_7_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_8_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_8_or.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_8_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_8_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_12_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_12_or.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_13_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_13_or.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_18_or.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_18_or.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_17_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_17_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_20_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_20_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_12_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_12_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_13_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_13_rec.mp3 -------------------------------------------------------------------------------- 
/audio_examples/noisy/nsynth_18_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_18_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_2_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_2_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_4_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_4_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_5_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_5_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_6_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_6_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_7_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_7_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_8_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_8_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_17_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_17_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_20_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_20_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/faded/nsynth_3_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/faded/nsynth_3_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_14_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_14_complex_rec.mp3 
-------------------------------------------------------------------------------- /audio_examples/good/nsynth_15_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_15_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_16_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_16_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/good/nsynth_67_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/good/nsynth_67_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_12_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_12_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_13_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_13_complex_rec.mp3 -------------------------------------------------------------------------------- /audio_examples/noisy/nsynth_18_complex_rec.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andimarafioti/audioContextEncoder/HEAD/audio_examples/noisy/nsynth_18_complex_rec.mp3 -------------------------------------------------------------------------------- /LPC-based extrapolation/mySNR.m: -------------------------------------------------------------------------------- 1 | function [result] = mySNR(orig_signal, inpainted) 2 | 3 | norm_orig = norm(orig_signal); 4 | norm_difference = norm(orig_signal-inpainted); 5 | result = 10*log10(abs(norm_orig^2)/(abs(norm_difference^2))); 6 | end -------------------------------------------------------------------------------- /utils/legacy/simulations/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | fs = 16000 5 | time = np.arange(0, 0.005, 1/fs) 6 | plt.plot(np.sin(2 * np.pi *440 * time , dtype=np.float32) + np.random.normal(0, 0.1, len(time))) 7 | plt.show() -------------------------------------------------------------------------------- /datasetGenerator/fmaTFRecordGenerator.py: -------------------------------------------------------------------------------- 1 | from datasetGenerator.tfRecordGenerator import TFRecordGenerator 2 | 3 | __author__ = 'Andres' 4 | 5 | 6 | class FMATFRecordGenerator(TFRecordGenerator): 7 | def _filenameShouldBeLoaded(self, filename): 8 | return filename.endswith('.mp3') 9 | -------------------------------------------------------------------------------- /make_fakedataset.py: -------------------------------------------------------------------------------- 1 | from datasetGenerator.exampleProcessor import ExampleProcessor 2 | from 
datasetGenerator.fakeTFRecordGenerator import FakeTFRecordGenerator 3 | 4 | __author__ = 'Andres' 5 | 6 | 7 | exampleProcessor = ExampleProcessor(gapLength=1024, sideLength=2048, hopSize=512, gapMinRMS=1e-3) 8 | 9 | tfRecordGenerator = FakeTFRecordGenerator(baseName='fake', pathToDataFolder='', exampleProcessor=exampleProcessor) 10 | tfRecordGenerator.generateDataset() 11 | -------------------------------------------------------------------------------- /datasetGenerator/nSynthTFRecordGenerator.py: -------------------------------------------------------------------------------- 1 | from datasetGenerator.tfRecordGenerator import TFRecordGenerator 2 | 3 | __author__ = 'Andres' 4 | 5 | 6 | class NSynthTFRecordGenerator(TFRecordGenerator): 7 | def _filenameShouldBeLoaded(self, filename): 8 | return filename.endswith('.wav') 9 | 10 | 11 | if __name__ == "__main__": 12 | from datasetGenerator.exampleProcessor import ExampleProcessor 13 | 14 | exampleProcessor = ExampleProcessor() 15 | tfRecordGen = NSynthTFRecordGenerator(baseName='test', pathToDataFolder='nsynth-test/audio', exampleProcessor=exampleProcessor) 16 | tfRecordGen.generateDataset() 17 | 18 | 19 | -------------------------------------------------------------------------------- /network/emptyTFGraph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from network.tfGraph import TFGraph 3 | 4 | __author__ = 'Andres' 5 | 6 | 7 | class EmptyTfGraph(TFGraph): 8 | """ 9 | This class represents a TensorFlow graph. 10 | It is initialized empty and one can add different types of layers to it. 11 | The output of the network is accessed with output(). 12 | The input of the network is a placeholder and is accessed with input(). 13 | 14 | shapeOfInput : shape of the input (including batch size) 15 | """ 16 | 17 | def __init__(self, shapeOfInput, isTraining, name): 18 | inputSignal = tf.placeholder(tf.float32, shape=shapeOfInput, name='input_data') 19 | super().__init__(inputSignal=inputSignal, isTraining=isTraining, name=name) 20 | -------------------------------------------------------------------------------- /architecture/parameters/fullyLayerParams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | __author__ = 'Andres' 4 | 5 | 6 | class FullyLayerParams(object): 7 | def __init__(self, inputShape, outputShape, name): 8 | assert inputShape[0] == outputShape[0], 'Batch size is expected to be the first element in the shapes' 9 | 10 | self._inputShape = inputShape 11 | self._outputShape = outputShape 12 | self._name = name 13 | 14 | def inputShape(self): 15 | return self._inputShape 16 | 17 | def outputShape(self): 18 | return self._outputShape 19 | 20 | def name(self): 21 | return self._name 22 | 23 | def batchSize(self): 24 | return self._inputShape[0] 25 | 26 | def inputChannels(self): 27 | return np.prod(self._inputShape[1:]) 28 | 29 | def outputChannels(self): 30 | return np.prod(self._outputShape[1:]) 31 | -------------------------------------------------------------------------------- /architecture/parameters/convNetworkParams.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | 3 | 4 | class ConvNetworkParams(object): 5 | def __init__(self, filterShapes, channels, strides, name): 6 | self._filterShapes = filterShapes 7 | self._channels = channels 8 | self._strides = strides 9 | self._name = name 10 | 11 | def filterShapes(self): 12 | return 
self._filterShapes 13 | 14 | def channels(self): 15 | return self._channels 16 | 17 | def inputChannels(self): 18 | return self._channels[:-1] 19 | 20 | def outputChannels(self): 21 | return self._channels[1:] 22 | 23 | def strides(self): 24 | return self._strides 25 | 26 | def name(self): 27 | return self._name 28 | 29 | def layerCount(self): 30 | return len(self._strides) 31 | 32 | def convNames(self): 33 | return ["Conv_"+str(index) for index in range(self.layerCount())] 34 | -------------------------------------------------------------------------------- /trainComplexNetwork.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from architecture.contextEncoderArchitecture import ContextEncoderArchitecture 4 | from system.contextEncoderSystem import ContextEncoderSystem 5 | from system.preAndPostProcessor import PreAndPostProcessor 6 | 7 | architecturesParametersFile = "complex_network_parameters.pkl" 8 | sessionsName = "complex_network" 9 | 10 | with open(architecturesParametersFile, 'rb') as savedFile: 11 | Context_Encoder_parameters = pickle.load(savedFile) 12 | 13 | aContextEncoderArchitecture = ContextEncoderArchitecture(*Context_Encoder_parameters.architectureParameters()) 14 | aPreProcessor = PreAndPostProcessor(*Context_Encoder_parameters.preProcessorParameters()) 15 | aContextEncoderSystem = ContextEncoderSystem(aContextEncoderArchitecture, Context_Encoder_parameters.batchSize(), 16 | aPreProcessor, sessionsName) 17 | aContextEncoderSystem.train("nsynth_train_w5120_g1024_h512.tfrecords", "nsynth_valid_w5120_g1024_h512.tfrecords", 1e-3) 18 | -------------------------------------------------------------------------------- /make_nsynthdataset.py: -------------------------------------------------------------------------------- 1 | from datasetGenerator.exampleProcessor import ExampleProcessor 2 | from datasetGenerator.nSynthDownloader import NSynthDownloader 3 | from datasetGenerator.nSynthTFRecordGenerator import NSynthTFRecordGenerator 4 | 5 | __author__ = 'Andres' 6 | 7 | 8 | downloader = NSynthDownloader() 9 | downloader.downloadAndExtract() 10 | 11 | exampleProcessor = ExampleProcessor(gapLength=1024, sideLength=2048, hopSize=512, gapMinRMS=1e-3) 12 | 13 | tfRecordGenerator = NSynthTFRecordGenerator(baseName='nsynth_test', pathToDataFolder=downloader.TEST_DIR, exampleProcessor=exampleProcessor) 14 | tfRecordGenerator.generateDataset() 15 | 16 | tfRecordGenerator = NSynthTFRecordGenerator(baseName='nsynth_valid', pathToDataFolder=downloader.VALID_DIR, exampleProcessor=exampleProcessor) 17 | tfRecordGenerator.generateDataset() 18 | 19 | tfRecordGenerator = NSynthTFRecordGenerator(baseName='nsynth_train', pathToDataFolder=downloader.TRAIN_DIR, exampleProcessor=exampleProcessor) 20 | tfRecordGenerator.generateDataset() 21 | -------------------------------------------------------------------------------- /utils/strechableNumpyArray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | __author__ = 'Andres' 4 | 5 | 6 | class StrechableNumpyArray(object): 7 | """When trying to add values to a numpy array, things can get slow if the array is too large. 
8 | This class amortizes that cost by preallocating a buffer and growing it geometrically when it fills up.""" 9 | def __init__(self, dtype=np.float32): 10 | self._dtype = dtype 11 | self.data = np.zeros((1000000,), dtype=self._dtype) 12 | self.size = 0 13 | 14 | def append(self, x): 15 | if self.size + len(x) >= len(self.data): 16 | capacity = max(4 * len(self.data), self.size + len(x)) # grow geometrically, but always enough to fit x 17 | newdata = np.zeros((capacity,), dtype=self._dtype) 18 | newdata[:self.size] = self.data[:self.size] 19 | self.data = newdata 20 | 21 | self.data[self.size: self.size + len(x)] = x 22 | self.size += len(x) 23 | 24 | def finalize(self): 25 | output_data = self.data[:self.size] 26 | del self.data 27 | return output_data 28 | -------------------------------------------------------------------------------- /trainMagnitudeNetwork.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from architecture.contextEncoderArchitecture import ContextEncoderArchitecture 4 | from system.contextEncoderSystem import ContextEncoderSystem 5 | from system.magPreAndPostProcessor import MagPreAndPostProcessor 6 | 7 | architecturesParametersFile = "magnitude_network_parameters.pkl" 8 | sessionsName = "magnitude_network" 9 | 10 | with open(architecturesParametersFile, 'rb') as savedFile: 11 | Context_Encoder_parameters = pickle.load(savedFile) 12 | 13 | aContextEncoderArchitecture = ContextEncoderArchitecture(*Context_Encoder_parameters.architectureParameters()) 14 | aPreProcessor = MagPreAndPostProcessor(*Context_Encoder_parameters.preProcessorParameters()) 15 | aContextEncoderSystem = ContextEncoderSystem(aContextEncoderArchitecture, Context_Encoder_parameters.batchSize(), 16 | aPreProcessor, sessionsName) 17 | aContextEncoderSystem.train("nsynth_train_w5120_g1024_h512.tfrecords", "nsynth_valid_w5120_g1024_h512.tfrecords", 1e-3) 18 | -------------------------------------------------------------------------------- /make_fmadataset.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | from audioread import NoBackendError 3 | 4 | from datasetGenerator.exampleProcessor import ExampleProcessor 5 | from datasetGenerator.fmaDownloader import FMADownloader 6 | from datasetGenerator.fmaTFRecordGenerator import FMATFRecordGenerator 7 | 8 | __author__ = 'Andres' 9 | 10 | try: # Test the backend for mp3 files 11 | librosa.load("utils/test/098569.mp3") 12 | except NoBackendError as e: 13 | raise e 14 | 15 | downloader = FMADownloader() 16 | downloader.downloadAndExtract() 17 | 18 | exampleProcessor = ExampleProcessor(gapLength=1024, sideLength=2048, hopSize=512, gapMinRMS=1e-3) 19 | 20 | tfRecordGenerator = FMATFRecordGenerator(baseName='FMA-test', pathToDataFolder=downloader.TEST_DIR, exampleProcessor=exampleProcessor) 21 | tfRecordGenerator.generateDataset() 22 | 23 | tfRecordGenerator = FMATFRecordGenerator(baseName='FMA-valid', pathToDataFolder=downloader.VALID_DIR, exampleProcessor=exampleProcessor) 24 | tfRecordGenerator.generateDataset() 25 | 26 | tfRecordGenerator = FMATFRecordGenerator(baseName='FMA-train', pathToDataFolder=downloader.TRAIN_DIR, exampleProcessor=exampleProcessor) 27 | tfRecordGenerator.generateDataset() 28 | -------------------------------------------------------------------------------- /architecture/architecture.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | __author__ = 'Andres' 4 | 5 | 6 | class Architecture(object): 7 | def __init__(self): 8 | self._isTraining = 
tf.placeholder(tf.bool, name='is_training') 9 | self._input = tf.placeholder(tf.float32, shape=self.inputShape(), name='input_data') 10 | self._output = self._network(self._input) 11 | self._target = tf.placeholder(tf.float32, shape=self._output.shape, name='target_data') 12 | self._lossSummaries = [] 13 | self._loss = self._lossGraph() 14 | 15 | def input(self): 16 | return self._input 17 | 18 | def output(self): 19 | return self._output 20 | 21 | def target(self): 22 | return self._target 23 | 24 | def loss(self): 25 | return self._loss 26 | 27 | def lossSummaries(self): 28 | return self._lossSummaries 29 | 30 | def isTraining(self): 31 | return self._isTraining 32 | 33 | def _lossGraph(self): 34 | raise NotImplementedError("Subclass Responsibility") 35 | 36 | def _network(self, data): 37 | raise NotImplementedError("Subclass Responsibility") 38 | 39 | def inputShape(self): 40 | raise NotImplementedError("Subclass Responsibility") 41 | -------------------------------------------------------------------------------- /utils/legacy/plotSummary.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import io 5 | import tensorflow as tf 6 | 7 | __author__ = 'Andres' 8 | 9 | 10 | class PlotSummary(object): 11 | def __init__(self, name): 12 | self._name = name 13 | self._placeholder = tf.placeholder(tf.uint8, (None, None, None, None)) 14 | self._summary = tf.summary.image(name, self._placeholder) 15 | self._image = None 16 | 17 | def produceSummaryToWrite(self, session): 18 | decoded_image = session.run(self._image) 19 | feed_dict = {self._placeholder: decoded_image} 20 | return session.run(self._summary, feed_dict=feed_dict) 21 | 22 | def plotSideBySide(self, out_gaps, reconstructed): 23 | f, axarr = plt.subplots(4, 2, sharey='row') 24 | f.set_size_inches(14, 24) 25 | stop_value = 4 26 | for i in range(0, stop_value): 27 | axarr[i, 0].plot(out_gaps[i]) 28 | axarr[i, 1].plot(reconstructed[i]) 29 | 30 | buf = io.BytesIO() 31 | plt.savefig(buf, format='png') 32 | plt.close() 33 | buf.seek(0) 34 | image = tf.image.decode_png(buf.getvalue(), channels=4) 35 | image = tf.expand_dims(image, 0) 36 | self._image = image -------------------------------------------------------------------------------- /datasetGenerator/nSynthDownloader.py: -------------------------------------------------------------------------------- 1 | from datasetGenerator.downloader import Downloader 2 | 3 | __author__ = 'Andres' 4 | 5 | 6 | class NSynthDownloader(Downloader): 7 | TRAIN_LINK = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-train.jsonwav.tar.gz" 8 | VALID_LINK = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-valid.jsonwav.tar.gz" 9 | TEST_LINK = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-test.jsonwav.tar.gz" 10 | 11 | TRAIN_FILENAME = "nsynth_train.tar.gz" 12 | VALID_FILENAME = "nsynth_valid.tar.gz" 13 | TEST_FILENAME = "nsynth_test.tar.gz" 14 | 15 | TRAIN_DIR = "nsynth-train/audio" 16 | VALID_DIR = "nsynth-valid/audio" 17 | TEST_DIR = "nsynth-test/audio" 18 | 19 | def _downloadLinksAndFilenames(self): 20 | return [(self.TEST_LINK, self.TEST_FILENAME), 21 | (self.TRAIN_LINK, self.TRAIN_FILENAME), 22 | (self.VALID_LINK, self.VALID_FILENAME)] 23 | 24 | def _extractCompressedFile(self, filename): 25 | self._extractTar(filename) 26 | 27 | def _divideDataIntoTrainValidAndTestSubsets(self): 28 | print('NSynth dataset comes divided into training, validation 
and testing subsets.') 29 | 30 | if __name__ == "__main__": 31 | down = NSynthDownloader() 32 | down.downloadAndExtract() 33 | -------------------------------------------------------------------------------- /LPC-based extrapolation/lpcPaper.m: -------------------------------------------------------------------------------- 1 | contextLength = 2048; 2 | targetLength = 1024; 3 | contextRatio = ceil(contextLength/targetLength); 4 | maxLag = 1000; 5 | 6 | audioFilePath = 'audio/bass_electronic_018-045-075.wav'; 7 | [audio, Fs] = audioread(audioFilePath); 8 | 9 | t = linspace(0, pi/2, targetLength)'; 10 | sqCos = cos(t).^2; % squared-cosine crossfade window 11 | 12 | rec_signal = []; 13 | SNR = []; 14 | 15 | for i = contextRatio:(length(audio)/targetLength)-contextRatio-2 16 | previous_sig = audio(targetLength*(i-contextRatio)+1:targetLength*(i)); 17 | target_sig = audio(targetLength*(i)+1:targetLength*(i+1)); 18 | next_sig = audio(targetLength*(i+1)+1:targetLength*(i+contextRatio+1)); 19 | 20 | if rms(target_sig) < 1e-4 % skip (near-)silent gaps 21 | SNR(length(SNR)+1) = -1; 22 | rec_signal = cat(1, rec_signal, zeros([targetLength, 1])); 23 | continue 24 | end 25 | 26 | ab = arburg(previous_sig, maxLag); % forward prediction: Burg AR model fitted on the preceding context 27 | Zb = filtic(1,ab,previous_sig(end-(0:(maxLag-1)))); 28 | forw_pred = filter(1,ab,zeros(1,targetLength),Zb)'; 29 | 30 | next_sig = flipud(next_sig); % backward prediction: AR model fitted on the time-reversed following context 31 | af = arburg(next_sig, maxLag); 32 | Zf = filtic(1,af, next_sig(end-(0:(maxLag-1)))); 33 | backw_pred = flipud(filter(1,af,zeros(1,targetLength),Zf)'); 34 | 35 | sigout = sqCos.*forw_pred + flipud(sqCos).*backw_pred; % crossfade the two predictions across the gap 36 | rec_signal = cat(1, rec_signal, sigout); 37 | SNR(length(SNR)+1) = mySNR(target_sig, sigout); 38 | end 39 | 40 | 41 | fprintf('mean SNR where it was calculated is %f \n', mean(SNR(SNR~=-1))); 42 | fprintf('max SNR is %f \n', max(SNR)); 43 | fprintf('SNR is not calculated at %d places \n', length(find(SNR==-1))); 44 | 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Audio inpainting with a context encoder 2 | 3 | This project accompanies the research on audio inpainting of small gaps carried out at the Acoustics Research Institute in Vienna in collaboration with the Swiss Data Science Center. The paper was [published in IEEE TASLP](https://ieeexplore.ieee.org/document/8867915). 4 | 5 | # Installation 6 | 7 | Install the requirements with `pip install -r requirements.txt`. For Windows users, the numpy version should be 1.14.0+mkl (find it [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/)). For the FMA dataset, librosa requires ffmpeg as an mp3 backend. 8 | 9 | # Instructions 10 | The paper uses both Google's NSynth dataset and the FMA dataset. To recreate the datasets used in the paper, run either `python make_nsynthdataset.py` or `python make_fmadataset.py` from the parent folder. Each script outputs three `tfrecord` files, for training, validating and testing the model. 11 | 12 | The default network parameters come pickled in the files `magnitude_network_parameters.pkl` and `complex_network_parameters.pkl`. To define other architectures, use [saveParameters.py](utils/saveParameters.py). 13 | 14 | To train a network, run `python trainMagnitudeNetwork.py` or `python trainComplexNetwork.py` from the parent folder. This trains the chosen network for 600k steps with a learning rate of 1e-3. You can select which `tfrecord` files to train on; by default, the scripts assume you have created the NSynth dataset (see the sketch below). 
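As a rough illustration of retargeting the training to other records, the sketch below mirrors `trainComplexNetwork.py`. The FMA record names are an assumption extrapolated from the NSynth naming scheme, so check what `make_fmadataset.py` actually produced before running this:

```python
import pickle

from architecture.contextEncoderArchitecture import ContextEncoderArchitecture
from system.contextEncoderSystem import ContextEncoderSystem
from system.preAndPostProcessor import PreAndPostProcessor

# Load the pickled ContextEncoderParameters describing the architecture.
with open("complex_network_parameters.pkl", 'rb') as savedFile:
    parameters = pickle.load(savedFile)

architecture = ContextEncoderArchitecture(*parameters.architectureParameters())
preProcessor = PreAndPostProcessor(*parameters.preProcessorParameters())
system = ContextEncoderSystem(architecture, parameters.batchSize(), preProcessor, "complex_network_fma")

# Hypothetical FMA record names, assuming the same suffix as the generated NSynth records.
system.train("FMA-train_w5120_g1024_h512.tfrecords", "FMA-valid_w5120_g1024_h512.tfrecords", 1e-3)
```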
15 | 16 | ## Sound examples 17 | 18 | - To hear examples, please visit the [accompanying website](https://andimarafioti.github.io/audioContextEncoder/). 19 | -------------------------------------------------------------------------------- /architecture/parameters/contextEncoderParameters.py: -------------------------------------------------------------------------------- 1 | class ContextEncoderParameters(object): 2 | INPUT_CHANNELS = 4 # 2 sides, one for real and one for imag 3 | 4 | def __init__(self, batchSize, signalLength, gapLength, fftWindowLength, fftHopSize, 5 | encoderParameters, fullyConnectedLayerParameters, decoderParameters): 6 | self._batchSize = int(batchSize) 7 | self._signalLength = int(signalLength) 8 | self._gapLength = int(gapLength) 9 | self._fftWindowLength = int(fftWindowLength) 10 | self._fftHopSize = int(fftHopSize) 11 | self._encoderParameters = encoderParameters 12 | self._fullyConnectedLayerParameters = fullyConnectedLayerParameters 13 | self._decoderParameters = decoderParameters 14 | 15 | def architectureParameters(self): 16 | return [self.inputShape(), self._encoderParameters, self._decoderParameters, self._fullyConnectedLayerParameters] 17 | 18 | def preProcessorParameters(self): 19 | return [self._signalLength, self._gapLength, self._fftWindowLength, self._fftHopSize] 20 | 21 | def fftHopSize(self): 22 | return self._fftHopSize 23 | 24 | def fftWindowLength(self): 25 | return self._fftWindowLength 26 | 27 | def batchSize(self): 28 | return self._batchSize 29 | 30 | def inputShape(self): 31 | return self._batchSize, self.contextStftFrameCount(), self._fftFreqBins(), self.INPUT_CHANNELS 32 | 33 | def contextStftFrameCount(self): 34 | return int(((self._signalLength - self._gapLength) / 2) / self._fftHopSize) 35 | 36 | def _fftFreqBins(self): 37 | return self._fftWindowLength//2+1 38 | 39 | -------------------------------------------------------------------------------- /utils/colorize.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.cm 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | def colorize(value, vmin=None, vmax=None, cmap=None): 9 | """ 10 | A utility function for TensorFlow that maps a grayscale image to a matplotlib 11 | colormap for use with TensorBoard image summaries. 12 | By default it will normalize the input value to the range 0..1 before mapping 13 | to a grayscale colormap. 14 | Arguments: 15 | - value: 2D Tensor of shape [height, width] or 3D Tensor of shape 16 | [height, width, 1]. 17 | - vmin: the minimum value of the range used for normalization. 18 | (Default: value minimum) 19 | - vmax: the maximum value of the range used for normalization. 20 | (Default: value maximum) 21 | - cmap: a valid cmap name for use with matplotlib's `get_cmap`. 22 | (Default: 'viridis') 23 | Example usage: 24 | ``` 25 | output = tf.random_uniform(shape=[256, 256, 1]) 26 | output_color = colorize(output, vmin=0.0, vmax=1.0, cmap='viridis') 27 | tf.summary.image('output', output_color) 28 | ``` 29 | 30 | Returns a 3D tensor of shape [height, width, 3]. 
31 | """ 32 | 33 | # normalize 34 | vmin = tf.reduce_min(value) if vmin is None else vmin 35 | vmax = tf.reduce_max(value) if vmax is None else vmax 36 | value = (value - vmin) / (vmax - vmin) # vmin..vmax 37 | 38 | # squeeze last dim if it exists 39 | value = tf.squeeze(value) 40 | 41 | # quantize 42 | indices = tf.to_int32(tf.round(value * 255)) 43 | 44 | # gather 45 | cm = matplotlib.cm.get_cmap(cmap if cmap is not None else 'viridis') 46 | colors = cm(np.arange(256))[:, :3] 47 | colors = tf.constant(colors, dtype=tf.float32) 48 | value = tf.gather(colors, indices) 49 | 50 | return value 51 | -------------------------------------------------------------------------------- /datasetGenerator/fmaDownloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from datasetGenerator.downloader import Downloader 4 | 5 | __author__ = 'Andres' 6 | 7 | 8 | class FMADownloader(Downloader): 9 | SMALL_LINK = 'https://os.unil.cloud.switch.ch/fma/fma_small.zip' 10 | SMALL_FILENAME = 'fma_small.zip' 11 | SMALL_DIR = SMALL_FILENAME[:-4] 12 | 13 | TRAIN_DIR = 'FMA-train' 14 | VALID_DIR = 'FMA-valid' 15 | TEST_DIR = 'FMA-test' 16 | 17 | def _downloadLinksAndFilenames(self): 18 | return [(self.SMALL_LINK, self.SMALL_FILENAME)] 19 | 20 | def _extractCompressedFile(self, filename): 21 | self._extractZip(filename) 22 | 23 | def _divideDataIntoTrainValidAndTestSubsets(self): 24 | print('Dividing FMA dataset into training, validation and testing subsets.') 25 | for dir_name in [self.TRAIN_DIR, self.VALID_DIR, self.TEST_DIR]: 26 | try: 27 | os.mkdir(dir_name) 28 | except FileExistsError as e: 29 | print('Directory already existed, proceed with caution.\nException:', e) 30 | 31 | i = 0 32 | for path, directory_name, file_names in os.walk(self.SMALL_DIR): 33 | for file_name in file_names: 34 | i += 1 35 | if i < 8: 36 | os.rename(path + '/' + file_name, self.TRAIN_DIR + '/' + file_name) 37 | elif i < 10: 38 | os.rename(path + '/' + file_name, self.VALID_DIR + '/' + file_name) 39 | elif i == 10: 40 | os.rename(path + '/' + file_name, self.TEST_DIR + '/' + file_name) 41 | i = 0 42 | shutil.rmtree(self.SMALL_DIR) 43 | 44 | 45 | if __name__ == "__main__": 46 | down = FMADownloader() 47 | down.downloadAndExtract() 48 | -------------------------------------------------------------------------------- /datasetGenerator/fakeTFRecordGenerator.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import time 4 | import os 5 | import sys 6 | from datasetGenerator.tfRecordGenerator import TFRecordGenerator 7 | 8 | __author__ = 'Andres' 9 | 10 | 11 | class FakeTFRecordGenerator(TFRecordGenerator): 12 | def generateDataset(self): 13 | start = time.time() 14 | 15 | train_filename = self.name() + '.tfrecords' 16 | writer = tf.python_io.TFRecordWriter(train_filename) 17 | 18 | print("start:", start) 19 | count = 0 20 | total = 0 21 | 22 | _sampling_rate = 16000 23 | _window_size = 5120 24 | _time = np.arange(0, _window_size / _sampling_rate, 1 / _sampling_rate) 25 | _low_freq = np.arange(0, 2000, 40) 26 | _mid_low_freq = np.arange(2000, 4000, 40) 27 | _mid_high_freq = np.arange(4000, 6000, 40) 28 | _high_freq = np.arange(6000, 8000, 40) 29 | 30 | for low_freq in _low_freq: 31 | for mid_low_freq in _mid_low_freq: 32 | for mid_high_freq in _mid_high_freq: 33 | for high_freq in _high_freq: 34 | audio = np.sin(2 * np.pi * low_freq * _time) + np.sin(2 * np.pi * mid_low_freq * _time) + 
\ 35 | np.sin(2 * np.pi * mid_high_freq * _time) + np.sin(2 * np.pi * high_freq * _time) 36 | 37 | self._createFeature(audio, writer) 38 | 39 | count, total = self._notifyIfNeeded(count + 1, total) 40 | sys.stdout.flush() 41 | writer.close() 42 | end = time.time() - start 43 | 44 | print("there were: ", total + count) 45 | print("wow, that took", end, "seconds... might want to change that to mins :)") 46 | 47 | 48 | def _filenameShouldBeLoaded(self, filename): 49 | raise NotImplementedError("We fake bro") 50 | -------------------------------------------------------------------------------- /LPC-based extrapolation/lpcInFolder.m: -------------------------------------------------------------------------------- 1 | contextLength = 2048; 2 | targetLength = 1024; 3 | contextRatio = ceil(contextLength/targetLength); 4 | maxLag = 1000; 5 | 6 | folder = 'fma'; 7 | extension = 'mp3'; 8 | audiofiles = dir(strcat(folder, '/*', extension)); 9 | allSNR = []; 10 | 11 | for file = audiofiles' 12 | 13 | fprintf(1,'Inpainting %s\n', file.name) 14 | [audio, Fs]=audioread(strcat(folder, '/', file.name)); 15 | 16 | t = linspace(0, pi/2, targetLength)'; 17 | sqCos = cos(t).^2; 18 | 19 | SNR = []; 20 | 21 | for i = contextRatio:(length(audio)/targetLength)-contextRatio-2 22 | previous_sig = audio(targetLength*(i-contextRatio)+1:targetLength*(i)); 23 | target_sig = audio(targetLength*(i)+1:targetLength*(i+1)); 24 | next_sig = audio(targetLength*(i+1)+1:targetLength*(i+contextRatio+1)); 25 | 26 | if rms(target_sig) < 1e-4 27 | continue 28 | end 29 | 30 | ab = arburg(previous_sig, maxLag); 31 | Zb = filtic(1,ab,previous_sig(end-(0:(maxLag-1)))); 32 | forw_pred = filter(1,ab,zeros(1,targetLength),Zb)'; 33 | 34 | next_sig = flipud(next_sig); 35 | af = arburg(next_sig, maxLag); 36 | Zf = filtic(1,af, next_sig(end-(0:(maxLag-1)))); 37 | backw_pred = flipud(filter(1,af,zeros(1,targetLength),Zf)'); 38 | 39 | sigout = sqCos.*forw_pred + flipud(sqCos).*backw_pred; 40 | SNR(length(SNR)+1) = mySNR(target_sig, sigout); 41 | end 42 | 43 | fprintf('mean SNR is %f \n', mean(SNR)); 44 | 45 | allSNR = cat(2, SNR, allSNR); 46 | 47 | end 48 | 49 | allSNR(isnan(allSNR)) = 0; 50 | 51 | fprintf('mean SNR is %f \n', mean(allSNR)); 52 | fprintf('std SNR is %f \n', std(allSNR)); 53 | fprintf('min SNR is %f \n', min(allSNR)); 54 | fprintf('25%% percentile SNR is %f \n', prctile(allSNR, 25)); 55 | fprintf('50%% percentile SNR is %f \n', prctile(allSNR, 50)); 56 | fprintf('75%% percentile SNR is %f \n', prctile(allSNR, 75)); 57 | fprintf('max SNR is %f \n', max(allSNR)); % was max(SNR), which only covered the last file 58 | -------------------------------------------------------------------------------- /utils/tfReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Andres' 2 | 3 | import tensorflow as tf 4 | from tensorflow.python.framework.errors_impl import OutOfRangeError 5 | 6 | 7 | class TFReader(object): 8 | def __init__(self, path_to_tfRecord_file, window_size, batchSize, num_epochs=10, capacity=100000): 9 | self._path_to_tfRecord_file = path_to_tfRecord_file 10 | self._capacity = capacity 11 | self._batchSize = batchSize 12 | self._window_size = window_size 13 | self._audios = self._read_and_decode(tf.train.string_input_producer([path_to_tfRecord_file], 14 | num_epochs=num_epochs)) 15 | 16 | def start(self): 17 | self._coordinator = tf.train.Coordinator() 18 | self._threads = tf.train.start_queue_runners(coord=self._coordinator) 19 | 20 | def dataOperation(self, session): 21 | try: 22 | audios = session.run(self._audios) 23 | 
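# session.run dequeues one shuffled batch from the TFRecord queue; once the
# string_input_producer has yielded num_epochs passes over the file, the queue
# closes and run() raises OutOfRangeError, surfaced below as StopIteration.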
return audios 24 | except OutOfRangeError: 25 | raise StopIteration 26 | 27 | def finish(self): 28 | self._coordinator.request_stop() 29 | self._coordinator.join(self._threads) 30 | 31 | def _read_and_decode(self, filename_queue): 32 | reader = tf.TFRecordReader() 33 | _, serialized_example = reader.read(filename_queue) 34 | features = tf.parse_single_example(serialized_example, 35 | features={'valid/windows': tf.FixedLenFeature([], tf.string)}) 36 | 37 | windows = tf.decode_raw(features['valid/windows'], tf.float32) 38 | windows = tf.reshape(windows, [self._window_size]) 39 | 40 | audios = tf.train.shuffle_batch([windows], batch_size=self._batchSize, 41 | min_after_dequeue=int(self._capacity * 0.5), 42 | capacity=self._capacity, 43 | num_threads=4) 44 | return audios 45 | -------------------------------------------------------------------------------- /utils/saveParameters.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import sys 4 | 5 | from architecture.parameters.contextEncoderParameters import ContextEncoderParameters 6 | 7 | sys.path.append('.') # In case we launch this from the base folder 8 | 9 | __author__ = 'Andres' 10 | 11 | from architecture.parameters.convNetworkParams import ConvNetworkParams 12 | from architecture.parameters.fullyLayerParams import FullyLayerParams 13 | 14 | "Simple script to save parameters" 15 | 16 | architecturesParametersFile = "magnitude_network_parameters.pkl" 17 | 18 | batchSize = 256 19 | signalLength = 5120 20 | gapLength = 1024 21 | fftWindowLength = 512 22 | fftHopSize = 128 23 | 24 | encoderParams = ConvNetworkParams(filterShapes=[(7, 89), (3, 17), (2, 11), 25 | (1, 9), (1, 5), (2, 5)], 26 | channels=[4, 32, 128, 512, 27 | 256, 160, 128], 28 | strides=[[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], 29 | [1, 1, 2, 1], [1, 1, 1, 1], [1, 1, 1, 1]], 30 | name='Encoder') 31 | 32 | fullyParams = FullyLayerParams(inputShape=(batchSize, 128, 2, 8), outputShape=(batchSize, 8, 8, 32), name="Fully") 33 | 34 | decoderParams = ConvNetworkParams(filterShapes=[(8, 8), (5, 5), (3, 3), (5, 67), (11, 257)], 35 | channels=[32, 128, 512, 257, 11, 1], 36 | strides=[[1, 2, 2, 1], [1, 2, 2, 1], [1, 1, 1, 1], 37 | [1, 2, 2, 1], [1, 1, 1, 1]], 38 | name='Decoder') 39 | 40 | contextEncoderParameters = ContextEncoderParameters(batchSize, signalLength, gapLength, fftWindowLength, fftHopSize, 41 | encoderParams, fullyParams, decoderParams) 42 | 43 | with open(architecturesParametersFile, 'wb') as fiModel: 44 | pickle.dump(contextEncoderParameters, fiModel) 45 | -------------------------------------------------------------------------------- /datasetGenerator/downloader.py: -------------------------------------------------------------------------------- 1 | import ssl 2 | import urllib.request 3 | import tarfile 4 | import zipfile 5 | import os 6 | 7 | __author__ = 'Andres' 8 | 9 | 10 | class Downloader(object): 11 | def downloadAndExtract(self): 12 | for link, filename in self._downloadLinksAndFilenames(): 13 | self._download(link, filename) 14 | self._extractCompressedFile(filename) 15 | self._deleteCompressedFile(filename) 16 | self._divideDataIntoTrainValidAndTestSubsets() 17 | 18 | def _download(self, aLink, toAFilename): 19 | print("Downloading to ", toAFilename) 20 | size = 0 21 | blocksize = 4096 22 | 23 | with urllib.request.urlopen(aLink, context=ssl.SSLContext(ssl.PROTOCOL_TLSv1)) as response, \ 24 | open(toAFilename, 'wb') as out_file: # context avoids SSL certifications 25 | length = 
float(response.getheader('content-length')) 26 | data = response.read(blocksize) 27 | out_file.write(data) 28 | while data: 29 | size += len(data) 30 | print('\r Downloaded {:.2f} % '.format(100 * size / length), end='') 31 | data = response.read(blocksize) 32 | out_file.write(data) 33 | print('') 34 | 35 | def _deleteCompressedFile(self, filename): 36 | print('Deleting', filename) 37 | os.remove(filename) 38 | 39 | def _extractTar(self, aFile): 40 | print('Extracting', aFile) 41 | tar = tarfile.open(aFile) 42 | tar.extractall() 43 | tar.close() 44 | 45 | def _extractZip(self, aFile): 46 | print('Extracting', aFile) 47 | zip_ref = zipfile.ZipFile(aFile, 'r') 48 | zip_ref.extractall() 49 | zip_ref.close() 50 | 51 | def _extractCompressedFile(self, filename): 52 | raise NotImplementedError("Subclass Responsibility") 53 | 54 | def _downloadLinksAndFilenames(self): 55 | raise NotImplementedError("Subclass Responsibility") 56 | 57 | def _divideDataIntoTrainValidAndTestSubsets(self): 58 | raise NotImplementedError("Subclass Responsibility") 59 | -------------------------------------------------------------------------------- /utils/legacy/stftPhaseContextEncoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 5 | 6 | __author__ = 'Andres' 7 | 8 | 9 | class StftPhaseContextEncoder(StftGapContextEncoder): 10 | def _loss_graph(self): 11 | with tf.variable_scope("Loss"): 12 | gap_stft = self._target_model.output() 13 | 14 | abs_stft = tf.reshape(gap_stft[:, :, :, 0], (self._batch_size, 11, 257, 1)) 15 | target_angle = abs_stft * tf.reshape(gap_stft[:, :, :, 1], (self._batch_size, 11, 257, 1)) 16 | 17 | norm_orig = self._squaredEuclideanNorm(target_angle, onAxis=[1, 2, 3]) 18 | norm_orig_summary = tf.summary.scalar("norm_orig", tf.reduce_min(norm_orig)) 19 | 20 | error = target_angle - (self._reconstructed_input_data * abs_stft) 21 | error_per_example = tf.reduce_sum(tf.square(error), axis=[1, 2, 3]) 22 | 23 | reconstruction_loss = 0.5 * tf.reduce_sum(error_per_example * (1 + 5 / (norm_orig + 1e-2))) 24 | 25 | rec_loss_summary = tf.summary.scalar("reconstruction_loss", reconstruction_loss) 26 | 27 | trainable_vars = tf.trainable_variables() 28 | lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable_vars if 'bias' not in v.name]) * 1e-2 29 | l2_loss_summary = tf.summary.scalar("lossL2", lossL2) 30 | 31 | total_loss = tf.add_n([reconstruction_loss, lossL2]) 32 | total_loss_summary = tf.summary.scalar("total_loss", total_loss) 33 | 34 | self._lossSummaries = tf.summary.merge( 35 | [rec_loss_summary, l2_loss_summary, norm_orig_summary, total_loss_summary]) 36 | 37 | return total_loss 38 | 39 | def _evaluateValidSNR(self, summaries_dict, validReader, evalWriter, writer, sess, step): 40 | reconstructed, out_gaps = self._reconstruct(sess, validReader, max_steps=8) 41 | reconstructed = np.reshape(reconstructed, (self._batch_size*8, 11, 257, 1)) 42 | step_valid_SNR = evalWriter.evaluateImages(reconstructed, np.reshape(out_gaps[:, :, :, 1], (self._batch_size*8, 11, 257, 1)), self._initial_model_num + step) 43 | validSNRSummaryToWrite = sess.run(summaries_dict['valid_SNR_summary'], 44 | feed_dict={summaries_dict['valid_SNR']: step_valid_SNR}) 45 | writer.add_summary(validSNRSummaryToWrite, self._initial_model_num + step) -------------------------------------------------------------------------------- 
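A minimal NumPy sketch of the weighting used in _loss_graph above: the phase error is scaled by the magnitude, so loud time-frequency bins dominate, and examples whose target gap is quiet are up-weighted by the 1 + 5/(||target||^2 + 1e-2) factor. Shapes are assumed to match the (batch, 11, 257, 1) tensors above, and the L2 weight penalty is omitted; this is an illustration, not the training code:

import numpy as np

def magnitude_weighted_phase_loss(mag, target_phase, predicted_phase, eps=1e-2):
    target = mag * target_phase                # magnitude-weighted target angle
    error = target - mag * predicted_phase     # magnitude-weighted phase error
    error_per_example = np.sum(np.square(error), axis=(1, 2, 3))
    norm_orig = np.sum(np.square(target), axis=(1, 2, 3))   # ||target||^2 per example
    return 0.5 * np.sum(error_per_example * (1 + 5 / (norm_orig + eps)))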
/utils/legacy/simulations/stft_istft_tfReconstructionTest.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import time 7 | from tensorflow.contrib.signal.python.ops import window_ops 8 | 9 | __author__ = 'Andres' 10 | 11 | 12 | s = tf.Session() 13 | 14 | sampling_rate = 44000 15 | freq = 210 16 | countOfCycles = 4 17 | _time = tf.range(0, 1024 / sampling_rate, 1 / sampling_rate, dtype=tf.float32) 18 | firstSignal = tf.sin(2 * 3.14159 * freq * _time) 19 | 20 | fft_frame_length = 512 21 | fft_frame_step = 128 22 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 23 | inverse_window = tf.contrib.signal.inverse_stft_window_fn(fft_frame_step, 24 | forward_window_fn=window_fn) 25 | 26 | firstSignal = tf.concat([tf.zeros(fft_frame_length-fft_frame_step), firstSignal, tf.zeros(fft_frame_length-fft_frame_step)], axis=0) 27 | s.run(tf.initialize_all_variables()) 28 | stft = tf.contrib.signal.stft(signals=firstSignal, frame_length=fft_frame_length, frame_step=fft_frame_step, 29 | fft_length=fft_frame_length, window_fn=window_fn) 30 | istft = tf.contrib.signal.inverse_stft(stfts=stft, frame_length=fft_frame_length, frame_step=fft_frame_step, 31 | window_fn=inverse_window) 32 | 33 | stft_times = [] 34 | istft_times = [] 35 | for x in range(1): 36 | t = time.time() 37 | s.run(stft) 38 | stft_times.append(time.time()-t) 39 | print('stft took:', stft_times[-1]) 40 | t = time.time() 41 | s.run(istft) 42 | istft_times.append(time.time()-t) 43 | print('istft took:', istft_times[-1]) 44 | 45 | print(stft_times) 46 | print(istft_times) 47 | print(np.mean(stft_times)) 48 | print(np.mean(istft_times)) 49 | 50 | 51 | with tf.Session() as sess: 52 | t, original, stft_t, reconstructed = sess.run([_time, firstSignal, stft, istft]) 53 | 54 | def _pavlovs_SNR(y_orig, y_inp): 55 | norm_y_orig = np.linalg.norm(y_orig) + 1e-10 56 | norm_y_orig_minus_y_inp = np.linalg.norm(y_orig - y_inp) 57 | return 10 * np.log10((abs(norm_y_orig ** 2)) / abs((norm_y_orig_minus_y_inp ** 2))) 58 | 59 | print(_pavlovs_SNR(original, reconstructed)) 60 | 61 | ax1 = plt.subplot(211) 62 | plt.plot(original) 63 | plt.plot(reconstructed) 64 | plt.subplot(212) 65 | print(np.transpose(np.abs(stft_t)).shape) 66 | plt.pcolormesh(np.transpose(np.abs(stft_t))) 67 | plt.show() 68 | 69 | -------------------------------------------------------------------------------- /utils/legacy/evaluationWriter.py: -------------------------------------------------------------------------------- 1 | # import pandas as pd 2 | import numpy as np 3 | 4 | __author__ = 'Andres' 5 | 6 | 7 | class EvaluationWriter(object): 8 | def __init__(self, excelFileName): 9 | # self._writer = pd.ExcelWriter(excelFileName) 10 | self._index = 0 11 | 12 | def evaluate(self, reconstructed, original_gaps, step): 13 | assert (len(original_gaps) == len(reconstructed)) 14 | 15 | SNRs = self._pavlovs_SNR(original_gaps, reconstructed) 16 | 17 | norm_orig = self._squaredEuclideanNorm(original_gaps) / 5 18 | error = original_gaps - reconstructed 19 | reconstruction_loss = 0.5 * np.sum(np.square(error), axis=1) * (1 + 1 / norm_orig) 20 | 21 | # df = pd.DataFrame({'SNRs ' + str(step): SNRs, 'reconstruction_loss ' + str(step): reconstruction_loss}) 22 | # df.describe().to_excel(self._writer, sheet_name='general', startcol=self._index, index=not self._index) 23 | self._index += 3 24 | return np.mean(SNRs) 25 | 26 | def 
evaluateImages(self, reconstructed, original_gaps, step): 27 | print('original_gaps:', original_gaps.shape) 28 | print('reconstructed:', reconstructed.shape) 29 | assert (original_gaps.shape == reconstructed.shape) 30 | 31 | SNRs = self._pavlovs_SNR(original_gaps, reconstructed, onAxis=(1, 2, 3)) 32 | 33 | # norm_orig = self._squaredEuclideanNorm(original_gaps, onAxis=(1, 2, 3)) / 5 34 | # error = original_gaps - reconstructed 35 | # reconstruction_loss = 0.5 * np.sum(np.square(error), axis=(1, 2, 3)) * (1 + 1 / norm_orig) 36 | # 37 | # # df = pd.DataFrame({'SNRs ' + str(step): SNRs, 'reconstruction_loss ' + str(step): reconstruction_loss}) 38 | # # df.describe().to_excel(self._writer, sheet_name='general', startcol=self._index, index=not self._index) 39 | # self._index += 3 40 | return np.mean(SNRs) 41 | 42 | def _pavlovs_SNR(self, y_orig, y_inp, onAxis=(1,)): 43 | norm_y_orig = self._squaredEuclideanNorm(y_orig, onAxis) 44 | norm_y_orig_minus_y_inp = self._squaredEuclideanNorm(y_orig - y_inp, onAxis) 45 | return 10 * np.log10(norm_y_orig / norm_y_orig_minus_y_inp) 46 | 47 | def _squaredEuclideanNorm(self, vector, onAxis=(1,)): 48 | squared = np.square(vector) 49 | print('squared:', squared.shape) 50 | summed = np.sum(squared, axis=onAxis) 51 | return summed 52 | 53 | def save(self): 54 | pass 55 | # self._writer.save() 56 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatBigger.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from network.emptyTFGraph import EmptyTfGraph 4 | from utils.legacy.contextEncoder import ContextEncoderNetwork 5 | 6 | __author__ = 'Andres' 7 | 8 | tf.reset_default_graph() 9 | train_filename = 'train_full_w5120_g1024_h512_ex18978619.tfrecords' 10 | valid_filename = 'valid_full_w5120_g1024_h512_ex893971.tfrecords' 11 | 12 | window_size = 5120 13 | gap_length = 1024 14 | batch_size = 256 15 | 16 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 17 | 18 | dataset = aModel.output() 19 | first_half = dataset[:, :(window_size - gap_length) // 2] 20 | second_half = dataset[:, (window_size - gap_length) // 2:] 21 | stacked_halfs = tf.stack([first_half, second_half], axis=2) 22 | aModel.setOutputTo(stacked_halfs) 23 | 24 | with tf.variable_scope("Encoder"): 25 | aModel.addReshape((batch_size, 1, (window_size - gap_length) // 2, 2)) 26 | filter_widths = [129, 65, 17, 9] 27 | input_channels = [2, 32, 64, 128] 28 | output_channels = [32, 64, 128, 256] 29 | strides = [[1, 1, 4, 1]] * len(input_channels) 30 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv'] 31 | aModel.addSeveralConvLayers(filter_widths=filter_widths, input_channels=input_channels, 32 | output_channels=output_channels, strides=strides, names=names) 33 | 34 | aModel.addReshape((batch_size, 2048)) 35 | aModel.addFullyConnectedLayer(2048, 2048, 'Fully') 36 | aModel.addRelu() 37 | aModel.addReshape((batch_size, 1, 8, 256)) 38 | 39 | 40 | with tf.variable_scope("Decoder"): 41 | filter_widths = [9, 17, 65] 42 | input_channels = [256, 128, 64] 43 | output_channels = [128, 64, 16] 44 | strides = [[1, 1, 4, 1]] * len(input_channels) 45 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv'] 46 | aModel.addSeveralDeconvLayers(filter_widths=filter_widths, input_channels=input_channels, 47 | output_channels=output_channels, strides=strides, names=names) 48 | aModel.addDeconvLayerWithoutNonLin(filter_width=129, 
input_channels=16, output_channels=1, 49 | stride=(1, 1, 2, 1), name="Last_Deconv") 50 | aModel.addReshape((batch_size, gap_length)) 51 | 52 | aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 53 | gap_length=gap_length, learning_rate=1e-5, name='nat_bigger') 54 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6, restore_num=474000) 55 | -------------------------------------------------------------------------------- /datasetGenerator/exampleProcessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | 4 | __author__ = 'Andres' 5 | 6 | 7 | class ExampleProcessor(object): 8 | def __init__(self, gapLength=1024, sideLength=2048, hopSize=512, gapMinRMS=1e-3): 9 | self._sideLength = sideLength 10 | self._gapLength = gapLength 11 | self._totalLength = gapLength + 2*sideLength 12 | self._hopSize = hopSize 13 | self._gapMinRMS = gapMinRMS 14 | 15 | def gapLength(self): 16 | return self._gapLength 17 | 18 | def sideLength(self): 19 | return self._sideLength 20 | 21 | def describe(self): 22 | return "_w" + str(self._totalLength) + '_g' + str(self._gapLength) + '_h' + str(self._hopSize) 23 | 24 | def process(self, audio_signal): 25 | audio_without_silence_at_beginning_and_end = self._trim_silence(audio_signal, frame_length=self._gapLength) 26 | windowed_audio = self._window(audio_without_silence_at_beginning_and_end) 27 | processed_windows = self._remove_examples_with_low_energy_in_gap(windowed_audio) 28 | return processed_windows 29 | 30 | def _trim_silence(self, audio, frame_length=1024): 31 | if audio.size < frame_length: 32 | frame_length = audio.size 33 | energy = librosa.feature.rmse(audio, frame_length=frame_length) 34 | frames = np.nonzero(energy > self._gapMinRMS * 10) 35 | indices = librosa.core.frames_to_samples(frames)[1] 36 | 37 | # Note: indices can be an empty array, if the whole audio was silence. 
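# The threshold keeps frames whose RMS exceeds 10x the minimum gap RMS; frames_to_samples
# maps those frames back to sample positions, so the slice below trims leading and trailing
# silence while leaving quiet stretches in the middle of the signal untouched.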
38 | return audio[indices[0]:indices[-1]] if indices.size else audio[0:0] 39 | 40 | def _window(self, audio_signal): 41 | window_count = int((len(audio_signal) - self._totalLength) / self._hopSize) 42 | 43 | windowed_audios = np.array([]) 44 | for window_index in range(int(window_count)): 45 | initial_index = int(window_index * self._hopSize) 46 | windowed_audios = np.append(windowed_audios, audio_signal[initial_index:initial_index + self._totalLength]) 47 | windowed_audios = np.reshape(windowed_audios, (-1, self._totalLength)) 48 | return windowed_audios 49 | 50 | def _remove_examples_with_low_energy_in_gap(self, windows): 51 | begin = int(np.floor((self._totalLength - self._gapLength) / 2)) 52 | end = int(np.floor((self._totalLength + self._gapLength) / 2)) 53 | gaps = windows[:, begin:end] 54 | 55 | mask = np.where(np.sum(np.abs(gaps), axis=1) < self._gapLength * self._gapMinRMS) 56 | processed_windows = np.delete(windows, mask, axis=0) 57 | 58 | return processed_windows 59 | 60 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatNatBigger.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from network.emptyTFGraph import EmptyTfGraph 4 | from utils.legacy.contextEncoder import ContextEncoderNetwork 5 | 6 | __author__ = 'Andres' 7 | 8 | tf.reset_default_graph() 9 | train_filename = 'train_full_w5120_g1024_h512_ex18978619.tfrecords' 10 | valid_filename = 'valid_full_w5120_g1024_h512_ex893971.tfrecords' 11 | 12 | window_size = 5120 13 | gap_length = 1024 14 | batch_size = 256 15 | 16 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 17 | 18 | dataset = aModel.output() 19 | first_half = dataset[:, :(window_size - gap_length) // 2] 20 | second_half = dataset[:, (window_size - gap_length) // 2:] 21 | stacked_halfs = tf.stack([first_half, second_half], axis=2) 22 | aModel.setOutputTo(stacked_halfs) 23 | 24 | with tf.variable_scope("Encoder"): 25 | aModel.addReshape((batch_size, 1, (window_size - gap_length) // 2, 2)) 26 | filter_shapes = [(1, 129), (1, 65), (1, 33), (1, 17), (1, 17), (1, 17)] 27 | input_channels = [2, 32, 128, 512, 256, 128] 28 | output_channels = [*input_channels[1:], 64] 29 | strides = [[1, 1, 2, 1]] * len(input_channels) 30 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv', 'Six_Conv'] 31 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 32 | output_channels=output_channels, strides=strides, names=names) 33 | 34 | aModel.addReshape((batch_size, 2048)) 35 | aModel.addFullyConnectedLayer(2048, 2048, 'Fully') 36 | aModel.addRelu() 37 | aModel.addReshape((batch_size, 1, 32, 64)) 38 | 39 | with tf.variable_scope("Decoder"): 40 | filter_shapes = [(1, 17), (1, 17), (1, 33), (1, 65), (1, 65)] 41 | input_channels = [64, 128, 512, 256, 128] 42 | output_channels = [*input_channels[1:], 16] 43 | strides = [[1, 1, 2, 1]] * len(input_channels) 44 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv', 'Fourth_Deconv', 'Fifth_Deconv'] 45 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 46 | output_channels=output_channels, strides=strides, names=names) 47 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(1, 129), input_channels=16, output_channels=1, 48 | stride=(1, 1, 1, 1), name="Last_Deconv") 49 | aModel.addReshape((batch_size, gap_length)) 50 | 51 | aContextEncoderNetwork = 
ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 52 | gap_length=gap_length, learning_rate=1e-5, name='nat_sec_bigg') 53 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6, restore_num=564425, per_process_gpu_memory_fraction=0.9) 54 | 55 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftTest.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from network.emptyTFGraph import EmptyTfGraph 4 | from utils.legacy.stftMagContextEncoder import StftTestContextEncoder 5 | 6 | __author__ = 'Andres' 7 | 8 | tf.reset_default_graph() 9 | train_filename = 'test_full_w5120_g1024_h512_ex292266.tfrecords' 10 | valid_filename = 'test_full_w5120_g1024_h512_ex292266.tfrecords' 11 | 12 | window_size = 5120 13 | gap_length = 1024 14 | batch_size = 256 15 | 16 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 17 | 18 | dataset = aModel.output() 19 | signal_length = window_size - gap_length 20 | first_half = dataset[:, :signal_length // 2] 21 | second_half = dataset[:, signal_length // 2:] 22 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 23 | 24 | with tf.name_scope('Energy_Spectogram'): 25 | fft_frame_length = 512 26 | fft_frame_step = 128 27 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step) 28 | mag_stft = tf.abs(stft) # (256, 2, 13, 257) 29 | mag_stft = tf.reshape(mag_stft, (batch_size, 13, 257, 2)) 30 | aModel.setOutputTo(mag_stft) 31 | 32 | with tf.variable_scope("Encoder"): 33 | filter_widths = [(3, 33), (2, 9), (1, 3)] 34 | input_channels = [2, 32, 64] 35 | output_channels = [32, 64, 128] 36 | strides = [[1, 2, 4, 1], [1, 2, 4, 1], [1, 2, 4, 1]] 37 | names = ['First_Conv', 'Second_Conv', 'Third_Conv'] 38 | aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 39 | output_channels=output_channels, strides=strides, names=names) 40 | 41 | aModel.addReshape((batch_size, 1280)) 42 | aModel.addFullyConnectedLayer(1280, 896, 'Fully') 43 | aModel.addRelu() 44 | aModel.addReshape((batch_size, 1, 7, 128)) 45 | 46 | with tf.variable_scope("Decoder"): 47 | filter_widths = [(1, 5), (1, 9)] 48 | input_channels = [128, 256] 49 | output_channels = [256, 128] 50 | strides = [[1, 1, 2, 1]] * len(input_channels) 51 | names = ['First_Deconv', 'Second_Deconv'] 52 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 53 | output_channels=output_channels, strides=strides, names=names) 54 | aModel.addReshape((batch_size, 1, 7, 512)) 55 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(1, 3), input_channels=512, output_channels=257, 56 | stride=(1, 1, 1, 1), name="Last_Deconv") 57 | aModel.addReshape((batch_size, 7, 257)) 58 | 59 | aContextEncoderNetwork = StftTestContextEncoder(model=aModel, batch_size=batch_size, window_size=window_size, 60 | gap_length=gap_length, learning_rate=1e-4, name='nat_mag_stft_2') 61 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 62 | -------------------------------------------------------------------------------- /utils/legacy/notebooks/train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": false 8 | }, 9 | "outputs": [], 10 | 
"source": [ 11 | "import tensorflow as tf\n", 12 | "from network.natContextEncoder import ContextEncoderNetwork" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "---------\n", 25 | "ContextEncoder\n", 26 | "---------\n", 27 | "Tensor(\"stack:0\", shape=(256, 2048, 2), dtype=float32)\n", 28 | "Tensor(\"Encoder/Reshape:0\", shape=(256, 1, 2048, 2), dtype=float32)\n", 29 | "Tensor(\"Encoder/First_Conv/Relu:0\", shape=(256, 1, 512, 32), dtype=float32)\n", 30 | "Tensor(\"Encoder/Second_Conv/Relu:0\", shape=(256, 1, 128, 64), dtype=float32)\n", 31 | "Tensor(\"Encoder/Third_Conv/Relu:0\", shape=(256, 1, 32, 64), dtype=float32)\n", 32 | "Tensor(\"Reshape:0\", shape=(256, 2048), dtype=float32)\n", 33 | "Tensor(\"Fully/add:0\", shape=(256, 2048), dtype=float32)\n", 34 | "Tensor(\"Reshape_1:0\", shape=(256, 1, 32, 64), dtype=float32)\n", 35 | "Tensor(\"Decoder/First_Deconv/Relu:0\", shape=(256, 1, 128, 64), dtype=float32)\n", 36 | "Tensor(\"Decoder/Second_Deconv/Relu:0\", shape=(256, 1, 512, 16), dtype=float32)\n", 37 | "Tensor(\"Decoder/Last_Deconv/Relu:0\", shape=(256, 1, 1024, 1), dtype=float32)\n", 38 | "Tensor(\"Decoder/Reshape:0\", shape=(256, 1024), dtype=float32)\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "tf.reset_default_graph()\n", 44 | "\n", 45 | "train_filename = 'test_w5120_g1024_h512_ex63501.tfrecords'\n", 46 | "valid_filename = 'test_w5120_g1024_h512_ex63501.tfrecords'\n", 47 | "\n", 48 | "aContextEncoderNetwork = ContextEncoderNetwork(batch_size=256, window_size=5120, gap_length=1024, \n", 49 | " learning_rate=1e-5, name='train')\n", 50 | "# aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "aContextEncoderNetwork.description()" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.6.2" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /utils/legacy/timeLiner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import json 4 | 5 | import tensorflow as tf 6 | from tensorflow.contrib.layers import fully_connected as fc 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | from tensorflow.python.client import timeline 9 | 10 | 11 | class TimeLiner: 12 | _timeline_dict = None 13 | 14 | def update_timeline(self, chrome_trace): 15 | # convert crome trace to python dict 16 | chrome_trace_dict = json.loads(chrome_trace) 17 | # for first run store full trace 18 | if self._timeline_dict is None: 19 | self._timeline_dict = chrome_trace_dict 20 | # for other - update only time consumption, not definitions 21 | else: 22 | for event in chrome_trace_dict['traceEvents']: 23 | # events time consumption started with 'ts' prefix 24 | if 'ts' in event: 25 | self._timeline_dict['traceEvents'].append(event) 26 | 27 | def save(self, f_name): 
28 | with open(f_name, 'w') as f: 29 | json.dump(self._timeline_dict, f) 30 | 31 | 32 | batch_size = 100 33 | 34 | inputs = tf.placeholder(tf.float32, [batch_size, 784]) 35 | targets = tf.placeholder(tf.float32, [batch_size, 10]) 36 | 37 | with tf.variable_scope("layer_1"): 38 | fc_1_out = fc(inputs, num_outputs=500, activation_fn=tf.nn.sigmoid) 39 | with tf.variable_scope("layer_2"): 40 | fc_2_out = fc(fc_1_out, num_outputs=784, activation_fn=tf.nn.sigmoid) 41 | with tf.variable_scope("layer_3"): 42 | logits = fc(fc_2_out, num_outputs=10) 43 | 44 | loss = tf.reduce_mean( 45 | tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets)) 46 | train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss) 47 | 48 | if __name__ == '__main__': 49 | mnist_save_dir = os.path.join(tempfile.gettempdir(), 'MNIST_data') 50 | mnist = input_data.read_data_sets(mnist_save_dir, one_hot=True) 51 | 52 | config = tf.ConfigProto() 53 | config.gpu_options.allow_growth = True 54 | with tf.Session(config=config) as sess: 55 | sess.run(tf.global_variables_initializer()) 56 | 57 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 58 | run_metadata = tf.RunMetadata() 59 | many_runs_timeline = TimeLiner() 60 | runs = 5 61 | for i in range(runs): 62 | batch_input, batch_target = mnist.train.next_batch(batch_size) 63 | feed_dict = {inputs: batch_input, 64 | targets: batch_target} 65 | 66 | sess.run(train_op, 67 | feed_dict=feed_dict, 68 | options=options, 69 | run_metadata=run_metadata) 70 | 71 | fetched_timeline = timeline.Timeline(run_metadata.step_stats) 72 | chrome_trace = fetched_timeline.generate_chrome_trace_format() 73 | many_runs_timeline.update_timeline(chrome_trace) 74 | many_runs_timeline.save('timeline_03_merged_%d_runs.json' % runs) 75 | -------------------------------------------------------------------------------- /datasetGenerator/tfRecordGenerator.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import tensorflow as tf 4 | import time 5 | import os 6 | import sys 7 | 8 | from audioread import NoBackendError 9 | 10 | __author__ = 'Andres' 11 | 12 | 13 | class TFRecordGenerator(object): 14 | """To generate a Dataset, instantiate this class with its arguments and call generateDataset()""" 15 | 16 | def __init__(self, baseName, pathToDataFolder, exampleProcessor, targetSamplingRate=16000, notifyEvery=10000): 17 | self._pathToDataFolder = pathToDataFolder 18 | self._exampleProcessor = exampleProcessor 19 | self._notifyEvery = notifyEvery 20 | self._targetSamplingRate = targetSamplingRate 21 | self._baseName = baseName 22 | 23 | def name(self): 24 | return self._baseName + self._exampleProcessor.describe() 25 | 26 | def generateDataset(self): 27 | start = time.time() 28 | 29 | train_filename = self.name() + '.tfrecords' 30 | writer = tf.python_io.TFRecordWriter(train_filename) 31 | 32 | print("start:", start) 33 | count = 0 34 | total = 0 35 | 36 | for file_name in os.listdir(self._pathToDataFolder): 37 | if self._filenameShouldBeLoaded(file_name): 38 | try: 39 | audio, sr = librosa.load(self._pathToDataFolder + '/' + file_name, sr=self._targetSamplingRate) 40 | except NoBackendError: 41 | print("No backend for file:", file_name) 42 | continue 43 | 44 | windows = self._exampleProcessor.process(audio) 45 | if windows.shape[0] == 0: 46 | print("Got a completely silenced signal! 
with path:", file_name) 47 | continue 48 | 49 | for window in windows: 50 | self._createFeature(window, writer) 51 | 52 | count, total = self._notifyIfNeeded(count + len(windows), total) 53 | sys.stdout.flush() 54 | writer.close() 55 | end = time.time() - start 56 | 57 | print("there were: ", total + count) 58 | print("wow, that took", end, "seconds... might want to change that to mins :)") 59 | 60 | def _createFeature(self, window, writer): 61 | window_bytes = window.astype(np.float32).tostring() 62 | 63 | example = tf.train.Example(features=tf.train.Features(feature={ 64 | 'valid/windows': self._bytes_feature(window_bytes)})) 65 | 66 | writer.write(example.SerializeToString()) 67 | 68 | def _bytes_feature(self, value): 69 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 70 | 71 | def _filenameShouldBeLoaded(self, filename): 72 | raise NotImplementedError("Subclass Responsibility") 73 | 74 | def _notifyIfNeeded(self, count, total): 75 | if count > self._notifyEvery: 76 | count -= self._notifyEvery 77 | total += self._notifyEvery 78 | print(self._notifyEvery, "plus!", time.time()) 79 | return count, total 80 | return count, total 81 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftMagnitudeTest.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import slim 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | from utils.legacy.stftMagContextEncoder import StftTestContextEncoder 6 | 7 | __author__ = 'Andres' 8 | 9 | tf.reset_default_graph() 10 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 11 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 12 | 13 | window_size = 5120 14 | gap_length = 1024 15 | batch_size = 256 16 | 17 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 18 | 19 | signal = aModel.output() 20 | 21 | with tf.name_scope('Energy_Spectogram'): 22 | fft_frame_length = 512 23 | fft_frame_step = 128 24 | stft = tf.contrib.signal.stft(signals=signal, frame_length=fft_frame_length, frame_step=fft_frame_step) 25 | 26 | sides_stft = tf.stack((stft[:, :15, :], stft[:, 15+7:, :]), axis=3) 27 | 28 | mag_stft = tf.abs(sides_stft) # (256, 15, 257, 2) 29 | aModel.setOutputTo(mag_stft) 30 | 31 | with tf.variable_scope("Encoder"): 32 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 33 | input_channels = [2, 32, 64, 128, 128] 34 | output_channels = [32, 64, 128, 128, 200] 35 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 36 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 37 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 38 | output_channels=output_channels, strides=strides, names=names) 39 | 40 | aModel.addReshape((batch_size, 3200)) 41 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 42 | aModel.addRelu() 43 | aModel.addReshape((batch_size, 8, 8, 32)) 44 | 45 | with tf.variable_scope("Decoder"): 46 | filter_shapes = [(5, 5), (3, 3)] 47 | input_channels = [32, 64] 48 | output_channels = [64, 257] 49 | strides = [[1, 2, 2, 1]] * len(input_channels) 50 | names = ['First_Deconv', 'Second_Deconv'] 51 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 52 | output_channels=output_channels, strides=strides, names=names) 53 | 54 | aModel.addReshape((batch_size, 8, 257, 128)) 55 | 
aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=7, stride=(1, 2, 2, 1), 56 | name='Third_deconv') 57 | 58 | aModel.addReshape((batch_size, 7, 257, 32)) 59 | 60 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=1, 61 | stride=(1, 1, 1, 1), name="Last_Deconv") 62 | aModel.addReshape((batch_size, 7, 257)) 63 | 64 | print(aModel.description()) 65 | 66 | model_vars = tf.trainable_variables() 67 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 68 | 69 | aContextEncoderNetwork = StftTestContextEncoder(model=aModel, batch_size=batch_size, stft=stft, window_size=window_size, 70 | gap_length=gap_length, learning_rate=1e-4, name='nat_mag_stft_5_') 71 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 72 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftRealImagTest.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # from tensorflow.contrib import slim 4 | from network.emptyTFGraph import EmptyTfGraph 5 | from utils.legacy.stftRealImagContextEncoder import StftRealImagContextEncoder 6 | 7 | __author__ = 'Andres' 8 | 9 | tf.reset_default_graph() 10 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 11 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 12 | 13 | window_size = 5120 14 | gap_length = 1024 15 | batch_size = 256 16 | 17 | fft_frame_length = 512 18 | fft_frame_step = 128 19 | 20 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 21 | 22 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 23 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 37, 257, 2) 24 | 25 | with tf.name_scope('Remove_gap_stft'): 26 | stft = aModel.output() 27 | sides_stft = tf.concat((stft[:, :15, :, :], stft[:, 15+7:, :, :]), axis=3) # (256, 15, 257, 4) 28 | aModel.setOutputTo(sides_stft) 29 | print(aModel.output()) 30 | 31 | with tf.variable_scope("Encoder"): 32 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 33 | input_channels = [4, 32, 64, 128, 128] 34 | output_channels = [32, 64, 128, 128, 200] 35 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 36 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 37 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 38 | output_channels=output_channels, strides=strides, names=names) 39 | 40 | aModel.addReshape((batch_size, 3200)) 41 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 42 | aModel.addRelu() 43 | aModel.addBatchNormalization() 44 | aModel.addDropout(0.3) 45 | aModel.addReshape((batch_size, 8, 8, 32)) 46 | 47 | with tf.variable_scope("Decoder"): 48 | filter_shapes = [(5, 5), (3, 3)] 49 | input_channels = [32, 64] 50 | output_channels = [64, 257] 51 | strides = [[1, 2, 2, 1]] * len(input_channels) 52 | names = ['First_Deconv', 'Second_Deconv'] 53 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 54 | output_channels=output_channels, strides=strides, names=names) 55 | 56 | aModel.addReshape((batch_size, 8, 257, 128)) 57 | aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=7, stride=(1, 2, 2, 1), 58 | name='Third_deconv') 59 | aModel.addBatchNormalization() 60 | aModel.addDropout(0.1) 61 | 62 | aModel.addReshape((batch_size, 7, 257, 
32)) 63 | 64 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=2, 65 | stride=(1, 1, 1, 1), name="Last_Deconv") 66 | 67 | print(aModel.description()) 68 | 69 | # model_vars = tf.trainable_variables() 70 | # slim.model_analyzer.analyze_vars(model_vars, print_info=True) 71 | 72 | aContextEncoderNetwork = StftRealImagContextEncoder(model=aModel, batch_size=batch_size, stft=stft, window_size=window_size, 73 | gap_length=gap_length, learning_rate=1e-4, name='nat_mag_real_imag_1_') 74 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 75 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftSeventh.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from network.emptyTFGraph import EmptyTfGraph 4 | from utils.legacy.contextEncoder import ContextEncoderNetwork 5 | 6 | __author__ = 'Andres' 7 | 8 | tf.reset_default_graph() 9 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 10 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 11 | 12 | window_size = 5120 13 | gap_length = 1024 14 | batch_size = 256 15 | 16 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 17 | 18 | dataset = aModel.output() 19 | signal_length = window_size - gap_length 20 | first_half = dataset[:, :signal_length // 2] 21 | second_half = dataset[:, signal_length // 2:] 22 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 23 | 24 | with tf.name_scope('Energy_Spectogram'): 25 | fft_frame_length = 512 26 | fft_frame_step = 128 27 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step) 28 | real_stft = tf.real(stft) 29 | imag_stft = tf.imag(stft) 30 | real_stft_left = real_stft[:, 0, :, :] 31 | real_stft_right = real_stft[:, 1, :, :] 32 | 33 | imag_stft_left = imag_stft[:, 0, :, :] 34 | imag_stft_right = imag_stft[:, 1, :, :] 35 | 36 | real_stft = tf.concat([real_stft_left, real_stft_right], 1) 37 | imag_stft = tf.concat([imag_stft_left, imag_stft_right], 1) 38 | print(real_stft) 39 | 40 | stacked = tf.stack([real_stft, imag_stft], axis=3) 41 | aModel.setOutputTo(stacked) 42 | 43 | with tf.variable_scope("Encoder"): 44 | filter_widths = [(9, 97), (5, 9), (3, 3), (2, 2)] 45 | input_channels = [2, 32, 64, 128] 46 | output_channels = [32, 64, 128, 160] 47 | strides = [[1, 2, 4, 1], [1, 2, 4, 1], [1, 2, 4, 1], [1, 1, 1, 1]] 48 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv'] 49 | aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 50 | output_channels=output_channels, strides=strides, names=names) 51 | 52 | aModel.addReshape((batch_size, 3200)) 53 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 54 | aModel.addRelu() 55 | aModel.addReshape((batch_size, 1, 32, 64)) 56 | 57 | with tf.variable_scope("Decoder"): 58 | filter_widths = [(1, 11), (1, 3), (1, 3), (1, 11), (1, 97)] 59 | input_channels = [64, 128, 256, 128, 64] 60 | output_channels = [128, 256, 128, 64, 16] 61 | strides = [[1, 1, 2, 1]] * len(input_channels) 62 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv', 'Fourth_Deconv', 'Fifth_Deconv'] 63 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 64 | output_channels=output_channels, strides=strides, names=names) 65 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(1, 1024), 
input_channels=16, output_channels=1, 66 | stride=(1, 1, 1, 1), name="Last_Deconv") 67 | aModel.addReshape((batch_size, gap_length)) 68 | 69 | aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 70 | gap_length=gap_length, learning_rate=1e-5, name='nat_full_stft_8_') 71 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 72 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftEigth.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from network.emptyTFGraph import EmptyTfGraph 4 | from utils.legacy.contextEncoder import ContextEncoderNetwork 5 | 6 | __author__ = 'Andres' 7 | 8 | tf.reset_default_graph() 9 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 10 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 11 | 12 | window_size = 5120 13 | gap_length = 1024 14 | batch_size = 256 15 | 16 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 17 | 18 | dataset = aModel.output() 19 | signal_length = window_size - gap_length 20 | first_half = dataset[:, :signal_length // 2] 21 | second_half = dataset[:, signal_length // 2:] 22 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 23 | 24 | with tf.name_scope('Energy_Spectogram'): 25 | fft_frame_length = 512 26 | fft_frame_step = 128 27 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step) 28 | real_stft = tf.real(stft) 29 | imag_stft = tf.imag(stft) 30 | real_stft_left = real_stft[:, 0, :, :] 31 | real_stft_right = real_stft[:, 1, :, :] 32 | 33 | imag_stft_left = imag_stft[:, 0, :, :] 34 | imag_stft_right = imag_stft[:, 1, :, :] 35 | 36 | real_stft = tf.concat([real_stft_left, real_stft_right], 1) 37 | imag_stft = tf.concat([imag_stft_left, imag_stft_right], 1) 38 | print(real_stft) 39 | 40 | stacked = tf.stack([real_stft, imag_stft], axis=3) 41 | aModel.setOutputTo(stacked) 42 | 43 | with tf.variable_scope("Encoder"): 44 | filter_widths = [(7, 89), (4, 23), (2, 11), (2, 3), (1, 3)] 45 | input_channels = [2, 32, 32, 128, 128] 46 | output_channels = [32, 32, 128, 128, 200] 47 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 2, 2, 1], [1, 1, 1, 1]] 48 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 49 | aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 50 | output_channels=output_channels, strides=strides, names=names) 51 | 52 | aModel.addReshape((batch_size, 3200)) 53 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 54 | aModel.addRelu() 55 | aModel.addReshape((batch_size, 1, 32, 64)) 56 | 57 | with tf.variable_scope("Decoder"): 58 | filter_widths = [(1, 11), (1, 3), (1, 3), (1, 11), (1, 97)] 59 | input_channels = [64, 128, 256, 128, 64] 60 | output_channels = [128, 256, 128, 64, 16] 61 | strides = [[1, 1, 2, 1]] * len(input_channels) 62 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv', 'Fourth_Deconv', 'Fifth_Deconv'] 63 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 64 | output_channels=output_channels, strides=strides, names=names) 65 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(1, 1024), input_channels=16, output_channels=1, 66 | stride=(1, 1, 1, 1), name="Last_Deconv") 67 | aModel.addReshape((batch_size, gap_length)) 68 | 69 | 
aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 70 | gap_length=gap_length, learning_rate=1e-5, name='nat_full_stft_8_') 71 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 72 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftSixth.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.signal.python.ops import window_ops 5 | 6 | from network.tfGraph import TFGraph 7 | from utils.legacy.contextEncoder import ContextEncoderNetwork 8 | 9 | __author__ = 'Andres' 10 | 11 | tf.reset_default_graph() 12 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 13 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 14 | 15 | window_size = 5120 16 | gap_length = 1024 17 | batch_size = 256 18 | 19 | aModel = TFGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 20 | 21 | dataset = aModel.output() 22 | signal_length = window_size - gap_length 23 | first_half = dataset[:, :signal_length // 2] 24 | second_half = dataset[:, signal_length // 2:] 25 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 26 | 27 | with tf.name_scope('Energy_Spectogram'): 28 | fft_frame_length = 512 29 | fft_frame_step = 128 30 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 31 | 32 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step, 33 | window_fn=window_fn) 34 | real_stft = tf.real(stft) 35 | imag_stft = tf.imag(stft) 36 | real_stft_left = real_stft[:, 0, :, :] 37 | real_stft_right = real_stft[:, 1, :, :] 38 | 39 | imag_stft_left = imag_stft[:, 0, :, :] 40 | imag_stft_right = imag_stft[:, 1, :, :] 41 | 42 | real_stft = tf.concat([real_stft_left, real_stft_right], 1) 43 | imag_stft = tf.concat([imag_stft_left, imag_stft_right], 1) 44 | print(real_stft) 45 | 46 | stacked = tf.stack([real_stft, imag_stft], axis=3) 47 | aModel.setOutputTo(stacked) 48 | 49 | with tf.variable_scope("Encoder"): 50 | filter_widths = [(7, 89), (4, 43), (2, 11), (2, 3), (2, 5)] 51 | input_channels = [2, 16, 32, 128, 128] 52 | output_channels = [16, 32, 128, 128, 64] 53 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 1, 1, 1]] 54 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 55 | aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 56 | output_channels=output_channels, strides=strides, names=names) 57 | print(aModel.output()) 58 | 59 | aModel.addReshape((batch_size, 2176)) 60 | aModel.addFullyConnectedLayer(2176, 1152, 'Fully') 61 | aModel.addRelu() 62 | aModel.addReshape((batch_size, 1, 9, 128)) 63 | 64 | with tf.variable_scope("Decoder"): 65 | filter_widths = [(1, 3), (2, 3), (2, 5)] 66 | input_channels = [128, 256, 64] 67 | output_channels = [256, 64, 128] 68 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1]] 69 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv'] 70 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 71 | output_channels=output_channels, strides=strides, names=names) 72 | aModel.addReshape((batch_size, 2, 128, 288)) 73 | 74 | aModel.addDeconvLayer(filter_shape=(1, 17), input_channels=288, output_channels=20, stride=(1, 2, 2, 1), 75 | name='first_deconv_after_reshape') 76 | 
aModel.addDeconvLayerWithoutNonLin(filter_shape=(4, 129), input_channels=20, output_channels=1, 77 | stride=(1, 1, 1, 1), name="Last_Deconv") 78 | aModel.addReshape((batch_size, gap_length)) 79 | 80 | print(aModel.description()) 81 | aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 82 | gap_length=gap_length, learning_rate=1e-4, name='nat_full_stft_6_') 83 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 84 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatMagPhaseGapTest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | from system.preAndPostProcessor import PreAndPostProcessor 6 | from utils.legacy.stftPhaseContextEncoder import StftPhaseContextEncoder 7 | 8 | sys.path.insert(0, '../') 9 | import tensorflow as tf 10 | from tensorflow.contrib import slim 11 | import socket 12 | if 'omenx' in socket.gethostname(): 13 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 14 | 15 | 16 | __author__ = 'Andres' 17 | 18 | tf.reset_default_graph() 19 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 20 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 21 | 22 | signal_length = 5120 23 | gap_length = 1024 24 | batch_size = 256 25 | 26 | fft_window_length = 512 27 | fft_hop_size = 128 28 | 29 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, signal_length), name="Target Model") 30 | anStftForTheInpaintingSetting = PreAndPostProcessor(signalLength=signal_length, 31 | gapLength=gap_length, 32 | fftWindowLength=fft_window_length, 33 | fftHopSize=fft_hop_size) 34 | anStftForTheInpaintingSetting.addStftForGapTo(aTargetModel) 35 | aTargetModel.divideComplexOutputIntoMagAndPhase() # (256, 11, 257, 2) 36 | 37 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, signal_length), name="context encoder") 38 | 39 | anStftForTheInpaintingSetting.addStftForTheContextTo(aModel) 40 | aModel.divideComplexOutputIntoMagAndPhase() 41 | aModel.addReshape((batch_size, 16, 257, 4)) 42 | 43 | with tf.variable_scope("Encoder"): 44 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 45 | input_channels = [4, 32, 64, 128, 128] 46 | output_channels = [32, 64, 128, 128, 200] 47 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 48 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 49 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 50 | output_channels=output_channels, strides=strides, names=names) 51 | 52 | aModel.addReshape((batch_size, 3200)) 53 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 54 | aModel.addRelu() 55 | aModel.addBatchNormalization() 56 | aModel.addReshape((batch_size, 8, 8, 32)) 57 | 58 | with tf.variable_scope("Decoder"): 59 | filter_shapes = [(5, 5), (3, 3)] 60 | input_channels = [32, 64] 61 | output_channels = [64, 257] 62 | strides = [[1, 2, 2, 1]] * len(input_channels) 63 | names = ['First_Deconv', 'Second_Deconv'] 64 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 65 | output_channels=output_channels, strides=strides, names=names) 66 | 67 | aModel.addReshape((batch_size, 8, 257, 128)) 68 | aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 69 | name='Third_deconv') 70 | aModel.addBatchNormalization() 
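# Assuming 'SAME' padding, the strided deconvolution above yields (256, 16, 514, 11); the
# reshape below reinterprets those 16 * 514 * 11 = 90,464 values per example as (11, 257, 32),
# moving the 11 gap frames to the leading axis so the last layer can refine them with 32 channels.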
71 | 72 | aModel.addReshape((batch_size, 11, 257, 32)) 73 | 74 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=1, 75 | stride=(1, 1, 1, 1), name="Last_Deconv") 76 | 77 | print(aModel.description()) 78 | 79 | model_vars = tf.trainable_variables() 80 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 81 | 82 | aContextEncoderNetwork = StftPhaseContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=signal_length, 83 | gap_length=gap_length, learning_rate=1e-3, name='nat_mag_phase_times_mag_gap_') 84 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 85 | -------------------------------------------------------------------------------- /utils/legacy/notebooks/try.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Context Encoder\n", 8 | "\n", 9 | "In this notebook we are going to be trying different networks to test their performance.\n", 10 | "Let's begin by importing tensorflow and the network.\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "scrolled": false 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import tensorflow as tf\n", 22 | "from network.contextEncoder import ContextEncoderNetwork" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Next, we have a modifiable version of the context encoder. The goal is to be able to easily modify the network and try other ideas." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "class ModifiedContextEncoderNetwork(ContextEncoderNetwork):\n", 39 | " def _encoder(self, model, isTraining):\n", 40 | " with tf.variable_scope(\"Encoder\"):\n", 41 | " model.addReshape((self._batch_size, self._window_size - self._gap_length, 1))\n", 42 | " model.addConvLayer(filter_width=129, input_channels=1, output_channels=16,\n", 43 | " stride=4, name=\"First_Conv\", isTraining=isTraining)\n", 44 | " model.addConvLayer(filter_width=65, input_channels=16, output_channels=64,\n", 45 | " stride=4, name=\"Second_Conv\", isTraining=isTraining)\n", 46 | " model.addConvLayer(filter_width=33, input_channels=64, output_channels=256,\n", 47 | " stride=4, name=\"Third_Conv\", isTraining=isTraining)\n", 48 | " model.addConvLayer(filter_width=17, input_channels=256, output_channels=1024,\n", 49 | " stride=4, name=\"Fourth_Conv\", isTraining=isTraining)\n", 50 | " model.addConvLayer(filter_width=9, input_channels=1024, output_channels=4096,\n", 51 | " stride=4, name=\"Last_Conv\", isTraining=isTraining)\n", 52 | "\n", 53 | " def _decoder(self, model, isTraining):\n", 54 | " with tf.variable_scope(\"Decoder\"):\n", 55 | " model.addConvLayerWithoutNonLin(filter_width=5, input_channels=4096, output_channels=1024,\n", 56 | " stride=4, name=\"Decode_Conv\", isTraining=isTraining)\n", 57 | " model.addReshape((self._batch_size, self._gap_length))\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "tf.reset_default_graph()\n", 67 | "\n", 68 | "train_filename = 'train_full_w5120_g1024_h512_19404621.tfrecords'\n", 69 | "valid_filename = 'valid_full_w5120_g1024_h512_ex913967.tfrecords'\n", 70 | "\n", 71 | "aContextEncoderNetwork = 
ModifiedContextEncoderNetwork(batch_size=256, window_size=5120, gap_length=1024, \n", 72 | " learning_rate=1e-5, name='first_try')\n", 73 | "aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6)" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 3 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython3", 93 | "version": "3.6.2" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 2 98 | } 99 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftSec.py: -------------------------------------------------------------------------------- 1 | """ 2 | This network uses an stft representation of the sides of the signal to produce audio as an output. 3 | The frame length was set to 512 and the frame step to 64, although it should be 128. 4 | There are three convolutions and three deconvolutions. 5 | 6 | This small network was trained for 1141999 steps. It appears to still be learning. 7 | The best values on the validation were found at step 1133999: 8 | 9 | SNRs 1133999 reconstruction_loss 1133999 10 | count 65536 65536 11 | mean 10.638316068 7.60381269454956 12 | std 7.0298398286 19.9564838409424 13 | min -27.67805389 0.011582373641431 14 | 25% 4.3873017658 0.263745993375778 15 | 50% 11.126480919 1.37784320116043 16 | 75% 16.480553727 5.75252687931061 17 | max 30.251670154 1465.634765625 18 | 19 | 20 | """ 21 | 22 | 23 | 24 | 25 | import tensorflow as tf 26 | 27 | from network.emptyTFGraph import EmptyTfGraph 28 | from utils.legacy.contextEncoder import ContextEncoderNetwork 29 | 30 | __author__ = 'Andres' 31 | 32 | tf.reset_default_graph() 33 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 34 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 35 | 36 | window_size = 5120 37 | gap_length = 1024 38 | batch_size = 256 39 | 40 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 41 | 42 | dataset = aModel.output() 43 | signal_length = window_size - gap_length 44 | first_half = dataset[:, :signal_length // 2] 45 | second_half = dataset[:, signal_length // 2:] 46 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 47 | 48 | with tf.name_scope('Energy_Spectogram'): 49 | fft_frame_length = 512 50 | fft_frame_step = 128 51 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step) 52 | real_stft = tf.real(stft) 53 | imag_stft = tf.imag(stft) 54 | real_stft_left = real_stft[:, 0, :, :] 55 | real_stft_right = real_stft[:, 1, :, :] 56 | 57 | imag_stft_left = imag_stft[:, 0, :, :] 58 | imag_stft_right = imag_stft[:, 1, :, :] 59 | 60 | real_stft = tf.concat([real_stft_left, real_stft_right], 1) 61 | imag_stft = tf.concat([imag_stft_left, imag_stft_right], 1) 62 | print(real_stft) 63 | 64 | stacked = tf.stack([real_stft, imag_stft], axis=3) 65 | aModel.setOutputTo(stacked) 66 | 67 | with tf.variable_scope("Encoder"): 68 | filter_widths = [(9, 33), (5, 9), (3, 3)] 69 | input_channels = [2, 32, 64] 70 | output_channels = [32, 64, 128] 71 | strides = [[1, 2, 4, 1], [1, 2, 4, 1], [1, 2, 4, 1]] 72 | names = ['First_Conv', 'Second_Conv', 'Third_Conv'] 73 | 
aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 74 | output_channels=output_channels, strides=strides, names=names) 75 | 76 | aModel.addReshape((batch_size, 2560)) 77 | aModel.addFullyConnectedLayer(2560, 2048, 'Fully') 78 | aModel.addRelu() 79 | aModel.addReshape((batch_size, 1, 32, 64)) 80 | 81 | with tf.variable_scope("Decoder"): 82 | filter_widths = [(1, 17), (1, 65)] 83 | input_channels = [64, 64] 84 | output_channels = [64, 16] 85 | strides = [[1, 1, 4, 1]] * len(input_channels) 86 | names = ['First_Deconv', 'Second_Deconv'] 87 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 88 | output_channels=output_channels, strides=strides, names=names) 89 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(1, 129), input_channels=16, output_channels=1, 90 | stride=(1, 1, 2, 1), name="Last_Deconv") 91 | aModel.addReshape((batch_size, gap_length)) 92 | 93 | print(aModel.description()) 94 | aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 95 | gap_length=gap_length, learning_rate=1e-5, name='nat_full_stft_2') 96 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 97 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftGapTest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | from system.preAndPostProcessor import PreAndPostProcessor 6 | 7 | sys.path.insert(0, '../') 8 | import tensorflow as tf 9 | from tensorflow.contrib import slim 10 | import socket 11 | if 'omenx' in socket.gethostname(): 12 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 13 | 14 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 15 | 16 | __author__ = 'Andres' 17 | 18 | tf.reset_default_graph() 19 | if 'omenx' in socket.gethostname(): 20 | train_filename = '/store/nati/datasets/Nsynth/train_w5120_g1024_h512.tfrecords' 21 | valid_filename = '/store/nati/datasets/Nsynth/valid_w5120_g1024_h512.tfrecords' 22 | else: 23 | train_filename = '/scratch/snx3000/nperraud/data/NSynth/train_w5120_g1024_h512.tfrecords' 24 | valid_filename = '/scratch/snx3000/nperraud/data/NSynth/valid_w5120_g1024_h512.tfrecords' 25 | 26 | 27 | signal_length = 5120 28 | gap_length = 1024 29 | batch_size = 256 30 | 31 | fft_window_length = 512 32 | fft_hop_size = 128 33 | 34 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, signal_length), name="Target Model") 35 | anStftForTheInpaintingSetting = PreAndPostProcessor(signalLength=signal_length, 36 | gapLength=gap_length, 37 | fftWindowLength=fft_window_length, 38 | fftHopSize=fft_hop_size) 39 | anStftForTheInpaintingSetting.addStftForGapTo(aTargetModel) 40 | aTargetModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 11, 257, 2) 41 | 42 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, signal_length), name="context encoder") 43 | anStftForTheInpaintingSetting.addStftForTheContextTo(aModel) 44 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 45 | aModel.addReshape((batch_size, 16, 257, 4)) 46 | 47 | with tf.variable_scope("Encoder"): 48 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 49 | input_channels = [4, 32, 64, 128, 128] 50 | output_channels = [32, 64, 128, 128, 200] 51 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 52 | names = ['First_Conv', 
'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 53 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 54 | output_channels=output_channels, strides=strides, names=names) 55 | 56 | aModel.addReshape((batch_size, 3200)) 57 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 58 | aModel.addRelu() 59 | aModel.addBatchNormalization() 60 | aModel.addReshape((batch_size, 8, 8, 32)) 61 | 62 | with tf.variable_scope("Decoder"): 63 | filter_shapes = [(5, 5), (3, 3)] 64 | input_channels = [32, 64] 65 | output_channels = [64, 257] 66 | strides = [[1, 2, 2, 1]] * len(input_channels) 67 | names = ['First_Deconv', 'Second_Deconv'] 68 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 69 | output_channels=output_channels, strides=strides, names=names) 70 | 71 | aModel.addReshape((batch_size, 8, 257, 128)) 72 | aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 73 | name='Third_deconv') 74 | aModel.addBatchNormalization() 75 | 76 | aModel.addReshape((batch_size, 11, 257, 32)) 77 | 78 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=2, 79 | stride=(1, 1, 1, 1), name="Last_Deconv") 80 | 81 | print(aModel.description()) 82 | 83 | model_vars = tf.trainable_variables() 84 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 85 | 86 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=signal_length, 87 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_baseline') 88 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 89 | -------------------------------------------------------------------------------- /system/dnnSystem.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import re 3 | 4 | __author__ = 'Andres' 5 | 6 | 7 | class DNNSystem(object): 8 | def __init__(self, architecture, name): 9 | self._architecture = architecture 10 | self._name = name 11 | 12 | def optimizer(self, learningRate): 13 | raise NotImplementedError("Subclass Responsibility") 14 | 15 | def _feedDict(self, data, sess, isTraining=True): 16 | raise NotImplementedError("Subclass Responsibility") 17 | 18 | def _evaluate(self, summariesDict, feed_dict, validReader, sess): 19 | raise NotImplementedError("Subclass Responsibility") 20 | 21 | def _loadReader(self, dataPath): 22 | raise NotImplementedError("Subclass Responsibility") 23 | 24 | def _evaluationSummaries(self): 25 | raise NotImplementedError("Subclass Responsibility") 26 | 27 | def train(self, trainTFRecordPath, validTFRecordPath, learningRate, numSteps=6e5, restoreNum=None): 28 | with tf.Session() as sess: 29 | trainReader = self._loadReader(trainTFRecordPath) 30 | validReader = self._loadReader(validTFRecordPath) 31 | optimizer = self.optimizer(learningRate) 32 | 33 | saver = tf.train.Saver(max_to_keep=100) 34 | path = self.modelsPath(restoreNum) 35 | _modelNum = get_trailing_number(path[:-5]) 36 | 37 | if _modelNum == 0: 38 | init = tf.global_variables_initializer() 39 | sess.run([init, tf.local_variables_initializer()]) 40 | print("Initialized") 41 | else: 42 | saver.restore(sess, path) 43 | sess.run([tf.local_variables_initializer()]) 44 | print("Model restored.") 45 | 46 | logs_path = 'utils/logdir/' + self._name # write each run to a diff folder. 
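# A worked note on the restore logic above (illustrative values, not part of
# the original training loop): modelsPath(n) below returns
# "utils/saved_models/<name>/model-<name><n>.ckpt", so path[:-5] strips the
# ".ckpt" suffix and get_trailing_number() recovers the step count, e.g.
#     get_trailing_number("model-run0")      -> 0      (fresh run: initialize)
#     get_trailing_number("model-run42000")  -> 42000  (restore and continue)
# A _modelNum of 0 triggers initialization; any other value restores the saved
# weights, and summaries and checkpoints are indexed at _modelNum + step so
# the step count stays monotonic across restarts.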
47 | print("logs path:", logs_path) 48 | writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 49 | 50 | summariesDict = self._evaluationSummaries() 51 | 52 | try: 53 | trainReader.start() 54 | validReader.start() 55 | 56 | for step in range(1, int(numSteps)): 57 | try: 58 | data = trainReader.dataOperation(session=sess) 59 | except StopIteration: 60 | print("End of queue at step", step) 61 | break 62 | 63 | feed_dict = self._feedDict(data, sess, isTraining=True) 64 | sess.run(optimizer, feed_dict=feed_dict) 65 | 66 | if step % 40 == 0: 67 | train_summ = sess.run(self._architecture.lossSummaries(), feed_dict=feed_dict) 68 | writer.add_summary(train_summ, _modelNum + step) 69 | if step % 2000 == 0: 70 | summaries = self._evaluate(summariesDict, feed_dict, validReader, sess) 71 | for summary in summaries: 72 | writer.add_summary(summary, _modelNum+step) 73 | saver.save(sess, self.modelsPath(_modelNum + step)) 74 | except KeyboardInterrupt: 75 | pass 76 | 77 | saver.save(sess, self.modelsPath(_modelNum + step)) 78 | trainReader.finish() 79 | validReader.finish() 80 | print("Finalizing at step:", _modelNum + step) 81 | print("Last saved model:", self.modelsPath(_modelNum + step)) 82 | 83 | def modelsPath(self, models_number=None): 84 | pathdir = "utils/saved_models/" + self._name 85 | if models_number is None: 86 | ckpt = tf.train.get_checkpoint_state(pathdir) 87 | print(ckpt) 88 | if ckpt and ckpt.model_checkpoint_path: 89 | return ckpt.model_checkpoint_path 90 | else: 91 | models_number = 0 92 | models_path = pathdir + "/model-" + self._name 93 | models_ext = ".ckpt" 94 | return models_path + str(models_number) + models_ext 95 | 96 | 97 | def get_trailing_number(s): 98 | m = re.search(r'\d+$', s) 99 | return int(m.group()) if m else None 100 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftGapToMagTest.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import slim 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 6 | 7 | __author__ = 'Andres' 8 | 9 | tf.reset_default_graph() 10 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 11 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 12 | 13 | window_size = 5120 14 | gap_length = 1024 15 | batch_size = 256 16 | 17 | fft_frame_length = 512 18 | fft_frame_step = 128 19 | 20 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="Target Model") 21 | 22 | with tf.name_scope('Remove_unnecesary_sides_before_stft'): 23 | signal = aTargetModel.output() 24 | signal_without_unnecesary_sides = signal[:, 1664:3456] 25 | aTargetModel.setOutputTo(signal_without_unnecesary_sides) 26 | aTargetModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 27 | aTargetModel.addAbs() # (256, 11, 257) 28 | 29 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 30 | 31 | with tf.name_scope('Remove_gap_before_stft'): 32 | signal = aModel.output() 33 | left_side = signal[:, :2048] 34 | right_side = signal[:, 2048+1024:] 35 | 36 | # This is strange. The window is 5K samples long, the hole 1024 and the 0 pading 384. 37 | # Unless signal in in spectrogram. In that case, the code is not very clear. Maybe consider adding comments. 
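# To make the numbers above concrete: the gap runs from (5120 - 1024) // 2 = 2048
# to 3072, so each context side is 2048 samples long. The zero padding of 384
# samples is fft_frame_length - fft_frame_step = 512 - 128, the overhang of an
# STFT frame beyond its hop; padding each side by this amount keeps the edge
# frames aligned with the frames of the gap. The same number explains the
# signal[:, 1664:3456] slice in the target model: 2048 - 384 = 1664 and
# 3072 + 384 = 3456, a 1792-sample window that yields (1792 - 512) // 128 + 1 = 11
# target frames of 512 // 2 + 1 = 257 bins, while each padded 2432-sample side
# yields (2432 - 512) // 128 + 1 = 16 frames.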
38 | left_side_padded = tf.concat((left_side, tf.zeros((batch_size, 384))), axis=1) 39 | right_side_padded = tf.concat((tf.zeros((batch_size, 384)), right_side), axis=1) 40 | 41 | # If you pad them with 0, maybe you also stack them allong axis 2 (one after the other.) 42 | signal_without_gap = tf.stack((left_side_padded, right_side_padded), axis=1) # (256, 2, 2432) 43 | aModel.setOutputTo(signal_without_gap) 44 | 45 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) # (256, 2, 16, 257) 46 | aModel.addReshape((batch_size, 32, 257)) 47 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 48 | aModel.addReshape((batch_size, 16, 257, 4)) 49 | 50 | with tf.variable_scope("Encoder"): 51 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 52 | input_channels = [4, 32, 64, 128, 128] 53 | output_channels = [32, 64, 128, 128, 200] 54 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 55 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 56 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 57 | output_channels=output_channels, strides=strides, names=names) 58 | 59 | aModel.addReshape((batch_size, 3200)) 60 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 61 | aModel.addRelu() 62 | aModel.addBatchNormalization() 63 | aModel.addReshape((batch_size, 8, 8, 32)) 64 | 65 | with tf.variable_scope("Decoder"): 66 | filter_shapes = [(5, 5), (3, 3)] 67 | input_channels = [32, 64] 68 | output_channels = [64, 257] 69 | strides = [[1, 2, 2, 1]] * len(input_channels) 70 | names = ['First_Deconv', 'Second_Deconv'] 71 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 72 | output_channels=output_channels, strides=strides, names=names) 73 | 74 | aModel.addReshape((batch_size, 8, 257, 128)) 75 | aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 76 | name='Third_deconv') 77 | aModel.addBatchNormalization() 78 | 79 | aModel.addReshape((batch_size, 11, 257, 32)) 80 | 81 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=1, 82 | stride=(1, 1, 1, 1), name="Last_Deconv") 83 | 84 | print(aModel.description()) 85 | 86 | model_vars = tf.trainable_variables() 87 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 88 | 89 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=window_size, 90 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_mag_1_') 91 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 92 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftThird.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.signal.python.ops import window_ops 5 | 6 | from network.tfGraph import TFGraph 7 | from utils.legacy.contextEncoder import ContextEncoderNetwork 8 | 9 | __author__ = 'Andres' 10 | 11 | tf.reset_default_graph() 12 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 13 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 14 | 15 | window_size = 5120 16 | gap_length = 1024 17 | batch_size = 256 18 | 19 | aModel = TFGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 20 | 21 
| dataset = aModel.output() 22 | signal_length = window_size - gap_length 23 | first_half = dataset[:, :signal_length // 2] 24 | second_half = dataset[:, signal_length // 2:] 25 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 26 | 27 | with tf.name_scope('Energy_Spectogram'): 28 | fft_frame_length = 512 29 | fft_frame_step = 128 30 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 31 | 32 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step, 33 | window_fn=window_fn) 34 | real_stft = tf.real(stft) 35 | imag_stft = tf.imag(stft) 36 | real_stft_left = real_stft[:, 0, :, :] 37 | real_stft_right = real_stft[:, 1, :, :] 38 | 39 | imag_stft_left = imag_stft[:, 0, :, :] 40 | imag_stft_right = imag_stft[:, 1, :, :] 41 | 42 | real_stft = tf.concat([real_stft_left, real_stft_right], 1) 43 | imag_stft = tf.concat([imag_stft_left, imag_stft_right], 1) 44 | print(real_stft) 45 | 46 | stacked = tf.stack([real_stft, imag_stft], axis=3) 47 | aModel.setOutputTo(stacked) 48 | 49 | with tf.variable_scope("Encoder"): 50 | filter_widths = [(7, 89), (4, 43), (2, 11), (2, 5), (2, 3)] 51 | input_channels = [2, 32, 128, 512, 128] 52 | output_channels = [32, 128, 512, 128, 64] 53 | strides = [[1, 1, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1]] 54 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 55 | aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 56 | output_channels=output_channels, strides=strides, names=names) 57 | print(aModel.output()) 58 | 59 | # aModel.addReshape((batch_size, 3072)) 60 | # aModel.addFullyConnectedLayer(3072, 2827, 'Fully') 61 | # aModel.addRelu() 62 | # aModel.addReshape((batch_size, 11, 257, 1)) 63 | 64 | with tf.variable_scope("Decoder"): 65 | filter_widths = [(1, 3), (2, 3), (3, 4)] 66 | input_channels = [64, 256, 512] 67 | output_channels = [256, 512, 257] 68 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1]] 69 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv'] 70 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 71 | output_channels=output_channels, strides=strides, names=names) 72 | aModel.addReshape((batch_size, 8, 257, 144)) 73 | 74 | aModel.addDeconvLayer(filter_shape=(2, 111), input_channels=144, output_channels=11, stride=(1, 2, 1, 1), 75 | name='first_deconv_after_reshape') 76 | aModel.addReshape((batch_size, 11, 257, 16)) 77 | 78 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(3, 11), input_channels=16, output_channels=2, 79 | stride=(1, 1, 1, 1), name="Last_Deconv") 80 | netOutput = aModel.output() 81 | complexOutput = tf.complex(netOutput[:, :, :, 0], netOutput[:, :, :, 1]) 82 | print(complexOutput) 83 | istft = tf.contrib.signal.inverse_stft(stfts=complexOutput, frame_length=fft_frame_length, frame_step=fft_frame_step, 84 | window_fn=tf.contrib.signal.inverse_stft_window_fn(fft_frame_step, 85 | forward_window_fn=window_fn)) 86 | padding = fft_frame_length-fft_frame_step 87 | unPaddedIstft = istft[:, padding:-padding] 88 | aModel.setOutputTo(unPaddedIstft) 89 | aModel.addReshape((batch_size, gap_length)) 90 | 91 | aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 92 | gap_length=gap_length, learning_rate=1e-4, name='nat_full_stft_3_') 93 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 94 | 
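# A minimal, self-contained sanity check of the trimming above (an editorial
# sketch in plain Python, not part of the original script): with frame_length
# 512 and frame_step 128, the inverse STFT of an 11-frame spectrogram spans
# (11 - 1) * 128 + 512 = 1792 samples, and stripping frame_length - frame_step
# = 384 samples from each side leaves exactly the 1024-sample gap. Note that
# tf.contrib.signal.inverse_stft_window_fn takes the frame step as its first
# positional argument, which is what this script passes.
frame_length, frame_step, num_frames, gap = 512, 128, 11, 1024
padding = frame_length - frame_step                           # 384
istft_length = (num_frames - 1) * frame_step + frame_length   # 1792
assert istft_length - 2 * padding == gap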
-------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftFifth.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.signal.python.ops import window_ops 5 | 6 | from network.emptyTFGraph import EmptyTfGraph 7 | from utils.legacy.contextEncoder import ContextEncoderNetwork 8 | 9 | __author__ = 'Andres' 10 | 11 | tf.reset_default_graph() 12 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 13 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 14 | 15 | window_size = 5120 16 | gap_length = 1024 17 | batch_size = 256 18 | 19 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size - gap_length), name="context encoder") 20 | 21 | dataset = aModel.output() 22 | signal_length = window_size - gap_length 23 | first_half = dataset[:, :signal_length // 2] 24 | second_half = dataset[:, signal_length // 2:] 25 | stacked_halfs = tf.stack([first_half, second_half], axis=1) 26 | 27 | with tf.name_scope('Energy_Spectogram'): 28 | fft_frame_length = 512 29 | fft_frame_step = 128 30 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 31 | 32 | stft = tf.contrib.signal.stft(signals=stacked_halfs, frame_length=fft_frame_length, frame_step=fft_frame_step, 33 | window_fn=window_fn) 34 | real_stft = tf.real(stft) 35 | imag_stft = tf.imag(stft) 36 | real_stft_left = real_stft[:, 0, :, :] 37 | real_stft_right = real_stft[:, 1, :, :] 38 | 39 | imag_stft_left = imag_stft[:, 0, :, :] 40 | imag_stft_right = imag_stft[:, 1, :, :] 41 | 42 | real_stft = tf.concat([real_stft_left, real_stft_right], 1) 43 | imag_stft = tf.concat([imag_stft_left, imag_stft_right], 1) 44 | print(real_stft) 45 | 46 | stacked = tf.stack([real_stft, imag_stft], axis=3) 47 | aModel.setOutputTo(stacked) 48 | 49 | with tf.variable_scope("Encoder"): 50 | filter_widths = [(7, 89), (4, 43), (2, 11), (2, 3), (2, 5)] 51 | input_channels = [2, 16, 32, 128, 128] 52 | output_channels = [16, 32, 128, 128, 64] 53 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 1, 1, 1]] 54 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 55 | aModel.addSeveralConvLayers(filter_shapes=filter_widths, input_channels=input_channels, 56 | output_channels=output_channels, strides=strides, names=names) 57 | print(aModel.output()) 58 | 59 | aModel.addReshape((batch_size, 2176)) 60 | aModel.addFullyConnectedLayer(2176, 1152, 'Fully') 61 | aModel.addRelu() 62 | aModel.addReshape((batch_size, 2, 9, 64)) 63 | 64 | with tf.variable_scope("Decoder"): 65 | filter_widths = [(1, 3), (2, 3), (2, 3)] 66 | input_channels = [64, 256, 64] 67 | output_channels = [256, 64, 257] 68 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1]] 69 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv'] 70 | aModel.addSeveralDeconvLayers(filter_shapes=filter_widths, input_channels=input_channels, 71 | output_channels=output_channels, strides=strides, names=names) 72 | aModel.addReshape((batch_size, 8, 257, 144)) 73 | 74 | aModel.addDeconvLayer(filter_shape=(2, 31), input_channels=144, output_channels=11, stride=(1, 2, 1, 1), 75 | name='first_deconv_after_reshape') 76 | aModel.addReshape((batch_size, 11, 257, 16)) 77 | 78 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(3, 129), input_channels=16, output_channels=2, 79 | stride=(1, 1, 1, 1), name="Last_Deconv") 80 | netOutput = aModel.output() 81 | complexOutput = 
tf.complex(netOutput[:, :, :, 0], netOutput[:, :, :, 1]) 82 | print(complexOutput) 83 | istft = tf.contrib.signal.inverse_stft(stfts=complexOutput, frame_length=fft_frame_length, frame_step=fft_frame_step, 84 | window_fn=tf.contrib.signal.inverse_stft_window_fn(fft_frame_step, 85 | forward_window_fn=window_fn)) 86 | padding = fft_frame_length-fft_frame_step 87 | unPaddedIstft = istft[:, padding:-padding] 88 | aModel.setOutputTo(unPaddedIstft) 89 | aModel.addReshape((batch_size, gap_length)) 90 | 91 | print(aModel.description()) 92 | aContextEncoderNetwork = ContextEncoderNetwork(model=aModel, batch_size=batch_size, window_size=window_size, 93 | gap_length=gap_length, learning_rate=1e-4, name='nat_full_stft_5_') 94 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 95 | -------------------------------------------------------------------------------- /system/preAndPostProcessor.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.signal.python.ops import window_ops 5 | 6 | __author__ = 'Andres' 7 | 8 | 9 | class PreAndPostProcessor(object): 10 | def __init__(self, signalLength, gapLength, fftWindowLength, fftHopSize): 11 | super(PreAndPostProcessor, self).__init__() 12 | self._signalLength = signalLength 13 | self._gapLength = gapLength 14 | self._fftWindowLength = fftWindowLength 15 | self._fftHopSize = fftHopSize 16 | 17 | def signalLength(self): 18 | return self._signalLength 19 | 20 | def gapLength(self): 21 | return self._gapLength 22 | 23 | def fftWindowLenght(self): 24 | return self._fftWindowLength 25 | 26 | def fftHopSize(self): 27 | return self._fftHopSize 28 | 29 | def padding(self): 30 | return self._fftWindowLength - self._fftHopSize 31 | 32 | def stftForGapOf(self, aBatchOfSignals): 33 | assert len(aBatchOfSignals.shape) == 2 34 | signalWithoutExtraSides = self._removeExtraSidesForSTFTOfGap(aBatchOfSignals) 35 | return self._realAndImagSTFT(signalWithoutExtraSides) 36 | 37 | def stftForTheContextOf(self, aBatchOfSignals): 38 | assert len(aBatchOfSignals.shape) == 2 39 | leftAndRightSideStacked = self._removeGap(aBatchOfSignals) 40 | leftAndRightSideStackedAndPadded = self._addPaddingForStftOfContext(leftAndRightSideStacked) 41 | 42 | realAndImagSTFTOfLeftSide = self._realAndImagSTFT(leftAndRightSideStackedAndPadded[:, 0]) 43 | realAndImagSTFTOfRightSide = self._realAndImagSTFT(leftAndRightSideStackedAndPadded[:, 1]) 44 | 45 | contextRealAndImagSTFT = tf.concat([realAndImagSTFTOfLeftSide, realAndImagSTFTOfRightSide], axis=-1) 46 | return contextRealAndImagSTFT 47 | 48 | def _realAndImagSTFT(self, aBatchOfSignals): 49 | stft = tf.contrib.signal.stft(signals=aBatchOfSignals, 50 | frame_length=self._fftWindowLength, frame_step=self._fftHopSize) 51 | return self._divideComplexIntoRealAndImag(stft) 52 | 53 | def inverseStftOfGap(self, batchOfStftOfGap): 54 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 55 | inverse_window = tf.contrib.signal.inverse_stft_window_fn(self._fftWindowLength, forward_window_fn=window_fn) 56 | padded_gaps = tf.contrib.signal.inverse_stft(stfts=batchOfStftOfGap, frame_length=self._fftWindowLength, 57 | frame_step=self._fftHopSize, window_fn=inverse_window) 58 | return padded_gaps[:, self.padding():-self.padding()] 59 | 60 | def inverseStftOfSignal(self, batchOfStftsOfSignal): 61 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 62 | inverse_window = 
tf.contrib.signal.inverse_stft_window_fn(self._fftWindowLength, forward_window_fn=window_fn) 63 | return tf.contrib.signal.inverse_stft(stfts=batchOfStftsOfSignal, frame_length=self._fftWindowLength, 64 | frame_step=self._fftHopSize, window_fn=inverse_window) 65 | 66 | def _gapBeginning(self): 67 | return (self._signalLength - self._gapLength) // 2 68 | 69 | def _gapEnding(self): 70 | return self._gapBeginning() + self._gapLength 71 | 72 | def _removeExtraSidesForSTFTOfGap(self, batchOfSignals): 73 | return batchOfSignals[:, self._gapBeginning() - self.padding(): self._gapEnding() + self.padding()] 74 | 75 | def _removeGap(self, batchOfSignals): 76 | leftSide = batchOfSignals[:, :self._gapBeginning()] 77 | rightSide = batchOfSignals[:, self._gapEnding():] 78 | return tf.stack((leftSide, rightSide), axis=1) 79 | 80 | def _addPaddingForStftOfContext(self, batchOfSides): 81 | """batchOfSides should contain the left side on the first dimension and the right side on the second""" 82 | batchSize = batchOfSides.shape.as_list()[0] 83 | leftSidePadded = tf.concat((batchOfSides[:, 0], tf.zeros((batchSize, self.padding()))), axis=1) 84 | rightSidePadded = tf.concat((tf.zeros((batchSize, self.padding())), batchOfSides[:, 1]), axis=1) 85 | return tf.stack((leftSidePadded, rightSidePadded), axis=1) 86 | 87 | def _divideComplexIntoRealAndImag(self, complexTensor): 88 | real_part = tf.real(complexTensor) 89 | imag_part = tf.imag(complexTensor) 90 | return tf.stack([real_part, imag_part], axis=-1, name='divideComplexIntoRealAndImag') 91 | -------------------------------------------------------------------------------- /system/magPreAndPostProcessor.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.signal.python.ops import window_ops 5 | 6 | __author__ = 'Andres' 7 | 8 | 9 | class MagPreAndPostProcessor(object): 10 | def __init__(self, signalLength, gapLength, fftWindowLength, fftHopSize): 11 | super(MagPreAndPostProcessor, self).__init__() 12 | self._signalLength = signalLength 13 | self._gapLength = gapLength 14 | self._fftWindowLength = fftWindowLength 15 | self._fftHopSize = fftHopSize 16 | 17 | def signalLength(self): 18 | return self._signalLength 19 | 20 | def gapLength(self): 21 | return self._gapLength 22 | 23 | def fftWindowLenght(self): 24 | return self._fftWindowLength 25 | 26 | def fftHopSize(self): 27 | return self._fftHopSize 28 | 29 | def padding(self): 30 | return self._fftWindowLength - self._fftHopSize 31 | 32 | def stftForGapOf(self, aBatchOfSignals): 33 | assert len(aBatchOfSignals.shape) == 2 34 | signalWithoutExtraSides = self._removeExtraSidesForSTFTOfGap(aBatchOfSignals) 35 | stft = tf.contrib.signal.stft(signals=signalWithoutExtraSides, 36 | frame_length=self._fftWindowLength, frame_step=self._fftHopSize) 37 | return tf.expand_dims(tf.abs(stft), axis=-1) 38 | 39 | def stftForTheContextOf(self, aBatchOfSignals): 40 | assert len(aBatchOfSignals.shape) == 2 41 | leftAndRightSideStacked = self._removeGap(aBatchOfSignals) 42 | leftAndRightSideStackedAndPadded = self._addPaddingForStftOfContext(leftAndRightSideStacked) 43 | 44 | realAndImagSTFTOfLeftSide = self._realAndImagSTFT(leftAndRightSideStackedAndPadded[:, 0]) 45 | realAndImagSTFTOfRightSide = self._realAndImagSTFT(leftAndRightSideStackedAndPadded[:, 1]) 46 | 47 | contextRealAndImagSTFT = tf.concat([realAndImagSTFTOfLeftSide, realAndImagSTFTOfRightSide], axis=-1) 48 | return contextRealAndImagSTFT 49 | 50 | def 
_realAndImagSTFT(self, aBatchOfSignals): 51 | stft = tf.contrib.signal.stft(signals=aBatchOfSignals, 52 | frame_length=self._fftWindowLength, frame_step=self._fftHopSize) 53 | return self._divideComplexIntoRealAndImag(stft) 54 | 55 | def inverseStftOfGap(self, batchOfStftOfGap): 56 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 57 | inverse_window = tf.contrib.signal.inverse_stft_window_fn(self._fftWindowLength, forward_window_fn=window_fn) 58 | padded_gaps = tf.contrib.signal.inverse_stft(stfts=batchOfStftOfGap, frame_length=self._fftWindowLength, 59 | frame_step=self._fftHopSize, window_fn=inverse_window) 60 | return padded_gaps[:, self.padding():-self.padding()] 61 | 62 | def inverseStftOfSignal(self, batchOfStftsOfSignal): 63 | window_fn = functools.partial(window_ops.hann_window, periodic=True) 64 | inverse_window = tf.contrib.signal.inverse_stft_window_fn(self._fftWindowLength, forward_window_fn=window_fn) 65 | return tf.contrib.signal.inverse_stft(stfts=batchOfStftsOfSignal, frame_length=self._fftWindowLength, 66 | frame_step=self._fftHopSize, window_fn=inverse_window) 67 | 68 | def _gapBeginning(self): 69 | return (self._signalLength - self._gapLength) // 2 70 | 71 | def _gapEnding(self): 72 | return self._gapBeginning() + self._gapLength 73 | 74 | def _removeExtraSidesForSTFTOfGap(self, batchOfSignals): 75 | return batchOfSignals[:, self._gapBeginning() - self.padding(): self._gapEnding() + self.padding()] 76 | 77 | def _removeGap(self, batchOfSignals): 78 | leftSide = batchOfSignals[:, :self._gapBeginning()] 79 | rightSide = batchOfSignals[:, self._gapEnding():] 80 | return tf.stack((leftSide, rightSide), axis=1) 81 | 82 | def _addPaddingForStftOfContext(self, batchOfSides): 83 | """batchOfSides should contain the left side on the first dimension and the right side on the second""" 84 | batchSize = batchOfSides.shape.as_list()[0] 85 | leftSidePadded = tf.concat((batchOfSides[:, 0], tf.zeros((batchSize, self.padding()))), axis=1) 86 | rightSidePadded = tf.concat((tf.zeros((batchSize, self.padding())), batchOfSides[:, 1]), axis=1) 87 | return tf.stack((leftSidePadded, rightSidePadded), axis=1) 88 | 89 | def _divideComplexIntoRealAndImag(self, complexTensor): 90 | real_part = tf.real(complexTensor) 91 | imag_part = tf.imag(complexTensor) 92 | return tf.stack([real_part, imag_part], axis=-1, name='divideComplexIntoRealAndImag') 93 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftGapOneOneTest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This trained for 85k steps (24hs) with a learning rate of 1e-3 and didn't learn anything. 
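For scale, 85k steps at a batch size of 256 comes to roughly 21.8 million training windows.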
3 | 4 | """ 5 | 6 | import tensorflow as tf 7 | from tensorflow.contrib import slim 8 | 9 | from network.emptyTFGraph import EmptyTfGraph 10 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 11 | 12 | __author__ = 'Andres' 13 | 14 | tf.reset_default_graph() 15 | train_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 16 | valid_filename = '../test_w5120_g1024_h512_ex63501.tfrecords' 17 | 18 | window_size = 5120 19 | gap_length = 1024 20 | batch_size = 256 21 | 22 | fft_frame_length = 512 23 | fft_frame_step = 128 24 | 25 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="Target Model") 26 | 27 | with tf.name_scope('Remove_unnecesary_sides_before_stft'): 28 | signal = aTargetModel.output() 29 | signal_without_unnecesary_sides = signal[:, 1664:3456] 30 | aTargetModel.setOutputTo(signal_without_unnecesary_sides) 31 | aTargetModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 32 | aTargetModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 11, 257, 2) 33 | 34 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 35 | 36 | with tf.name_scope('Remove_gap_before_stft'): 37 | signal = aModel.output() 38 | left_side = signal[:, :2048] 39 | right_side = signal[:, 2048+1024:] 40 | 41 | # This is strange. The window is 5K samples long, the hole 1024 and the 0 pading 384. 42 | # Unless signal in in spectrogram. In that case, the code is not very clear. Maybe consider adding comments. 43 | left_side_padded = tf.concat((left_side, tf.zeros((batch_size, 384))), axis=1) 44 | right_side_padded = tf.concat((tf.zeros((batch_size, 384)), right_side), axis=1) 45 | 46 | # If you pad them with 0, maybe you also stack them allong axis 2 (one after the other.) 47 | signal_without_gap = tf.stack((left_side_padded, right_side_padded), axis=1) # (256, 2, 2432) 48 | aModel.setOutputTo(signal_without_gap) 49 | 50 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) # (256, 2, 16, 257) 51 | aModel.addReshape((batch_size, 32, 257)) 52 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 53 | aModel.addReshape((batch_size, 16, 257, 4)) 54 | 55 | with tf.variable_scope("Encoder"): 56 | filter_shapes = [(7, 89), (3, 17), (2, 9), (1, 5), (2, 5), (2, 5)] 57 | input_channels = [4, 32, 128, 512, 256, 128] 58 | output_channels = [32, 128, 512, 256, 128, 256] 59 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 2, 2, 1], [1, 1, 2, 1], [1, 1, 2, 1], [1, 1, 2, 1]] 60 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv', 'Sixth_Conv'] 61 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 62 | output_channels=output_channels, strides=strides, names=names) 63 | 64 | aModel.addReshape((batch_size, 2560)) 65 | aModel.addFullyConnectedLayer(2560, 2048, 'Fully') 66 | aModel.addRelu() 67 | aModel.addBatchNormalization() 68 | aModel.addReshape((batch_size, 8, 8, 32)) 69 | 70 | with tf.variable_scope("Decoder"): 71 | filter_shapes = [(5, 5), (3, 3), (3, 3), (11, 11)] 72 | input_channels = [32, 128, 512, 128] 73 | output_channels = [128, 512, 128, 32] 74 | strides = [[1, 2, 2, 1]] * len(input_channels) 75 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv', 'Fourth_Deconv'] 76 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 77 | output_channels=output_channels, strides=strides, names=names) 78 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(13, 13), input_channels=32, 
output_channels=2, 79 | stride=(1, 1, 1, 1), name="Last_Deconv") 80 | 81 | aModel.addReshape((batch_size, 128, 2, 128)) 82 | aModel.addConvLayer(filter_shape=(1, 1), input_channels=128, output_channels=11, stride=(1, 1, 1, 1), 83 | name='first_1by1') 84 | aModel.addReshape((batch_size, 11, 2, 128)) 85 | aModel.addConvLayer(filter_shape=(1, 1), input_channels=128, output_channels=257, stride=(1, 1, 1, 1), 86 | name='second_1by1') 87 | aModel.addReshape((batch_size, 11, 257, 2)) 88 | 89 | print(aModel.description()) 90 | 91 | model_vars = tf.trainable_variables() 92 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 93 | 94 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=window_size, 95 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_1to1_1_') 96 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 97 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatStftGapBIGTest.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | 5 | from network.emptyTFGraph import EmptyTfGraph 6 | 7 | sys.path.insert(0, '../') 8 | import tensorflow as tf 9 | from tensorflow.contrib import slim 10 | import socket 11 | if 'omenx' in socket.gethostname(): 12 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 13 | 14 | 15 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 16 | 17 | __author__ = 'Andres' 18 | 19 | tf.reset_default_graph() 20 | train_filename = '/scratch/fma_small_train_w5120_g1024_h512.tfrecords' 21 | valid_filename = '/scratch/fma_small_valid_w5120_g1024_h512.tfrecords' 22 | 23 | window_size = 5120 24 | gap_length = 1024 25 | batch_size = 256 26 | 27 | fft_frame_length = 512 28 | fft_frame_step = 128 29 | 30 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="Target Model") 31 | 32 | with tf.name_scope('Remove_unnecesary_sides_before_stft'): 33 | signal = aTargetModel.output() 34 | signal_without_unnecesary_sides = signal[:, 1664:3456] 35 | aTargetModel.setOutputTo(signal_without_unnecesary_sides) 36 | aTargetModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 37 | aTargetModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 11, 257, 2) 38 | 39 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 40 | 41 | with tf.name_scope('Remove_gap_before_stft'): 42 | signal = aModel.output() 43 | left_side = signal[:, :2048] 44 | right_side = signal[:, 2048+1024:] 45 | 46 | # This is strange. The window is 5K samples long, the hole 1024 and the 0 padding 384. 47 | # Unless signal is in spectrogram. In that case, the code is not very clear. Maybe consider adding comments. 48 | left_side_padded = tf.concat((left_side, tf.zeros((batch_size, 384))), axis=1) 49 | right_side_padded = tf.concat((tf.zeros((batch_size, 384)), right_side), axis=1) 50 | 51 | # If you pad them with 0, maybe you also stack them along axis 2 (one after the other.) 
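# Shape bookkeeping for the steps below, spelled out from the inline shape
# comments: stacking the two padded 2432-sample sides gives (256, 2, 2432);
# the STFT maps each side to 16 frames of 257 bins, (256, 2, 16, 257); the
# reshape to (256, 32, 257) lays the two sides' frames end to end; splitting
# real and imaginary parts appends a channel axis, (256, 32, 257, 2); and the
# final reshape regroups that buffer as (256, 16, 257, 4) so the encoder
# receives a 4-channel input.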
52 | signal_without_gap = tf.stack((left_side_padded, right_side_padded), axis=1) # (256, 2, 2432) 53 | aModel.setOutputTo(signal_without_gap) 54 | 55 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) # (256, 2, 16, 257) 56 | aModel.addReshape((batch_size, 32, 257)) 57 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 58 | aModel.addReshape((batch_size, 16, 257, 4)) 59 | 60 | with tf.variable_scope("Encoder"): 61 | filter_shapes = [(7, 89), (3, 17), (2, 11), (1, 9), (1, 5), (2, 5)] 62 | input_channels = [4, 32, 128, 512, 256, 160] 63 | output_channels = [32, 128, 512, 256, 160, 128] 64 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1], [1, 1, 1, 1]] 65 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv', 'Sixth_Conv'] 66 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 67 | output_channels=output_channels, strides=strides, names=names) 68 | 69 | aModel.addReshape((batch_size, 2048)) 70 | aModel.addFullyConnectedLayer(2048, 2048, 'Fully') 71 | aModel.addRelu() 72 | aModel.addBatchNormalization() 73 | aModel.addReshape((batch_size, 8, 8, 32)) 74 | 75 | with tf.variable_scope("Decoder"): 76 | filter_shapes = [(8, 8), (5, 5), (3, 3)] 77 | input_channels = [32, 128, 512] 78 | output_channels = [128, 512, 257] 79 | strides = [[1, 2, 2, 1], [1, 2, 2, 1], [1, 1, 1, 1]] 80 | names = ['First_Deconv', 'Second_Deconv', 'Third_Deconv'] 81 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 82 | output_channels=output_channels, strides=strides, names=names) 83 | 84 | aModel.addReshape((batch_size, 8, 257, 128)) 85 | aModel.addDeconvLayer(filter_shape=(5, 67), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 86 | name='Fourth_deconv') 87 | aModel.addBatchNormalization() 88 | 89 | aModel.addReshape((batch_size, 11, 257, 32)) 90 | 91 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(11, 257), input_channels=32, output_channels=2, 92 | stride=(1, 1, 1, 1), name="Last_Deconv") 93 | 94 | print(aModel.description()) 95 | 96 | model_vars = tf.trainable_variables() 97 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 98 | 99 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=window_size, 100 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_big_1_') 101 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 102 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | 6 | sys.path.insert(0, '../') 7 | import tensorflow as tf 8 | from tensorflow.contrib import slim 9 | import socket 10 | if 'omenx' in socket.gethostname(): 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 14 | 15 | __author__ = 'Andres' 16 | 17 | tf.reset_default_graph() 18 | if 'omenx' in socket.gethostname(): 19 | train_filename = '/store/nati/datasets/Nsynth/train_w5120_g1024_h512.tfrecords' 20 | valid_filename = '/store/nati/datasets/Nsynth/valid_w5120_g1024_h512.tfrecords' 21 | else: 22 | train_filename = '/scratch/snx3000/nperraud/data/NSynth/train_w5120_g1024_h512.tfrecords' 23 | valid_filename = 
'/scratch/snx3000/nperraud/data/NSynth/valid_w5120_g1024_h512.tfrecords' 24 | 25 | window_size = 5120 26 | gap_length = 1024 27 | batch_size = 256 28 | 29 | fft_frame_length = 512 30 | fft_frame_step = 128 31 | 32 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="Target Model") 33 | 34 | with tf.name_scope('Remove_unnecesary_sides_before_stft'): 35 | signal = aTargetModel.output() 36 | signal_without_unnecesary_sides = signal[:, 1664:3456] 37 | aTargetModel.setOutputTo(signal_without_unnecesary_sides) 38 | aTargetModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 39 | aTargetModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 11, 257, 2) 40 | 41 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 42 | 43 | with tf.name_scope('Remove_gap_before_stft'): 44 | signal = aModel.output() 45 | left_side = signal[:, :2048] 46 | right_side = signal[:, 2048+1024:] 47 | 48 | # This is strange. The window is 5K samples long, the hole 1024 and the 0 padding 384. 49 | # Unless signal is in spectrogram. In that case, the code is not very clear. Maybe consider adding comments. 50 | left_side_padded = tf.concat((left_side, tf.zeros((batch_size, 384))), axis=1) 51 | right_side_padded = tf.concat((tf.zeros((batch_size, 384)), right_side), axis=1) 52 | 53 | # If you pad them with 0, maybe you also stack them along axis 2 (one after the other.) 54 | signal_without_gap = tf.stack((left_side_padded, right_side_padded), axis=1) # (256, 2, 2432) 55 | aModel.setOutputTo(signal_without_gap) 56 | 57 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) # (256, 2, 16, 257) 58 | aModel.addReshape((batch_size, 32, 257)) 59 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 60 | aModel.addReshape((batch_size, 16, 257, 4)) 61 | 62 | with tf.variable_scope("Encoder"): 63 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 64 | input_channels = [4, 32, 64, 128, 128] 65 | output_channels = [32, 64, 128, 128, 200] 66 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 67 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 68 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 69 | output_channels=output_channels, strides=strides, names=names) 70 | 71 | aModel.addReshape((batch_size, 3200)) 72 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 73 | aModel.addRelu() 74 | aModel.addBatchNormalization() 75 | aModel.addReshape((batch_size, 8, 8, 32)) 76 | 77 | with tf.variable_scope("Decoder"): 78 | filter_shapes = [(5, 5), (3, 3)] 79 | input_channels = [32, 64] 80 | output_channels = [64, 257] 81 | strides = [[1, 2, 2, 1]] * len(input_channels) 82 | names = ['First_Deconv', 'Second_Deconv'] 83 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 84 | output_channels=output_channels, strides=strides, names=names) 85 | 86 | aModel.addReshape((batch_size, 8, 257, 128)) 87 | aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 88 | name='Third_deconv') 89 | aModel.addBatchNormalization() 90 | 91 | aModel.addReshape((batch_size, 11, 257, 32)) 92 | 93 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=2, 94 | stride=(1, 1, 1, 1), name="Last_Deconv") 95 | 96 | print(aModel.description()) 97 | 98 | model_vars = tf.trainable_variables() 99 | 
slim.model_analyzer.analyze_vars(model_vars, print_info=True) 100 | 101 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=window_size, 102 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_1_') 103 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6, restore_num=None) 104 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatSkip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | 6 | sys.path.insert(0, '../') 7 | import tensorflow as tf 8 | from tensorflow.contrib import slim 9 | import socket 10 | if 'omenx' in socket.gethostname(): 11 | os.environ["CUDA_VISIBLE_DEVICES"]="" 12 | 13 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 14 | 15 | __author__ = 'Andres' 16 | 17 | tf.reset_default_graph() 18 | if 'omenx' in socket.gethostname(): 19 | train_filename = '/store/nati/datasets/Nsynth/train_w5120_g1024_h512.tfrecords' 20 | valid_filename = '/store/nati/datasets/Nsynth/valid_w5120_g1024_h512.tfrecords' 21 | else: 22 | train_filename = '/scratch/snx3000/nperraud/data/NSynth/train_w5120_g1024_h512.tfrecords' 23 | valid_filename = '/scratch/snx3000/nperraud/data/NSynth/valid_w5120_g1024_h512.tfrecords' 24 | 25 | window_size = 5120 26 | gap_length = 1024 27 | batch_size = 256 28 | 29 | fft_frame_length = 512 30 | fft_frame_step = 128 31 | 32 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="Target Model") 33 | 34 | with tf.name_scope('Remove_unnecesary_sides_before_stft'): 35 | signal = aTargetModel.output() 36 | signal_without_unnecesary_sides = signal[:, 1664:3456] 37 | aTargetModel.setOutputTo(signal_without_unnecesary_sides) 38 | aTargetModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 39 | aTargetModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 11, 257, 2) 40 | 41 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 42 | 43 | with tf.name_scope('Remove_gap_before_stft'): 44 | signal = aModel.output() 45 | left_side = signal[:, :2048] 46 | right_side = signal[:, 2048+1024:] 47 | 48 | # This is strange. The window is 5K samples long, the hole 1024 and the 0 padding 384. 49 | # Unless signal is in spectrogram. In that case, the code is not very clear. Maybe consider adding comments. 50 | left_side_padded = tf.concat((left_side, tf.zeros((batch_size, 384))), axis=1) 51 | right_side_padded = tf.concat((tf.zeros((batch_size, 384)), right_side), axis=1) 52 | 53 | # If you pad them with 0, maybe you also stack them along axis 2 (one after the other.) 
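# A hypothetical variant of what the note above suggests (not used by this
# script): laying the padded sides end to end along time instead of stacking
# them as separate channels,
#     sides_in_time = tf.concat((left_side_padded, right_side_padded), axis=1)  # (256, 4864)
# would yield a single STFT of (4864 - 512) // 128 + 1 = 35 frames per example,
# rather than the (256, 2, 16, 257) pair computed below.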
54 | signal_without_gap = tf.stack((left_side_padded, right_side_padded), axis=1) # (256, 2, 2432) 55 | aModel.setOutputTo(signal_without_gap) 56 | 57 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) # (256, 2, 16, 257) 58 | aModel.addReshape((batch_size, 32, 257)) 59 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 60 | aModel.addReshape((batch_size, 16, 257, 4)) 61 | 62 | with tf.variable_scope("Encoder"): 63 | filter_shapes = [(7, 89), (3, 17), (2, 6), (1, 5), (1, 3)] 64 | input_channels = [4, 32, 64, 128, 128] 65 | output_channels = [32, 64, 128, 128, 200] 66 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 2, 3, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 67 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Fourth_Conv', 'Fifth_Conv'] 68 | aModel.addSeveralConvLayersWithSkip(filter_shapes=filter_shapes, input_channels=input_channels, 69 | output_channels=output_channels, strides=strides, names=names) 70 | 71 | aModel.addReshape((batch_size, 3200)) 72 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 73 | aModel.addRelu() 74 | aModel.addBatchNormalization() 75 | aModel.addReshape((batch_size, 8, 8, 32)) 76 | 77 | with tf.variable_scope("Decoder"): 78 | filter_shapes = [(5, 5), (3, 3)] 79 | input_channels = [32, 64] 80 | output_channels = [64, 257] 81 | strides = [[1, 2, 2, 1]] * len(input_channels) 82 | names = ['First_Deconv', 'Second_Deconv'] 83 | aModel.addSeveralDeconvLayersWithSkip(filter_shapes=filter_shapes, input_channels=input_channels, 84 | output_channels=output_channels, strides=strides, names=names) 85 | 86 | aModel.addReshape((batch_size, 8, 257, 128)) 87 | aModel.addDeconvLayerWithSkip(filter_shape=(3, 33), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 88 | name='Third_deconv') 89 | aModel.addBatchNormalization() 90 | 91 | aModel.addReshape((batch_size, 11, 257, 32)) 92 | 93 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=2, 94 | stride=(1, 1, 1, 1), name="Last_Deconv") 95 | 96 | print(aModel.description()) 97 | 98 | model_vars = tf.trainable_variables() 99 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 100 | 101 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=window_size, 102 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_1_skip') 103 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 104 | -------------------------------------------------------------------------------- /utils/legacy/simulations/runNatBig.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from network.emptyTFGraph import EmptyTfGraph 5 | 6 | sys.path.insert(0, '../') 7 | import tensorflow as tf 8 | from tensorflow.contrib import slim 9 | import socket 10 | if 'omenx' in socket.gethostname(): 11 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 12 | 13 | from utils.legacy.stftGapContextEncoder import StftGapContextEncoder 14 | 15 | __author__ = 'Andres' 16 | 17 | tf.reset_default_graph() 18 | if 'omenx' in socket.gethostname(): 19 | train_filename = '/store/nati/datasets/Nsynth/train_w5120_g1024_h512.tfrecords' 20 | valid_filename = '/store/nati/datasets/Nsynth/valid_w5120_g1024_h512.tfrecords' 21 | else: 22 | train_filename = '/scratch/snx3000/nperraud/data/NSynth/train_w5120_g1024_h512.tfrecords' 23 | valid_filename = '/scratch/snx3000/nperraud/data/NSynth/valid_w5120_g1024_h512.tfrecords' 24 | 25 | 
window_size = 5120 26 | gap_length = 1024 27 | batch_size = 256 28 | 29 | fft_frame_length = 512 30 | fft_frame_step = 128 31 | 32 | aTargetModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="Target Model") 33 | 34 | with tf.name_scope('Remove_unnecesary_sides_before_stft'): 35 | signal = aTargetModel.output() 36 | signal_without_unnecesary_sides = signal[:, 1664:3456] 37 | aTargetModel.setOutputTo(signal_without_unnecesary_sides) 38 | aTargetModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) 39 | aTargetModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 11, 257, 2) 40 | 41 | aModel = EmptyTfGraph(shapeOfInput=(batch_size, window_size), name="context encoder") 42 | 43 | with tf.name_scope('Remove_gap_before_stft'): 44 | signal = aModel.output() 45 | left_side = signal[:, :2048] 46 | right_side = signal[:, 2048+1024:] 47 | 48 | # This is strange. The window is 5K samples long, the hole 1024 and the 0 padding 384. 49 | # Unless signal is in spectrogram. In that case, the code is not very clear. Maybe consider adding comments. 50 | left_side_padded = tf.concat((left_side, tf.zeros((batch_size, 384))), axis=1) 51 | right_side_padded = tf.concat((tf.zeros((batch_size, 384)), right_side), axis=1) 52 | 53 | # If you pad them with 0, maybe you also stack them along axis 2 (one after the other.) 54 | signal_without_gap = tf.stack((left_side_padded, right_side_padded), axis=1) # (256, 2, 2432) 55 | aModel.setOutputTo(signal_without_gap) 56 | 57 | aModel.addSTFT(frame_length=fft_frame_length, frame_step=fft_frame_step) # (256, 2, 16, 257) 58 | aModel.addReshape((batch_size, 32, 257)) 59 | aModel.divideComplexOutputIntoRealAndImaginaryParts() # (256, 32, 257, 2) 60 | aModel.addReshape((batch_size, 16, 257, 4)) 61 | 62 | with tf.variable_scope("Encoder"): 63 | filter_shapes = [(7, 89), (3, 17), (2, 6), (2, 6), (2, 5), (1, 5), (1, 3)] 64 | input_channels = [4, 32, 64, 128, 128, 128, 128] 65 | output_channels = [32, 64, 128, 128, 128, 128, 200] 66 | strides = [[1, 2, 2, 1], [1, 2, 3, 1], [1, 1, 1, 1], [1, 2, 3, 1], [1, 1, 1, 1], [1, 1, 2, 1], [1, 1, 1, 1]] 67 | names = ['First_Conv', 'Second_Conv', 'Third_Conv', 'Third_Conv_b', 'Fourth_Conv', 'Fourth_Conv_b', 'Fifth_Conv'] 68 | aModel.addSeveralConvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 69 | output_channels=output_channels, strides=strides, names=names) 70 | 71 | aModel.addReshape((batch_size, 3200)) 72 | aModel.addFullyConnectedLayer(3200, 2048, 'Fully') 73 | aModel.addRelu() 74 | aModel.addBatchNormalization() 75 | aModel.addReshape((batch_size, 8, 8, 32)) 76 | 77 | with tf.variable_scope("Decoder"): 78 | filter_shapes = [(5, 5), (5, 5), (5, 5)] 79 | input_channels = [32, 64, 64] 80 | output_channels = [64, 64, 257] 81 | strides = [[1, 2, 2, 1], [1, 1, 1, 1], [1, 2, 2, 1]] 82 | names = ['First_Deconv', 'First_Deconv_b', 'Second_Deconv'] 83 | aModel.addSeveralDeconvLayers(filter_shapes=filter_shapes, input_channels=input_channels, 84 | output_channels=output_channels, strides=strides, names=names) 85 | 86 | aModel.addReshape((batch_size, 8, 257, 128)) 87 | aModel.addDeconvLayer(filter_shape=(3, 33), input_channels=128, output_channels=11, stride=(1, 2, 2, 1), 88 | name='Third_deconv') 89 | aModel.addBatchNormalization() 90 | 91 | aModel.addReshape((batch_size, 11, 257, 32)) 92 | 93 | aModel.addDeconvLayerWithoutNonLin(filter_shape=(5, 89), input_channels=32, output_channels=2, 94 | stride=(1, 1, 1, 1), name="Last_Deconv") 95 | 96 | print(aModel.description()) 97 | 98 | model_vars = 
tf.trainable_variables() 99 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 100 | 101 | aContextEncoderNetwork = StftGapContextEncoder(model=aModel, batch_size=batch_size, target_model=aTargetModel, window_size=window_size, 102 | gap_length=gap_length, learning_rate=1e-3, name='nat_stft_gap_1_big') 103 | aContextEncoderNetwork.train(train_filename, valid_filename, num_steps=1e6) 104 | -------------------------------------------------------------------------------- /architecture/channelWiseContextEncoderArchitecture.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from architecture.architecture import Architecture 4 | from network.tfGraph import TFGraph 5 | 6 | __author__ = 'Andres' 7 | 8 | 9 | class ChannelWiseContextEncoderArchitecture(Architecture): 10 | def __init__(self, inputShape, encoderParams, decoderParams, fullyParams): 11 | with tf.variable_scope("ContextEncoderArchitecture"): 12 | self._inputShape = inputShape 13 | self._encoderParams = encoderParams 14 | self._decoderParams = decoderParams 15 | self._fullyParams = fullyParams 16 | super().__init__() 17 | 18 | def inputShape(self): 19 | return self._inputShape 20 | 21 | def _lossGraph(self): 22 | with tf.variable_scope("Loss"): 23 | targetSquaredNorm = tf.reduce_sum(tf.square(self._target), axis=[1, 2, 3]) 24 | 25 | error = self._target - self._output 26 | error_per_example = tf.reduce_sum(tf.square(error), axis=[1, 2, 3]) 27 | 28 | reconstruction_loss = 0.5 * tf.reduce_sum(error_per_example * (1 + 5 / (targetSquaredNorm+1e-4))) 29 | lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * 1e-2 30 | total_loss = tf.add_n([reconstruction_loss, lossL2]) 31 | 32 | total_loss_summary = tf.summary.scalar("total_loss", total_loss) 33 | l2_loss_summary = tf.summary.scalar("lossL2", lossL2) 34 | rec_loss_summary = tf.summary.scalar("reconstruction_loss", reconstruction_loss) 35 | self._lossSummaries = tf.summary.merge([rec_loss_summary, l2_loss_summary, total_loss_summary]) 36 | 37 | return total_loss 38 | 39 | def _network(self, data): 40 | encodedData = self._encode(data) 41 | connectedData = self._fullyConnect(encodedData) 42 | decodedData = self._decode(connectedData) 43 | return decodedData 44 | 45 | def _encode(self, data): 46 | with tf.variable_scope("Encoder"): 47 | encoder = TFGraph(data, self._isTraining, "Encoder") 48 | 49 | encoder.addSeveralConvLayers(filter_shapes=self._encoderParams.filterShapes(), 50 | input_channels=self._encoderParams.inputChannels(), 51 | output_channels=self._encoderParams.outputChannels(), 52 | strides=self._encoderParams.strides(), 53 | names=self._encoderParams.convNames()) 54 | return encoder.output() 55 | 56 | def _fullyConnect(self, data): 57 | with tf.variable_scope("Fully"): 58 | fullyConnected = TFGraph(data, self._isTraining, "Fully") 59 | fullyConnected.addChannelWiseFullyConnectedLayer('Fully') 60 | fullyConnected.addRelu() 61 | fullyConnected.addBatchNormalization() 62 | fullyConnected.addReshape(self._fullyParams.outputShape()) 63 | return fullyConnected.output() 64 | 65 | def _decode(self, data): 66 | with tf.variable_scope("Decoder"): 67 | decoder = TFGraph(data, self._isTraining, "Decoder") 68 | 69 | decoder.addSeveralDeconvLayers(filter_shapes=self._decoderParams.filterShapes()[0:-2], 70 | input_channels=self._decoderParams.inputChannels()[0:-2], 71 | output_channels=self._decoderParams.outputChannels()[0:-2], 72 | strides=self._decoderParams.strides()[0:-2], 73 | 
names=self._decoderParams.convNames()[0:-2]) 74 | 75 | currentShape = decoder.outputShape() 76 | constantForReshape = int(4 * currentShape[1] / currentShape[2]) 77 | decoder.addReshape((currentShape[0], int(currentShape[1] / constantForReshape), 78 | currentShape[3], currentShape[2] * constantForReshape)) 79 | 80 | decoder.addDeconvLayer(filter_shape=self._decoderParams.filterShapes()[-2], 81 | input_channels=currentShape[2] * constantForReshape, 82 | output_channels=self._decoderParams.outputChannels()[-2], 83 | stride=self._decoderParams.strides()[-2], 84 | name=self._decoderParams.convNames()[-2]) 85 | decoder.addBatchNormalization() 86 | 87 | currentShape = decoder.outputShape() 88 | constantForReshape = int(self._decoderParams.strides()[-2][2]) 89 | 90 | decoder.addReshape((currentShape[0], currentShape[3], 91 | int(currentShape[2] / constantForReshape), 92 | currentShape[1] * constantForReshape)) 93 | 94 | decoder.addDeconvLayerWithoutNonLin(filter_shape=self._decoderParams.filterShapes()[-1], 95 | input_channels=currentShape[1] * constantForReshape, 96 | output_channels=self._decoderParams.outputChannels()[-1], 97 | stride=self._decoderParams.strides()[-1], 98 | name=self._decoderParams.convNames()[-1]) 99 | return decoder.output() 100 | -------------------------------------------------------------------------------- /architecture/contextEncoderArchitecture.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from architecture.architecture import Architecture 4 | from network.tfGraph import TFGraph 5 | 6 | __author__ = 'Andres' 7 | 8 | 9 | class ContextEncoderArchitecture(Architecture): 10 | def __init__(self, inputShape, encoderParams, decoderParams, fullyParams): 11 | with tf.variable_scope("ContextEncoderArchitecture"): 12 | self._inputShape = inputShape 13 | self._encoderParams = encoderParams 14 | self._decoderParams = decoderParams 15 | self._fullyParams = fullyParams 16 | super().__init__() 17 | 18 | def inputShape(self): 19 | return self._inputShape 20 | 21 | def _lossGraph(self): 22 | with tf.variable_scope("Loss"): 23 | targetSquaredNorm = tf.reduce_sum(tf.square(self._target), axis=[1, 2, 3]) 24 | 25 | error = self._target - self._output 26 | error_per_example = tf.reduce_sum(tf.square(error), axis=[1, 2, 3]) 27 | 28 | reconstruction_loss = 0.5 * tf.reduce_sum(error_per_example * (1 + 5 / (targetSquaredNorm+1e-4))) 29 | lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * 1e-2 30 | total_loss = tf.add_n([reconstruction_loss, lossL2]) 31 | 32 | total_loss_summary = tf.summary.scalar("total_loss", total_loss) 33 | l2_loss_summary = tf.summary.scalar("lossL2", lossL2) 34 | rec_loss_summary = tf.summary.scalar("reconstruction_loss", reconstruction_loss) 35 | self._lossSummaries = tf.summary.merge([rec_loss_summary, l2_loss_summary, total_loss_summary]) 36 | 37 | return total_loss 38 | 39 | def _network(self, data): 40 | encodedData = self._encode(data) 41 | connectedData = self._fullyConnect(encodedData) 42 | decodedData = self._decode(connectedData) 43 | return decodedData 44 | 45 | def _encode(self, data): 46 | with tf.variable_scope("Encoder"): 47 | encoder = TFGraph(data, self._isTraining, "Encoder") 48 | 49 | encoder.addSeveralConvLayers(filter_shapes=self._encoderParams.filterShapes(), 50 | input_channels=self._encoderParams.inputChannels(), 51 | output_channels=self._encoderParams.outputChannels(), 52 | strides=self._encoderParams.strides(), 53 | 
names=self._encoderParams.convNames()) 54 | return encoder.output() 55 | 56 | def _fullyConnect(self, data): 57 | with tf.variable_scope("Fully"): 58 | fullyConnected = TFGraph(data, self._isTraining, "Fully") 59 | 60 | fullyConnected.addReshape((self._fullyParams.batchSize(), self._fullyParams.inputChannels())) 61 | fullyConnected.addFullyConnectedLayer(self._fullyParams.inputChannels(), 62 | self._fullyParams.outputChannels(), 63 | 'Fully') 64 | fullyConnected.addRelu() 65 | fullyConnected.addBatchNormalization() 66 | fullyConnected.addReshape(self._fullyParams.outputShape()) 67 | return fullyConnected.output() 68 | 69 | def _decode(self, data): 70 | with tf.variable_scope("Decoder"): 71 | decoder = TFGraph(data, self._isTraining, "Decoder") 72 | 73 | decoder.addSeveralDeconvLayers(filter_shapes=self._decoderParams.filterShapes()[0:-2], 74 | input_channels=self._decoderParams.inputChannels()[0:-2], 75 | output_channels=self._decoderParams.outputChannels()[0:-2], 76 | strides=self._decoderParams.strides()[0:-2], 77 | names=self._decoderParams.convNames()[0:-2]) 78 | 79 | currentShape = decoder.outputShape() 80 | constantForReshape = int(4 * currentShape[1] / currentShape[2]) 81 | decoder.addReshape((currentShape[0], int(currentShape[1] / constantForReshape), 82 | currentShape[3], currentShape[2] * constantForReshape)) 83 | 84 | decoder.addDeconvLayer(filter_shape=self._decoderParams.filterShapes()[-2], 85 | input_channels=currentShape[2] * constantForReshape, 86 | output_channels=self._decoderParams.outputChannels()[-2], 87 | stride=self._decoderParams.strides()[-2], 88 | name=self._decoderParams.convNames()[-2]) 89 | decoder.addBatchNormalization() 90 | 91 | currentShape = decoder.outputShape() 92 | constantForReshape = int(self._decoderParams.strides()[-2][2]) 93 | 94 | decoder.addReshape((currentShape[0], currentShape[3], 95 | int(currentShape[2] / constantForReshape), 96 | currentShape[1] * constantForReshape)) 97 | 98 | decoder.addDeconvLayerWithoutNonLin(filter_shape=self._decoderParams.filterShapes()[-1], 99 | input_channels=currentShape[1] * constantForReshape, 100 | output_channels=self._decoderParams.outputChannels()[-1], 101 | stride=self._decoderParams.strides()[-1], 102 | name=self._decoderParams.convNames()[-1]) 103 | return decoder.output() 104 | -------------------------------------------------------------------------------- /utils/legacy/stftGapContextEncoder.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from utils.legacy.contextEncoder import ContextEncoderNetwork 7 | from utils.strechableNumpyArray import StrechableNumpyArray 8 | 9 | __author__ = 'Andres' 10 | 11 | 12 | class StftGapContextEncoder(ContextEncoderNetwork): 13 | def __init__(self, model, batch_size, target_model, window_size, gap_length, learning_rate, name): 14 | self._target_model = target_model 15 | super(StftGapContextEncoder, self).__init__(model, batch_size, window_size, gap_length, learning_rate, 16 | name) 17 | self._sides = tf.placeholder(tf.float32, shape=(batch_size, self._window_size - self._gap_length), name='sides') 18 | self._reconstructedSignal = self._reconstructSignal(self._sides, self.gap_data) 19 | 20 | def trainSNR(self): 21 | return tf.reduce_mean(self._pavlovs_SNR(self._target_model.output(), self._reconstructed_input_data, 22 | onAxis=[1, 2, 3])) 23 | 24 | def _reconstructSignal(self, sides, gaps): 25 | signal_length = self._window_size - self._gap_length 26 | 
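# 'sides' concatenates the left and right context samples (window minus gap),
# so splitting it in half recovers each context before the gap is spliced back in.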
first_half = sides[:, :signal_length // 2] 27 | second_half = sides[:, signal_length // 2:] 28 | 29 | reconstructed_signal = tf.concat([first_half, gaps, second_half], axis=1) 30 | return reconstructed_signal 31 | 32 | def _loss_graph(self): 33 | with tf.variable_scope("Loss"): 34 | gap_stft = self._target_model.output() 35 | 36 | norm_orig = self._squaredEuclideanNorm(gap_stft, onAxis=[1, 2, 3]) 37 | norm_orig_summary = tf.summary.scalar("norm_orig", tf.reduce_min(norm_orig)) 38 | 39 | error = gap_stft - self._reconstructed_input_data 40 | # Nati comment: here you should use only one reduce sum function 41 | error_per_example = tf.reduce_sum(tf.square(error), axis=[1, 2, 3]) 42 | 43 | reconstruction_loss = 0.5 * tf.reduce_sum(error_per_example * (1 + 5 / (norm_orig+1e-2))) 44 | 45 | rec_loss_summary = tf.summary.scalar("reconstruction_loss", reconstruction_loss) 46 | 47 | trainable_vars = tf.trainable_variables() 48 | lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable_vars if 'bias' not in v.name]) * 1e-2 49 | l2_loss_summary = tf.summary.scalar("lossL2", lossL2) 50 | 51 | total_loss = tf.add_n([reconstruction_loss, lossL2]) 52 | total_loss_summary = tf.summary.scalar("total_loss", total_loss) 53 | 54 | self._lossSummaries = tf.summary.merge([rec_loss_summary, l2_loss_summary, norm_orig_summary, total_loss_summary]) 55 | 56 | return total_loss 57 | 58 | def reconstructAudio(self, audios, model_num=None, max_batchs=200): 59 | with tf.Session() as sess: 60 | if model_num is not None: 61 | path = self.modelsPath(model_num) 62 | else: 63 | path = self.modelsPath(self._initial_model_num) 64 | saver = tf.train.Saver() 65 | saver.restore(sess, path) 66 | print("Model restored.") 67 | 68 | batches_count = int(len(audios) / self._batch_size) 69 | 70 | reconstructed = StrechableNumpyArray() 71 | for batch_num in range(min(batches_count, max_batchs)): 72 | batch_data = audios[batch_num * self._batch_size:batch_num * self._batch_size + self._batch_size] 73 | feed_dict = {self._model.input(): batch_data, self._model.isTraining(): False} 74 | reconstructed_input = sess.run([self._reconstructed_input_data], 75 | feed_dict=feed_dict) 76 | reconstructed.append(np.reshape(reconstructed_input, (-1))) 77 | reconstructed = reconstructed.finalize() 78 | output_shape = self._target_model.output().shape.as_list() 79 | output_shape[0] = -1 80 | reconstructed_stft = np.reshape(reconstructed, output_shape) 81 | return reconstructed_stft 82 | 83 | def _reconstruct(self, sess, data_reader, max_steps): 84 | data_reader.start() 85 | reconstructed = StrechableNumpyArray() 86 | out_gaps = StrechableNumpyArray() 87 | for batch_num in range(max_steps): 88 | try: 89 | sides, gaps = data_reader.dataOperation(session=sess) 90 | except StopIteration: 91 | print(batch_num) 92 | print("rec End of queue!") 93 | break 94 | reconstructed_signal = sess.run(self._reconstructedSignal, 95 | feed_dict={self._sides: sides, self.gap_data: gaps}) 96 | gap_stft = self._target_model.output() 97 | 98 | feed_dict = {self._model.input(): reconstructed_signal, self._target_model.input(): reconstructed_signal, 99 | self._model.isTraining(): False} 100 | reconstructed_input, original = sess.run([self._reconstructed_input_data, gap_stft], feed_dict=feed_dict) 101 | out_gaps.append(np.reshape(original, (-1))) 102 | reconstructed.append(np.reshape(reconstructed_input, (-1))) 103 | 104 | output_shape = self._target_model.output().shape.as_list() 105 | output_shape[0] = -1 106 | reconstructed = reconstructed.finalize() 107 | reconstructed = 
np.reshape(reconstructed, output_shape) 108 | out_gaps = out_gaps.finalize() 109 | out_gaps = np.reshape(out_gaps, output_shape) 110 | 111 | data_reader.finish() 112 | 113 | return reconstructed, out_gaps 114 | 115 | def _evaluateValidSNR(self, summaries_dict, validReader, evalWriter, writer, sess, step): 116 | reconstructed, out_gaps = self._reconstruct(sess, validReader, max_steps=8) 117 | step_valid_SNR = evalWriter.evaluateImages(reconstructed, out_gaps, self._initial_model_num + step) 118 | validSNRSummaryToWrite = sess.run(summaries_dict['valid_SNR_summary'], 119 | feed_dict={summaries_dict['valid_SNR']: step_valid_SNR}) 120 | writer.add_summary(validSNRSummaryToWrite, self._initial_model_num + step) 121 | 122 | def _evaluatePlotSummary(self, plot_summary, gaps, feed_dict, writer, sess, step): 123 | pass 124 | 125 | def _trainingFeedDict(self, sides, gaps, sess): 126 | rec = sess.run(self._reconstructedSignal, feed_dict={self._sides: sides, self.gap_data: gaps}) 127 | return {self._model.input(): rec, self._target_model.input(): rec, self._model.isTraining(): True} 128 | 129 | 130 | def get_trailing_number(s): 131 | m = re.search(r'\d+$', s) 132 | return int(m.group()) if m else None 133 | -------------------------------------------------------------------------------- /utils/legacy/notebooks/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "scrolled": false 7 | }, 8 | "source": [ 9 | "# Context Encoder \n", 10 | "\n", 11 | "Let's begin by importing tensorflow and the network" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import tensorflow as tf\n", 21 | "from utils.legacy.contextEncoder import ContextEncoderNetwork" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Now we initialize the context encoder network and select the step we want to use for the reconstruction."
29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "tf.reset_default_graph()\n", 38 | "\n", 39 | "train_filename = 'train_full_w5120_g1024_h512_19404621.tfrecords'\n", 40 | "valid_filename = 'valid_full_w5120_g1024_h512_ex913967.tfrecords'\n", 41 | "\n", 42 | "aContextEncoderNetwork = ContextEncoderNetwork(batch_size=256, window_size=5120, gap_length=1024, \n", 43 | " learning_rate=1e-5, name='test')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "best_step = 506000 " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from utils.legacy.evaluationWriter import EvaluationWriter\n", "\n", "test_filename = 'test_full_w5120_g1024_h512_ex298385.tfrecords'\n", 62 | "reconstructed, out_gaps = aContextEncoderNetwork.reconstruct(test_filename, best_step, max_steps=248)\n", 63 | "\n", 64 | "evaluator = EvaluationWriter(aContextEncoderNetwork._name + str(best_step) + '_test.xlsx')\n", 65 | "evaluator.evaluate(reconstructed, out_gaps, best_step)\n", 66 | "evaluator.save()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import pylab\n", "import matplotlib.pyplot as plt\n", "from matplotlib.backends.backend_pdf import PdfPages\n", 76 | "pp = PdfPages(aContextEncoderNetwork._name + '_' + str(best_step) + '_test.pdf')\n", 77 | "\n", 78 | "pylab.rcParams['figure.figsize'] = (14, 28)\n", 79 | "f, axarr = plt.subplots(8, 2, sharey='row')\n", 80 | "\n", 81 | "stop_value = min(256, len(out_gaps)+1)\n", 82 | "for i in range(0, stop_value): \n", 83 | " if i != 0 and i % 8 == 0:\n", 84 | " pp.savefig()\n", 85 | " f, axarr = plt.subplots(8, 2, sharey='row')\n", 86 | " axarr[i%8, 0].plot(out_gaps[i%256])\n", 87 | " axarr[i%8, 1].plot(reconstructed[i%256]) \n", 88 | " \n", 89 | "pp.savefig()\n", 90 | "pp.close()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import numpy as np\n", "\n", "def _pavlovs_SNR(y_orig, y_inp):\n", 100 | " norm_y_orig = np.linalg.norm(y_orig) + 1e-10\n", 101 | " norm_y_orig_minus_y_inp = np.linalg.norm(y_orig - y_inp)\n", 102 | " return 10 * np.log10((abs(norm_y_orig ** 2)) / abs((norm_y_orig_minus_y_inp ** 2)))\n", 103 | "\n", 104 | "def _euclideanNorm(vector):\n", 105 | " squared = np.square(vector)\n", 106 | " summed = np.sum(squared, axis=1)\n", 107 | " return np.sqrt(summed + 1e-10)\n", 108 | " \n", 109 | "fake_a = (reconstructed - 0.5) * 2\n", 110 | "gap = (out_gaps - 0.5) * 2\n", 111 | "\n", 112 | "SNRs = np.zeros((len(fake_a),))\n", 113 | "for index, signal in enumerate(fake_a):\n", 114 | " SNRs[index] = _pavlovs_SNR(gap[index], fake_a[index])\n", 115 | "\n", 116 | "norm_orig = _euclideanNorm(gap)\n", 117 | "error = (gap - fake_a)\n", 118 | "reconstruction_loss = 0.5 * np.sum(np.square(error), axis=1) * (1 + 1 / norm_orig)\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "pylab.rcParams['figure.figsize'] = (14, 8)\n", 128 | "\n", 129 | "import scipy.stats as stats\n", 130 | "sorted_SNR = sorted(SNRs)\n", 131 | "\n", 132 | "fit = stats.norm.pdf(sorted_SNR, np.mean(sorted_SNR), np.std(sorted_SNR)) #this is a fitting indeed\n", 133 | "\n", 134 | "plt.plot(sorted_SNR,fit,'-o')\n", 135 | "\n", 136 | "plt.hist(sorted_SNR, 50, normed=True) \n" 137 | ] 138 | }, 139 | { 140 | "cell_type": 
"code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "plt.scatter(SNRs, reconstruction_loss)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "pylab.rcParams['figure.figsize'] = (14, 28)\n", 155 | "\n", 156 | "examples = np.where((SNRs<-10) & (reconstruction_loss<5))[0]\n", 157 | "\n", 158 | "f, axarr = plt.subplots(8, 2, sharey='row')\n", 159 | "\n", 160 | "for index, example in enumerate(examples): \n", 161 | " if index is not 0 and index%8 is 0:\n", 162 | " f, axarr = plt.subplots(8, 2, sharey='row')\n", 163 | " axarr[index%8, 0].plot(out_gaps[example])\n", 164 | " axarr[index%8, 1].plot(reconstructed[example]) \n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "examples = np.where((SNRs<1) & (SNRs>-1))[0][:256]\n", 174 | "\n", 175 | "f, axarr = plt.subplots(8, 2, sharey='row')\n", 176 | "\n", 177 | "for index, example in enumerate(examples): \n", 178 | " if index is not 0 and index%8 is 0:\n", 179 | " f, axarr = plt.subplots(8, 2, sharey='row')\n", 180 | " axarr[index%8, 0].plot(out_gaps[example])\n", 181 | " axarr[index%8, 1].plot(reconstructed[example]) \n", 182 | " " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "examples = np.where((SNRs>28))[0][:256]\n", 192 | "\n", 193 | "f, axarr = plt.subplots(8, 2, sharey='row')\n", 194 | "\n", 195 | "for index, example in enumerate(examples): \n", 196 | " if index%8 == 0:\n", 197 | " f, axarr = plt.subplots(8, 2, sharey='row')\n", 198 | " axarr[index%8, 0].plot(out_gaps[example])\n", 199 | " axarr[index%8, 1].plot(reconstructed[example]) \n", 200 | " " 201 | ] 202 | } 203 | ], 204 | "metadata": { 205 | "kernelspec": { 206 | "display_name": "Python 3", 207 | "language": "python", 208 | "name": "python3" 209 | }, 210 | "language_info": { 211 | "codemirror_mode": { 212 | "name": "ipython", 213 | "version": 3 214 | }, 215 | "file_extension": ".py", 216 | "mimetype": "text/x-python", 217 | "name": "python", 218 | "nbconvert_exporter": "python", 219 | "pygments_lexer": "ipython3", 220 | "version": "3.6.2" 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 2 225 | } 226 | -------------------------------------------------------------------------------- /SpecDivExperimentMag.m: -------------------------------------------------------------------------------- 1 | 2 | %% STFT parameters 3 | 4 | win = {'hann',512,'peak'}; 5 | dual = {'dual',win}; 6 | M = 512; a = M/4; 7 | flag = 'timeinv'; 8 | gamma = pghi_findgamma(win); 9 | 10 | %% Obtain data - THIS MUST BE UPDATED ONCE STUFF IS AVAILABLE! 
11 | 12 | 13 | % load('magnitude_trainedOnFma_step723261_8mslater.mat') 14 | % tfdata_amp = magnitudeMat; 15 | % clear magnitudeMat; 16 | % load('magnitude_trainedOnFma_step723261_8msbefore.mat') 17 | % t = linspace(0, pi/2, 7)'; 18 | % sqCos = permute(repmat(cos(t).^2, 1, 3328, 257), [2 1 3]); 19 | % tfdata_amp = [magnitudeMat(:, 3:4, :) (tfdata_amp(:, 1:end-4, :).*sqCos+fliplr(sqCos).*magnitudeMat(:, 5:end, :)) tfdata_amp(:, end-3:end-2, :)]; 20 | % clear magnitudeMat; 21 | % 22 | 23 | load('magnitude_trainedOnFma_step723261_8mslater.mat') 24 | later = magnitudeMat; 25 | clear magnitudeMat; 26 | load('magnitude_trainedOnFma_step723261_8msbefore.mat') 27 | before = magnitudeMat; 28 | clear magnitudeMat; 29 | load('magnitude_trainedOnFma_step723261.mat') 30 | central = magnitudeMat; 31 | clear magnitudeMat; 32 | 33 | tfdata_amp = [(before(:, 3:4, :)+central(:, 1:2, :))/2 (later(:, 1:end-4, :)+central(:, 3:end-2,:)+before(:, 5:end, :))/3 (central(:, end-1:end,:)+later(:, end-3:end-2, :))/2]; 34 | % tfdata_amp = central; 35 | 36 | load('FMA_test_windows_16k.mat'); 37 | alldata_ori = fma_test(1:length(tfdata_amp),5121:5120*2).'; 38 | clear fma_test; 39 | 40 | load('CE_FMAonly_step2547124.mat'); 41 | alldata_rim = CEMat(1:length(tfdata_amp),:).'; 42 | clear CEMat; 43 | 44 | load('fma_lpcrec_16k.mat') 45 | alldata_lpc = out(:,:).'; 46 | clear out; 47 | 48 | %num_data = 10; 49 | num_data = size(alldata_ori,2); 50 | num_methods = 10; 51 | 52 | L = 5120; 53 | 54 | num_tframes = 40; 55 | num_unknown = 11; 56 | 57 | %% Prepare arrays for results 58 | 59 | SpecDiv = zeros(num_data,num_methods); 60 | SNR = zeros(num_data,num_methods); 61 | 62 | mask = zeros(M/2+1,L/a); 63 | mask(:,[1:15,end-13:end]) = 1; 64 | 65 | known_idx = [1:15,num_tframes-13:num_tframes]; 66 | idx = 19:(num_tframes-17); 67 | idx = 20:(num_tframes-18); % for 48ms 68 | %idx = 1:40; 69 | 70 | known_tidx = [1:(14*a),(L-12*a)+1:L]; 71 | tidx = (16*a)+1:(L-16*a); 72 | %tidx = 1:L; 73 | 74 | %% Compute error measures 75 | 76 | for kk = 1:num_data 77 | % Load waveform data 78 | data_ori = alldata_ori(:,kk); 79 | data_ori_nomean = data_ori;% - mean(data_ori); 80 | data_lpc = alldata_lpc(:,kk); 81 | data_rim = alldata_rim(:,kk); 82 | 83 | c_ori = dgtreal(data_ori,win,a,M,L,flag); % DGT of original 84 | c_angle_ori = angle(c_ori); % Phase of original DGT 85 | 86 | % LPC 87 | % SNR(kk,1) = 20*log10(norm(data_ori_nomean(tidx))/norm(data_lpc(tidx)-data_ori(tidx))); 88 | % c_lpc = dgtreal(data_lpc,win,a,M,L,flag); 89 | % SpecDiv(kk,1) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_lpc(:,idx))); 90 | 91 | % Real and Imag 92 | % SNR(kk,2) = 20*log10(norm(data_ori_nomean(tidx))/norm(data_rim(tidx)-data_ori(tidx))); 93 | c_rim = dgtreal(data_rim,win,a,M,L,flag); % kept active: c_rim is needed by the masked_gla call below 94 | % SpecDiv(kk,2) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_rim(:,idx))); 95 | % % Real and Imag + Original Phase 96 | % c_rim_tp = abs(c_rim).*exp(1i*c_angle_ori); 97 | % f_rim_tp = idgtreal(c_rim_tp,dual,a,M,flag); 98 | % SNR(kk,3) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_rim_tp(tidx)-data_ori(tidx))); 99 | % c_rim_rec = dgtreal(f_rim_tp,win,a,M,L,flag); 100 | % SpecDiv(kk,3) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_rim_rec(:,idx))); 101 | % % Real and Imag + PGHI 102 | % c_rim_pghi = pghi(c_rim,pghi_findgamma(win),a,M,mask,flag); 103 | % f_rim_pghi = idgtreal(c_rim_pghi,dual,a,M,flag); 104 | % SNR(kk,4) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_rim_pghi(tidx)-data_ori(tidx))); 105 | % c_rim_pghi_rec = dgtreal(f_rim_pghi,win,a,M,L,flag); 106 | % SpecDiv(kk,4) = 
20*log10(1/magnitudeerr(c_ori(:,idx),c_rim_pghi_rec(:,idx))); 107 | % % Real and Imag + FGLIM 108 | c_rim_gla = masked_gla(c_rim,dual,a,M,mask,flag,'fgla','input'); 109 | f_rim_gla = idgtreal(c_rim_gla,dual,a,M,flag); 110 | SNR(kk,5) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_rim_gla(tidx)-data_ori(tidx))); 111 | c_rim_gla_rec = dgtreal(f_rim_gla,win,a,M,L,flag); 112 | SpecDiv(kk,5) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_rim_gla_rec(:,idx))); 113 | % % Real and Imag + PGHI + FGLIM 114 | % c_rim_pgla = gla(c_rim_pghi,dual,a,M,flag,'fgla','input'); 115 | % SpecDiv(kk,6) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_rim_pgla(:,idx))); 116 | % f_rim_pgla = idgtreal(c_rim_pgla,dual,a,M,flag); 117 | % SNR(kk,6) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_rim_pgla(tidx)-data_ori(tidx))); 118 | % c_rim_pgla_rec = dgtreal(f_rim_pgla,win,a,M,L,flag); 119 | % SpecDiv(kk,6) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_rim_pgla_rec(:,idx))); 120 | 121 | % Amplitude (original phase) 122 | c_amp = abs(c_ori); % Initialize magnitude 123 | c_amp(:,16:(num_tframes-14)) = squeeze(tfdata_amp(kk,:,:)).'; % Set inner part to proposed solution 124 | c_amp_tp = abs(c_amp).*exp(1i*c_angle_ori); 125 | f_amp_tp = idgtreal(c_amp_tp,dual,a,M,flag); 126 | SNR(kk,7) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_amp_tp(tidx)-data_ori(tidx))); 127 | c_amp_rec = dgtreal(f_amp_tp,win,a,M,L,flag); 128 | SpecDiv(kk,7) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_amp_rec(:,idx))); 129 | % Amplitude + PGHI 130 | kphase = (c_angle_ori.*mask);%+2*pi*rand(M/2+1,num_tframes).*(1-mask)); 131 | c_amp_kphase = c_amp.*exp(1i*kphase); 132 | c_amp_pghi = pghi(c_amp_kphase,gamma,a,M,mask,flag); 133 | f_amp_pghi = idgtreal(c_amp_pghi,dual,a,M,flag); 134 | SNR(kk,8) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_amp_pghi(tidx)-data_ori(tidx))); 135 | c_amp_pghi_rec = dgtreal(f_amp_pghi,win,a,M,L,flag); 136 | SpecDiv(kk,8) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_amp_pghi_rec(:,idx))); 137 | % Amplitude + FGLIM 138 | % c_amp_gla = masked_gla(c_amp_kphase,dual,a,M, mask,flag,'fgla','input'); 139 | % f_amp_gla = idgtreal(c_amp_gla,dual,a,M,flag); 140 | % SNR(kk,9) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_amp_gla(tidx)-data_ori(tidx))); 141 | % c_amp_gla_rec = dgtreal(f_amp_gla,win,a,M,L,flag); 142 | % SpecDiv(kk,9) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_amp_gla_rec(:,idx))); 143 | % Amplitude + PGHI + FGLIM 144 | c_amp_pgla = masked_gla(c_amp_pghi,dual,a,M, mask,flag,'fgla','input'); 145 | f_amp_pgla = idgtreal(c_amp_pgla,dual,a,M,flag); 146 | SNR(kk,10) = 20*log10(norm(data_ori_nomean(tidx))/norm(f_amp_pgla(tidx)-data_ori(tidx))); 147 | c_amp_pgla_rec = dgtreal(f_amp_pgla,win,a,M,L,flag); 148 | SpecDiv(kk,10) = 20*log10(1/magnitudeerr(c_ori(:,idx),c_amp_pgla_rec(:,idx))); 149 | 150 | if mod(kk,200) == 0 151 | fprintf('-Iteration %d-',kk); 152 | end 153 | end 154 | 155 | maxSNR = max(SNR); 156 | stdSNR = std(SNR); 157 | minSNR = min(SNR); 158 | meanSNR = mean(SNR); 159 | medianSNR = median(SNR); 160 | quant25SNR = quantile(SNR,0.25); 161 | quant75SNR = quantile(SNR,0.75); 162 | 163 | maxSpecDiv= max(SpecDiv); 164 | stdSpecDiv = std(SpecDiv); 165 | minSpecDiv = min(SpecDiv); 166 | meanSpecDiv = mean(SpecDiv); 167 | medianSpecDiv = median(SpecDiv); 168 | quant25SpecDiv = quantile(SpecDiv,0.25); 169 | quant75SpecDiv = quantile(SpecDiv,0.75); 170 | 171 | save('MethodComparison.mat','meanSNR','minSNR','quant25SNR','medianSNR',... 172 | 'quant75SNR','maxSNR','meanSpecDiv','minSpecDiv','quant25SpecDiv',... 
173 | 'medianSpecDiv','quant75SpecDiv','maxSpecDiv'); 174 | 175 | SNRstatsFMA = [meanSNR;stdSNR;minSNR;quant25SNR;medianSNR;quant75SNR;maxSNR]; 176 | SpecDivstatsFMA = [meanSpecDiv;stdSpecDiv;minSpecDiv;quant25SpecDiv;medianSpecDiv;quant75SpecDiv;maxSpecDiv]; 177 | 178 | save('StatsFMA.mat','SNRstatsFMA','SpecDivstatsFMA'); 179 | -------------------------------------------------------------------------------- /system/contextEncoderSystem.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from system.dnnSystem import DNNSystem 4 | from utils.colorize import colorize 5 | from utils.strechableNumpyArray import StrechableNumpyArray 6 | from utils.tfReader import TFReader 7 | 8 | __author__ = 'Andres' 9 | 10 | 11 | class ContextEncoderSystem(DNNSystem): 12 | def __init__(self, architecture, batchSize, aPreProcessor, name): 13 | self._windowSize = aPreProcessor.signalLength() 14 | self._batchSize = batchSize 15 | self._audio = tf.placeholder(tf.float32, shape=(batchSize, self._windowSize), name='audio_data') 16 | self._preProcessForGap = aPreProcessor.stftForGapOf(self._audio) 17 | self._preProcessForContext = aPreProcessor.stftForTheContextOf(self._audio) 18 | super().__init__(architecture, name) 19 | self._SNR = tf.reduce_mean(self._pavlovs_SNR(self._architecture.output(), self._architecture.target())) 20 | 21 | def optimizer(self, learningRate): 22 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 23 | with tf.control_dependencies(update_ops): 24 | return tf.train.AdamOptimizer(learning_rate=learningRate).minimize(self._architecture.loss()) 25 | 26 | def _feedDict(self, data, sess, isTraining=True): 27 | net_input, net_target = sess.run([self._preProcessForContext, self._preProcessForGap], feed_dict={self._audio: data}) 28 | return {self._architecture.input(): net_input, self._architecture.target(): net_target, 29 | self._architecture.isTraining(): isTraining} 30 | 31 | def reconstructAudio(self, aBatchOfSignals, model_num, max_steps=200): 32 | with tf.Session() as sess: 33 | path = self.modelsPath(model_num) 34 | saver = tf.train.Saver() 35 | saver.restore(sess, path) 36 | print("Model restored.") 37 | sess.run([tf.local_variables_initializer()]) 38 | reconstructed = StrechableNumpyArray() 39 | out_gaps = StrechableNumpyArray() 40 | input_shape = list(self._architecture.inputShape()) 41 | input_shape[0] = 0 42 | contexts = np.empty(input_shape) 43 | 44 | for batch_num in range(min(int(len(aBatchOfSignals)/self._batchSize), max_steps)): 45 | feed_dict = self._feedDict( 46 | aBatchOfSignals[batch_num * self._batchSize:(batch_num + 1) * self._batchSize], sess, False) 47 | reconstructed_input, original, context = sess.run( 48 | [self._architecture.output(), self._architecture.target(), 49 | self._architecture.input()], 50 | feed_dict=feed_dict) 51 | out_gaps.append(np.reshape(original, (-1))) 52 | reconstructed.append(np.reshape(reconstructed_input, (-1))) 53 | contexts = np.concatenate([contexts, context], axis=0) 54 | 55 | output_shape = self._architecture.output().shape.as_list() 56 | output_shape[0] = -1 57 | reconstructed = reconstructed.finalize() 58 | reconstructed = np.reshape(reconstructed, output_shape) 59 | out_gaps = out_gaps.finalize() 60 | out_gaps = np.reshape(out_gaps, output_shape) 61 | 62 | return reconstructed, out_gaps, contexts 63 | 64 | def reconstruct(self, data_path, model_num, max_steps=200): 65 | with tf.Session() as sess: 66 | reader = self._loadReader(data_path) 67 | path 
= self.modelsPath(model_num) 68 | saver = tf.train.Saver() 69 | saver.restore(sess, path) 70 | print("Model restored.") 71 | sess.run([tf.local_variables_initializer()]) 72 | return self._reconstruct(sess, reader, max_steps) 73 | 74 | def _reconstruct(self, sess, data_reader, max_steps): 75 | data_reader.start() 76 | reconstructed = StrechableNumpyArray() 77 | out_gaps = StrechableNumpyArray() 78 | input_shape = list(self._architecture.inputShape()) 79 | input_shape[0] = 0 80 | contexts = np.empty(input_shape) 81 | 82 | for batch_num in range(max_steps): 83 | try: 84 | audio = data_reader.dataOperation(session=sess) 85 | except StopIteration: 86 | print("rec End of queue!", batch_num) 87 | break 88 | 89 | feed_dict = self._feedDict(audio, sess, False) 90 | reconstructed_input, original, context = sess.run([self._architecture.output(), self._architecture.target(), 91 | self._architecture.input()], 92 | feed_dict=feed_dict) 93 | out_gaps.append(np.reshape(original, (-1))) 94 | reconstructed.append(np.reshape(reconstructed_input, (-1))) 95 | contexts = np.concatenate([contexts, context], axis=0) 96 | 97 | output_shape = self._architecture.output().shape.as_list() 98 | output_shape[0] = -1 99 | reconstructed = reconstructed.finalize() 100 | reconstructed = np.reshape(reconstructed, output_shape) 101 | out_gaps = out_gaps.finalize() 102 | out_gaps = np.reshape(out_gaps, output_shape) 103 | 104 | data_reader.finish() 105 | 106 | return reconstructed, out_gaps, contexts 107 | 108 | def _evaluate(self, summariesDict, feed_dict, validReader, sess): 109 | trainSNRSummaryToWrite = sess.run(summariesDict['train_SNR_summary'], feed_dict=feed_dict) 110 | 111 | try: 112 | audio = validReader.dataOperation(session=sess) 113 | except StopIteration: 114 | print("valid End of queue!") 115 | return [trainSNRSummaryToWrite] 116 | feed_dict = self._feedDict(audio, sess, False) 117 | validSNRSummary = sess.run(summariesDict['valid_SNR_summary'], feed_dict) 118 | imageSummary = sess.run(summariesDict['image_summaries'], feed_dict) 119 | 120 | return [trainSNRSummaryToWrite, validSNRSummary, imageSummary] 121 | 122 | def _loadReader(self, dataPath): 123 | return TFReader(dataPath, self._windowSize, batchSize=self._batchSize, capacity=int(2e5), num_epochs=400) 124 | 125 | def _evaluationSummaries(self): 126 | summaries_dict = {'train_SNR_summary': tf.summary.scalar("training_SNR", self._SNR), 127 | 'valid_SNR_summary': tf.summary.scalar("validation_SNR", self._SNR), 128 | 'image_summaries': self._spectrogramImageSummary()} 129 | return summaries_dict 130 | 131 | def _squaredEuclideanNorm(self, tensor, onAxis=[1, 2, 3]): 132 | squared = tf.square(tensor) 133 | summed = tf.reduce_sum(squared, axis=onAxis) 134 | return summed 135 | 136 | def _log10(self, tensor): 137 | numerator = tf.log(tensor) 138 | denominator = tf.log(tf.constant(10, dtype=numerator.dtype)) 139 | return numerator / denominator 140 | 141 | def _pavlovs_SNR(self, y_orig, y_inp, onAxis=[1, 2, 3]): 142 | norm_y_orig = self._squaredEuclideanNorm(y_orig, onAxis) 143 | norm_y_orig_minus_y_inp = self._squaredEuclideanNorm(y_orig - y_inp, onAxis) 144 | return 10 * self._log10(norm_y_orig / norm_y_orig_minus_y_inp) 145 | 146 | def _spectrogramImageSummary(self): 147 | complexOutput = self._architecture.output()[0] 148 | outputSpectrogram = tf.sqrt(tf.reduce_sum(tf.square(complexOutput), axis=-1)) 149 | 150 | complexTarget = self._architecture.target()[0] 151 | targetSpectrogram = tf.sqrt(tf.reduce_sum(tf.square(complexTarget), axis=-1)) 152 | 153 | 
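# The architecture input stacks the context STFT as four channels: real/imag of
# the left context in channels 0:2 and real/imag of the right context in 2:4.
# Each magnitude below is sqrt(Re^2 + Im^2) over the corresponding channel pair.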
complexLeft = self._architecture.input()[0, :, :, 0:2] 154 | leftSpectrogram = tf.sqrt(tf.reduce_sum(tf.square(complexLeft), axis=-1)) 155 | 156 | complexRight = self._architecture.input()[0, :, :, 2:4] 157 | rightSpectrogram = tf.sqrt(tf.reduce_sum(tf.square(complexRight), axis=-1)) 158 | 159 | totalSpectrogram = tf.transpose(tf.concat([leftSpectrogram, outputSpectrogram, 160 | rightSpectrogram], axis=0)) 161 | 162 | return tf.summary.merge([tf.summary.image("Original", [colorize(tf.transpose(targetSpectrogram))]), 163 | tf.summary.image("Generated", [colorize(tf.transpose(outputSpectrogram))]), 164 | tf.summary.image("Complete", [colorize(totalSpectrogram)])]) 165 | -------------------------------------------------------------------------------- /utils/test/ftest_stftForTheInpaintingSetting.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from system.preAndPostProcessor import PreAndPostProcessor 7 | 8 | __author__ = 'Andres' 9 | 10 | 11 | class TestStftForTheContextEncoder(TestCase): 12 | def setUp(self): 13 | self.signal_length = 5120 14 | self.gap_length = 1024 15 | self.fft_window_length = 512 16 | self.fft_hop_size = 128 17 | 18 | self.anStftForTheInpaintingSetting = PreAndPostProcessor(signalLength=self.signal_length, 19 | gapLength=self.gap_length, 20 | fftWindowLength=self.fft_window_length, 21 | fftHopSize=self.fft_hop_size) 22 | 23 | def test01TheStftTakesTheInpaintingParametersAsInput(self): 24 | self.assertEquals(self.anStftForTheInpaintingSetting.signalLength(), self.signal_length) 25 | self.assertEquals(self.anStftForTheInpaintingSetting.gapLength(), self.gap_length) 26 | self.assertEquals(self.anStftForTheInpaintingSetting.fftWindowLenght(), self.fft_window_length) 27 | self.assertEquals(self.anStftForTheInpaintingSetting.fftHopSize(), self.fft_hop_size) 28 | 29 | def test02TheStftKnowsHowMuchPaddingItShouldApply(self): 30 | self.assertEquals(self.anStftForTheInpaintingSetting.padding(), self.fft_window_length-self.fft_hop_size) 31 | 32 | fft_window_length = 1024 33 | fft_hop_size = 128 34 | anStftForTheInpaintingSetting = PreAndPostProcessor(signalLength=self.signal_length, 35 | gapLength=self.gap_length, 36 | fftWindowLength=fft_window_length, 37 | fftHopSize=fft_hop_size) 38 | self.assertEquals(anStftForTheInpaintingSetting.padding(), fft_window_length - fft_hop_size) 39 | 40 | fft_window_length = 1024 41 | fft_hop_size = 256 42 | anStftForTheInpaintingSetting = PreAndPostProcessor(signalLength=self.signal_length, 43 | gapLength=self.gap_length, 44 | fftWindowLength=fft_window_length, 45 | fftHopSize=fft_hop_size) 46 | self.assertEquals(anStftForTheInpaintingSetting.padding(), fft_window_length - fft_hop_size) 47 | 48 | def test03TheStftKnowsWhatSignalItShouldTakeForTheSTFTOfTheGap(self): 49 | fake_batch_of_signal = np.array([np.arange(self.signal_length)]) 50 | produced_signal = self.anStftForTheInpaintingSetting._removeExtraSidesForSTFTOfGap(fake_batch_of_signal) 51 | 52 | gap_begins = (self.signal_length-self.gap_length)//2 53 | gap_ends = gap_begins + self.gap_length 54 | padding = self.fft_window_length-self.fft_hop_size 55 | 56 | np.testing.assert_almost_equal(fake_batch_of_signal[:, gap_begins - padding:gap_ends + padding], produced_signal) 57 | 58 | fft_window_length = 128 59 | fft_hop_size = 32 60 | 61 | anStftForTheInpaintingSetting = PreAndPostProcessor(signalLength=self.signal_length, 62 | gapLength=self.gap_length, 63 | 
fftWindowLength=fft_window_length, 64 | fftHopSize=fft_hop_size) 65 | produced_signal = anStftForTheInpaintingSetting._removeExtraSidesForSTFTOfGap(fake_batch_of_signal) 66 | padding = fft_window_length - fft_hop_size 67 | np.testing.assert_almost_equal(fake_batch_of_signal[:, gap_begins - padding:gap_ends + padding], produced_signal) 68 | 69 | def test04TheStftProducesAnSTFTOfTheExpectedShapeForTheGap(self): 70 | batch_size = 32 71 | aBatchOfSignals = tf.placeholder(tf.float32, shape=(batch_size, self.signal_length), name='input_data') 72 | aStft = self.anStftForTheInpaintingSetting.stftForGapOf(aBatchOfSignals) 73 | 74 | framesOnGap = (((self.gap_length + self.anStftForTheInpaintingSetting.padding()*2)-self.fft_window_length)/ 75 | self.fft_hop_size)+1 76 | binsPerFrame = self.fft_window_length//2+1 77 | realAndImagChannels = 2 78 | self.assertEquals(aStft.shape.as_list(), [32, framesOnGap, binsPerFrame, realAndImagChannels]) 79 | 80 | def test05TheStftRemovesTheGapCorrectly(self): 81 | fake_batch_of_signal = np.array([np.arange(self.signal_length)]) 82 | produced_signal = self.anStftForTheInpaintingSetting._removeGap(fake_batch_of_signal) 83 | 84 | gap_begins = (self.signal_length-self.gap_length)//2 85 | gap_ends = gap_begins + self.gap_length 86 | 87 | left_side = fake_batch_of_signal[:, :gap_begins] 88 | right_side = fake_batch_of_signal[:, gap_ends:] 89 | signal_without_gap = tf.stack((left_side, right_side), axis=1) 90 | 91 | with tf.Session() as sess: 92 | produced_signal, signal_without_gap = sess.run([produced_signal, signal_without_gap]) 93 | 94 | np.testing.assert_almost_equal(signal_without_gap, produced_signal) 95 | 96 | def test06TheStftAddsTheCorrectPaddingToTheSides(self): 97 | side_length = (self.signal_length-self.gap_length)//2 98 | 99 | left_side = np.array([np.arange(side_length, dtype=np.float32)]) 100 | right_side = np.array([np.arange(side_length, dtype=np.float32)]) 101 | fake_batch_of_sides = tf.stack((left_side, right_side), axis=1) 102 | 103 | produced_signal = self.anStftForTheInpaintingSetting._addPaddingForStftOfContext(fake_batch_of_sides) 104 | 105 | with tf.Session() as sess: 106 | produced_signal = sess.run(produced_signal) 107 | 108 | left_side_padded = np.concatenate((left_side, np.zeros((1, self.fft_window_length-self.fft_hop_size))), axis=1) 109 | right_side_padded = np.concatenate((np.zeros((1, self.fft_window_length-self.fft_hop_size)), right_side), axis=1) 110 | new_signal = np.stack([left_side_padded, right_side_padded], axis=1) 111 | 112 | np.testing.assert_almost_equal(new_signal, produced_signal) 113 | 114 | def test07TheStftOfTheContextHasTheExpectedShape(self): 115 | batch_size = 32 116 | aBatchOfSignals = tf.placeholder(tf.float32, shape=(batch_size, self.signal_length), name='input_data') 117 | aStft = self.anStftForTheInpaintingSetting.stftForTheContextOf(aBatchOfSignals) 118 | 119 | side_length = (self.signal_length-self.gap_length)//2 120 | framesOnSides = ((side_length + self.anStftForTheInpaintingSetting.padding() - self.fft_window_length) 121 | / self.fft_hop_size)+1 122 | binsPerFrame = self.fft_window_length//2+1 123 | realAndImagChannels = 2 124 | beforeAndAfterChannels = 2 125 | 126 | self.assertEquals(aStft.shape.as_list(), [32, framesOnSides, binsPerFrame, 127 | realAndImagChannels*beforeAndAfterChannels]) 128 | 129 | def test08TheStftProducesTheCorrectShapeWhenDoingTheInverseStftOnTheGap(self): 130 | batch_size = 32 131 | framesOnGap = (((self.gap_length + 
self.anStftForTheInpaintingSetting.padding()*2)-self.fft_window_length)/ 132 | self.fft_hop_size)+1 133 | binsPerFrame = self.fft_window_length//2+1 134 | batchOfGapStft = tf.zeros((batch_size, framesOnGap, binsPerFrame), dtype=tf.complex64) 135 | 136 | batchOfGaps = self.anStftForTheInpaintingSetting.inverseStftOfGap(batchOfGapStft) 137 | 138 | with tf.Session() as sess: 139 | batchOfGaps = sess.run(batchOfGaps) 140 | 141 | self.assertEquals(batchOfGaps.shape, (batch_size, self.gap_length)) 142 | 143 | def test09TheStftProducesTheCorrectShapeWhenDoingTheInverseStftOnTheFullSignal(self): 144 | batch_size = 32 145 | frameCount = ((self.signal_length-self.fft_window_length)/self.fft_hop_size)+1 146 | binsPerFrame = self.fft_window_length//2+1 147 | batchOfSignalStft = tf.zeros((batch_size, frameCount, binsPerFrame), dtype=tf.complex64) 148 | 149 | batchOfSignals = self.anStftForTheInpaintingSetting.inverseStftOfSignal(batchOfSignalStft) 150 | 151 | with tf.Session() as sess: 152 | batchOfGaps = sess.run(batchOfSignals) 153 | 154 | self.assertEquals(batchOfGaps.shape, (batch_size, self.signal_length)) 155 | -------------------------------------------------------------------------------- /utils/legacy/stftRealImagContextEncoder.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from utils.legacy.contextEncoder import ContextEncoderNetwork 7 | from utils.legacy.evaluationWriter import EvaluationWriter 8 | from utils.legacy.plotSummary import PlotSummary 9 | from utils.strechableNumpyArray import StrechableNumpyArray 10 | from utils.tfReader import TFReader 11 | 12 | __author__ = 'Andres' 13 | 14 | 15 | class StftRealImagContextEncoder(ContextEncoderNetwork): 16 | def __init__(self, model, batch_size, stft, window_size, gap_length, learning_rate, name): 17 | self._stft = stft 18 | super(StftRealImagContextEncoder, self).__init__(model, batch_size, window_size, gap_length, learning_rate, 19 | name) 20 | self._sides = tf.placeholder(tf.float32, shape=(batch_size, self._window_size - self._gap_length), name='sides') 21 | self._reconstructedSignal = self._reconstructSignal(self._sides, self.gap_data) 22 | 23 | self._SNR = tf.reduce_mean(self._pavlovs_SNR(self._stft[:, 15:15 + 7, :, :], self._reconstructed_input_data, 24 | onAxis=[1, 2, 3])) 25 | 26 | def _reconstructSignal(self, sides, gaps): 27 | signal_length = self._window_size - self._gap_length 28 | first_half = sides[:, :signal_length // 2] 29 | second_half = sides[:, signal_length // 2:] 30 | 31 | reconstructed_signal = tf.concat([first_half, gaps, second_half], axis=1) 32 | return reconstructed_signal 33 | 34 | def _loss_graph(self): 35 | with tf.variable_scope("Loss"): 36 | gap_stft = self._stft[:, 15:15 + 7, :, :] 37 | 38 | norm_orig = self._squaredEuclideanNorm(gap_stft, onAxis=[1, 2, 3]) 39 | norm_orig_summary = tf.summary.scalar("norm_orig", tf.reduce_min(norm_orig)) 40 | 41 | error = gap_stft - self._reconstructed_input_data 42 | # Nati comment: here you should use only one reduce sum function 43 | error_per_example = tf.reduce_sum(tf.square(error), axis=[1, 2, 3]) 44 | 45 | reconstruction_loss = 0.5 * tf.reduce_sum(error_per_example * (1 + 5 / (norm_orig+1e-2))) 46 | 47 | rec_loss_summary = tf.summary.scalar("reconstruction_loss", reconstruction_loss) 48 | 49 | trainable_vars = tf.trainable_variables() 50 | lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable_vars if 'bias' not in v.name]) * 1e-2 51 | l2_loss_summary = 
tf.summary.scalar("lossL2", lossL2) 52 | 53 | total_loss = tf.add_n([reconstruction_loss, lossL2]) 54 | total_loss_summary = tf.summary.scalar("total_loss", total_loss) 55 | 56 | self._lossSummaries = tf.summary.merge([rec_loss_summary, l2_loss_summary, norm_orig_summary, total_loss_summary]) 57 | 58 | return total_loss 59 | 60 | def reconstructAudio(self, audios, model_num=None, max_batchs=200): 61 | with tf.Session() as sess: 62 | if model_num is not None: 63 | path = self.modelsPath(model_num) 64 | else: 65 | path = self.modelsPath(self._initial_model_num) 66 | saver = tf.train.Saver() 67 | saver.restore(sess, path) 68 | print("Model restored.") 69 | 70 | batches_count = int(len(audios) / self._batch_size) 71 | 72 | reconstructed = StrechableNumpyArray() 73 | original_stfts = StrechableNumpyArray() 74 | for batch_num in range(min(batches_count, max_batchs)): 75 | batch_data = audios[batch_num * self._batch_size:batch_num * self._batch_size + self._batch_size] 76 | feed_dict = {self._model.input(): batch_data, self._model.isTraining(): False} 77 | reconstructed_input, original_stft = sess.run([self._reconstructed_input_data, self._stft], 78 | feed_dict=feed_dict) 79 | original_stfts.append(np.reshape(original_stft, (-1))) 80 | reconstructed.append(np.reshape(reconstructed_input, (-1))) 81 | reconstructed = reconstructed.finalize() 82 | reconstructed_stft = np.reshape(reconstructed, (-1, 37, 257)) 83 | original_stfts = original_stfts.finalize() 84 | original_stft = np.reshape(original_stfts, (-1, 7, 257, 2)) 85 | 86 | return reconstructed_stft, original_stft 87 | 88 | def _reconstruct(self, sess, data_reader, max_steps): 89 | data_reader.start() 90 | reconstructed = StrechableNumpyArray() 91 | out_gaps = StrechableNumpyArray() 92 | for batch_num in range(max_steps): 93 | try: 94 | sides, gaps = data_reader.dataOperation(session=sess) 95 | except StopIteration: 96 | print(batch_num) 97 | print("rec End of queue!") 98 | break 99 | reconstructed_signal = sess.run(self._reconstructedSignal, 100 | feed_dict={self._sides: sides, self.gap_data: gaps}) 101 | gap_stft = self._stft[:, 15:15 + 7, :] 102 | 103 | feed_dict = {self._model.input(): reconstructed_signal, self._model.isTraining(): False} 104 | reconstructed_input, original = sess.run([self._reconstructed_input_data, gap_stft], feed_dict=feed_dict) 105 | out_gaps.append(np.reshape(original, (-1))) 106 | reconstructed.append(np.reshape(reconstructed_input, (-1))) 107 | 108 | reconstructed = reconstructed.finalize() 109 | reconstructed = np.reshape(reconstructed, (-1, 7, 257, 2)) 110 | out_gaps = out_gaps.finalize() 111 | out_gaps = np.reshape(out_gaps, (-1, 7, 257, 2)) 112 | 113 | data_reader.finish() 114 | 115 | return reconstructed, out_gaps 116 | 117 | def train(self, train_data_path, valid_data_path, num_steps=2e2, restore_num=None, 118 | per_process_gpu_memory_fraction=1): 119 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=per_process_gpu_memory_fraction) 120 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 121 | try: 122 | trainReader = TFReader(train_data_path, self._window_size, self._gap_length, capacity=int(2e5), 123 | num_epochs=400) 124 | validReader = TFReader(valid_data_path, self._window_size, self._gap_length, capacity=int(2e5), 125 | num_epochs=40000) 126 | 127 | saver = tf.train.Saver(max_to_keep=1000) 128 | if restore_num == 0: 129 | init = tf.global_variables_initializer() 130 | sess.run([init, tf.local_variables_initializer()]) 131 | print("Initialized") 132 | else: 133 | path = 
self.modelsPath(restore_num) 134 | self._initial_model_num = get_trailing_number(path[:-5]) 135 | print(self._initial_model_num) 136 | saver.restore(sess, path) 137 | sess.run([tf.local_variables_initializer()]) 138 | print("Model restored.") 139 | 140 | logs_path = '../logdir_real_cae/' + self._name # write each run to a diff folder. 141 | print("logs path:", logs_path) 142 | writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 143 | 144 | train_SNR_summary = tf.summary.scalar("training_SNR", self._SNR) 145 | valid_SNR = tf.placeholder(tf.float32, name="valid_SNR") 146 | valid_SNR_summary = tf.summary.scalar("validation_SNR", valid_SNR) 147 | plot_summary = PlotSummary('reconstruction') 148 | 149 | trainReader.start() 150 | evalWriter = EvaluationWriter(self._name + '.xlsx') 151 | 152 | # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 153 | # run_metadata = tf.RunMetadata() 154 | # many_runs_timeline = TimeLiner() 155 | 156 | for step in range(1, int(num_steps)): 157 | try: 158 | sides, gaps = trainReader.dataOperation(session=sess) 159 | except StopIteration: 160 | print(step) 161 | print("End of queue!") 162 | break 163 | 164 | rec = sess.run(self._reconstructedSignal, feed_dict={self._sides: sides, self.gap_data: gaps}) 165 | 166 | feed_dict = {self._model.input(): rec, self.gap_data: gaps, self._model.isTraining(): True} 167 | sess.run(self._optimizer, feed_dict=feed_dict) # , options=options, run_metadata=run_metadata) 168 | 169 | # fetched_timeline = timeline.Timeline(run_metadata.step_stats) 170 | # chrome_trace = fetched_timeline.generate_chrome_trace_format() 171 | # many_runs_timeline.update_timeline(chrome_trace) 172 | 173 | if step % 40 == 0: 174 | train_summ = sess.run(self._lossSummaries, feed_dict=feed_dict) 175 | writer.add_summary(train_summ, self._initial_model_num + step) 176 | if step % 2000 == 0: 177 | print(step) 178 | #reconstructed, out_gaps = self._reconstruct(sess, trainReader, max_steps=8) # WRONG 179 | # plot_summary.plotSideBySide(out_gaps, reconstructed) 180 | trainSNRSummaryToWrite = sess.run(train_SNR_summary, feed_dict=feed_dict) 181 | writer.add_summary(trainSNRSummaryToWrite, self._initial_model_num + step) 182 | #summaryToWrite = plot_summary.produceSummaryToWrite(sess) 183 | #writer.add_summary(summaryToWrite, self._initial_model_num + step) 184 | saver.save(sess, self.modelsPath(self._initial_model_num + step)) 185 | reconstructed, out_gaps = self._reconstruct(sess, validReader, max_steps=8) 186 | step_valid_SNR = evalWriter.evaluateImages(reconstructed, out_gaps, self._initial_model_num + step) 187 | validSNRSummaryToWrite = sess.run(valid_SNR_summary, feed_dict={valid_SNR: step_valid_SNR}) 188 | writer.add_summary(validSNRSummaryToWrite, self._initial_model_num + step) 189 | 190 | except KeyboardInterrupt: 191 | pass 192 | # many_runs_timeline.save('timeline_03_merged_%d_runs.json' % step) 193 | evalWriter.save() 194 | train_summ = sess.run(self._lossSummaries, feed_dict=feed_dict) 195 | writer.add_summary(train_summ, self._initial_model_num + step) 196 | saver.save(sess, self.modelsPath(self._initial_model_num + step)) 197 | self._initial_model_num += step 198 | 199 | trainReader.finish() 200 | print("Finalizing at step:", self._initial_model_num) 201 | print("Last saved model:", self.modelsPath(self._initial_model_num)) 202 | 203 | 204 | def get_trailing_number(s): 205 | m = re.search(r'\d+$', s) 206 | return int(m.group()) if m else None 207 | 
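# Example: get_trailing_number('saved_models/model-12500') returns 12500 (the path is
# illustrative); it is used above to recover the step count from a restored checkpoint path.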
--------------------------------------------------------------------------------