├── .gitignore ├── LICENSE ├── README.md ├── augment_data.py ├── choose_equal_split.py ├── concatenate_csvs.py ├── create_spectrograms.py ├── ensembling ├── ensemble.theano.py └── get_output_layers.py ├── get_score_from_probabilities.py ├── get_score_from_top3_prediction.py ├── get_sum_of_csvs.py ├── majority_vote_ensembling.py ├── make_submission.py ├── prototxt ├── augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024r-1024r_DLR_nolrcoef.prototxt ├── augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3.prototxt ├── deploy.augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3.prototxt ├── deploy.main_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.5-1024rd0.5_DLR.prototxt ├── main_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.5-1024rd0.5_DLR.prototxt ├── solver.augm.nolrcoef.prototxt └── solver.main.adadelta.prototxt ├── test_augm_network.py ├── test_main_network.py └── theano ├── README.md ├── main.py ├── networks ├── __init__.py ├── base_network.py ├── rnn.py ├── rnn_2layers.py ├── rnn_2layers_5khz.py ├── tc_net.py ├── tc_net_deeprnn_shared_pad.py ├── tc_net_mod.py ├── tc_net_mod_5khz_small.py ├── tc_net_rnn.py ├── tc_net_rnn_nodense.py ├── tc_net_rnn_onernn.py ├── tc_net_rnn_onernn_notimepool.py ├── tc_net_rnn_shared.py ├── tc_net_rnn_shared_pad.py └── tc_net_rnn_shared_pad_augm.py └── plot.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 YerevaNN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spoken language identification with deep learning 2 | 3 | Read more in the following blog posts: 4 | 5 | * [About TopCoder contest and our CNN-based solution implemented in Caffe](http://yerevann.github.io/2015/10/11/spoken-language-identification-with-deep-convolutional-networks/) (October 2015) 6 | * [About combining CNN and RNN using Theano/Lasagne](http://yerevann.github.io/2016/06/26/combining-cnn-and-rnn-for-spoken-language-identification/) (June 2016) 7 | 8 | Theano/Lasagne models are [here](/theano). The basic steps to run them are: 9 | 10 | * Download the dataset from [here](https://community.topcoder.com/longcontest/?module=ViewProblemStatement&rd=16555&pm=13978) or use your own dataset. 11 | * Create spectrograms for recording using `create_spectrograms.py` or `augment_data.py`. The latter will also augment the data by randomly perturbing the spectrograms and cropping a random interval of length 9s from the recording. 
12 | * Create listfiles for training set and validation set, where each row of a listfile describes one example and has 2 values separated by a comma. The first one is the name of the example, the second one is the label (counting starts from 0). A typical listfile will look like [this](https://gist.github.com/Harhro94/aa11fe6b454c614cdedea882fd00f8d7). 13 | * Change the `png_folder` and listfile paths in [`theano/main.py`](/theano/main.py). 14 | * Run `theano/main.py`. -------------------------------------------------------------------------------- /augment_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | import scipy.io.wavfile as wav 4 | from numpy.lib import stride_tricks 5 | import PIL.Image as Image 6 | import os 7 | 8 | """ short time fourier transform of audio signal """ 9 | def stft(sig, frameSize, overlapFac=0.5, window=np.hanning): 10 | win = window(frameSize) 11 | hopSize = int(frameSize - np.floor(overlapFac * frameSize)) 12 | 13 | # zeros at beginning (thus center of 1st window should be for sample nr. 
def logscale_spec(spec, sr=44100, factor=20., alpha=1.0, f0=0.9, fmax=1):
    """ Warp the frequency axis of a spectrogram with a piecewise-linear map.

    Despite the name, no logarithm is applied.  Normalised frequency x in
    [0, 1] is mapped to ``alpha * x`` for ``x <= f0`` and then linearly from
    ``alpha * f0`` up to ``fmax``.  Every source bin is split between the
    two target bins its warped position falls between.

    spec   -- 2-D complex array (time bins x frequency bins); only the first
              256 frequency bins are used.
    sr     -- sample rate in Hz, used only to compute the bin frequencies.
    factor -- unused; kept so existing callers keep working.
    alpha  -- warp factor; 1.0 leaves the spectrogram (numerically) unchanged.
    f0     -- normalised frequency at which the slope of the warp changes.
    fmax   -- normalised frequency the top of the axis is mapped to.

    Returns ``(newspec, freqs)``: the warped complex spectrogram with the
    same shape as the truncated input, and a list with the weighted mean
    source frequency (Hz) of every target bin.
    """
    spec = spec[:, 0:256]
    timebins, freqbins = np.shape(spec)
    scale = np.linspace(0, 1, freqbins)

    # http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=650310&url=http%3A%2F%2Fieeexplore.ieee.org%2Fiel4%2F89%2F14168%2F00650310
    # A list comprehension is used instead of map(): on Python 3 map() returns
    # an iterator and np.array(map(...)) would become a 0-d object array.
    scale = np.array([
        x * alpha if x <= f0
        else (fmax - alpha * f0) / (fmax - f0) * (x - f0) + alpha * f0
        for x in scale
    ])
    # Rescale so that warped positions are expressed in bin units.
    scale *= (freqbins - 1) / max(scale)

    newspec = np.complex128(np.zeros([timebins, freqbins]))
    allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. / sr)[:freqbins + 1])
    freqs = [0.0] * freqbins
    totw = [0.0] * freqbins

    for i in range(freqbins):
        if i < 1 or i + 1 >= freqbins:
            # The first and last bins are copied through unchanged.
            newspec[:, i] += spec[:, i]
            freqs[i] += allfreqs[i]
            totw[i] += 1.0
            continue

        # e.g. scale[i] = 17.2 -> 80% of bin i goes to bin 17, 20% to bin 18
        j = int(np.floor(scale[i]))
        w_up = scale[i] - j
        w_down = 1 - w_up

        newspec[:, j] += w_down * spec[:, i]
        freqs[j] += w_down * allfreqs[i]
        totw[j] += w_down

        newspec[:, j + 1] += w_up * spec[:, i]
        freqs[j + 1] += w_up * allfreqs[i]
        totw[j + 1] += w_up

    # Turn the accumulated frequency sums into weighted means; bins that
    # received (almost) no weight are left as they are.
    for i in range(freqbins):
        if totw[i] > 1e-6:
            freqs[i] /= totw[i]

    return newspec, freqs
"""split data into training and validation sets

Reads trainingData.csv (a header line followed by "<file>,<language>" rows),
assigns every language an integer ID (alphabetical order) and writes
"<file without its 4-character extension>,<ID>" rows: the first
TRAIN_PER_LANGUAGE examples of each language go to trainEqual.csv, the rest
go to valEqual.csv.
"""
import csv

# Number of examples per language that are written to the training set.
TRAIN_PER_LANGUAGE = 306

# Text mode: csv.reader needs str rows on Python 3 and works with it on
# Python 2 as well.
with open('trainingData.csv', 'r') as csvfile:
    next(csvfile) #skip headers
    # Blank rows (e.g. a trailing empty line) are dropped so that the
    # two-field unpacking below cannot fail on them.
    data = [row for row in csv.reader(csvfile, delimiter=',') if row]

#Map every language to an ID
langs = set(language.strip() for _, language in data)
ID = {lang: i for i, lang in enumerate(sorted(langs))}

#Write the first TRAIN_PER_LANGUAGE items of every language to the training
#set and the rest to the validation set
cnt = [0] * len(langs)
# The validation file must be called valEqual.csv: that is the name the
# ensembling scripts open ('../valEqual.csv').
with open('trainEqual.csv', 'w') as train, open('valEqual.csv', 'w') as val:
    for line in data:
        filepath, language = map(str.strip, line)
        id_lang = ID[language]

        target = train if cnt[id_lang] < TRAIN_PER_LANGUAGE else val
        target.write(filepath[:-4] + ',' + str(id_lang) + '\n')
        cnt[id_lang] += 1
def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="gray", channel=0, name='tmp.png', alpha=1, offset=0):
    """ Render the spectrogram of a WAV file as a grayscale PNG.

    audiopath -- path of the WAV file to read.
    binsize   -- STFT frame size in samples.
    plotpath, colormap, offset -- unused here; kept so existing callers
                 keep working.
    channel   -- channel to use when the recording has more than one.
    name      -- path of the PNG file to write.
    alpha     -- frequency warp factor passed to logscale_spec.

    The whole recording is rendered; no time interval is cropped.
    """
    samplerate, samples = wav.read(audiopath)
    # scipy returns a 1-D array for mono files and (n_samples, n_channels)
    # otherwise; only index a channel when there is a channel axis.
    if samples.ndim > 1:
        samples = samples[:, channel]
    s = stft(samples, binsize)

    sshow, _ = logscale_spec(s, factor=1, sr=samplerate, alpha=alpha)
    sshow = sshow[2:, :]
    # amplitude to decibel
    # NOTE(review): 10e-6 equals 1e-5; all-zero frames produce -inf here.
    ims = 20.*np.log10(np.abs(sshow)/10e-6)

    # frequency on the vertical axis, time on the horizontal axis
    ims = np.transpose(ims)
    ims = ims[0:256, :] # 0-11khz, ~10s interval

    image = Image.fromarray(ims)
    image = image.convert('L')
    image.save(name)
def get_score(probs, label):
    """ Return the points one prediction earns.

    probs -- sequence of per-class scores; the index is the class ID.
    label -- ID of the correct class.

    1000 points if the label is the top-ranked class, 400 if it is second,
    160 if it is third, 0 otherwise.  Ties are broken in favour of the
    higher class ID.  Works with fewer than three classes as well.
    """
    rank_points = (1000, 400, 160)
    ranked = sorted(((p, idx) for idx, p in enumerate(probs)), reverse=True)
    # zip() stops at the shorter sequence, so at most the top 3 are checked
    # and short inputs cannot raise IndexError.
    for points, (_, idx) in zip(rank_points, ranked):
        if idx == label:
            return points
    return 0

def get_full_score(preds, labels):
    """ Return the mean per-example score scaled by 3520.

    preds  -- sequence of per-class score vectors, one per example.
    labels -- sequence of correct class IDs, same order as preds.

    Returns 0.0 for an empty label list instead of dividing by zero.
    """
    if len(labels) == 0:
        return 0.0

    topCoderScore = 0.0
    for i, label in enumerate(labels):
        topCoderScore += get_score(preds[i], label)

    return topCoderScore / len(labels) * 3520
############################ 68 | import theano 69 | import theano.tensor as T 70 | import lasagne 71 | import lasagne.layers as layers 72 | 73 | n_train_examples = 10000 74 | X = X.astype(theano.config.floatX) 75 | trainX = X[:n_train_examples] 76 | trainY = Y[:n_train_examples] 77 | valX = X[n_train_examples:] 78 | valY = Y[n_train_examples:] 79 | 80 | input_var = T.matrix('X') 81 | target_var = T.ivector('y') 82 | 83 | from lasagne.nonlinearities import softmax, sigmoid, rectify 84 | network = lasagne.layers.InputLayer((None, X.shape[1]), input_var) 85 | network = lasagne.layers.DenseLayer(network, 4000, nonlinearity=rectify) 86 | network = lasagne.layers.DenseLayer(lasagne.layers.dropout(network, 0.5), 176, nonlinearity=softmax) 87 | 88 | prediction = lasagne.layers.get_output(network) 89 | loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) 90 | loss = loss.mean() + 0 * lasagne.regularization.regularize_network_params( 91 | network, lasagne.regularization.l2) 92 | 93 | params = lasagne.layers.get_all_params(network, trainable=True) 94 | learning_rate = theano.shared(np.float32(0.2)) 95 | updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=learning_rate, momentum=0.9) 96 | train_fn = theano.function([input_var, target_var], loss, updates=updates) 97 | validation_fn = theano.function([input_var, target_var], loss) 98 | 99 | for epoch in range(1000): 100 | train_loss = train_fn(trainX, trainY) 101 | val_loss = validation_fn(valX, valY) 102 | print "Epoch %d: train_loss = %f, val_loss = %f, lr = %f" % (epoch + 1, train_loss, val_loss, learning_rate.get_value()) 103 | if (epoch > 0 and epoch % 200 == 0): 104 | learning_rate.set_value(np.float32(learning_rate.get_value() * 0.7)) 105 | 106 | test_prediction = lasagne.layers.get_output(network, deterministic=True) 107 | predict_fn = theano.function([input_var], test_prediction) 108 | all_predictions = predict_fn(valX) 109 | 110 | score = 0.0 111 | for probs, label in 
zip(all_predictions, valY): 112 | score += get_score(probs, label) 113 | print "Final score on ensembling validaion = %f" % score 114 | print "Expected score = %f" % (score / len(valY) * 3520) 115 | 116 | 117 | print "\n\n==> creating submission..." 118 | X = np.zeros((12320, n_models * 176), dtype=np.float32) 119 | for iter in range(n_models): 120 | csvpath = 'probs/test/' + sys.argv[iter + 1] 121 | csv = open(csvpath, 'r') 122 | for row_id, line in enumerate(csv.readlines()): 123 | mas = line.split(',') 124 | mas = np.array([float(x) for x in mas], dtype=np.float32) 125 | X[row_id, 176*iter:176*(iter+1)] = mas 126 | csv.close() 127 | 128 | prediction = predict_fn(X) 129 | print "prediction.shape =", prediction.shape 130 | ensembled = open('ensembled.csv', 'w') 131 | for probs in prediction: 132 | out = [str(x) for x in probs] 133 | ensembled.write(','.join(out) + '\n') 134 | 135 | 136 | """ 137 | ######################### SAVING MODEL TO BE ABLE TO REPRODUCE ################# 138 | print "==> Saving model..." 
139 | with open("model.pickle", 'w') as save_file: 140 | pickle.dump(obj = {'params' : layers.get_all_param_values(network)}, file = save_file, protocol = -1) 141 | """ 142 | -------------------------------------------------------------------------------- /ensembling/get_output_layers.py: -------------------------------------------------------------------------------- 1 | """ Usage: python get_output_layers.py test|val 2 | """ 3 | import sys 4 | import caffe 5 | import numpy as np 6 | 7 | caffe.set_mode_gpu() 8 | 9 | deploy = '../prototxt/deploy.augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3.prototxt' 10 | model = 'augm_dropout0.3_on_augm84K-lr0.01_30K_iter_75000' 11 | model_path = '../models/' + model + '.caffemodel' 12 | 13 | """ 14 | ####################### networks with no augmentation ########################## 15 | net = caffe.Classifier(deploy, model_path) 16 | transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape}) 17 | transformer.set_transpose('data', (2, 0, 1)) 18 | net.blobs['data'].reshape(1, 1, 256, 858) 19 | 20 | folder = '/home/brainstorm/caffe/Data/mnt/3/language/train/png/' 21 | cnt = 12320 22 | file = open('../valEqual.csv', 'r') 23 | prob_file = open('probs/val/' + model + '.csv', 'w') 24 | 25 | for iter in range(cnt): 26 | name = file.readline().split(',')[0] 27 | net.blobs['data'].data[...] 
= transformer.preprocess('data', 28 | caffe.io.load_image(folder + name + '.png', color=False)) 29 | probs = net.forward()['loss'][0] 30 | probs = [str(x) for x in probs] 31 | prob_file.write(','.join(probs) + '\n') 32 | 33 | if (iter % 100 == 0): 34 | print "processed %d images" % (iter + 1) 35 | """ 36 | 37 | ######################### networks with augmentation ########################### 38 | assert sys.argv[1] in ('test', 'val') 39 | dataset = sys.argv[1] 40 | augm_cnt = 20 41 | cnt = 12320 42 | 43 | if (dataset == 'val'): 44 | folder = '/home/brainstorm/caffe/Data/mnt/3/language/train/pngaugm/' 45 | file = open('../valEqual.csv', 'r') 46 | else: 47 | folder = '../test/pngaugm/' 48 | file = open('../testingData.csv', 'r') 49 | 50 | # sum - mean of augm_cnt versions of speech 51 | # log - mean of logs of augm_cnt versions of speech 52 | # dense - last dense layer, 1024 outputs 53 | prob_file_sum = open('probs/' + dataset + '/' + model + '.sum' + str(augm_cnt) + '.csv', 'w') 54 | prob_file_log = open('probs/' + dataset + '/' + model + '.log' + str(augm_cnt) + '.csv', 'w') 55 | dense_file = open('probs/' + dataset + '/'+ model + '.dense' + str(augm_cnt) + '.csv', 'w') 56 | 57 | net = caffe.Classifier(deploy, model_path) 58 | transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape}) 59 | transformer.set_transpose('data', (2, 0, 1)) 60 | 61 | net.blobs['data'].reshape(augm_cnt, 1, 256, 768) 62 | for iter in range(cnt): 63 | if (dataset == 'val'): 64 | name = file.readline().split(',')[0] 65 | else: 66 | name = file.readline().strip()[:-4] 67 | X = np.zeros((augm_cnt, 1, 256, 768), dtype=np.float32) 68 | for index in range(augm_cnt): 69 | augm_path = folder + name + '.' + str(index) + '.png' 70 | X[index] = transformer.preprocess('data', caffe.io.load_image(augm_path, color=False)) 71 | 72 | net.blobs['data'].data[...] 
""" USAGE: python get_score_from_probabilities.py --prediction=<csv> --answer=<csv>

    Line i of the prediction file holds the comma-separated class
    probabilities of example i; line i of the answer file is "<name>,<label>".
    The prediction file may have fewer lines than the answer file: only the
    first len(prediction) answers are scored.

    Progress and results are written to stderr; every example whose top
    prediction is wrong is logged to wrong_answers.txt.
"""
import sys
import numpy as np
import argparse

# Points earned when the correct label is ranked 1st, 2nd or 3rd.
RANK_POINTS = (1000, 400, 160)
# Factor applied to the mean per-example score.
# NOTE(review): ensembling/ensemble.theano.py scales by 3520, not 35200 --
# confirm which one is intended.
SCORE_SCALE = 35200

parser = argparse.ArgumentParser()
parser.add_argument('--prediction', type=str, required=True)
parser.add_argument('--answer', type=str, default='valDataNew.csv')
args = parser.parse_args()
print(args)

with open(args.prediction, 'r') as prediction_file:
    prediction_lines = prediction_file.readlines()
with open(args.answer, 'r') as answer_file:
    answer_lines = answer_file.readlines()

cnt = len(prediction_lines)
if cnt == 0:
    sys.exit("no predictions found in %s" % args.prediction)
if len(answer_lines) < cnt:
    sys.exit("%s has %d lines but %s has only %d" %
             (args.prediction, cnt, args.answer, len(answer_lines)))

top_coder_score = 0.0
correct = 0

with open('wrong_answers.txt', 'w') as wrong_answers:
    for index in range(cnt):
        _name, label = answer_lines[index].split(',')
        label = int(label)

        probs = [float(x) for x in prediction_lines[index].split(',')]
        # Sorting (probability, class) pairs in reverse order breaks ties in
        # favour of the higher class ID.
        ranked = sorted(((p, cls) for cls, p in enumerate(probs)), reverse=True)
        top3 = [cls for _p, cls in ranked[:3]]

        if label in top3:
            top_coder_score += RANK_POINTS[top3.index(label)]

        # Only an exact top-1 hit counts towards accuracy.
        if top3[0] == label:
            correct += 1
        else:
            wrong_answers.write(answer_lines[index] + prediction_lines[index] + '\n')

        done = index + 1
        if done % 100 == 0:
            sys.stderr.write("processed %d / %d images\n" % (done, cnt))
            sys.stderr.write("expected score: %s\n" % (top_coder_score / done * SCORE_SCALE))

sys.stderr.write("Final score:  %s  /  %d 000\n" % (top_coder_score, cnt))
sys.stderr.write("expected score: %s\n" % (top_coder_score / cnt * SCORE_SCALE))
sys.stderr.write("Accuracy:  %s\n" % (100.0 * correct / cnt))
""" Usage: python get_sum_of_csvs.py csv1path csv2path ..

    Adds the given probability CSV files element-wise and writes the result
    to summed.csv.  Every input is expected to hold N_ROWS rows with
    N_CLASSES comma-separated numbers each.
"""
import sys
import numpy as np

N_ROWS = 12320     # rows read from every input file
N_CLASSES = 176    # values per row

csv_paths = sys.argv[1:]
csv_files = []
try:
    for path in csv_paths:
        csv_files.append(open(path, 'r'))

    with open('summed.csv', 'w') as outfile:
        for row in range(N_ROWS):
            # Accumulate in float32, one row at a time.
            total = np.zeros((N_CLASSES,), dtype=np.float32)
            for path, csv_file in zip(csv_paths, csv_files):
                line = csv_file.readline()
                if not line.strip():
                    # readline() returns '' at end of file; fail with the
                    # offending file and row instead of an opaque float('')
                    # error.
                    raise ValueError("%s has no data in row %d (expected %d rows)"
                                     % (path, row + 1, N_ROWS))
                total += [float(x) for x in line.split(',')]

            outfile.write(','.join("%.6f" % x for x in total) + '\n')
finally:
    # Close the inputs even when a row fails to parse.
    for csv_file in csv_files:
        csv_file.close()
12320 rows, 176 coloumns: the predictions for test set 3 | """ 4 | 5 | import sys 6 | import numpy as np 7 | 8 | # info about classes 9 | file = open('trainingData.csv') 10 | data = file.readlines()[1:] 11 | langs = set() 12 | for line in data: 13 | filepath, language = line.split(',') 14 | language = language.strip() 15 | langs.add(language) 16 | langs = sorted(langs) 17 | 18 | path = sys.argv[1] 19 | name = sys.argv[2] 20 | read_file = open(path, 'r') 21 | f = open('testingData.csv') 22 | cnt = 12320 23 | print_file = open('predictions/test_' + name + '.csv', 'w') 24 | 25 | for iter in range(cnt): 26 | st = f.readline() 27 | name = st.strip()[:-4] 28 | 29 | out = read_file.readline().split(',') 30 | out = [float(x) for x in out] 31 | pred = sorted([(x, it) for it, x in enumerate(out)], reverse=True) 32 | 33 | for i in range(3): 34 | lang_id = pred[i][1] 35 | lang = langs[lang_id] 36 | print_file.write(name + '.mp3,' + lang + ',' + str(i + 1) + '\n') 37 | 38 | if (iter % 100 == 0): 39 | print >> sys.stderr, "processed %d / %d images" % (iter + 1, cnt) 40 | -------------------------------------------------------------------------------- /prototxt/augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024r-1024r_DLR_nolrcoef.prototxt: -------------------------------------------------------------------------------- 1 | name: "LangNet" 2 | # DATA LAYERS 3 | layer { 4 | name: "mnist" 5 | type: "Data" 6 | top: "data" 7 | top: "label" 8 | include { 9 | phase: TRAIN 10 | } 11 | transform_param { 12 | scale: 0.00390625 13 | } 14 | data_param { 15 | source: "train/train_augm_db" 16 | batch_size: 24 17 | backend: LEVELDB 18 | } 19 | } 20 | layer { 21 | name: "mnist" 22 | type: "Data" 23 | top: "data" 24 | top: "label" 25 | include { 26 | phase: TEST 27 | } 28 | transform_param { 29 | scale: 0.00390625 30 | } 31 | data_param { 32 | source: "train/val_augm_db" 33 | batch_size: 24 34 | backend: LEVELDB 35 | } 36 | } 37 | 38 | # CONV1-RELU1-POOL1 39 | layer { 40 | name: "conv1" 41 | type: 
"Convolution" 42 | bottom: "data" 43 | top: "conv1" 44 | param { 45 | lr_mult: 1 46 | } 47 | param { 48 | lr_mult: 2 49 | } 50 | convolution_param { 51 | num_output: 32 52 | kernel_size: 7 53 | stride: 1 54 | weight_filler { 55 | type: "xavier" 56 | } 57 | bias_filler { 58 | type: "constant" 59 | } 60 | } 61 | } 62 | layer { 63 | name: "relu1" 64 | type: "ReLU" 65 | bottom: "conv1" 66 | top: "conv1" 67 | } 68 | layer { 69 | name: "pool1" 70 | type: "Pooling" 71 | bottom: "conv1" 72 | top: "pool1" 73 | pooling_param { 74 | pool: MAX 75 | kernel_size: 3 76 | stride: 2 77 | } 78 | } 79 | 80 | # CONV2-RELU2-POOL2_ 81 | layer { 82 | name: "conv2" 83 | type: "Convolution" 84 | bottom: "pool1" 85 | top: "conv2" 86 | param { 87 | lr_mult: 1 88 | } 89 | param { 90 | lr_mult: 2 91 | } 92 | convolution_param { 93 | num_output: 64 94 | kernel_size: 5 95 | stride: 1 96 | weight_filler { 97 | type: "xavier" 98 | } 99 | bias_filler { 100 | type: "constant" 101 | } 102 | } 103 | } 104 | layer { 105 | name: "relu2" 106 | type: "ReLU" 107 | bottom: "conv2" 108 | top: "conv2" 109 | } 110 | layer { 111 | name: "pool2" 112 | type: "Pooling" 113 | bottom: "conv2" 114 | top: "pool2" 115 | pooling_param { 116 | pool: MAX 117 | kernel_size: 3 118 | stride: 2 119 | } 120 | } 121 | 122 | # CONV3-RELU3-POOL3 123 | layer { 124 | name: "conv3" 125 | type: "Convolution" 126 | bottom: "pool2" 127 | top: "conv3" 128 | param { 129 | lr_mult: 1 130 | } 131 | param { 132 | lr_mult: 2 133 | } 134 | convolution_param { 135 | num_output: 64 136 | kernel_size: 3 137 | stride: 1 138 | weight_filler { 139 | type: "xavier" 140 | } 141 | bias_filler { 142 | type: "constant" 143 | } 144 | } 145 | } 146 | layer { 147 | name: "relu3" 148 | type: "ReLU" 149 | bottom: "conv3" 150 | top: "conv3" 151 | } 152 | layer { 153 | name: "pool3" 154 | type: "Pooling" 155 | bottom: "conv3" 156 | top: "pool3" 157 | pooling_param { 158 | pool: MAX 159 | kernel_size: 3 160 | stride:2 161 | } 162 | } 163 | 164 | # 
CONV4-RELU4-POOL4 165 | layer { 166 | name: "conv4" 167 | type: "Convolution" 168 | bottom: "pool3" 169 | top: "conv4" 170 | param { 171 | lr_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | } 176 | convolution_param { 177 | num_output: 128 178 | kernel_size: 3 179 | stride: 1 180 | weight_filler { 181 | type: "xavier" 182 | } 183 | bias_filler { 184 | type: "constant" 185 | } 186 | } 187 | } 188 | layer { 189 | name: "relu4" 190 | type: "ReLU" 191 | bottom: "conv4" 192 | top: "conv4" 193 | } 194 | layer { 195 | name: "pool4" 196 | type: "Pooling" 197 | bottom: "conv4" 198 | top: "pool4" 199 | pooling_param { 200 | pool: MAX 201 | kernel_size: 3 202 | stride:2 203 | } 204 | } 205 | 206 | # CONV5-RELU5-POOL5 207 | layer { 208 | name: "conv5" 209 | type: "Convolution" 210 | bottom: "pool4" 211 | top: "conv5" 212 | param { 213 | lr_mult: 1 214 | } 215 | param { 216 | lr_mult: 2 217 | } 218 | convolution_param { 219 | num_output: 128 220 | kernel_size: 3 221 | stride: 1 222 | weight_filler { 223 | type: "xavier" 224 | } 225 | bias_filler { 226 | type: "constant" 227 | } 228 | } 229 | } 230 | layer { 231 | name: "relu5" 232 | type: "ReLU" 233 | bottom: "conv5" 234 | top: "conv5" 235 | } 236 | layer { 237 | name: "pool5" 238 | type: "Pooling" 239 | bottom: "conv5" 240 | top: "pool5" 241 | pooling_param { 242 | pool: MAX 243 | kernel_size: 3 244 | stride:2 245 | } 246 | } 247 | 248 | # CONV6-RELU6-POOL6 249 | layer { 250 | name: "conv6" 251 | type: "Convolution" 252 | bottom: "pool5" 253 | top: "conv6" 254 | param { 255 | lr_mult: 1 256 | } 257 | param { 258 | lr_mult: 2 259 | } 260 | convolution_param { 261 | num_output: 256 262 | kernel_size: 3 263 | stride: 1 264 | weight_filler { 265 | type: "xavier" 266 | } 267 | bias_filler { 268 | type: "constant" 269 | } 270 | } 271 | } 272 | layer { 273 | name: "relu6" 274 | type: "ReLU" 275 | bottom: "conv6" 276 | top: "conv6" 277 | } 278 | layer { 279 | name: "pool6" 280 | type: "Pooling" 281 | bottom: "conv6" 282 | top: 
"pool6" 283 | pooling_param { 284 | pool: MAX 285 | kernel_size: 3 286 | stride:2 287 | } 288 | } 289 | 290 | # IP layers 291 | layer { 292 | name: "ip1new" 293 | type: "InnerProduct" 294 | bottom: "pool6" 295 | top: "ip1new" 296 | param { 297 | lr_mult: 1 298 | } 299 | param { 300 | lr_mult: 2 301 | } 302 | inner_product_param { 303 | num_output: 1024 304 | weight_filler { 305 | type: "xavier" 306 | } 307 | bias_filler { 308 | type: "constant" 309 | } 310 | } 311 | } 312 | layer { 313 | name: "reluOnIp1" 314 | type: "ReLU" 315 | bottom: "ip1new" 316 | top: "ip1new" 317 | } 318 | layer { 319 | name: "ip2new" 320 | type: "InnerProduct" 321 | bottom: "ip1new" 322 | top: "ip2new" 323 | param { 324 | lr_mult: 1 325 | } 326 | param { 327 | lr_mult: 2 328 | } 329 | inner_product_param { 330 | num_output: 1024 331 | weight_filler { 332 | type: "xavier" 333 | } 334 | bias_filler { 335 | type: "constant" 336 | } 337 | } 338 | } 339 | layer { 340 | name: "reluOnIp2" 341 | type: "ReLU" 342 | bottom: "ip2new" 343 | top: "ip2new" 344 | } 345 | layer { 346 | name: "ip3new" 347 | type: "InnerProduct" 348 | bottom: "ip2new" 349 | top: "ip3new" 350 | param { 351 | lr_mult: 1 352 | } 353 | param { 354 | lr_mult: 2 355 | } 356 | inner_product_param { 357 | num_output: 176 358 | weight_filler { 359 | type: "xavier" 360 | } 361 | bias_filler { 362 | type: "constant" 363 | } 364 | } 365 | } 366 | layer { 367 | name: "accuracy" 368 | type: "Accuracy" 369 | bottom: "ip3new" 370 | bottom: "label" 371 | top: "accuracy" 372 | include { 373 | phase: TEST 374 | } 375 | } 376 | layer { 377 | name: "loss" 378 | type: "SoftmaxWithLoss" 379 | bottom: "ip3new" 380 | bottom: "label" 381 | top: "loss" 382 | } 383 | -------------------------------------------------------------------------------- /prototxt/augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3.prototxt: -------------------------------------------------------------------------------- 1 | name: "LangNet" 2 | # DATA LAYERS 3 | 
layer { 4 | name: "mnist" 5 | type: "Data" 6 | top: "data" 7 | top: "label" 8 | include { 9 | phase: TRAIN 10 | } 11 | transform_param { 12 | scale: 0.00390625 13 | } 14 | data_param { 15 | source: "train/train_augm_db" 16 | batch_size: 23 17 | backend: LEVELDB 18 | } 19 | } 20 | layer { 21 | name: "mnist" 22 | type: "Data" 23 | top: "data" 24 | top: "label" 25 | include { 26 | phase: TEST 27 | } 28 | transform_param { 29 | scale: 0.00390625 30 | } 31 | data_param { 32 | source: "train/val_augm_db" 33 | batch_size: 24 34 | backend: LEVELDB 35 | } 36 | } 37 | 38 | # CONV1-RELU1-POOL1 39 | layer { 40 | name: "conv1" 41 | type: "Convolution" 42 | bottom: "data" 43 | top: "conv1" 44 | param { 45 | lr_mult: 1 46 | } 47 | param { 48 | lr_mult: 2 49 | } 50 | convolution_param { 51 | num_output: 32 52 | kernel_size: 7 53 | stride: 1 54 | weight_filler { 55 | type: "xavier" 56 | } 57 | bias_filler { 58 | type: "constant" 59 | } 60 | } 61 | } 62 | layer { 63 | name: "relu1" 64 | type: "ReLU" 65 | bottom: "conv1" 66 | top: "conv1" 67 | } 68 | layer { 69 | name: "pool1" 70 | type: "Pooling" 71 | bottom: "conv1" 72 | top: "pool1" 73 | pooling_param { 74 | pool: MAX 75 | kernel_size: 3 76 | stride: 2 77 | } 78 | } 79 | 80 | # CONV2-RELU2-POOL2_ 81 | layer { 82 | name: "conv2" 83 | type: "Convolution" 84 | bottom: "pool1" 85 | top: "conv2" 86 | param { 87 | lr_mult: 1 88 | } 89 | param { 90 | lr_mult: 2 91 | } 92 | convolution_param { 93 | num_output: 64 94 | kernel_size: 5 95 | stride: 1 96 | weight_filler { 97 | type: "xavier" 98 | } 99 | bias_filler { 100 | type: "constant" 101 | } 102 | } 103 | } 104 | layer { 105 | name: "relu2" 106 | type: "ReLU" 107 | bottom: "conv2" 108 | top: "conv2" 109 | } 110 | layer { 111 | name: "pool2" 112 | type: "Pooling" 113 | bottom: "conv2" 114 | top: "pool2" 115 | pooling_param { 116 | pool: MAX 117 | kernel_size: 3 118 | stride: 2 119 | } 120 | } 121 | 122 | # CONV3-RELU3-POOL3 123 | layer { 124 | name: "conv3" 125 | type: "Convolution" 126 
| bottom: "pool2" 127 | top: "conv3" 128 | param { 129 | lr_mult: 1 130 | } 131 | param { 132 | lr_mult: 2 133 | } 134 | convolution_param { 135 | num_output: 64 136 | kernel_size: 3 137 | stride: 1 138 | weight_filler { 139 | type: "xavier" 140 | } 141 | bias_filler { 142 | type: "constant" 143 | } 144 | } 145 | } 146 | layer { 147 | name: "relu3" 148 | type: "ReLU" 149 | bottom: "conv3" 150 | top: "conv3" 151 | } 152 | layer { 153 | name: "pool3" 154 | type: "Pooling" 155 | bottom: "conv3" 156 | top: "pool3" 157 | pooling_param { 158 | pool: MAX 159 | kernel_size: 3 160 | stride:2 161 | } 162 | } 163 | 164 | # CONV4-RELU4-POOL4 165 | layer { 166 | name: "conv4" 167 | type: "Convolution" 168 | bottom: "pool3" 169 | top: "conv4" 170 | param { 171 | lr_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | } 176 | convolution_param { 177 | num_output: 128 178 | kernel_size: 3 179 | stride: 1 180 | weight_filler { 181 | type: "xavier" 182 | } 183 | bias_filler { 184 | type: "constant" 185 | } 186 | } 187 | } 188 | layer { 189 | name: "relu4" 190 | type: "ReLU" 191 | bottom: "conv4" 192 | top: "conv4" 193 | } 194 | layer { 195 | name: "pool4" 196 | type: "Pooling" 197 | bottom: "conv4" 198 | top: "pool4" 199 | pooling_param { 200 | pool: MAX 201 | kernel_size: 3 202 | stride:2 203 | } 204 | } 205 | 206 | # CONV5-RELU5-POOL5 207 | layer { 208 | name: "conv5" 209 | type: "Convolution" 210 | bottom: "pool4" 211 | top: "conv5" 212 | param { 213 | lr_mult: 1 214 | } 215 | param { 216 | lr_mult: 2 217 | } 218 | convolution_param { 219 | num_output: 128 220 | kernel_size: 3 221 | stride: 1 222 | weight_filler { 223 | type: "xavier" 224 | } 225 | bias_filler { 226 | type: "constant" 227 | } 228 | } 229 | } 230 | layer { 231 | name: "relu5" 232 | type: "ReLU" 233 | bottom: "conv5" 234 | top: "conv5" 235 | } 236 | layer { 237 | name: "pool5" 238 | type: "Pooling" 239 | bottom: "conv5" 240 | top: "pool5" 241 | pooling_param { 242 | pool: MAX 243 | kernel_size: 3 244 | stride:2 245 
| } 246 | } 247 | 248 | # CONV6-RELU6-POOL6 249 | layer { 250 | name: "conv6" 251 | type: "Convolution" 252 | bottom: "pool5" 253 | top: "conv6" 254 | param { 255 | lr_mult: 1 256 | } 257 | param { 258 | lr_mult: 2 259 | } 260 | convolution_param { 261 | num_output: 256 262 | kernel_size: 3 263 | stride: 1 264 | weight_filler { 265 | type: "xavier" 266 | } 267 | bias_filler { 268 | type: "constant" 269 | } 270 | } 271 | } 272 | layer { 273 | name: "relu6" 274 | type: "ReLU" 275 | bottom: "conv6" 276 | top: "conv6" 277 | } 278 | layer { 279 | name: "pool6" 280 | type: "Pooling" 281 | bottom: "conv6" 282 | top: "pool6" 283 | pooling_param { 284 | pool: MAX 285 | kernel_size: 3 286 | stride:2 287 | } 288 | } 289 | 290 | # IP layers 291 | layer { 292 | name: "ip1new" 293 | type: "InnerProduct" 294 | bottom: "pool6" 295 | top: "ip1new" 296 | param { 297 | lr_mult: 1 298 | } 299 | param { 300 | lr_mult: 2 301 | } 302 | inner_product_param { 303 | num_output: 1024 304 | weight_filler { 305 | type: "xavier" 306 | } 307 | bias_filler { 308 | type: "constant" 309 | } 310 | } 311 | } 312 | layer { 313 | name: "reluOnIp1" 314 | type: "ReLU" 315 | bottom: "ip1new" 316 | top: "ip1new" 317 | } 318 | layer { 319 | name: "dropOnIp1" 320 | type: "Dropout" 321 | dropout_param { 322 | dropout_ratio: 0.3 323 | } 324 | bottom: "ip1new" 325 | top: "ip1new" 326 | } 327 | layer { 328 | name: "ip2new" 329 | type: "InnerProduct" 330 | bottom: "ip1new" 331 | top: "ip2new" 332 | param { 333 | lr_mult: 1 334 | } 335 | param { 336 | lr_mult: 2 337 | } 338 | inner_product_param { 339 | num_output: 1024 340 | weight_filler { 341 | type: "xavier" 342 | } 343 | bias_filler { 344 | type: "constant" 345 | } 346 | } 347 | } 348 | layer { 349 | name: "reluOnIp2" 350 | type: "ReLU" 351 | bottom: "ip2new" 352 | top: "ip2new" 353 | } 354 | layer { 355 | name: "dropOnIp2" 356 | type: "Dropout" 357 | dropout_param { 358 | dropout_ratio: 0.3 359 | } 360 | bottom: "ip2new" 361 | top: "ip2new" 362 | } 363 | 
layer { 364 | name: "ip3new" 365 | type: "InnerProduct" 366 | bottom: "ip2new" 367 | top: "ip3new" 368 | param { 369 | lr_mult: 1 370 | } 371 | param { 372 | lr_mult: 2 373 | } 374 | inner_product_param { 375 | num_output: 176 376 | weight_filler { 377 | type: "xavier" 378 | } 379 | bias_filler { 380 | type: "constant" 381 | } 382 | } 383 | } 384 | layer { 385 | name: "accuracy" 386 | type: "Accuracy" 387 | bottom: "ip3new" 388 | bottom: "label" 389 | top: "accuracy" 390 | include { 391 | phase: TEST 392 | } 393 | } 394 | layer { 395 | name: "loss" 396 | type: "SoftmaxWithLoss" 397 | bottom: "ip3new" 398 | bottom: "label" 399 | top: "loss" 400 | } 401 | -------------------------------------------------------------------------------- /prototxt/deploy.augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3.prototxt: -------------------------------------------------------------------------------- 1 | name: "LangNet" 2 | # DATA LAYERS 3 | input: "data" 4 | input_dim: 1 5 | input_dim: 1 6 | input_dim: 256 7 | input_dim: 768 8 | 9 | # CONV1-RELU1-POOL1 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | param { 16 | lr_mult: 1 17 | } 18 | param { 19 | lr_mult: 2 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 7 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | } 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "pool1" 41 | type: "Pooling" 42 | bottom: "conv1" 43 | top: "pool1" 44 | pooling_param { 45 | pool: MAX 46 | kernel_size: 3 47 | stride: 2 48 | } 49 | } 50 | 51 | # CONV2-RELU2-POOL2_ 52 | layer { 53 | name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 1 59 | } 60 | param { 61 | lr_mult: 2 62 | } 63 | convolution_param { 64 | num_output: 64 65 | kernel_size: 5 66 | stride: 1 67 | weight_filler 
{ 68 | type: "xavier" 69 | } 70 | bias_filler { 71 | type: "constant" 72 | } 73 | } 74 | } 75 | layer { 76 | name: "relu2" 77 | type: "ReLU" 78 | bottom: "conv2" 79 | top: "conv2" 80 | } 81 | layer { 82 | name: "pool2" 83 | type: "Pooling" 84 | bottom: "conv2" 85 | top: "pool2" 86 | pooling_param { 87 | pool: MAX 88 | kernel_size: 3 89 | stride: 2 90 | } 91 | } 92 | 93 | # CONV3-RELU3-POOL3 94 | layer { 95 | name: "conv3" 96 | type: "Convolution" 97 | bottom: "pool2" 98 | top: "conv3" 99 | param { 100 | lr_mult: 1 101 | } 102 | param { 103 | lr_mult: 2 104 | } 105 | convolution_param { 106 | num_output: 64 107 | kernel_size: 3 108 | stride: 1 109 | weight_filler { 110 | type: "xavier" 111 | } 112 | bias_filler { 113 | type: "constant" 114 | } 115 | } 116 | } 117 | layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "conv3" 121 | top: "conv3" 122 | } 123 | layer { 124 | name: "pool3" 125 | type: "Pooling" 126 | bottom: "conv3" 127 | top: "pool3" 128 | pooling_param { 129 | pool: MAX 130 | kernel_size: 3 131 | stride:2 132 | } 133 | } 134 | 135 | # CONV4-RELU4-POOL4 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "pool3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | } 144 | param { 145 | lr_mult: 2 146 | } 147 | convolution_param { 148 | num_output: 128 149 | kernel_size: 3 150 | stride: 1 151 | weight_filler { 152 | type: "xavier" 153 | } 154 | bias_filler { 155 | type: "constant" 156 | } 157 | } 158 | } 159 | layer { 160 | name: "relu4" 161 | type: "ReLU" 162 | bottom: "conv4" 163 | top: "conv4" 164 | } 165 | layer { 166 | name: "pool4" 167 | type: "Pooling" 168 | bottom: "conv4" 169 | top: "pool4" 170 | pooling_param { 171 | pool: MAX 172 | kernel_size: 3 173 | stride:2 174 | } 175 | } 176 | 177 | # CONV5-RELU5-POOL5 178 | layer { 179 | name: "conv5" 180 | type: "Convolution" 181 | bottom: "pool4" 182 | top: "conv5" 183 | param { 184 | lr_mult: 1 185 | } 186 | param { 187 | lr_mult: 2 188 | } 189 | convolution_param { 
190 | num_output: 128 191 | kernel_size: 3 192 | stride: 1 193 | weight_filler { 194 | type: "xavier" 195 | } 196 | bias_filler { 197 | type: "constant" 198 | } 199 | } 200 | } 201 | layer { 202 | name: "relu5" 203 | type: "ReLU" 204 | bottom: "conv5" 205 | top: "conv5" 206 | } 207 | layer { 208 | name: "pool5" 209 | type: "Pooling" 210 | bottom: "conv5" 211 | top: "pool5" 212 | pooling_param { 213 | pool: MAX 214 | kernel_size: 3 215 | stride:2 216 | } 217 | } 218 | 219 | # CONV6-RELU6-POOL6 220 | layer { 221 | name: "conv6" 222 | type: "Convolution" 223 | bottom: "pool5" 224 | top: "conv6" 225 | param { 226 | lr_mult: 1 227 | } 228 | param { 229 | lr_mult: 2 230 | } 231 | convolution_param { 232 | num_output: 256 233 | kernel_size: 3 234 | stride: 1 235 | weight_filler { 236 | type: "xavier" 237 | } 238 | bias_filler { 239 | type: "constant" 240 | } 241 | } 242 | } 243 | layer { 244 | name: "relu6" 245 | type: "ReLU" 246 | bottom: "conv6" 247 | top: "conv6" 248 | } 249 | layer { 250 | name: "pool6" 251 | type: "Pooling" 252 | bottom: "conv6" 253 | top: "pool6" 254 | pooling_param { 255 | pool: MAX 256 | kernel_size: 3 257 | stride:2 258 | } 259 | } 260 | 261 | # IP layers 262 | layer { 263 | name: "ip1new" 264 | type: "InnerProduct" 265 | bottom: "pool6" 266 | top: "ip1new" 267 | param { 268 | lr_mult: 1 269 | } 270 | param { 271 | lr_mult: 2 272 | } 273 | inner_product_param { 274 | num_output: 1024 275 | weight_filler { 276 | type: "xavier" 277 | } 278 | bias_filler { 279 | type: "constant" 280 | } 281 | } 282 | } 283 | layer { 284 | name: "reluOnIp1" 285 | type: "ReLU" 286 | bottom: "ip1new" 287 | top: "ip1new" 288 | } 289 | layer { 290 | name: "dropOnIp1" 291 | type: "Dropout" 292 | dropout_param { 293 | dropout_ratio: 0.3 294 | } 295 | bottom: "ip1new" 296 | top: "ip1new" 297 | } 298 | layer { 299 | name: "ip2new" 300 | type: "InnerProduct" 301 | bottom: "ip1new" 302 | top: "ip2new" 303 | param { 304 | lr_mult: 1 305 | } 306 | param { 307 | lr_mult: 2 308 | 
} 309 | inner_product_param { 310 | num_output: 1024 311 | weight_filler { 312 | type: "xavier" 313 | } 314 | bias_filler { 315 | type: "constant" 316 | } 317 | } 318 | } 319 | layer { 320 | name: "reluOnIp2" 321 | type: "ReLU" 322 | bottom: "ip2new" 323 | top: "ip2new" 324 | } 325 | layer { 326 | name: "dropOnIp2" 327 | type: "Dropout" 328 | dropout_param { 329 | dropout_ratio: 0.3 330 | } 331 | bottom: "ip2new" 332 | top: "ip2new" 333 | } 334 | layer { 335 | name: "ip3new" 336 | type: "InnerProduct" 337 | bottom: "ip2new" 338 | top: "ip3new" 339 | param { 340 | lr_mult: 1 341 | } 342 | param { 343 | lr_mult: 2 344 | } 345 | inner_product_param { 346 | num_output: 176 347 | weight_filler { 348 | type: "xavier" 349 | } 350 | bias_filler { 351 | type: "constant" 352 | } 353 | } 354 | } 355 | layer { 356 | name: "loss" 357 | type: "Softmax" 358 | bottom: "ip3new" 359 | top: "loss" 360 | } 361 | -------------------------------------------------------------------------------- /prototxt/deploy.main_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.5-1024rd0.5_DLR.prototxt: -------------------------------------------------------------------------------- 1 | name: "LangNet" 2 | # DATA LAYERS 3 | input: "data" 4 | input_dim: 1 5 | input_dim: 1 6 | input_dim: 256 7 | input_dim: 858 8 | 9 | # CONV1-RELU1-POOL1 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | param { 16 | lr_mult: 15 17 | } 18 | param { 19 | lr_mult: 30 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 7 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | } 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "pool1" 41 | type: "Pooling" 42 | bottom: "conv1" 43 | top: "pool1" 44 | pooling_param { 45 | pool: MAX 46 | kernel_size: 3 47 | stride: 2 48 | } 49 | } 50 | 51 | # CONV2-RELU2-POOL2_ 52 | layer { 53 | 
name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 12 59 | } 60 | param { 61 | lr_mult: 24 62 | } 63 | convolution_param { 64 | num_output: 64 65 | kernel_size: 5 66 | stride: 1 67 | weight_filler { 68 | type: "xavier" 69 | } 70 | bias_filler { 71 | type: "constant" 72 | } 73 | } 74 | } 75 | layer { 76 | name: "relu2" 77 | type: "ReLU" 78 | bottom: "conv2" 79 | top: "conv2" 80 | } 81 | layer { 82 | name: "pool2" 83 | type: "Pooling" 84 | bottom: "conv2" 85 | top: "pool2" 86 | pooling_param { 87 | pool: MAX 88 | kernel_size: 3 89 | stride: 2 90 | } 91 | } 92 | 93 | # CONV3-RELU3-POOL3 94 | layer { 95 | name: "conv3" 96 | type: "Convolution" 97 | bottom: "pool2" 98 | top: "conv3" 99 | param { 100 | lr_mult: 9 101 | } 102 | param { 103 | lr_mult: 18 104 | } 105 | convolution_param { 106 | num_output: 64 107 | kernel_size: 3 108 | stride: 1 109 | weight_filler { 110 | type: "xavier" 111 | } 112 | bias_filler { 113 | type: "constant" 114 | } 115 | } 116 | } 117 | layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "conv3" 121 | top: "conv3" 122 | } 123 | layer { 124 | name: "pool3" 125 | type: "Pooling" 126 | bottom: "conv3" 127 | top: "pool3" 128 | pooling_param { 129 | pool: MAX 130 | kernel_size: 3 131 | stride:2 132 | } 133 | } 134 | 135 | # CONV4-RELU4-POOL4 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "pool3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 4 143 | } 144 | param { 145 | lr_mult: 8 146 | } 147 | convolution_param { 148 | num_output: 128 149 | kernel_size: 3 150 | stride: 1 151 | weight_filler { 152 | type: "xavier" 153 | } 154 | bias_filler { 155 | type: "constant" 156 | } 157 | } 158 | } 159 | layer { 160 | name: "relu4" 161 | type: "ReLU" 162 | bottom: "conv4" 163 | top: "conv4" 164 | } 165 | layer { 166 | name: "pool4" 167 | type: "Pooling" 168 | bottom: "conv4" 169 | top: "pool4" 170 | pooling_param { 171 | pool: MAX 172 | kernel_size: 3 173 | stride:2 
174 | } 175 | } 176 | 177 | # CONV5-RELU5-POOL5 178 | layer { 179 | name: "conv5" 180 | type: "Convolution" 181 | bottom: "pool4" 182 | top: "conv5" 183 | param { 184 | lr_mult: 2 185 | } 186 | param { 187 | lr_mult: 4 188 | } 189 | convolution_param { 190 | num_output: 128 191 | kernel_size: 3 192 | stride: 1 193 | weight_filler { 194 | type: "xavier" 195 | } 196 | bias_filler { 197 | type: "constant" 198 | } 199 | } 200 | } 201 | layer { 202 | name: "relu5" 203 | type: "ReLU" 204 | bottom: "conv5" 205 | top: "conv5" 206 | } 207 | layer { 208 | name: "pool5" 209 | type: "Pooling" 210 | bottom: "conv5" 211 | top: "pool5" 212 | pooling_param { 213 | pool: MAX 214 | kernel_size: 3 215 | stride:2 216 | } 217 | } 218 | 219 | # CONV6-RELU6-POOL6 220 | layer { 221 | name: "conv6" 222 | type: "Convolution" 223 | bottom: "pool5" 224 | top: "conv6" 225 | param { 226 | lr_mult: 1 227 | } 228 | param { 229 | lr_mult: 2 230 | } 231 | convolution_param { 232 | num_output: 256 233 | kernel_size: 3 234 | stride: 1 235 | weight_filler { 236 | type: "xavier" 237 | } 238 | bias_filler { 239 | type: "constant" 240 | } 241 | } 242 | } 243 | layer { 244 | name: "relu6" 245 | type: "ReLU" 246 | bottom: "conv6" 247 | top: "conv6" 248 | } 249 | layer { 250 | name: "pool6" 251 | type: "Pooling" 252 | bottom: "conv6" 253 | top: "pool6" 254 | pooling_param { 255 | pool: MAX 256 | kernel_size: 3 257 | stride:2 258 | } 259 | } 260 | 261 | # IP layers 262 | layer { 263 | name: "ip1" 264 | type: "InnerProduct" 265 | bottom: "pool6" 266 | top: "ip1" 267 | param { 268 | lr_mult: 1 269 | } 270 | param { 271 | lr_mult: 2 272 | } 273 | inner_product_param { 274 | num_output: 1024 275 | weight_filler { 276 | type: "xavier" 277 | } 278 | bias_filler { 279 | type: "constant" 280 | } 281 | } 282 | } 283 | layer { 284 | name: "reluOnIp1" 285 | type: "ReLU" 286 | bottom: "ip1" 287 | top: "ip1" 288 | } 289 | layer { 290 | name: "dropOnIp1" 291 | type: "Dropout" 292 | dropout_param { 293 | dropout_ratio: 0.5 
294 | } 295 | bottom: "ip1" 296 | top: "ip1" 297 | } 298 | layer { 299 | name: "ip2" 300 | type: "InnerProduct" 301 | bottom: "ip1" 302 | top: "ip2" 303 | param { 304 | lr_mult: 1 305 | } 306 | param { 307 | lr_mult: 2 308 | } 309 | inner_product_param { 310 | num_output: 1024 311 | weight_filler { 312 | type: "xavier" 313 | } 314 | bias_filler { 315 | type: "constant" 316 | } 317 | } 318 | } 319 | layer { 320 | name: "reluOnIp2" 321 | type: "ReLU" 322 | bottom: "ip2" 323 | top: "ip2" 324 | } 325 | layer { 326 | name: "dropOnIp2" 327 | type: "Dropout" 328 | dropout_param { 329 | dropout_ratio: 0.5 330 | } 331 | bottom: "ip2" 332 | top: "ip2" 333 | } 334 | layer { 335 | name: "ip3" 336 | type: "InnerProduct" 337 | bottom: "ip2" 338 | top: "ip3" 339 | param { 340 | lr_mult: 1 341 | } 342 | param { 343 | lr_mult: 2 344 | } 345 | inner_product_param { 346 | num_output: 176 347 | weight_filler { 348 | type: "xavier" 349 | } 350 | bias_filler { 351 | type: "constant" 352 | } 353 | } 354 | } 355 | layer { 356 | name: "loss" 357 | type: "Softmax" 358 | bottom: "ip3" 359 | top: "loss" 360 | } 361 | -------------------------------------------------------------------------------- /prototxt/main_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.5-1024rd0.5_DLR.prototxt: -------------------------------------------------------------------------------- 1 | name: "LangNet" 2 | # DATA LAYERS 3 | layer { 4 | name: "mnist" 5 | type: "Data" 6 | top: "data" 7 | top: "label" 8 | include { 9 | phase: TRAIN 10 | } 11 | transform_param { 12 | scale: 0.00390625 13 | } 14 | data_param { 15 | source: "train/traindb" 16 | batch_size: 32 17 | backend: LEVELDB 18 | } 19 | } 20 | layer { 21 | name: "mnist" 22 | type: "Data" 23 | top: "data" 24 | top: "label" 25 | include { 26 | phase: TEST 27 | } 28 | transform_param { 29 | scale: 0.00390625 30 | } 31 | data_param { 32 | source: "train/valdb" 33 | batch_size: 1 34 | backend: LEVELDB 35 | } 36 | } 37 | 38 | # CONV1-RELU1-POOL1 39 | layer { 40 | name: 
"conv1" 41 | type: "Convolution" 42 | bottom: "data" 43 | top: "conv1" 44 | param { 45 | lr_mult: 15 46 | } 47 | param { 48 | lr_mult: 30 49 | } 50 | convolution_param { 51 | num_output: 32 52 | kernel_size: 7 53 | stride: 1 54 | weight_filler { 55 | type: "xavier" 56 | } 57 | bias_filler { 58 | type: "constant" 59 | } 60 | } 61 | } 62 | layer { 63 | name: "relu1" 64 | type: "ReLU" 65 | bottom: "conv1" 66 | top: "conv1" 67 | } 68 | layer { 69 | name: "pool1" 70 | type: "Pooling" 71 | bottom: "conv1" 72 | top: "pool1" 73 | pooling_param { 74 | pool: MAX 75 | kernel_size: 3 76 | stride: 2 77 | } 78 | } 79 | 80 | # CONV2-RELU2-POOL2_ 81 | layer { 82 | name: "conv2" 83 | type: "Convolution" 84 | bottom: "pool1" 85 | top: "conv2" 86 | param { 87 | lr_mult: 12 88 | } 89 | param { 90 | lr_mult: 24 91 | } 92 | convolution_param { 93 | num_output: 64 94 | kernel_size: 5 95 | stride: 1 96 | weight_filler { 97 | type: "xavier" 98 | } 99 | bias_filler { 100 | type: "constant" 101 | } 102 | } 103 | } 104 | layer { 105 | name: "relu2" 106 | type: "ReLU" 107 | bottom: "conv2" 108 | top: "conv2" 109 | } 110 | layer { 111 | name: "pool2" 112 | type: "Pooling" 113 | bottom: "conv2" 114 | top: "pool2" 115 | pooling_param { 116 | pool: MAX 117 | kernel_size: 3 118 | stride: 2 119 | } 120 | } 121 | 122 | # CONV3-RELU3-POOL3 123 | layer { 124 | name: "conv3" 125 | type: "Convolution" 126 | bottom: "pool2" 127 | top: "conv3" 128 | param { 129 | lr_mult: 9 130 | } 131 | param { 132 | lr_mult: 18 133 | } 134 | convolution_param { 135 | num_output: 64 136 | kernel_size: 3 137 | stride: 1 138 | weight_filler { 139 | type: "xavier" 140 | } 141 | bias_filler { 142 | type: "constant" 143 | } 144 | } 145 | } 146 | layer { 147 | name: "relu3" 148 | type: "ReLU" 149 | bottom: "conv3" 150 | top: "conv3" 151 | } 152 | layer { 153 | name: "pool3" 154 | type: "Pooling" 155 | bottom: "conv3" 156 | top: "pool3" 157 | pooling_param { 158 | pool: MAX 159 | kernel_size: 3 160 | stride:2 161 | } 162 | } 163 
| 164 | # CONV4-RELU4-POOL4 165 | layer { 166 | name: "conv4" 167 | type: "Convolution" 168 | bottom: "pool3" 169 | top: "conv4" 170 | param { 171 | lr_mult: 4 172 | } 173 | param { 174 | lr_mult: 8 175 | } 176 | convolution_param { 177 | num_output: 128 178 | kernel_size: 3 179 | stride: 1 180 | weight_filler { 181 | type: "xavier" 182 | } 183 | bias_filler { 184 | type: "constant" 185 | } 186 | } 187 | } 188 | layer { 189 | name: "relu4" 190 | type: "ReLU" 191 | bottom: "conv4" 192 | top: "conv4" 193 | } 194 | layer { 195 | name: "pool4" 196 | type: "Pooling" 197 | bottom: "conv4" 198 | top: "pool4" 199 | pooling_param { 200 | pool: MAX 201 | kernel_size: 3 202 | stride:2 203 | } 204 | } 205 | 206 | # CONV5-RELU5-POOL5 207 | layer { 208 | name: "conv5" 209 | type: "Convolution" 210 | bottom: "pool4" 211 | top: "conv5" 212 | param { 213 | lr_mult: 2 214 | } 215 | param { 216 | lr_mult: 4 217 | } 218 | convolution_param { 219 | num_output: 128 220 | kernel_size: 3 221 | stride: 1 222 | weight_filler { 223 | type: "xavier" 224 | } 225 | bias_filler { 226 | type: "constant" 227 | } 228 | } 229 | } 230 | layer { 231 | name: "relu5" 232 | type: "ReLU" 233 | bottom: "conv5" 234 | top: "conv5" 235 | } 236 | layer { 237 | name: "pool5" 238 | type: "Pooling" 239 | bottom: "conv5" 240 | top: "pool5" 241 | pooling_param { 242 | pool: MAX 243 | kernel_size: 3 244 | stride:2 245 | } 246 | } 247 | 248 | # CONV6-RELU6-POOL6 249 | layer { 250 | name: "conv6" 251 | type: "Convolution" 252 | bottom: "pool5" 253 | top: "conv6" 254 | param { 255 | lr_mult: 1 256 | } 257 | param { 258 | lr_mult: 2 259 | } 260 | convolution_param { 261 | num_output: 256 262 | kernel_size: 3 263 | stride: 1 264 | weight_filler { 265 | type: "xavier" 266 | } 267 | bias_filler { 268 | type: "constant" 269 | } 270 | } 271 | } 272 | layer { 273 | name: "relu6" 274 | type: "ReLU" 275 | bottom: "conv6" 276 | top: "conv6" 277 | } 278 | layer { 279 | name: "pool6" 280 | type: "Pooling" 281 | bottom: "conv6" 282 
| top: "pool6" 283 | pooling_param { 284 | pool: MAX 285 | kernel_size: 3 286 | stride:2 287 | } 288 | } 289 | 290 | # IP layers 291 | layer { 292 | name: "ip1" 293 | type: "InnerProduct" 294 | bottom: "pool6" 295 | top: "ip1" 296 | param { 297 | lr_mult: 1 298 | } 299 | param { 300 | lr_mult: 2 301 | } 302 | inner_product_param { 303 | num_output: 1024 304 | weight_filler { 305 | type: "xavier" 306 | } 307 | bias_filler { 308 | type: "constant" 309 | } 310 | } 311 | } 312 | layer { 313 | name: "reluOnIp1" 314 | type: "ReLU" 315 | bottom: "ip1" 316 | top: "ip1" 317 | } 318 | layer { 319 | name: "dropOnIp1" 320 | type: "Dropout" 321 | dropout_param { 322 | dropout_ratio: 0.5 323 | } 324 | bottom: "ip1" 325 | top: "ip1" 326 | } 327 | layer { 328 | name: "ip2" 329 | type: "InnerProduct" 330 | bottom: "ip1" 331 | top: "ip2" 332 | param { 333 | lr_mult: 1 334 | } 335 | param { 336 | lr_mult: 2 337 | } 338 | inner_product_param { 339 | num_output: 1024 340 | weight_filler { 341 | type: "xavier" 342 | } 343 | bias_filler { 344 | type: "constant" 345 | } 346 | } 347 | } 348 | layer { 349 | name: "reluOnIp2" 350 | type: "ReLU" 351 | bottom: "ip2" 352 | top: "ip2" 353 | } 354 | layer { 355 | name: "dropOnIp2" 356 | type: "Dropout" 357 | dropout_param { 358 | dropout_ratio: 0.5 359 | } 360 | bottom: "ip2" 361 | top: "ip2" 362 | } 363 | layer { 364 | name: "ip3" 365 | type: "InnerProduct" 366 | bottom: "ip2" 367 | top: "ip3" 368 | param { 369 | lr_mult: 1 370 | } 371 | param { 372 | lr_mult: 2 373 | } 374 | inner_product_param { 375 | num_output: 176 376 | weight_filler { 377 | type: "xavier" 378 | } 379 | bias_filler { 380 | type: "constant" 381 | } 382 | } 383 | } 384 | layer { 385 | name: "accuracy" 386 | type: "Accuracy" 387 | bottom: "ip3" 388 | bottom: "label" 389 | top: "accuracy" 390 | include { 391 | phase: TEST 392 | } 393 | } 394 | layer { 395 | name: "loss" 396 | type: "SoftmaxWithLoss" 397 | bottom: "ip3" 398 | bottom: "label" 399 | top: "loss" 400 | } 401 | 
-------------------------------------------------------------------------------- /prototxt/solver.augm.nolrcoef.prototxt: -------------------------------------------------------------------------------- 1 | net: "prototxt/augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3.prototxt" 2 | 3 | test_iter: 512 4 | test_interval: 1500 5 | 6 | # The base learning rate, momentum and the weight decay of the network. 7 | base_lr: 0.01 8 | weight_decay: 0.0000 9 | 10 | # The learning rate policy 11 | # lr_policy: "fixed" 12 | # solver_type: ADADELTA 13 | 14 | lr_policy: "inv" 15 | gamma: 0.0003 16 | power: 0.9 17 | 18 | #lr_policy: "step" 19 | #gamma: 0.9 20 | #stepsize: 6000 21 | 22 | display: 1 23 | 24 | max_iter: 800000 25 | 26 | snapshot: 3000 27 | snapshot_prefix: "models/augm_dropout0.3_on_augm84K-lr0.01_30K_90K" 28 | #log: "logs/augm_dropout0.3_on_augm84K-lr0.01_30K_90K.txt" 29 | solver_mode: GPU 30 | 31 | -------------------------------------------------------------------------------- /prototxt/solver.main.adadelta.prototxt: -------------------------------------------------------------------------------- 1 | net: "prototxt/main_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.5-1024rd0.5_DLR.prototxt" 2 | 3 | test_iter: 100 4 | test_interval: 100 5 | 6 | # The base learning rate, momentum and the weight decay of the network. 
"""Score the augmented-data Caffe network on a list of recordings.

Usage: python test_augm_network.py {test|val|train}

For every recording the network output is summed over ``aveSamples``
pre-generated spectrogram crops (``<name>.<k>.png``); the three best
languages are written to a CSV file in ``predictions/``.  For the labelled
lists (val / train) a TopCoder-style score is accumulated as well.
"""
import sys

import numpy as np

# network parameters:
deploy_name = 'augm_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.3-1024rd0.3'
network_name = 'augm_dropout0.3_on_augm84K-lr0.01_30K'
iterations = '90000'
aveSamples = 20  # average over this many samples

NUM_CLASSES = 176
# points awarded when the true language is ranked 1st / 2nd / 3rd
TOP3_POINTS = (1000, 400, 160)

# predict_set -> (png folder, list file, number of recordings, output prefix)
DATASETS = {
    'test': ('test/png/', 'testingData.csv', 12320, 'test_'),
    'val': ('/home/brainstorm/caffe/Data/mnt/3/language/train/pngaugm/',
            'valEqual.csv', 12320, 'validation_'),
    'train': ('/home/brainstorm/caffe/Data/mnt/3/language/train/pngaugm/',
              'trainEqual.csv', 10000, 'train_'),
}


def log(*parts):
    """Write the space-separated parts to stderr (like ``print >> sys.stderr``)."""
    sys.stderr.write(' '.join(str(part) for part in parts) + '\n')


def load_languages(csv_path='trainingData.csv'):
    """Return the sorted list of language names found in the training CSV.

    The first line is a header; every other line is ``filepath,language``.
    The position of a language in the returned list is its class id.
    """
    with open(csv_path) as csv_file:
        rows = csv_file.readlines()[1:]
    langs = set()
    for row in rows:
        if not row.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        _filepath, language = row.split(',')
        langs.add(language.strip())
    return sorted(langs)


def parse_list_line(line, labelled):
    """Split one list-file line into ``(name, label)``.

    Labelled lists hold ``name,label``; the unlabelled test list holds a file
    name whose 4-character extension is dropped, and ``label`` is ``None``.
    """
    if labelled:
        name, label = line.split(',')
        return name, int(label)
    return line.strip()[:-4], None


def rank_classes(scores):
    """Return class ids ordered from the highest to the lowest score.

    Ties are broken in favour of the larger class id, which is what sorting
    ``(score, id)`` pairs in reverse order does.
    """
    ordered = sorted(((score, idx) for idx, score in enumerate(scores)), reverse=True)
    return [idx for _score, idx in ordered]


def top3_score(ranked, label):
    """Return the points earned for ``label`` given the ranked class ids."""
    for points, class_id in zip(TOP3_POINTS, ranked):
        if class_id == label:
            return points
    return 0


def expected_score(score, processed):
    """Extrapolate the running score to 35200 recordings (true division)."""
    if not processed:
        return 0.0
    return float(score) / processed * 35200


def main(argv=None):
    """Run the evaluation for the data set named by ``argv[1]``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: test_augm_network.py {test|val|train}')
    predict_set = argv[1]

    # Any value other than "test" / "val" selects the training list, as before.
    folder, list_name, cnt, prefix = DATASETS.get(predict_set, DATASETS['train'])
    labelled = predict_set != 'test'

    # Imported here so the pure helpers above can be used without Caffe.
    import caffe

    caffe.set_mode_gpu()

    # info about classes
    langs = load_languages('trainingData.csv')

    net = caffe.Classifier(
        model_file='prototxt/deploy.' + deploy_name + '.prototxt',
        pretrained_file='models/' + network_name + '_iter_' + iterations + '.caffemodel')
    net.blobs['data'].reshape(1, 1, 256, 768)

    run_name = network_name + '_iter_' + iterations + '_' + str(aveSamples)
    out_path = 'predictions/' + prefix + run_name + '.csv'

    topcoder_score = 0.0
    processed = 0

    # Both files are closed (and the predictions flushed) even on failure.
    with open(list_name) as list_file, open(out_path, 'w') as print_file:
        for index in range(cnt):
            st = list_file.readline()
            if not st.strip():
                # The list is shorter than the hard-coded count: stop cleanly.
                log('list file ended after %d of %d entries' % (index, cnt))
                break
            name, label = parse_list_line(st, labelled)
            processed += 1

            out = np.zeros((NUM_CLASSES,))
            for randomIndex in range(aveSamples):
                image = caffe.io.load_image(
                    folder + name + '.' + str(randomIndex) + '.png', color=False)
                image = np.transpose(image, (2, 0, 1))
                net.blobs['data'].data[...] = image
                out += net.forward()['loss'][0]

            ranked = rank_classes(out)
            if labelled:
                topcoder_score += top3_score(ranked, label)

            for place, lang_id in enumerate(ranked[:3]):
                print_file.write(name + '.mp3,' + langs[lang_id] + ',' + str(place + 1) + '\n')

            if index % 100 == 0:
                log(run_name)
                log("processed %d / %d images (%d samples/mp3)" % (index, cnt, aveSamples))
                log("score: ", topcoder_score)
                log("expected score:", expected_score(topcoder_score, processed))

    log("Final score: ", topcoder_score, " / ", cnt, "000")
    log("expected score:", expected_score(topcoder_score, processed))


if __name__ == '__main__':
    main()
"""Score the main Caffe network on a list of recordings.

Usage: python test_main_network.py {test|val|train}

Each recording has one spectrogram (``<name>.png``).  The three best
languages are written to a CSV file in ``predictions/``; for the labelled
lists (val / train) a TopCoder-style score is accumulated as well.
"""
import sys

import numpy as np

# network parameters:
deploy_name = 'main_32r-2-64r-2-64r-2-128r-2-128r-2-256r-2-1024rd0.5-1024rd0.5_DLR'
network_name = deploy_name + '_150K-momentum'
iterations = '51000'

# points awarded when the true language is ranked 1st / 2nd / 3rd
TOP3_POINTS = (1000, 400, 160)

# predict_set -> (png folder, list file, number of recordings, output prefix)
DATASETS = {
    'test': ('test/png/', 'testingData.csv', 12320, 'test_'),
    # original author's note (transliterated Armenian "stegh dreci augm"):
    # the augmented PNG folder was put here -- presumably on purpose; confirm.
    'val': ('/home/brainstorm/caffe/Data/mnt/3/language/train/pngaugm/',
            'valDataNew.csv', 16176, 'validation_'),
    'train': ('/home/brainstorm/caffe/Data/mnt/3/language/train/png/',
              'trainingDataNew.csv', 10000, 'train_'),
}


def log(*parts):
    """Write the space-separated parts to stderr (like ``print >> sys.stderr``)."""
    sys.stderr.write(' '.join(str(part) for part in parts) + '\n')


def load_languages(csv_path='trainingData.csv'):
    """Return the sorted list of language names found in the training CSV.

    The first line is a header; every other line is ``filepath,language``.
    The position of a language in the returned list is its class id.
    """
    with open(csv_path) as csv_file:
        rows = csv_file.readlines()[1:]
    langs = set()
    for row in rows:
        if not row.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        _filepath, language = row.split(',')
        langs.add(language.strip())
    return sorted(langs)


def parse_list_line(line, labelled):
    """Split one list-file line into ``(name, label)``.

    Labelled lists hold ``name,label``; the unlabelled test list holds a file
    name whose 4-character extension is dropped, and ``label`` is ``None``.
    """
    if labelled:
        name, label = line.split(',')
        return name, int(label)
    return line.strip()[:-4], None


def rank_classes(scores):
    """Return class ids ordered from the highest to the lowest score.

    Ties are broken in favour of the larger class id, which is what sorting
    ``(score, id)`` pairs in reverse order does.
    """
    ordered = sorted(((score, idx) for idx, score in enumerate(scores)), reverse=True)
    return [idx for _score, idx in ordered]


def top3_score(ranked, label):
    """Return the points earned for ``label`` given the ranked class ids."""
    for points, class_id in zip(TOP3_POINTS, ranked):
        if class_id == label:
            return points
    return 0


def expected_score(score, processed):
    """Extrapolate the running score to 35200 recordings.

    The score is an integer here, so the division is forced to be a true
    division; on Python 2 ``score / processed`` would silently truncate.
    """
    if not processed:
        return 0.0
    return float(score) / processed * 35200


def main(argv=None):
    """Run the evaluation for the data set named by ``argv[1]``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: test_main_network.py {test|val|train}')
    predict_set = argv[1]

    # Any value other than "test" / "val" selects the training list, as before.
    folder, list_name, cnt, prefix = DATASETS.get(predict_set, DATASETS['train'])
    labelled = predict_set != 'test'

    # Imported here so the pure helpers above can be used without Caffe.
    import caffe

    caffe.set_mode_gpu()

    # info about classes
    langs = load_languages('trainingData.csv')

    net = caffe.Classifier(
        model_file='prototxt/deploy.' + deploy_name + '.prototxt',
        pretrained_file='models/' + network_name + '_iter_' + iterations + '.caffemodel')

    # The transformer is built from the blob shape *before* the reshape below;
    # keep this order, it is what the original script did.
    transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
    transformer.set_transpose('data', (2, 0, 1))
    net.blobs['data'].reshape(1, 1, 256, 858)

    out_path = 'predictions/' + prefix + network_name + '_iter_' + iterations + '.csv'

    topcoder_score = 0
    processed = 0

    # Both files are closed (and the predictions flushed) even on failure.
    with open(list_name) as list_file, open(out_path, 'w') as print_file:
        for index in range(cnt):
            st = list_file.readline()
            if not st.strip():
                # The list is shorter than the hard-coded count: stop cleanly.
                log('list file ended after %d of %d entries' % (index, cnt))
                break
            name, label = parse_list_line(st, labelled)
            processed += 1

            net.blobs['data'].data[...] = transformer.preprocess(
                'data', caffe.io.load_image(folder + name + '.png', color=False))
            out = net.forward()['loss'][0]

            ranked = rank_classes(out)
            if labelled:
                topcoder_score += top3_score(ranked, label)

            for place, lang_id in enumerate(ranked[:3]):
                print_file.write(name + '.mp3,' + langs[lang_id] + ',' + str(place + 1) + '\n')

            if index % 100 == 0:
                log("processed %d / %d images" % (index, cnt))
                log("score: ", topcoder_score)
                log("expected score:", expected_score(topcoder_score, processed))

    log("Final score: ", topcoder_score, " / ", cnt, "000")
    log("expected score:", expected_score(topcoder_score, processed))


if __name__ == '__main__':
    main()
"""Train or evaluate one of the Theano/Lasagne networks in ``networks/``.

Run as a script; see ``--help`` for the options.  In ``train`` mode every
epoch is followed by a validation pass and a state file is written to
``states/``.  In ``test`` mode the validation list is scored once and the
class probabilities are written to ``predictions/``.
"""
import sys
import numpy as np
import argparse
import time
import json
import importlib

try:
    # Only needed for the optional confusion matrix in do_epoch.
    import sklearn.metrics as metrics
except ImportError:
    metrics = None

DATA_ROOT = "/mnt/hdd615/Hrayr/Spoken-language-identification/"


def str2bool(value):
    """Parse a command-line boolean such as ``True`` / ``false`` / ``1`` / ``no``.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is True,
    so every non-empty value would enable the flag.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y'):
        return True
    if text in ('0', 'false', 'f', 'no', 'n', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean, got %r" % (value,))


def build_parser():
    """Return the argument parser for this script."""
    parser = argparse.ArgumentParser()

    # TODO: add argument to choose training set
    parser.add_argument('--network', type=str, default="network_batch", help='name of the module in networks/ that defines the Network class')
    parser.add_argument('--epochs', type=int, default=500, help='number of epochs to train')
    parser.add_argument('--load_state', type=str, default="", help='state file path')
    parser.add_argument('--mode', type=str, default="train", help='mode: train/test/test_on_train')
    parser.add_argument('--batch_size', type=int, default=32, help='number of examples per batch')
    parser.add_argument('--l2', type=float, default=0, help='L2 regularization')
    parser.add_argument('--log_every', type=int, default=100, help='print information every x iteration')
    parser.add_argument('--save_every', type=int, default=50000, help='save state every x iteration')
    parser.add_argument('--prefix', type=str, default="", help='optional prefix of network name')
    parser.add_argument('--dropout', type=float, default=0.0, help='dropout rate (between 0 and 1)')
    parser.add_argument('--no-batch_norm', dest="batch_norm", action='store_false', help='disable batch normalization')
    parser.add_argument('--rnn_num_units', type=int, default=500, help='number of hidden units if the network is RNN')
    # A bare "--equal_split" means True; "--equal_split False" now really is False.
    parser.add_argument('--equal_split', type=str2bool, nargs='?', const=True, default=False, help='use trainEqual.csv and valEqual.csv')
    parser.add_argument('--forward_cnt', type=int, default=1, help='if forward pass is nondeterministic, then how many forward passes are averaged')

    parser.set_defaults(batch_norm=True)
    return parser


def read_lists(equal_split):
    """Return ``(train_lines, validation_lines)`` read from the list files."""
    if equal_split:
        train_path = DATA_ROOT + "trainEqual.csv"
        test_path = DATA_ROOT + "valEqual.csv"
    else:
        train_path = DATA_ROOT + "trainingDataNew.csv"
        test_path = DATA_ROOT + "valDataNew.csv"

    with open(train_path, "r") as train_listfile:
        train_list_raw = train_listfile.readlines()
    with open(test_path, "r") as test_listfile:
        test_list_raw = test_listfile.readlines()
    return train_list_raw, test_list_raw


def do_epoch(mode, epoch, network, args):
    """Run one pass over the data and return the mean loss per batch.

    mode    -- 'train', 'test', 'predict' or 'predict_on_train'
    epoch   -- epoch number, used only in the progress line
    network -- object providing ``get_batches_per_epoch(mode)`` and
               ``step(batch_index, mode)``
    args    -- parsed arguments (batch_size, log_every, forward_cnt,
               equal_split and load_state are read)

    In 'predict' mode the per-example outputs are also written to a CSV file
    in ``predictions/``.  'predict_on_train' collects them but, as before,
    does not write them anywhere.
    """
    y_true = []
    y_pred = []
    avg_loss = 0.0
    prev_time = time.time()

    predicting = mode in ("predict", "predict_on_train")
    batches_per_epoch = network.get_batches_per_epoch(mode)
    all_prediction = []

    for i in range(0, batches_per_epoch):
        step_data = network.step(i, mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        log = step_data["log"]

        if predicting:
            # Average several forward passes (useful when the forward pass is
            # nondeterministic).  Never done in 'train' mode, where every
            # extra step would also apply a parameter update.
            for _pass_id in range(args.forward_cnt - 1):
                step_data = network.step(i, mode)
                prediction = prediction + step_data["prediction"]
                current_loss = current_loss + step_data["current_loss"]
            prediction = prediction / args.forward_cnt
            current_loss = current_loss / args.forward_cnt
            all_prediction.append(prediction)

        # Accumulate after averaging so avg_loss matches the reported loss.
        avg_loss += current_loss

        y_true.extend(answers)
        y_pred.extend(prediction.argmax(axis=1))

        if ((i + 1) % args.log_every == 0):
            cur_time = time.time()
            print(" %sing: %d.%d / %d \t loss: %3f \t avg_loss: %.5f \t %s \t time: %.2fs" %
                  (mode, epoch, (i + 1) * args.batch_size, batches_per_epoch * args.batch_size,
                   current_loss, avg_loss / (i + 1), log, cur_time - prev_time))
            prev_time = cur_time

    # To inspect errors: metrics.confusion_matrix(y_true, y_pred) (needs sklearn).
    accuracy = sum([1 if t == p else 0 for t, p in zip(y_true, y_pred)])
    print("accuracy: %.2f percent" % (accuracy * 100.0 / batches_per_epoch / args.batch_size))

    if (mode == "predict"):
        all_prediction = np.vstack(all_prediction)
        pred_filename = "predictions/" + ("equal_split." if args.equal_split else "") + \
            args.load_state[args.load_state.rfind('/')+1:] + ".csv"
        with open(pred_filename, 'w') as pred_csv:
            for row in all_prediction:
                pred_csv.write(",".join([("%.6f" % prob) for prob in row]) + "\n")

    return avg_loss / batches_per_epoch


def main(argv=None):
    """Parse the arguments, build the network and run the requested mode."""
    print("==> parsing input arguments")
    args = build_parser().parse_args(argv)
    print(args)

    train_list_raw, test_list_raw = read_lists(args.equal_split)
    print("==> %d training examples" % len(train_list_raw))
    print("==> %d validation examples" % len(test_list_raw))

    args_dict = dict(vars(args))
    args_dict['train_list_raw'] = train_list_raw
    args_dict['test_list_raw'] = test_list_raw
    args_dict['png_folder'] = DATA_ROOT + "train/png/"

    print("==> using network %s" % args.network)
    network_module = importlib.import_module("networks." + args.network)
    network = network_module.Network(**args_dict)

    network_name = args.prefix + '%s.bs%d%s%s' % (
        network.say_name(),
        args.batch_size,
        ".bn" if args.batch_norm else "",
        (".d" + str(args.dropout)) if args.dropout > 0 else "")

    print("==> network_name: %s" % network_name)

    start_epoch = 0
    if args.load_state != "":
        start_epoch = network.load_state(args.load_state) + 1

    if args.mode == 'train':
        print("==> training")
        for epoch in range(start_epoch, args.epochs):
            do_epoch('train', epoch, network, args)
            test_loss = do_epoch('test', epoch, network, args)
            state_name = 'states/%s.epoch%d.test%.5f.state' % (network_name, epoch, test_loss)
            print("==> saving ... %s" % state_name)
            network.save_params(state_name, epoch)

    elif args.mode == 'test':
        do_epoch('predict', 0, network, args)
    elif args.mode == 'test_on_train':
        do_epoch('predict_on_train', 0, network, args)
    else:
        raise Exception("unknown mode")


if __name__ == "__main__":
    main()
class BaseNetwork:
    """Shared plumbing for the networks in this package.

    Subclasses are expected to provide ``params`` (objects with
    ``get_value`` / ``set_value``), ``train_list_raw``, ``test_list_raw``,
    ``batch_size``, ``train_fn``, ``test_fn`` and ``read_batch``.
    """

    def say_name(self):
        """Return the name used in state-file names; subclasses override it."""
        return "unknown"


    def save_params(self, file_name, epoch, **kwargs):
        """Pickle the current parameter values and the epoch number.

        The file is opened in binary mode: the highest pickle protocol is a
        binary format, and text mode can corrupt it (newline translation) or
        fail outright on Python 3.
        """
        state = {
            'params' : [x.get_value() for x in self.params],
            'epoch' : epoch,
        }
        with open(file_name, 'wb') as save_file:
            pickle.dump(state, save_file, pickle.HIGHEST_PROTOCOL)


    def load_state(self, file_name):
        """Restore parameters written by ``save_params``; return the epoch.

        NOTE(review): unpickling executes code from the file, so only load
        state files from a trusted source.
        """
        print("==> loading state %s" % file_name)
        with open(file_name, 'rb') as load_file:
            state = pickle.load(load_file)
        # Values are matched to parameters by position; zip() stops at the
        # shorter of the two lists.
        for (x, y) in zip(self.params, state['params']):
            x.set_value(y)
        return state['epoch']


    def get_batches_per_epoch(self, mode):
        """Return the number of full batches in the list used by ``mode``.

        Floor division keeps the result an integer on Python 2 and 3 alike;
        a trailing partial batch is dropped.
        """
        if (mode == 'train' or mode == 'predict_on_train'):
            return len(self.train_list_raw) // self.batch_size
        elif (mode == 'test' or mode == 'predict'):
            return len(self.test_list_raw) // self.batch_size
        else:
            raise Exception("unknown mode")


    def step(self, batch_index, mode):
        """Run one batch and return a dict with the results.

        ``train`` reads from the training list and calls ``train_fn``;
        ``test`` / ``predict`` read from the validation list and
        ``predict_on_train`` from the training list, both calling ``test_fn``.
        The returned dict has the keys ``prediction``, ``answers``,
        ``current_loss`` and ``log``.  Raises Exception for any other mode.
        """
        if (mode == "train"):
            data, answers = self.read_batch(self.train_list_raw, batch_index)
            theano_fn = self.train_fn
        elif (mode == "test" or mode == "predict"):
            data, answers = self.read_batch(self.test_list_raw, batch_index)
            theano_fn = self.test_fn
        elif (mode == "predict_on_train"):
            data, answers = self.read_batch(self.train_list_raw, batch_index)
            theano_fn = self.test_fn
        else:
            raise Exception("unrecognized mode")

        ret = theano_fn(data, answers)
        return {"prediction": ret[0],
                "answers": answers,
                "current_loss": ret[1],
                "log": "",
                }
class Network(BaseNetwork):
    # Single-layer GRU classifier: input -> GRU (final state only) -> 176-way softmax.

    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2, mode, rnn_num_units, **kwargs):
        """Build the Theano graph and compile the train/test functions.

        train_list_raw, test_list_raw -- lists of "name,label" lines
        png_folder    -- prefix prepended to each name to locate its PNG
        batch_size    -- number of examples per batch
        l2            -- L2 weight; the penalty is added only when > 0
        mode          -- train_fn is compiled only when this is 'train'
        rnn_num_units -- number of GRU units
        kwargs        -- ignored (their names are printed)
        """

        # The message says "DMN class"; it lists the keyword arguments this class ignores.
        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        print "==> building network"
        # Random batch used only to print each layer's output shape below.
        example = np.random.uniform(size=(self.batch_size, 858, 256), low=0.0, high=1.0).astype(np.float32)
        # Random labels; not used anywhere else in this method.
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

        # InputLayer: (batch, 858, 256)
        network = layers.InputLayer(shape=(None, 858, 256), input_var=self.input_var)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # GRULayer: only the output of the last step is kept
        network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # The order of this list is what save_params/load_state rely on.
        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(network,
                                                                    lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        # Alternative optimizer that was tried:
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0005)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)

        # test_fn returns the same outputs but applies no parameter updates.
        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])


    def say_name(self):
        """Return the network name, including the number of GRU units."""
        return "rnn.GRU.num_units%d" % self.num_units


    def read_batch(self, data_raw, batch_index):
        """Load batch number batch_index from data_raw.

        Returns (data, answers): data is a float32 array of shape
        (batch_size, 858, 256), answers an int32 array of labels.
        """

        start_index = batch_index * self.batch_size
        end_index = start_index + self.batch_size

        data = np.zeros((self.batch_size, 858, 256), dtype=np.float32)
        answers = []

        for i in range(start_index, end_index):
            # Each line is "name,label".
            answers.append(int(data_raw[i].split(',')[1]))
            name = data_raw[i].split(',')[0]
            path = self.png_folder + name + ".png"
            im = Image.open(path)
            # Pixel values are divided by 256.0 and the image is transposed
            # (assumes a 256x858 single-channel PNG -- TODO confirm).
            data[i - start_index, :, :] = np.transpose(np.array(im).astype(np.float32) / 256.0)

        answers = np.array(answers, dtype=np.int32)
        return data, answers
class Network(BaseNetwork):
    # Two stacked GRU layers (optional batch normalization in between)
    # followed by a 176-way softmax.

    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2, mode, rnn_num_units, batch_norm, **kwargs):
        """Build the Theano graph and compile the train/test functions.

        train_list_raw, test_list_raw -- lists of "name,label" lines
        png_folder    -- prefix prepended to each name to locate its PNG
        batch_size    -- number of examples per batch
        l2            -- L2 weight; the penalty is added only when > 0
        mode          -- train_fn is compiled only when this is 'train'
        rnn_num_units -- number of units in each GRU layer
        batch_norm    -- insert a BatchNormLayer between the two GRU layers
        kwargs        -- ignored (their names are printed)
        """

        # The message says "DMN class"; it lists the keyword arguments this class ignores.
        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units
        self.batch_norm = batch_norm

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        # scale inputs to be in [-1, 1]
        input_var_norm = 2 * self.input_var - 1

        print "==> building network"
        # Random batch used only to print each layer's output shape below.
        example = np.random.uniform(size=(self.batch_size, 858, 256), low=0.0, high=1.0).astype(np.float32)
        # Random labels; not used anywhere else in this method.
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

        # InputLayer: fed with the rescaled input, shape (batch, 858, 256)
        network = layers.InputLayer(shape=(None, 858, 256), input_var=input_var_norm)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # GRULayer: returns the whole output sequence
        network = layers.GRULayer(incoming=network, num_units=self.num_units)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
            print layers.get_output(network).eval({self.input_var:example}).shape

        # GRULayer: only the output of the last step is kept
        network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # The order of this list is what save_params/load_state rely on.
        self.params = layers.get_all_params(network, trainable=True)
        # NOTE(review): the same (non-deterministic) output expression is used
        # for both train_fn and test_fn -- confirm this is intended.
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(network,
                                                                    lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        # Alternative optimizer that was tried:
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)

        # test_fn returns the same outputs but applies no parameter updates.
        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])


    def say_name(self):
        """Return the network name, including the number of GRU units."""
        return "rnn_2layers.GRU.num_units%d" % self.num_units


    def read_batch(self, data_raw, batch_index):
        """Load batch number batch_index from data_raw.

        Returns (data, answers): data is a float32 array of shape
        (batch_size, 858, 256), answers an int32 array of labels.
        """

        start_index = batch_index * self.batch_size
        end_index = start_index + self.batch_size

        data = np.zeros((self.batch_size, 858, 256), dtype=np.float32)
        answers = []

        for i in range(start_index, end_index):
            # Each line is "name,label".
            answers.append(int(data_raw[i].split(',')[1]))
            name = data_raw[i].split(',')[0]
            path = self.png_folder + name + ".png"
            im = Image.open(path)
            # Pixel values are divided by 256.0 and the image is transposed
            # (assumes a 256x858 single-channel PNG -- TODO confirm).
            data[i - start_index, :, :] = np.transpose(np.array(im).astype(np.float32) / 256.0)

        answers = np.array(answers, dtype=np.int32)
        return data, answers
class Network(BaseNetwork):
    # Two stacked GRU layers on inputs of width 128 (optional batch
    # normalization after each GRU) followed by a 176-way softmax.

    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2, mode, rnn_num_units, batch_norm, **kwargs):
        """Build the Theano graph and compile the train/test functions.

        train_list_raw, test_list_raw -- lists of "name,label" lines
        png_folder    -- prefix prepended to each name to locate its PNG
        batch_size    -- number of examples per batch
        l2            -- L2 weight; the penalty is added only when > 0
        mode          -- train_fn is compiled only when this is 'train'
        rnn_num_units -- number of units in each GRU layer
        batch_norm    -- insert a BatchNormLayer after each GRU layer
        kwargs        -- ignored (their names are printed)
        """

        print "==> not used params in network class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units
        self.batch_norm = batch_norm

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        # scale inputs to be in [-1, 1]
        input_var_norm = 2 * self.input_var - 1

        print "==> building network"
        # Random batch used only to print each layer's output shape below.
        example = np.random.uniform(size=(self.batch_size, 858, 128), low=0.0, high=1.0).astype(np.float32)
        # Random labels; not used anywhere else in this method.
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

        # InputLayer: fed with the rescaled input, shape (batch, 858, 128)
        network = layers.InputLayer(shape=(None, 858, 128), input_var=input_var_norm)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # GRULayer: returns the whole output sequence
        network = layers.GRULayer(incoming=network, num_units=self.num_units)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
            print layers.get_output(network).eval({self.input_var:example}).shape

        # GRULayer: only the output of the last step is kept
        network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # BatchNormalization Layer
        # There are some states, where this layer was disabled
        # (i.e. some saved state files were produced without it)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
            print layers.get_output(network).eval({self.input_var:example}).shape

        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # The order of this list is what save_params/load_state rely on.
        self.params = layers.get_all_params(network, trainable=True)
        # NOTE(review): the same (non-deterministic) output expression is used
        # for both train_fn and test_fn -- confirm this is intended.
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(network,
                                                                    lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)

        # test_fn returns the same outputs but applies no parameter updates.
        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])


    def say_name(self):
        """Return the network name, including the number of GRU units."""
        return "rnn_2layers_5khz.GRU.num_units%d" % self.num_units


    def read_batch(self, data_raw, batch_index):
        """Load batch number batch_index from data_raw.

        Returns (data, answers): data is a float32 array of shape
        (batch_size, 858, 128), answers an int32 array of labels.
        """

        start_index = batch_index * self.batch_size
        end_index = start_index + self.batch_size

        data = np.zeros((self.batch_size, 858, 128), dtype=np.float32)
        answers = []

        for i in range(start_index, end_index):
            # Each line is "name,label".
            answers.append(int(data_raw[i].split(',')[1]))
            name = data_raw[i].split(',')[0]
            path = self.png_folder + name + ".png"
            im = Image.open(path)
            # Pixel values are divided by 256.0, the image is transposed and
            # only the first 128 columns of the result are kept (presumably
            # the band the "5khz" in the name refers to -- TODO confirm).
            data[i - start_index, :, :] = np.transpose(np.array(im).astype(np.float32) / 256.0)[:, :128]

        answers = np.array(answers, dtype=np.int32)
        return data, answers
Network(BaseNetwork): 18 | 19 | def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, **kwargs): 20 | 21 | print "==> not used params in DMN class:", kwargs.keys() 22 | self.train_list_raw = train_list_raw 23 | self.test_list_raw = test_list_raw 24 | self.png_folder = png_folder 25 | self.batch_size = batch_size 26 | self.dropout = dropout 27 | self.l2 = l2 28 | self.mode = mode 29 | self.batch_norm = batch_norm 30 | 31 | self.input_var = T.tensor4('input_var') 32 | self.answer_var = T.ivector('answer_var') 33 | 34 | print "==> building network" 35 | example = np.random.uniform(size=(self.batch_size, 1, 256, 858), low=0.0, high=1.0).astype(np.float32) ######### 36 | answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) ######### 37 | 38 | network = layers.InputLayer(shape=(None, 1, 256, 858), input_var=self.input_var) 39 | print layers.get_output(network).eval({self.input_var:example}).shape 40 | 41 | # CONV-RELU-POOL 1 42 | network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 43 | stride=1, nonlinearity=rectify) 44 | print layers.get_output(network).eval({self.input_var:example}).shape 45 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 46 | print layers.get_output(network).eval({self.input_var:example}).shape 47 | if (self.batch_norm): 48 | network = layers.BatchNormLayer(incoming=network) 49 | 50 | # CONV-RELU-POOL 2 51 | network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 52 | stride=1, nonlinearity=rectify) 53 | print layers.get_output(network).eval({self.input_var:example}).shape 54 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 55 | print layers.get_output(network).eval({self.input_var:example}).shape 56 | if (self.batch_norm): 57 | network = layers.BatchNormLayer(incoming=network) 58 | 59 | 60 | # CONV-RELU-POOL 3 61 | network = 
layers.Conv2DLayer(incoming=network, num_filters=64, filter_size=(3, 3), 62 | stride=1, nonlinearity=rectify) 63 | print layers.get_output(network).eval({self.input_var:example}).shape 64 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 65 | print layers.get_output(network).eval({self.input_var:example}).shape 66 | if (self.batch_norm): 67 | network = layers.BatchNormLayer(incoming=network) 68 | 69 | # CONV-RELU-POOL 4 70 | network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), 71 | stride=1, nonlinearity=rectify) 72 | print layers.get_output(network).eval({self.input_var:example}).shape 73 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 74 | print layers.get_output(network).eval({self.input_var:example}).shape 75 | if (self.batch_norm): 76 | network = layers.BatchNormLayer(incoming=network) 77 | 78 | # CONV-RELU-POOL 5 79 | network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), 80 | stride=1, nonlinearity=rectify) 81 | print layers.get_output(network).eval({self.input_var:example}).shape 82 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 83 | print layers.get_output(network).eval({self.input_var:example}).shape 84 | if (self.batch_norm): 85 | network = layers.BatchNormLayer(incoming=network) 86 | 87 | # CONV-RELU-POOL 6 88 | network = layers.Conv2DLayer(incoming=network, num_filters=256, filter_size=(3, 3), 89 | stride=1, nonlinearity=rectify) 90 | print layers.get_output(network).eval({self.input_var:example}).shape 91 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(3, 2), ignore_border=False) 92 | print layers.get_output(network).eval({self.input_var:example}).shape 93 | if (self.batch_norm): 94 | network = layers.BatchNormLayer(incoming=network) 95 | 96 | # DENSE 1 97 | network = layers.DenseLayer(incoming=network, 
num_units=1024, nonlinearity=rectify) 98 | if (self.batch_norm): 99 | network = layers.BatchNormLayer(incoming=network) 100 | if (self.dropout > 0): 101 | network = layers.dropout(network, self.dropout) 102 | print layers.get_output(network).eval({self.input_var:example}).shape 103 | 104 | """ 105 | # DENSE 2 106 | network = layers.DenseLayer(incoming=network, num_units=1024, nonlinearity=rectify) 107 | if (self.batch_norm): 108 | network = layers.BatchNormLayer(incoming=network) 109 | if (self.dropout > 0): 110 | network = layers.dropout(network, self.dropout) 111 | print layers.get_output(network).eval({self.input_var:example}).shape 112 | """ 113 | 114 | # Last layer: classification 115 | network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) 116 | print layers.get_output(network).eval({self.input_var:example}).shape 117 | 118 | 119 | self.params = layers.get_all_params(network, trainable=True) 120 | self.prediction = layers.get_output(network) 121 | 122 | self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean() 123 | if (self.l2 > 0): 124 | self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(network, 125 | lasagne.regularization.l2) 126 | else: 127 | self.loss_l2 = 0 128 | self.loss = self.loss_ce + self.loss_l2 129 | 130 | #updates = lasagne.updates.adadelta(self.loss, self.params) 131 | updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) 132 | 133 | if self.mode == 'train': 134 | print "==> compiling train_fn" 135 | self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 136 | outputs=[self.prediction, self.loss], 137 | updates=updates) 138 | 139 | print "==> compiling test_fn" 140 | self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], 141 | outputs=[self.prediction, self.loss]) 142 | 143 | 144 | def say_name(self): 145 | return "tc_net" 146 | 147 | 148 | def read_batch(self, data_raw, batch_index): 149 
class Network(BaseNetwork):
    """Convolutional front end followed by a shared two-layer GRU stack.

    Four CONV-RELU-POOL stages turn a (batch, 1, 128, 858) input into 32
    feature maps.  Every feature map is then read by the same pair of stacked
    GRU layers (weights are created for channel 0 and reused by all other
    channels); the 32 final GRU states are concatenated and classified by a
    176-way softmax layer.
    """

    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
        """Build the symbolic graph and compile ``train_fn`` / ``test_fn``.

        Args:
            train_list_raw: stored as-is on the instance.
            test_list_raw: stored as-is on the instance.
            png_folder: string prefix prepended to "<name>.png" in read_batch.
            batch_size: number of samples per batch.
            dropout: dropout probability; the dropout layer is only inserted
                when this is greater than zero.
            l2: coefficient of the L2 penalty; the penalty is only added when
                this is greater than zero.
            mode: when equal to 'train', ``train_fn`` is compiled as well.
            batch_norm: truthy to insert BatchNormLayer after every pooling
                layer, between the two GRU layers and before the classifier.
            rnn_num_units: number of units of every GRU layer.
            **kwargs: ignored; the keys are printed for information only.
        """
        print("==> not used params in DMN class:" + " " + str(kwargs.keys()))
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units

        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')

        print("==> building network")
        # Random batch that is only used to evaluate and print layer shapes.
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32)
        # Never read afterwards; retained because the call advances numpy's
        # global random state exactly as the previous code did.
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

        def print_shape(expression):
            # Evaluate a symbolic expression on the example batch, print its shape.
            print(expression.eval({self.input_var: example}).shape)

        network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
        print_shape(layers.get_output(network))

        # CONV-RELU-POOL 1..4, given as (num_filters, square filter size).
        for num_filters, size in [(16, 7), (32, 5), (32, 3), (32, 3)]:
            network = layers.Conv2DLayer(incoming=network, num_filters=num_filters, filter_size=(size, size),
                                         stride=1, nonlinearity=rectify)
            print_shape(layers.get_output(network))
            network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
            print_shape(layers.get_output(network))
            if self.batch_norm:
                network = layers.BatchNormLayer(incoming=network)

        self.params = layers.get_all_params(network, trainable=True)

        output = layers.get_output(network)
        num_channels = 32
        filter_W = 54
        filter_H = 8
        # NOTE: these constants are the shape of the last pooling layer.  They
        # could be symbolic, but explicit values are better for optimizations.

        def shared_gate(owner, gate_name, nonlinearity):
            # Gate that reuses the weights of gate `gate_name` of GRU layer `owner`.
            return layers.Gate(W_in=getattr(owner, 'W_in_to_' + gate_name),
                               W_hid=getattr(owner, 'W_hid_to_' + gate_name),
                               W_cell=None,
                               b=getattr(owner, 'b_' + gate_name),
                               nonlinearity=nonlinearity)

        def shared_gru(incoming, owner, only_return_final):
            # GRU layer whose three gates reuse the weights of GRU layer `owner`.
            #
            # Gate() defaults to a sigmoid nonlinearity while GRULayer's own
            # default hidden_update gate uses tanh.  The previous code did not
            # pass a nonlinearity, so channels 1..31 used sigmoid for the
            # candidate state and differed from channel 0.  tanh is now passed
            # explicitly so every channel computes the same recurrence.
            # WARNING: this changes the network output compared to the old
            # graph; weights trained with the old graph will score differently.
            return layers.GRULayer(incoming=incoming, num_units=self.num_units,
                                   only_return_final=only_return_final,
                                   resetgate=shared_gate(owner, 'resetgate', sigmoid),
                                   updategate=shared_gate(owner, 'updategate', sigmoid),
                                   hidden_update=shared_gate(owner, 'hidden_update', tanh))

        first_gru = None   # lower GRU layer of channel 0, owns the shared weights
        second_gru = None  # upper GRU layer of channel 0, owns the shared weights
        rnn_network_outputs = []

        for channel_index in range(num_channels):
            # (batch, height, width) -> (batch, width, height): the GRU steps
            # along the width axis and sees one column of height values per step.
            rnn_input_var = output[:, channel_index, :, :].transpose((0, 2, 1))

            # InputLayer
            network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var)

            if channel_index == 0:
                # GRULayer that returns the whole sequence
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False)
                first_gru = network

                # BatchNormalization Layer
                if self.batch_norm:
                    network = layers.BatchNormLayer(incoming=network)

                # GRULayer that returns only the last state
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
                second_gru = network

                # add params (only once, the other channels reuse them)
                self.params += layers.get_all_params(network, trainable=True)
            else:
                # GRULayer, but shared
                network = shared_gru(network, first_gru, False)

                # BatchNormalization Layer
                # NOTE(review): this layer is created per channel and its
                # parameters are not appended to self.params, so it looks like
                # only the channel-0 batch-norm parameters are updated by the
                # optimizer -- confirm whether that is intended.
                if self.batch_norm:
                    network = layers.BatchNormLayer(incoming=network)

                # GRULayer, but shared
                network = shared_gru(network, second_gru, True)

            rnn_network_outputs.append(layers.get_output(network))

        all_output_var = T.concatenate(rnn_network_outputs, axis=1)
        print_shape(all_output_var)

        # InputLayer
        network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var)

        # Dropout Layer
        if self.dropout > 0:
            network = layers.dropout(network, self.dropout)

        # BatchNormalization Layer
        if self.batch_norm:
            network = layers.BatchNormLayer(incoming=network)

        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print_shape(layers.get_output(network))

        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if self.l2 > 0:
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                          lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        if self.mode == 'train':
            # The update rules are only needed by train_fn, so they are built here.
            updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)
            print("==> compiling train_fn")
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)

        print("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])

    def say_name(self):
        """Return the identifier of this architecture, including the GRU size."""
        return "tc_net_deeprnn.4conv.pad.GRU.shared.num_units%d.5khz" % self.num_units

    def read_batch(self, data_raw, batch_index):
        """Load batch number ``batch_index`` from ``data_raw``.

        Every used entry of ``data_raw`` is a string "<name>,<label>[,...]".
        The image ``self.png_folder + name + ".png"`` is opened, its first 128
        rows are kept and the values are divided by 256.0.

        Returns:
            (data, answers): float32 array of shape (batch_size, 1, 128, 858)
            and int32 array of shape (batch_size,).

        Raises:
            IndexError: if ``data_raw`` holds fewer than
                ``(batch_index + 1) * batch_size`` entries or an entry has no
                comma.
        """
        start_index = batch_index * self.batch_size
        end_index = start_index + self.batch_size

        data = np.zeros((self.batch_size, 1, 128, 858), dtype=np.float32)
        answers = []

        for i in range(start_index, end_index):
            fields = data_raw[i].split(',')
            answers.append(int(fields[1]))
            path = self.png_folder + fields[0] + ".png"
            im = Image.open(path)
            # assumes the PNG decodes to a 2-D array with 858 columns and at
            # least 128 rows -- TODO confirm against the spectrogram generator
            data[i - start_index, 0, :, :] = np.array(im).astype(np.float32)[:128, :] / 256.0

        answers = np.array(answers, dtype=np.int32)
        return data, answers
input_var=self.input_var) 39 | print layers.get_output(network).eval({self.input_var:example}).shape 40 | 41 | 42 | # NOTE: replace pad=2 with ignore_border=False 43 | # CONV-RELU-POOL 1 44 | network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 45 | stride=1, nonlinearity=rectify) 46 | print layers.get_output(network).eval({self.input_var:example}).shape 47 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) 48 | print layers.get_output(network).eval({self.input_var:example}).shape 49 | if (self.batch_norm): 50 | network = layers.BatchNormLayer(incoming=network) 51 | 52 | # CONV-RELU-POOL 2 53 | network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 54 | stride=1, nonlinearity=rectify) 55 | print layers.get_output(network).eval({self.input_var:example}).shape 56 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) 57 | print layers.get_output(network).eval({self.input_var:example}).shape 58 | if (self.batch_norm): 59 | network = layers.BatchNormLayer(incoming=network) 60 | 61 | 62 | # CONV-RELU-POOL 3 63 | network = layers.Conv2DLayer(incoming=network, num_filters=64, filter_size=(3, 3), 64 | stride=1, nonlinearity=rectify) 65 | print layers.get_output(network).eval({self.input_var:example}).shape 66 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) 67 | print layers.get_output(network).eval({self.input_var:example}).shape 68 | if (self.batch_norm): 69 | network = layers.BatchNormLayer(incoming=network) 70 | 71 | # CONV-RELU-POOL 4 72 | network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), 73 | stride=1, nonlinearity=rectify) 74 | print layers.get_output(network).eval({self.input_var:example}).shape 75 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) 76 | print layers.get_output(network).eval({self.input_var:example}).shape 77 | if 
(self.batch_norm): 78 | network = layers.BatchNormLayer(incoming=network) 79 | 80 | # CONV-RELU-POOL 5 81 | network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), 82 | stride=1, nonlinearity=rectify) 83 | print layers.get_output(network).eval({self.input_var:example}).shape 84 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) 85 | print layers.get_output(network).eval({self.input_var:example}).shape 86 | if (self.batch_norm): 87 | network = layers.BatchNormLayer(incoming=network) 88 | 89 | # CONV-RELU-POOL 6 90 | network = layers.Conv2DLayer(incoming=network, num_filters=256, filter_size=(3, 3), 91 | stride=1, nonlinearity=rectify) 92 | print layers.get_output(network).eval({self.input_var:example}).shape 93 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(3, 2), pad=2) 94 | print layers.get_output(network).eval({self.input_var:example}).shape 95 | if (self.batch_norm): 96 | network = layers.BatchNormLayer(incoming=network) 97 | 98 | # DENSE 1 99 | network = layers.DenseLayer(incoming=network, num_units=1024, nonlinearity=rectify) 100 | if (self.batch_norm): 101 | network = layers.BatchNormLayer(incoming=network) 102 | if (self.dropout > 0): 103 | network = layers.dropout(network, self.dropout) 104 | print layers.get_output(network).eval({self.input_var:example}).shape 105 | 106 | 107 | # Last layer: classification 108 | network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) 109 | print layers.get_output(network).eval({self.input_var:example}).shape 110 | 111 | 112 | self.params = layers.get_all_params(network, trainable=True) 113 | self.prediction = layers.get_output(network) 114 | 115 | print "==> param shapes", [x.eval().shape for x in self.params] 116 | 117 | self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean() 118 | if (self.l2 > 0): 119 | self.loss_l2 = self.l2 * 
lasagne.regularization.regularize_network_params(network, 120 | lasagne.regularization.l2) 121 | else: 122 | self.loss_l2 = 0 123 | self.loss = self.loss_ce + self.loss_l2 124 | 125 | #updates = lasagne.updates.adadelta(self.loss, self.params) 126 | updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) 127 | 128 | if self.mode == 'train': 129 | print "==> compiling train_fn" 130 | self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 131 | outputs=[self.prediction, self.loss], 132 | updates=updates) 133 | 134 | print "==> compiling test_fn" 135 | self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], 136 | outputs=[self.prediction, self.loss]) 137 | 138 | 139 | def say_name(self): 140 | return "tc_net_mod" 141 | 142 | 143 | def read_batch(self, data_raw, batch_index): 144 | 145 | start_index = batch_index * self.batch_size 146 | end_index = start_index + self.batch_size 147 | 148 | data = np.zeros((self.batch_size, 1, 256, 858), dtype=np.float32) 149 | answers = [] 150 | 151 | for i in range(start_index, end_index): 152 | answers.append(int(data_raw[i].split(',')[1])) 153 | name = data_raw[i].split(',')[0] 154 | path = self.png_folder + name + ".png" 155 | im = Image.open(path) 156 | data[i - start_index, 0, :, :] = np.array(im).astype(np.float32) / 256.0 157 | 158 | answers = np.array(answers, dtype=np.int32) 159 | return data, answers 160 | 161 | 162 | -------------------------------------------------------------------------------- /theano/networks/tc_net_mod_5khz_small.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as T 6 | 7 | import lasagne 8 | from lasagne import layers 9 | from lasagne.nonlinearities import rectify, softmax, sigmoid, tanh 10 | 11 | import PIL.Image as Image 12 | from base_network import BaseNetwork 13 | 14 | floatX = theano.config.floatX 15 | 16 | 17 | 
class Network(BaseNetwork):
    """Small purely convolutional classifier for (batch, 1, 128, 858) inputs.

    Five CONV-RELU-POOL stages are followed by one 256-unit dense layer and a
    176-way softmax layer.  Both compiled functions use the non-deterministic
    network output.
    """

    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, **kwargs):
        """Create the layers, the losses and the compiled Theano functions.

        Args:
            train_list_raw: kept on the instance unchanged.
            test_list_raw: kept on the instance unchanged.
            png_folder: string prefix put in front of "<name>.png" by read_batch.
            batch_size: number of samples in one batch.
            dropout: dropout probability, used only when greater than zero.
            l2: weight of the L2 regularisation term, used only when greater
                than zero.
            mode: 'train' additionally compiles ``train_fn``.
            batch_norm: truthy to add a BatchNormLayer after each pooling layer
                and after the dense layer.
            **kwargs: not used; their keys are printed.
        """
        print("==> not used params in DMN class:" + " " + str(kwargs.keys()))
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm

        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')

        print("==> building network")
        input_shape = (self.batch_size, 1, 128, 858)
        # Random inputs used solely for printing the output shape of the layers.
        probe = np.random.uniform(size=input_shape, low=0.0, high=1.0).astype(np.float32)
        # Drawn but never read; the draw itself is kept as in the original code.
        unused_labels = np.random.randint(low=0, high=176, size=(self.batch_size,))

        def report(layer):
            # Print the shape this layer produces for the probe batch.
            print(layers.get_output(layer).eval({self.input_var: probe}).shape)

        net = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
        report(net)

        # CONV-RELU-POOL 1 to 5: number of filters and side of the square filter
        stages = ((16, 7), (32, 5), (32, 3), (64, 3), (64, 3))
        for filters, side in stages:
            net = layers.Conv2DLayer(incoming=net, num_filters=filters, filter_size=(side, side),
                                     stride=1, nonlinearity=rectify)
            report(net)
            net = layers.MaxPool2DLayer(incoming=net, pool_size=(3, 3), stride=2, pad=2)
            report(net)
            if self.batch_norm:
                net = layers.BatchNormLayer(incoming=net)

        # DENSE 1
        net = layers.DenseLayer(incoming=net, num_units=256, nonlinearity=rectify)
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0:
            net = layers.dropout(net, self.dropout)
        report(net)

        # Last layer: classification
        net = layers.DenseLayer(incoming=net, num_units=176, nonlinearity=softmax)
        report(net)

        self.params = layers.get_all_params(net, trainable=True)
        self.prediction = layers.get_output(net)
        self.test_prediction = layers.get_output(net, deterministic=True)

        print("==> param shapes" + " " + str([param.eval().shape for param in self.params]))

        # The L2 term does not depend on the prediction, so it is built once.
        if self.l2 > 0:
            weight_penalty = self.l2 * lasagne.regularization.regularize_network_params(
                net, lasagne.regularization.l2)
        else:
            weight_penalty = 0

        def total_loss(probabilities):
            # Mean cross-entropy against answer_var plus the L2 term.
            cross_entropy = lasagne.objectives.categorical_crossentropy(probabilities, self.answer_var)
            return cross_entropy.mean() + weight_penalty

        self.loss = total_loss(self.prediction)
        self.test_loss = total_loss(self.test_prediction)

        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

        fn_inputs = [self.input_var, self.answer_var]
        if self.mode == 'train':
            print("==> compiling train_fn")
            self.train_fn = theano.function(inputs=fn_inputs,
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)

        print("==> compiling test_fn")
        # test_fn deliberately returns the same non-deterministic outputs as
        # train_fn; the deterministic pair (test_prediction, test_loss) is
        # stored on the instance but is not compiled into a function here.
        self.test_fn = theano.function(inputs=fn_inputs,
                                       outputs=[self.prediction, self.loss])

    def say_name(self):
        """Return the fixed identifier of this architecture."""
        return "tc_net_mod_5khz_small"

    def read_batch(self, data_raw, batch_index):
        """Read the images and labels that make up one batch.

        Entries ``batch_index * batch_size`` up to (not including)
        ``(batch_index + 1) * batch_size`` of ``data_raw`` are used.  Each is a
        string "<name>,<label>[,...]"; the image
        ``self.png_folder + name + ".png"`` is loaded, cut to its first 128
        rows and divided by 256.0.

        Returns:
            A float32 array of shape (batch_size, 1, 128, 858) and an int32
            array with the batch_size labels.
        """
        first = batch_index * self.batch_size

        images = np.zeros((self.batch_size, 1, 128, 858), dtype=np.float32)
        labels = []

        for slot in range(self.batch_size):
            record = data_raw[first + slot]
            labels.append(int(record.split(',')[1]))
            png_path = self.png_folder + record.split(',')[0] + ".png"
            picture = Image.open(png_path)
            pixels = np.array(picture).astype(np.float32)
            images[slot, 0, :, :] = pixels[:128, :] / 256.0

        return images, np.array(labels, dtype=np.int32)
layers.get_output(network).eval({self.input_var:example}).shape 46 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 47 | print layers.get_output(network).eval({self.input_var:example}).shape 48 | if (self.batch_norm): 49 | network = layers.BatchNormLayer(incoming=network) 50 | 51 | # CONV-RELU-POOL 2 52 | network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 53 | stride=1, nonlinearity=rectify) 54 | print layers.get_output(network).eval({self.input_var:example}).shape 55 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 56 | print layers.get_output(network).eval({self.input_var:example}).shape 57 | if (self.batch_norm): 58 | network = layers.BatchNormLayer(incoming=network) 59 | 60 | 61 | # CONV-RELU-POOL 3 62 | network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 63 | stride=1, nonlinearity=rectify) 64 | print layers.get_output(network).eval({self.input_var:example}).shape 65 | network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False) 66 | print layers.get_output(network).eval({self.input_var:example}).shape 67 | if (self.batch_norm): 68 | network = layers.BatchNormLayer(incoming=network) 69 | 70 | 71 | self.params = layers.get_all_params(network, trainable=True) 72 | 73 | output = layers.get_output(network) 74 | num_channels = 32 75 | filter_W = 104 76 | filter_H = 13 77 | # NOTE: these constants are shapes of last pool layer, it can be symbolic 78 | # explicit values are better for optimizations 79 | 80 | channels = [] 81 | for channel_index in range(num_channels): 82 | channels.append(output[:, channel_index, :, :].transpose((0, 2, 1))) 83 | 84 | rnn_network_outputs = [] 85 | for channel_index in range(num_channels): 86 | rnn_input_var = channels[channel_index] 87 | 88 | # InputLayer 89 | network = layers.InputLayer(shape=(None, filter_W, filter_H), 
input_var=rnn_input_var) 90 | 91 | # GRULayer 92 | network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) 93 | 94 | # BatchNormalization Layer 95 | if (self.batch_norm): 96 | network = layers.BatchNormLayer(incoming=network) 97 | 98 | # add params 99 | self.params += layers.get_all_params(network, trainable=True) 100 | 101 | rnn_network_outputs.append(layers.get_output(network)) 102 | 103 | all_output_var = T.concatenate(rnn_network_outputs, axis=1) 104 | print all_output_var.eval({self.input_var:example}).shape 105 | 106 | # InputLayer 107 | network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var) 108 | 109 | # DENSE 1 110 | network = layers.DenseLayer(incoming=network, num_units=512, nonlinearity=rectify) 111 | if (self.batch_norm): 112 | network = layers.BatchNormLayer(incoming=network) 113 | if (self.dropout > 0): 114 | network = layers.dropout(network, self.dropout) 115 | print layers.get_output(network).eval({self.input_var:example}).shape 116 | 117 | 118 | # Last layer: classification 119 | network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) 120 | print layers.get_output(network).eval({self.input_var:example}).shape 121 | 122 | 123 | self.params += layers.get_all_params(network, trainable=True) 124 | self.prediction = layers.get_output(network) 125 | 126 | #print "==> param shapes", [x.eval().shape for x in self.params] 127 | 128 | self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean() 129 | if (self.l2 > 0): 130 | self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, 131 | lasagne.regularization.l2) 132 | else: 133 | self.loss_l2 = 0 134 | self.loss = self.loss_ce + self.loss_l2 135 | 136 | #updates = lasagne.updates.adadelta(self.loss, self.params) 137 | updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) 138 | 139 | if self.mode == 'train': 140 | print 
"==> compiling train_fn" 141 | self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 142 | outputs=[self.prediction, self.loss], 143 | updates=updates) 144 | 145 | print "==> compiling test_fn" 146 | self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], 147 | outputs=[self.prediction, self.loss]) 148 | 149 | 150 | def say_name(self): 151 | return "tc_net_rnn.GRU.3conv.num_units%d.5khz" % self.num_units 152 | 153 | 154 | def read_batch(self, data_raw, batch_index): 155 | 156 | start_index = batch_index * self.batch_size 157 | end_index = start_index + self.batch_size 158 | 159 | data = np.zeros((self.batch_size, 1, 128, 858), dtype=np.float32) 160 | answers = [] 161 | 162 | for i in range(start_index, end_index): 163 | answers.append(int(data_raw[i].split(',')[1])) 164 | name = data_raw[i].split(',')[0] 165 | path = self.png_folder + name + ".png" 166 | im = Image.open(path) 167 | data[i - start_index, 0, :, :] = np.array(im).astype(np.float32)[:128, :] / 256.0 168 | 169 | answers = np.array(answers, dtype=np.int32) 170 | return data, answers 171 | 172 | 173 | -------------------------------------------------------------------------------- /theano/networks/tc_net_rnn_nodense.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as T 6 | 7 | import lasagne 8 | from lasagne import layers 9 | from lasagne.nonlinearities import rectify, softmax, sigmoid, tanh 10 | 11 | import PIL.Image as Image 12 | from base_network import BaseNetwork 13 | 14 | floatX = theano.config.floatX 15 | 16 | 17 | class Network(BaseNetwork): 18 | 19 | def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs): 20 | 21 | print "==> not used params in DMN class:", kwargs.keys() 22 | self.train_list_raw = train_list_raw 23 | self.test_list_raw = test_list_raw 24 | 
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
    """Build the 3-conv / per-channel-GRU classifier and compile its functions.

    Three conv + max-pool stages are followed by 32 independent GRU layers,
    one per feature map of the last pooling stage.  Their final states are
    concatenated and fed directly to a 176-way softmax layer (this variant
    has no intermediate dense layer).

    Args:
        train_list_raw, test_list_raw, png_folder: stored on the instance.
        batch_size: stored; also sizes the random example batch that is
            pushed through the graph to print intermediate output shapes.
        dropout: stored; no dropout layer is created in this variant.
        l2: coefficient of the L2 penalty added to the loss when ``> 0``.
        mode: ``'train'`` additionally compiles ``self.train_fn``.
        batch_norm: when truthy, a BatchNormLayer follows every pooling
            stage and every GRU.
        rnn_num_units: number of units of each GRU layer.
        **kwargs: ignored, only listed in the start-up message.
    """
    print(" ".join(["==> not used params in DMN class:", str(kwargs.keys())]))
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print("==> building network")
    input_shape = (1, 128, 858)
    example = np.random.uniform(size=(self.batch_size,) + input_shape, low=0.0, high=1.0).astype(np.float32)
    # Never read afterwards, but the draw advances NumPy's global random
    # state, so it is kept to leave every later random draw untouched.
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

    def show_shape(expression):
        # Evaluate a symbolic expression on the example batch, print its shape.
        print(expression.eval({self.input_var: example}).shape)

    net = layers.InputLayer(shape=(None,) + input_shape, input_var=self.input_var)
    show_shape(layers.get_output(net))

    # CONV-RELU-POOL stages 1..3: (number of filters, filter size)
    for n_filters, kernel in ((16, (7, 7)), (32, (5, 5)), (32, (3, 3))):
        net = layers.Conv2DLayer(incoming=net, num_filters=n_filters, filter_size=kernel,
                                 stride=1, nonlinearity=rectify)
        show_shape(layers.get_output(net))
        net = layers.MaxPool2DLayer(incoming=net, pool_size=(3, 3), stride=2, ignore_border=False)
        show_shape(layers.get_output(net))
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)

    self.params = layers.get_all_params(net, trainable=True)

    # NOTE: these constants are the shape of the last pool layer; they could
    # be symbolic, explicit values are better for optimizations.
    num_channels, filter_W, filter_H = 32, 104, 13
    conv_out = layers.get_output(net)
    channels = [conv_out[:, c, :, :].transpose((0, 2, 1)) for c in range(num_channels)]

    final_states = []
    for channel in channels:
        rnn = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=channel)
        rnn = layers.GRULayer(incoming=rnn, num_units=self.num_units, only_return_final=True)
        if self.batch_norm:
            rnn = layers.BatchNormLayer(incoming=rnn)
        # Each channel owns its GRU (and batch-norm) parameters.
        self.params.extend(layers.get_all_params(rnn, trainable=True))
        final_states.append(layers.get_output(rnn))

    merged = T.concatenate(final_states, axis=1)
    show_shape(merged)

    net = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=merged)

    # Last layer: classification (no dense layer in between in this variant)
    net = layers.DenseLayer(incoming=net, num_units=176, nonlinearity=softmax)
    show_shape(layers.get_output(net))

    self.params.extend(layers.get_all_params(net, trainable=True))
    self.prediction = layers.get_output(net)

    self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
    self.loss_l2 = 0
    if self.l2 > 0:
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                     lasagne.regularization.l2)
    self.loss = self.loss_ce + self.loss_l2

    updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

    if self.mode == 'train':
        print("==> compiling train_fn")
        self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print("==> compiling test_fn")
    self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                   outputs=[self.prediction, self.loss])
def read_batch(self, data_raw, batch_index):
    """Load one mini-batch of spectrogram PNGs together with their labels.

    Args:
        data_raw: indexable collection of ``"name,label"`` strings; the text
            before the first comma names the PNG (without extension) and the
            second field is parsed as an integer label.
        batch_index: zero-based batch number; rows
            ``batch_index * batch_size`` up to (excluding)
            ``(batch_index + 1) * batch_size`` are read.

    Returns:
        A ``(data, answers)`` pair: ``data`` is a float32 array of shape
        ``(batch_size, 1, 128, 858)`` and ``answers`` an int32 vector.
    """
    batch = self.batch_size
    offset = batch_index * batch

    images = np.zeros((batch, 1, 128, 858), dtype=np.float32)
    labels = []

    for slot in range(batch):
        fields = data_raw[offset + slot].split(',')
        labels.append(int(fields[1]))
        picture = Image.open(self.png_folder + fields[0] + ".png")
        # Only the first 128 rows are kept; pixel values are divided by 256.
        # assumes every PNG decodes to a 2-D array 858 columns wide -- TODO confirm
        pixels = np.array(picture).astype(np.float32)
        images[slot, 0, :, :] = pixels[:128, :] / 256.0

    return images, np.array(labels, dtype=np.int32)
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
    """Build the 4-conv / single-GRU classifier and compile its functions.

    Four conv + padded max-pool stages are followed by one GRU that reads the
    last pooling output along its final axis (all channels and rows of one
    column form one step's feature vector), then by a 176-way softmax layer.

    Args:
        train_list_raw, test_list_raw, png_folder: stored on the instance.
        batch_size: stored; also sizes the random example batch that is
            pushed through the graph to print intermediate output shapes.
        dropout: dropout probability applied after the GRU when ``> 0``.
        l2: coefficient of the L2 penalty added to the loss when ``> 0``.
        mode: ``'train'`` additionally compiles ``self.train_fn``.
        batch_norm: when truthy, a BatchNormLayer follows every pooling
            stage and the GRU.
        rnn_num_units: number of units of the GRU layer.
        **kwargs: ignored, only listed in the start-up message.
    """
    print(" ".join(["==> not used params in DMN class:", str(kwargs.keys())]))
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print("==> building network")
    input_shape = (1, 128, 858)
    example = np.random.uniform(size=(self.batch_size,) + input_shape, low=0.0, high=1.0).astype(np.float32)
    # Never read afterwards, but the draw advances NumPy's global random
    # state, so it is kept to leave every later random draw untouched.
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

    def show_shape(expression):
        # Evaluate a symbolic expression on the example batch, print its shape.
        print(expression.eval({self.input_var: example}).shape)

    net = layers.InputLayer(shape=(None,) + input_shape, input_var=self.input_var)
    show_shape(layers.get_output(net))

    # CONV-RELU-POOL stages 1..4: (number of filters, filter size)
    for n_filters, kernel in ((16, (7, 7)), (32, (5, 5)), (32, (3, 3)), (32, (3, 3))):
        net = layers.Conv2DLayer(incoming=net, num_filters=n_filters, filter_size=kernel,
                                 stride=1, nonlinearity=rectify)
        show_shape(layers.get_output(net))
        net = layers.MaxPool2DLayer(incoming=net, pool_size=(3, 3), stride=2, pad=2)
        show_shape(layers.get_output(net))
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)

    self.params = layers.get_all_params(net, trainable=True)

    # Move the last axis next to the batch axis, then merge the remaining
    # two axes into one feature axis.
    sequence = layers.get_output(net).transpose((0, 3, 1, 2)).flatten(ndim=3)

    # NOTE: these constants are the shape of the last pool layer; they could
    # be symbolic, explicit values are better for optimizations.
    num_channels, filter_W, filter_H = 32, 54, 8

    net = layers.InputLayer(shape=(None, filter_W, num_channels * filter_H), input_var=sequence)
    show_shape(layers.get_output(net))

    net = layers.GRULayer(incoming=net, num_units=self.num_units, only_return_final=True)
    show_shape(layers.get_output(net))
    if self.batch_norm:
        net = layers.BatchNormLayer(incoming=net)
    if self.dropout > 0:
        net = layers.dropout(net, self.dropout)

    # Last layer: classification
    net = layers.DenseLayer(incoming=net, num_units=176, nonlinearity=softmax)
    show_shape(layers.get_output(net))

    self.params.extend(layers.get_all_params(net, trainable=True))
    self.prediction = layers.get_output(net)

    self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
    self.loss_l2 = 0
    if self.l2 > 0:
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                     lasagne.regularization.l2)
    self.loss = self.loss_ce + self.loss_l2

    # The original source keeps learning_rate=0.003 as a commented-out
    # alternative marked "good one".
    updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)

    if self.mode == 'train':
        print("==> compiling train_fn")
        self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print("==> compiling test_fn")
    self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                   outputs=[self.prediction, self.loss])
def say_name(self):
    """Return the architecture identifier, with the GRU size filled in."""
    template = "tc_net_rnn.4conv.pad.GRU.onernn.num_units%d.5khz"
    return template % (self.num_units,)
def read_batch(self, data_raw, batch_index):
    """Load one mini-batch of spectrogram PNGs together with their labels.

    Args:
        data_raw: indexable collection of ``"name,label"`` strings; the text
            before the first comma names the PNG (without extension) and the
            second field is parsed as an integer label.
        batch_index: zero-based batch number; rows
            ``batch_index * batch_size`` up to (excluding)
            ``(batch_index + 1) * batch_size`` are read.

    Returns:
        A ``(data, answers)`` pair: ``data`` is a float32 array of shape
        ``(batch_size, 1, 128, 858)`` and ``answers`` an int32 vector.
    """
    batch = self.batch_size
    offset = batch_index * batch

    images = np.zeros((batch, 1, 128, 858), dtype=np.float32)
    labels = []

    for slot in range(batch):
        fields = data_raw[offset + slot].split(',')
        labels.append(int(fields[1]))
        picture = Image.open(self.png_folder + fields[0] + ".png")
        # Only the first 128 rows are kept; pixel values are divided by 256.
        # assumes every PNG decodes to a 2-D array 858 columns wide -- TODO confirm
        pixels = np.array(picture).astype(np.float32)
        images[slot, 0, :, :] = pixels[:128, :] / 256.0

    return images, np.array(labels, dtype=np.int32)
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
    """Build the 4-conv / single-GRU classifier without pooling stride on the last axis.

    Same layout as the single-GRU network, except that every max-pool uses
    ``stride=(2, 1)``: the third axis is down-sampled, the last one is not,
    so the GRU reads a sequence of 852 steps.  A 176-way softmax layer
    follows the GRU.

    Args:
        train_list_raw, test_list_raw, png_folder: stored on the instance.
        batch_size: stored; also sizes the random example batch that is
            pushed through the graph to print intermediate output shapes.
        dropout: dropout probability applied after the GRU when ``> 0``.
        l2: coefficient of the L2 penalty added to the loss when ``> 0``.
        mode: ``'train'`` additionally compiles ``self.train_fn``.
        batch_norm: when truthy, a BatchNormLayer follows every pooling
            stage and the GRU.
        rnn_num_units: number of units of the GRU layer.
        **kwargs: ignored, only listed in the start-up message.
    """
    print(" ".join(["==> not used params in DMN class:", str(kwargs.keys())]))
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print("==> building network")
    input_shape = (1, 128, 858)
    example = np.random.uniform(size=(self.batch_size,) + input_shape, low=0.0, high=1.0).astype(np.float32)
    # Never read afterwards, but the draw advances NumPy's global random
    # state, so it is kept to leave every later random draw untouched.
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

    def show_shape(expression):
        # Evaluate a symbolic expression on the example batch, print its shape.
        print(expression.eval({self.input_var: example}).shape)

    net = layers.InputLayer(shape=(None,) + input_shape, input_var=self.input_var)
    show_shape(layers.get_output(net))

    # CONV-RELU-POOL stages 1..4: (number of filters, filter size)
    for n_filters, kernel in ((16, (7, 7)), (32, (5, 5)), (32, (3, 3)), (32, (3, 3))):
        net = layers.Conv2DLayer(incoming=net, num_filters=n_filters, filter_size=kernel,
                                 stride=1, nonlinearity=rectify)
        show_shape(layers.get_output(net))
        net = layers.MaxPool2DLayer(incoming=net, pool_size=(3, 3), stride=(2, 1), pad=2)
        show_shape(layers.get_output(net))
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)

    self.params = layers.get_all_params(net, trainable=True)

    # Move the last axis next to the batch axis, then merge the remaining
    # two axes into one feature axis.
    sequence = layers.get_output(net).transpose((0, 3, 1, 2)).flatten(ndim=3)

    # NOTE: these constants are the shape of the last pool layer; they could
    # be symbolic, explicit values are better for optimizations.
    num_channels, filter_W, filter_H = 32, 852, 8

    net = layers.InputLayer(shape=(None, filter_W, num_channels * filter_H), input_var=sequence)
    show_shape(layers.get_output(net))

    net = layers.GRULayer(incoming=net, num_units=self.num_units, only_return_final=True)
    show_shape(layers.get_output(net))
    if self.batch_norm:
        net = layers.BatchNormLayer(incoming=net)
    if self.dropout > 0:
        net = layers.dropout(net, self.dropout)

    # Last layer: classification
    net = layers.DenseLayer(incoming=net, num_units=176, nonlinearity=softmax)
    show_shape(layers.get_output(net))

    self.params.extend(layers.get_all_params(net, trainable=True))
    self.prediction = layers.get_output(net)

    self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
    self.loss_l2 = 0
    if self.l2 > 0:
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                     lasagne.regularization.l2)
    self.loss = self.loss_ce + self.loss_l2

    # The original source keeps learning_rate=0.003 as a commented-out
    # alternative marked "good one".
    updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

    if self.mode == 'train':
        print("==> compiling train_fn")
        self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print("==> compiling test_fn")
    self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                   outputs=[self.prediction, self.loss])
def read_batch(self, data_raw, batch_index):
    """Load one mini-batch of spectrogram PNGs together with their labels.

    Args:
        data_raw: indexable collection of ``"name,label"`` strings; the text
            before the first comma names the PNG (without extension) and the
            second field is parsed as an integer label.
        batch_index: zero-based batch number; rows
            ``batch_index * batch_size`` up to (excluding)
            ``(batch_index + 1) * batch_size`` are read.

    Returns:
        A ``(data, answers)`` pair: ``data`` is a float32 array of shape
        ``(batch_size, 1, 128, 858)`` and ``answers`` an int32 vector.
    """
    batch = self.batch_size
    offset = batch_index * batch

    images = np.zeros((batch, 1, 128, 858), dtype=np.float32)
    labels = []

    for slot in range(batch):
        fields = data_raw[offset + slot].split(',')
        labels.append(int(fields[1]))
        picture = Image.open(self.png_folder + fields[0] + ".png")
        # Only the first 128 rows are kept; pixel values are divided by 256.
        # assumes every PNG decodes to a 2-D array 858 columns wide -- TODO confirm
        pixels = np.array(picture).astype(np.float32)
        images[slot, 0, :, :] = pixels[:128, :] / 256.0

    return images, np.array(labels, dtype=np.int32)
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
    """Build the 3-conv / weight-shared-GRU classifier and compile its functions.

    Three conv + max-pool stages are followed by one GRU whose weights are
    shared across the 32 feature maps of the last pooling stage.  The final
    states of all 32 applications are concatenated and fed to a 176-way
    softmax layer.

    Args:
        train_list_raw, test_list_raw, png_folder: stored on the instance.
        batch_size: stored; also sizes the random example batch that is
            pushed through the graph to print intermediate output shapes.
        dropout: stored; no dropout layer is created in this variant.
        l2: coefficient of the L2 penalty added to the loss when ``> 0``.
        mode: ``'train'`` additionally compiles ``self.train_fn``.
        batch_norm: when truthy, a BatchNormLayer follows every pooling
            stage and precedes the softmax layer.
        rnn_num_units: number of units of the shared GRU.
        **kwargs: ignored, only listed in the start-up message.
    """
    print(" ".join(["==> not used params in DMN class:", str(kwargs.keys())]))
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print("==> building network")
    input_shape = (1, 128, 858)
    example = np.random.uniform(size=(self.batch_size,) + input_shape, low=0.0, high=1.0).astype(np.float32)
    # Never read afterwards, but the draw advances NumPy's global random
    # state, so it is kept to leave every later random draw untouched.
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

    def show_shape(expression):
        # Evaluate a symbolic expression on the example batch, print its shape.
        print(expression.eval({self.input_var: example}).shape)

    net = layers.InputLayer(shape=(None,) + input_shape, input_var=self.input_var)
    show_shape(layers.get_output(net))

    # CONV-RELU-POOL stages 1..3: (number of filters, filter size)
    for n_filters, kernel in ((16, (7, 7)), (32, (5, 5)), (32, (3, 3))):
        net = layers.Conv2DLayer(incoming=net, num_filters=n_filters, filter_size=kernel,
                                 stride=1, nonlinearity=rectify)
        show_shape(layers.get_output(net))
        net = layers.MaxPool2DLayer(incoming=net, pool_size=(3, 3), stride=2, ignore_border=False)
        show_shape(layers.get_output(net))
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)

    self.params = layers.get_all_params(net, trainable=True)

    # NOTE: these constants are the shape of the last pool layer; they could
    # be symbolic, explicit values are better for optimizations.
    num_channels, filter_W, filter_H = 32, 104, 13
    conv_out = layers.get_output(net)
    channels = [conv_out[:, c, :, :].transpose((0, 2, 1)) for c in range(num_channels)]

    final_states = []
    shared_gru = None  # GRU built for the first channel; owns the shared weights
    for channel in channels:
        rnn = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=channel)

        if shared_gru is None:
            rnn = layers.GRULayer(incoming=rnn, num_units=self.num_units, only_return_final=True)
            shared_gru = rnn
            # The shared weights are registered exactly once.
            self.params += layers.get_all_params(rnn, trainable=True)
        else:
            # GRULayer, but reusing the weights of the first channel's GRU.
            # FIX: the candidate ("hidden update") gate must use tanh like the
            # first GRU does by default; a bare Gate() defaults to sigmoid,
            # which made channels 1..31 a different network from channel 0.
            # Snapshots trained with the previous graph were trained with
            # sigmoid on those channels.
            rnn = layers.GRULayer(
                incoming=rnn, num_units=self.num_units, only_return_final=True,
                resetgate=layers.Gate(W_in=shared_gru.W_in_to_resetgate,
                                      W_hid=shared_gru.W_hid_to_resetgate,
                                      b=shared_gru.b_resetgate),
                updategate=layers.Gate(W_in=shared_gru.W_in_to_updategate,
                                       W_hid=shared_gru.W_hid_to_updategate,
                                       b=shared_gru.b_updategate),
                hidden_update=layers.Gate(W_in=shared_gru.W_in_to_hidden_update,
                                          W_hid=shared_gru.W_hid_to_hidden_update,
                                          b=shared_gru.b_hidden_update,
                                          nonlinearity=tanh))

        final_states.append(layers.get_output(rnn))

    merged = T.concatenate(final_states, axis=1)
    show_shape(merged)

    net = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=merged)

    # BatchNormalization Layer
    if self.batch_norm:
        net = layers.BatchNormLayer(incoming=net)

    # Last layer: classification
    net = layers.DenseLayer(incoming=net, num_units=176, nonlinearity=softmax)
    show_shape(layers.get_output(net))

    self.params += layers.get_all_params(net, trainable=True)
    self.prediction = layers.get_output(net)

    self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
    if self.l2 > 0:
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                     lasagne.regularization.l2)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

    if self.mode == 'train':
        print("==> compiling train_fn")
        self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print("==> compiling test_fn")
    self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                   outputs=[self.prediction, self.loss])
def read_batch(self, data_raw, batch_index):
    """Load one mini-batch of spectrogram PNGs together with their labels.

    Args:
        data_raw: indexable collection of ``"name,label"`` strings; the text
            before the first comma names the PNG (without extension) and the
            second field is parsed as an integer label.
        batch_index: zero-based batch number; rows
            ``batch_index * batch_size`` up to (excluding)
            ``(batch_index + 1) * batch_size`` are read.

    Returns:
        A ``(data, answers)`` pair: ``data`` is a float32 array of shape
        ``(batch_size, 1, 128, 858)`` and ``answers`` an int32 vector.
    """
    batch = self.batch_size
    offset = batch_index * batch

    images = np.zeros((batch, 1, 128, 858), dtype=np.float32)
    labels = []

    for slot in range(batch):
        fields = data_raw[offset + slot].split(',')
        labels.append(int(fields[1]))
        picture = Image.open(self.png_folder + fields[0] + ".png")
        # Only the first 128 rows are kept; pixel values are divided by 256.
        # assumes every PNG decodes to a 2-D array 858 columns wide -- TODO confirm
        pixels = np.array(picture).astype(np.float32)
        images[slot, 0, :, :] = pixels[:128, :] / 256.0

    return images, np.array(labels, dtype=np.int32)
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
    """Build the 4-conv (padded pooling) / weight-shared-GRU classifier.

    Four conv + padded max-pool stages are followed by one GRU whose weights
    are shared across the 32 feature maps of the last pooling stage.  The
    final states of all 32 applications are concatenated, optionally passed
    through dropout and batch normalisation, and fed to a 176-way softmax
    layer.  The Theano train/test functions are compiled at the end.

    Args:
        train_list_raw, test_list_raw, png_folder: stored on the instance.
        batch_size: stored; also sizes the random example batch that is
            pushed through the graph to print intermediate output shapes.
        dropout: dropout probability applied to the concatenated GRU
            outputs when ``> 0``.
        l2: coefficient of the L2 penalty added to the loss when ``> 0``.
        mode: ``'train'`` additionally compiles ``self.train_fn``.
        batch_norm: when truthy, a BatchNormLayer follows every pooling
            stage and precedes the softmax layer.
        rnn_num_units: number of units of the shared GRU.
        **kwargs: ignored, only listed in the start-up message.
    """
    print(" ".join(["==> not used params in DMN class:", str(kwargs.keys())]))
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print("==> building network")
    input_shape = (1, 128, 858)
    example = np.random.uniform(size=(self.batch_size,) + input_shape, low=0.0, high=1.0).astype(np.float32)
    # Never read afterwards, but the draw advances NumPy's global random
    # state, so it is kept to leave every later random draw untouched.
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

    def show_shape(expression):
        # Evaluate a symbolic expression on the example batch, print its shape.
        print(expression.eval({self.input_var: example}).shape)

    net = layers.InputLayer(shape=(None,) + input_shape, input_var=self.input_var)
    show_shape(layers.get_output(net))

    # CONV-RELU-POOL stages 1..4: (number of filters, filter size)
    for n_filters, kernel in ((16, (7, 7)), (32, (5, 5)), (32, (3, 3)), (32, (3, 3))):
        net = layers.Conv2DLayer(incoming=net, num_filters=n_filters, filter_size=kernel,
                                 stride=1, nonlinearity=rectify)
        show_shape(layers.get_output(net))
        net = layers.MaxPool2DLayer(incoming=net, pool_size=(3, 3), stride=2, pad=2)
        show_shape(layers.get_output(net))
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)

    self.params = layers.get_all_params(net, trainable=True)

    # NOTE: these constants are the shape of the last pool layer; they could
    # be symbolic, explicit values are better for optimizations.
    num_channels, filter_W, filter_H = 32, 54, 8
    conv_out = layers.get_output(net)
    channels = [conv_out[:, c, :, :].transpose((0, 2, 1)) for c in range(num_channels)]

    final_states = []
    shared_gru = None  # GRU built for the first channel; owns the shared weights
    for channel in channels:
        rnn = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=channel)

        if shared_gru is None:
            rnn = layers.GRULayer(incoming=rnn, num_units=self.num_units, only_return_final=True)
            shared_gru = rnn
            # The shared weights are registered exactly once.
            self.params += layers.get_all_params(rnn, trainable=True)
        else:
            # GRULayer, but reusing the weights of the first channel's GRU.
            # FIX: the candidate ("hidden update") gate must use tanh like the
            # first GRU does by default; a bare Gate() defaults to sigmoid,
            # which made channels 1..31 a different network from channel 0.
            # Snapshots trained with the previous graph were trained with
            # sigmoid on those channels.
            rnn = layers.GRULayer(
                incoming=rnn, num_units=self.num_units, only_return_final=True,
                resetgate=layers.Gate(W_in=shared_gru.W_in_to_resetgate,
                                      W_hid=shared_gru.W_hid_to_resetgate,
                                      b=shared_gru.b_resetgate),
                updategate=layers.Gate(W_in=shared_gru.W_in_to_updategate,
                                       W_hid=shared_gru.W_hid_to_updategate,
                                       b=shared_gru.b_updategate),
                hidden_update=layers.Gate(W_in=shared_gru.W_in_to_hidden_update,
                                          W_hid=shared_gru.W_hid_to_hidden_update,
                                          b=shared_gru.b_hidden_update,
                                          nonlinearity=tanh))

        final_states.append(layers.get_output(rnn))

    merged = T.concatenate(final_states, axis=1)
    show_shape(merged)

    net = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=merged)

    # Dropout Layer
    if self.dropout > 0:
        net = layers.dropout(net, self.dropout)

    # BatchNormalization Layer
    if self.batch_norm:
        net = layers.BatchNormLayer(incoming=net)

    # Last layer: classification
    net = layers.DenseLayer(incoming=net, num_units=176, nonlinearity=softmax)
    show_shape(layers.get_output(net))

    self.params += layers.get_all_params(net, trainable=True)
    self.prediction = layers.get_output(net)

    self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
    if self.l2 > 0:
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                     lasagne.regularization.l2)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

    if self.mode == 'train':
        print("==> compiling train_fn")
        self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print("==> compiling test_fn")
    self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                   outputs=[self.prediction, self.loss])
def read_batch(self, data_raw, batch_index):
    """Load one mini-batch of spectrogram PNGs together with their labels.

    Args:
        data_raw: indexable collection of ``"name,label"`` strings; the text
            before the first comma names the PNG (without extension) and the
            second field is parsed as an integer label.
        batch_index: zero-based batch number; rows
            ``batch_index * batch_size`` up to (excluding)
            ``(batch_index + 1) * batch_size`` are read.

    Returns:
        A ``(data, answers)`` pair: ``data`` is a float32 array of shape
        ``(batch_size, 1, 128, 858)`` and ``answers`` an int32 vector.
    """
    batch = self.batch_size
    offset = batch_index * batch

    images = np.zeros((batch, 1, 128, 858), dtype=np.float32)
    labels = []

    for slot in range(batch):
        fields = data_raw[offset + slot].split(',')
        labels.append(int(fields[1]))
        picture = Image.open(self.png_folder + fields[0] + ".png")
        # Only the first 128 rows are kept; pixel values are divided by 256.
        # assumes every PNG decodes to a 2-D array 858 columns wide -- TODO confirm
        pixels = np.array(picture).astype(np.float32)
        images[slot, 0, :, :] = pixels[:128, :] / 256.0

    return images, np.array(labels, dtype=np.int32)
class Network(BaseNetwork):
    """CNN + shared-GRU classifier over fixed-size 1x128x768 inputs.

    Four conv/max-pool stages produce 32 feature maps. Each map is fed as a
    sequence to a GRU whose weights are shared across all maps, the final GRU
    states are concatenated, and a 176-way softmax layer classifies them.
    `read_batch` crops a random 768-column window out of each PNG.
    """

    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
        """Build the symbolic graph and compile the Theano functions.

        Args:
            train_list_raw: stored on the instance; not used in this class body.
            test_list_raw: stored on the instance; not used in this class body.
            png_folder: path prefix that `read_batch` prepends to image names.
            batch_size: number of samples per batch.
            dropout: dropout probability before the classifier; a dropout
                layer is added only if this is greater than 0.
            l2: coefficient of the L2 penalty; the penalty is added only if
                this is greater than 0.
            mode: `train_fn` is compiled only when this equals 'train';
                `test_fn` is always compiled.
            batch_norm: if truthy, a BatchNormLayer follows every pooling
                layer and precedes the classifier.
            rnn_num_units: number of units of the shared GRU.
            **kwargs: ignored; only their names are printed.
        """

        # NOTE(review): the message says "DMN class" although this class is
        # named Network -- looks like a copy-paste leftover.
        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units

        # Symbolic inputs: a 4D image batch and an int32 vector of labels.
        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')

        print "==> building network"
        # Random dummy batch, used only by the shape-printing eval() calls below.
        example = np.random.uniform(size=(self.batch_size, 1, 128, 768), low=0.0, high=1.0).astype(np.float32)
        # NOTE(review): `answer` is never read afterwards in this method.
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,))

        network = layers.InputLayer(shape=(None, 1, 128, 768), input_var=self.input_var)
        print layers.get_output(network).eval({self.input_var:example}).shape

        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7),
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5),
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)


        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3),
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3),
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # Trainable parameters of the convolutional stack.
        self.params = layers.get_all_params(network, trainable=True)

        output = layers.get_output(network)
        num_channels = 32
        filter_W = 48
        filter_H = 8

        # NOTE: these constants are shapes of last pool layer, it can be symbolic
        # explicit values are better for optimizations
        # They must be updated by hand whenever the input size or the
        # conv/pool configuration above changes.

        # One sequence per feature map: slice the map out and swap its last
        # two axes so each sample becomes (filter_W, filter_H), i.e. filter_W
        # steps of filter_H features each.
        channels = []
        for channel_index in range(num_channels):
            channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))

        # Parameters of the first GRU, remembered so that the GRUs of the
        # remaining channels can be built on the very same variables.
        rnn_network_outputs = []
        W_in_to_updategate = None
        W_hid_to_updategate = None
        b_updategate = None
        W_in_to_resetgate = None
        W_hid_to_resetgate = None
        b_resetgate = None
        W_in_to_hidden_update = None
        W_hid_to_hidden_update = None
        b_hidden_update = None

        for channel_index in range(num_channels):
            rnn_input_var = channels[channel_index]

            # InputLayer
            network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var)

            if (channel_index == 0):
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
                W_in_to_updategate = network.W_in_to_updategate
                W_hid_to_updategate = network.W_hid_to_updategate
                b_updategate = network.b_updategate
                W_in_to_resetgate = network.W_in_to_resetgate
                W_hid_to_resetgate = network.W_hid_to_resetgate
                b_resetgate = network.b_resetgate
                W_in_to_hidden_update = network.W_in_to_hidden_update
                W_hid_to_hidden_update = network.W_hid_to_hidden_update
                b_hidden_update = network.b_hidden_update

                # add params
                # Done for channel 0 only, so the shared GRU parameters are
                # appended to self.params exactly once.
                self.params += layers.get_all_params(network, trainable=True)

            else:
                # GRULayer, but shared
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate, W_hid=W_hid_to_resetgate, b=b_resetgate),
                            updategate=layers.Gate(W_in=W_in_to_updategate, W_hid=W_hid_to_updategate, b=b_updategate),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update, W_hid=W_hid_to_hidden_update, b=b_hidden_update))



            rnn_network_outputs.append(layers.get_output(network))

        # Concatenate the final GRU state of every channel along axis 1.
        all_output_var = T.concatenate(rnn_network_outputs, axis=1)
        print all_output_var.eval({self.input_var:example}).shape

        # InputLayer
        # Starts a new Lasagne stack on top of the concatenated expression.
        network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var)

        # Dropout Layer
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)

        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var:example}).shape


        # Trainable parameters of the classification head.
        self.params += layers.get_all_params(network, trainable=True)
        # NOTE(review): every get_output() call in this method is made without
        # deterministic=True and the same expression feeds both train_fn and
        # test_fn, so dropout (and batch statistics in BatchNorm) appear to
        # stay active at test time -- confirm whether that is intended.
        self.prediction = layers.get_output(network)

        #print "==> param shapes", [x.eval().shape for x in self.params]

        # Loss: mean categorical cross-entropy plus an optional L2 penalty
        # over all collected parameters.
        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params,
                                                                          lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        # NOTE(review): the update expressions are built even when
        # mode != 'train', where they are never used.
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])


    def say_name(self):
        """Return a name for this configuration that embeds the GRU size."""
        return "tc_net_rnn.4conv.pad.GRU.shared.num_units%d.5khz.augm" % self.num_units


    def read_batch(self, data_raw, batch_index):
        """Load one batch of randomly cropped images and their labels.

        Args:
            data_raw: indexable collection of strings whose first
                comma-separated field is an image name (without ".png") and
                whose second field is an integer label.
            batch_index: zero-based batch number; entries
                [batch_index * batch_size, (batch_index + 1) * batch_size)
                of `data_raw` are read.

        Returns:
            (data, answers): a float32 array of shape
            (batch_size, 1, 128, 768) and an int32 array of labels.

        There is no bounds check, so indexing past the end of `data_raw`
        raises IndexError.
        """

        start_index = batch_index * self.batch_size
        end_index = start_index + self.batch_size

        data = np.zeros((self.batch_size, 1, 128, 768), dtype=np.float32)
        answers = []

        for i in range(start_index, end_index):
            answers.append(int(data_raw[i].split(',')[1]))
            name = data_raw[i].split(',')[0]
            # png_folder is concatenated as-is; presumably it must end with a
            # path separator -- verify against the caller.
            path = self.png_folder + name + ".png"
            im = Image.open(path)
            # Random horizontal crop: random.randint is inclusive, so the
            # window starts anywhere in columns 0..90.
            # assumes images have at least 128 rows and 858 columns -- TODO confirm
            # NOTE(review): the crop is random no matter what self.mode is, so
            # evaluation batches are not reproducible -- confirm this is wanted.
            offset = random.randint(0, 90)
            # Keep the first 128 rows and scale pixel values by 1/256.
            data[i - start_index, 0, :, :] = np.array(im).astype(np.float32)[:128, offset:offset+768] / 256.0

        answers = np.array(answers, dtype=np.int32)
        return data, answers

# End of theano/networks/tc_net_rnn_shared_pad_augm.py; what follows is theano/plot.py.
"""Plot training/validation loss and accuracy curves parsed from a log file.

Usage: python plot.py --log log.txt --plot plot.png [--no-loss] [--no-accuracy]
       [--no-legend] [--start_epoch E]

The module has no import-time side effects: arguments are parsed and the
figure is written only when the file is run as a script.
"""
import numpy as np
import sys
import argparse
import os


# Per-index colour steps: plot `index` shifts the colours of the loss curves
# by this much on the green and on the red/blue channels.
GREEN_DIFF = 50
RED_BLUE_DIFF = 50


def _buildParser():
    """Return the command-line parser of the script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--plot', type=str, default='plot.png', help='plotfile name with .png')
    parser.add_argument('--log', type=str, default='log.txt', help='log file name')
    parser.add_argument('--winVal', type=int, default=200, help='window for Val')
    parser.add_argument('--winTrain', type=int, default=200, help='window for Train')
    parser.add_argument('--no-legend', dest='legend', action='store_false')
    parser.add_argument('--no-accuracy', dest='accuracy', action='store_false')
    parser.add_argument('--no-loss', dest='loss', action='store_false')
    parser.add_argument('--start_epoch', type=float, default=-1.0, help='start plotting from that epoch')
    parser.set_defaults(loss=True, legend=True, accuracy=True)
    return parser


def _loadPyplot():
    """Import pyplot with the non-interactive 'Agg' backend and return it.

    The import is deferred so that the parsing helpers can be used on
    machines without matplotlib or without a display.
    """
    import matplotlib
    if 'matplotlib.pyplot' not in sys.modules:
        # The backend has to be selected before pyplot is first imported.
        matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    return plt


def movingAverage(loss, window):
    """Return the trailing moving average of `loss` as a list of floats.

    Element i averages the `window` values ending at position i. The first
    `window - 1` elements all average the first `window` values, so the
    result has the same length as the input.

    `window` is clamped to the range [1, len(loss)], so an over-long or
    non-positive window no longer raises IndexError / ZeroDivisionError.
    An empty input yields an empty list.
    """
    count = len(loss)
    if count == 0:
        return []
    window = max(1, min(int(window), count))

    mas = []
    for i in range(count):
        start = max(0, i - window + 1)
        # 0.0 as the start value keeps the division a float one on Python 2.
        mas.append(sum(loss[start:start + window], 0.0) / window)
    return mas


def _hexColor(red, green, blue):
    """Format an RGB triple as '#rrggbb', clamping each channel to 0..255."""
    channels = tuple(max(0, min(255, int(c))) for c in (red, green, blue))
    return '#%02x%02x%02x' % channels


def _parsePosition(head):
    """Turn '<label>:<epoch>.<iteration>/<iterations_per_epoch>' into a
    fractional epoch number."""
    expr = head[head.find(':') + 1:]
    first, _, per_epoch = expr.partition('/')
    epoch, _, iteration = first.partition('.')
    return float(epoch) + float(iteration) / float(per_epoch)


def _parseLoss(line):
    """Return the number that follows 'avg_loss:' in a log line.

    Raises ValueError if the line has no avg_loss field.
    """
    pos = line.find("avg_loss")
    if pos < 0:
        raise ValueError("no avg_loss field in log line: %r" % line)
    # The field ends at the next tab, or at the end of the line.
    field = line[pos:].split('\t')[0]
    return float(field[field.find(':') + 1:])


def _parseAccuracy(line):
    """Return the accuracy of an 'accuracy: NN percent' line as a fraction."""
    value = line[line.find(':') + 1:]
    cut = value.find("percent")
    if cut >= 0:
        value = value[:cut]
    return float(value) / 100.0


def _parseLog(lines):
    """Collect the curves contained in an iterable of log lines.

    Returns four (xs, ys) pairs: training loss, validation loss, training
    accuracy and validation accuracy. Accuracy lines are attributed
    alternately to training and validation, training first, and are placed
    at the x position of the most recent loss line of the same kind.
    """
    trainx, trainy = [], []
    valx, valy = [], []
    train_accuracyx, train_accuracyy = [], []
    val_accuracyx, val_accuracyy = [], []

    for st in lines:
        head = st.split('\t')[0].strip()

        if head.startswith(('testing', 'training')):
            x = _parsePosition(head)
            cur_loss = _parseLoss(st)
            if head.startswith('testing'):
                valx.append(x)
                valy.append(cur_loss)
            else:
                trainx.append(x)
                trainy.append(cur_loss)

        if st.strip().startswith("accuracy"):
            cur_accuracy = _parseAccuracy(st)
            if len(train_accuracyx) > len(val_accuracyx):
                val_accuracyx.append(valx[-1])
                val_accuracyy.append(cur_accuracy)
            else:
                train_accuracyx.append(trainx[-1])
                train_accuracyy.append(cur_accuracy)

    return ((trainx, trainy), (valx, valy),
            (train_accuracyx, train_accuracyy),
            (val_accuracyx, val_accuracyy))


def _dropBefore(xs, ys, start):
    """Drop the leading points whose x is below `start` (one pass)."""
    skip = 0
    while skip < len(xs) and xs[skip] < start:
        skip += 1
    return xs[skip:], ys[skip:]


def plotTrainVal(filename, index, plotLabel, start_epoch=-1.0,
                 window_val=200, window_train=200,
                 show_loss=True, show_accuracy=True):
    """Parse `filename` and draw its curves on the current pyplot figure.

    Args:
        filename: path of the log file.
        index: plot number; it selects the colours of the loss curves.
        plotLabel: prefix of the legend labels.
        start_epoch: loss points before this epoch are not drawn.
        window_val: upper bound of the reported validation window.
        window_train: upper bound of the reported training window.
        show_loss: draw the training/validation loss curves.
        show_accuracy: draw the training/validation accuracy curves.

    A summary of the validation accuracies is printed to stdout. The
    window sizes are only reported: smoothing with `movingAverage` is
    currently not applied to the curves.
    """
    with open(filename, 'r') as logfile:
        ((trainx, trainy), (valx, valy),
         (train_accuracyx, train_accuracyy),
         (val_accuracyx, val_accuracyy)) = _parseLog(logfile)

    valx, valy = _dropBefore(valx, valy, start_epoch)
    trainx, trainy = _dropBefore(trainx, trainy, start_epoch)

    # window config
    wndVal = min(window_val, int(0.8 * len(valy)))
    wndTrain = min(window_train, int(0.8 * len(trainy)))

    print("Train length:  %d  \t\t window:  %d" % (len(trainy), wndTrain))
    print("Val length:  %d  \t\t window:  %d" % (len(valy), wndVal))

    # plotting
    # pyplot adds successive plot() calls to the same axes by default, so
    # the hold(True) calls (removed in Matplotlib 3.0) are not needed.
    plt = _loadPyplot()
    green = index * GREEN_DIFF
    redBlue = 256 - index * RED_BLUE_DIFF

    if show_loss:
        plt.plot(trainx, trainy, _hexColor(0, green, redBlue),
                 label=plotLabel + " train")
        plt.plot(valx, valy, _hexColor(redBlue, green, 0),
                 label=plotLabel + " validation")

    if show_accuracy:
        plt.plot(train_accuracyx, train_accuracyy, '#000000',
                 label=plotLabel + " train_accuracy")
        plt.plot(val_accuracyx, val_accuracyy, '#00FF00',
                 label=plotLabel + " val_accuracy")

    print("plot index = %s" % index)
    for (x, y) in zip(val_accuracyx, val_accuracyy):
        print("\tepoch = %.0f, accuracy = %f" % (x - 1, y))
    if val_accuracyy:
        # A log without accuracy lines simply has no maximum to report.
        best = max(val_accuracyy)
        best_epoch = val_accuracyx[val_accuracyy.index(best)]
        print('\tMax: %f // Epoch: %d' % (best, best_epoch))


def main(argv=None):
    """Parse the command line, plot the log and save the figure."""
    args = _buildParser().parse_args(argv)

    plotTrainVal(args.log, 1, args.log,
                 start_epoch=args.start_epoch,
                 window_val=args.winVal,
                 window_train=args.winTrain,
                 show_loss=args.loss,
                 show_accuracy=args.accuracy)

    plt = _loadPyplot()
    if args.legend:
        plt.legend(loc='upper right', fontsize='x-small')
    plt.gcf().savefig(args.plot)


if __name__ == '__main__':
    main()