├── adhoc ├── bestSoFar.txt ├── clock1.txt ├── conf8.igsc10-lr1.txt ├── conf8.igsc10.txt ├── conf8.inDim2.digit2.txt ├── conf8.inDim2.txt ├── conf8.l1loss.txt ├── conf8.l2unsquared.txt ├── conf8.lrd.txt ├── convolutional.txt ├── lowdim1.txt ├── plane1.txt ├── sine1.txt ├── spearmint-best-leaky.txt ├── spearmint-best-leaky0.1.txt ├── spearmint-best.txt ├── speedtest.txt ├── speedtestgpu.txt ├── uniform.bipartite1d.lrd.txt └── wave1.txt ├── autoencoder.py ├── config.json ├── deepDives ├── conf1.txt ├── conf10.txt ├── conf11.txt ├── conf2.txt ├── conf3.txt ├── conf4.txt ├── conf5.txt ├── conf6.txt ├── conf7.txt ├── conf8.txt └── conf9.txt ├── distances.py ├── docs └── charts │ └── 1d │ ├── triangle.attempt.png │ ├── triangle.bipartite.png │ ├── triangle.goal.png │ ├── uniform.attempt.png │ ├── uniform.bipartite.png │ ├── uniform.goal.png │ ├── uniform.os0.5.png │ ├── uniform.os1.png │ ├── uniform.os2.png │ └── uniform.os5.png ├── earthMover.py ├── earthMoverTest.py ├── evaluate.py ├── install.sh ├── kohonen.py ├── matplotlibrc ├── munkres.py ├── nearestNeighborsTest.py ├── next_permutation.py ├── nnbase ├── __init__.py ├── attrdict.py ├── autoencoder.py ├── inputs.py ├── layers.py ├── shape.py ├── utils.py └── vis.py ├── readme.md ├── readme.sh ├── requirements.txt ├── spearmintTask.py └── theanorc.txt /adhoc/bestSoFar.txt: -------------------------------------------------------------------------------- 1 | # The parent is ~/spearmintClones/regularization_initialSD/daniel-experiments/kohonen/spearmintOutput/initialSD0.413519287109-regularization7.87353515625e-07/conf.txt 2 | # that is the best at epoch 6400 of the ~/spearmintClones/regularization_initialSD spearmintRuns. 3 | # At epoch 4800 the best would be initialSD0.323230707636, but who cares, 4 | # it's super insensitive to initialSD inside [0.2, 0.6]. 
5 | # Removed regularization because it was optimized away already by Spearmint, as it did not help: 6 | # regularization 7.87353515625e-07 -> 0 7 | # 8 | # parent: 9 | # epoch 4800 trainMean 3.604388 trainMedian 3.666215 validationMean 3.885447 validationMedian 3.866639 10 | # this: 11 | # epoch 4800 trainMean 3.551922 trainMedian 3.647972 validationMean 3.893564 validationMedian 3.921978 12 | epochCount 6400 13 | everyNthInput 10 14 | expName adhoc/bestSoFar 15 | gridSizeForInterpolation 30 16 | gridSizeForSampling 20 17 | height 28 18 | hiddenLayerSize 673 19 | inBoolDim 0 20 | inDim 50 21 | initialSD 0.413519287109 22 | inputDigit None 23 | inputType mnist 24 | layerNum 3 25 | learningRate 1.0 26 | minibatchSize 1000 27 | momentum 0.969849416169 28 | oversampling 8.0 29 | plotEach 800 30 | reLULeakiness 0.01 31 | regularization 0.0 32 | useReLU True 33 | width 28 34 | -------------------------------------------------------------------------------- /adhoc/clock1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/wave1.txt 2 | epochCount 16000 3 | expName adhoc/clock1 4 | gridSizeForInterpolation 30 5 | gridSizeForSampling 20 6 | height 28 7 | hiddenLayerSize 100 8 | inBoolDim 0 9 | inDim 2 10 | initialSD 2.0 11 | inputType clock 12 | layerNum 3 13 | learningRate 10 14 | minibatchSize 100 15 | momentum 0.6 16 | oversampling 1.0 17 | plotEach 100 18 | trainSize 40000 19 | useReLU False 20 | validSize 400 21 | width 28 22 | -------------------------------------------------------------------------------- /adhoc/conf8.igsc10-lr1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/conf8.igsc10.txt 2 | # learningRate 10 -> 1 3 | # To accomodate for the innerGradientStepCount = 10, 4 | # I decreased the LR tenfold. 
5 | # parent: 6 | # epoch 400 trainMean 3.823610 trainMedian 3.913259 validationMean 4.035612 validationMedian 4.088287 7 | # this: 8 | # epoch 4800 trainMean 3.724095 trainMedian 3.812470 validationMean 4.026276 validationMedian 4.042130 9 | epochCount 16000 10 | everyNthInput 10 11 | expName adhoc/conf8.igsc10-lr1 12 | gridSizeForInterpolation 30 13 | gridSizeForSampling 20 14 | height 28 15 | hiddenLayerSize 600 16 | inDim 20 17 | inBoolDim 0 18 | initialSD 0.25 19 | innerGradientStepCount 10 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 1 24 | minibatchSize 600 25 | momentum 0.6 26 | oversampling 4.0 27 | plotEach 50 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /adhoc/conf8.igsc10.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # innerGradientStepCount = 10 for-loop around train_fn(initial, data) 3 | # parent: 4 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 5 | # this: 6 | # epoch 400 trainMean 3.823610 trainMedian 3.913259 validationMean 4.035612 validationMedian 4.088287 7 | epochCount 16000 8 | everyNthInput 10 9 | expName adhoc/conf8.igsc10 10 | gridSizeForInterpolation 30 11 | gridSizeForSampling 20 12 | height 28 13 | hiddenLayerSize 600 14 | inDim 20 15 | inBoolDim 0 16 | initialSD 0.25 17 | innerGradientStepCount 10 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 600 23 | momentum 0.6 24 | oversampling 4.0 25 | plotEach 50 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /adhoc/conf8.inDim2.digit2.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/conf8.inDim2.txt 2 | # inputDigit None -> 2 3 | epochCount 16000 4 | everyNthInput 1 5 | expName 
adhoc/conf8.inDim2.digit2 6 | gridSizeForInterpolation 100 7 | gridSizeForSampling 20 8 | height 28 9 | hiddenLayerSize 600 10 | inDim 2 11 | inBoolDim 0 12 | initialSD 0.25 13 | inputDigit 2 14 | inputType mnist 15 | layerNum 3 16 | learningRate 1 17 | minibatchSize 600 18 | momentum 0.6 19 | oversampling 1.0 20 | plotEach 10 21 | useReLU True 22 | width 28 23 | -------------------------------------------------------------------------------- /adhoc/conf8.inDim2.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # The idea is that we want some pretty xy*.png that shows something similar to 3 | # what VAE or t-SNE can do with MNIST in 2D. 4 | # inDim 20 -> 2, gridSizeForInterpolation 30 -> 100, everyNthInput 10 -> 1, plotEach 400 -> 10 5 | # learningRate 10 -> 1, oversampling 4.0 -> 1.0 6 | # parent: (irrelevant, though, not comparable) 7 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 8 | # this: 9 | # epoch 3600 trainMean 4.897792 trainMedian 4.980886 validationMean 4.813195 validationMedian 4.877148 10 | epochCount 16000 11 | everyNthInput 1 12 | expName adhoc/conf8.inDim2 13 | gridSizeForInterpolation 100 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 600 17 | inDim 2 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 1 24 | minibatchSize 600 25 | momentum 0.6 26 | oversampling 1.0 27 | plotEach 10 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /adhoc/conf8.l1loss.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # same as conf8 but with using L1 distance between pairs in loss function. 3 | # (Note the larger epoch count, haven't optimized the learning rate yet.) 
4 | # Numerically worse, visually also worse in emulating train and validation. 5 | # But definitely better looking samples and planar crosscuts. 6 | # parent: 7 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 8 | # this: 9 | # epoch 16000 trainMean 3.946111 trainMedian 4.026848 validationMean 4.241784 validationMedian 4.268175 10 | epochCount 16000 11 | everyNthInput 10 12 | expName adhoc/conf8.l1loss 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 600 17 | inDim 20 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 10 24 | loss l1 25 | minibatchSize 600 26 | momentum 0.6 27 | oversampling 4.0 28 | plotEach 50 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /adhoc/conf8.l2unsquared.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # loss l2squared -> l2unsquared, learningRate 10 -> 1, plotEach 400 -> 50 3 | # 4 | # See https://github.com/danielvarga/daniel-experiments/issues/26 for visualizations. 5 | # 6 | # Note the drastically faster convergence. That's not because of 7 | # some intrinsic nice property of the l2unsquared loss, but rather because 8 | # the learning rate is accidentally set up way higher. 9 | # (Looks lower, but they can't be immediately compared because of the sqrt.) 10 | # 11 | # validationMean and friends are already calculated with l2unsquared distance, 12 | # so this method is closer to actually optimizing the evaluation metric. 
13 | # 14 | # parent: 15 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 16 | # this: 17 | # epoch 1000 trainMean 3.653183 trainMedian 3.725959 validationMean 3.967728 validationMedian 4.009489 18 | epochCount 16000 19 | everyNthInput 10 20 | expName adhoc/conf8.l2unsquared 21 | gridSizeForInterpolation 30 22 | gridSizeForSampling 20 23 | height 28 24 | hiddenLayerSize 600 25 | inDim 20 26 | inBoolDim 0 27 | initialSD 0.25 28 | inputDigit None 29 | inputType mnist 30 | layerNum 3 31 | learningRate 1.0 32 | loss l2unsquared 33 | minibatchSize 600 34 | momentum 0.6 35 | oversampling 4.0 36 | plotEach 50 37 | useReLU True 38 | width 28 39 | -------------------------------------------------------------------------------- /adhoc/conf8.lrd.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # learningRateDecay 1.0 (default) -> 0.9998 3 | # parent: 4 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 5 | # this: 6 | # epoch 4800 trainMean 3.762363 trainMedian 3.834788 validationMean 3.976485 validationMedian 4.033859 7 | # (learningRate 3.82905459404 at epoch 4800) 8 | # epoch 20000 trainMean 3.658130 trainMedian 3.713779 validationMean 3.966676 validationMedian 4.045723 9 | # (learningRate 0.183179870248 at epoch 20000) 10 | epochCount 30000 11 | everyNthInput 10 12 | expName adhoc/conf8.lrd 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 600 17 | inDim 20 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 10 24 | learningRateDecay 0.9998 25 | minibatchSize 600 26 | momentum 0.6 27 | oversampling 4.0 28 | plotEach 100 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /adhoc/convolutional.txt: 
-------------------------------------------------------------------------------- 1 | # Using a convolutional architecture based on the upper half of 2 | # ../lasagne-demo/mnist_conv_autoencode.py 3 | # parent is adhoc/spearmint-best.txt , the clone of 4 | # spearmintOutput/hls673-inDim50-layerNum3-lr1.0-mom0.969849416169-n1000-os8.0 5 | # parent: 6 | # epoch 4800 trainMean 3.598628 trainMedian 3.688720 validationMean 3.907617 validationMedian 3.926451 7 | # this: 8 | # epoch 4800 trainMean 4.240630 trainMedian 4.336588 validationMean 4.238855 validationMedian 4.282310 9 | convolutional True 10 | epochCount 4800 11 | everyNthInput 10 12 | expName adhoc/convolutional 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 673 17 | inBoolDim 0 18 | inDim 50 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 0.5 24 | minibatchSize 1000 25 | momentum 0.5 26 | oversampling 8.0 27 | plotEach 100 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /adhoc/lowdim1.txt: -------------------------------------------------------------------------------- 1 | epochCount 100 2 | expName adhoc/uniform.bipartite1d 3 | hiddenLayerSize 100 4 | inBoolDim 0 5 | inDim 1 6 | initialSD 0.25 7 | inputType 1d.uniform 8 | isLowDim True 9 | layerNum 3 10 | learningRate 0.1 11 | loss l2squared 12 | minibatchSize 1000 13 | momentum 0.6 14 | oversampling 1.0 15 | plotEach 1 16 | trainSize 40000 17 | useReLU False 18 | validSize 10000 19 | -------------------------------------------------------------------------------- /adhoc/plane1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/wave2.txt 2 | epochCount 16000 3 | expName adhoc/plane1-d2 4 | gridSizeForInterpolation 50 5 | gridSizeForSampling 20 6 | height 28 7 | hiddenLayerSize 100 8 | inBoolDim 0 9 | inDim 2 10 | initialSD 0.25 11 | inputType 
plane 12 | layerNum 3 13 | learningRate 10 14 | minibatchSize 100 15 | momentum 0.6 16 | oversampling 1.0 17 | plotEach 10 18 | trainSize 40000 19 | useReLU False 20 | validSize 400 21 | width 28 22 | -------------------------------------------------------------------------------- /adhoc/sine1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/wave1.txt 2 | epochCount 16000 3 | expName adhoc/sine1 4 | gridSizeForInterpolation 30 5 | gridSizeForSampling 20 6 | height 28 7 | hiddenLayerSize 100 8 | inBoolDim 0 9 | inDim 2 10 | initialSD 2.0 11 | inputType sine 12 | layerNum 4 13 | learningRate 10 14 | minibatchSize 500 15 | momentum 0.6 16 | oversampling 1.0 17 | plotEach 100 18 | trainSize 40000 19 | useReLU False 20 | validSize 400 21 | width 28 22 | -------------------------------------------------------------------------------- /adhoc/spearmint-best-leaky.txt: -------------------------------------------------------------------------------- 1 | # identical to spearmint-best, but started with leaky relu. 
2 | # parent: 3 | # epoch 4800 trainMean 3.598628 trainMedian 3.688720 validationMean 3.907617 validationMedian 3.926451 4 | # this: 5 | # epoch 4800 trainMean 3.556957 trainMedian 3.631534 validationMean 3.892525 validationMedian 3.911150 6 | epochCount 9600 7 | everyNthInput 10 8 | expName adhoc/spearmint-best-leaky 9 | gridSizeForInterpolation 30 10 | gridSizeForSampling 20 11 | height 28 12 | hiddenLayerSize 673 13 | inBoolDim 0 14 | inDim 50 15 | initialSD 0.25 16 | inputDigit None 17 | inputType mnist 18 | layerNum 3 19 | reLULeakiness 0.01 20 | learningRate 1.0 21 | minibatchSize 1000 22 | momentum 0.969849416169 23 | oversampling 8.0 24 | plotEach 800 25 | useReLU True 26 | width 28 27 | -------------------------------------------------------------------------------- /adhoc/spearmint-best-leaky0.1.txt: -------------------------------------------------------------------------------- 1 | # identical to spearmint-best-leaky, but with extra leakiness: 2 | # reLULeakiness 0.01 -> 0.1 3 | # parent: 4 | # epoch 4800 trainMean 3.556957 trainMedian 3.631534 validationMean 3.892525 validationMedian 3.911150 5 | # this: 6 | # epoch 4800 trainMean 3.640023 trainMedian 3.720077 validationMean 3.930313 validationMedian 3.919364 7 | epochCount 9600 8 | everyNthInput 10 9 | expName adhoc/spearmint-best-leaky0.1 10 | gridSizeForInterpolation 30 11 | gridSizeForSampling 20 12 | height 28 13 | hiddenLayerSize 673 14 | inBoolDim 0 15 | inDim 50 16 | initialSD 0.25 17 | inputDigit None 18 | inputType mnist 19 | layerNum 3 20 | reLULeakiness 0.1 21 | learningRate 1.0 22 | minibatchSize 1000 23 | momentum 0.969849416169 24 | oversampling 8.0 25 | plotEach 800 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /adhoc/spearmint-best.txt: -------------------------------------------------------------------------------- 1 | # identical to 2 | # 
spearmintOutput/hls673-inDim50-layerNum3-lr1.0-mom0.969849416169-n1000-os8.0 3 | # that produced 4 | # epoch 4800 trainMean 3.591231 trainMedian 3.657815 validationMean 3.887801 validationMedian 3.889197 5 | # , but due to probably chance factors, this is a bit worse. 6 | # this: 7 | # epoch 4800 trainMean 3.598628 trainMedian 3.688720 validationMean 3.907617 validationMedian 3.926451 8 | epochCount 4800 9 | everyNthInput 10 10 | expName adhoc/spearmint-best 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 673 15 | inBoolDim 0 16 | inDim 50 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 1.0 22 | minibatchSize 1000 23 | momentum 0.969849416169 24 | oversampling 8.0 25 | plotEach 800 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /adhoc/speedtest.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # with a small epochCount and infinite plotEach for speed benchmarking. 3 | # this: 4 | # epoch 100 trainMean 4.712180 trainMedian 4.785045 validationMean 4.664932 validationMedian 4.719368 5 | epochCount 100 6 | everyNthInput 10 7 | expName adhoc/speedtest 8 | gridSizeForInterpolation 30 9 | gridSizeForSampling 20 10 | height 28 11 | hiddenLayerSize 600 12 | inBoolDim 0 13 | inDim 20 14 | initialSD 0.25 15 | inputDigit None 16 | inputType mnist 17 | layerNum 3 18 | learningRate 10 19 | minibatchSize 600 20 | momentum 0.6 21 | oversampling 4.0 22 | plotEach 100 23 | useReLU True 24 | width 28 25 | -------------------------------------------------------------------------------- /adhoc/speedtestgpu.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # with a small epochCount and infinite plotEach for speed benchmarking. 
3 | epochCount 50 4 | everyNthInput 10 5 | expName adhoc/speedtestgpu 6 | gridSizeForInterpolation 30 7 | gridSizeForSampling 20 8 | height 28 9 | hiddenLayerSize 1000 10 | inBoolDim 0 11 | inDim 20 12 | initialSD 0.25 13 | inputDigit None 14 | inputType mnist 15 | layerNum 4 16 | learningRate 10 17 | minibatchSize 100 18 | momentum 0.6 19 | oversampling 1.0 20 | plotEach 100000 21 | useReLU True 22 | width 28 23 | -------------------------------------------------------------------------------- /adhoc/uniform.bipartite1d.lrd.txt: -------------------------------------------------------------------------------- 1 | # BEWARE: bipartiteMatchingBased is not a parameter right now. 2 | # set bipartiteMatchingBased = True in the code. 3 | # 4 | # parent adhoc/uniform.bipartite1d 5 | # learningRateDecay 1.0 -> 0.97 6 | # Haven't fixed the bathtub issue, although converged to a slightly better optimum slightly faster. 7 | # parent: 8 | # epoch 200 epochInterimMean 0.036180 epochInterimMedian 0.029334 9 | # this: 10 | # epoch 100 epochInterimMean 0.032690 epochInterimMedian 0.026891 11 | epochCount 200 12 | expName adhoc/uniform.bipartite1d.lrd 13 | hiddenLayerSize 100 14 | inBoolDim 0 15 | inDim 1 16 | initialSD 0.25 17 | inputType 1d.uniform 18 | isLowDim True 19 | layerNum 3 20 | learningRate 0.1 21 | learningRateDecay 0.97 22 | minibatchSize 1000 23 | momentum 0.6 24 | oversampling 1.0 25 | plotEach 10 26 | trainSize 40000 27 | useReLU False 28 | validSize 10000 29 | -------------------------------------------------------------------------------- /adhoc/wave1.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # inputType mnist -> wave, useReLU True -> False, everyNthInput removed, inputDigit removed, trainSize added 40000, validSize added 5000 3 | epochCount 16000 4 | expName adhoc/wave1 5 | gridSizeForInterpolation 30 6 | gridSizeForSampling 20 7 | height 28 8 | hiddenLayerSize 600 9 | inBoolDim 0 10 
| inDim 20 11 | initialSD 0.25 12 | inputType wave 13 | layerNum 3 14 | learningRate 10 15 | minibatchSize 600 16 | momentum 0.6 17 | oversampling 4.0 18 | plotEach 1 19 | trainSize 40000 20 | useReLU False 21 | validSize 400 22 | waveCount 42 23 | width 28 24 | -------------------------------------------------------------------------------- /autoencoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lasagne import layers 4 | import numpy as np 5 | 6 | import sys 7 | import gzip 8 | import cPickle 9 | from PIL import Image 10 | 11 | from nnbase.layers import Unpool2DLayer 12 | from nnbase.utils import FlipBatchIterator 13 | ### this is really dumb, current nolearn doesnt play well with lasagne, 14 | ### so had to manually copy the file I wanted to this folder 15 | import nnbase.shape as shape 16 | 17 | import nnbase.inputs 18 | import nnbase.vis 19 | 20 | # This is very error-prone. 21 | # Optimally, there should be a guarantee that the 22 | # corpus loaded here is the same as the one that the 23 | # encoder was trained on. 24 | def loadCorpus(): 25 | face = True 26 | if face: 27 | directory = "../face/SCUT-FBP/thumb.big/" 28 | X, (height, width) = nnbase.inputs.faces(directory) 29 | else: 30 | X, (height, width) = nnbase.inputs.mnist() 31 | 32 | X = X.astype(np.float64).reshape((-1, 1, height, width)) 33 | mu, sigma = np.mean(X), np.std(X) 34 | print "mu, sigma:", mu, sigma 35 | return X, mu, sigma 36 | 37 | # TODO I don't think that .eval() is how this should work. 38 | def get_output_from_nn(last_layer, X): 39 | indices = np.arange(128, X.shape[0], 128) 40 | # not splitting into batches can cause a memory error 41 | X_batches = np.split(X, indices) 42 | out = [] 43 | for count, X_batch in enumerate(X_batches): 44 | out.append( layers.get_output(last_layer, X_batch).eval() ) 45 | return np.vstack(out) 46 | 47 | # This helper class deals with 48 | # 1. 
normalizing input and de-normalizing output 49 | # 2. reshaping output into shape compatible with input, namely (-1, 1, x ,y) 50 | class Autoencoder: 51 | # sigma and mu should be trained on the same corpus as the autoencoder itself. 52 | # This is error-prone! 53 | def __init__(self, ae, mu, sigma): 54 | self.ae = ae 55 | self.mu = mu 56 | self.sigma = sigma 57 | 58 | self.encode_layer_index = map(lambda pair : pair[0], self.ae.layers).index('encode_layer') 59 | self.encode_layer = self.ae.get_all_layers()[self.encode_layer_index] 60 | self.afterSplit = False 61 | 62 | # from unnormalized to unnormalized [0,1] MNIST. 63 | # ae is trained on normalized MNIST data. 64 | # For 0-1 clipped digits this should be close to the identity function. 65 | def predict(self, X): 66 | assert not self.afterSplit 67 | self.x, self.y = X.shape[-2:] 68 | flatOutput = self.ae.predict((X - self.mu) / self.sigma).reshape(X.shape) * self.sigma + self.mu 69 | return flatOutput.reshape((-1, 1, self.x, self.y)) 70 | 71 | def encode(self, X): 72 | self.x, self.y = X.shape[-2:] 73 | return get_output_from_nn(self.encode_layer, (X-self.mu)/self.sigma) 74 | 75 | # N.B after we do this, we won't be able to use the original autoencoder , as the layers are broken up 76 | def split(self): 77 | next_layer = self.ae.get_all_layers()[self.encode_layer_index + 1] 78 | self.final_layer = self.ae.get_all_layers()[-1] 79 | new_layer = layers.InputLayer(shape = (None, self.encode_layer.num_units)) 80 | next_layer.input_layer = new_layer 81 | self.afterSplit = True 82 | 83 | def decode(self, X): 84 | assert self.afterSplit 85 | flatOutput = get_output_from_nn(self.final_layer, X) * self.sigma + self.mu 86 | # Evil hack: decode only knows the shape of the input space 87 | # if you did a predict or encode previously. TODO Fix asap. 
88 | return flatOutput.reshape((-1, 1, self.x, self.y)) 89 | 90 | 91 | def main(): 92 | X_train, mu, sigma = loadCorpus() 93 | 94 | # autoencoderFile = "../lasagne-demo/conv_ae.pkl" # Trained on the full mnist train dataset 95 | autoencoderFile = "../lasagne-demo/face.big.pkl" # Trained on the ../face/SCUT-FBP/thumb.big dataset. 96 | 97 | ae_raw = cPickle.load(open(autoencoderFile, 'r')) 98 | autoencoder = Autoencoder(ae_raw, mu, sigma) 99 | 100 | sampleIndices = map(int, sys.argv[1:]) 101 | assert len(sampleIndices)==2, "the tool expects two sample indices" 102 | X_train = X_train[sampleIndices] 103 | 104 | X_pred = autoencoder.predict(X_train) 105 | print "ended prediction" 106 | sys.stdout.flush() 107 | 108 | nnbase.vis.get_random_images(X_train, X_pred) 109 | 110 | autoencoder.split() 111 | 112 | X_encoded = autoencoder.encode(X_train) 113 | 114 | x0 = X_encoded[0] 115 | x1 = X_encoded[1] 116 | stepCount = 100 117 | intervalBase = np.linspace(1, 0, num=stepCount) 118 | intervalEncoded = np.multiply.outer(intervalBase, x0)+np.multiply.outer(1.0-intervalBase, x1) 119 | 120 | X_decoded = autoencoder.decode(intervalEncoded) 121 | nnbase.vis.get_picture_array(X_decoded, 10, 10, "interval") 122 | 123 | intervalInputspace = np.multiply.outer(intervalBase, X_train[0])+np.multiply.outer(1.0-intervalBase, X_train[1]) 124 | nnbase.vis.get_picture_array(intervalInputspace, 10, 10, "interval-inputspace") 125 | 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "language" : "PYTHON", 3 | "main-file" : "spearmintTask.py", 4 | "experiment-name" : "initials", 5 | "variables" : { 6 | "inDim" : { 7 | "type" : "INT", 8 | "size" : 1, 9 | "min" : 1, 10 | "max" : 100 11 | }, 12 | "inBoolDim" : { 13 | "type" : "INT", 14 | "size" : 1, 15 | "min" : 0, 16 | "max" : 100 17 | } 18 | } 19 | } 20 
| -------------------------------------------------------------------------------- /deepDives/conf1.txt: -------------------------------------------------------------------------------- 1 | # Modified from spearmintExps/epoch1600/hls220-inDim20-lr10.1885871231-mom0.635511081693-n290-os4.0/conf.txt 2 | # this: 3 | # epoch 4800 trainMean 4.209222 trainMedian 4.325197 validationMean 4.234253 validationMedian 4.293222 4 | epochCount 100000 5 | everyNthInput 10 6 | expName deepDives/conf1-hls200-inDim20-lr10-mom0.6-n300-os4.0 7 | gridSizeForInterpolation 30 8 | gridSizeForSampling 20 9 | height 28 10 | hiddenLayerSize 200 11 | inDim 20 12 | inputDigit None 13 | inputType mnist 14 | layerNum 2 15 | learningRate 10 16 | minibatchSize 300 17 | momentum 0.6 18 | oversampling 4.0 19 | plotEach 400 20 | useReLU True 21 | width 28 22 | -------------------------------------------------------------------------------- /deepDives/conf10.txt: -------------------------------------------------------------------------------- 1 | # parent spearmintOutput/hls650-inDim75-layerNum3-lr10.5-mom0.74-n650-os6.0 2 | # within spearmintRun epochCount4800_depth3_4_useReLUTrue_everyNthInput10_bigger 3 | # only diffence is more data. 4 | # everyNthInput 10 -> 1, epochCount 4800 -> 960, plotEach 800 -> 80 5 | # parent: 6 | # epoch 4800 trainMean 3.614238 trainMedian 3.647176 validationMean 3.950973 validationMedian 3.987011 7 | # this: 8 | # epoch 480 trainMean 3.849584 trainMedian 3.910133 validationMean 3.801794 validationMedian 3.858063 9 | # (not directly comparable of course, more diverse training data here. 10 | # See the cool reverse in train and validation performance.) 
11 | epochCount 960 12 | everyNthInput 1 13 | expName deepDives/conf10 14 | gridSizeForInterpolation 30 15 | gridSizeForSampling 20 16 | height 28 17 | hiddenLayerSize 650 18 | inDim 75 19 | inBoolDim 0 20 | initialSD 0.25 21 | inputDigit None 22 | inputType mnist 23 | layerNum 3 24 | learningRate 10.5 25 | minibatchSize 650 26 | momentum 0.74 27 | oversampling 6.0 28 | plotEach 80 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /deepDives/conf11.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf10.txt 2 | # playing with decreasing learning rate until I get to 3 | # implement learning rate decay. 4 | # learningRate 10.5 -> 1.0 epochCount 960 -> 9600 5 | # parent: 6 | # epoch 480 trainMean 3.849584 trainMedian 3.910133 validationMean 3.801794 validationMedian 3.858063 7 | # this: 8 | # epoch 4800 trainMean 3.872003 trainMedian 3.936070 validationMean 3.811173 validationMedian 3.884217 9 | # (Note that these are more or less directly comparable because of the epochCount/learningRate trade-off. 10 | # Also, considering the uncertainty, these are identical.) 
11 | epochCount 9600 12 | everyNthInput 1 13 | expName deepDives/conf11 14 | gridSizeForInterpolation 30 15 | gridSizeForSampling 20 16 | height 28 17 | hiddenLayerSize 650 18 | inDim 75 19 | inBoolDim 0 20 | initialSD 0.25 21 | inputDigit None 22 | inputType mnist 23 | layerNum 3 24 | learningRate 1.0 25 | minibatchSize 650 26 | momentum 0.74 27 | oversampling 6.0 28 | plotEach 80 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /deepDives/conf2.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf1.txt 2 | # learningRate 10 -> 1 3 | # parent: 4 | # epoch 4800 trainMean 4.209222 trainMedian 4.325197 validationMean 4.234253 validationMedian 4.293222 5 | # epoch 30000 trainMean 4.135192 trainMedian 4.232051 validationMean 4.198376 validationMedian 4.211795 6 | # this: 7 | # epoch 4800 trainMean 4.375416 trainMedian 4.522119 validationMean 4.346554 validationMedian 4.408077 8 | # epoch 20000 trainMean 4.258607 trainMedian 4.409981 validationMean 4.273648 validationMedian 4.319705 9 | epochCount 100000 10 | everyNthInput 10 11 | expName deepDives/conf2-hls200-inDim20-lr1-mom0.6-n300-os4.0 12 | gridSizeForInterpolation 30 13 | gridSizeForSampling 20 14 | height 28 15 | hiddenLayerSize 200 16 | inDim 20 17 | inputDigit None 18 | inputType mnist 19 | layerNum 2 20 | learningRate 1 21 | minibatchSize 300 22 | momentum 0.6 23 | oversampling 4.0 24 | plotEach 400 25 | useReLU True 26 | width 28 27 | -------------------------------------------------------------------------------- /deepDives/conf3.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf1.txt 2 | # layerNum 2 -> 3 3 | # Playing a bit with depth 3 neural nets. 
4 | # parent: 5 | # epoch 4800 trainMean 4.209222 trainMedian 4.325197 validationMean 4.234253 validationMedian 4.293222 6 | # this: 7 | # epoch 4800 trainMean 3.938845 trainMedian 4.025593 validationMean 4.106030 validationMedian 4.138099 8 | epochCount 100000 9 | everyNthInput 10 10 | expName deepDives/conf3-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 200 15 | inDim 20 16 | inputDigit None 17 | inputType mnist 18 | layerNum 3 19 | learningRate 10 20 | minibatchSize 300 21 | momentum 0.6 22 | oversampling 4.0 23 | plotEach 400 24 | useReLU True 25 | width 28 26 | -------------------------------------------------------------------------------- /deepDives/conf4.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf3.txt 2 | # params unchanged, input is face/SCUT-FBP/thumb.big 3 | # everyNthInput=1 of course. 4 | # this: 5 | # epoch 60000 trainMean 3.207882 trainMedian 3.244128 validationMean 4.883027 validationMedian 4.791943 6 | # (epoch 60000 because of the super-small corpus size) 7 | epochCount 100000 8 | everyNthInput 1 9 | expName deepDives/conf4-faces-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0 10 | gridSizeForInterpolation 30 11 | gridSizeForSampling 20 12 | hiddenLayerSize 200 13 | imageDirectory ../face/SCUT-FBP/thumb.big/ 14 | inDim 20 15 | inputType image 16 | layerNum 3 17 | learningRate 10 18 | minibatchSize 300 19 | momentum 0.6 20 | oversampling 4.0 21 | plotEach 40 22 | useReLU True 23 | -------------------------------------------------------------------------------- /deepDives/conf5.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf3.txt 2 | # everyNthInput 10 -> 1, inputDigit None -> 6 3 | # this: 4 | # epoch 2400 trainMean 3.216107 trainMedian 3.153695 validationMean 3.394409 validationMedian 3.254620 5 | epochCount 100000 6 | 
everyNthInput 1 7 | expName deepDives/conf5-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0-digit6 8 | gridSizeForInterpolation 30 9 | gridSizeForSampling 20 10 | height 28 11 | hiddenLayerSize 200 12 | inDim 20 13 | inputDigit 6 14 | inputType mnist 15 | layerNum 3 16 | learningRate 10 17 | minibatchSize 300 18 | momentum 0.6 19 | oversampling 4.0 20 | plotEach 400 21 | useReLU False 22 | width 28 23 | -------------------------------------------------------------------------------- /deepDives/conf6.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf3.txt 2 | # This used sampleInitial gauss: np.random.normal(loc=0.0, scale=1.0/4, size=(n, inDim)) 3 | # Otherwise, it's the famous conf3. 4 | # parent: 5 | # epoch 4800 trainMean 3.938845 trainMedian 4.025593 validationMean 4.106030 validationMedian 4.138099 6 | # this: 7 | # epoch 4800 trainMean 3.909867 trainMedian 3.981797 validationMean 4.073384 validationMedian 4.109737 8 | epochCount 16000 9 | everyNthInput 10 10 | expName deepDives/conf6-gauss 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 200 15 | inDim 20 16 | inBoolDim 0 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 300 23 | momentum 0.6 24 | oversampling 4.0 25 | plotEach 400 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /deepDives/conf7.txt: -------------------------------------------------------------------------------- 1 | # parent: see below 2 | # This is identical to the current-best epoch4800-relu-cubemixture spearmintOutput found at 3 | # /Users/daniel/experiments/rbm/daniel-experiments/kohonen/spearmintOutput/hls300-inDim12-layerNum4-lr20.0-mom0.5-n300-os3.99999999824/conf.txt 4 | # but it's running with a straight gauss input distribution. 
5 | # Also, plotEach 800 -> 200, epochCount 4800 -> 48000 6 | # parent: 7 | # epoch 4800 trainMean 3.699963 trainMedian 3.767960 validationMean 4.109944 validationMedian 4.142234 8 | # this: 9 | # epoch 4800 trainMean 3.690358 trainMedian 3.760918 validationMean 4.082099 validationMedian 4.111330 10 | epochCount 48000 11 | everyNthInput 10 12 | expName deepDives/conf7-gauss 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 300 17 | inDim 12 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 4 23 | learningRate 20.0 24 | minibatchSize 300 25 | momentum 0.5 26 | oversampling 3.99999999824 27 | plotEach 200 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /deepDives/conf8.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf6.txt 2 | # I call it tultoltam: 3 | # hiddenLayerSize 200 -> 600, minibatchSize 300 -> 600 4 | # parent: 5 | # epoch 4800 trainMean 3.909867 trainMedian 3.981797 validationMean 4.073384 validationMedian 4.109737 6 | # this: 7 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 8 | epochCount 16000 9 | everyNthInput 10 10 | expName deepDives/conf8 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 600 15 | inDim 20 16 | inBoolDim 0 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 600 23 | momentum 0.6 24 | oversampling 4.0 25 | plotEach 400 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /deepDives/conf9.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # minibatchSize 600 -> 300, oversampling 4.0 -> 8.0 3 | # parent: 4 | # epoch 
4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 5 | # this: 6 | # epoch 2400 trainMean 3.724940 trainMedian 3.757624 validationMean 3.983316 validationMedian 4.018968 7 | # epoch 4800 trainMean 3.623366 trainMedian 3.650960 validationMean 3.984901 validationMedian 3.984115 8 | epochCount 16000 9 | everyNthInput 10 10 | expName deepDives/conf9 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 600 15 | inDim 20 16 | inBoolDim 0 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 300 23 | momentum 0.6 24 | oversampling 8.0 25 | plotEach 400 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /distances.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | 4 | import numpy as np 5 | import theano 6 | import theano.tensor as T 7 | import lasagne 8 | 9 | 10 | import theano.sandbox.rng_mrg 11 | 12 | def logg(*ss): 13 | s = " ".join(map(str,ss)) 14 | sys.stderr.write(s+"\n") 15 | 16 | def start(s): 17 | global startTime 18 | global phase 19 | phase = s 20 | logg(phase+".") 21 | startTime = time.clock() 22 | 23 | def end(s=None): 24 | global startTime 25 | global phase 26 | if s is not None: 27 | phase = s 28 | endTime = time.clock() 29 | logg(phase,"finished in",endTime-startTime,"seconds.") 30 | 31 | 32 | def randomMatrix(n, f): 33 | return np.random.normal(size=n*f).astype(np.float32).reshape((n, f)) 34 | 35 | 36 | def distanceMatrix(x, y): 37 | xL2S = np.sum(x*x, axis=-1) 38 | yL2S = np.sum(y*y, axis=-1) 39 | xL2SM = np.tile(xL2S, (len(y), 1)) 40 | yL2SM = np.tile(yL2S, (len(x), 1)) 41 | squaredDistances = xL2SM + yL2SM.T - 2.0*y.dot(x.T) 42 | distances = np.sqrt(squaredDistances+1e-6) # elementwise. +1e-6 is to supress sqrt-of-negative warning. 
43 | return distances 44 | 45 | 46 | # Newer theano builds allow tile() with scalar variable as reps. 47 | # https://github.com/Theano/Theano/pull/2875 48 | # That could make this nicer. 49 | # The worst thing about it is that it the constructed calculation 50 | # silently fails when given smaller datasets. 51 | # TODO If there's no easy fix, at least wrap closest_fn into 52 | # a mini-class that verifies sizes. 53 | def constructSquaredDistanceMatrixVariable(x, y, n, m): 54 | # ([n, f] , [m, f]) -> (n, m) 55 | xL2S = T.sum(x*x, axis=-1) # [n] 56 | yL2S = T.sum(y*y, axis=-1) # [m] 57 | xL2SM = T.zeros((m, n)) + xL2S # broadcasting, [m, n] 58 | yL2SM = T.zeros((n, m)) + yL2S # # broadcasting, [n, m] 59 | 60 | squaredDistances = xL2SM.T + yL2SM - 2.0*T.dot(x, y.T) # [n, m] 61 | return squaredDistances 62 | 63 | def constructSDistanceMatrixFunction(n, m): 64 | x = T.matrix('x') 65 | y = T.matrix('y') 66 | sDistances = constructSquaredDistanceMatrixVariable(x, y, n, m) 67 | return theano.function([x, y], sDistances) 68 | 69 | # For each y, it returns the index of the closest x in L2 distance. 70 | # x is [n, f], y is [m, f] for some f. Output is [m], the values are in range(n). 71 | def constructMinimalDistanceIndicesVariable(x, y, n, m): 72 | sDistances = constructSquaredDistanceMatrixVariable(x, y, n, m) 73 | lamblinsTrick = False 74 | if lamblinsTrick: 75 | # https://github.com/Theano/Theano/issues/1399 76 | # https://gist.github.com/danielvarga/d0eeacea92e65b19188c 77 | # https://groups.google.com/forum/#!topic/theano-users/E7ProqnGUMk 78 | s = sDistances 79 | bestIndices = T.cast( ( T.arange(n).dimshuffle(0, 'x') * T.cast(T.eq(s, s.min(axis=0, keepdims=True)), 'float32') ).sum(axis=0), 'int32') 80 | # This is a heavy-handed workaround for the fact that in 81 | # lamblin's hack, ties lead to completely screwed results. 
82 | bestIndices = T.clip(bestIndices, 0, n-1) 83 | else: 84 | bestIndices = T.argmin(sDistances, axis=0) 85 | return bestIndices 86 | 87 | 88 | # The theano.function returned by this is usually called like this: 89 | # closest_fn(sampled, data), output is an index (pointing to a sampled row) 90 | # for each row of data. 91 | # 92 | # To elaborate: 93 | # n = candidateCount, m = targetCount, 94 | # typically candidateCount = sampleSize, targetCount = minibatchSize 95 | # BEWARE: super confusingly, in generativeMLP.py sampleSize is called m. 96 | # 97 | # See testMinimalDistanceIndicesFunction for how to turn indices into samples and distances. 98 | def constructMinimalDistanceIndicesFunction(n, m): 99 | x = T.matrix('x') 100 | y = T.matrix('y') 101 | bestIndices = constructMinimalDistanceIndicesVariable(x, y, n, m) 102 | return theano.function([x, y], bestIndices) 103 | 104 | def constructMinimalDistancesVariable(x, y, initials, n, m): 105 | sDistances = constructSquaredDistanceMatrixVariable(x, y, n, m) 106 | bestIndices = T.argmin(sDistances, axis=0) 107 | bestXes = x[bestIndices] 108 | bestInitials = initials[bestIndices] 109 | return bestXes, bestInitials 110 | 111 | def constructMinimalDistancesFunction(n, m): 112 | x = T.matrix('x') 113 | y = T.matrix('y') 114 | initials = T.matrix('initials') 115 | bestXes, bestInitials = constructMinimalDistancesVariable(x, y, initials, n, m) 116 | return theano.function([x, y], bestXes) 117 | 118 | 119 | def testMinimalDistanceIndicesFunction(batchSize, sampleSize, featureDim): 120 | closest_fn = constructMinimalDistanceIndicesFunction(sampleSize, batchSize) 121 | 122 | data = randomMatrix(batchSize, featureDim) 123 | sampled = randomMatrix(sampleSize, featureDim) 124 | 125 | import kohonen 126 | start("CPU nearest neighbors") 127 | distances = kohonen.distanceMatrix(sampled, data) 128 | assert distances.shape == (len(data), len(sampled)) # Beware the transpose! 
129 | bestIndicesByCPU = np.argmin(distances, axis=1) 130 | closestSampledByCPU = sampled[bestIndicesByCPU] 131 | bestDistancesByCPU = np.linalg.norm(data-closestSampledByCPU, axis=1) 132 | end() 133 | 134 | start("GPU nearest neighbors") 135 | bestIndicesByGPU = closest_fn(sampled, data) 136 | # The next two steps are practically instant. 137 | closestSampledByGPU = sampled[bestIndicesByGPU] 138 | bestDistancesByGPU = np.linalg.norm(data-closestSampledByGPU, axis=1) 139 | end() 140 | 141 | print "total bestDistances CPU", bestDistancesByCPU.sum() 142 | print "total bestDistances GPU", bestDistancesByGPU.sum() 143 | 144 | 145 | # This class is a cheap workaround for the fact that I didn't manage to create 146 | # a shape-independent constructMinimalDistanceIndicesFunction. 147 | # It only works if the set of possible shapes is very small, otherwise Theano compilation 148 | # becomes the bottleneck. 149 | class ClosestFnFactory: 150 | def __init__(self): 151 | self.cache = {} 152 | def __call__(self, *args): 153 | assert len(args)==2 154 | sampled, data = args 155 | shape = (len(sampled), len(data)) 156 | candidateCount, targetCount = shape 157 | if shape not in self.cache.keys(): 158 | logg("Adding to ClosestFnFactory cache, shape %s" % str(shape)) 159 | closest_fn = constructMinimalDistanceIndicesFunction(candidateCount, targetCount) 160 | self.cache[shape] = closest_fn 161 | else: 162 | closest_fn = self.cache[shape] 163 | return closest_fn(sampled, data) 164 | 165 | 166 | # A cool little toy learning problem: 167 | # We want to learn a translated 2D standard normal's translation, that's a 2D vector. 168 | # We generate batchSize samples from this target distribution. 169 | # We generate sampleSize samples from our current best bet for the distribution. 170 | # We find the closest generated sample to each target sample. 171 | # We calculate the sum of distances. 172 | # That's the loss that we optimize by gradient descent. 
173 | # Note that Theano doesn't even break a sweat when doing backprop 174 | # through a layer of distance minimization. 175 | # Of course that's less impressive than it first sounds, because 176 | # locally, the identity of the nearest target sample never changes. 177 | def toyLearner(): 178 | batchSize = 2000 179 | sampleSize = 2000 180 | inDim = 2 181 | srng = theano.sandbox.rng_mrg.MRG_RandomStreams(seed=234) 182 | 183 | dataVar = T.matrix("data") 184 | initialsVar = srng.normal((sampleSize, inDim)) 185 | parametersVar = theano.shared(np.zeros(inDim, dtype=np.float32), "parameters") 186 | generatedVar = initialsVar + parametersVar # broadcast 187 | 188 | 189 | bestXesVar, bestInitialsVar = constructMinimalDistancesVariable(generatedVar, dataVar, initialsVar, sampleSize, batchSize) 190 | 191 | deltaVar = bestXesVar - dataVar 192 | # mean over samples AND feature coordinates! 193 | # Very frightening fact: with .sum() here, the learning process diverges. 194 | lossVar = (deltaVar*deltaVar).mean() 195 | 196 | updates = lasagne.updates.nesterov_momentum( 197 | lossVar, [parametersVar], learning_rate=0.2, momentum=0.0) 198 | 199 | train_fn = theano.function([dataVar], updates=updates) 200 | 201 | for epoch in range(1000): 202 | data = randomMatrix(batchSize, inDim) + np.array([-5.0, 12.0], dtype=np.float32) 203 | train_fn(data) 204 | print parametersVar.get_value() 205 | 206 | def distanceSpeedTest(): 207 | # I'm not using variable names n and m, because unfortunately 208 | # the order is switched between sampleAndUpdate() and 209 | # constructDistanceMatrixFunction(). 
210 | batchSize = 3000 211 | oversampling = 4.324 212 | sampleSize = int(batchSize*oversampling) 213 | f = 28*28 214 | np.random.seed(0) 215 | data = randomMatrix(batchSize, f) 216 | generated = randomMatrix(sampleSize, f) 217 | 218 | dm_fn = constructSDistanceMatrixFunction(sampleSize, batchSize) 219 | 220 | md_fn = constructMinimalDistancesFunction(sampleSize, batchSize) 221 | 222 | start("minimal distances theano") 223 | bestXes = md_fn(generated, data) 224 | print bestXes.shape 225 | print np.sum(bestXes) 226 | end() 227 | 228 | start("all distances theano") 229 | ds = dm_fn(generated, data) 230 | print ds.shape 231 | print np.sum(ds) 232 | end() 233 | 234 | start("all distances numpy") 235 | ds = distanceMatrix(generated, data) 236 | print ds.shape 237 | print np.sum(ds) 238 | end() 239 | 240 | if __name__ == "__main__": 241 | whichTest = sys.argv[1] 242 | assert whichTest in ("distances", "toyLearner", "speeds") 243 | if whichTest=="distances": 244 | testMinimalDistanceIndicesFunction(batchSize=3000, sampleSize=12972, featureDim=28*28) 245 | elif whichTest=="speeds": 246 | distanceSpeedTest() 247 | elif whichTest=="toyLearner": 248 | toyLearner() 249 | -------------------------------------------------------------------------------- /docs/charts/1d/triangle.attempt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/triangle.attempt.png -------------------------------------------------------------------------------- /docs/charts/1d/triangle.bipartite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/triangle.bipartite.png -------------------------------------------------------------------------------- /docs/charts/1d/triangle.goal.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/triangle.goal.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.attempt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.attempt.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.bipartite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.bipartite.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.goal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.goal.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.os0.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os0.5.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.os1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os1.png 
-------------------------------------------------------------------------------- /docs/charts/1d/uniform.os2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os2.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.os5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os5.png -------------------------------------------------------------------------------- /earthMover.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import gzip 3 | import sys 4 | import os 5 | import time 6 | import random 7 | import math 8 | from operator import itemgetter 9 | 10 | import numpy as np 11 | 12 | import theano 13 | import theano.tensor as T 14 | import lasagne 15 | 16 | import kohonen # TODO This should only be used on the abandoned bipartiteMatchingBased==True codepath. 
17 | import evaluate 18 | import distances 19 | 20 | import nnbase.inputs 21 | import nnbase.vis 22 | from nnbase.attrdict import AttrDict 23 | 24 | # These are only included to make the unpickling of the autoencoder possible: 25 | from nnbase.layers import Unpool2DLayer 26 | from nnbase.shape import ReshapeLayer 27 | from nnbase.utils import FlipBatchIterator 28 | 29 | L1_LOSS = "l1" 30 | L2_SQUARED_LOSS = "l2squared" 31 | # The weird name is because I really don't want to accidentally use this instead of L2_SQUARED_LOSS: 32 | L2_UNSQUARED_LOSS = "l2unsquared" 33 | 34 | 35 | def logg(*ss): 36 | s = " ".join(map(str,ss)) 37 | sys.stderr.write(s+"\n") 38 | 39 | 40 | def buildConvNet(input_var, layerNum, inDim, hidden, outDim, useReLU, leakiness=0.0): 41 | # ('hidden', layers.DenseLayer), 42 | # ('unflatten', ReshapeLayer), 43 | # ('unpool', Unpool2DLayer), 44 | # ('deconv', layers.Conv2DLayer), 45 | # ('output_layer', ReshapeLayer), 46 | # TODO Copypasted, refactor. 47 | if useReLU: 48 | if leakiness==0.0: 49 | nonlinearity = lasagne.nonlinearities.rectify 50 | gain = 'relu' 51 | else: 52 | nonlinearity = lasagne.nonlinearities.LeakyRectify(leakiness) 53 | gain = math.sqrt(2/(1+leakiness**2)) 54 | else: 55 | nonlinearity = lasagne.nonlinearities.tanh 56 | gain = 1.0 57 | 58 | filter_sizes = 7 59 | conv_filters = 32 60 | deconv_filters = 32 61 | width = 28 # TODO MNIST specific! 
62 | height = 28 63 | 64 | l_in = lasagne.layers.InputLayer(shape=(None, inDim), 65 | input_var=input_var) 66 | l_hid = lasagne.layers.DenseLayer( 67 | l_in, num_units=hidden, 68 | nonlinearity=nonlinearity, 69 | W=lasagne.init.GlorotUniform(gain=gain)) 70 | hid2_num_units= deconv_filters * (height + filter_sizes - 1) * (width + filter_sizes - 1) / 4 71 | l_hid2 = lasagne.layers.DenseLayer( 72 | l_hid, num_units=hid2_num_units, 73 | nonlinearity=nonlinearity, 74 | W=lasagne.init.GlorotUniform(gain=gain)) 75 | l_unflatten = ReshapeLayer( 76 | l_hid2, shape=(([0], deconv_filters, (height + filter_sizes - 1) / 2, (width + filter_sizes - 1) / 2 ))) 77 | l_unpool = Unpool2DLayer( 78 | l_unflatten, ds=(2, 2)) 79 | l_deconv = lasagne.layers.Conv2DLayer( 80 | l_unpool, num_filters=1, filter_size = (filter_sizes, filter_sizes), 81 | border_mode="valid", nonlinearity=None) 82 | l_output = ReshapeLayer( 83 | l_deconv, shape = (([0], -1))) 84 | return l_output 85 | 86 | def buildNet(input_var, layerNum, inDim, hidden, outDim, useReLU, leakiness=0.0): 87 | if useReLU: 88 | if leakiness==0.0: 89 | nonlinearity = lasagne.nonlinearities.rectify 90 | gain = 'relu' 91 | else: 92 | nonlinearity = lasagne.nonlinearities.LeakyRectify(leakiness) 93 | gain = math.sqrt(2/(1+leakiness**2)) 94 | else: 95 | nonlinearity = lasagne.nonlinearities.tanh 96 | gain = 1.0 97 | assert layerNum in (2,3,4) 98 | 99 | l_in = lasagne.layers.InputLayer(shape=(None, inDim), 100 | input_var=input_var) 101 | l_hid = lasagne.layers.DenseLayer( 102 | l_in, num_units=hidden, 103 | nonlinearity=nonlinearity, 104 | W=lasagne.init.GlorotUniform(gain=gain)) 105 | if layerNum==2: 106 | l_out = lasagne.layers.DenseLayer( 107 | l_hid, num_units=outDim, 108 | nonlinearity=nonlinearity, 109 | W=lasagne.init.GlorotUniform(gain=gain)) 110 | elif layerNum==3: 111 | l_hid2 = lasagne.layers.DenseLayer( 112 | l_hid, num_units=hidden, 113 | nonlinearity=nonlinearity, 114 | W=lasagne.init.GlorotUniform(gain=gain)) 115 | l_out = 
lasagne.layers.DenseLayer( 116 | l_hid2, num_units=outDim, 117 | nonlinearity=nonlinearity, 118 | W=lasagne.init.GlorotUniform(gain=gain)) 119 | elif layerNum==4: 120 | l_hid2 = lasagne.layers.DenseLayer( 121 | l_hid, num_units=hidden, 122 | nonlinearity=nonlinearity, 123 | W=lasagne.init.GlorotUniform(gain=gain)) 124 | l_hid3 = lasagne.layers.DenseLayer( 125 | l_hid2, num_units=hidden, 126 | nonlinearity=nonlinearity, 127 | W=lasagne.init.GlorotUniform(gain=gain)) 128 | l_out = lasagne.layers.DenseLayer( 129 | l_hid3, num_units=outDim, 130 | nonlinearity=nonlinearity, 131 | W=lasagne.init.GlorotUniform(gain=gain)) 132 | return l_out 133 | 134 | def sampleInitial(n, inDim, sd, inBoolDim): 135 | continuous = np.random.normal(loc=0.0, scale=sd, size=(n, inDim)).astype(np.float32) 136 | discrete = np.random.randint(0, 2, (n, inBoolDim)) 137 | continuous[:, :inBoolDim] += discrete 138 | return continuous 139 | 140 | def sampleSourceParametrized(net_fn, n, inDim, sd, inBoolDim): 141 | initial = sampleInitial(n, inDim, sd, inBoolDim) 142 | return initial, net_fn(initial) 143 | 144 | def constructSamplerFunction(input_var, net): 145 | output = lasagne.layers.get_output(net) 146 | net_fn = theano.function([input_var], output) 147 | return net_fn 148 | 149 | def constructTrainFunction(input_var, net, learningRate, momentum, regularization, lossType=L2_SQUARED_LOSS): 150 | output = lasagne.layers.get_output(net) 151 | data_var = T.matrix('targets') 152 | if lossType==L1_LOSS: 153 | loss = T.abs_(output-data_var).mean() 154 | elif lossType==L2_SQUARED_LOSS: 155 | loss = lasagne.objectives.squared_error(output, data_var).mean() 156 | elif lossType==L2_UNSQUARED_LOSS: 157 | lossSqr = ((output-data_var)**2).sum(axis=1) 158 | loss = T.sqrt(lossSqr+1e-6).mean() # Fudge constant to avoid numerical stability issues. 
159 | else: 160 | assert False, "unknown similarity loss function: %s" % lossType 161 | 162 | if regularization!=0.0: 163 | logg('regularization', regularization) 164 | loss += lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2) * regularization 165 | 166 | params = lasagne.layers.get_all_params(net, trainable=True) 167 | 168 | updates = lasagne.updates.nesterov_momentum( 169 | loss, params, learning_rate=learningRate, momentum=momentum) 170 | # The rmsprop update rule is tricky. Properties (as measured on conf8): 171 | # - Converges twice as fast at the beginning. 172 | # - Goes way below nesterov on trainMean. 173 | # - ...which implies that s*.png is visually better, but that's just overfitting, because it 174 | # - reaches approx. the same performance as nesterov on validationMean, 175 | # - and visually it does not improve on diff_validation after convergence on validationMean. 176 | # - Performance has a hockey-stick dependence on epsilon: 177 | # Smaller epsilon is better until 0.0001, and then at 0.00001 it explodes. 178 | # updates = lasagne.updates.rmsprop(loss, params, epsilon=0.0001) 179 | 180 | train_fn = theano.function([input_var, data_var], updates=updates) 181 | return train_fn 182 | 183 | def sampleAndUpdate(train_fn, net_fn, closestFnFactory, inDim, sampleSource, n, data=None, m=None, innerGradientStepCount=1): 184 | if data is None: 185 | data = kohonen.samplesFromTarget(n) # TODO Refactor, I can't even change the goddamn target distribution in this source file! 186 | else: 187 | assert len(data)==n 188 | if m is None: 189 | m = n 190 | 191 | initial, sampled = sampleSource(net_fn, m, inDim) 192 | 193 | doDetailed1DVis = True and (data.shape[1]==1) 194 | 195 | bipartiteMatchingBased = False 196 | if bipartiteMatchingBased: 197 | if data.shape[1]==1: 198 | # In 1d we can actually solve the weighted bipartite matching 199 | # problem, by sorting. Basically that's what Magdon-Ismail and Atiya do. 
200 | assert len(data)==len(initial) 201 | data.sort(axis=0) 202 | pairs = sorted(zip(sampled, initial)) 203 | sampled = np.array(map(itemgetter(0), pairs)) 204 | initial = np.array(map(itemgetter(1), pairs)) 205 | else: 206 | # Pretty much obsoleted, because it can't be made fast. 207 | # Does a full weighted bipartite matching. 208 | # Left here for emotional reasons. 209 | permutation = kohonen.optimalPairing(sampled, data) 210 | initial = initial[permutation] 211 | sampled = sampled[permutation] 212 | else: 213 | # TODO We had this cool findGenForData=False experiment here 214 | # TODO that didn't go anywhere at first, but we shouldn't let it go this easily. 215 | findGenForData = True 216 | if findGenForData: 217 | bestIndices = closestFnFactory(sampled, data) 218 | initial = initial[bestIndices] 219 | sampled = sampled[bestIndices] 220 | else: 221 | bestIndices = closestFnFactory(data, sampled) 222 | data = data[bestIndices] 223 | 224 | bestDists = np.linalg.norm(data-sampled, axis=1) 225 | 226 | for i in range(innerGradientStepCount): 227 | # That's where the update happens. 228 | train_fn(initial, data) 229 | 230 | if doDetailed1DVis and random.randrange(100)==0: 231 | postSampled = net_fn(initial) 232 | nnbase.vis.gradientMap1D(data, sampled, postSampled, "gradient") 233 | 234 | # These values are a byproduct of the training step, 235 | # so they are from _before_ the training, not after it. 
236 | return bestDists 237 | 238 | 239 | def lowDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger): 240 | n, dim = data.shape 241 | inDim = params.inDim 242 | initial, sampled = sampleSource(net_fn, n, inDim) 243 | nnbase.vis.heatmap(sampled, params.expName+"/heatmap"+str(epoch)) 244 | 245 | 246 | def highDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger): 247 | height, width = params.height, params.width 248 | expName = params.expName 249 | 250 | # TODO This is mixing the responsibilities of evaluation and visualization: 251 | # TODO train_distance and validation_distance are calculated on only visImageCount images. 252 | doValidation = True 253 | if doValidation: 254 | start_time = time.time() 255 | visImageCount = params.gridSizeForSampling ** 2 256 | visualizedValidation = validation[:visImageCount] 257 | visualizedData = data[:visImageCount] 258 | trainMean, trainMedian = evaluate.fitAndVis(visualizedData, 259 | net_fn, closestFnFactory, sampleSource, params.inDim, 260 | height, width, params.gridSizeForSampling, name=expName+"/diff_train"+str(epoch)) 261 | validationMean, validationMedian = evaluate.fitAndVis(visualizedValidation, 262 | net_fn, closestFnFactory, sampleSource, params.inDim, 263 | height, width, params.gridSizeForSampling, name=expName+"/diff_validation"+str(epoch)) 264 | print >> logger, "epoch %d trainMean %f trainMedian %f validationMean %f validationMedian %f" % ( 265 | epoch, trainMean, trainMedian, validationMean, validationMedian) 266 | print >> logger, "time elapsed %f" % (time.time() - start_time) 267 | logger.flush() 268 | 269 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/xy"+str(epoch), 270 | height, width, fromGrid=True, gridSize=params.gridSizeForInterpolation, plane=(0,1)) 271 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/yz"+str(epoch), 272 | height, width, fromGrid=True, 
gridSize=params.gridSizeForInterpolation, plane=(1,2)) 273 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/xz"+str(epoch), 274 | height, width, fromGrid=True, gridSize=params.gridSizeForInterpolation, plane=(0,2)) 275 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/s"+str(epoch), 276 | height, width, fromGrid=False, gridSize=params.gridSizeForSampling, sampleSourceFunction=sampleSource) 277 | 278 | with open(expName+"/som-generator.pkl", 'w') as f: 279 | cPickle.dump(net, f) 280 | 281 | 282 | def train(data, validation, params, logger=None): 283 | if logger is None: 284 | logger = sys.stdout 285 | 286 | isLowDim = "isLowDim" in params and params.isLowDim 287 | 288 | if isLowDim: 289 | nnbase.vis.heatmap(data, params.expName+"/input") 290 | else: 291 | # Have to do before flattening: 292 | nnbase.vis.plotImages(data[:params.gridSizeForSampling**2], params.gridSizeForSampling, params.expName+"/input") 293 | 294 | # My network works with 1D input. 295 | data = nnbase.inputs.flattenImages(data) 296 | validation = nnbase.inputs.flattenImages(validation) 297 | 298 | m = int(params.oversampling*params.minibatchSize) 299 | 300 | outDim = data.shape[1] # Flattening already happened. 
301 | if "height" in params: 302 | assert params.height * params.width == outDim 303 | 304 | input_var = T.matrix('inputs') 305 | leakiness = 0.0 if 'reLULeakiness' not in params else params.reLULeakiness 306 | if not params.useReLU: 307 | assert leakiness==0.0, "reLULeakiness not allowed for tanh activation" 308 | if 'convolutional' in params and params.convolutional: 309 | net = buildConvNet(input_var, params.layerNum, params.inDim, params.hiddenLayerSize, outDim, 310 | useReLU=params.useReLU, leakiness=leakiness) 311 | else: 312 | net = buildNet(input_var, params.layerNum, params.inDim, params.hiddenLayerSize, outDim, 313 | useReLU=params.useReLU, leakiness=leakiness) 314 | 315 | minibatchCount = len(data)/params.minibatchSize 316 | 317 | regularization = 0.0 if 'regularization' not in params else params.regularization # L2 318 | 319 | innerGradientStepCount = 1 if 'innerGradientStepCount' not in params else params.innerGradientStepCount 320 | 321 | lossType = params.loss if "loss" in params else L2_SQUARED_LOSS 322 | 323 | learningRate_shared = theano.shared(np.array(params.learningRate, dtype=np.float32)) 324 | 325 | # Per epoch, which means that this is super-sensitive to epoch size. 326 | learningRateDecay = np.float32(1.0 if 'learningRateDecay' not in params else params.learningRateDecay) 327 | 328 | train_fn = constructTrainFunction(input_var, net, learningRate_shared, params.momentum, regularization, lossType) 329 | net_fn = constructSamplerFunction(input_var, net) 330 | closestFnFactory = distances.ClosestFnFactory() 331 | 332 | sampleSource = lambda net_fn, n, inDim: sampleSourceParametrized(net_fn, n, inDim, params.initialSD, params.inBoolDim) 333 | 334 | validationMean = 1e10 # ad hoc inf-like value. 335 | 336 | # The reason for the +1 is that this way, if 337 | # epochCount is a multiple of plotEach, then the 338 | # last thing that happens is an evaluation. 
339 | for epoch in range(params.epochCount+1): 340 | shuffledData = np.random.permutation(data) 341 | epochDistances = [] 342 | for i in range(minibatchCount): 343 | dataBatch = shuffledData[i*params.minibatchSize:(i+1)*params.minibatchSize] 344 | 345 | # The issue with using a minibatchSize that's not a divisor of corpus size 346 | # is that m is calculated before the epoch loop. This is not trivial to fix, 347 | # because constructMinimalDistanceIndicesFunction gets n and m as args. 348 | assert params.minibatchSize==len(dataBatch) 349 | 350 | minibatchDistances = sampleAndUpdate(train_fn, net_fn, closestFnFactory, params.inDim, sampleSource, 351 | n=params.minibatchSize, data=dataBatch, m=m, 352 | innerGradientStepCount=innerGradientStepCount) 353 | epochDistances.append(minibatchDistances) 354 | epochDistances = np.array(epochDistances) 355 | epochInterimMean = epochDistances.mean() 356 | epochInterimMedian = np.median(epochDistances) 357 | 358 | # Remove the "epoch != 0" if you are trying to catch evaluation crashes. 359 | if epoch % params.plotEach == 0 and epoch != 0: 360 | print >> logger, "epoch %d epochInterimMean %f epochInterimMedian %f" % (epoch, epochInterimMean, epochInterimMedian) 361 | print >> logger, "learningRate", learningRate_shared.get_value() 362 | if isLowDim: 363 | lowDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger) 364 | else: 365 | highDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger) 366 | 367 | 368 | learningRate_shared.set_value( learningRateDecay * learningRate_shared.get_value() ) 369 | 370 | return validationMean # The last calculated one, we don't recalculate. 371 | 372 | 373 | def setupAndRun(params): 374 | data, validation = nnbase.inputs.readData(params) 375 | # We dump after readData() because it augments params 376 | # with width/height deduced from the input data. 
377 | nnbase.inputs.dumpParams(params, file(params.expName+"/conf.txt", "w")) 378 | 379 | isLowDim = "isLowDim" in params and params.isLowDim 380 | 381 | with file(params.expName+"/log.txt", "w") as logger: 382 | if not isLowDim: 383 | meanDist, medianDist = evaluate.fitAndVisNNBaselineMain(data, validation, params) 384 | print >> logger, "nnbaselineMean %f nnbaselineMedian %f" % (meanDist, medianDist) 385 | 386 | value = train(data, validation, params, logger) 387 | print >> logger, "final performance %f" % value 388 | 389 | return value 390 | 391 | def sampleAndPlot(net_fn, inDim, initialSD, inBoolDim, n, name): 392 | initial, sampled = sampleSourceParametrized(net_fn, n, inDim, initialSD, inBoolDim) 393 | nnbase.vis.plot(sampled, name) 394 | 395 | def mainLowDim(expName, minibatchSize, initialSD): 396 | inDim = 2 397 | outDim = 2 398 | layerNum = 3 399 | hidden = 100 400 | input_var = T.matrix('inputs') 401 | net = buildNet(input_var, layerNum, inDim, hidden, outDim, useReLU=False) 402 | train_fn = constructTrainFunction(input_var, net) 403 | net_fn = constructSamplerFunction(input_var, net) 404 | for i in range(100): 405 | print i, 406 | sys.stdout.flush() 407 | sampleAndUpdate(train_fn, net_fn, inDim, n=minibatchSize) 408 | sampleAndPlot(net_fn, inDim, initialSD, 1000, expName+"/d"+str(i)) 409 | print 410 | 411 | 412 | def setDefaultParams(): 413 | params = AttrDict() 414 | params.inputType = "mnist" 415 | 416 | if params.inputType=="image": 417 | params.imageDirectory = "../face/SCUT-FBP/thumb.big/" 418 | params.gridSizeForSampling = 10 419 | params.gridSizeForInterpolation = 20 420 | params.plotEach = 1000 421 | elif params.inputType=="mnist": 422 | params.inputDigit = None 423 | params.everyNthInput = 10 424 | params.gridSizeForSampling = 20 425 | params.gridSizeForInterpolation = 30 426 | params.plotEach = 100 # That's too small for params.inputDigit = None, params.everyNthInput = 1 427 | else: 428 | assert False, "unknown inputType" 429 | 430 | # values 
coming from adhoc/spearmint-best-leaky.txt 431 | 432 | params.inDim = 50 433 | params.inBoolDim = 0 434 | params.initialSD = 0.25 435 | params.minibatchSize = 1000 436 | # m = oversampling*minibatchSize, that's how many 437 | # generated samples do we pair with our minibatchSize gold samples. 438 | params.oversampling = 8.0 439 | params.hiddenLayerSize = 673 440 | params.layerNum = 3 441 | params.useReLU = True 442 | params.reLULeakiness = 0.01 443 | params.learningRate = 1.0 444 | params.momentum = 0.969849416169 445 | # in experiment regularization_initialSD used 6400 here, but that's 446 | # not nice to Spearmint, as validation optimum is usually 447 | # at 4800, and I don't have early stopping implemented. 448 | params.epochCount = 4800 449 | params.plotEach = 800 450 | return params 451 | 452 | 453 | SHORTENED_PARAM_NAMES = { "learningRate":"lr", "minibatchSize":"n", 454 | "momentum":"mom", "hiddenLayerSize":"hls", 455 | "oversampling":"os"} 456 | 457 | def spearmintDirName(spearmintParams): 458 | pairs = [] 459 | for k in sorted(spearmintParams.keys()): 460 | v = spearmintParams[k] 461 | assert len(v)==1 462 | v = v[0] 463 | if k in SHORTENED_PARAM_NAMES: 464 | k = SHORTENED_PARAM_NAMES[k] 465 | # TODO if v is a float, normalize it. (0.2000001 and 0.199999 to 0.2) 466 | pairs.append((k, str(v))) 467 | pairs.sort() 468 | return "-".join(map(lambda (k,v): k+v, pairs)) 469 | 470 | def spearmintEntry(spearmintParams): 471 | params = setDefaultParams() 472 | for k,v in spearmintParams.iteritems(): 473 | # v[0] because we only work with single values, and those are 1-element ndarrays in spearmint 474 | assert len(v)==1 475 | # We want int32 and float32, not the 64bit versions provided by spearmint. 
476 | # http://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types/11389998#11389998 477 | params[k] = np.asscalar(v[0]) 478 | params.expName = "spearmintOutput/" + spearmintDirName(spearmintParams) 479 | 480 | try: 481 | os.mkdir(params.expName) 482 | except OSError: 483 | logg("Warning: target directory already exists, or can't be created.") 484 | 485 | # If we are interested in consistent behavior across several datasets, 486 | # we can simply aggregate here: value = setupAndRun(params1) + setupAndRun(params2) 487 | # where params1 and params2 are the same except for imageDirectory or inputDigit or whatever (and expName). 488 | value = setupAndRun(params) 489 | # np.float32 to float: 490 | value = np.asscalar(value) 491 | return value 492 | 493 | def main(): 494 | assert len(sys.argv)==2 495 | confFilename = sys.argv[1] 496 | params = nnbase.inputs.paramsFromConf(file(confFilename)) 497 | logg("Starting experiment, working directory: "+params.expName) 498 | 499 | try: 500 | os.mkdir(params.expName) 501 | except OSError: 502 | logg("Warning: target directory already exists, or can't be created.") 503 | 504 | value = setupAndRun(params) 505 | logg("final performance %f" % value) 506 | 507 | # TODO This codepath is temporarily abandoned: 508 | # mainLowDim(params.expName, params.minibatchSize) 509 | 510 | if __name__ == "__main__": 511 | doCPUProfile = False 512 | if doCPUProfile: 513 | import cProfile 514 | cProfile.run("main()", "pstats") 515 | else: 516 | main() 517 | -------------------------------------------------------------------------------- /earthMoverTest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import math 3 | 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | # A cool little toy learning problem: 9 | # We want to learn a 1D distribution, e.g. uniform on (-1,+1). 10 | # We want to model it with a gaussian mixture model. 
11 | # (Mixture of k 1D standard normals, parametrized by the k means.) 12 | # We generate n samples from the target distribution. 13 | # We generate n samples from our current best bet for the model. 14 | # We find the pairing that minimizes the summed distance between paired points. 15 | def toyLearner(): 16 | n = 2000 17 | k = 100 18 | sigma = 0.05 19 | learningRate = 0.005 20 | epochCount = 100 21 | 22 | centers = np.random.normal(size=k).astype(np.float32) 23 | 24 | def generate(centers, n): 25 | picks = np.random.randint(k, size=n) 26 | currentCenters = centers[picks] # smart indexing 27 | generated = currentCenters + sigma * np.random.normal(size=n).astype(np.float32) 28 | return generated, picks 29 | 30 | for epoch in range(epochCount): 31 | DIST = "triangle" 32 | if DIST=="uniform": 33 | data = np.sort(np.random.uniform(low=-1, high=+1, size=(n,)).astype(np.float32)) 34 | elif DIST=="triangle": 35 | bi = np.random.uniform(low=0, high=1, size=(n,2)).astype(np.float32) 36 | data = np.max(bi, axis=-1) 37 | else: 38 | assert False, "unknown distribution" 39 | 40 | data.sort() 41 | generated, picks = generate(centers, n) 42 | 43 | if epoch%5==0: 44 | plt.hist(generate(centers, 100000)[0], 50, normed=0, facecolor='green') 45 | plt.savefig("emd"+str(epoch)+".pdf") 46 | plt.close() 47 | plt.scatter(centers[:-1], centers[1:]-centers[:-1]) 48 | plt.savefig("delta"+str(epoch)+".pdf") 49 | plt.close() 50 | 51 | sortedPairs = zip(generated, picks) 52 | sortedPairs.sort() 53 | triplets = zip(sortedPairs, data) 54 | # both are sorted at this point, this pairing is the earth mover's pairing. 55 | totalLoss = 0.0 56 | for (g,p), d in triplets: 57 | # linear derivative, corresponds to L2squared. 58 | # math.copysign(1, d-g) would be the derivative of L1=L2unsquared 59 | differential = d-g 60 | totalLoss += abs(differential) # NOT L2 squared, proper L2! 
61 | centers[p] += differential * learningRate 62 | centers.sort() 63 | print "loss", totalLoss 64 | 65 | sys.stdout.flush() 66 | 67 | if __name__ == "__main__": 68 | toyLearner() 69 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import theano 4 | import theano.tensor as T 5 | import lasagne 6 | 7 | import distances 8 | import nnbase.vis 9 | 10 | # TODO If this functionality is important, 11 | # TODO I'll probably have to rewrite it in Theano, 12 | # TODO together with the workhorse kohonen.distanceMatrix(). 13 | # TODO Especially if the gradient descent based finetuning comes in. 14 | 15 | def approximateMinibatch(data, net_fn, closestFnFactory, sampleSourceFunction, inDim, sampleForEach): 16 | n = len(data) 17 | initial, sampled = sampleSourceFunction(net_fn, sampleForEach, inDim) 18 | bestDistIndices = closestFnFactory(sampled, data) 19 | sampled = sampled[bestDistIndices] 20 | distances = np.linalg.norm(data-sampled, axis=1) 21 | return initial, sampled, distances 22 | 23 | # For each validation sample we find the closest train sample. 24 | def approximateFromTrain(train, validation, closestFnFactory): 25 | bestDistIndices = closestFnFactory(train, validation) 26 | nearests = train[bestDistIndices] 27 | distances = np.linalg.norm(validation-nearests, axis=1) 28 | return nearests, distances 29 | 30 | # We generate sampleTotal data points, and for each gold data point 31 | # we find the closest generated one. 32 | def approximate(data, net_fn, closestFnFactory, sampleSourceFunction, inDim, sampleTotal): 33 | bestInitial, bestSampled, bestDistances = None, None, None 34 | # approximate_minibatch builds a matrix of size (len(data), sampleForEachMinibatch). 35 | # We want this matrix to fit into memory. 
36 | distanceMatrixSizeLimit = int(1e6) 37 | sampleForEachMinibatch = distanceMatrixSizeLimit / len(data) 38 | batchCount = sampleTotal / sampleForEachMinibatch + 1 39 | for indx in xrange(batchCount): 40 | initial, sampled, distances = approximateMinibatch(data, net_fn, closestFnFactory, sampleSourceFunction, inDim, sampleForEachMinibatch) 41 | if bestDistances is None: 42 | bestInitial, bestSampled, bestDistances = initial, sampled, distances 43 | else: 44 | # Could easily vectorize but not a bottleneck. 45 | for i in range(len(bestDistances)): 46 | if distances[i] apt-get.cout 2> apt-get.cerr 2 | # Check the latest deb at https://developer.nvidia.com/cuda-downloads 3 | wget http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb 4 | sudo dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb 5 | sudo apt-get update 6 | sudo apt-get install cuda 7 | sudo reboot 8 | # check: 9 | /usr/local/cuda-7.5/bin/nvcc --version 10 | 11 | # This installs a Theano that's newer than regular pip install, actually this one: 12 | # git+https://github.com/Theano/Theano.git@15c90dd3#egg=Theano==0.8.git 13 | sudo pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/v0.1/requirements.txt 14 | 15 | # if you have this repo available, copy or copypaste: 16 | cp daniel-experiments/kohonen/theanorc.txt .theanorc 17 | 18 | # check: 19 | python -c "import numpy; numpy.test()" 20 | python `python -c "import os, theano; print os.path.dirname(theano.__file__)"`/misc/check_blas.py 21 | 22 | sudo pip install Lasagne==0.1 23 | 24 | # Libs required for matplotlib that comes with nolearn. 25 | # scikit-learn also comes with nolearn. 
26 | sudo apt-get install libpng-dev 27 | sudo apt-get install libfreetype6-dev 28 | sudo pip install git+https://github.com/dnouri/nolearn.git@master#egg=nolearn==0.7.git 29 | # otherwise matplotlib wants to communicate with nonexisting X11: 30 | mkdir .matplotlib 31 | echo "backend : Agg" > .matplotlib/matplotlibrc 32 | 33 | # cuDNN 34 | # Login to NVIDIA, get cuDNN 4.0 for Linux x64: 35 | open https://developer.nvidia.com/cudnn 36 | # or simply take my cached one: 37 | wget people.mokk.bme.hu/~daniel/tmp/cudnn-7.0-linux-x64-v4.0-prod.tgz 38 | cd /usr/local/ 39 | sudo tar zxvf ~/cudnn-7.0-linux-x64-v4.0-prod.tgz 40 | 41 | cd 42 | mkdir .ssh 43 | ssh-keygen -t rsa -b 4096 -C "daniel.varga@prezi.com" 44 | eval "$(ssh-agent -s)" 45 | ssh-add ~/.ssh/id_rsa 46 | # Now add ~/.ssh/id_rsa.pub to github settings. 47 | git config --global user.email "daniel.varga@prezi.com" 48 | git config --global user.name "Daniel Varga" 49 | 50 | # Spearmint 51 | git clone git@github.com:HIPS/Spearmint.git 52 | sudo pip install -e Spearmint 53 | sudo apt-get install mongodb 54 | sudo pip install pymongo 55 | sudo service mongod start 56 | 57 | git clone git@github.com:danielvarga/daniel-experiments.git 58 | # check: 59 | time python daniel-experiments/kohonen/testNumpyToTheano.py > cout 60 | # -> 9.5 secs for testSampleInitial(), 6.7 secs with allow_gc=False. 61 | # test() minimal distances theano finishes in 0.263873 seconds. 62 | 63 | wget http://deeplearning.net/data/mnist/mnist.pkl.gz 64 | mv mnist.pkl.gz daniel-experiments/rbm/data/ 65 | 66 | cd daniel-experiments/kohonen 67 | python Spearmint/spearmint/main.py . 
> spearmintOutput/log.cout 2> spearmintOutput/log.cerr 68 | -------------------------------------------------------------------------------- /kohonen.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import math 4 | import random 5 | import sys 6 | import cPickle 7 | import gzip 8 | 9 | import next_permutation 10 | import munkres 11 | 12 | def pretty(m): 13 | for row in m: 14 | print "\t".join(map(str, row)) 15 | 16 | def halfCircle(): 17 | x = 1.0 18 | y = 1.0 19 | while x*x+y*y>1.0: 20 | x = random.uniform( 0.0, +1.0) 21 | y = random.uniform(-1.0, +1.0) 22 | return (x,y) 23 | 24 | def wave(): 25 | x = random.uniform( -math.pi, +math.pi) 26 | y = math.sin(x)+random.uniform( -0.2, +0.2) 27 | return (x,y) 28 | 29 | def triangle(): 30 | x = random.uniform(-1.0, +1.0) 31 | y = random.uniform(-1.0, x) 32 | return (x,y) 33 | 34 | 35 | def sampleFromTarget(): 36 | # return wave() 37 | return halfCircle() 38 | # return triangle() 39 | 40 | def samplesFromTarget(n): 41 | return np.array([sampleFromTarget() for i in xrange(n)]) 42 | 43 | def samplesFromInit(n, d, e): 44 | norm = np.random.normal(loc=0.0, scale=1.0, size=(n,e)) 45 | z = np.zeros((n,d-e)) 46 | data = np.hstack((norm, z)) 47 | assert data.shape==(n,d) 48 | return data 49 | 50 | # Both are (n x d) arrays. 51 | def sumOfDistances(x,y): 52 | return np.sum(np.linalg.norm(x-y, axis=1)) 53 | 54 | # Both are (n x d) arrays. 55 | # Scales with O(n!) boo! 56 | # We could bring it down by reducing it to minimum-weight 57 | # matching on a complete bipartite graph. 58 | # If we need really large n, then a sequential 59 | # greedy alg is probably more than good enough. 60 | # Probably we'll have something partially parallel that's even 61 | # faster than the naive sequential greedy alg. 
62 | def slowOptimalPairing(x,y): 63 | n,d = x.shape 64 | assert y.shape==(n,d) 65 | bestDist = np.inf 66 | bestP = None 67 | for p in next_permutation.next_permutation(range(n)): 68 | dist = sumOfDistances(x[p],y) 69 | if dist %d' % (row, column, value) 116 | print 'total cost: %d' % total 117 | 118 | Running that program produces:: 119 | 120 | Lowest cost through this matrix: 121 | [5, 9, 1] 122 | [10, 3, 2] 123 | [8, 7, 4] 124 | (0, 0) -> 5 125 | (1, 1) -> 3 126 | (2, 2) -> 4 127 | total cost=12 128 | 129 | The instantiated Munkres object can be used multiple times on different 130 | matrices. 131 | 132 | Non-square Cost Matrices 133 | ======================== 134 | 135 | The Munkres algorithm assumes that the cost matrix is square. However, it's 136 | possible to use a rectangular matrix if you first pad it with 0 values to make 137 | it square. This module automatically pads rectangular cost matrices to make 138 | them square. 139 | 140 | Notes: 141 | 142 | - The module operates on a *copy* of the caller's matrix, so any padding will 143 | not be seen by the caller. 144 | - The cost matrix must be rectangular or square. An irregular matrix will 145 | *not* work. 146 | 147 | Calculating Profit, Rather than Cost 148 | ==================================== 149 | 150 | The cost matrix is just that: A cost matrix. The Munkres algorithm finds 151 | the combination of elements (one from each row and column) that results in 152 | the smallest cost. It's also possible to use the algorithm to maximize 153 | profit. To do that, however, you have to convert your profit matrix to a 154 | cost matrix. The simplest way to do that is to subtract all elements from a 155 | large value. 
For example:: 156 | 157 | from munkres import Munkres, print_matrix 158 | 159 | matrix = [[5, 9, 1], 160 | [10, 3, 2], 161 | [8, 7, 4]] 162 | cost_matrix = [] 163 | for row in matrix: 164 | cost_row = [] 165 | for col in row: 166 | cost_row += [sys.maxint - col] 167 | cost_matrix += [cost_row] 168 | 169 | m = Munkres() 170 | indexes = m.compute(cost_matrix) 171 | print_matrix(matrix, msg='Highest profit through this matrix:') 172 | total = 0 173 | for row, column in indexes: 174 | value = matrix[row][column] 175 | total += value 176 | print '(%d, %d) -> %d' % (row, column, value) 177 | 178 | print 'total profit=%d' % total 179 | 180 | Running that program produces:: 181 | 182 | Highest profit through this matrix: 183 | [5, 9, 1] 184 | [10, 3, 2] 185 | [8, 7, 4] 186 | (0, 1) -> 9 187 | (1, 0) -> 10 188 | (2, 2) -> 4 189 | total profit=23 190 | 191 | The ``munkres`` module provides a convenience method for creating a cost 192 | matrix from a profit matrix. Since it doesn't know whether the matrix contains 193 | floating point numbers, decimals, or integers, you have to provide the 194 | conversion function; but the convenience method takes care of the actual 195 | creation of the cost matrix:: 196 | 197 | import munkres 198 | 199 | cost_matrix = munkres.make_cost_matrix(matrix, 200 | lambda cost: sys.maxint - cost) 201 | 202 | So, the above profit-calculation program can be recast as:: 203 | 204 | from munkres import Munkres, print_matrix, make_cost_matrix 205 | 206 | matrix = [[5, 9, 1], 207 | [10, 3, 2], 208 | [8, 7, 4]] 209 | cost_matrix = make_cost_matrix(matrix, lambda cost: sys.maxint - cost) 210 | m = Munkres() 211 | indexes = m.compute(cost_matrix) 212 | print_matrix(matrix, msg='Lowest cost through this matrix:') 213 | total = 0 214 | for row, column in indexes: 215 | value = matrix[row][column] 216 | total += value 217 | print '(%d, %d) -> %d' % (row, column, value) 218 | print 'total profit=%d' % total 219 | 220 | References 221 | ========== 222 | 223 | 1. 
http://www.public.iastate.edu/~ddoty/HungarianAlgorithm.html 224 | 225 | 2. Harold W. Kuhn. The Hungarian Method for the assignment problem. 226 | *Naval Research Logistics Quarterly*, 2:83-97, 1955. 227 | 228 | 3. Harold W. Kuhn. Variants of the Hungarian method for assignment 229 | problems. *Naval Research Logistics Quarterly*, 3: 253-258, 1956. 230 | 231 | 4. Munkres, J. Algorithms for the Assignment and Transportation Problems. 232 | *Journal of the Society of Industrial and Applied Mathematics*, 233 | 5(1):32-38, March, 1957. 234 | 235 | 5. http://en.wikipedia.org/wiki/Hungarian_algorithm 236 | 237 | Copyright and License 238 | ===================== 239 | 240 | This software is released under a BSD license, adapted from 241 | 242 | 243 | Copyright (c) 2008 Brian M. Clapper 244 | All rights reserved. 245 | 246 | Redistribution and use in source and binary forms, with or without 247 | modification, are permitted provided that the following conditions are met: 248 | 249 | * Redistributions of source code must retain the above copyright notice, 250 | this list of conditions and the following disclaimer. 251 | 252 | * Redistributions in binary form must reproduce the above copyright notice, 253 | this list of conditions and the following disclaimer in the documentation 254 | and/or other materials provided with the distribution. 255 | 256 | * Neither the name "clapper.org" nor the names of its contributors may be 257 | used to endorse or promote products derived from this software without 258 | specific prior written permission. 259 | 260 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 261 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 262 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 263 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 264 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 265 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 266 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 267 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 268 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 269 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 270 | POSSIBILITY OF SUCH DAMAGE. 271 | """ 272 | 273 | __docformat__ = 'restructuredtext' 274 | 275 | # --------------------------------------------------------------------------- 276 | # Imports 277 | # --------------------------------------------------------------------------- 278 | 279 | import sys 280 | 281 | # --------------------------------------------------------------------------- 282 | # Exports 283 | # --------------------------------------------------------------------------- 284 | 285 | __all__ = ['Munkres', 'make_cost_matrix'] 286 | 287 | # --------------------------------------------------------------------------- 288 | # Globals 289 | # --------------------------------------------------------------------------- 290 | 291 | # Info about the module 292 | __version__ = "1.0.5.4" 293 | __author__ = "Brian Clapper, bmc@clapper.org" 294 | __url__ = "http://bmc.github.com/munkres/" 295 | __copyright__ = "(c) 2008 Brian M. Clapper" 296 | __license__ = "BSD-style license" 297 | 298 | # --------------------------------------------------------------------------- 299 | # Classes 300 | # --------------------------------------------------------------------------- 301 | 302 | class Munkres: 303 | """ 304 | Calculate the Munkres solution to the classical assignment problem. 305 | See the module documentation for usage. 
306 | """ 307 | 308 | def __init__(self): 309 | """Create a new instance""" 310 | self.C = None 311 | self.row_covered = [] 312 | self.col_covered = [] 313 | self.n = 0 314 | self.Z0_r = 0 315 | self.Z0_c = 0 316 | self.marked = None 317 | self.path = None 318 | 319 | def make_cost_matrix(profit_matrix, inversion_function): 320 | """ 321 | **DEPRECATED** 322 | 323 | Please use the module function ``make_cost_matrix()``. 324 | """ 325 | import munkres 326 | return munkres.make_cost_matrix(profit_matrix, inversion_function) 327 | 328 | make_cost_matrix = staticmethod(make_cost_matrix) 329 | 330 | def pad_matrix(self, matrix, pad_value=0): 331 | """ 332 | Pad a possibly non-square matrix to make it square. 333 | 334 | :Parameters: 335 | matrix : list of lists 336 | matrix to pad 337 | 338 | pad_value : int 339 | value to use to pad the matrix 340 | 341 | :rtype: list of lists 342 | :return: a new, possibly padded, matrix 343 | """ 344 | max_columns = 0 345 | total_rows = len(matrix) 346 | 347 | for row in matrix: 348 | max_columns = max(max_columns, len(row)) 349 | 350 | total_rows = max(max_columns, total_rows) 351 | 352 | new_matrix = [] 353 | for row in matrix: 354 | row_len = len(row) 355 | new_row = row[:] 356 | if total_rows > row_len: 357 | # Row too short. Pad it. 358 | new_row += [0] * (total_rows - row_len) 359 | new_matrix += [new_row] 360 | 361 | while len(new_matrix) < total_rows: 362 | new_matrix += [[0] * total_rows] 363 | 364 | return new_matrix 365 | 366 | def compute(self, cost_matrix): 367 | """ 368 | Compute the indexes for the lowest-cost pairings between rows and 369 | columns in the database. Returns a list of (row, column) tuples 370 | that can be used to traverse the matrix. 371 | 372 | :Parameters: 373 | cost_matrix : list of lists 374 | The cost matrix. If this cost matrix is not square, it 375 | will be padded with zeros, via a call to ``pad_matrix()``. 376 | (This method does *not* modify the caller's matrix. 
It 377 | operates on a copy of the matrix.) 378 | 379 | **WARNING**: This code handles square and rectangular 380 | matrices. It does *not* handle irregular matrices. 381 | 382 | :rtype: list 383 | :return: A list of ``(row, column)`` tuples that describe the lowest 384 | cost path through the matrix 385 | 386 | """ 387 | self.C = self.pad_matrix(cost_matrix) 388 | self.n = len(self.C) 389 | self.original_length = len(cost_matrix) 390 | self.original_width = len(cost_matrix[0]) 391 | self.row_covered = [False for i in range(self.n)] 392 | self.col_covered = [False for i in range(self.n)] 393 | self.Z0_r = 0 394 | self.Z0_c = 0 395 | self.path = self.__make_matrix(self.n * 2, 0) 396 | self.marked = self.__make_matrix(self.n, 0) 397 | 398 | done = False 399 | step = 1 400 | 401 | steps = { 1 : self.__step1, 402 | 2 : self.__step2, 403 | 3 : self.__step3, 404 | 4 : self.__step4, 405 | 5 : self.__step5, 406 | 6 : self.__step6 } 407 | 408 | while not done: 409 | try: 410 | func = steps[step] 411 | step = func() 412 | except KeyError: 413 | done = True 414 | 415 | # Look for the starred columns 416 | results = [] 417 | for i in range(self.original_length): 418 | for j in range(self.original_width): 419 | if self.marked[i][j] == 1: 420 | results += [(i, j)] 421 | 422 | return results 423 | 424 | def __copy_matrix(self, matrix): 425 | """Return an exact copy of the supplied matrix""" 426 | return copy.deepcopy(matrix) 427 | 428 | def __make_matrix(self, n, val): 429 | """Create an *n*x*n* matrix, populating it with the specific value.""" 430 | matrix = [] 431 | for i in range(n): 432 | matrix += [[val for j in range(n)]] 433 | return matrix 434 | 435 | def __step1(self): 436 | """ 437 | For each row of the matrix, find the smallest element and 438 | subtract it from every element in its row. Go to Step 2. 
439 | """ 440 | C = self.C 441 | n = self.n 442 | for i in range(n): 443 | minval = min(self.C[i]) 444 | # Find the minimum value for this row and subtract that minimum 445 | # from every element in the row. 446 | for j in range(n): 447 | self.C[i][j] -= minval 448 | 449 | return 2 450 | 451 | def __step2(self): 452 | """ 453 | Find a zero (Z) in the resulting matrix. If there is no starred 454 | zero in its row or column, star Z. Repeat for each element in the 455 | matrix. Go to Step 3. 456 | """ 457 | n = self.n 458 | for i in range(n): 459 | for j in range(n): 460 | if (self.C[i][j] == 0) and \ 461 | (not self.col_covered[j]) and \ 462 | (not self.row_covered[i]): 463 | self.marked[i][j] = 1 464 | self.col_covered[j] = True 465 | self.row_covered[i] = True 466 | 467 | self.__clear_covers() 468 | return 3 469 | 470 | def __step3(self): 471 | """ 472 | Cover each column containing a starred zero. If K columns are 473 | covered, the starred zeros describe a complete set of unique 474 | assignments. In this case, Go to DONE, otherwise, Go to Step 4. 475 | """ 476 | n = self.n 477 | count = 0 478 | for i in range(n): 479 | for j in range(n): 480 | if self.marked[i][j] == 1: 481 | self.col_covered[j] = True 482 | count += 1 483 | 484 | if count >= n: 485 | step = 7 # done 486 | else: 487 | step = 4 488 | 489 | return step 490 | 491 | def __step4(self): 492 | """ 493 | Find a noncovered zero and prime it. If there is no starred zero 494 | in the row containing this primed zero, Go to Step 5. Otherwise, 495 | cover this row and uncover the column containing the starred 496 | zero. Continue in this manner until there are no uncovered zeros 497 | left. Save the smallest uncovered value and Go to Step 6. 
498 | """ 499 | step = 0 500 | done = False 501 | row = -1 502 | col = -1 503 | star_col = -1 504 | while not done: 505 | (row, col) = self.__find_a_zero() 506 | if row < 0: 507 | done = True 508 | step = 6 509 | else: 510 | self.marked[row][col] = 2 511 | star_col = self.__find_star_in_row(row) 512 | if star_col >= 0: 513 | col = star_col 514 | self.row_covered[row] = True 515 | self.col_covered[col] = False 516 | else: 517 | done = True 518 | self.Z0_r = row 519 | self.Z0_c = col 520 | step = 5 521 | 522 | return step 523 | 524 | def __step5(self): 525 | """ 526 | Construct a series of alternating primed and starred zeros as 527 | follows. Let Z0 represent the uncovered primed zero found in Step 4. 528 | Let Z1 denote the starred zero in the column of Z0 (if any). 529 | Let Z2 denote the primed zero in the row of Z1 (there will always 530 | be one). Continue until the series terminates at a primed zero 531 | that has no starred zero in its column. Unstar each starred zero 532 | of the series, star each primed zero of the series, erase all 533 | primes and uncover every line in the matrix. Return to Step 3 534 | """ 535 | count = 0 536 | path = self.path 537 | path[count][0] = self.Z0_r 538 | path[count][1] = self.Z0_c 539 | done = False 540 | while not done: 541 | row = self.__find_star_in_col(path[count][1]) 542 | if row >= 0: 543 | count += 1 544 | path[count][0] = row 545 | path[count][1] = path[count-1][1] 546 | else: 547 | done = True 548 | 549 | if not done: 550 | col = self.__find_prime_in_row(path[count][0]) 551 | count += 1 552 | path[count][0] = path[count-1][0] 553 | path[count][1] = col 554 | 555 | self.__convert_path(path, count) 556 | self.__clear_covers() 557 | self.__erase_primes() 558 | return 3 559 | 560 | def __step6(self): 561 | """ 562 | Add the value found in Step 4 to every element of each covered 563 | row, and subtract it from every element of each uncovered column. 
564 | Return to Step 4 without altering any stars, primes, or covered 565 | lines. 566 | """ 567 | minval = self.__find_smallest() 568 | for i in range(self.n): 569 | for j in range(self.n): 570 | if self.row_covered[i]: 571 | self.C[i][j] += minval 572 | if not self.col_covered[j]: 573 | self.C[i][j] -= minval 574 | return 4 575 | 576 | def __find_smallest(self): 577 | """Find the smallest uncovered value in the matrix.""" 578 | minval = sys.maxint 579 | for i in range(self.n): 580 | for j in range(self.n): 581 | if (not self.row_covered[i]) and (not self.col_covered[j]): 582 | if minval > self.C[i][j]: 583 | minval = self.C[i][j] 584 | return minval 585 | 586 | def __find_a_zero(self): 587 | """Find the first uncovered element with value 0""" 588 | row = -1 589 | col = -1 590 | i = 0 591 | n = self.n 592 | done = False 593 | 594 | while not done: 595 | j = 0 596 | while True: 597 | if (self.C[i][j] == 0) and \ 598 | (not self.row_covered[i]) and \ 599 | (not self.col_covered[j]): 600 | row = i 601 | col = j 602 | done = True 603 | j += 1 604 | if j >= n: 605 | break 606 | i += 1 607 | if i >= n: 608 | done = True 609 | 610 | return (row, col) 611 | 612 | def __find_star_in_row(self, row): 613 | """ 614 | Find the first starred element in the specified row. Returns 615 | the column index, or -1 if no starred element was found. 616 | """ 617 | col = -1 618 | for j in range(self.n): 619 | if self.marked[row][j] == 1: 620 | col = j 621 | break 622 | 623 | return col 624 | 625 | def __find_star_in_col(self, col): 626 | """ 627 | Find the first starred element in the specified column. Returns 628 | the row index, or -1 if no starred element was found. 629 | """ 630 | row = -1 631 | for i in range(self.n): 632 | if self.marked[i][col] == 1: 633 | row = i 634 | break 635 | 636 | return row 637 | 638 | def __find_prime_in_row(self, row): 639 | """ 640 | Find the first prime element in the specified row. Returns 641 | the column index, or -1 if no prime element was found.
642 | """ 643 | col = -1 644 | for j in range(self.n): 645 | if self.marked[row][j] == 2: 646 | col = j 647 | break 648 | 649 | return col 650 | 651 | def __convert_path(self, path, count): 652 | for i in range(count+1): 653 | if self.marked[path[i][0]][path[i][1]] == 1: 654 | self.marked[path[i][0]][path[i][1]] = 0 655 | else: 656 | self.marked[path[i][0]][path[i][1]] = 1 657 | 658 | def __clear_covers(self): 659 | """Clear all covered matrix cells""" 660 | for i in range(self.n): 661 | self.row_covered[i] = False 662 | self.col_covered[i] = False 663 | 664 | def __erase_primes(self): 665 | """Erase all prime markings""" 666 | for i in range(self.n): 667 | for j in range(self.n): 668 | if self.marked[i][j] == 2: 669 | self.marked[i][j] = 0 670 | 671 | # --------------------------------------------------------------------------- 672 | # Functions 673 | # --------------------------------------------------------------------------- 674 | 675 | def make_cost_matrix(profit_matrix, inversion_function): 676 | """ 677 | Create a cost matrix from a profit matrix by calling 678 | 'inversion_function' to invert each value. The inversion 679 | function must take one numeric argument (of any type) and return 680 | another numeric argument which is presumed to be the cost inverse 681 | of the original profit. 682 | 683 | This is a static method. Call it like this: 684 | 685 | .. python:: 686 | 687 | cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func) 688 | 689 | For example: 690 | 691 | .. 
python:: 692 | 693 | cost_matrix = Munkres.make_cost_matrix(matrix, lambda x : sys.maxint - x) 694 | 695 | :Parameters: 696 | profit_matrix : list of lists 697 | The matrix to convert from a profit to a cost matrix 698 | 699 | inversion_function : function 700 | The function to use to invert each entry in the profit matrix 701 | 702 | :rtype: list of lists 703 | :return: The converted matrix 704 | """ 705 | cost_matrix = [] 706 | for row in profit_matrix: 707 | cost_matrix.append([inversion_function(value) for value in row]) 708 | return cost_matrix 709 | 710 | def print_matrix(matrix, msg=None): 711 | """ 712 | Convenience function: Displays the contents of a matrix of integers. 713 | 714 | :Parameters: 715 | matrix : list of lists 716 | Matrix to print 717 | 718 | msg : str 719 | Optional message to print before displaying the matrix 720 | """ 721 | import math 722 | 723 | if msg is not None: 724 | print msg 725 | 726 | # Calculate the appropriate format width. 727 | width = 0 728 | for row in matrix: 729 | for val in row: 730 | width = max(width, int(math.log10(val)) + 1) 731 | 732 | # Make the format string 733 | format = '%%%dd' % width 734 | 735 | # Print the matrix 736 | for row in matrix: 737 | sep = '[' 738 | for val in row: 739 | sys.stdout.write(sep + format % val) 740 | sep = ', ' 741 | sys.stdout.write(']\n') 742 | 743 | # --------------------------------------------------------------------------- 744 | # Main 745 | # --------------------------------------------------------------------------- 746 | 747 | if __name__ == '__main__': 748 | 749 | 750 | matrices = [ 751 | # Square 752 | ([[400, 150, 400], 753 | [400, 450, 600], 754 | [300, 225, 300]], 755 | 850 # expected cost 756 | ), 757 | 758 | # Rectangular variant 759 | ([[400, 150, 400, 1], 760 | [400, 450, 600, 2], 761 | [300, 225, 300, 3]], 762 | 452 # expected cost 763 | ), 764 | 765 | # Square 766 | ([[10, 10, 8], 767 | [ 9, 8, 1], 768 | [ 9, 7, 4]], 769 | 18 770 | ), 771 | 772 | # Rectangular 
variant 773 | ([[10, 10, 8, 11], 774 | [ 9, 8, 1, 1], 775 | [ 9, 7, 4, 10]], 776 | 15 777 | ), 778 | ] 779 | 780 | m = Munkres() 781 | for cost_matrix, expected_total in matrices: 782 | print_matrix(cost_matrix, msg='cost matrix') 783 | indexes = m.compute(cost_matrix) 784 | total_cost = 0 785 | for r, c in indexes: 786 | x = cost_matrix[r][c] 787 | total_cost += x 788 | print '(%d, %d) -> %d' % (r, c, x) 789 | print 'lowest cost=%d' % total_cost 790 | assert expected_total == total_cost 791 | 792 | -------------------------------------------------------------------------------- /nearestNeighborsTest.py: -------------------------------------------------------------------------------- 1 | # This piece of code was prepared because I asked about its 2 | # performance on the theano-users list: 3 | # https://groups.google.com/forum/#!topic/theano-users/E7ProqnGUMk 4 | # https://gist.github.com/danielvarga/d0eeacea92e65b19188c 5 | 6 | 7 | import numpy as np 8 | import theano 9 | import theano.tensor as T 10 | 11 | 12 | def randomMatrix(n, f): 13 | return np.random.normal(size=n*f).astype(np.float32).reshape((n, f)) 14 | 15 | n = 5000 # number of candidates 16 | m = 1000 # number of targets 17 | f = 500 # number of features 18 | 19 | x = T.matrix('x') # candidates 20 | y = T.matrix('y') # targets 21 | 22 | xL2S = T.sum(x*x, axis=-1) # [n] 23 | yL2S = T.sum(y*y, axis=-1) # [m] 24 | xL2SM = T.zeros((m, n)) + xL2S # broadcasting, [m, n] 25 | yL2SM = T.zeros((n, m)) + yL2S # # broadcasting, [n, m] 26 | squaredPairwiseDistances = xL2SM.T + yL2SM - 2.0*T.dot(x, y.T) # [n, m] 27 | 28 | np.random.seed(1) 29 | 30 | N = randomMatrix(n, f) 31 | M = randomMatrix(m, f) 32 | 33 | lamblinsTrick = True 34 | 35 | if lamblinsTrick: 36 | # from https://github.com/Theano/Theano/issues/1399 37 | s = squaredPairwiseDistances 38 | bestIndices = T.cast( ( T.arange(n).dimshuffle(0, 'x') * T.cast(T.eq(s, s.min(axis=0, keepdims=True)), 'float32') ).sum(axis=0), 'int32') 39 | else: 40 | bestIndices 
= T.argmin(squaredPairwiseDistances, axis=0) 41 | 42 | nearests_fn = theano.function([x, y], bestIndices, profile=True) 43 | 44 | print nearests_fn(N, M).sum() 45 | -------------------------------------------------------------------------------- /next_permutation.py: -------------------------------------------------------------------------------- 1 | def next_permutation(seq, pred=cmp): 2 | """Like C++ std::next_permutation() but implemented as 3 | generator. Yields copies of seq.""" 4 | 5 | def reverse(seq, start, end): 6 | # seq = seq[:start] + reversed(seq[start:end]) + \ 7 | # seq[end:] 8 | end -= 1 9 | if end <= start: 10 | return 11 | while True: 12 | seq[start], seq[end] = seq[end], seq[start] 13 | if start == end or start+1 == end: 14 | return 15 | start += 1 16 | end -= 1 17 | 18 | if not seq: 19 | raise StopIteration 20 | 21 | try: 22 | seq[0] 23 | except TypeError: 24 | raise TypeError("seq must allow random access.") 25 | 26 | first = 0 27 | last = len(seq) 28 | seq = seq[:] 29 | 30 | # Yield input sequence as the STL version is often 31 | # used inside do {} while. 32 | yield seq 33 | 34 | if last == 1: 35 | raise StopIteration 36 | 37 | while True: 38 | next = last - 1 39 | 40 | while True: 41 | # Step 1. 42 | next1 = next 43 | next -= 1 44 | 45 | if pred(seq[next], seq[next1]) < 0: 46 | # Step 2. 47 | mid = last - 1 48 | while not (pred(seq[next], seq[mid]) < 0): 49 | mid -= 1 50 | seq[next], seq[mid] = seq[mid], seq[next] 51 | 52 | # Step 3. 53 | reverse(seq, next1, last) 54 | 55 | # Change to yield references to get rid of 56 | # (at worst) |seq|! copy operations. 
57 | yield seq[:] 58 | break 59 | if next == first: 60 | raise StopIteration 61 | raise StopIteration 62 | -------------------------------------------------------------------------------- /nnbase/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/nnbase/__init__.py -------------------------------------------------------------------------------- /nnbase/attrdict.py: -------------------------------------------------------------------------------- 1 | 2 | # Causes memory leak below python 2.7.3 3 | class AttrDict(dict): 4 | def __init__(self, *args, **kwargs): 5 | super(AttrDict, self).__init__(*args, **kwargs) 6 | self.__dict__ = self 7 | -------------------------------------------------------------------------------- /nnbase/autoencoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lasagne import layers 4 | import numpy as np 5 | 6 | import sys 7 | import gzip 8 | import cPickle 9 | from PIL import Image 10 | 11 | from nnbase.layers import Unpool2DLayer 12 | from nnbase.utils import FlipBatchIterator 13 | ### this is really dumb, current nolearn doesnt play well with lasagne, 14 | ### so had to manually copy the file I wanted to this folder 15 | import nnbase.shape as shape 16 | 17 | import nnbase.inputs 18 | import nnbase.vis 19 | 20 | # This is very error-prone. 21 | # Optimally, there should be a guarantee that the 22 | # corpus loaded here is the same as the one that the 23 | # encoder was trained on. 
24 | def loadCorpus(): 25 | face = True 26 | if face: 27 | directory = "../face/SCUT-FBP/thumb.big/" 28 | X, (height, width) = nnbase.inputs.faces(directory) 29 | else: 30 | X, (height, width) = nnbase.inputs.mnist() 31 | 32 | X = X.astype(np.float64).reshape((-1, 1, height, width)) 33 | mu, sigma = np.mean(X), np.std(X) 34 | print "mu, sigma:", mu, sigma 35 | return X, mu, sigma 36 | 37 | # TODO I don't think that .eval() is how this should work. 38 | def get_output_from_nn(last_layer, X): 39 | indices = np.arange(128, X.shape[0], 128) 40 | # not splitting into batches can cause a memory error 41 | X_batches = np.split(X, indices) 42 | out = [] 43 | for count, X_batch in enumerate(X_batches): 44 | out.append( layers.get_output(last_layer, X_batch).eval() ) 45 | return np.vstack(out) 46 | 47 | # This helper class deals with 48 | # 1. normalizing input and de-normalizing output 49 | # 2. reshaping output into shape compatible with input, namely (-1, 1, x ,y) 50 | class Autoencoder: 51 | # sigma and mu should be trained on the same corpus as the autoencoder itself. 52 | # This is error-prone! 53 | def __init__(self, ae, mu, sigma): 54 | self.ae = ae 55 | self.mu = mu 56 | self.sigma = sigma 57 | 58 | self.encode_layer_index = map(lambda pair : pair[0], self.ae.layers).index('encode_layer') 59 | self.encode_layer = self.ae.get_all_layers()[self.encode_layer_index] 60 | self.afterSplit = False 61 | 62 | # from unnormalized to unnormalized [0,1] MNIST. 63 | # ae is trained on normalized MNIST data. 64 | # For 0-1 clipped digits this should be close to the identity function. 
65 | def predict(self, X): 66 | assert not self.afterSplit 67 | self.x, self.y = X.shape[-2:] 68 | flatOutput = self.ae.predict((X - self.mu) / self.sigma).reshape(X.shape) * self.sigma + self.mu 69 | return flatOutput.reshape((-1, 1, self.x, self.y)) 70 | 71 | def encode(self, X): 72 | self.x, self.y = X.shape[-2:] 73 | return get_output_from_nn(self.encode_layer, (X-self.mu)/self.sigma) 74 | 75 | # N.B after we do this, we won't be able to use the original autoencoder , as the layers are broken up 76 | def split(self): 77 | next_layer = self.ae.get_all_layers()[self.encode_layer_index + 1] 78 | self.final_layer = self.ae.get_all_layers()[-1] 79 | new_layer = layers.InputLayer(shape = (None, self.encode_layer.num_units)) 80 | next_layer.input_layer = new_layer 81 | self.afterSplit = True 82 | 83 | def decode(self, X): 84 | assert self.afterSplit 85 | flatOutput = get_output_from_nn(self.final_layer, X) * self.sigma + self.mu 86 | # Evil hack: decode only knows the shape of the input space 87 | # if you did a predict or encode previously. TODO Fix asap. 88 | return flatOutput.reshape((-1, 1, self.x, self.y)) 89 | 90 | 91 | def main(): 92 | X_train, mu, sigma = loadCorpus() 93 | 94 | # autoencoderFile = "../lasagne-demo/conv_ae.pkl" # Trained on the full mnist train dataset 95 | autoencoderFile = "../lasagne-demo/face.big.pkl" # Trained on the ../face/SCUT-FBP/thumb.big dataset. 
96 | 97 | ae_raw = cPickle.load(open(autoencoderFile, 'r')) 98 | autoencoder = Autoencoder(ae_raw, mu, sigma) 99 | 100 | sampleIndices = map(int, sys.argv[1:]) 101 | assert len(sampleIndices)==2, "the tool expects two sample indices" 102 | X_train = X_train[sampleIndices] 103 | 104 | X_pred = autoencoder.predict(X_train) 105 | print "ended prediction" 106 | sys.stdout.flush() 107 | 108 | nnbase.vis.get_random_images(X_train, X_pred) 109 | 110 | autoencoder.split() 111 | 112 | X_encoded = autoencoder.encode(X_train) 113 | 114 | x0 = X_encoded[0] 115 | x1 = X_encoded[1] 116 | stepCount = 100 117 | intervalBase = np.linspace(1, 0, num=stepCount) 118 | intervalEncoded = np.multiply.outer(intervalBase, x0)+np.multiply.outer(1.0-intervalBase, x1) 119 | 120 | X_decoded = autoencoder.decode(intervalEncoded) 121 | nnbase.vis.get_picture_array(X_decoded, 10, 10, "interval") 122 | 123 | intervalInputspace = np.multiply.outer(intervalBase, X_train[0])+np.multiply.outer(1.0-intervalBase, X_train[1]) 124 | nnbase.vis.get_picture_array(intervalInputspace, 10, 10, "interval-inputspace") 125 | 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /nnbase/inputs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import gzip 4 | import cPickle 5 | 6 | # When I move out the synthetic distributions, these imports should move as well. 7 | import math 8 | import random 9 | import PIL.Image as Image 10 | import PIL.ImageDraw as ImageDraw 11 | 12 | from nnbase.attrdict import AttrDict 13 | import autoencoder 14 | 15 | 16 | def mnist(digit=None, torusHack=False, autoencoded=False, which='train', everyNth=1): 17 | np.random.seed(1) # TODO Not the right place to do this. 
18 | datasetFile = "mnist.pkl.gz" 19 | f = gzip.open(datasetFile, 'rb') 20 | datasets = cPickle.load(f) 21 | train_set, valid_set, test_set = datasets 22 | f.close() 23 | if which=='train': 24 | input, output = train_set 25 | elif which=='validation': 26 | input, output = valid_set 27 | elif which=='test': 28 | input, output = test_set 29 | else: 30 | assert which in ('train', 'validation', 'test') 31 | 32 | input = input.reshape((-1, 28, 28)) 33 | if digit is not None: 34 | input = input[output==digit] 35 | if torusHack: 36 | # This is a SINGLE sample, translated and multiplied. 37 | sample = input[0] 38 | inputRows = [] 39 | for dx in range(28): 40 | for dy in range(28): 41 | s = sample.copy() 42 | s = np.hstack((s[:, dy:], s[:, :dy])) 43 | s = np.vstack((s[dx:, :], s[:dx, :])) 44 | inputRows.append(s) 45 | input = np.array(inputRows) 46 | input = np.vstack([[input]*10]) 47 | input = np.random.permutation(input) 48 | input = input[::everyNth] 49 | input = input.astype(np.float32) 50 | 51 | if autoencoded: 52 | autoencoderFile = "../lasagne-demo/conv_ae.pkl" 53 | mu = 0.13045 54 | sigma = 0.30729 55 | ae = autoencoder.Autoencoder(cPickle.load(open(autoencoderFile, 'r')), mu=mu, sigma=sigma) 56 | ae.split() 57 | encodedInput = ae.encode(input.reshape((-1, 1, 28, 28))) 58 | assert encodedInput.shape[1] == 40 59 | # encodedInput = encodedInput.reshape((-1, 8, 5)) 60 | # print encodedInput.shape 61 | # return encodedInput, (8, 5) 62 | decodedInput = ae.decode(encodedInput) 63 | return decodedInput.reshape((-1, 28, 28)) , (28, 28) 64 | else: 65 | return input, (28, 28) 66 | 67 | def flattenImages(input): 68 | shape = input.shape 69 | assert len(shape) in (2, 3) 70 | if len(shape)==2: 71 | return input 72 | l, height, width = shape 73 | return input.reshape((l, height*width)) 74 | 75 | def faces(directory): 76 | imgs = [] 77 | height = None 78 | width = None 79 | for f in os.listdir(directory): 80 | if f.endswith(".jpg") or f.endswith(".png"): 81 | img = 
Image.open(os.path.join(directory, f)).convert("L") 82 | arr = np.array(img) 83 | if height is None: 84 | height, width = arr.shape 85 | else: 86 | assert (height, width) == arr.shape, "Bad size %s %s" % (f, str(arr.shape)) 87 | imgs.append(arr) 88 | input = np.array(imgs).astype(float) / 255 89 | np.random.seed(1) # TODO Not the right place to do this. 90 | input = np.random.permutation(input) 91 | return input, (height, width) 92 | 93 | def generateWave(n, height, width, waveCount): 94 | d = height*width 95 | phases = 2 * np.pi * np.random.uniform(size=n).astype(np.float32) 96 | rangeMat = np.zeros((n, d)).astype(np.float32) + np.linspace(start=0.0, stop=1.0, num=d).astype(np.float32) # broadcast, tiling rows 97 | phaseMat = np.zeros((n, d)).astype(np.float32) + phases[:, np.newaxis] # broadcast, tiling columns 98 | waves = (np.sin(rangeMat*(waveCount*2*np.pi) + phaseMat)+1.0)/2.0 99 | assert waves.dtype==np.float32 100 | assert np.sum(np.isnan(waves)) == 0 101 | return waves.reshape((n, height, width)) 102 | 103 | # Super ad hoc, but it shouldn't matter. 
def generatePlane(n, height, width):
    """n random linear intensity ramps: 0.5 + a*y + b*x with (a, b) ~ N(0, I),
    clipped to [0, 1]. Returns a float32 array of shape (n, height, width)."""
    normals = np.random.normal(size=(n, 2)).astype(np.float32)
    zeros = np.zeros((n, height, width)).astype(np.float32)
    # two-dim broadcasts:
    heightMat = zeros + np.linspace(start=-1.0, stop=1.0, num=height).astype(np.float32)[:, np.newaxis]
    widthMat = zeros + np.linspace(start=-1.0, stop=1.0, num=width).astype(np.float32)[np.newaxis, :]
    planes = heightMat*normals[:, 0][:, np.newaxis, np.newaxis] + widthMat*normals[:, 1][:, np.newaxis, np.newaxis] + 0.5
    np.clip(planes, 0.0, 1.0, planes)  # in-place clip into valid intensity range
    return planes

def generateOneClock(width):
    """One random 'clock' image: a filled wedge of ~1 radian with random
    start angle and random gray intensity, anchored at the image center."""
    data = np.zeros((width, width)).astype(np.float32)
    r = float(width/2)  # NOTE(review): width/2 is integer division on Python 2 -- presumably intended
    img = Image.fromarray(data)
    draw = ImageDraw.Draw(img)
    theta = random.uniform(0, 2*math.pi)
    intensity = random.uniform(0.0, 1.0)
    p = ((r, r),
         (r*(1+math.cos(theta)), r*(1+math.sin(theta))),
         (r*(1+math.cos(theta+1)), r*(1+math.sin(theta+1))))
    draw.polygon(p, fill=intensity)
    return np.asarray(img)

def generateClock(n, height, width):
    """n random clock images; only square canvases are supported."""
    assert height == width
    return np.array([generateOneClock(width) for i in range(n)])

def generateOneDot(width):
    # NOTE(review): this body is a verbatim copy of generateOneClock -- it
    # draws a wedge, not a dot. Kept equivalent to preserve behavior; confirm
    # whether a real dot generator was intended.
    data = np.zeros((width, width)).astype(np.float32)
    r = float(width/2)
    img = Image.fromarray(data)
    draw = ImageDraw.Draw(img)
    theta = random.uniform(0, 2*math.pi)
    intensity = random.uniform(0.0, 1.0)
    p = ((r, r),
         (r*(1+math.cos(theta)), r*(1+math.sin(theta))),
         (r*(1+math.cos(theta+1)), r*(1+math.sin(theta+1))))
    draw.polygon(p, fill=intensity)
    return np.asarray(img)

def generateSine(n, height, width):
    """n images whose flattened pixel sequence traces a sine of random
    wavelength and offset; values scaled into [0, 1]."""
    waveLength = height*np.pi
    parameters = np.random.uniform(low=waveLength*1.2, high=waveLength*1.8, size=(n, 2)).astype(np.float32)
    data = np.zeros((n, height*width)).astype(np.float32)
    data += np.linspace(start=0.0, stop=height*width-1, num=height*width).astype(np.float32)[np.newaxis, :]
    data /= parameters[:, 1][:, np.newaxis]
    data += parameters[:, 0][:, np.newaxis] * 10
    data = (np.sin(data)+1.0) / 2
    return data.reshape((n, height, width))

def generate1DUniform(n):
    """n scalars uniform on [-1, 1]; shape (n, 1), float32."""
    return np.random.uniform(low=-1, high=+1, size=(n, 1)).astype(np.float32)

def generate1DTriangle(n):
    """n scalars with density rising linearly on [0, 1] (max of two
    independent uniforms); shape (n, 1)."""
    bi = np.random.uniform(low=0, high=1, size=(n, 2)).astype(np.float32)
    data = np.max(bi, axis=-1, keepdims=True)
    assert data.shape==(n, 1)
    return data

def generate2DCircle(n):
    """n points uniform on the unit disk, via rejection sampling."""
    slacked = 2*n+100  # oversample: acceptance rate is pi/4, so this suffices w.h.p.
    cartesian = np.random.rand(slacked, 2).astype(np.float32)
    cartesian *= 2
    cartesian -= 1
    cartesian = cartesian[np.sum(cartesian*cartesian, axis=1)<1, :]
    assert len(cartesian)>=n
    cartesian = cartesian[:n, :]

    # import matplotlib.pyplot as plt
    # plt.scatter(cartesian[:,0], cartesian[:,1])
    # plt.savefig("circle.pdf")
    # plt.close()

    return cartesian


def generate2DHalfcircle(n):
    """n points uniform on the right half of the unit disk (x in [0, 1))."""
    slacked = 2*n+100
    cartesian = np.random.rand(slacked, 2).astype(np.float32)
    cartesian[:, 1] *= 2
    cartesian[:, 1] -= 1
    cartesian = cartesian[np.sum(cartesian*cartesian, axis=1)<1, :]
    assert len(cartesian)>=n
    cartesian = cartesian[:n, :]
    return cartesian


# Maps params.inputType to [generator function, names of extra params it needs].
GENERATOR_FUNCTIONS = {"wave": [generateWave, ["waveCount"]],
                       "plane": [generatePlane, []],
                       "clock": [generateClock, []],
                       "sine": [generateSine, []],
                       "1d.uniform": [generate1DUniform, []],
                       "1d.triangle": [generate1DTriangle, []],
                       "2d.circle": [generate2DCircle, []],
                       "2d.halfcircle": [generate2DHalfcircle, []]
                       }

def readData(params):
    """Load (train, validation) data according to params.inputType.

    Returns a pair (data, validation): image-like types have shape
    (sampleCount, height, width), isLowDim generators (sampleCount, dim)."""
    if params.inputType=="image":
        data, (height, width) = faces(params.imageDirectory)
        n = len(data)
        # Floor division: identical to 9*n/10 on Python 2 ints, safe on Python 3.
        trainSize = 9*n//10
        validation = data[trainSize:]
        data = data[:trainSize]
    elif params.inputType=="mnist":
        autoencoded = params.get("autoencoded", False)
        data, (height, width) = mnist(params.inputDigit, which='train', everyNth=params.everyNthInput, autoencoded=autoencoded)
        # NOTE(review): validation is not subsampled by everyNth -- confirm intent.
        validation, (_, _) = mnist(params.inputDigit, which='validation', autoencoded=autoencoded)
    elif params.inputType in GENERATOR_FUNCTIONS.keys():
        generatorFunction, argNames = GENERATOR_FUNCTIONS[params.inputType]
        arguments = { argName: params[argName] for argName in argNames }

        isLowDim = "isLowDim" in params and params.isLowDim
        if isLowDim:
            assert "height" not in params and "width" not in params, "For isLowDim==True, height and width params are meaningless."
            data = generatorFunction(params.trainSize, **arguments)
            validation = generatorFunction(params.validSize, **arguments)
        else:
            height, width = params.height, params.width
            data = generatorFunction(params.trainSize, height, width, **arguments)
            validation = generatorFunction(params.validSize, height, width, **arguments)
    else:
        assert False, "unknown params.inputType %s" % params.inputType
    if "height" in params or "width" in params:
        assert (params.height == height) and (params.width == width), "%d!=%d or %d!=%d" % (params.height, height, params.width, width)
    return data, validation


def dumpParams(params, f):
    """Write params sorted by key, one tab-separated key/value pair per line.

    f.write() replaces the Python-2-only `print >>f` statement; the emitted
    bytes are identical."""
    for k in sorted(params.keys()):
        f.write(k + "\t" + str(params[k]) + "\n")

def heuristicCast(s):
    s = s.strip() # Don't let some stupid whitespace fool you.
234 | if s=="None": 235 | return None 236 | elif s=="True": 237 | return True 238 | elif s=="False": 239 | return False 240 | try: 241 | return int(s) 242 | except ValueError: 243 | pass 244 | try: 245 | return float(s) 246 | except ValueError: 247 | pass 248 | return s 249 | 250 | def paramsFromConf(f): 251 | params = AttrDict() 252 | for l in f: 253 | if l.startswith("#"): 254 | continue 255 | try: 256 | k, v = l.strip("\n").split("\t") 257 | except: 258 | assert False, "Malformed config line " + l.strip() 259 | try: 260 | v = heuristicCast(v) 261 | except ValueError: 262 | assert False, "Malformed parameter value " + v 263 | params[k] = v 264 | return params 265 | -------------------------------------------------------------------------------- /nnbase/layers.py: -------------------------------------------------------------------------------- 1 | from lasagne import layers 2 | 3 | class Unpool2DLayer(layers.Layer): 4 | """ 5 | This layer performs unpooling over the last two dimensions 6 | of a 4D tensor. 
7 | """ 8 | def __init__(self, incoming, ds, **kwargs): 9 | 10 | super(Unpool2DLayer, self).__init__(incoming, **kwargs) 11 | 12 | if (isinstance(ds, int)): 13 | raise ValueError('ds must have len == 2') 14 | else: 15 | ds = tuple(ds) 16 | if len(ds) != 2: 17 | raise ValueError('ds must have len == 2') 18 | if ds[0] != ds[1]: 19 | raise ValueError('ds should be symmetric (I am lazy)') 20 | self.ds = ds 21 | 22 | def get_output_shape_for(self, input_shape): 23 | output_shape = list(input_shape) 24 | 25 | output_shape[2] = input_shape[2] * self.ds[0] 26 | output_shape[3] = input_shape[3] * self.ds[1] 27 | 28 | return tuple(output_shape) 29 | 30 | def get_output_for(self, input, **kwargs): 31 | ds = self.ds 32 | input_shape = input.shape 33 | output_shape = self.get_output_shape_for(input_shape) 34 | return input.repeat(2, axis=2).repeat(2, axis=3) 35 | -------------------------------------------------------------------------------- /nnbase/shape.py: -------------------------------------------------------------------------------- 1 | # See https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.py#L16-L18 2 | 3 | import numpy as np 4 | 5 | from lasagne.theano_extensions import padding 6 | 7 | from lasagne.layers import Layer 8 | 9 | 10 | __all__ = [ 11 | "FlattenLayer", 12 | "flatten", 13 | "ReshapeLayer", 14 | "reshape", 15 | "DimshuffleLayer", 16 | "dimshuffle", 17 | "PadLayer", 18 | "pad", 19 | ] 20 | 21 | 22 | class FlattenLayer(Layer): 23 | def get_output_shape_for(self, input_shape): 24 | return (input_shape[0], int(np.prod(input_shape[1:]))) 25 | 26 | def get_output_for(self, input, **kwargs): 27 | return input.flatten(2) 28 | 29 | flatten = FlattenLayer # shortcut 30 | 31 | 32 | class ReshapeLayer(Layer): 33 | """ 34 | A layer reshaping its input tensor to another tensor of the same total 35 | number of elements. 
36 | 37 | :parameters: 38 | - incoming : a :class:`Layer` instance or a tuple 39 | the layer feeding into this layer, or the expected input shape 40 | 41 | - shape : tuple 42 | The target shape specification. Any of its elements can be `[i]`, 43 | a single-element list of int, denoting to use the size of the ith 44 | input dimension. At most one element can be `-1`, denoting to 45 | infer the size for this dimension to match the total number of 46 | elements of the input tensor. Any remaining elements must be 47 | positive integers directly giving the size of the corresponding 48 | dimension. 49 | 50 | :usage: 51 | >>> from lasagne.layers import InputLayer, ReshapeLayer 52 | >>> l_in = InputLayer((None, 100, 20)) 53 | >>> l1 = ReshapeLayer(l_in, ([0], [1], 2, 10)) 54 | >>> l1.get_output_shape() 55 | (None, 100, 2, 10) 56 | >>> l2 = ReshapeLayer(l_in, ([0], 1, 2, 5, -1)) 57 | >>> l2.get_output_shape() 58 | (None, 1, 2, 5, 200) 59 | 60 | :note: 61 | The tensor elements will be fetched and placed in C-like order. That 62 | is, reshaping `[1,2,3,4,5,6]` to shape `(2,3)` will result in a matrix 63 | `[[1,2,3],[4,5,6]]`, not in `[[1,3,5],[2,4,6]]` (Fortran-like order), 64 | regardless of the memory layout of the input tensor. For C-contiguous 65 | input, reshaping is cheap, for others it may require copying the data. 
66 | """ 67 | 68 | def __init__(self, incoming, shape, **kwargs): 69 | super(ReshapeLayer, self).__init__(incoming, **kwargs) 70 | shape = tuple(shape) 71 | for s in shape: 72 | if isinstance(s, int): 73 | if s == 0 or s < - 1: 74 | raise ValueError("`shape` integers must be positive or -1") 75 | elif isinstance(s, list): 76 | if len(s) != 1 or not isinstance(s[0], int) or s[0] < 0: 77 | raise ValueError("`shape` input references must be " 78 | "single-element lists of int >= 0") 79 | else: 80 | raise ValueError("`shape` must be a tuple of int and/or [int]") 81 | if sum(s == -1 for s in shape) > 1: 82 | raise ValueError("`shape` cannot contain multiple -1") 83 | self.shape = shape 84 | 85 | def get_output_shape_for(self, input_shape, **kwargs): 86 | # Initialize output shape from shape specification 87 | output_shape = list(self.shape) 88 | # First, replace all `[i]` with the corresponding input dimension, and 89 | # mask parts of the shapes thus becoming irrelevant for -1 inference 90 | masked_input_shape = list(input_shape) 91 | masked_output_shape = list(output_shape) 92 | for dim, o in enumerate(output_shape): 93 | if isinstance(o, list): 94 | if o[0] >= len(input_shape): 95 | raise ValueError("specification contains [%d], but input " 96 | "shape has %d dimensions only" % 97 | (o[0], len(input_shape))) 98 | output_shape[dim] = input_shape[o[0]] 99 | masked_output_shape[dim] = input_shape[o[0]] 100 | if (input_shape[o[0]] is None) \ 101 | and (masked_input_shape[o[0]] is None): 102 | # first time we copied this unknown input size: mask 103 | # it, we have a 1:1 correspondence between out[dim] and 104 | # in[o[0]] and can ignore it for -1 inference even if 105 | # it is unknown. 
106 | masked_input_shape[o[0]] = 1 107 | masked_output_shape[dim] = 1 108 | # From the shapes, compute the sizes of the input and output tensor 109 | input_size = (None if any(x is None for x in masked_input_shape) 110 | else np.prod(masked_input_shape)) 111 | output_size = (None if any(x is None for x in masked_output_shape) 112 | else np.prod(masked_output_shape)) 113 | del masked_input_shape, masked_output_shape 114 | # Finally, infer value for -1 if needed 115 | if -1 in output_shape: 116 | dim = output_shape.index(-1) 117 | if (input_size is None) or (output_size is None): 118 | output_shape[dim] = None 119 | output_size = None 120 | else: 121 | output_size *= -1 122 | output_shape[dim] = input_size // output_size 123 | output_size *= output_shape[dim] 124 | # Sanity check 125 | if (input_size is not None) and (output_size is not None) \ 126 | and (input_size != output_size): 127 | raise ValueError("%s cannot be reshaped to specification %s. " 128 | "The total size mismatches." % 129 | (input_shape, self.shape)) 130 | return tuple(output_shape) 131 | 132 | def get_output_for(self, input, **kwargs): 133 | # Replace all `[i]` with the corresponding input dimension 134 | output_shape = list(self.shape) 135 | for dim, o in enumerate(output_shape): 136 | if isinstance(o, list): 137 | output_shape[dim] = input.shape[o[0]] 138 | # Everything else is handled by Theano 139 | return input.reshape(tuple(output_shape)) 140 | 141 | reshape = ReshapeLayer # shortcut 142 | 143 | 144 | class DimshuffleLayer(Layer): 145 | """ 146 | A layer that rearranges the dimension of its input tensor, maintaining 147 | the same same total number of elements. 148 | 149 | :parameters: 150 | - incoming : a :class:`Layer` instance or a tuple 151 | the layer feeding into this layer, or the expected input shape 152 | 153 | - pattern : tuple 154 | The new dimension order, with each element giving the index 155 | of the dimension in the input tensor or `'x'` to broadcast it. 
156 | For example `(3,2,1,0)` will reverse the order of a 4-dimensional 157 | tensor. Use `'x'` to broadcast, e.g. `(3,2,1,'x',0)` will 158 | take a 4 tensor of shape `(2,3,5,7)` as input and produce a 159 | tensor of shape `(7,5,3,1,2)` with the 4th dimension being 160 | broadcast-able. In general, all dimensions in the input tensor 161 | must be used to generate the output tensor. Omitting a dimension 162 | attempts to collapse it; this can only be done to broadcast-able 163 | dimensions, e.g. a 5-tensor of shape `(7,5,3,1,2)` with the 4th 164 | being broadcast-able can be shuffled with the pattern `(4,2,1,0)` 165 | collapsing the 4th dimension resulting in a tensor of shape 166 | `(2,3,5,7)`. 167 | 168 | :usage: 169 | >>> from lasagne.layers import InputLayer, DimshuffleLayer 170 | >>> l_in = InputLayer((2, 3, 5, 7)) 171 | >>> l1 = DimshuffleLayer(l_in, (3, 2, 1, 'x', 0)) 172 | >>> l1.get_output_shape() 173 | (7, 5, 3, 1, 2) 174 | >>> l2 = DimshuffleLayer(l1, (4, 2, 1, 0)) 175 | >>> l2.get_output_shape() 176 | (2, 3, 5, 7) 177 | """ 178 | def __init__(self, incoming, pattern, **kwargs): 179 | super(DimshuffleLayer, self).__init__(incoming, **kwargs) 180 | 181 | # Sanity check the pattern 182 | used_dims = set() 183 | for p in pattern: 184 | if isinstance(p, int): 185 | # Dimension p 186 | if p in used_dims: 187 | raise ValueError("pattern contains dimension {0} more " 188 | "than once".format(p)) 189 | used_dims.add(p) 190 | elif p == 'x': 191 | # Broadcast 192 | pass 193 | else: 194 | raise ValueError("pattern should only contain dimension" 195 | "indices or 'x', not {0}".format(p)) 196 | 197 | self.pattern = pattern 198 | 199 | def get_output_shape_for(self, input_shape): 200 | # Build output shape while keeping track of the dimensions that we are 201 | # attempting to collapse, so we can ensure that they are broadcastable 202 | output_shape = [] 203 | dims_used = [False] * len(input_shape) 204 | for p in self.pattern: 205 | if isinstance(p, int): 206 | if p < 
class PadLayer(Layer):
    """Pads all non-batch dimensions of the input with a constant value.

    :parameters:
        - incoming : the :class:`Layer` feeding into this layer
        - width : int
            number of cells padded on each border of each padded dimension
        - val : padding value (default 0)
        - batch_ndim : int
            dimensions before this index are left unpadded (default 2,
            i.e. batch and channel dimensions pass through)
    """
    def __init__(self, incoming, width, val=0, batch_ndim=2, **kwargs):
        super(PadLayer, self).__init__(incoming, **kwargs)
        self.width = width
        self.val = val
        self.batch_ndim = batch_ndim

    def get_output_shape_for(self, input_shape):
        # Grow each padded dimension by 2*width; batch dimensions and
        # unknown (None) sizes pass through unchanged.
        # BUG FIX: previously a None entry (symbolic/unknown size, which
        # Lasagne shape tuples allow) crashed with a TypeError on
        # `None + 2 * self.width`.
        output_shape = []
        for k, s in enumerate(input_shape):
            if k < self.batch_ndim or s is None:
                output_shape.append(s)
            else:
                output_shape.append(s + 2 * self.width)
        return tuple(output_shape)

    def get_output_for(self, input, **kwargs):
        # Delegate the actual symbolic padding to the padding helper.
        return padding.pad(input, self.width, self.val, self.batch_ndim)
class FlipBatchIterator(BatchIterator):
    """Batch iterator that augments each minibatch by flipping a random
    half of the images horizontally and a random half vertically (the two
    halves may overlap), artificially enlarging the training set.
    """

    def transform(self, X1, X2):
        # X1 is the image batch (N, C, H, W); X2 is flattened targets that
        # mirror X1, so we reshape it to apply the same flips to both.
        X1b, X2b = super(FlipBatchIterator, self).transform(X1, X2)
        X2b = X2b.reshape(X1b.shape)

        bs = X1b.shape[0]
        # BUG FIX: `bs / 2` is float division under Python 3 and
        # np.random.choice rejects a float size; `bs // 2` is identical
        # under Python 2 and correct under Python 3.
        h_indices = np.random.choice(bs, bs // 2, replace=False)  # horizontal flip
        v_indices = np.random.choice(bs, bs // 2, replace=False)  # vertical flip

        ### uncomment these lines if you want to include rotations (images must be square) ###
        #r_indices = np.random.choice(bs, bs // 2, replace=False)  # 90 degree rotation
        for X in (X1b, X2b):
            X[h_indices] = X[h_indices, :, :, ::-1]
            X[v_indices] = X[v_indices, :, ::-1, :]
            #X[r_indices] = np.swapaxes(X[r_indices, :, :, :], 2, 3)

        # Flatten the targets back to their original 2D layout.
        shape = X2b.shape
        X2b = X2b.reshape((shape[0], -1))

        return X1b, X2b
def get_picture_array_simple(X, height, width, index):
    """Convert sample `index` of X into a 4x-upscaled uint8 image array.

    X holds samples whose rows reshape to (height, width), with values
    roughly in [0, 1]. The sample is scaled to [0, 255], clipped, and
    upscaled 4x along both axes by nearest-neighbor repetition.

    Returns a (4*height, 4*width) uint8 ndarray.
    """
    array = X[index].reshape((height, width))
    array = np.clip(array * 255, a_min=0, a_max=255)
    # BUG FIX: astype wants the dtype np.uint8, not np.uint8() — the
    # latter is a zero-dimensional scalar, not a dtype specifier, and is
    # rejected by modern numpy.
    return array.repeat(4, axis=0).repeat(4, axis=1).astype(np.uint8)
def get_picture_array(X, n_x, n_y, name):
    # Render the samples in X as one tiled n_x-by-n_y grid image and save
    # it as <name>.png. Grid layout is delegated to get_numpy_picture_array.
    image_data = get_numpy_picture_array(X, n_x, n_y)
    img = Image.fromarray(image_data)
    img.save(name+".png")
32 | 33 | 34 | # Back at digits again, playing with larger n: 35 | # exp.20dCubeMixture.2layerTanh.n100.digit7 is a stupid mistake. Both 36 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit7 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit7 37 | # and 38 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit7 300 > cout.exp.20dCubeMixture.2layerTanh.n300.digit7 39 | # were pointing to this dir. The n300 started later, so it's overwritten the other, except for 40 | # the latest images, *101700.png - *102800.png. n300 is definitely worse, 41 | # more prone to forked lines. Why? 42 | 43 | # Now running: 44 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit3 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit3 45 | python generative-mlp.py exp.20dGaussian.2layerTanh.n300.digit3 300 > cout.exp.20dGaussian.2layerTanh.n300.digit3 46 | # UPDATE: Dumb me, that's n300 right there. 47 | # Have to do Gaussian again with n100. See below. 48 | 49 | # The filenames tell all, hopefully. For the record: 50 | # 100 hidden units, learning_rate=0.02, momentum=0.5 51 | # scale of normal distribution 1/4, findGenForData True, overSamplingFactor 1. 52 | 53 | # UPDATE: Disregard this paragraph, it compares gauss.n300 to mixture.n100. 54 | # -> After some 2000 epochs, the main difference is that mixture does the forks, 55 | # gauss doesn't, but gauss is super non-diverse. 56 | # After some 10000-30000 epochs (pretty subjective when) mixture stops doing the forks. 57 | # The weirdest is that around here, gauss starts the fork thing, while still not 58 | # being as diverse as mixture. All in all, it's objectively worse. 59 | 60 | # UPDATE: Apples to apples aka n100 to n100 comparison 61 | # between mixture and gauss. 62 | # and also between gauss.n300 and gauss.n100. 63 | python generative-mlp.py exp.20dGaussian.2layerTanh.n100.digit3 100 > cout.exp.20dGaussian.2layerTanh.n100.digit3 64 | # -> EVALUATE! 
65 | 66 | # Okay, let's go all in, how about getting rid of the continuous component? 67 | python generative-mlp.py exp.20dBoolean.2layerTanh.n100.digit3 100 > cout.exp.20dBoolean.2layerTanh.n100.digit3 68 | python generative-mlp.py exp.50dBoolean.2layerTanh.n100.digit3 100 > cout.exp.50dBoolean.2layerTanh.n100.digit3 69 | # -> EVALUATE! 70 | 71 | # Does this fully-boolean-craziness work with more diverse data as well? 72 | python generative-mlp.py exp.50dBoolean.2layerTanh.n100.digitAll 100 > cout.exp.50dBoolean.2layerTanh.n100.digitAll 73 | # -> EVALUATE! 74 | 75 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit2 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit2 76 | python generative-mlp.py exp.50dCubeMixture.2layerTanh.n100.digit2 100 > cout.exp.50dCubeMixture.2layerTanh.n100.digit2 77 | # -> Waiting for results. EVALUATE! 78 | 79 | # Lots of work done on quantifying generation performance. 80 | # We sample train and validation (unseen), greedily approximate them with generated samples, 81 | # and quantify/visualize difference between gold and nearest generated (surrogate). 82 | # Specifically, we log total L2 diff on train and valid, 83 | # we visualize difference, and histogram L2 distances between gold and surrogate. 84 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis 85 | 86 | cd exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis 87 | dir=diff_validation ; convert input.png $dir[1-9]000.png $dir[0-9][0-9]000.png $dir[0-9][0-9][0-9]000.png -delay 10 -loop 0 $dir.gif 88 | # (inputs.png is only there as an almost subliminal signal to mark the beginning of the sequence.) 89 | 90 | # -> ANALYZE A BIT MORE, but at first glance, it seem like it does not 91 | # really converge after an initial phase. It's very adamant in NOT 92 | # learning outliers. If it does not like something, it consistently 93 | # behaves like it were not there. Why? 
94 | 95 | # For all digits, sampleTotal1e5 (as above): 96 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis 97 | # Same but sampleTotal1e6, let's give the model a bit more chance to reproduce weird things: 98 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis.sampleTotal1e6 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis.sampleTotal1e6 99 | # -> Setting plotEach=1000 was dumb here, but we'll live with it. 100 | 101 | # Visually inspecting the above two, it seems like sampleTotal1e6 over sampleTotal1e5 102 | # causes just a tiny improvement in matching. (1. When the general shape is 103 | # recognised, the details are similar. 2. It's rare that 1e6 recognizes the 104 | # general shape while 1e5 does not.) 105 | 106 | # Quantitatively: 107 | paste <( grep t < cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis) <( grep t < cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis.sampleTotal1e6) | grep -v time | sed "s/_distance//g" 108 | epoch 0 train 3539.577401 validation 3477.146151 epoch 0 train 3522.447895 validation 3458.440632 109 | epoch 1000 train 2213.074515 validation 2232.757161 epoch 1000 train 2185.971621 validation 2206.666417 110 | epoch 2000 train 2142.505005 validation 2149.576490 epoch 2000 train 2107.168020 validation 2124.794779 111 | epoch 3000 train 2135.671001 validation 2104.446155 epoch 3000 train 2045.129582 validation 2079.306816 112 | epoch 4000 train 2067.423567 validation 2073.011328 epoch 4000 train 2034.931709 validation 2050.239458 113 | epoch 5000 train 2073.096339 validation 2053.568913 epoch 5000 train 2020.982528 validation 2032.500311 114 | epoch 6000 train 2030.102489 validation 2039.459535 epoch 6000 train 2024.169692 validation 2019.274401 115 | epoch 7000 train 2009.205883 validation 2029.150878 epoch 7000 train 1999.691904 validation 2010.084477 116 | epoch 8000 train 2030.954336 
validation 2016.807750 epoch 8000 train 2008.445006 validation 2000.994180 117 | epoch 9000 train 1996.428312 validation 2007.845411 epoch 9000 train 1975.903737 validation 1992.986778 118 | epoch 10000 train 2004.636604 validation 2001.809792 epoch 10000 train 1971.208121 validation 1988.213054 119 | epoch 11000 train 1970.277075 validation 1996.890203 epoch 11000 train 1941.435286 validation 1983.694747 120 | epoch 12000 train 1991.091591 validation 1990.768504 epoch 12000 train 1959.057641 validation 1980.160241 121 | epoch 13000 train 1951.329268 validation 1986.618364 epoch 13000 train 1965.147074 validation 1974.976362 122 | epoch 14000 train 1971.516946 validation 1982.143524 epoch 14000 train 1941.614413 validation 1972.079845 123 | epoch 15000 train 2034.801743 validation 1982.433723 epoch 15000 train 1988.605444 validation 1968.329759 124 | epoch 16000 train 1962.617783 validation 1976.536872 epoch 16000 train 1944.039658 validation 1965.219307 125 | epoch 17000 train 1958.752054 validation 1974.458151 epoch 17000 train 1942.624157 validation 1962.432814 126 | epoch 18000 train 1969.002308 validation 1972.265950 epoch 18000 train 1919.335813 validation 1960.542414 127 | epoch 19000 train 1973.435493 validation 1971.694208 epoch 19000 train 1948.049374 validation 1957.739240 128 | epoch 20000 train 1949.355630 validation 1968.837136 epoch 20000 train 1965.053454 validation 1955.413557 129 | epoch 21000 train 1951.518872 validation 1967.355088 epoch 21000 train 1949.744274 validation 1952.960039 130 | # The validation set is fixed, the train is a random 400 sample of whole train, 131 | # that's why train is much more jumpy. But both are still converging, 132 | # after 21000 epochs 27 hours. Slooow convergence, zero overfitting. 133 | 134 | # Let's take a look at digit2, it had more time for less data. 
135 | cat cout.exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis | grep train | grep "[24680]0000 " | less 136 | epoch 20000 train_distance 2101.250673 validation_distance 2120.222100 137 | epoch 40000 train_distance 2029.797620 validation_distance 2046.173394 138 | epoch 60000 train_distance 1974.057262 validation_distance 2019.035806 139 | epoch 80000 train_distance 1988.527564 validation_distance 1998.971389 140 | epoch 100000 train_distance 1922.723794 validation_distance 1985.801420 141 | epoch 120000 train_distance 1947.624636 validation_distance 1976.859255 142 | epoch 140000 train_distance 1968.396235 validation_distance 1971.583964 143 | epoch 160000 train_distance 1936.777050 validation_distance 1968.063181 144 | epoch 180000 train_distance 1942.150860 validation_distance 1963.848308 145 | epoch 200000 train_distance 1927.290592 validation_distance 1962.477934 146 | epoch 220000 train_distance 1937.469074 validation_distance 1961.097084 147 | epoch 240000 train_distance 1913.067156 validation_distance 1959.697684 148 | epoch 260000 train_distance 1932.563189 validation_distance 1957.309960 149 | # -> Still converging, but the rate is worthless now, after some 29 hours. 150 | 151 | #### 152 | 153 | # TODO With our new evaluation weapons, let's re-attack the issue of 154 | # how to assign surrogates and samples to each other. 155 | # The most important is to take a second look at m. 156 | # (Now that we moved theano graph compilation out of the inner loop.) 157 | # Remember, m is the number of generated samples to choose from 158 | # when finding pairs to n gold points. 159 | # The learning rate should also be checked. 160 | # Things like findGenForData, overSamplingFactor, and maybe 161 | # an epoch-dependent n (minibatch sampling size) or learning rate (a la Kohonen). 162 | 163 | #### 164 | 165 | # Okay, focusing on m. 
At the early phase of training, it's probably 166 | # not smart to have a large m, as that means that many generated point 167 | # stay at their bad place. Further in the training, it is smart, 168 | # as that helps to learn small details. 169 | # That's theorizing, but let's start with something super simple: m=1000 n=100. 170 | 171 | # From now on, moreVis is taken as given, so the parent exp of 172 | # exp.20dCubeMixture.2layerTanh.n100.m10000.digitAll 173 | # is exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis (the one with sampleTotal1e5) 174 | # diff: plotEach=100 (was 1000), m=1000 (was m=n=100). 175 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.m1000.digitAll 100 > cout.exp.20dCubeMixture.2layerTanh.n100.m1000.digitAll 176 | # -> For some reason, this is so slow that it doesn't even make sense to 177 | # compare it with its parent exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis. 178 | # 680 epochs in 6 hours, versus the old 26000 epochs in 31 hours, 10000 epochs per 12 hours. 179 | # And these new 680-epoch are comparable in quality to the old 2000-epoch results, achieved in 180 | # just 2.5 hours. 181 | # Maybe the ridiculous slowness is just a bug, but let's postpone figuring this out 182 | # after making autoencoder work. 183 | 184 | #### 185 | 186 | # Spearmint-ize 187 | # (Spearmint because I couldn't figure out how to use kwargs with hyperopt) 188 | 189 | # Had to rename generative-mlp.py to generativeMLP.py 190 | # so that it can be imported as a module. 191 | 192 | # We are in the lasagne-demo venv. 193 | brew install mongodb 194 | pip install pymongo 195 | git clone git@github.com:HIPS/Spearmint.git 196 | pip install -e Spearmint 197 | mkdir mongodb 198 | cd mongodb 199 | mongod --fork --logpath ./log --dbpath . 200 | cd .. 201 | mkdir spearmintOutput 202 | 203 | # Reads config.json which references spearmintTask.py 204 | # and writes to a directory named "output". 
205 | # Also spearmintTask.py is set up so that it creates directories for each 206 | # job, dirname ./spearmintOutput/LR0.010-n10 or such. 207 | python Spearmint/spearmint/main.py . 208 | 209 | # Cleanup command resets experiment: 210 | Spearmint/spearmint/cleanup.sh . 211 | # TODO This . was not intended for this, there should be a proper subdir for it. 212 | 213 | # It's running now. spearmintOutput/log.cerr is where the current best is visible. 214 | 215 | # -> Stopped, it has basically converged. Moved everything to spearmintExps/epoch200 216 | # I copied the config.json there as well. 217 | # Turns out the best is the maximal allowed inDim 50 and the maximal allowed minibatchSize (!) 100. 218 | # (We've seen smaller minibatchSizes to be better when inDim was small, haven't we? Not sure anymore.) 219 | # Learning rate converged to ~10, it was constrained to [0.2, 50]. 220 | 221 | Spearmint/spearmint/cleanup.sh . 222 | python Spearmint/spearmint/main.py . > spearmintOutput/log.cout 2> spearmintOutput/log.cerr 223 | # - logs of individual runs are in spearmintOutput/*/log.txt 224 | # - spearmint current best is in spearmintOutput/log.cerr 225 | # - jobs are logged in ./output/*. It's really only useful for two things: 226 | # it has cerrs of my jobs, and it has the running times. 227 | # - if we want to graph or something, mongodb is the way, dbname is in config.json 228 | 229 | # Hideous but mostly harmless hack while I learn to query the mongodb or write better logging: 230 | grep final spearmintOutput/*/log.txt | sed "s/\/log.txt:final performance / /" | sed "s/spearmintOutput\///" | tr ' -' '\t' | awk '{ print $NF "\t" $0 }' | sort -n 231 | 232 | # TODOS 233 | # - run the current best for a large number of epochs. 234 | # - tune the last important untuned parameter: the variance of the input gaussian, 235 | # or more generally, the input distribution. (value is not very sensitive to inDim, 236 | # so we might as well fix it as small.) 
237 | # - figure out a metric that punishes memorizing samples. 238 | # - log running times in log.txt. maybe we can play tricks with taking 239 | # the median of epoch runtimes instead of sum, that would approximate CPU time pretty well. 240 | # - save git sha + diff in exp dir. 241 | # - revive the mainLowDim codepath. 242 | 243 | # Which conf is currently the best? 244 | grep final spearmintOutput/*/log.txt | awk '{ print $NF,$0 }' | grep -v "^nan" | sort -n | head -1 | sed "s/log\.txt.*/conf.txt/" 245 | open `grep final spearmintOutput/*/log.txt | awk '{ print $NF,$0 }' | sort -n | grep -v "^nan" | head -10 | cut -f2 -d' ' | cut -f1 -d':' | sed "s/log\.txt/s400.png/"` 246 | # -> Visually, some of them are more perfect but less diverse, 247 | # some of them are varying a lot in brightness, 248 | # TODO which parameters influence these? 249 | 250 | # Seems like epoch200 and epoch400 does not tell much about the later convergence 251 | # properties of a param-setting. How about epoch1600? 252 | 253 | # I took the current best, rounded the params a bit, and the result is 254 | # deepDives/conf1.txt 255 | # output is deepDives/conf1-hls200-inDim20-lr10-mom0.6-n300-os4.0 256 | # The above is the general workflow: Take promising confs, tune them, 257 | # set expName to deepDives/confN-DETAILED_DESCRIPTION_PATH, 258 | # put them into deepDives/confN.txt, add that to git. 259 | # When it has run, maybe add final round output to git as well. 260 | 261 | # As seen on 262 | # https://docs.google.com/spreadsheets/d/1IWE7_Xeh81Pa9MgaV2QsDKJSHYmkQjBQring_xdC3CY/edit#gid=0 263 | # , overfitting kicks in a epoch12000. 264 | # epoch TMean TMedian VMean VMedian (10-moving averages) 265 | # 12000 4.1544335 4.2569459 4.19644 4.22872 266 | # 87200 4.1218603 4.227801 4.2057856 4.2594421 267 | 268 | # conf2 is same as conf1 except for the smaller learning rate 10->1. 269 | # Surprisingly the convergence is not that much slower. 
270 | # 271 | # Also surprisingly, it seems like it will never reach conf1 accuracy. 272 | # (conf2 vmean settling near 4.27 at epoch24000 but already at 4.28 at epoch1000. 273 | # while conf1 vmean stopped at 4.19 at epoch14000.) 274 | 275 | # I fixed a visualization UX issue: s*.png are now generated from 276 | # the same random numbers, so that they form an animation. 277 | # The flipside is the we now see a smaller part of the generated space. 278 | # spearmintExps/epoch1600/output/00000136.out aka 279 | # spearmintExps/epoch1600/hls117-inDim88-lr2.12805643612-mom0.647445989185-n300-os3.9988942992 280 | # is the first such one. 281 | 282 | mv spearmintOutput spearmintExps/epoch1600 283 | mv output spearmintExps/epoch1600/ 284 | 285 | # Let's try conf1 with layerNum=3, and call it conf3. 286 | # ...Wow. That's amazing. At vmean 4.12 at epoch2800. 287 | # Maybe only the bigger number of parameters? Should check. 288 | 289 | # conf3 vmean plateaued between epoch4400 and epoch12000 at 4.10, 290 | # and then slowly crawled up to 4.13. 291 | 292 | # The new spearmint run epochCount4800_depth3_useReLUFalse_everyNthInput10 293 | # runs on commit 414fb5df9d8bec71f1c05ae199f6f891ca3a5cb1. 294 | # It is different from the parent epochCount1600_useReLUFalse_everyNthInput10 295 | # in the following ways: 296 | # layerNum 2 -> 3, epoch 1600 -> 4800, plotEach 400 -> 800 297 | # learningRate.max 200.0 -> 20.0. 298 | # indim (20,100) -> (10,50) 299 | # and uses vmean instead of vmedian as value. 300 | # Be careful when you compare with the previous spearmint run's vmedians. 301 | # (They move together anyway, but vmedian is super bumpy. A typical difference between the two: 302 | # vmedian is 0.06 larger than vmean, regardless of the current epoch.) 303 | 304 | mkdir spearmintOutput 305 | python Spearmint/spearmint/main.py . 
> spearmintOutput/log.cout 2> spearmintOutput/log.cerr 306 | 307 | # Weirdly, its top contender after 9 runs, 308 | # 4.489997 spearmintOutput/hls300-inDim10-lr6.34021189647-mom0.5-n300-os3.58636953526 309 | # has parameters quite similar to 310 | # 4.106030 deepDives/conf3-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0 311 | # , but the numbers are much-much worse at epoch4800: 312 | # conf3 epoch 4800 trainMean 3.938845 trainMedian 4.025593 validationMean 4.106030 validationMedian 4.138099 313 | # spearmint epoch 4800 trainMean 4.452593 trainMedian 4.579250 validationMean 4.489997 validationMedian 4.532965 314 | # UPDATE: I seriously botched this: layerNum 3 was the main idea, 315 | # but I actually used layer2. Useless, I put it into Attic/botched.depth2insteadofdepth3 316 | # See below notes on epochCount4800_depth3_4_useReLUFalse_everyNthInput10 about how I fixed this. 317 | # UPDATE2, even more important: I inadverently used relu in all deepDives. 318 | 319 | # deepDives/conf4 is the same as the successful conf3, but with the faces dataset. 320 | # One weird thing is that the output has lots of damaged pixels which are always black. 321 | # (UPDATE: I used relu here without knowing it, that's the reason.) 322 | # (Probably always negative, and clipped to 0.) These go away, but very very slowly: 323 | # at epoch5000 we have ~25 damaged pixels, epoch12000 ~10, epoch20000 exactly 2. 324 | # Unfortunately the result of conf4 is not very convincing. Some of the time it's 325 | # just rote learning, other times the nearest generated sample is a linear combination 326 | # of two rote-learned faces. At least it's pretty good at rote learning: 327 | # reproduces quite a few details of the train sample. 328 | # Of course, what did I expect with just 400 training samples and minibatchsize n300? 
329 | 330 | # Motivated by this, I implemented the following benchmark and visualization: 331 | # Same as diff_validation, but with the train dataset taking the place of the generated 332 | # samples. Needs some refactor. I'll call this the nnbaseline, nn as in nearest neighbor. 333 | # It only has to be run once for each dataset, but it's not a big deal if we run it 334 | # once for each traning session. 335 | # Values: 336 | # inputType=mnist, inputDigit=None, everyNthInput=10, gridSizeForSampling=20 337 | # nnbaselineMean 4.863300 nnbaselineMedian 5.040003 338 | # -> Why is gridSizeForSampling relevant? Because of a stupid mixing of 339 | # responsibilities, we use only the first gridSizeForSampling**2 validation points. 340 | # -> mnist() random seed set to 1. We do randomization there, but reproducibly. 341 | 342 | # That sound like good news, and it probably is: our current best is 343 | # epoch 6400 trainMean 3.906343 trainMedian 4.017440 validationMean 4.103766 validationMedian 4.130489 344 | # , which was probably meta-overfitted a bit, but still better. 345 | # But before we start to celebrate, this is probably an artifact: 346 | # Our generated samples are smoothed, less sharp compared to the gold samples, 347 | # so a close but imperfect match is scored higher than when we compare two gold ones. 348 | 349 | # inputType=image, imageDirectory=../face/SCUT-FBP/thumb.big/, everyNthInput=1, gridSizeForSampling=20 350 | # nnbaselineMean 6.403875 nnbaselineMedian 6.177893 351 | # nnbaselineMean 5.891883 nnbaselineMedian 5.616794 (different seed: 1) 352 | # nnbaselineMean 5.928859 nnbaselineMedian 5.845743 (another seed: 2) 353 | # our current best: (bestish, didn't want to meta-overfit by picking the specific best) 354 | # epoch 28000 trainMean 3.643037 trainMedian 3.592704 validationMean 4.875953 validationMedian 4.736241 355 | # -> This is impressive, but not directly comparable, I forgot to fix the random seed. 
356 | # (Fixed now, but don't know the seed for conf3. Ouch. 357 | # TODO Should be a parameter to make the whole run reproducible.) 358 | 359 | # What about visual comparison? mnist looks okay to me. If it's rote learning, it's 360 | # at least quite convincing. The samples conf4 generates are evil, with all this mixing, 361 | # but the diffs look okay when compared to the nnbaseline (which is bad, not enough data points). 362 | # Side remark: Even though the individual s*.png-s are shitty, s.gif is pretty cool, 363 | # mixing looks like constant smooth crossfading there, and the details slowly emerging look great, 364 | # rote learning or not. 365 | 366 | for dir in diff_validation diff_train s xy yz xz ; do convert input.png $dir[1-9]00.png $dir[0-9][0-9]00.png $dir[0-9][0-9][0-9]00.png -delay 20 -loop 0 $dir.gif ; done 367 | 368 | 369 | ##### 370 | 371 | # Turns out, I seriously botched the epoch4800 spearmint run: used layerNum 2 instead of 3. 372 | # Try again: epochCount4800_depth3_4_useReLUFalse_everyNthInput10 373 | # It is different from the parent epochCount1600_useReLUFalse_everyNthInput10 aka spearmintExps/epoch1600 374 | # in the following ways: 375 | # layerNum 2 -> [3,4], epoch 1600 -> 4800, plotEach 400 -> 800 376 | # learningRate.max 200.0 -> 20.0. 377 | # indim [20,100] -> [10,50] 378 | 379 | # Oh god I botched something even more serious: 380 | # False is not turned into bool, stays str. That means that deepDives used relu even though 381 | # the conf explicitely said don't use relu. That's what made conf3 perform better than any 382 | # of the spearmint runs. 383 | # I changed the conf[1234].txts to say userelu True. Serialization bug is fixed now. 384 | # tanh spearmint run moved to spearmintExps/epoch4800-tanh, restarting with relu, 385 | # expname epochCount4800_depth3_4_useReLUTrue_everyNthInput10 386 | # BTW relu is not just better than tanh, it's also 30% faster. (I assume they got the same amount 387 | # of CPU cycles. 
388 | 389 | # Turns out the cubemixture does not help with the newer models. 390 | # (If I had the time, I would investigate where did it stop helping, 391 | # but relu+layer3 is capable of harder transitions, that's for sure.) 392 | 393 | # Here is the current best epoch4800 spearmintOutput compared with its straight gaussian child-experiment: 394 | cat /Users/daniel/experiments/rbm/daniel-experiments/kohonen/spearmintOutput/hls300-inDim12-layerNum4-lr20.0-mom0.5-n300-os3.99999999824/log.txt | grep train | awk '($2%800==0)' 395 | epoch 800 trainMean 3.942084 trainMedian 4.023556 validationMean 4.120661 validationMedian 4.135574 396 | epoch 1600 trainMean 3.819828 trainMedian 3.863694 validationMean 4.091879 validationMedian 4.141805 397 | epoch 2400 trainMean 3.764334 trainMedian 3.825936 validationMean 4.100879 validationMedian 4.125267 398 | epoch 3200 trainMean 3.727745 trainMedian 3.769446 validationMean 4.115094 validationMedian 4.149114 399 | epoch 4000 trainMean 3.688223 trainMedian 3.761931 validationMean 4.101507 validationMedian 4.165966 400 | epoch 4800 trainMean 3.699963 trainMedian 3.767960 validationMean 4.109944 validationMedian 4.142234 401 | 402 | cat deepDives/conf7-gauss/log.txt | grep train | awk '($2%800==0)' 403 | epoch 800 trainMean 3.946862 trainMedian 4.045183 validationMean 4.104929 validationMedian 4.164662 404 | epoch 1600 trainMean 3.825675 trainMedian 3.930816 validationMean 4.087788 validationMedian 4.111501 405 | epoch 2400 trainMean 3.775109 trainMedian 3.883820 validationMean 4.086608 validationMedian 4.099182 406 | epoch 3200 trainMean 3.747717 trainMedian 3.784954 validationMean 4.099656 validationMedian 4.120957 407 | epoch 4000 trainMean 3.741761 trainMedian 3.797170 validationMean 4.103545 validationMedian 4.097180 408 | epoch 4800 trainMean 3.690358 trainMedian 3.760918 validationMean 4.082099 validationMedian 4.111330 409 | # -> Note the validationMedian being close to the validationMean, that's unusual. 
410 | 411 | # Balazs observes that the left bump on the histogram is NOT caused by 412 | # rote learning: it's simply an artifact of the allDigit mnist task: 413 | # 1s are easier to learn, and they also have smaller area. They are the bump. 414 | 415 | # A better task-specific measure of closeness of samples is the relative improvement 416 | # over the all-black baseline, that is d(gold,generated)/d(gold,0). 417 | # (1s are easier to learn, so they are still on the left, but the bimodality goes away.) 418 | # Let's not forget that this is NOT what our algorithm optimizes, nor should it. 419 | # (Unless we want to make it super mnist-specific, which we don't.) 420 | # This metric causes another big inconvenience as well: We can't compare the logged 421 | # aggregate numbers to the histogram numbers. 422 | # So I won't use it in the histogram, and I will use it on the diff. 423 | # Hope that won't cause confusion. 424 | 425 | ######### 426 | 427 | # Trying to port the slow distanceMatrix calculation from numpy to theano. 428 | # I start with a modest goal: 429 | 430 | # A cool little toy learning problem: 431 | # We want to learn a translated 2D standard normal's translation, that's a 2D vector. 432 | # We generate batchSize samples from this target distribution. 433 | # We generate sampleSize samples from our current best bet for the distribution. 434 | # We find the closest generated sample to each target sample. 435 | # We calculate the sum of distances. 436 | # That's the loss that we optimize by gradient descent. 437 | # Note that Theano doesn't even break a sweat when doing backprop 438 | # through a layer of distance minimization. 439 | # Of course that's less impressive than it first sounds, because 440 | # locally, the identity of the nearest target sample never changes. 441 | 442 | # UPDATE: Maybe it does break a sweat after all: it diverges if we multiply the loss by 100. 
443 | 444 | ########## 445 | # geforce machine installation notes 446 | 447 | # NVIDIA Drivers 448 | # https://access.redhat.com/solutions/64300 449 | # -> Careful, it hardwires an old driver, I changed it to 450 | # http://http.download.nvidia.com/XFree86/Linux-x86_64/358.16/NVIDIA-Linux-x86_64-358.16.run 451 | 452 | # CUDA 453 | # http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/index.html 454 | # http://developer.download.nvidia.com/compute/cuda/repos/fedora21/x86_64/cuda-repo-fedora21-7.5-18.x86_64.rpm 455 | 456 | # The nvcc compiler needs gcc, and needs <=4.9 gcc. On our fedora 5.1.1 is the default. 457 | # So I've built and installed gcc-4.9.3. 458 | # Standard procedure described in https://gcc.gnu.org/wiki/InstallingGCC 459 | # But the default mirrors in ./contrib/download_prerequisites 460 | # are too slow, replaced them with ftp://ftp.fu-berlin.de/unix/languages/gcc/infrastructure 461 | # After make install, new/old gcc was in /usr/local/gcc/4.9.3/, but not on PATH. 462 | # We only need it for nvcc anyway, so the best way is to add this to ~/.theanorc : 463 | # [nvcc] 464 | # compiler_bindir=/usr/local/gcc/4.9.3/bin/ 465 | 466 | # This is what my ~/.theanorc looks like now on geforce: 467 | [global] 468 | floatX = float32 469 | device = gpu0 470 | warn_float64 = raise 471 | assert_no_cpu_op = raise 472 | cxx = /usr/local/gcc/4.9.3/bin/g++ 473 | [nvcc] 474 | fastmath = True 475 | compiler_bindir = /usr/local/gcc/4.9.3/bin/ 476 | 477 | # On the laptop, compiler_bindir and cxx are not there, and device=cpu, 478 | # the rest is the same. 479 | 480 | ########## 481 | 482 | # Very important note, already mentioned in lasagne-demo/readme.sh : 483 | # I had to patch layers/conv.py 484 | # /usr/lib/python2.7/site-packages/lasagne/layers/conv.py 485 | # Specifically, I added as a first line of Conv2DLayer.__init__() this: 486 | # del kwargs['border_mode'] 487 | # I don't know where this incompatibility is coming from. 
488 | 489 | ########## 490 | # Benchmarks 491 | 492 | # testNumpyToTheano.py:testSampleInitial() 10000 epoch 1000 data 1000 generated: 493 | # laptop: 55 sec including compilation. 494 | # geforce: 76 sec including compilation. 495 | 496 | # testNumpyToTheano.py:test() 497 | laptop cpu: 498 | minimal distances theano finished in 2.422537 seconds. 499 | all distances theano finished in 1.913697 seconds. 500 | all distances slow numpy finished in 2.907862 seconds. 501 | all distances fast numpy finished in 2.942749 seconds. 502 | 503 | geforce gpu: 504 | minimal distances theano finished in 0.594864 seconds. 505 | all distances theano finished in 0.094942 seconds. 506 | all distances slow numpy finished in 27.137307 seconds. 507 | all distances fast numpy finished in 27.065705 seconds. 508 | 509 | geforce cpu: 510 | minimal distances theano finished in 25.903046 seconds. 511 | all distances theano finished in 25.355256 seconds. 512 | (numpy are the same.) 513 | 514 | # -> Wow, numpy dot product is dead slow on geforce. 515 | # I manage to run generativeMLP.py on the GPU, but the bottleneck is that stupid dot product. 516 | 517 | # Super cool tip from http://deeplearning.net/software/theano/install_ubuntu.html 518 | python `python -c "import os, theano; print os.path.dirname(theano.__file__)"`/misc/check_blas.py 519 | 520 | ###### 521 | 522 | # I managed to compile this gist on laptop: 523 | open https://gist.github.com/xianyi/6930656 524 | gcc -o a.out test_cblas_dgemm.c -I /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers -L /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current -lblas -lpthread 525 | 526 | ###### 527 | # Set up geforce machine with ubuntu. 528 | 529 | # See ./install.txt for every detail. 530 | 531 | # deepDives/conf8.txt benchmark, 4800 epochs: 532 | # 250 mins on laptop 533 | # 44 mins on geforce 534 | # -> yay! 
535 | 536 | # I tried allow_gc = False, but it didn't give real improvement, less than 10% for sure, probably even less. 537 | 538 | ####### 539 | # Make spearmint work. 540 | 541 | # File "/usr/local/lib/python2.7/dist-packages/pymongo/collection.py", line 393, in _legacy_write 542 | # rqst_id, msg, max_size = func(*args) 543 | # bson.errors.InvalidDocument: Cannot encode object: 5.6012859 544 | # -> Solution is to cast from np.float32 to float. 545 | 546 | for f in spearmintOutput/*/log.txt ; do grep "train" $f | tail -1 | cut -f8 -d' ' | tr '\n' ' ' ; echo $f ; done | sort -n 547 | 548 | ####### 549 | # Some parallel run benchmarks. 550 | 551 | # adhoc/speedtest.txt does not scale, running two in parallel takes twice as long, 552 | # even if they get gpu0 and gpu1 respectively. 553 | # 1 GPU 1 proc 554 | for GPU in 0 1 ; do for a in 1 ; do ( time THEANO_FLAGS="device=gpu$GPU" python generativeMLP.py adhoc/speedtest.txt & ) ; done ; done 555 | 556 | # 1 GPU 1 proc: 33.0 = 33.0/process 557 | # 1 GPU 2 proc: 64.0 = 32.0/proc 558 | # 2 GPU 2 proc: 64.0 = 32.0/proc 559 | # :( 560 | # Not very surprising, if I press Ctrl-C it always stops inside numpy, 561 | # and numpy presumably already uses all the CPU cores. (Does it?) 562 | # Let's do a less CPU-intensive speedtest. This one always breaks inside theano.function: 563 | # adhoc/speedtestgpu.txt 564 | # 1 GPU 1 proc: 31.0 = 31.0/process 565 | # 1 GPU 2 proc: 59.0 = 29.5/proc 566 | # 2 GPU 2 proc: 63.0 = 31.5/proc 567 | # :( Now that's somewhat more surprising. 
568 | 569 | 570 | # testNumpyToTheano.py:testSampleInitial() 10000 epoch 1000 data 1000 generated: 571 | # This one does scale nicely to 8 processes: 572 | for GPU in 0 1 ; do for a in 1 2 3 4 5 6 7 8 ; do ( time THEANO_FLAGS="device=gpu$GPU" python testNumpyToTheano.py > /dev/null & ) ; done ; done 573 | 574 | # 1 GPU 1 proc: 20.8 = 20.8/process 575 | # 1 GPU 2 proc: 21.6 = 10.8/proc 576 | # 1 GPU 4 proc: 27.0 = 6.7/proc (actually, the real runtimes were 23.1, 24.4, 25.6, 27.0) 577 | # 1 GPU 8 proc: 44.0 = 5.5/proc (actually, there was one outlier with 53.0 and the rest around 43.0) 578 | # 2 GPU 2 proc: 21.6 = 10.8/proc 579 | # 2 GPU 4 proc: 26.2 = 6.5/proc 580 | # 2 GPU 8 proc: 43.4 = 5.4/proc 581 | # 2 GPU 16proc: 88.0 = 5.5/proc 582 | 583 | # So the bottom line is that if you have a job, it doesn't matter 584 | # which GPU you send it to even if one is completely starving. 585 | # The only model that I have in mind that can explain this is 586 | # a fixed, non-parallelizable cost of sending data towards 587 | # ANY of the two GPUs. Like a Y shape with a bottleneck at the bottom, 588 | # closer to the CPU. 589 | 590 | ####### 591 | # Let's see some simple synthetic generated distributions. 592 | # I've created a pretty general framework to play with those, see nnbase/inputs.py:GENERATOR_FUNCTIONS. 593 | # The coolest one so far is adhoc/plane1.txt , output in ~/tmp/daniel-experiments/kohonen/adhoc/plane1-d2/ 594 | # and http://people.mokk.bme.hu/~daniel/kohonen/plane1.gif 595 | # in my mail titled "op art". 596 | 597 | 598 | ####### 599 | # Meanwhile I've stopped the original spearmint run, archived it to 600 | # spearmintRuns/epochCount4800_depth3_4_useReLUTrue_everyNthInput10 601 | # and rewrote config.json so that it looks for higher values. 602 | # I call this exp epochCount4800_depth3_4_useReLUTrue_everyNthInput10_bigger 603 | 604 | THEANO_FLAGS='device=gpu1' nohup python Spearmint/spearmint/main.py . 
> spearmintOutput/log.cout 2> spearmintOutput/log.cerr & 605 | # From now on gpu1 is the spearmint GPU. (Although if the above benchmarks are good, 606 | # it shouldn't matter, except maybe for OOM.) 607 | 608 | for f in spearmintOutput/*/log.txt ; do grep "train" $f | tail -1 | cut -f8 -d' ' | tr '\n' ' ' ; echo $f ; done | sort -n 609 | 610 | ####### 611 | # Did a less complete but still useful way to put distance matrix calculation on the GPU. 612 | 613 | # Makes large oversampling large minibatchSize runs about 3 times faster on geforce, 614 | # does not make a difference on the laptop. 615 | 616 | 617 | # It's not really a bottleneck now, but this CPU-based argmin is really annoying: 618 | THEANO_FLAGS='config.profile=True' CUDA_LAUNCH_BLOCKING=1 python nearestNeighborsTest.py > cout 2> cerr 619 | 620 | # I asked the theano-users list: 621 | https://groups.google.com/forum/#!topic/theano-users/E7ProqnGUMk 622 | https://gist.github.com/danielvarga/d0eeacea92e65b19188c 623 | 624 | # Later found that this is the relevant ticket: 625 | https://github.com/Theano/Theano/issues/1399 626 | # Implemented lamblin's hack there, see the gist above. 627 | 628 | # 25000 candidate, 5000 target: 629 | lamblinsTrick = False 630 | Time in Function.fn.__call__: 8.231399e-01s (99.995%) 631 | <% time>