├── adhoc ├── bestSoFar.txt ├── clock1.txt ├── conf8.igsc10-lr1.txt ├── conf8.igsc10.txt ├── conf8.inDim2.digit2.txt ├── conf8.inDim2.txt ├── conf8.l1loss.txt ├── conf8.l2unsquared.txt ├── conf8.lrd.txt ├── convolutional.txt ├── lowdim1.txt ├── plane1.txt ├── sine1.txt ├── spearmint-best-leaky.txt ├── spearmint-best-leaky0.1.txt ├── spearmint-best.txt ├── speedtest.txt ├── speedtestgpu.txt ├── uniform.bipartite1d.lrd.txt └── wave1.txt ├── autoencoder.py ├── config.json ├── deepDives ├── conf1.txt ├── conf10.txt ├── conf11.txt ├── conf2.txt ├── conf3.txt ├── conf4.txt ├── conf5.txt ├── conf6.txt ├── conf7.txt ├── conf8.txt └── conf9.txt ├── distances.py ├── docs └── charts │ └── 1d │ ├── triangle.attempt.png │ ├── triangle.bipartite.png │ ├── triangle.goal.png │ ├── uniform.attempt.png │ ├── uniform.bipartite.png │ ├── uniform.goal.png │ ├── uniform.os0.5.png │ ├── uniform.os1.png │ ├── uniform.os2.png │ └── uniform.os5.png ├── earthMover.py ├── earthMoverTest.py ├── evaluate.py ├── install.sh ├── kohonen.py ├── matplotlibrc ├── munkres.py ├── nearestNeighborsTest.py ├── next_permutation.py ├── nnbase ├── __init__.py ├── attrdict.py ├── autoencoder.py ├── inputs.py ├── layers.py ├── shape.py ├── utils.py └── vis.py ├── readme.md ├── readme.sh ├── requirements.txt ├── spearmintTask.py └── theanorc.txt /adhoc/bestSoFar.txt: -------------------------------------------------------------------------------- 1 | # The parent is ~/spearmintClones/regularization_initialSD/daniel-experiments/kohonen/spearmintOutput/initialSD0.413519287109-regularization7.87353515625e-07/conf.txt 2 | # that is the best at epoch 6400 of the ~/spearmintClones/regularization_initialSD spearmintRuns. 3 | # At epoch 4800 the best would be initialSD0.323230707636, but who cares, 4 | # it's super insensitive to initialSD inside [0.2, 0.6]. 
5 | # Removed regularization because it was optimized away already by Spearmint, as it did not help: 6 | # regularization 7.87353515625e-07 -> 0 7 | # 8 | # parent: 9 | # epoch 4800 trainMean 3.604388 trainMedian 3.666215 validationMean 3.885447 validationMedian 3.866639 10 | # this: 11 | # epoch 4800 trainMean 3.551922 trainMedian 3.647972 validationMean 3.893564 validationMedian 3.921978 12 | epochCount 6400 13 | everyNthInput 10 14 | expName adhoc/bestSoFar 15 | gridSizeForInterpolation 30 16 | gridSizeForSampling 20 17 | height 28 18 | hiddenLayerSize 673 19 | inBoolDim 0 20 | inDim 50 21 | initialSD 0.413519287109 22 | inputDigit None 23 | inputType mnist 24 | layerNum 3 25 | learningRate 1.0 26 | minibatchSize 1000 27 | momentum 0.969849416169 28 | oversampling 8.0 29 | plotEach 800 30 | reLULeakiness 0.01 31 | regularization 0.0 32 | useReLU True 33 | width 28 34 | -------------------------------------------------------------------------------- /adhoc/clock1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/wave1.txt 2 | epochCount 16000 3 | expName adhoc/clock1 4 | gridSizeForInterpolation 30 5 | gridSizeForSampling 20 6 | height 28 7 | hiddenLayerSize 100 8 | inBoolDim 0 9 | inDim 2 10 | initialSD 2.0 11 | inputType clock 12 | layerNum 3 13 | learningRate 10 14 | minibatchSize 100 15 | momentum 0.6 16 | oversampling 1.0 17 | plotEach 100 18 | trainSize 40000 19 | useReLU False 20 | validSize 400 21 | width 28 22 | -------------------------------------------------------------------------------- /adhoc/conf8.igsc10-lr1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/conf8.igsc10.txt 2 | # learningRate 10 -> 1 3 | # To accomodate for the innerGradientStepCount = 10, 4 | # I decreased the LR tenfold. 
5 | # parent: 6 | # epoch 400 trainMean 3.823610 trainMedian 3.913259 validationMean 4.035612 validationMedian 4.088287 7 | # this: 8 | # epoch 4800 trainMean 3.724095 trainMedian 3.812470 validationMean 4.026276 validationMedian 4.042130 9 | epochCount 16000 10 | everyNthInput 10 11 | expName adhoc/conf8.igsc10-lr1 12 | gridSizeForInterpolation 30 13 | gridSizeForSampling 20 14 | height 28 15 | hiddenLayerSize 600 16 | inDim 20 17 | inBoolDim 0 18 | initialSD 0.25 19 | innerGradientStepCount 10 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 1 24 | minibatchSize 600 25 | momentum 0.6 26 | oversampling 4.0 27 | plotEach 50 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /adhoc/conf8.igsc10.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # innerGradientStepCount = 10 for-loop around train_fn(initial, data) 3 | # parent: 4 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 5 | # this: 6 | # epoch 400 trainMean 3.823610 trainMedian 3.913259 validationMean 4.035612 validationMedian 4.088287 7 | epochCount 16000 8 | everyNthInput 10 9 | expName adhoc/conf8.igsc10 10 | gridSizeForInterpolation 30 11 | gridSizeForSampling 20 12 | height 28 13 | hiddenLayerSize 600 14 | inDim 20 15 | inBoolDim 0 16 | initialSD 0.25 17 | innerGradientStepCount 10 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 600 23 | momentum 0.6 24 | oversampling 4.0 25 | plotEach 50 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /adhoc/conf8.inDim2.digit2.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/conf8.inDim2.txt 2 | # inputDigit None -> 2 3 | epochCount 16000 4 | everyNthInput 1 5 | expName 
adhoc/conf8.inDim2.digit2 6 | gridSizeForInterpolation 100 7 | gridSizeForSampling 20 8 | height 28 9 | hiddenLayerSize 600 10 | inDim 2 11 | inBoolDim 0 12 | initialSD 0.25 13 | inputDigit 2 14 | inputType mnist 15 | layerNum 3 16 | learningRate 1 17 | minibatchSize 600 18 | momentum 0.6 19 | oversampling 1.0 20 | plotEach 10 21 | useReLU True 22 | width 28 23 | -------------------------------------------------------------------------------- /adhoc/conf8.inDim2.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # The idea is that we want some pretty xy*.png that shows something similar to 3 | # what VAE or t-SNE can do with MNIST in 2D. 4 | # inDim 20 -> 2, gridSizeForInterpolation 30 -> 100, everyNthInput 10 -> 1, plotEach 400 -> 10 5 | # learningRate 10 -> 1, oversampling 4.0 -> 1.0 6 | # parent: (irrelevant, though, not comparable) 7 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 8 | # this: 9 | # epoch 3600 trainMean 4.897792 trainMedian 4.980886 validationMean 4.813195 validationMedian 4.877148 10 | epochCount 16000 11 | everyNthInput 1 12 | expName adhoc/conf8.inDim2 13 | gridSizeForInterpolation 100 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 600 17 | inDim 2 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 1 24 | minibatchSize 600 25 | momentum 0.6 26 | oversampling 1.0 27 | plotEach 10 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /adhoc/conf8.l1loss.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # same as conf8 but with using L1 distance between pairs in loss function. 3 | # (Note the larger epoch count, haven't optimized the learning rate yet.) 
4 | # Numerically worse, visually also worse in emulating train and validation. 5 | # But definitely better looking samples and planar crosscuts. 6 | # parent: 7 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 8 | # this: 9 | # epoch 16000 trainMean 3.946111 trainMedian 4.026848 validationMean 4.241784 validationMedian 4.268175 10 | epochCount 16000 11 | everyNthInput 10 12 | expName adhoc/conf8.l1loss 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 600 17 | inDim 20 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 10 24 | loss l1 25 | minibatchSize 600 26 | momentum 0.6 27 | oversampling 4.0 28 | plotEach 50 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /adhoc/conf8.l2unsquared.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # loss l2squared -> l2unsquared, learningRate 10 -> 1, plotEach 400 -> 50 3 | # 4 | # See https://github.com/danielvarga/daniel-experiments/issues/26 for visualizations. 5 | # 6 | # Note the drastically faster convergence. That's not because of 7 | # some intrinsic nice property of the l2unsquared loss, but rather because 8 | # the learning rate is accidentally set up way higher. 9 | # (Looks lower, but they can't be immediately compared because of the sqrt.) 10 | # 11 | # validationMean and friends are already calculated with l2unsquared distance, 12 | # so this method is closer to actually optimizing the evaluation metric. 
13 | # 14 | # parent: 15 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 16 | # this: 17 | # epoch 1000 trainMean 3.653183 trainMedian 3.725959 validationMean 3.967728 validationMedian 4.009489 18 | epochCount 16000 19 | everyNthInput 10 20 | expName adhoc/conf8.l2unsquared 21 | gridSizeForInterpolation 30 22 | gridSizeForSampling 20 23 | height 28 24 | hiddenLayerSize 600 25 | inDim 20 26 | inBoolDim 0 27 | initialSD 0.25 28 | inputDigit None 29 | inputType mnist 30 | layerNum 3 31 | learningRate 1.0 32 | loss l2unsquared 33 | minibatchSize 600 34 | momentum 0.6 35 | oversampling 4.0 36 | plotEach 50 37 | useReLU True 38 | width 28 39 | -------------------------------------------------------------------------------- /adhoc/conf8.lrd.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # learningRateDecay 1.0 (default) -> 0.9998 3 | # parent: 4 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 5 | # this: 6 | # epoch 4800 trainMean 3.762363 trainMedian 3.834788 validationMean 3.976485 validationMedian 4.033859 7 | # (learningRate 3.82905459404 at epoch 4800) 8 | # epoch 20000 trainMean 3.658130 trainMedian 3.713779 validationMean 3.966676 validationMedian 4.045723 9 | # (learningRate 0.183179870248 at epoch 20000) 10 | epochCount 30000 11 | everyNthInput 10 12 | expName adhoc/conf8.lrd 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 600 17 | inDim 20 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 10 24 | learningRateDecay 0.9998 25 | minibatchSize 600 26 | momentum 0.6 27 | oversampling 4.0 28 | plotEach 100 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /adhoc/convolutional.txt: 
-------------------------------------------------------------------------------- 1 | # Using a convolutional architecture based on the upper half of 2 | # ../lasagne-demo/mnist_conv_autoencode.py 3 | # parent is adhoc/spearmint-best.txt , the clone of 4 | # spearmintOutput/hls673-inDim50-layerNum3-lr1.0-mom0.969849416169-n1000-os8.0 5 | # parent: 6 | # epoch 4800 trainMean 3.598628 trainMedian 3.688720 validationMean 3.907617 validationMedian 3.926451 7 | # this: 8 | # epoch 4800 trainMean 4.240630 trainMedian 4.336588 validationMean 4.238855 validationMedian 4.282310 9 | convolutional True 10 | epochCount 4800 11 | everyNthInput 10 12 | expName adhoc/convolutional 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 673 17 | inBoolDim 0 18 | inDim 50 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 3 23 | learningRate 0.5 24 | minibatchSize 1000 25 | momentum 0.5 26 | oversampling 8.0 27 | plotEach 100 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /adhoc/lowdim1.txt: -------------------------------------------------------------------------------- 1 | epochCount 100 2 | expName adhoc/uniform.bipartite1d 3 | hiddenLayerSize 100 4 | inBoolDim 0 5 | inDim 1 6 | initialSD 0.25 7 | inputType 1d.uniform 8 | isLowDim True 9 | layerNum 3 10 | learningRate 0.1 11 | loss l2squared 12 | minibatchSize 1000 13 | momentum 0.6 14 | oversampling 1.0 15 | plotEach 1 16 | trainSize 40000 17 | useReLU False 18 | validSize 10000 19 | -------------------------------------------------------------------------------- /adhoc/plane1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/wave2.txt 2 | epochCount 16000 3 | expName adhoc/plane1-d2 4 | gridSizeForInterpolation 50 5 | gridSizeForSampling 20 6 | height 28 7 | hiddenLayerSize 100 8 | inBoolDim 0 9 | inDim 2 10 | initialSD 0.25 11 | inputType 
plane 12 | layerNum 3 13 | learningRate 10 14 | minibatchSize 100 15 | momentum 0.6 16 | oversampling 1.0 17 | plotEach 10 18 | trainSize 40000 19 | useReLU False 20 | validSize 400 21 | width 28 22 | -------------------------------------------------------------------------------- /adhoc/sine1.txt: -------------------------------------------------------------------------------- 1 | # parent adhoc/wave1.txt 2 | epochCount 16000 3 | expName adhoc/sine1 4 | gridSizeForInterpolation 30 5 | gridSizeForSampling 20 6 | height 28 7 | hiddenLayerSize 100 8 | inBoolDim 0 9 | inDim 2 10 | initialSD 2.0 11 | inputType sine 12 | layerNum 4 13 | learningRate 10 14 | minibatchSize 500 15 | momentum 0.6 16 | oversampling 1.0 17 | plotEach 100 18 | trainSize 40000 19 | useReLU False 20 | validSize 400 21 | width 28 22 | -------------------------------------------------------------------------------- /adhoc/spearmint-best-leaky.txt: -------------------------------------------------------------------------------- 1 | # identical to spearmint-best, but started with leaky relu. 
2 | # parent: 3 | # epoch 4800 trainMean 3.598628 trainMedian 3.688720 validationMean 3.907617 validationMedian 3.926451 4 | # this: 5 | # epoch 4800 trainMean 3.556957 trainMedian 3.631534 validationMean 3.892525 validationMedian 3.911150 6 | epochCount 9600 7 | everyNthInput 10 8 | expName adhoc/spearmint-best-leaky 9 | gridSizeForInterpolation 30 10 | gridSizeForSampling 20 11 | height 28 12 | hiddenLayerSize 673 13 | inBoolDim 0 14 | inDim 50 15 | initialSD 0.25 16 | inputDigit None 17 | inputType mnist 18 | layerNum 3 19 | reLULeakiness 0.01 20 | learningRate 1.0 21 | minibatchSize 1000 22 | momentum 0.969849416169 23 | oversampling 8.0 24 | plotEach 800 25 | useReLU True 26 | width 28 27 | -------------------------------------------------------------------------------- /adhoc/spearmint-best-leaky0.1.txt: -------------------------------------------------------------------------------- 1 | # identical to spearmint-best-leaky, but with extra leakiness: 2 | # reLULeakiness 0.01 -> 0.1 3 | # parent: 4 | # epoch 4800 trainMean 3.556957 trainMedian 3.631534 validationMean 3.892525 validationMedian 3.911150 5 | # this: 6 | # epoch 4800 trainMean 3.640023 trainMedian 3.720077 validationMean 3.930313 validationMedian 3.919364 7 | epochCount 9600 8 | everyNthInput 10 9 | expName adhoc/spearmint-best-leaky0.1 10 | gridSizeForInterpolation 30 11 | gridSizeForSampling 20 12 | height 28 13 | hiddenLayerSize 673 14 | inBoolDim 0 15 | inDim 50 16 | initialSD 0.25 17 | inputDigit None 18 | inputType mnist 19 | layerNum 3 20 | reLULeakiness 0.1 21 | learningRate 1.0 22 | minibatchSize 1000 23 | momentum 0.969849416169 24 | oversampling 8.0 25 | plotEach 800 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /adhoc/spearmint-best.txt: -------------------------------------------------------------------------------- 1 | # identical to 2 | # 
spearmintOutput/hls673-inDim50-layerNum3-lr1.0-mom0.969849416169-n1000-os8.0 3 | # that produced 4 | # epoch 4800 trainMean 3.591231 trainMedian 3.657815 validationMean 3.887801 validationMedian 3.889197 5 | # , but due to probably chance factors, this is a bit worse. 6 | # this: 7 | # epoch 4800 trainMean 3.598628 trainMedian 3.688720 validationMean 3.907617 validationMedian 3.926451 8 | epochCount 4800 9 | everyNthInput 10 10 | expName adhoc/spearmint-best 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 673 15 | inBoolDim 0 16 | inDim 50 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 1.0 22 | minibatchSize 1000 23 | momentum 0.969849416169 24 | oversampling 8.0 25 | plotEach 800 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /adhoc/speedtest.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # with a small epochCount and infinite plotEach for speed benchmarking. 3 | # this: 4 | # epoch 100 trainMean 4.712180 trainMedian 4.785045 validationMean 4.664932 validationMedian 4.719368 5 | epochCount 100 6 | everyNthInput 10 7 | expName adhoc/speedtest 8 | gridSizeForInterpolation 30 9 | gridSizeForSampling 20 10 | height 28 11 | hiddenLayerSize 600 12 | inBoolDim 0 13 | inDim 20 14 | initialSD 0.25 15 | inputDigit None 16 | inputType mnist 17 | layerNum 3 18 | learningRate 10 19 | minibatchSize 600 20 | momentum 0.6 21 | oversampling 4.0 22 | plotEach 100 23 | useReLU True 24 | width 28 25 | -------------------------------------------------------------------------------- /adhoc/speedtestgpu.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # with a small epochCount and infinite plotEach for speed benchmarking. 
3 | epochCount 50 4 | everyNthInput 10 5 | expName adhoc/speedtestgpu 6 | gridSizeForInterpolation 30 7 | gridSizeForSampling 20 8 | height 28 9 | hiddenLayerSize 1000 10 | inBoolDim 0 11 | inDim 20 12 | initialSD 0.25 13 | inputDigit None 14 | inputType mnist 15 | layerNum 4 16 | learningRate 10 17 | minibatchSize 100 18 | momentum 0.6 19 | oversampling 1.0 20 | plotEach 100000 21 | useReLU True 22 | width 28 23 | -------------------------------------------------------------------------------- /adhoc/uniform.bipartite1d.lrd.txt: -------------------------------------------------------------------------------- 1 | # BEWARE: bipartiteMatchingBased is not a parameter right now. 2 | # set bipartiteMatchingBased = True in the code. 3 | # 4 | # parent adhoc/uniform.bipartite1d 5 | # learningRateDecay 1.0 -> 0.97 6 | # Haven't fixed the bathtub issue, although converged to a slightly better optimum slightly faster. 7 | # parent: 8 | # epoch 200 epochInterimMean 0.036180 epochInterimMedian 0.029334 9 | # this: 10 | # epoch 100 epochInterimMean 0.032690 epochInterimMedian 0.026891 11 | epochCount 200 12 | expName adhoc/uniform.bipartite1d.lrd 13 | hiddenLayerSize 100 14 | inBoolDim 0 15 | inDim 1 16 | initialSD 0.25 17 | inputType 1d.uniform 18 | isLowDim True 19 | layerNum 3 20 | learningRate 0.1 21 | learningRateDecay 0.97 22 | minibatchSize 1000 23 | momentum 0.6 24 | oversampling 1.0 25 | plotEach 10 26 | trainSize 40000 27 | useReLU False 28 | validSize 10000 29 | -------------------------------------------------------------------------------- /adhoc/wave1.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # inputType mnist -> wave, useReLU True -> False, everyNthInput removed, inputDigit removed, trainSize added 40000, validSize added 5000 3 | epochCount 16000 4 | expName adhoc/wave1 5 | gridSizeForInterpolation 30 6 | gridSizeForSampling 20 7 | height 28 8 | hiddenLayerSize 600 9 | inBoolDim 0 10 
| inDim 20 11 | initialSD 0.25 12 | inputType wave 13 | layerNum 3 14 | learningRate 10 15 | minibatchSize 600 16 | momentum 0.6 17 | oversampling 4.0 18 | plotEach 1 19 | trainSize 40000 20 | useReLU False 21 | validSize 400 22 | waveCount 42 23 | width 28 24 | -------------------------------------------------------------------------------- /autoencoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lasagne import layers 4 | import numpy as np 5 | 6 | import sys 7 | import gzip 8 | import cPickle 9 | from PIL import Image 10 | 11 | from nnbase.layers import Unpool2DLayer 12 | from nnbase.utils import FlipBatchIterator 13 | ### this is really dumb, current nolearn doesnt play well with lasagne, 14 | ### so had to manually copy the file I wanted to this folder 15 | import nnbase.shape as shape 16 | 17 | import nnbase.inputs 18 | import nnbase.vis 19 | 20 | # This is very error-prone. 21 | # Optimally, there should be a guarantee that the 22 | # corpus loaded here is the same as the one that the 23 | # encoder was trained on. 24 | def loadCorpus(): 25 | face = True 26 | if face: 27 | directory = "../face/SCUT-FBP/thumb.big/" 28 | X, (height, width) = nnbase.inputs.faces(directory) 29 | else: 30 | X, (height, width) = nnbase.inputs.mnist() 31 | 32 | X = X.astype(np.float64).reshape((-1, 1, height, width)) 33 | mu, sigma = np.mean(X), np.std(X) 34 | print "mu, sigma:", mu, sigma 35 | return X, mu, sigma 36 | 37 | # TODO I don't think that .eval() is how this should work. 38 | def get_output_from_nn(last_layer, X): 39 | indices = np.arange(128, X.shape[0], 128) 40 | # not splitting into batches can cause a memory error 41 | X_batches = np.split(X, indices) 42 | out = [] 43 | for count, X_batch in enumerate(X_batches): 44 | out.append( layers.get_output(last_layer, X_batch).eval() ) 45 | return np.vstack(out) 46 | 47 | # This helper class deals with 48 | # 1. 
normalizing input and de-normalizing output 49 | # 2. reshaping output into shape compatible with input, namely (-1, 1, x ,y) 50 | class Autoencoder: 51 | # sigma and mu should be trained on the same corpus as the autoencoder itself. 52 | # This is error-prone! 53 | def __init__(self, ae, mu, sigma): 54 | self.ae = ae 55 | self.mu = mu 56 | self.sigma = sigma 57 | 58 | self.encode_layer_index = map(lambda pair : pair[0], self.ae.layers).index('encode_layer') 59 | self.encode_layer = self.ae.get_all_layers()[self.encode_layer_index] 60 | self.afterSplit = False 61 | 62 | # from unnormalized to unnormalized [0,1] MNIST. 63 | # ae is trained on normalized MNIST data. 64 | # For 0-1 clipped digits this should be close to the identity function. 65 | def predict(self, X): 66 | assert not self.afterSplit 67 | self.x, self.y = X.shape[-2:] 68 | flatOutput = self.ae.predict((X - self.mu) / self.sigma).reshape(X.shape) * self.sigma + self.mu 69 | return flatOutput.reshape((-1, 1, self.x, self.y)) 70 | 71 | def encode(self, X): 72 | self.x, self.y = X.shape[-2:] 73 | return get_output_from_nn(self.encode_layer, (X-self.mu)/self.sigma) 74 | 75 | # N.B after we do this, we won't be able to use the original autoencoder , as the layers are broken up 76 | def split(self): 77 | next_layer = self.ae.get_all_layers()[self.encode_layer_index + 1] 78 | self.final_layer = self.ae.get_all_layers()[-1] 79 | new_layer = layers.InputLayer(shape = (None, self.encode_layer.num_units)) 80 | next_layer.input_layer = new_layer 81 | self.afterSplit = True 82 | 83 | def decode(self, X): 84 | assert self.afterSplit 85 | flatOutput = get_output_from_nn(self.final_layer, X) * self.sigma + self.mu 86 | # Evil hack: decode only knows the shape of the input space 87 | # if you did a predict or encode previously. TODO Fix asap. 
88 | return flatOutput.reshape((-1, 1, self.x, self.y)) 89 | 90 | 91 | def main(): 92 | X_train, mu, sigma = loadCorpus() 93 | 94 | # autoencoderFile = "../lasagne-demo/conv_ae.pkl" # Trained on the full mnist train dataset 95 | autoencoderFile = "../lasagne-demo/face.big.pkl" # Trained on the ../face/SCUT-FBP/thumb.big dataset. 96 | 97 | ae_raw = cPickle.load(open(autoencoderFile, 'r')) 98 | autoencoder = Autoencoder(ae_raw, mu, sigma) 99 | 100 | sampleIndices = map(int, sys.argv[1:]) 101 | assert len(sampleIndices)==2, "the tool expects two sample indices" 102 | X_train = X_train[sampleIndices] 103 | 104 | X_pred = autoencoder.predict(X_train) 105 | print "ended prediction" 106 | sys.stdout.flush() 107 | 108 | nnbase.vis.get_random_images(X_train, X_pred) 109 | 110 | autoencoder.split() 111 | 112 | X_encoded = autoencoder.encode(X_train) 113 | 114 | x0 = X_encoded[0] 115 | x1 = X_encoded[1] 116 | stepCount = 100 117 | intervalBase = np.linspace(1, 0, num=stepCount) 118 | intervalEncoded = np.multiply.outer(intervalBase, x0)+np.multiply.outer(1.0-intervalBase, x1) 119 | 120 | X_decoded = autoencoder.decode(intervalEncoded) 121 | nnbase.vis.get_picture_array(X_decoded, 10, 10, "interval") 122 | 123 | intervalInputspace = np.multiply.outer(intervalBase, X_train[0])+np.multiply.outer(1.0-intervalBase, X_train[1]) 124 | nnbase.vis.get_picture_array(intervalInputspace, 10, 10, "interval-inputspace") 125 | 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "language" : "PYTHON", 3 | "main-file" : "spearmintTask.py", 4 | "experiment-name" : "initials", 5 | "variables" : { 6 | "inDim" : { 7 | "type" : "INT", 8 | "size" : 1, 9 | "min" : 1, 10 | "max" : 100 11 | }, 12 | "inBoolDim" : { 13 | "type" : "INT", 14 | "size" : 1, 15 | "min" : 0, 16 | "max" : 100 17 | } 18 | } 19 | } 20 
| -------------------------------------------------------------------------------- /deepDives/conf1.txt: -------------------------------------------------------------------------------- 1 | # Modified from spearmintExps/epoch1600/hls220-inDim20-lr10.1885871231-mom0.635511081693-n290-os4.0/conf.txt 2 | # this: 3 | # epoch 4800 trainMean 4.209222 trainMedian 4.325197 validationMean 4.234253 validationMedian 4.293222 4 | epochCount 100000 5 | everyNthInput 10 6 | expName deepDives/conf1-hls200-inDim20-lr10-mom0.6-n300-os4.0 7 | gridSizeForInterpolation 30 8 | gridSizeForSampling 20 9 | height 28 10 | hiddenLayerSize 200 11 | inDim 20 12 | inputDigit None 13 | inputType mnist 14 | layerNum 2 15 | learningRate 10 16 | minibatchSize 300 17 | momentum 0.6 18 | oversampling 4.0 19 | plotEach 400 20 | useReLU True 21 | width 28 22 | -------------------------------------------------------------------------------- /deepDives/conf10.txt: -------------------------------------------------------------------------------- 1 | # parent spearmintOutput/hls650-inDim75-layerNum3-lr10.5-mom0.74-n650-os6.0 2 | # within spearmintRun epochCount4800_depth3_4_useReLUTrue_everyNthInput10_bigger 3 | # only diffence is more data. 4 | # everyNthInput 10 -> 1, epochCount 4800 -> 960, plotEach 800 -> 80 5 | # parent: 6 | # epoch 4800 trainMean 3.614238 trainMedian 3.647176 validationMean 3.950973 validationMedian 3.987011 7 | # this: 8 | # epoch 480 trainMean 3.849584 trainMedian 3.910133 validationMean 3.801794 validationMedian 3.858063 9 | # (not directly comparable of course, more diverse training data here. 10 | # See the cool reverse in train and validation performance.) 
11 | epochCount 960 12 | everyNthInput 1 13 | expName deepDives/conf10 14 | gridSizeForInterpolation 30 15 | gridSizeForSampling 20 16 | height 28 17 | hiddenLayerSize 650 18 | inDim 75 19 | inBoolDim 0 20 | initialSD 0.25 21 | inputDigit None 22 | inputType mnist 23 | layerNum 3 24 | learningRate 10.5 25 | minibatchSize 650 26 | momentum 0.74 27 | oversampling 6.0 28 | plotEach 80 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /deepDives/conf11.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf10.txt 2 | # playing with decreasing learning rate until I get to 3 | # implement learning rate decay. 4 | # learningRate 10.5 -> 1.0 epochCount 960 -> 9600 5 | # parent: 6 | # epoch 480 trainMean 3.849584 trainMedian 3.910133 validationMean 3.801794 validationMedian 3.858063 7 | # this: 8 | # epoch 4800 trainMean 3.872003 trainMedian 3.936070 validationMean 3.811173 validationMedian 3.884217 9 | # (Note that these are more or less directly comparable because of the epochCount/learningRate trade-off. 10 | # Also, considering the uncertainty, these are identical.) 
11 | epochCount 9600 12 | everyNthInput 1 13 | expName deepDives/conf11 14 | gridSizeForInterpolation 30 15 | gridSizeForSampling 20 16 | height 28 17 | hiddenLayerSize 650 18 | inDim 75 19 | inBoolDim 0 20 | initialSD 0.25 21 | inputDigit None 22 | inputType mnist 23 | layerNum 3 24 | learningRate 1.0 25 | minibatchSize 650 26 | momentum 0.74 27 | oversampling 6.0 28 | plotEach 80 29 | useReLU True 30 | width 28 31 | -------------------------------------------------------------------------------- /deepDives/conf2.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf1.txt 2 | # learningRate 10 -> 1 3 | # parent: 4 | # epoch 4800 trainMean 4.209222 trainMedian 4.325197 validationMean 4.234253 validationMedian 4.293222 5 | # epoch 30000 trainMean 4.135192 trainMedian 4.232051 validationMean 4.198376 validationMedian 4.211795 6 | # this: 7 | # epoch 4800 trainMean 4.375416 trainMedian 4.522119 validationMean 4.346554 validationMedian 4.408077 8 | # epoch 20000 trainMean 4.258607 trainMedian 4.409981 validationMean 4.273648 validationMedian 4.319705 9 | epochCount 100000 10 | everyNthInput 10 11 | expName deepDives/conf2-hls200-inDim20-lr1-mom0.6-n300-os4.0 12 | gridSizeForInterpolation 30 13 | gridSizeForSampling 20 14 | height 28 15 | hiddenLayerSize 200 16 | inDim 20 17 | inputDigit None 18 | inputType mnist 19 | layerNum 2 20 | learningRate 1 21 | minibatchSize 300 22 | momentum 0.6 23 | oversampling 4.0 24 | plotEach 400 25 | useReLU True 26 | width 28 27 | -------------------------------------------------------------------------------- /deepDives/conf3.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf1.txt 2 | # layerNum 2 -> 3 3 | # Playing a bit with depth 3 neural nets. 
4 | # parent: 5 | # epoch 4800 trainMean 4.209222 trainMedian 4.325197 validationMean 4.234253 validationMedian 4.293222 6 | # this: 7 | # epoch 4800 trainMean 3.938845 trainMedian 4.025593 validationMean 4.106030 validationMedian 4.138099 8 | epochCount 100000 9 | everyNthInput 10 10 | expName deepDives/conf3-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 200 15 | inDim 20 16 | inputDigit None 17 | inputType mnist 18 | layerNum 3 19 | learningRate 10 20 | minibatchSize 300 21 | momentum 0.6 22 | oversampling 4.0 23 | plotEach 400 24 | useReLU True 25 | width 28 26 | -------------------------------------------------------------------------------- /deepDives/conf4.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf3.txt 2 | # params unchanged, input is face/SCUT-FBP/thumb.big 3 | # everyNthInput=1 of course. 4 | # this: 5 | # epoch 60000 trainMean 3.207882 trainMedian 3.244128 validationMean 4.883027 validationMedian 4.791943 6 | # (epoch 60000 because of the super-small corpus size) 7 | epochCount 100000 8 | everyNthInput 1 9 | expName deepDives/conf4-faces-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0 10 | gridSizeForInterpolation 30 11 | gridSizeForSampling 20 12 | hiddenLayerSize 200 13 | imageDirectory ../face/SCUT-FBP/thumb.big/ 14 | inDim 20 15 | inputType image 16 | layerNum 3 17 | learningRate 10 18 | minibatchSize 300 19 | momentum 0.6 20 | oversampling 4.0 21 | plotEach 40 22 | useReLU True 23 | -------------------------------------------------------------------------------- /deepDives/conf5.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf3.txt 2 | # everyNthInput 10 -> 1, inputDigit None -> 6 3 | # this: 4 | # epoch 2400 trainMean 3.216107 trainMedian 3.153695 validationMean 3.394409 validationMedian 3.254620 5 | epochCount 100000 6 | 
everyNthInput 1 7 | expName deepDives/conf5-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0-digit6 8 | gridSizeForInterpolation 30 9 | gridSizeForSampling 20 10 | height 28 11 | hiddenLayerSize 200 12 | inDim 20 13 | inputDigit 6 14 | inputType mnist 15 | layerNum 3 16 | learningRate 10 17 | minibatchSize 300 18 | momentum 0.6 19 | oversampling 4.0 20 | plotEach 400 21 | useReLU False 22 | width 28 23 | -------------------------------------------------------------------------------- /deepDives/conf6.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf3.txt 2 | # This used sampleInitial gauss: np.random.normal(loc=0.0, scale=1.0/4, size=(n, inDim)) 3 | # Otherwise, it's the famous conf3. 4 | # parent: 5 | # epoch 4800 trainMean 3.938845 trainMedian 4.025593 validationMean 4.106030 validationMedian 4.138099 6 | # this: 7 | # epoch 4800 trainMean 3.909867 trainMedian 3.981797 validationMean 4.073384 validationMedian 4.109737 8 | epochCount 16000 9 | everyNthInput 10 10 | expName deepDives/conf6-gauss 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 200 15 | inDim 20 16 | inBoolDim 0 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 300 23 | momentum 0.6 24 | oversampling 4.0 25 | plotEach 400 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /deepDives/conf7.txt: -------------------------------------------------------------------------------- 1 | # parent: see below 2 | # This is identical to the current-best epoch4800-relu-cubemixture spearmintOutput found at 3 | # /Users/daniel/experiments/rbm/daniel-experiments/kohonen/spearmintOutput/hls300-inDim12-layerNum4-lr20.0-mom0.5-n300-os3.99999999824/conf.txt 4 | # but it's running with a straight gauss input distribution. 
5 | # Also, plotEach 800 -> 200, epochCount 4800 -> 48000 6 | # parent: 7 | # epoch 4800 trainMean 3.699963 trainMedian 3.767960 validationMean 4.109944 validationMedian 4.142234 8 | # this: 9 | # epoch 4800 trainMean 3.690358 trainMedian 3.760918 validationMean 4.082099 validationMedian 4.111330 10 | epochCount 48000 11 | everyNthInput 10 12 | expName deepDives/conf7-gauss 13 | gridSizeForInterpolation 30 14 | gridSizeForSampling 20 15 | height 28 16 | hiddenLayerSize 300 17 | inDim 12 18 | inBoolDim 0 19 | initialSD 0.25 20 | inputDigit None 21 | inputType mnist 22 | layerNum 4 23 | learningRate 20.0 24 | minibatchSize 300 25 | momentum 0.5 26 | oversampling 3.99999999824 27 | plotEach 200 28 | useReLU True 29 | width 28 30 | -------------------------------------------------------------------------------- /deepDives/conf8.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf6.txt 2 | # I call it tultoltam: 3 | # hiddenLayerSize 200 -> 600, minibatchSize 300 -> 600 4 | # parent: 5 | # epoch 4800 trainMean 3.909867 trainMedian 3.981797 validationMean 4.073384 validationMedian 4.109737 6 | # this: 7 | # epoch 4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 8 | epochCount 16000 9 | everyNthInput 10 10 | expName deepDives/conf8 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 600 15 | inDim 20 16 | inBoolDim 0 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 600 23 | momentum 0.6 24 | oversampling 4.0 25 | plotEach 400 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /deepDives/conf9.txt: -------------------------------------------------------------------------------- 1 | # parent deepDives/conf8.txt 2 | # minibatchSize 600 -> 300, oversampling 4.0 -> 8.0 3 | # parent: 4 | # epoch 
4800 trainMean 3.709048 trainMedian 3.779974 validationMean 3.975647 validationMedian 3.986895 5 | # this: 6 | # epoch 2400 trainMean 3.724940 trainMedian 3.757624 validationMean 3.983316 validationMedian 4.018968 7 | # epoch 4800 trainMean 3.623366 trainMedian 3.650960 validationMean 3.984901 validationMedian 3.984115 8 | epochCount 16000 9 | everyNthInput 10 10 | expName deepDives/conf9 11 | gridSizeForInterpolation 30 12 | gridSizeForSampling 20 13 | height 28 14 | hiddenLayerSize 600 15 | inDim 20 16 | inBoolDim 0 17 | initialSD 0.25 18 | inputDigit None 19 | inputType mnist 20 | layerNum 3 21 | learningRate 10 22 | minibatchSize 300 23 | momentum 0.6 24 | oversampling 8.0 25 | plotEach 400 26 | useReLU True 27 | width 28 28 | -------------------------------------------------------------------------------- /distances.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | 4 | import numpy as np 5 | import theano 6 | import theano.tensor as T 7 | import lasagne 8 | 9 | 10 | import theano.sandbox.rng_mrg 11 | 12 | def logg(*ss): 13 | s = " ".join(map(str,ss)) 14 | sys.stderr.write(s+"\n") 15 | 16 | def start(s): 17 | global startTime 18 | global phase 19 | phase = s 20 | logg(phase+".") 21 | startTime = time.clock() 22 | 23 | def end(s=None): 24 | global startTime 25 | global phase 26 | if s is not None: 27 | phase = s 28 | endTime = time.clock() 29 | logg(phase,"finished in",endTime-startTime,"seconds.") 30 | 31 | 32 | def randomMatrix(n, f): 33 | return np.random.normal(size=n*f).astype(np.float32).reshape((n, f)) 34 | 35 | 36 | def distanceMatrix(x, y): 37 | xL2S = np.sum(x*x, axis=-1) 38 | yL2S = np.sum(y*y, axis=-1) 39 | xL2SM = np.tile(xL2S, (len(y), 1)) 40 | yL2SM = np.tile(yL2S, (len(x), 1)) 41 | squaredDistances = xL2SM + yL2SM.T - 2.0*y.dot(x.T) 42 | distances = np.sqrt(squaredDistances+1e-6) # elementwise. +1e-6 is to supress sqrt-of-negative warning. 
43 | return distances 44 | 45 | 46 | # Newer theano builds allow tile() with scalar variable as reps. 47 | # https://github.com/Theano/Theano/pull/2875 48 | # That could make this nicer. 49 | # The worst thing about it is that it the constructed calculation 50 | # silently fails when given smaller datasets. 51 | # TODO If there's no easy fix, at least wrap closest_fn into 52 | # a mini-class that verifies sizes. 53 | def constructSquaredDistanceMatrixVariable(x, y, n, m): 54 | # ([n, f] , [m, f]) -> (n, m) 55 | xL2S = T.sum(x*x, axis=-1) # [n] 56 | yL2S = T.sum(y*y, axis=-1) # [m] 57 | xL2SM = T.zeros((m, n)) + xL2S # broadcasting, [m, n] 58 | yL2SM = T.zeros((n, m)) + yL2S # # broadcasting, [n, m] 59 | 60 | squaredDistances = xL2SM.T + yL2SM - 2.0*T.dot(x, y.T) # [n, m] 61 | return squaredDistances 62 | 63 | def constructSDistanceMatrixFunction(n, m): 64 | x = T.matrix('x') 65 | y = T.matrix('y') 66 | sDistances = constructSquaredDistanceMatrixVariable(x, y, n, m) 67 | return theano.function([x, y], sDistances) 68 | 69 | # For each y, it returns the index of the closest x in L2 distance. 70 | # x is [n, f], y is [m, f] for some f. Output is [m], the values are in range(n). 71 | def constructMinimalDistanceIndicesVariable(x, y, n, m): 72 | sDistances = constructSquaredDistanceMatrixVariable(x, y, n, m) 73 | lamblinsTrick = False 74 | if lamblinsTrick: 75 | # https://github.com/Theano/Theano/issues/1399 76 | # https://gist.github.com/danielvarga/d0eeacea92e65b19188c 77 | # https://groups.google.com/forum/#!topic/theano-users/E7ProqnGUMk 78 | s = sDistances 79 | bestIndices = T.cast( ( T.arange(n).dimshuffle(0, 'x') * T.cast(T.eq(s, s.min(axis=0, keepdims=True)), 'float32') ).sum(axis=0), 'int32') 80 | # This is a heavy-handed workaround for the fact that in 81 | # lamblin's hack, ties lead to completely screwed results. 
82 | bestIndices = T.clip(bestIndices, 0, n-1) 83 | else: 84 | bestIndices = T.argmin(sDistances, axis=0) 85 | return bestIndices 86 | 87 | 88 | # The theano.function returned by this is usually called like this: 89 | # closest_fn(sampled, data), output is an index (pointing to a sampled row) 90 | # for each row of data. 91 | # 92 | # To elaborate: 93 | # n = candidateCount, m = targetCount, 94 | # typically candidateCount = sampleSize, targetCount = minibatchSize 95 | # BEWARE: super confusingly, in generativeMLP.py sampleSize is called m. 96 | # 97 | # See testMinimalDistanceIndicesFunction for how to turn indices into samples and distances. 98 | def constructMinimalDistanceIndicesFunction(n, m): 99 | x = T.matrix('x') 100 | y = T.matrix('y') 101 | bestIndices = constructMinimalDistanceIndicesVariable(x, y, n, m) 102 | return theano.function([x, y], bestIndices) 103 | 104 | def constructMinimalDistancesVariable(x, y, initials, n, m): 105 | sDistances = constructSquaredDistanceMatrixVariable(x, y, n, m) 106 | bestIndices = T.argmin(sDistances, axis=0) 107 | bestXes = x[bestIndices] 108 | bestInitials = initials[bestIndices] 109 | return bestXes, bestInitials 110 | 111 | def constructMinimalDistancesFunction(n, m): 112 | x = T.matrix('x') 113 | y = T.matrix('y') 114 | initials = T.matrix('initials') 115 | bestXes, bestInitials = constructMinimalDistancesVariable(x, y, initials, n, m) 116 | return theano.function([x, y], bestXes) 117 | 118 | 119 | def testMinimalDistanceIndicesFunction(batchSize, sampleSize, featureDim): 120 | closest_fn = constructMinimalDistanceIndicesFunction(sampleSize, batchSize) 121 | 122 | data = randomMatrix(batchSize, featureDim) 123 | sampled = randomMatrix(sampleSize, featureDim) 124 | 125 | import kohonen 126 | start("CPU nearest neighbors") 127 | distances = kohonen.distanceMatrix(sampled, data) 128 | assert distances.shape == (len(data), len(sampled)) # Beware the transpose! 
129 | bestIndicesByCPU = np.argmin(distances, axis=1) 130 | closestSampledByCPU = sampled[bestIndicesByCPU] 131 | bestDistancesByCPU = np.linalg.norm(data-closestSampledByCPU, axis=1) 132 | end() 133 | 134 | start("GPU nearest neighbors") 135 | bestIndicesByGPU = closest_fn(sampled, data) 136 | # The next two steps are practically instant. 137 | closestSampledByGPU = sampled[bestIndicesByGPU] 138 | bestDistancesByGPU = np.linalg.norm(data-closestSampledByGPU, axis=1) 139 | end() 140 | 141 | print "total bestDistances CPU", bestDistancesByCPU.sum() 142 | print "total bestDistances GPU", bestDistancesByGPU.sum() 143 | 144 | 145 | # This class is a cheap workaround for the fact that I didn't manage to create 146 | # a shape-independent constructMinimalDistanceIndicesFunction. 147 | # It only works if the set of possible shapes is very small, otherwise Theano compilation 148 | # becomes the bottleneck. 149 | class ClosestFnFactory: 150 | def __init__(self): 151 | self.cache = {} 152 | def __call__(self, *args): 153 | assert len(args)==2 154 | sampled, data = args 155 | shape = (len(sampled), len(data)) 156 | candidateCount, targetCount = shape 157 | if shape not in self.cache.keys(): 158 | logg("Adding to ClosestFnFactory cache, shape %s" % str(shape)) 159 | closest_fn = constructMinimalDistanceIndicesFunction(candidateCount, targetCount) 160 | self.cache[shape] = closest_fn 161 | else: 162 | closest_fn = self.cache[shape] 163 | return closest_fn(sampled, data) 164 | 165 | 166 | # A cool little toy learning problem: 167 | # We want to learn a translated 2D standard normal's translation, that's a 2D vector. 168 | # We generate batchSize samples from this target distribution. 169 | # We generate sampleSize samples from our current best bet for the distribution. 170 | # We find the closest generated sample to each target sample. 171 | # We calculate the sum of distances. 172 | # That's the loss that we optimize by gradient descent. 
173 | # Note that Theano doesn't even break a sweat when doing backprop 174 | # through a layer of distance minimization. 175 | # Of course that's less impressive than it first sounds, because 176 | # locally, the identity of the nearest target sample never changes. 177 | def toyLearner(): 178 | batchSize = 2000 179 | sampleSize = 2000 180 | inDim = 2 181 | srng = theano.sandbox.rng_mrg.MRG_RandomStreams(seed=234) 182 | 183 | dataVar = T.matrix("data") 184 | initialsVar = srng.normal((sampleSize, inDim)) 185 | parametersVar = theano.shared(np.zeros(inDim, dtype=np.float32), "parameters") 186 | generatedVar = initialsVar + parametersVar # broadcast 187 | 188 | 189 | bestXesVar, bestInitialsVar = constructMinimalDistancesVariable(generatedVar, dataVar, initialsVar, sampleSize, batchSize) 190 | 191 | deltaVar = bestXesVar - dataVar 192 | # mean over samples AND feature coordinates! 193 | # Very frightening fact: with .sum() here, the learning process diverges. 194 | lossVar = (deltaVar*deltaVar).mean() 195 | 196 | updates = lasagne.updates.nesterov_momentum( 197 | lossVar, [parametersVar], learning_rate=0.2, momentum=0.0) 198 | 199 | train_fn = theano.function([dataVar], updates=updates) 200 | 201 | for epoch in range(1000): 202 | data = randomMatrix(batchSize, inDim) + np.array([-5.0, 12.0], dtype=np.float32) 203 | train_fn(data) 204 | print parametersVar.get_value() 205 | 206 | def distanceSpeedTest(): 207 | # I'm not using variable names n and m, because unfortunately 208 | # the order is switched between sampleAndUpdate() and 209 | # constructDistanceMatrixFunction(). 
210 | batchSize = 3000 211 | oversampling = 4.324 212 | sampleSize = int(batchSize*oversampling) 213 | f = 28*28 214 | np.random.seed(0) 215 | data = randomMatrix(batchSize, f) 216 | generated = randomMatrix(sampleSize, f) 217 | 218 | dm_fn = constructSDistanceMatrixFunction(sampleSize, batchSize) 219 | 220 | md_fn = constructMinimalDistancesFunction(sampleSize, batchSize) 221 | 222 | start("minimal distances theano") 223 | bestXes = md_fn(generated, data) 224 | print bestXes.shape 225 | print np.sum(bestXes) 226 | end() 227 | 228 | start("all distances theano") 229 | ds = dm_fn(generated, data) 230 | print ds.shape 231 | print np.sum(ds) 232 | end() 233 | 234 | start("all distances numpy") 235 | ds = distanceMatrix(generated, data) 236 | print ds.shape 237 | print np.sum(ds) 238 | end() 239 | 240 | if __name__ == "__main__": 241 | whichTest = sys.argv[1] 242 | assert whichTest in ("distances", "toyLearner", "speeds") 243 | if whichTest=="distances": 244 | testMinimalDistanceIndicesFunction(batchSize=3000, sampleSize=12972, featureDim=28*28) 245 | elif whichTest=="speeds": 246 | distanceSpeedTest() 247 | elif whichTest=="toyLearner": 248 | toyLearner() 249 | -------------------------------------------------------------------------------- /docs/charts/1d/triangle.attempt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/triangle.attempt.png -------------------------------------------------------------------------------- /docs/charts/1d/triangle.bipartite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/triangle.bipartite.png -------------------------------------------------------------------------------- /docs/charts/1d/triangle.goal.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/triangle.goal.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.attempt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.attempt.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.bipartite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.bipartite.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.goal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.goal.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.os0.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os0.5.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.os1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os1.png 
-------------------------------------------------------------------------------- /docs/charts/1d/uniform.os2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os2.png -------------------------------------------------------------------------------- /docs/charts/1d/uniform.os5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/docs/charts/1d/uniform.os5.png -------------------------------------------------------------------------------- /earthMover.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import gzip 3 | import sys 4 | import os 5 | import time 6 | import random 7 | import math 8 | from operator import itemgetter 9 | 10 | import numpy as np 11 | 12 | import theano 13 | import theano.tensor as T 14 | import lasagne 15 | 16 | import kohonen # TODO This should only be used on the abandoned bipartiteMatchingBased==True codepath. 
17 | import evaluate 18 | import distances 19 | 20 | import nnbase.inputs 21 | import nnbase.vis 22 | from nnbase.attrdict import AttrDict 23 | 24 | # These are only included to make the unpickling of the autoencoder possible: 25 | from nnbase.layers import Unpool2DLayer 26 | from nnbase.shape import ReshapeLayer 27 | from nnbase.utils import FlipBatchIterator 28 | 29 | L1_LOSS = "l1" 30 | L2_SQUARED_LOSS = "l2squared" 31 | # The weird name is because I really don't want to accidentally use this instead of L2_SQUARED_LOSS: 32 | L2_UNSQUARED_LOSS = "l2unsquared" 33 | 34 | 35 | def logg(*ss): 36 | s = " ".join(map(str,ss)) 37 | sys.stderr.write(s+"\n") 38 | 39 | 40 | def buildConvNet(input_var, layerNum, inDim, hidden, outDim, useReLU, leakiness=0.0): 41 | # ('hidden', layers.DenseLayer), 42 | # ('unflatten', ReshapeLayer), 43 | # ('unpool', Unpool2DLayer), 44 | # ('deconv', layers.Conv2DLayer), 45 | # ('output_layer', ReshapeLayer), 46 | # TODO Copypasted, refactor. 47 | if useReLU: 48 | if leakiness==0.0: 49 | nonlinearity = lasagne.nonlinearities.rectify 50 | gain = 'relu' 51 | else: 52 | nonlinearity = lasagne.nonlinearities.LeakyRectify(leakiness) 53 | gain = math.sqrt(2/(1+leakiness**2)) 54 | else: 55 | nonlinearity = lasagne.nonlinearities.tanh 56 | gain = 1.0 57 | 58 | filter_sizes = 7 59 | conv_filters = 32 60 | deconv_filters = 32 61 | width = 28 # TODO MNIST specific! 
62 | height = 28 63 | 64 | l_in = lasagne.layers.InputLayer(shape=(None, inDim), 65 | input_var=input_var) 66 | l_hid = lasagne.layers.DenseLayer( 67 | l_in, num_units=hidden, 68 | nonlinearity=nonlinearity, 69 | W=lasagne.init.GlorotUniform(gain=gain)) 70 | hid2_num_units= deconv_filters * (height + filter_sizes - 1) * (width + filter_sizes - 1) / 4 71 | l_hid2 = lasagne.layers.DenseLayer( 72 | l_hid, num_units=hid2_num_units, 73 | nonlinearity=nonlinearity, 74 | W=lasagne.init.GlorotUniform(gain=gain)) 75 | l_unflatten = ReshapeLayer( 76 | l_hid2, shape=(([0], deconv_filters, (height + filter_sizes - 1) / 2, (width + filter_sizes - 1) / 2 ))) 77 | l_unpool = Unpool2DLayer( 78 | l_unflatten, ds=(2, 2)) 79 | l_deconv = lasagne.layers.Conv2DLayer( 80 | l_unpool, num_filters=1, filter_size = (filter_sizes, filter_sizes), 81 | border_mode="valid", nonlinearity=None) 82 | l_output = ReshapeLayer( 83 | l_deconv, shape = (([0], -1))) 84 | return l_output 85 | 86 | def buildNet(input_var, layerNum, inDim, hidden, outDim, useReLU, leakiness=0.0): 87 | if useReLU: 88 | if leakiness==0.0: 89 | nonlinearity = lasagne.nonlinearities.rectify 90 | gain = 'relu' 91 | else: 92 | nonlinearity = lasagne.nonlinearities.LeakyRectify(leakiness) 93 | gain = math.sqrt(2/(1+leakiness**2)) 94 | else: 95 | nonlinearity = lasagne.nonlinearities.tanh 96 | gain = 1.0 97 | assert layerNum in (2,3,4) 98 | 99 | l_in = lasagne.layers.InputLayer(shape=(None, inDim), 100 | input_var=input_var) 101 | l_hid = lasagne.layers.DenseLayer( 102 | l_in, num_units=hidden, 103 | nonlinearity=nonlinearity, 104 | W=lasagne.init.GlorotUniform(gain=gain)) 105 | if layerNum==2: 106 | l_out = lasagne.layers.DenseLayer( 107 | l_hid, num_units=outDim, 108 | nonlinearity=nonlinearity, 109 | W=lasagne.init.GlorotUniform(gain=gain)) 110 | elif layerNum==3: 111 | l_hid2 = lasagne.layers.DenseLayer( 112 | l_hid, num_units=hidden, 113 | nonlinearity=nonlinearity, 114 | W=lasagne.init.GlorotUniform(gain=gain)) 115 | l_out = 
lasagne.layers.DenseLayer( 116 | l_hid2, num_units=outDim, 117 | nonlinearity=nonlinearity, 118 | W=lasagne.init.GlorotUniform(gain=gain)) 119 | elif layerNum==4: 120 | l_hid2 = lasagne.layers.DenseLayer( 121 | l_hid, num_units=hidden, 122 | nonlinearity=nonlinearity, 123 | W=lasagne.init.GlorotUniform(gain=gain)) 124 | l_hid3 = lasagne.layers.DenseLayer( 125 | l_hid2, num_units=hidden, 126 | nonlinearity=nonlinearity, 127 | W=lasagne.init.GlorotUniform(gain=gain)) 128 | l_out = lasagne.layers.DenseLayer( 129 | l_hid3, num_units=outDim, 130 | nonlinearity=nonlinearity, 131 | W=lasagne.init.GlorotUniform(gain=gain)) 132 | return l_out 133 | 134 | def sampleInitial(n, inDim, sd, inBoolDim): 135 | continuous = np.random.normal(loc=0.0, scale=sd, size=(n, inDim)).astype(np.float32) 136 | discrete = np.random.randint(0, 2, (n, inBoolDim)) 137 | continuous[:, :inBoolDim] += discrete 138 | return continuous 139 | 140 | def sampleSourceParametrized(net_fn, n, inDim, sd, inBoolDim): 141 | initial = sampleInitial(n, inDim, sd, inBoolDim) 142 | return initial, net_fn(initial) 143 | 144 | def constructSamplerFunction(input_var, net): 145 | output = lasagne.layers.get_output(net) 146 | net_fn = theano.function([input_var], output) 147 | return net_fn 148 | 149 | def constructTrainFunction(input_var, net, learningRate, momentum, regularization, lossType=L2_SQUARED_LOSS): 150 | output = lasagne.layers.get_output(net) 151 | data_var = T.matrix('targets') 152 | if lossType==L1_LOSS: 153 | loss = T.abs_(output-data_var).mean() 154 | elif lossType==L2_SQUARED_LOSS: 155 | loss = lasagne.objectives.squared_error(output, data_var).mean() 156 | elif lossType==L2_UNSQUARED_LOSS: 157 | lossSqr = ((output-data_var)**2).sum(axis=1) 158 | loss = T.sqrt(lossSqr+1e-6).mean() # Fudge constant to avoid numerical stability issues. 
159 | else: 160 | assert False, "unknown similarity loss function: %s" % lossType 161 | 162 | if regularization!=0.0: 163 | logg('regularization', regularization) 164 | loss += lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2) * regularization 165 | 166 | params = lasagne.layers.get_all_params(net, trainable=True) 167 | 168 | updates = lasagne.updates.nesterov_momentum( 169 | loss, params, learning_rate=learningRate, momentum=momentum) 170 | # The rmsprop update rule is tricky. Properties (as measured on conf8): 171 | # - Converges twice as fast at the beginning. 172 | # - Goes way below nesterov on trainMean. 173 | # - ...which implies that s*.png is visually better, but that's just overfitting, because it 174 | # - reaches approx. the same performance as nesterov on validationMean, 175 | # - and visually it does not improve on diff_validation after convergence on validationMean. 176 | # - Performance has a hockey-stick dependence on epsilon: 177 | # Smaller epsilon is better until 0.0001, and then at 0.00001 it explodes. 178 | # updates = lasagne.updates.rmsprop(loss, params, epsilon=0.0001) 179 | 180 | train_fn = theano.function([input_var, data_var], updates=updates) 181 | return train_fn 182 | 183 | def sampleAndUpdate(train_fn, net_fn, closestFnFactory, inDim, sampleSource, n, data=None, m=None, innerGradientStepCount=1): 184 | if data is None: 185 | data = kohonen.samplesFromTarget(n) # TODO Refactor, I can't even change the goddamn target distribution in this source file! 186 | else: 187 | assert len(data)==n 188 | if m is None: 189 | m = n 190 | 191 | initial, sampled = sampleSource(net_fn, m, inDim) 192 | 193 | doDetailed1DVis = True and (data.shape[1]==1) 194 | 195 | bipartiteMatchingBased = False 196 | if bipartiteMatchingBased: 197 | if data.shape[1]==1: 198 | # In 1d we can actually solve the weighted bipartite matching 199 | # problem, by sorting. Basically that's what Magdon-Ismail and Atiya do. 
200 | assert len(data)==len(initial) 201 | data.sort(axis=0) 202 | pairs = sorted(zip(sampled, initial)) 203 | sampled = np.array(map(itemgetter(0), pairs)) 204 | initial = np.array(map(itemgetter(1), pairs)) 205 | else: 206 | # Pretty much obsoleted, because it can't be made fast. 207 | # Does a full weighted bipartite matching. 208 | # Left here for emotional reasons. 209 | permutation = kohonen.optimalPairing(sampled, data) 210 | initial = initial[permutation] 211 | sampled = sampled[permutation] 212 | else: 213 | # TODO We had this cool findGenForData=False experiment here 214 | # TODO that didn't go anywhere at first, but we shouldn't let it go this easily. 215 | findGenForData = True 216 | if findGenForData: 217 | bestIndices = closestFnFactory(sampled, data) 218 | initial = initial[bestIndices] 219 | sampled = sampled[bestIndices] 220 | else: 221 | bestIndices = closestFnFactory(data, sampled) 222 | data = data[bestIndices] 223 | 224 | bestDists = np.linalg.norm(data-sampled, axis=1) 225 | 226 | for i in range(innerGradientStepCount): 227 | # That's where the update happens. 228 | train_fn(initial, data) 229 | 230 | if doDetailed1DVis and random.randrange(100)==0: 231 | postSampled = net_fn(initial) 232 | nnbase.vis.gradientMap1D(data, sampled, postSampled, "gradient") 233 | 234 | # These values are a byproduct of the training step, 235 | # so they are from _before_ the training, not after it. 
236 | return bestDists 237 | 238 | 239 | def lowDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger): 240 | n, dim = data.shape 241 | inDim = params.inDim 242 | initial, sampled = sampleSource(net_fn, n, inDim) 243 | nnbase.vis.heatmap(sampled, params.expName+"/heatmap"+str(epoch)) 244 | 245 | 246 | def highDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger): 247 | height, width = params.height, params.width 248 | expName = params.expName 249 | 250 | # TODO This is mixing the responsibilities of evaluation and visualization: 251 | # TODO train_distance and validation_distance are calculated on only visImageCount images. 252 | doValidation = True 253 | if doValidation: 254 | start_time = time.time() 255 | visImageCount = params.gridSizeForSampling ** 2 256 | visualizedValidation = validation[:visImageCount] 257 | visualizedData = data[:visImageCount] 258 | trainMean, trainMedian = evaluate.fitAndVis(visualizedData, 259 | net_fn, closestFnFactory, sampleSource, params.inDim, 260 | height, width, params.gridSizeForSampling, name=expName+"/diff_train"+str(epoch)) 261 | validationMean, validationMedian = evaluate.fitAndVis(visualizedValidation, 262 | net_fn, closestFnFactory, sampleSource, params.inDim, 263 | height, width, params.gridSizeForSampling, name=expName+"/diff_validation"+str(epoch)) 264 | print >> logger, "epoch %d trainMean %f trainMedian %f validationMean %f validationMedian %f" % ( 265 | epoch, trainMean, trainMedian, validationMean, validationMedian) 266 | print >> logger, "time elapsed %f" % (time.time() - start_time) 267 | logger.flush() 268 | 269 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/xy"+str(epoch), 270 | height, width, fromGrid=True, gridSize=params.gridSizeForInterpolation, plane=(0,1)) 271 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/yz"+str(epoch), 272 | height, width, fromGrid=True, 
gridSize=params.gridSizeForInterpolation, plane=(1,2)) 273 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/xz"+str(epoch), 274 | height, width, fromGrid=True, gridSize=params.gridSizeForInterpolation, plane=(0,2)) 275 | nnbase.vis.plotSampledImages(net_fn, params.inDim, expName+"/s"+str(epoch), 276 | height, width, fromGrid=False, gridSize=params.gridSizeForSampling, sampleSourceFunction=sampleSource) 277 | 278 | with open(expName+"/som-generator.pkl", 'w') as f: 279 | cPickle.dump(net, f) 280 | 281 | 282 | def train(data, validation, params, logger=None): 283 | if logger is None: 284 | logger = sys.stdout 285 | 286 | isLowDim = "isLowDim" in params and params.isLowDim 287 | 288 | if isLowDim: 289 | nnbase.vis.heatmap(data, params.expName+"/input") 290 | else: 291 | # Have to do before flattening: 292 | nnbase.vis.plotImages(data[:params.gridSizeForSampling**2], params.gridSizeForSampling, params.expName+"/input") 293 | 294 | # My network works with 1D input. 295 | data = nnbase.inputs.flattenImages(data) 296 | validation = nnbase.inputs.flattenImages(validation) 297 | 298 | m = int(params.oversampling*params.minibatchSize) 299 | 300 | outDim = data.shape[1] # Flattening already happened. 
301 | if "height" in params: 302 | assert params.height * params.width == outDim 303 | 304 | input_var = T.matrix('inputs') 305 | leakiness = 0.0 if 'reLULeakiness' not in params else params.reLULeakiness 306 | if not params.useReLU: 307 | assert leakiness==0.0, "reLULeakiness not allowed for tanh activation" 308 | if 'convolutional' in params and params.convolutional: 309 | net = buildConvNet(input_var, params.layerNum, params.inDim, params.hiddenLayerSize, outDim, 310 | useReLU=params.useReLU, leakiness=leakiness) 311 | else: 312 | net = buildNet(input_var, params.layerNum, params.inDim, params.hiddenLayerSize, outDim, 313 | useReLU=params.useReLU, leakiness=leakiness) 314 | 315 | minibatchCount = len(data)/params.minibatchSize 316 | 317 | regularization = 0.0 if 'regularization' not in params else params.regularization # L2 318 | 319 | innerGradientStepCount = 1 if 'innerGradientStepCount' not in params else params.innerGradientStepCount 320 | 321 | lossType = params.loss if "loss" in params else L2_SQUARED_LOSS 322 | 323 | learningRate_shared = theano.shared(np.array(params.learningRate, dtype=np.float32)) 324 | 325 | # Per epoch, which means that this is super-sensitive to epoch size. 326 | learningRateDecay = np.float32(1.0 if 'learningRateDecay' not in params else params.learningRateDecay) 327 | 328 | train_fn = constructTrainFunction(input_var, net, learningRate_shared, params.momentum, regularization, lossType) 329 | net_fn = constructSamplerFunction(input_var, net) 330 | closestFnFactory = distances.ClosestFnFactory() 331 | 332 | sampleSource = lambda net_fn, n, inDim: sampleSourceParametrized(net_fn, n, inDim, params.initialSD, params.inBoolDim) 333 | 334 | validationMean = 1e10 # ad hoc inf-like value. 335 | 336 | # The reason for the +1 is that this way, if 337 | # epochCount is a multiple of plotEach, then the 338 | # last thing that happens is an evaluation. 
339 | for epoch in range(params.epochCount+1): 340 | shuffledData = np.random.permutation(data) 341 | epochDistances = [] 342 | for i in range(minibatchCount): 343 | dataBatch = shuffledData[i*params.minibatchSize:(i+1)*params.minibatchSize] 344 | 345 | # The issue with using a minibatchSize that's not a divisor of corpus size 346 | # is that m is calculated before the epoch loop. This is not trivial to fix, 347 | # because constructMinimalDistanceIndicesFunction gets n and m as args. 348 | assert params.minibatchSize==len(dataBatch) 349 | 350 | minibatchDistances = sampleAndUpdate(train_fn, net_fn, closestFnFactory, params.inDim, sampleSource, 351 | n=params.minibatchSize, data=dataBatch, m=m, 352 | innerGradientStepCount=innerGradientStepCount) 353 | epochDistances.append(minibatchDistances) 354 | epochDistances = np.array(epochDistances) 355 | epochInterimMean = epochDistances.mean() 356 | epochInterimMedian = np.median(epochDistances) 357 | 358 | # Remove the "epoch != 0" if you are trying to catch evaluation crashes. 359 | if epoch % params.plotEach == 0 and epoch != 0: 360 | print >> logger, "epoch %d epochInterimMean %f epochInterimMedian %f" % (epoch, epochInterimMean, epochInterimMedian) 361 | print >> logger, "learningRate", learningRate_shared.get_value() 362 | if isLowDim: 363 | lowDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger) 364 | else: 365 | highDimFitAndVis(data, validation, epoch, net, net_fn, closestFnFactory, sampleSource, params, logger) 366 | 367 | 368 | learningRate_shared.set_value( learningRateDecay * learningRate_shared.get_value() ) 369 | 370 | return validationMean # The last calculated one, we don't recalculate. 371 | 372 | 373 | def setupAndRun(params): 374 | data, validation = nnbase.inputs.readData(params) 375 | # We dump after readData() because it augments params 376 | # with width/height deduced from the input data. 
377 | nnbase.inputs.dumpParams(params, file(params.expName+"/conf.txt", "w")) 378 | 379 | isLowDim = "isLowDim" in params and params.isLowDim 380 | 381 | with file(params.expName+"/log.txt", "w") as logger: 382 | if not isLowDim: 383 | meanDist, medianDist = evaluate.fitAndVisNNBaselineMain(data, validation, params) 384 | print >> logger, "nnbaselineMean %f nnbaselineMedian %f" % (meanDist, medianDist) 385 | 386 | value = train(data, validation, params, logger) 387 | print >> logger, "final performance %f" % value 388 | 389 | return value 390 | 391 | def sampleAndPlot(net_fn, inDim, initialSD, inBoolDim, n, name): 392 | initial, sampled = sampleSourceParametrized(net_fn, n, inDim, initialSD, inBoolDim) 393 | nnbase.vis.plot(sampled, name) 394 | 395 | def mainLowDim(expName, minibatchSize, initialSD): 396 | inDim = 2 397 | outDim = 2 398 | layerNum = 3 399 | hidden = 100 400 | input_var = T.matrix('inputs') 401 | net = buildNet(input_var, layerNum, inDim, hidden, outDim, useReLU=False) 402 | train_fn = constructTrainFunction(input_var, net) 403 | net_fn = constructSamplerFunction(input_var, net) 404 | for i in range(100): 405 | print i, 406 | sys.stdout.flush() 407 | sampleAndUpdate(train_fn, net_fn, inDim, n=minibatchSize) 408 | sampleAndPlot(net_fn, inDim, initialSD, 1000, expName+"/d"+str(i)) 409 | print 410 | 411 | 412 | def setDefaultParams(): 413 | params = AttrDict() 414 | params.inputType = "mnist" 415 | 416 | if params.inputType=="image": 417 | params.imageDirectory = "../face/SCUT-FBP/thumb.big/" 418 | params.gridSizeForSampling = 10 419 | params.gridSizeForInterpolation = 20 420 | params.plotEach = 1000 421 | elif params.inputType=="mnist": 422 | params.inputDigit = None 423 | params.everyNthInput = 10 424 | params.gridSizeForSampling = 20 425 | params.gridSizeForInterpolation = 30 426 | params.plotEach = 100 # That's too small for params.inputDigit = None, params.everyNthInput = 1 427 | else: 428 | assert False, "unknown inputType" 429 | 430 | # values 
coming from adhoc/spearmint-best-leaky.txt 431 | 432 | params.inDim = 50 433 | params.inBoolDim = 0 434 | params.initialSD = 0.25 435 | params.minibatchSize = 1000 436 | # m = oversampling*minibatchSize, that's how many 437 | # generated samples do we pair with our minibatchSize gold samples. 438 | params.oversampling = 8.0 439 | params.hiddenLayerSize = 673 440 | params.layerNum = 3 441 | params.useReLU = True 442 | params.reLULeakiness = 0.01 443 | params.learningRate = 1.0 444 | params.momentum = 0.969849416169 445 | # in experiment regularization_initialSD used 6400 here, but that's 446 | # not nice to Spearmint, as validation optimum is usually 447 | # at 4800, and I don't have early stopping implemented. 448 | params.epochCount = 4800 449 | params.plotEach = 800 450 | return params 451 | 452 | 453 | SHORTENED_PARAM_NAMES = { "learningRate":"lr", "minibatchSize":"n", 454 | "momentum":"mom", "hiddenLayerSize":"hls", 455 | "oversampling":"os"} 456 | 457 | def spearmintDirName(spearmintParams): 458 | pairs = [] 459 | for k in sorted(spearmintParams.keys()): 460 | v = spearmintParams[k] 461 | assert len(v)==1 462 | v = v[0] 463 | if k in SHORTENED_PARAM_NAMES: 464 | k = SHORTENED_PARAM_NAMES[k] 465 | # TODO if v is a float, normalize it. (0.2000001 and 0.199999 to 0.2) 466 | pairs.append((k, str(v))) 467 | pairs.sort() 468 | return "-".join(map(lambda (k,v): k+v, pairs)) 469 | 470 | def spearmintEntry(spearmintParams): 471 | params = setDefaultParams() 472 | for k,v in spearmintParams.iteritems(): 473 | # v[0] because we only work with single values, and those are 1-element ndarrays in spearmint 474 | assert len(v)==1 475 | # We want int32 and float32, not the 64bit versions provided by spearmint. 
476 | # http://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types/11389998#11389998 477 | params[k] = np.asscalar(v[0]) 478 | params.expName = "spearmintOutput/" + spearmintDirName(spearmintParams) 479 | 480 | try: 481 | os.mkdir(params.expName) 482 | except OSError: 483 | logg("Warning: target directory already exists, or can't be created.") 484 | 485 | # If we are interested in consistent behavior across several datasets, 486 | # we can simply aggregate here: value = setupAndRun(params1) + setupAndRun(params2) 487 | # where params1 and params2 are the same except for imageDirectory or inputDigit or whatever (and expName). 488 | value = setupAndRun(params) 489 | # np.float32 to float: 490 | value = np.asscalar(value) 491 | return value 492 | 493 | def main(): 494 | assert len(sys.argv)==2 495 | confFilename = sys.argv[1] 496 | params = nnbase.inputs.paramsFromConf(file(confFilename)) 497 | logg("Starting experiment, working directory: "+params.expName) 498 | 499 | try: 500 | os.mkdir(params.expName) 501 | except OSError: 502 | logg("Warning: target directory already exists, or can't be created.") 503 | 504 | value = setupAndRun(params) 505 | logg("final performance %f" % value) 506 | 507 | # TODO This codepath is temporarily abandoned: 508 | # mainLowDim(params.expName, params.minibatchSize) 509 | 510 | if __name__ == "__main__": 511 | doCPUProfile = False 512 | if doCPUProfile: 513 | import cProfile 514 | cProfile.run("main()", "pstats") 515 | else: 516 | main() 517 | -------------------------------------------------------------------------------- /earthMoverTest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import math 3 | 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | # A cool little toy learning problem: 9 | # We want to learn a 1D distribution, e.g. uniform on (-1,+1). 10 | # We want to model it with a gaussian mixture model. 
11 | # (Mixture of k 1D standard normals, parametrized by the k means.) 12 | # We generate n samples from the target distribution. 13 | # We generate n samples from our current best bet for the model. 14 | # We find the pairing that minimizes the summed distance between paired points. 15 | def toyLearner(): 16 | n = 2000 17 | k = 100 18 | sigma = 0.05 19 | learningRate = 0.005 20 | epochCount = 100 21 | 22 | centers = np.random.normal(size=k).astype(np.float32) 23 | 24 | def generate(centers, n): 25 | picks = np.random.randint(k, size=n) 26 | currentCenters = centers[picks] # smart indexing 27 | generated = currentCenters + sigma * np.random.normal(size=n).astype(np.float32) 28 | return generated, picks 29 | 30 | for epoch in range(epochCount): 31 | DIST = "triangle" 32 | if DIST=="uniform": 33 | data = np.sort(np.random.uniform(low=-1, high=+1, size=(n,)).astype(np.float32)) 34 | elif DIST=="triangle": 35 | bi = np.random.uniform(low=0, high=1, size=(n,2)).astype(np.float32) 36 | data = np.max(bi, axis=-1) 37 | else: 38 | assert False, "unknown distribution" 39 | 40 | data.sort() 41 | generated, picks = generate(centers, n) 42 | 43 | if epoch%5==0: 44 | plt.hist(generate(centers, 100000)[0], 50, normed=0, facecolor='green') 45 | plt.savefig("emd"+str(epoch)+".pdf") 46 | plt.close() 47 | plt.scatter(centers[:-1], centers[1:]-centers[:-1]) 48 | plt.savefig("delta"+str(epoch)+".pdf") 49 | plt.close() 50 | 51 | sortedPairs = zip(generated, picks) 52 | sortedPairs.sort() 53 | triplets = zip(sortedPairs, data) 54 | # both are sorted at this point, this pairing is the earth mover's pairing. 55 | totalLoss = 0.0 56 | for (g,p), d in triplets: 57 | # linear derivative, corresponds to L2squared. 58 | # math.copysign(1, d-g) would be the derivative of L1=L2unsquared 59 | differential = d-g 60 | totalLoss += abs(differential) # NOT L2 squared, proper L2! 
61 | centers[p] += differential * learningRate 62 | centers.sort() 63 | print "loss", totalLoss 64 | 65 | sys.stdout.flush() 66 | 67 | if __name__ == "__main__": 68 | toyLearner() 69 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import theano 4 | import theano.tensor as T 5 | import lasagne 6 | 7 | import distances 8 | import nnbase.vis 9 | 10 | # TODO If this functionality is important, 11 | # TODO I'll probably have to rewrite it in Theano, 12 | # TODO together with the workhorse kohonen.distanceMatrix(). 13 | # TODO Especially if the gradient descent based finetuning comes in. 14 | 15 | def approximateMinibatch(data, net_fn, closestFnFactory, sampleSourceFunction, inDim, sampleForEach): 16 | n = len(data) 17 | initial, sampled = sampleSourceFunction(net_fn, sampleForEach, inDim) 18 | bestDistIndices = closestFnFactory(sampled, data) 19 | sampled = sampled[bestDistIndices] 20 | distances = np.linalg.norm(data-sampled, axis=1) 21 | return initial, sampled, distances 22 | 23 | # For each validation sample we find the closest train sample. 24 | def approximateFromTrain(train, validation, closestFnFactory): 25 | bestDistIndices = closestFnFactory(train, validation) 26 | nearests = train[bestDistIndices] 27 | distances = np.linalg.norm(validation-nearests, axis=1) 28 | return nearests, distances 29 | 30 | # We generate sampleTotal data points, and for each gold data point 31 | # we find the closest generated one. 32 | def approximate(data, net_fn, closestFnFactory, sampleSourceFunction, inDim, sampleTotal): 33 | bestInitial, bestSampled, bestDistances = None, None, None 34 | # approximate_minibatch builds a matrix of size (len(data), sampleForEachMinibatch). 35 | # We want this matrix to fit into memory. 
36 | distanceMatrixSizeLimit = int(1e6) 37 | sampleForEachMinibatch = distanceMatrixSizeLimit / len(data) 38 | batchCount = sampleTotal / sampleForEachMinibatch + 1 39 | for indx in xrange(batchCount): 40 | initial, sampled, distances = approximateMinibatch(data, net_fn, closestFnFactory, sampleSourceFunction, inDim, sampleForEachMinibatch) 41 | if bestDistances is None: 42 | bestInitial, bestSampled, bestDistances = initial, sampled, distances 43 | else: 44 | # Could easily vectorize but not a bottleneck. 45 | for i in range(len(bestDistances)): 46 | if distances[i] apt-get.cout 2> apt-get.cerr 2 | # Check the latest deb at https://developer.nvidia.com/cuda-downloads 3 | wget http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb 4 | sudo dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb 5 | sudo apt-get update 6 | sudo apt-get install cuda 7 | sudo reboot 8 | # check: 9 | /usr/local/cuda-7.5/bin/nvcc --version 10 | 11 | # This installs a Theano that's newer than regular pip install, actually this one: 12 | # git+https://github.com/Theano/Theano.git@15c90dd3#egg=Theano==0.8.git 13 | sudo pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/v0.1/requirements.txt 14 | 15 | # if you have this repo available, copy or copypaste: 16 | cp daniel-experiments/kohonen/theanorc.txt .theanorc 17 | 18 | # check: 19 | python -c "import numpy; numpy.test()" 20 | python `python -c "import os, theano; print os.path.dirname(theano.__file__)"`/misc/check_blas.py 21 | 22 | sudo pip install Lasagne==0.1 23 | 24 | # Libs required for matplotlib that comes with nolearn. 25 | # scikit-learn also comes with nolearn. 
26 | sudo apt-get install libpng-dev 27 | sudo apt-get install libfreetype6-dev 28 | sudo pip install git+https://github.com/dnouri/nolearn.git@master#egg=nolearn==0.7.git 29 | # otherwise matplotlib wants to communicate with nonexisting X11: 30 | mkdir .matplotlib 31 | echo "backend : Agg" > .matplotlib/matplotlibrc 32 | 33 | # cuDNN 34 | # Login to NVIDIA, get cuDNN 4.0 for Linux x64: 35 | open https://developer.nvidia.com/cudnn 36 | # or simply take my cached one: 37 | wget people.mokk.bme.hu/~daniel/tmp/cudnn-7.0-linux-x64-v4.0-prod.tgz 38 | cd /usr/local/ 39 | sudo tar zxvf ~/cudnn-7.0-linux-x64-v4.0-prod.tgz 40 | 41 | cd 42 | mkdir .ssh 43 | ssh-keygen -t rsa -b 4096 -C "daniel.varga@prezi.com" 44 | eval "$(ssh-agent -s)" 45 | ssh-add ~/.ssh/id_rsa 46 | # Now add ~/.ssh/id_rsa.pub to github settings. 47 | git config --global user.email "daniel.varga@prezi.com" 48 | git config --global user.name "Daniel Varga" 49 | 50 | # Spearmint 51 | git clone git@github.com:HIPS/Spearmint.git 52 | sudo pip install -e Spearmint 53 | sudo apt-get install mongodb 54 | sudo pip install pymongo 55 | sudo service mongod start 56 | 57 | git clone git@github.com:danielvarga/daniel-experiments.git 58 | # check: 59 | time python daniel-experiments/kohonen/testNumpyToTheano.py > cout 60 | # -> 9.5 secs for testSampleInitial(), 6.7 secs with allow_gc=False. 61 | # test() minimal distances theano finishes in 0.263873 seconds. 62 | 63 | wget http://deeplearning.net/data/mnist/mnist.pkl.gz 64 | mv mnist.pkl.gz daniel-experiments/rbm/data/ 65 | 66 | cd daniel-experiments/kohonen 67 | python Spearmint/spearmint/main.py . 
> spearmintOutput/log.cout 2> spearmintOutput/log.cerr 68 | -------------------------------------------------------------------------------- /kohonen.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import math 4 | import random 5 | import sys 6 | import cPickle 7 | import gzip 8 | 9 | import next_permutation 10 | import munkres 11 | 12 | def pretty(m): 13 | for row in m: 14 | print "\t".join(map(str, row)) 15 | 16 | def halfCircle(): 17 | x = 1.0 18 | y = 1.0 19 | while x*x+y*y>1.0: 20 | x = random.uniform( 0.0, +1.0) 21 | y = random.uniform(-1.0, +1.0) 22 | return (x,y) 23 | 24 | def wave(): 25 | x = random.uniform( -math.pi, +math.pi) 26 | y = math.sin(x)+random.uniform( -0.2, +0.2) 27 | return (x,y) 28 | 29 | def triangle(): 30 | x = random.uniform(-1.0, +1.0) 31 | y = random.uniform(-1.0, x) 32 | return (x,y) 33 | 34 | 35 | def sampleFromTarget(): 36 | # return wave() 37 | return halfCircle() 38 | # return triangle() 39 | 40 | def samplesFromTarget(n): 41 | return np.array([sampleFromTarget() for i in xrange(n)]) 42 | 43 | def samplesFromInit(n, d, e): 44 | norm = np.random.normal(loc=0.0, scale=1.0, size=(n,e)) 45 | z = np.zeros((n,d-e)) 46 | data = np.hstack((norm, z)) 47 | assert data.shape==(n,d) 48 | return data 49 | 50 | # Both are (n x d) arrays. 51 | def sumOfDistances(x,y): 52 | return np.sum(np.linalg.norm(x-y, axis=1)) 53 | 54 | # Both are (n x d) arrays. 55 | # Scales with O(n!) boo! 56 | # We could bring it down by reducing it to minimum-weight 57 | # matching on a complete bipartite graph. 58 | # If we need really large n, then a sequential 59 | # greedy alg is probably more than good enough. 60 | # Probably we'll have something partially parallel that's even 61 | # faster than the naive sequential greedy alg. 
62 | def slowOptimalPairing(x,y): 63 | n,d = x.shape 64 | assert y.shape==(n,d) 65 | bestDist = np.inf 66 | bestP = None 67 | for p in next_permutation.next_permutation(range(n)): 68 | dist = sumOfDistances(x[p],y) 69 | if dist %d' % (row, column, value) 116 | print 'total cost: %d' % total 117 | 118 | Running that program produces:: 119 | 120 | Lowest cost through this matrix: 121 | [5, 9, 1] 122 | [10, 3, 2] 123 | [8, 7, 4] 124 | (0, 0) -> 5 125 | (1, 1) -> 3 126 | (2, 2) -> 4 127 | total cost=12 128 | 129 | The instantiated Munkres object can be used multiple times on different 130 | matrices. 131 | 132 | Non-square Cost Matrices 133 | ======================== 134 | 135 | The Munkres algorithm assumes that the cost matrix is square. However, it's 136 | possible to use a rectangular matrix if you first pad it with 0 values to make 137 | it square. This module automatically pads rectangular cost matrices to make 138 | them square. 139 | 140 | Notes: 141 | 142 | - The module operates on a *copy* of the caller's matrix, so any padding will 143 | not be seen by the caller. 144 | - The cost matrix must be rectangular or square. An irregular matrix will 145 | *not* work. 146 | 147 | Calculating Profit, Rather than Cost 148 | ==================================== 149 | 150 | The cost matrix is just that: A cost matrix. The Munkres algorithm finds 151 | the combination of elements (one from each row and column) that results in 152 | the smallest cost. It's also possible to use the algorithm to maximize 153 | profit. To do that, however, you have to convert your profit matrix to a 154 | cost matrix. The simplest way to do that is to subtract all elements from a 155 | large value. 
For example:: 156 | 157 | from munkres import Munkres, print_matrix 158 | 159 | matrix = [[5, 9, 1], 160 | [10, 3, 2], 161 | [8, 7, 4]] 162 | cost_matrix = [] 163 | for row in matrix: 164 | cost_row = [] 165 | for col in row: 166 | cost_row += [sys.maxint - col] 167 | cost_matrix += [cost_row] 168 | 169 | m = Munkres() 170 | indexes = m.compute(cost_matrix) 171 | print_matrix(matrix, msg='Highest profit through this matrix:') 172 | total = 0 173 | for row, column in indexes: 174 | value = matrix[row][column] 175 | total += value 176 | print '(%d, %d) -> %d' % (row, column, value) 177 | 178 | print 'total profit=%d' % total 179 | 180 | Running that program produces:: 181 | 182 | Highest profit through this matrix: 183 | [5, 9, 1] 184 | [10, 3, 2] 185 | [8, 7, 4] 186 | (0, 1) -> 9 187 | (1, 0) -> 10 188 | (2, 2) -> 4 189 | total profit=23 190 | 191 | The ``munkres`` module provides a convenience method for creating a cost 192 | matrix from a profit matrix. Since it doesn't know whether the matrix contains 193 | floating point numbers, decimals, or integers, you have to provide the 194 | conversion function; but the convenience method takes care of the actual 195 | creation of the cost matrix:: 196 | 197 | import munkres 198 | 199 | cost_matrix = munkres.make_cost_matrix(matrix, 200 | lambda cost: sys.maxint - cost) 201 | 202 | So, the above profit-calculation program can be recast as:: 203 | 204 | from munkres import Munkres, print_matrix, make_cost_matrix 205 | 206 | matrix = [[5, 9, 1], 207 | [10, 3, 2], 208 | [8, 7, 4]] 209 | cost_matrix = make_cost_matrix(matrix, lambda cost: sys.maxint - cost) 210 | m = Munkres() 211 | indexes = m.compute(cost_matrix) 212 | print_matrix(matrix, msg='Lowest cost through this matrix:') 213 | total = 0 214 | for row, column in indexes: 215 | value = matrix[row][column] 216 | total += value 217 | print '(%d, %d) -> %d' % (row, column, value) 218 | print 'total profit=%d' % total 219 | 220 | References 221 | ========== 222 | 223 | 1. 
http://www.public.iastate.edu/~ddoty/HungarianAlgorithm.html 224 | 225 | 2. Harold W. Kuhn. The Hungarian Method for the assignment problem. 226 | *Naval Research Logistics Quarterly*, 2:83-97, 1955. 227 | 228 | 3. Harold W. Kuhn. Variants of the Hungarian method for assignment 229 | problems. *Naval Research Logistics Quarterly*, 3: 253-258, 1956. 230 | 231 | 4. Munkres, J. Algorithms for the Assignment and Transportation Problems. 232 | *Journal of the Society of Industrial and Applied Mathematics*, 233 | 5(1):32-38, March, 1957. 234 | 235 | 5. http://en.wikipedia.org/wiki/Hungarian_algorithm 236 | 237 | Copyright and License 238 | ===================== 239 | 240 | This software is released under a BSD license, adapted from 241 | 242 | 243 | Copyright (c) 2008 Brian M. Clapper 244 | All rights reserved. 245 | 246 | Redistribution and use in source and binary forms, with or without 247 | modification, are permitted provided that the following conditions are met: 248 | 249 | * Redistributions of source code must retain the above copyright notice, 250 | this list of conditions and the following disclaimer. 251 | 252 | * Redistributions in binary form must reproduce the above copyright notice, 253 | this list of conditions and the following disclaimer in the documentation 254 | and/or other materials provided with the distribution. 255 | 256 | * Neither the name "clapper.org" nor the names of its contributors may be 257 | used to endorse or promote products derived from this software without 258 | specific prior written permission. 259 | 260 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 261 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 262 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 263 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 264 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 265 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 266 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 267 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 268 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 269 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 270 | POSSIBILITY OF SUCH DAMAGE. 271 | """ 272 | 273 | __docformat__ = 'restructuredtext' 274 | 275 | # --------------------------------------------------------------------------- 276 | # Imports 277 | # --------------------------------------------------------------------------- 278 | 279 | import sys 280 | 281 | # --------------------------------------------------------------------------- 282 | # Exports 283 | # --------------------------------------------------------------------------- 284 | 285 | __all__ = ['Munkres', 'make_cost_matrix'] 286 | 287 | # --------------------------------------------------------------------------- 288 | # Globals 289 | # --------------------------------------------------------------------------- 290 | 291 | # Info about the module 292 | __version__ = "1.0.5.4" 293 | __author__ = "Brian Clapper, bmc@clapper.org" 294 | __url__ = "http://bmc.github.com/munkres/" 295 | __copyright__ = "(c) 2008 Brian M. Clapper" 296 | __license__ = "BSD-style license" 297 | 298 | # --------------------------------------------------------------------------- 299 | # Classes 300 | # --------------------------------------------------------------------------- 301 | 302 | class Munkres: 303 | """ 304 | Calculate the Munkres solution to the classical assignment problem. 305 | See the module documentation for usage. 
306 | """ 307 | 308 | def __init__(self): 309 | """Create a new instance""" 310 | self.C = None 311 | self.row_covered = [] 312 | self.col_covered = [] 313 | self.n = 0 314 | self.Z0_r = 0 315 | self.Z0_c = 0 316 | self.marked = None 317 | self.path = None 318 | 319 | def make_cost_matrix(profit_matrix, inversion_function): 320 | """ 321 | **DEPRECATED** 322 | 323 | Please use the module function ``make_cost_matrix()``. 324 | """ 325 | import munkres 326 | return munkres.make_cost_matrix(profit_matrix, inversion_function) 327 | 328 | make_cost_matrix = staticmethod(make_cost_matrix) 329 | 330 | def pad_matrix(self, matrix, pad_value=0): 331 | """ 332 | Pad a possibly non-square matrix to make it square. 333 | 334 | :Parameters: 335 | matrix : list of lists 336 | matrix to pad 337 | 338 | pad_value : int 339 | value to use to pad the matrix 340 | 341 | :rtype: list of lists 342 | :return: a new, possibly padded, matrix 343 | """ 344 | max_columns = 0 345 | total_rows = len(matrix) 346 | 347 | for row in matrix: 348 | max_columns = max(max_columns, len(row)) 349 | 350 | total_rows = max(max_columns, total_rows) 351 | 352 | new_matrix = [] 353 | for row in matrix: 354 | row_len = len(row) 355 | new_row = row[:] 356 | if total_rows > row_len: 357 | # Row too short. Pad it. 358 | new_row += [0] * (total_rows - row_len) 359 | new_matrix += [new_row] 360 | 361 | while len(new_matrix) < total_rows: 362 | new_matrix += [[0] * total_rows] 363 | 364 | return new_matrix 365 | 366 | def compute(self, cost_matrix): 367 | """ 368 | Compute the indexes for the lowest-cost pairings between rows and 369 | columns in the database. Returns a list of (row, column) tuples 370 | that can be used to traverse the matrix. 371 | 372 | :Parameters: 373 | cost_matrix : list of lists 374 | The cost matrix. If this cost matrix is not square, it 375 | will be padded with zeros, via a call to ``pad_matrix()``. 376 | (This method does *not* modify the caller's matrix. 
It 377 | operates on a copy of the matrix.) 378 | 379 | **WARNING**: This code handles square and rectangular 380 | matrices. It does *not* handle irregular matrices. 381 | 382 | :rtype: list 383 | :return: A list of ``(row, column)`` tuples that describe the lowest 384 | cost path through the matrix 385 | 386 | """ 387 | self.C = self.pad_matrix(cost_matrix) 388 | self.n = len(self.C) 389 | self.original_length = len(cost_matrix) 390 | self.original_width = len(cost_matrix[0]) 391 | self.row_covered = [False for i in range(self.n)] 392 | self.col_covered = [False for i in range(self.n)] 393 | self.Z0_r = 0 394 | self.Z0_c = 0 395 | self.path = self.__make_matrix(self.n * 2, 0) 396 | self.marked = self.__make_matrix(self.n, 0) 397 | 398 | done = False 399 | step = 1 400 | 401 | steps = { 1 : self.__step1, 402 | 2 : self.__step2, 403 | 3 : self.__step3, 404 | 4 : self.__step4, 405 | 5 : self.__step5, 406 | 6 : self.__step6 } 407 | 408 | while not done: 409 | try: 410 | func = steps[step] 411 | step = func() 412 | except KeyError: 413 | done = True 414 | 415 | # Look for the starred columns 416 | results = [] 417 | for i in range(self.original_length): 418 | for j in range(self.original_width): 419 | if self.marked[i][j] == 1: 420 | results += [(i, j)] 421 | 422 | return results 423 | 424 | def __copy_matrix(self, matrix): 425 | """Return an exact copy of the supplied matrix""" 426 | return copy.deepcopy(matrix) 427 | 428 | def __make_matrix(self, n, val): 429 | """Create an *n*x*n* matrix, populating it with the specific value.""" 430 | matrix = [] 431 | for i in range(n): 432 | matrix += [[val for j in range(n)]] 433 | return matrix 434 | 435 | def __step1(self): 436 | """ 437 | For each row of the matrix, find the smallest element and 438 | subtract it from every element in its row. Go to Step 2. 
439 | """ 440 | C = self.C 441 | n = self.n 442 | for i in range(n): 443 | minval = min(self.C[i]) 444 | # Find the minimum value for this row and subtract that minimum 445 | # from every element in the row. 446 | for j in range(n): 447 | self.C[i][j] -= minval 448 | 449 | return 2 450 | 451 | def __step2(self): 452 | """ 453 | Find a zero (Z) in the resulting matrix. If there is no starred 454 | zero in its row or column, star Z. Repeat for each element in the 455 | matrix. Go to Step 3. 456 | """ 457 | n = self.n 458 | for i in range(n): 459 | for j in range(n): 460 | if (self.C[i][j] == 0) and \ 461 | (not self.col_covered[j]) and \ 462 | (not self.row_covered[i]): 463 | self.marked[i][j] = 1 464 | self.col_covered[j] = True 465 | self.row_covered[i] = True 466 | 467 | self.__clear_covers() 468 | return 3 469 | 470 | def __step3(self): 471 | """ 472 | Cover each column containing a starred zero. If K columns are 473 | covered, the starred zeros describe a complete set of unique 474 | assignments. In this case, Go to DONE, otherwise, Go to Step 4. 475 | """ 476 | n = self.n 477 | count = 0 478 | for i in range(n): 479 | for j in range(n): 480 | if self.marked[i][j] == 1: 481 | self.col_covered[j] = True 482 | count += 1 483 | 484 | if count >= n: 485 | step = 7 # done 486 | else: 487 | step = 4 488 | 489 | return step 490 | 491 | def __step4(self): 492 | """ 493 | Find a noncovered zero and prime it. If there is no starred zero 494 | in the row containing this primed zero, Go to Step 5. Otherwise, 495 | cover this row and uncover the column containing the starred 496 | zero. Continue in this manner until there are no uncovered zeros 497 | left. Save the smallest uncovered value and Go to Step 6. 
498 | """ 499 | step = 0 500 | done = False 501 | row = -1 502 | col = -1 503 | star_col = -1 504 | while not done: 505 | (row, col) = self.__find_a_zero() 506 | if row < 0: 507 | done = True 508 | step = 6 509 | else: 510 | self.marked[row][col] = 2 511 | star_col = self.__find_star_in_row(row) 512 | if star_col >= 0: 513 | col = star_col 514 | self.row_covered[row] = True 515 | self.col_covered[col] = False 516 | else: 517 | done = True 518 | self.Z0_r = row 519 | self.Z0_c = col 520 | step = 5 521 | 522 | return step 523 | 524 | def __step5(self): 525 | """ 526 | Construct a series of alternating primed and starred zeros as 527 | follows. Let Z0 represent the uncovered primed zero found in Step 4. 528 | Let Z1 denote the starred zero in the column of Z0 (if any). 529 | Let Z2 denote the primed zero in the row of Z1 (there will always 530 | be one). Continue until the series terminates at a primed zero 531 | that has no starred zero in its column. Unstar each starred zero 532 | of the series, star each primed zero of the series, erase all 533 | primes and uncover every line in the matrix. Return to Step 3 534 | """ 535 | count = 0 536 | path = self.path 537 | path[count][0] = self.Z0_r 538 | path[count][1] = self.Z0_c 539 | done = False 540 | while not done: 541 | row = self.__find_star_in_col(path[count][1]) 542 | if row >= 0: 543 | count += 1 544 | path[count][0] = row 545 | path[count][1] = path[count-1][1] 546 | else: 547 | done = True 548 | 549 | if not done: 550 | col = self.__find_prime_in_row(path[count][0]) 551 | count += 1 552 | path[count][0] = path[count-1][0] 553 | path[count][1] = col 554 | 555 | self.__convert_path(path, count) 556 | self.__clear_covers() 557 | self.__erase_primes() 558 | return 3 559 | 560 | def __step6(self): 561 | """ 562 | Add the value found in Step 4 to every element of each covered 563 | row, and subtract it from every element of each uncovered column. 
564 | Return to Step 4 without altering any stars, primes, or covered 565 | lines. 566 | """ 567 | minval = self.__find_smallest() 568 | for i in range(self.n): 569 | for j in range(self.n): 570 | if self.row_covered[i]: 571 | self.C[i][j] += minval 572 | if not self.col_covered[j]: 573 | self.C[i][j] -= minval 574 | return 4 575 | 576 | def __find_smallest(self): 577 | """Find the smallest uncovered value in the matrix.""" 578 | minval = sys.maxint 579 | for i in range(self.n): 580 | for j in range(self.n): 581 | if (not self.row_covered[i]) and (not self.col_covered[j]): 582 | if minval > self.C[i][j]: 583 | minval = self.C[i][j] 584 | return minval 585 | 586 | def __find_a_zero(self): 587 | """Find the first uncovered element with value 0""" 588 | row = -1 589 | col = -1 590 | i = 0 591 | n = self.n 592 | done = False 593 | 594 | while not done: 595 | j = 0 596 | while True: 597 | if (self.C[i][j] == 0) and \ 598 | (not self.row_covered[i]) and \ 599 | (not self.col_covered[j]): 600 | row = i 601 | col = j 602 | done = True 603 | j += 1 604 | if j >= n: 605 | break 606 | i += 1 607 | if i >= n: 608 | done = True 609 | 610 | return (row, col) 611 | 612 | def __find_star_in_row(self, row): 613 | """ 614 | Find the first starred element in the specified row. Returns 615 | the column index, or -1 if no starred element was found. 616 | """ 617 | col = -1 618 | for j in range(self.n): 619 | if self.marked[row][j] == 1: 620 | col = j 621 | break 622 | 623 | return col 624 | 625 | def __find_star_in_col(self, col): 626 | """ 627 | Find the first starred element in the specified column. Returns 628 | the row index, or -1 if no starred element was found. 629 | """ 630 | row = -1 631 | for i in range(self.n): 632 | if self.marked[i][col] == 1: 633 | row = i 634 | break 635 | 636 | return row 637 | 638 | def __find_prime_in_row(self, row): 639 | """ 640 | Find the first prime element in the specified row. Returns 641 | the column index, or -1 if no prime element was found.
642 | """ 643 | col = -1 644 | for j in range(self.n): 645 | if self.marked[row][j] == 2: 646 | col = j 647 | break 648 | 649 | return col 650 | 651 | def __convert_path(self, path, count): 652 | for i in range(count+1): 653 | if self.marked[path[i][0]][path[i][1]] == 1: 654 | self.marked[path[i][0]][path[i][1]] = 0 655 | else: 656 | self.marked[path[i][0]][path[i][1]] = 1 657 | 658 | def __clear_covers(self): 659 | """Clear all covered matrix cells""" 660 | for i in range(self.n): 661 | self.row_covered[i] = False 662 | self.col_covered[i] = False 663 | 664 | def __erase_primes(self): 665 | """Erase all prime markings""" 666 | for i in range(self.n): 667 | for j in range(self.n): 668 | if self.marked[i][j] == 2: 669 | self.marked[i][j] = 0 670 | 671 | # --------------------------------------------------------------------------- 672 | # Functions 673 | # --------------------------------------------------------------------------- 674 | 675 | def make_cost_matrix(profit_matrix, inversion_function): 676 | """ 677 | Create a cost matrix from a profit matrix by calling 678 | 'inversion_function' to invert each value. The inversion 679 | function must take one numeric argument (of any type) and return 680 | another numeric argument which is presumed to be the cost inverse 681 | of the original profit. 682 | 683 | This is a static method. Call it like this: 684 | 685 | .. python:: 686 | 687 | cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func) 688 | 689 | For example: 690 | 691 | .. 
python:: 692 | 693 | cost_matrix = Munkres.make_cost_matrix(matrix, lambda x : sys.maxint - x) 694 | 695 | :Parameters: 696 | profit_matrix : list of lists 697 | The matrix to convert from a profit to a cost matrix 698 | 699 | inversion_function : function 700 | The function to use to invert each entry in the profit matrix 701 | 702 | :rtype: list of lists 703 | :return: The converted matrix 704 | """ 705 | cost_matrix = [] 706 | for row in profit_matrix: 707 | cost_matrix.append([inversion_function(value) for value in row]) 708 | return cost_matrix 709 | 710 | def print_matrix(matrix, msg=None): 711 | """ 712 | Convenience function: Displays the contents of a matrix of integers. 713 | 714 | :Parameters: 715 | matrix : list of lists 716 | Matrix to print 717 | 718 | msg : str 719 | Optional message to print before displaying the matrix 720 | """ 721 | import math 722 | 723 | if msg is not None: 724 | print msg 725 | 726 | # Calculate the appropriate format width. 727 | width = 0 728 | for row in matrix: 729 | for val in row: 730 | width = max(width, int(math.log10(val)) + 1) 731 | 732 | # Make the format string 733 | format = '%%%dd' % width 734 | 735 | # Print the matrix 736 | for row in matrix: 737 | sep = '[' 738 | for val in row: 739 | sys.stdout.write(sep + format % val) 740 | sep = ', ' 741 | sys.stdout.write(']\n') 742 | 743 | # --------------------------------------------------------------------------- 744 | # Main 745 | # --------------------------------------------------------------------------- 746 | 747 | if __name__ == '__main__': 748 | 749 | 750 | matrices = [ 751 | # Square 752 | ([[400, 150, 400], 753 | [400, 450, 600], 754 | [300, 225, 300]], 755 | 850 # expected cost 756 | ), 757 | 758 | # Rectangular variant 759 | ([[400, 150, 400, 1], 760 | [400, 450, 600, 2], 761 | [300, 225, 300, 3]], 762 | 452 # expected cost 763 | ), 764 | 765 | # Square 766 | ([[10, 10, 8], 767 | [ 9, 8, 1], 768 | [ 9, 7, 4]], 769 | 18 770 | ), 771 | 772 | # Rectangular 
variant 773 | ([[10, 10, 8, 11], 774 | [ 9, 8, 1, 1], 775 | [ 9, 7, 4, 10]], 776 | 15 777 | ), 778 | ] 779 | 780 | m = Munkres() 781 | for cost_matrix, expected_total in matrices: 782 | print_matrix(cost_matrix, msg='cost matrix') 783 | indexes = m.compute(cost_matrix) 784 | total_cost = 0 785 | for r, c in indexes: 786 | x = cost_matrix[r][c] 787 | total_cost += x 788 | print '(%d, %d) -> %d' % (r, c, x) 789 | print 'lowest cost=%d' % total_cost 790 | assert expected_total == total_cost 791 | 792 | -------------------------------------------------------------------------------- /nearestNeighborsTest.py: -------------------------------------------------------------------------------- 1 | # This piece of code was prepared because I asked about its 2 | # performance on the theano-users list: 3 | # https://groups.google.com/forum/#!topic/theano-users/E7ProqnGUMk 4 | # https://gist.github.com/danielvarga/d0eeacea92e65b19188c 5 | 6 | 7 | import numpy as np 8 | import theano 9 | import theano.tensor as T 10 | 11 | 12 | def randomMatrix(n, f): 13 | return np.random.normal(size=n*f).astype(np.float32).reshape((n, f)) 14 | 15 | n = 5000 # number of candidates 16 | m = 1000 # number of targets 17 | f = 500 # number of features 18 | 19 | x = T.matrix('x') # candidates 20 | y = T.matrix('y') # targets 21 | 22 | xL2S = T.sum(x*x, axis=-1) # [n] 23 | yL2S = T.sum(y*y, axis=-1) # [m] 24 | xL2SM = T.zeros((m, n)) + xL2S # broadcasting, [m, n] 25 | yL2SM = T.zeros((n, m)) + yL2S # # broadcasting, [n, m] 26 | squaredPairwiseDistances = xL2SM.T + yL2SM - 2.0*T.dot(x, y.T) # [n, m] 27 | 28 | np.random.seed(1) 29 | 30 | N = randomMatrix(n, f) 31 | M = randomMatrix(m, f) 32 | 33 | lamblinsTrick = True 34 | 35 | if lamblinsTrick: 36 | # from https://github.com/Theano/Theano/issues/1399 37 | s = squaredPairwiseDistances 38 | bestIndices = T.cast( ( T.arange(n).dimshuffle(0, 'x') * T.cast(T.eq(s, s.min(axis=0, keepdims=True)), 'float32') ).sum(axis=0), 'int32') 39 | else: 40 | bestIndices 
= T.argmin(squaredPairwiseDistances, axis=0) 41 | 42 | nearests_fn = theano.function([x, y], bestIndices, profile=True) 43 | 44 | print nearests_fn(N, M).sum() 45 | -------------------------------------------------------------------------------- /next_permutation.py: -------------------------------------------------------------------------------- 1 | def next_permutation(seq, pred=cmp): 2 | """Like C++ std::next_permutation() but implemented as 3 | generator. Yields copies of seq.""" 4 | 5 | def reverse(seq, start, end): 6 | # seq = seq[:start] + reversed(seq[start:end]) + \ 7 | # seq[end:] 8 | end -= 1 9 | if end <= start: 10 | return 11 | while True: 12 | seq[start], seq[end] = seq[end], seq[start] 13 | if start == end or start+1 == end: 14 | return 15 | start += 1 16 | end -= 1 17 | 18 | if not seq: 19 | raise StopIteration 20 | 21 | try: 22 | seq[0] 23 | except TypeError: 24 | raise TypeError("seq must allow random access.") 25 | 26 | first = 0 27 | last = len(seq) 28 | seq = seq[:] 29 | 30 | # Yield input sequence as the STL version is often 31 | # used inside do {} while. 32 | yield seq 33 | 34 | if last == 1: 35 | raise StopIteration 36 | 37 | while True: 38 | next = last - 1 39 | 40 | while True: 41 | # Step 1. 42 | next1 = next 43 | next -= 1 44 | 45 | if pred(seq[next], seq[next1]) < 0: 46 | # Step 2. 47 | mid = last - 1 48 | while not (pred(seq[next], seq[mid]) < 0): 49 | mid -= 1 50 | seq[next], seq[mid] = seq[mid], seq[next] 51 | 52 | # Step 3. 53 | reverse(seq, next1, last) 54 | 55 | # Change to yield references to get rid of 56 | # (at worst) |seq|! copy operations. 
57 | yield seq[:] 58 | break 59 | if next == first: 60 | raise StopIteration 61 | raise StopIteration 62 | -------------------------------------------------------------------------------- /nnbase/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/earth-moving-generative-net/419fb9fe0b93cfbde5e616c285bf788d1920df1d/nnbase/__init__.py -------------------------------------------------------------------------------- /nnbase/attrdict.py: -------------------------------------------------------------------------------- 1 | 2 | # Causes memory leak below python 2.7.3 3 | class AttrDict(dict): 4 | def __init__(self, *args, **kwargs): 5 | super(AttrDict, self).__init__(*args, **kwargs) 6 | self.__dict__ = self 7 | -------------------------------------------------------------------------------- /nnbase/autoencoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lasagne import layers 4 | import numpy as np 5 | 6 | import sys 7 | import gzip 8 | import cPickle 9 | from PIL import Image 10 | 11 | from nnbase.layers import Unpool2DLayer 12 | from nnbase.utils import FlipBatchIterator 13 | ### this is really dumb, current nolearn doesnt play well with lasagne, 14 | ### so had to manually copy the file I wanted to this folder 15 | import nnbase.shape as shape 16 | 17 | import nnbase.inputs 18 | import nnbase.vis 19 | 20 | # This is very error-prone. 21 | # Optimally, there should be a guarantee that the 22 | # corpus loaded here is the same as the one that the 23 | # encoder was trained on. 
24 | def loadCorpus(): 25 | face = True 26 | if face: 27 | directory = "../face/SCUT-FBP/thumb.big/" 28 | X, (height, width) = nnbase.inputs.faces(directory) 29 | else: 30 | X, (height, width) = nnbase.inputs.mnist() 31 | 32 | X = X.astype(np.float64).reshape((-1, 1, height, width)) 33 | mu, sigma = np.mean(X), np.std(X) 34 | print "mu, sigma:", mu, sigma 35 | return X, mu, sigma 36 | 37 | # TODO I don't think that .eval() is how this should work. 38 | def get_output_from_nn(last_layer, X): 39 | indices = np.arange(128, X.shape[0], 128) 40 | # not splitting into batches can cause a memory error 41 | X_batches = np.split(X, indices) 42 | out = [] 43 | for count, X_batch in enumerate(X_batches): 44 | out.append( layers.get_output(last_layer, X_batch).eval() ) 45 | return np.vstack(out) 46 | 47 | # This helper class deals with 48 | # 1. normalizing input and de-normalizing output 49 | # 2. reshaping output into shape compatible with input, namely (-1, 1, x ,y) 50 | class Autoencoder: 51 | # sigma and mu should be trained on the same corpus as the autoencoder itself. 52 | # This is error-prone! 53 | def __init__(self, ae, mu, sigma): 54 | self.ae = ae 55 | self.mu = mu 56 | self.sigma = sigma 57 | 58 | self.encode_layer_index = map(lambda pair : pair[0], self.ae.layers).index('encode_layer') 59 | self.encode_layer = self.ae.get_all_layers()[self.encode_layer_index] 60 | self.afterSplit = False 61 | 62 | # from unnormalized to unnormalized [0,1] MNIST. 63 | # ae is trained on normalized MNIST data. 64 | # For 0-1 clipped digits this should be close to the identity function. 
65 | def predict(self, X): 66 | assert not self.afterSplit 67 | self.x, self.y = X.shape[-2:] 68 | flatOutput = self.ae.predict((X - self.mu) / self.sigma).reshape(X.shape) * self.sigma + self.mu 69 | return flatOutput.reshape((-1, 1, self.x, self.y)) 70 | 71 | def encode(self, X): 72 | self.x, self.y = X.shape[-2:] 73 | return get_output_from_nn(self.encode_layer, (X-self.mu)/self.sigma) 74 | 75 | # N.B after we do this, we won't be able to use the original autoencoder , as the layers are broken up 76 | def split(self): 77 | next_layer = self.ae.get_all_layers()[self.encode_layer_index + 1] 78 | self.final_layer = self.ae.get_all_layers()[-1] 79 | new_layer = layers.InputLayer(shape = (None, self.encode_layer.num_units)) 80 | next_layer.input_layer = new_layer 81 | self.afterSplit = True 82 | 83 | def decode(self, X): 84 | assert self.afterSplit 85 | flatOutput = get_output_from_nn(self.final_layer, X) * self.sigma + self.mu 86 | # Evil hack: decode only knows the shape of the input space 87 | # if you did a predict or encode previously. TODO Fix asap. 88 | return flatOutput.reshape((-1, 1, self.x, self.y)) 89 | 90 | 91 | def main(): 92 | X_train, mu, sigma = loadCorpus() 93 | 94 | # autoencoderFile = "../lasagne-demo/conv_ae.pkl" # Trained on the full mnist train dataset 95 | autoencoderFile = "../lasagne-demo/face.big.pkl" # Trained on the ../face/SCUT-FBP/thumb.big dataset. 
96 | 97 | ae_raw = cPickle.load(open(autoencoderFile, 'r')) 98 | autoencoder = Autoencoder(ae_raw, mu, sigma) 99 | 100 | sampleIndices = map(int, sys.argv[1:]) 101 | assert len(sampleIndices)==2, "the tool expects two sample indices" 102 | X_train = X_train[sampleIndices] 103 | 104 | X_pred = autoencoder.predict(X_train) 105 | print "ended prediction" 106 | sys.stdout.flush() 107 | 108 | nnbase.vis.get_random_images(X_train, X_pred) 109 | 110 | autoencoder.split() 111 | 112 | X_encoded = autoencoder.encode(X_train) 113 | 114 | x0 = X_encoded[0] 115 | x1 = X_encoded[1] 116 | stepCount = 100 117 | intervalBase = np.linspace(1, 0, num=stepCount) 118 | intervalEncoded = np.multiply.outer(intervalBase, x0)+np.multiply.outer(1.0-intervalBase, x1) 119 | 120 | X_decoded = autoencoder.decode(intervalEncoded) 121 | nnbase.vis.get_picture_array(X_decoded, 10, 10, "interval") 122 | 123 | intervalInputspace = np.multiply.outer(intervalBase, X_train[0])+np.multiply.outer(1.0-intervalBase, X_train[1]) 124 | nnbase.vis.get_picture_array(intervalInputspace, 10, 10, "interval-inputspace") 125 | 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /nnbase/inputs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import gzip 4 | import cPickle 5 | 6 | # When I move out the synthetic distributions, these imports should move as well. 7 | import math 8 | import random 9 | import PIL.Image as Image 10 | import PIL.ImageDraw as ImageDraw 11 | 12 | from nnbase.attrdict import AttrDict 13 | import autoencoder 14 | 15 | 16 | def mnist(digit=None, torusHack=False, autoencoded=False, which='train', everyNth=1): 17 | np.random.seed(1) # TODO Not the right place to do this. 
18 | datasetFile = "mnist.pkl.gz" 19 | f = gzip.open(datasetFile, 'rb') 20 | datasets = cPickle.load(f) 21 | train_set, valid_set, test_set = datasets 22 | f.close() 23 | if which=='train': 24 | input, output = train_set 25 | elif which=='validation': 26 | input, output = valid_set 27 | elif which=='test': 28 | input, output = test_set 29 | else: 30 | assert which in ('train', 'validation', 'test') 31 | 32 | input = input.reshape((-1, 28, 28)) 33 | if digit is not None: 34 | input = input[output==digit] 35 | if torusHack: 36 | # This is a SINGLE sample, translated and multiplied. 37 | sample = input[0] 38 | inputRows = [] 39 | for dx in range(28): 40 | for dy in range(28): 41 | s = sample.copy() 42 | s = np.hstack((s[:, dy:], s[:, :dy])) 43 | s = np.vstack((s[dx:, :], s[:dx, :])) 44 | inputRows.append(s) 45 | input = np.array(inputRows) 46 | input = np.vstack([[input]*10]) 47 | input = np.random.permutation(input) 48 | input = input[::everyNth] 49 | input = input.astype(np.float32) 50 | 51 | if autoencoded: 52 | autoencoderFile = "../lasagne-demo/conv_ae.pkl" 53 | mu = 0.13045 54 | sigma = 0.30729 55 | ae = autoencoder.Autoencoder(cPickle.load(open(autoencoderFile, 'r')), mu=mu, sigma=sigma) 56 | ae.split() 57 | encodedInput = ae.encode(input.reshape((-1, 1, 28, 28))) 58 | assert encodedInput.shape[1] == 40 59 | # encodedInput = encodedInput.reshape((-1, 8, 5)) 60 | # print encodedInput.shape 61 | # return encodedInput, (8, 5) 62 | decodedInput = ae.decode(encodedInput) 63 | return decodedInput.reshape((-1, 28, 28)) , (28, 28) 64 | else: 65 | return input, (28, 28) 66 | 67 | def flattenImages(input): 68 | shape = input.shape 69 | assert len(shape) in (2, 3) 70 | if len(shape)==2: 71 | return input 72 | l, height, width = shape 73 | return input.reshape((l, height*width)) 74 | 75 | def faces(directory): 76 | imgs = [] 77 | height = None 78 | width = None 79 | for f in os.listdir(directory): 80 | if f.endswith(".jpg") or f.endswith(".png"): 81 | img = 
Image.open(os.path.join(directory, f)).convert("L") 82 | arr = np.array(img) 83 | if height is None: 84 | height, width = arr.shape 85 | else: 86 | assert (height, width) == arr.shape, "Bad size %s %s" % (f, str(arr.shape)) 87 | imgs.append(arr) 88 | input = np.array(imgs).astype(float) / 255 89 | np.random.seed(1) # TODO Not the right place to do this. 90 | input = np.random.permutation(input) 91 | return input, (height, width) 92 | 93 | def generateWave(n, height, width, waveCount): 94 | d = height*width 95 | phases = 2 * np.pi * np.random.uniform(size=n).astype(np.float32) 96 | rangeMat = np.zeros((n, d)).astype(np.float32) + np.linspace(start=0.0, stop=1.0, num=d).astype(np.float32) # broadcast, tiling rows 97 | phaseMat = np.zeros((n, d)).astype(np.float32) + phases[:, np.newaxis] # broadcast, tiling columns 98 | waves = (np.sin(rangeMat*(waveCount*2*np.pi) + phaseMat)+1.0)/2.0 99 | assert waves.dtype==np.float32 100 | assert np.sum(np.isnan(waves)) == 0 101 | return waves.reshape((n, height, width)) 102 | 103 | # Super ad hoc, but it shouldn't matter. 
def generatePlane(n, height, width):
    """n random linear intensity ramps: 0.5 + a*y + b*x with (a, b) ~ N(0, I),
    clipped to [0, 1]. Returns a float32 array of shape (n, height, width)."""
    normals = np.random.normal(size=(n, 2)).astype(np.float32)
    zeros = np.zeros((n, height, width)).astype(np.float32)
    # two-dim broadcasts:
    heightMat = zeros + np.linspace(start=-1.0, stop=1.0, num=height).astype(np.float32)[:, np.newaxis]
    widthMat = zeros + np.linspace(start=-1.0, stop=1.0, num=width).astype(np.float32)[np.newaxis, :]
    planes = heightMat*normals[:, 0][:, np.newaxis, np.newaxis] + widthMat*normals[:, 1][:, np.newaxis, np.newaxis] + 0.5
    np.clip(planes, 0.0, 1.0, planes)  # in-place clip into valid intensity range
    return planes

def generateOneClock(width):
    """One random 'clock' image: a filled wedge of ~1 radian with random
    start angle and random gray intensity, anchored at the image center."""
    data = np.zeros((width, width)).astype(np.float32)
    r = float(width/2)  # NOTE(review): width/2 is integer division on Python 2 -- presumably intended
    img = Image.fromarray(data)
    draw = ImageDraw.Draw(img)
    theta = random.uniform(0, 2*math.pi)
    intensity = random.uniform(0.0, 1.0)
    p = ((r, r),
         (r*(1+math.cos(theta)), r*(1+math.sin(theta))),
         (r*(1+math.cos(theta+1)), r*(1+math.sin(theta+1))))
    draw.polygon(p, fill=intensity)
    return np.asarray(img)

def generateClock(n, height, width):
    """n random clock images; only square canvases are supported."""
    assert height == width
    return np.array([generateOneClock(width) for i in range(n)])

def generateOneDot(width):
    # NOTE(review): this body is a verbatim copy of generateOneClock -- it
    # draws a wedge, not a dot. Kept equivalent to preserve behavior; confirm
    # whether a real dot generator was intended.
    data = np.zeros((width, width)).astype(np.float32)
    r = float(width/2)
    img = Image.fromarray(data)
    draw = ImageDraw.Draw(img)
    theta = random.uniform(0, 2*math.pi)
    intensity = random.uniform(0.0, 1.0)
    p = ((r, r),
         (r*(1+math.cos(theta)), r*(1+math.sin(theta))),
         (r*(1+math.cos(theta+1)), r*(1+math.sin(theta+1))))
    draw.polygon(p, fill=intensity)
    return np.asarray(img)

def generateSine(n, height, width):
    """n images whose flattened pixel sequence traces a sine of random
    wavelength and offset; values scaled into [0, 1]."""
    waveLength = height*np.pi
    parameters = np.random.uniform(low=waveLength*1.2, high=waveLength*1.8, size=(n, 2)).astype(np.float32)
    data = np.zeros((n, height*width)).astype(np.float32)
    data += np.linspace(start=0.0, stop=height*width-1, num=height*width).astype(np.float32)[np.newaxis, :]
    data /= parameters[:, 1][:, np.newaxis]
    data += parameters[:, 0][:, np.newaxis] * 10
    data = (np.sin(data)+1.0) / 2
    return data.reshape((n, height, width))

def generate1DUniform(n):
    """n scalars uniform on [-1, 1]; shape (n, 1), float32."""
    return np.random.uniform(low=-1, high=+1, size=(n, 1)).astype(np.float32)

def generate1DTriangle(n):
    """n scalars with density rising linearly on [0, 1] (max of two
    independent uniforms); shape (n, 1)."""
    bi = np.random.uniform(low=0, high=1, size=(n, 2)).astype(np.float32)
    data = np.max(bi, axis=-1, keepdims=True)
    assert data.shape==(n, 1)
    return data

def generate2DCircle(n):
    """n points uniform on the unit disk, via rejection sampling."""
    slacked = 2*n+100  # oversample: acceptance rate is pi/4, so this suffices w.h.p.
    cartesian = np.random.rand(slacked, 2).astype(np.float32)
    cartesian *= 2
    cartesian -= 1
    cartesian = cartesian[np.sum(cartesian*cartesian, axis=1)<1, :]
    assert len(cartesian)>=n
    cartesian = cartesian[:n, :]

    # import matplotlib.pyplot as plt
    # plt.scatter(cartesian[:,0], cartesian[:,1])
    # plt.savefig("circle.pdf")
    # plt.close()

    return cartesian


def generate2DHalfcircle(n):
    """n points uniform on the right half of the unit disk (x in [0, 1))."""
    slacked = 2*n+100
    cartesian = np.random.rand(slacked, 2).astype(np.float32)
    cartesian[:, 1] *= 2
    cartesian[:, 1] -= 1
    cartesian = cartesian[np.sum(cartesian*cartesian, axis=1)<1, :]
    assert len(cartesian)>=n
    cartesian = cartesian[:n, :]
    return cartesian


# Maps params.inputType to [generator function, names of extra params it needs].
GENERATOR_FUNCTIONS = {"wave": [generateWave, ["waveCount"]],
                       "plane": [generatePlane, []],
                       "clock": [generateClock, []],
                       "sine": [generateSine, []],
                       "1d.uniform": [generate1DUniform, []],
                       "1d.triangle": [generate1DTriangle, []],
                       "2d.circle": [generate2DCircle, []],
                       "2d.halfcircle": [generate2DHalfcircle, []]
                       }

def readData(params):
    """Load (train, validation) data according to params.inputType.

    Returns a pair (data, validation): image-like types have shape
    (sampleCount, height, width), isLowDim generators (sampleCount, dim)."""
    if params.inputType=="image":
        data, (height, width) = faces(params.imageDirectory)
        n = len(data)
        # Floor division: identical to 9*n/10 on Python 2 ints, safe on Python 3.
        trainSize = 9*n//10
        validation = data[trainSize:]
        data = data[:trainSize]
    elif params.inputType=="mnist":
        autoencoded = params.get("autoencoded", False)
        data, (height, width) = mnist(params.inputDigit, which='train', everyNth=params.everyNthInput, autoencoded=autoencoded)
        # NOTE(review): validation is not subsampled by everyNth -- confirm intent.
        validation, (_, _) = mnist(params.inputDigit, which='validation', autoencoded=autoencoded)
    elif params.inputType in GENERATOR_FUNCTIONS.keys():
        generatorFunction, argNames = GENERATOR_FUNCTIONS[params.inputType]
        arguments = { argName: params[argName] for argName in argNames }

        isLowDim = "isLowDim" in params and params.isLowDim
        if isLowDim:
            assert "height" not in params and "width" not in params, "For isLowDim==True, height and width params are meaningless."
            data = generatorFunction(params.trainSize, **arguments)
            validation = generatorFunction(params.validSize, **arguments)
        else:
            height, width = params.height, params.width
            data = generatorFunction(params.trainSize, height, width, **arguments)
            validation = generatorFunction(params.validSize, height, width, **arguments)
    else:
        assert False, "unknown params.inputType %s" % params.inputType
    if "height" in params or "width" in params:
        assert (params.height == height) and (params.width == width), "%d!=%d or %d!=%d" % (params.height, height, params.width, width)
    return data, validation


def dumpParams(params, f):
    """Write params sorted by key, one tab-separated key/value pair per line.

    f.write() replaces the Python-2-only `print >>f` statement; the emitted
    bytes are identical."""
    for k in sorted(params.keys()):
        f.write(k + "\t" + str(params[k]) + "\n")

def heuristicCast(s):
    s = s.strip() # Don't let some stupid whitespace fool you.
234 | if s=="None": 235 | return None 236 | elif s=="True": 237 | return True 238 | elif s=="False": 239 | return False 240 | try: 241 | return int(s) 242 | except ValueError: 243 | pass 244 | try: 245 | return float(s) 246 | except ValueError: 247 | pass 248 | return s 249 | 250 | def paramsFromConf(f): 251 | params = AttrDict() 252 | for l in f: 253 | if l.startswith("#"): 254 | continue 255 | try: 256 | k, v = l.strip("\n").split("\t") 257 | except: 258 | assert False, "Malformed config line " + l.strip() 259 | try: 260 | v = heuristicCast(v) 261 | except ValueError: 262 | assert False, "Malformed parameter value " + v 263 | params[k] = v 264 | return params 265 | -------------------------------------------------------------------------------- /nnbase/layers.py: -------------------------------------------------------------------------------- 1 | from lasagne import layers 2 | 3 | class Unpool2DLayer(layers.Layer): 4 | """ 5 | This layer performs unpooling over the last two dimensions 6 | of a 4D tensor. 
7 | """ 8 | def __init__(self, incoming, ds, **kwargs): 9 | 10 | super(Unpool2DLayer, self).__init__(incoming, **kwargs) 11 | 12 | if (isinstance(ds, int)): 13 | raise ValueError('ds must have len == 2') 14 | else: 15 | ds = tuple(ds) 16 | if len(ds) != 2: 17 | raise ValueError('ds must have len == 2') 18 | if ds[0] != ds[1]: 19 | raise ValueError('ds should be symmetric (I am lazy)') 20 | self.ds = ds 21 | 22 | def get_output_shape_for(self, input_shape): 23 | output_shape = list(input_shape) 24 | 25 | output_shape[2] = input_shape[2] * self.ds[0] 26 | output_shape[3] = input_shape[3] * self.ds[1] 27 | 28 | return tuple(output_shape) 29 | 30 | def get_output_for(self, input, **kwargs): 31 | ds = self.ds 32 | input_shape = input.shape 33 | output_shape = self.get_output_shape_for(input_shape) 34 | return input.repeat(2, axis=2).repeat(2, axis=3) 35 | -------------------------------------------------------------------------------- /nnbase/shape.py: -------------------------------------------------------------------------------- 1 | # See https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.py#L16-L18 2 | 3 | import numpy as np 4 | 5 | from lasagne.theano_extensions import padding 6 | 7 | from lasagne.layers import Layer 8 | 9 | 10 | __all__ = [ 11 | "FlattenLayer", 12 | "flatten", 13 | "ReshapeLayer", 14 | "reshape", 15 | "DimshuffleLayer", 16 | "dimshuffle", 17 | "PadLayer", 18 | "pad", 19 | ] 20 | 21 | 22 | class FlattenLayer(Layer): 23 | def get_output_shape_for(self, input_shape): 24 | return (input_shape[0], int(np.prod(input_shape[1:]))) 25 | 26 | def get_output_for(self, input, **kwargs): 27 | return input.flatten(2) 28 | 29 | flatten = FlattenLayer # shortcut 30 | 31 | 32 | class ReshapeLayer(Layer): 33 | """ 34 | A layer reshaping its input tensor to another tensor of the same total 35 | number of elements. 
36 | 37 | :parameters: 38 | - incoming : a :class:`Layer` instance or a tuple 39 | the layer feeding into this layer, or the expected input shape 40 | 41 | - shape : tuple 42 | The target shape specification. Any of its elements can be `[i]`, 43 | a single-element list of int, denoting to use the size of the ith 44 | input dimension. At most one element can be `-1`, denoting to 45 | infer the size for this dimension to match the total number of 46 | elements of the input tensor. Any remaining elements must be 47 | positive integers directly giving the size of the corresponding 48 | dimension. 49 | 50 | :usage: 51 | >>> from lasagne.layers import InputLayer, ReshapeLayer 52 | >>> l_in = InputLayer((None, 100, 20)) 53 | >>> l1 = ReshapeLayer(l_in, ([0], [1], 2, 10)) 54 | >>> l1.get_output_shape() 55 | (None, 100, 2, 10) 56 | >>> l2 = ReshapeLayer(l_in, ([0], 1, 2, 5, -1)) 57 | >>> l2.get_output_shape() 58 | (None, 1, 2, 5, 200) 59 | 60 | :note: 61 | The tensor elements will be fetched and placed in C-like order. That 62 | is, reshaping `[1,2,3,4,5,6]` to shape `(2,3)` will result in a matrix 63 | `[[1,2,3],[4,5,6]]`, not in `[[1,3,5],[2,4,6]]` (Fortran-like order), 64 | regardless of the memory layout of the input tensor. For C-contiguous 65 | input, reshaping is cheap, for others it may require copying the data. 
66 | """ 67 | 68 | def __init__(self, incoming, shape, **kwargs): 69 | super(ReshapeLayer, self).__init__(incoming, **kwargs) 70 | shape = tuple(shape) 71 | for s in shape: 72 | if isinstance(s, int): 73 | if s == 0 or s < - 1: 74 | raise ValueError("`shape` integers must be positive or -1") 75 | elif isinstance(s, list): 76 | if len(s) != 1 or not isinstance(s[0], int) or s[0] < 0: 77 | raise ValueError("`shape` input references must be " 78 | "single-element lists of int >= 0") 79 | else: 80 | raise ValueError("`shape` must be a tuple of int and/or [int]") 81 | if sum(s == -1 for s in shape) > 1: 82 | raise ValueError("`shape` cannot contain multiple -1") 83 | self.shape = shape 84 | 85 | def get_output_shape_for(self, input_shape, **kwargs): 86 | # Initialize output shape from shape specification 87 | output_shape = list(self.shape) 88 | # First, replace all `[i]` with the corresponding input dimension, and 89 | # mask parts of the shapes thus becoming irrelevant for -1 inference 90 | masked_input_shape = list(input_shape) 91 | masked_output_shape = list(output_shape) 92 | for dim, o in enumerate(output_shape): 93 | if isinstance(o, list): 94 | if o[0] >= len(input_shape): 95 | raise ValueError("specification contains [%d], but input " 96 | "shape has %d dimensions only" % 97 | (o[0], len(input_shape))) 98 | output_shape[dim] = input_shape[o[0]] 99 | masked_output_shape[dim] = input_shape[o[0]] 100 | if (input_shape[o[0]] is None) \ 101 | and (masked_input_shape[o[0]] is None): 102 | # first time we copied this unknown input size: mask 103 | # it, we have a 1:1 correspondence between out[dim] and 104 | # in[o[0]] and can ignore it for -1 inference even if 105 | # it is unknown. 
106 | masked_input_shape[o[0]] = 1 107 | masked_output_shape[dim] = 1 108 | # From the shapes, compute the sizes of the input and output tensor 109 | input_size = (None if any(x is None for x in masked_input_shape) 110 | else np.prod(masked_input_shape)) 111 | output_size = (None if any(x is None for x in masked_output_shape) 112 | else np.prod(masked_output_shape)) 113 | del masked_input_shape, masked_output_shape 114 | # Finally, infer value for -1 if needed 115 | if -1 in output_shape: 116 | dim = output_shape.index(-1) 117 | if (input_size is None) or (output_size is None): 118 | output_shape[dim] = None 119 | output_size = None 120 | else: 121 | output_size *= -1 122 | output_shape[dim] = input_size // output_size 123 | output_size *= output_shape[dim] 124 | # Sanity check 125 | if (input_size is not None) and (output_size is not None) \ 126 | and (input_size != output_size): 127 | raise ValueError("%s cannot be reshaped to specification %s. " 128 | "The total size mismatches." % 129 | (input_shape, self.shape)) 130 | return tuple(output_shape) 131 | 132 | def get_output_for(self, input, **kwargs): 133 | # Replace all `[i]` with the corresponding input dimension 134 | output_shape = list(self.shape) 135 | for dim, o in enumerate(output_shape): 136 | if isinstance(o, list): 137 | output_shape[dim] = input.shape[o[0]] 138 | # Everything else is handled by Theano 139 | return input.reshape(tuple(output_shape)) 140 | 141 | reshape = ReshapeLayer # shortcut 142 | 143 | 144 | class DimshuffleLayer(Layer): 145 | """ 146 | A layer that rearranges the dimension of its input tensor, maintaining 147 | the same same total number of elements. 148 | 149 | :parameters: 150 | - incoming : a :class:`Layer` instance or a tuple 151 | the layer feeding into this layer, or the expected input shape 152 | 153 | - pattern : tuple 154 | The new dimension order, with each element giving the index 155 | of the dimension in the input tensor or `'x'` to broadcast it. 
156 | For example `(3,2,1,0)` will reverse the order of a 4-dimensional 157 | tensor. Use `'x'` to broadcast, e.g. `(3,2,1,'x',0)` will 158 | take a 4 tensor of shape `(2,3,5,7)` as input and produce a 159 | tensor of shape `(7,5,3,1,2)` with the 4th dimension being 160 | broadcast-able. In general, all dimensions in the input tensor 161 | must be used to generate the output tensor. Omitting a dimension 162 | attempts to collapse it; this can only be done to broadcast-able 163 | dimensions, e.g. a 5-tensor of shape `(7,5,3,1,2)` with the 4th 164 | being broadcast-able can be shuffled with the pattern `(4,2,1,0)` 165 | collapsing the 4th dimension resulting in a tensor of shape 166 | `(2,3,5,7)`. 167 | 168 | :usage: 169 | >>> from lasagne.layers import InputLayer, DimshuffleLayer 170 | >>> l_in = InputLayer((2, 3, 5, 7)) 171 | >>> l1 = DimshuffleLayer(l_in, (3, 2, 1, 'x', 0)) 172 | >>> l1.get_output_shape() 173 | (7, 5, 3, 1, 2) 174 | >>> l2 = DimshuffleLayer(l1, (4, 2, 1, 0)) 175 | >>> l2.get_output_shape() 176 | (2, 3, 5, 7) 177 | """ 178 | def __init__(self, incoming, pattern, **kwargs): 179 | super(DimshuffleLayer, self).__init__(incoming, **kwargs) 180 | 181 | # Sanity check the pattern 182 | used_dims = set() 183 | for p in pattern: 184 | if isinstance(p, int): 185 | # Dimension p 186 | if p in used_dims: 187 | raise ValueError("pattern contains dimension {0} more " 188 | "than once".format(p)) 189 | used_dims.add(p) 190 | elif p == 'x': 191 | # Broadcast 192 | pass 193 | else: 194 | raise ValueError("pattern should only contain dimension" 195 | "indices or 'x', not {0}".format(p)) 196 | 197 | self.pattern = pattern 198 | 199 | def get_output_shape_for(self, input_shape): 200 | # Build output shape while keeping track of the dimensions that we are 201 | # attempting to collapse, so we can ensure that they are broadcastable 202 | output_shape = [] 203 | dims_used = [False] * len(input_shape) 204 | for p in self.pattern: 205 | if isinstance(p, int): 206 | if p < 
class PadLayer(Layer):
    """Pads all non-batch dimensions of the input with a constant value.

    :parameters:
        - incoming : the :class:`Layer` feeding into this layer
        - width : int
            number of cells padded on each border of each padded dimension
        - val : padding value (default 0)
        - batch_ndim : int
            dimensions before this index are left unpadded (default 2,
            i.e. batch and channel dimensions pass through)
    """
    def __init__(self, incoming, width, val=0, batch_ndim=2, **kwargs):
        super(PadLayer, self).__init__(incoming, **kwargs)
        self.width = width
        self.val = val
        self.batch_ndim = batch_ndim

    def get_output_shape_for(self, input_shape):
        # Grow each padded dimension by 2*width; batch dimensions and
        # unknown (None) sizes pass through unchanged.
        # BUG FIX: previously a None entry (symbolic/unknown size, which
        # Lasagne shape tuples allow) crashed with a TypeError on
        # `None + 2 * self.width`.
        output_shape = []
        for k, s in enumerate(input_shape):
            if k < self.batch_ndim or s is None:
                output_shape.append(s)
            else:
                output_shape.append(s + 2 * self.width)
        return tuple(output_shape)

    def get_output_for(self, input, **kwargs):
        # Delegate the actual symbolic padding to the padding helper.
        return padding.pad(input, self.width, self.val, self.batch_ndim)
class FlipBatchIterator(BatchIterator):
    """Batch iterator that augments each minibatch by flipping a random
    half of the images horizontally and a random half vertically (the two
    halves may overlap), artificially enlarging the training set.
    """

    def transform(self, X1, X2):
        # X1 is the image batch (N, C, H, W); X2 is flattened targets that
        # mirror X1, so we reshape it to apply the same flips to both.
        X1b, X2b = super(FlipBatchIterator, self).transform(X1, X2)
        X2b = X2b.reshape(X1b.shape)

        bs = X1b.shape[0]
        # BUG FIX: `bs / 2` is float division under Python 3 and
        # np.random.choice rejects a float size; `bs // 2` is identical
        # under Python 2 and correct under Python 3.
        h_indices = np.random.choice(bs, bs // 2, replace=False)  # horizontal flip
        v_indices = np.random.choice(bs, bs // 2, replace=False)  # vertical flip

        ### uncomment these lines if you want to include rotations (images must be square) ###
        #r_indices = np.random.choice(bs, bs // 2, replace=False)  # 90 degree rotation
        for X in (X1b, X2b):
            X[h_indices] = X[h_indices, :, :, ::-1]
            X[v_indices] = X[v_indices, :, ::-1, :]
            #X[r_indices] = np.swapaxes(X[r_indices, :, :, :], 2, 3)

        # Flatten the targets back to their original 2D layout.
        shape = X2b.shape
        X2b = X2b.reshape((shape[0], -1))

        return X1b, X2b
def get_picture_array_simple(X, height, width, index):
    """Convert sample `index` of X into a 4x-upscaled uint8 image array.

    X holds samples whose rows reshape to (height, width), with values
    roughly in [0, 1]. The sample is scaled to [0, 255], clipped, and
    upscaled 4x along both axes by nearest-neighbor repetition.

    Returns a (4*height, 4*width) uint8 ndarray.
    """
    array = X[index].reshape((height, width))
    array = np.clip(array * 255, a_min=0, a_max=255)
    # BUG FIX: astype wants the dtype np.uint8, not np.uint8() — the
    # latter is a zero-dimensional scalar, not a dtype specifier, and is
    # rejected by modern numpy.
    return array.repeat(4, axis=0).repeat(4, axis=1).astype(np.uint8)
def get_picture_array(X, n_x, n_y, name):
    # Render the samples in X as one tiled n_x-by-n_y grid image and save
    # it as <name>.png. Grid layout is delegated to get_numpy_picture_array.
    image_data = get_numpy_picture_array(X, n_x, n_y)
    img = Image.fromarray(image_data)
    img.save(name+".png")
32 | 33 | 34 | # Back at digits again, playing with larger n: 35 | # exp.20dCubeMixture.2layerTanh.n100.digit7 is a stupid mistake. Both 36 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit7 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit7 37 | # and 38 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit7 300 > cout.exp.20dCubeMixture.2layerTanh.n300.digit7 39 | # were pointing to this dir. The n300 started later, so it's overwritten the other, except for 40 | # the latest images, *101700.png - *102800.png. n300 is definitely worse, 41 | # more prone to forked lines. Why? 42 | 43 | # Now running: 44 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit3 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit3 45 | python generative-mlp.py exp.20dGaussian.2layerTanh.n300.digit3 300 > cout.exp.20dGaussian.2layerTanh.n300.digit3 46 | # UPDATE: Dumb me, that's n300 right there. 47 | # Have to do Gaussian again with n100. See below. 48 | 49 | # The filenames tell all, hopefully. For the record: 50 | # 100 hidden units, learning_rate=0.02, momentum=0.5 51 | # scale of normal distribution 1/4, findGenForData True, overSamplingFactor 1. 52 | 53 | # UPDATE: Disregard this paragraph, it compares gauss.n300 to mixture.n100. 54 | # -> After some 2000 epochs, the main difference is that mixture does the forks, 55 | # gauss doesn't, but gauss is super non-diverse. 56 | # After some 10000-30000 epochs (pretty subjective when) mixture stops doing the forks. 57 | # The weirdest is that around here, gauss starts the fork thing, while still not 58 | # being as diverse as mixture. All in all, it's objectively worse. 59 | 60 | # UPDATE: Apples to apples aka n100 to n100 comparison 61 | # between mixture and gauss. 62 | # and also between gauss.n300 and gauss.n100. 63 | python generative-mlp.py exp.20dGaussian.2layerTanh.n100.digit3 100 > cout.exp.20dGaussian.2layerTanh.n100.digit3 64 | # -> EVALUATE! 
65 | 66 | # Okay, let's go all in, how about getting rid of the continuous component? 67 | python generative-mlp.py exp.20dBoolean.2layerTanh.n100.digit3 100 > cout.exp.20dBoolean.2layerTanh.n100.digit3 68 | python generative-mlp.py exp.50dBoolean.2layerTanh.n100.digit3 100 > cout.exp.50dBoolean.2layerTanh.n100.digit3 69 | # -> EVALUATE! 70 | 71 | # Does this fully-boolean-craziness work with more diverse data as well? 72 | python generative-mlp.py exp.50dBoolean.2layerTanh.n100.digitAll 100 > cout.exp.50dBoolean.2layerTanh.n100.digitAll 73 | # -> EVALUATE! 74 | 75 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit2 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit2 76 | python generative-mlp.py exp.50dCubeMixture.2layerTanh.n100.digit2 100 > cout.exp.50dCubeMixture.2layerTanh.n100.digit2 77 | # -> Waiting for results. EVALUATE! 78 | 79 | # Lots of work done on quantifying generation performance. 80 | # We sample train and validation (unseen), greedily approximate them with generated samples, 81 | # and quantify/visualize difference between gold and nearest generated (surrogate). 82 | # Specifically, we log total L2 diff on train and valid, 83 | # we visualize difference, and histogram L2 distances between gold and surrogate. 84 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis 85 | 86 | cd exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis 87 | dir=diff_validation ; convert input.png $dir[1-9]000.png $dir[0-9][0-9]000.png $dir[0-9][0-9][0-9]000.png -delay 10 -loop 0 $dir.gif 88 | # (inputs.png is only there as an almost subliminal signal to mark the beginning of the sequence.) 89 | 90 | # -> ANALYZE A BIT MORE, but at first glance, it seem like it does not 91 | # really converge after an initial phase. It's very adamant in NOT 92 | # learning outliers. If it does not like something, it consistently 93 | # behaves like it were not there. Why? 
94 | 95 | # For all digits, sampleTotal1e5 (as above): 96 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis 97 | # Same but sampleTotal1e6, let's give the model a bit more chance to reproduce weird things: 98 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis.sampleTotal1e6 100 > cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis.sampleTotal1e6 99 | # -> Setting plotEach=1000 was dumb here, but we'll live with it. 100 | 101 | # Visually inspecting the above two, it seems like sampleTotal1e6 over sampleTotal1e5 102 | # causes just a tiny improvement in matching. (1. When the general shape is 103 | # recognised, the details are similar. 2. It's rare that 1e6 recognizes the 104 | # general shape while 1e5 does not.) 105 | 106 | # Quantitatively: 107 | paste <( grep t < cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis) <( grep t < cout.exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis.sampleTotal1e6) | grep -v time | sed "s/_distance//g" 108 | epoch 0 train 3539.577401 validation 3477.146151 epoch 0 train 3522.447895 validation 3458.440632 109 | epoch 1000 train 2213.074515 validation 2232.757161 epoch 1000 train 2185.971621 validation 2206.666417 110 | epoch 2000 train 2142.505005 validation 2149.576490 epoch 2000 train 2107.168020 validation 2124.794779 111 | epoch 3000 train 2135.671001 validation 2104.446155 epoch 3000 train 2045.129582 validation 2079.306816 112 | epoch 4000 train 2067.423567 validation 2073.011328 epoch 4000 train 2034.931709 validation 2050.239458 113 | epoch 5000 train 2073.096339 validation 2053.568913 epoch 5000 train 2020.982528 validation 2032.500311 114 | epoch 6000 train 2030.102489 validation 2039.459535 epoch 6000 train 2024.169692 validation 2019.274401 115 | epoch 7000 train 2009.205883 validation 2029.150878 epoch 7000 train 1999.691904 validation 2010.084477 116 | epoch 8000 train 2030.954336 
validation 2016.807750 epoch 8000 train 2008.445006 validation 2000.994180 117 | epoch 9000 train 1996.428312 validation 2007.845411 epoch 9000 train 1975.903737 validation 1992.986778 118 | epoch 10000 train 2004.636604 validation 2001.809792 epoch 10000 train 1971.208121 validation 1988.213054 119 | epoch 11000 train 1970.277075 validation 1996.890203 epoch 11000 train 1941.435286 validation 1983.694747 120 | epoch 12000 train 1991.091591 validation 1990.768504 epoch 12000 train 1959.057641 validation 1980.160241 121 | epoch 13000 train 1951.329268 validation 1986.618364 epoch 13000 train 1965.147074 validation 1974.976362 122 | epoch 14000 train 1971.516946 validation 1982.143524 epoch 14000 train 1941.614413 validation 1972.079845 123 | epoch 15000 train 2034.801743 validation 1982.433723 epoch 15000 train 1988.605444 validation 1968.329759 124 | epoch 16000 train 1962.617783 validation 1976.536872 epoch 16000 train 1944.039658 validation 1965.219307 125 | epoch 17000 train 1958.752054 validation 1974.458151 epoch 17000 train 1942.624157 validation 1962.432814 126 | epoch 18000 train 1969.002308 validation 1972.265950 epoch 18000 train 1919.335813 validation 1960.542414 127 | epoch 19000 train 1973.435493 validation 1971.694208 epoch 19000 train 1948.049374 validation 1957.739240 128 | epoch 20000 train 1949.355630 validation 1968.837136 epoch 20000 train 1965.053454 validation 1955.413557 129 | epoch 21000 train 1951.518872 validation 1967.355088 epoch 21000 train 1949.744274 validation 1952.960039 130 | # The validation set is fixed, the train is a random 400 sample of whole train, 131 | # that's why train is much more jumpy. But both are still converging, 132 | # after 21000 epochs 27 hours. Slooow convergence, zero overfitting. 133 | 134 | # Let's take a look at digit2, it had more time for less data. 
135 | cat cout.exp.20dCubeMixture.2layerTanh.n100.digit2.moreVis | grep train | grep "[24680]0000 " | less 136 | epoch 20000 train_distance 2101.250673 validation_distance 2120.222100 137 | epoch 40000 train_distance 2029.797620 validation_distance 2046.173394 138 | epoch 60000 train_distance 1974.057262 validation_distance 2019.035806 139 | epoch 80000 train_distance 1988.527564 validation_distance 1998.971389 140 | epoch 100000 train_distance 1922.723794 validation_distance 1985.801420 141 | epoch 120000 train_distance 1947.624636 validation_distance 1976.859255 142 | epoch 140000 train_distance 1968.396235 validation_distance 1971.583964 143 | epoch 160000 train_distance 1936.777050 validation_distance 1968.063181 144 | epoch 180000 train_distance 1942.150860 validation_distance 1963.848308 145 | epoch 200000 train_distance 1927.290592 validation_distance 1962.477934 146 | epoch 220000 train_distance 1937.469074 validation_distance 1961.097084 147 | epoch 240000 train_distance 1913.067156 validation_distance 1959.697684 148 | epoch 260000 train_distance 1932.563189 validation_distance 1957.309960 149 | # -> Still converging, but the rate is worthless now, after some 29 hours. 150 | 151 | #### 152 | 153 | # TODO With our new evaluation weapons, let's re-attack the issue of 154 | # how to assign surrogates and samples to each other. 155 | # The most important is to take a second look at m. 156 | # (Now that we moved theano graph compilation out of the inner loop.) 157 | # Remember, m is the number of generated samples to choose from 158 | # when finding pairs to n gold points. 159 | # The learning rate should also be checked. 160 | # Things like findGenForData, overSamplingFactor, and maybe 161 | # an epoch-dependent n (minibatch sampling size) or learning rate (a la Kohonen). 162 | 163 | #### 164 | 165 | # Okay, focusing on m. 
At the early phase of training, it's probably 166 | # not smart to have a large m, as that means that many generated point 167 | # stay at their bad place. Further in the training, it is smart, 168 | # as that helps to learn small details. 169 | # That's theorizing, but let's start with something super simple: m=1000 n=100. 170 | 171 | # From now on, moreVis is taken as given, so the parent exp of 172 | # exp.20dCubeMixture.2layerTanh.n100.m10000.digitAll 173 | # is exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis (the one with sampleTotal1e5) 174 | # diff: plotEach=100 (was 1000), m=1000 (was m=n=100). 175 | python generative-mlp.py exp.20dCubeMixture.2layerTanh.n100.m1000.digitAll 100 > cout.exp.20dCubeMixture.2layerTanh.n100.m1000.digitAll 176 | # -> For some reason, this is so slow that it doesn't even make sense to 177 | # compare it with its parent exp.20dCubeMixture.2layerTanh.n100.digitAll.moreVis. 178 | # 680 epochs in 6 hours, versus the old 26000 epochs in 31 hours, 10000 epochs per 12 hours. 179 | # And these new 680-epoch are comparable in quality to the old 2000-epoch results, achieved in 180 | # just 2.5 hours. 181 | # Maybe the ridiculous slowness is just a bug, but let's postpone figuring this out 182 | # after making autoencoder work. 183 | 184 | #### 185 | 186 | # Spearmint-ize 187 | # (Spearmint because I couldn't figure out how to use kwargs with hyperopt) 188 | 189 | # Had to rename generative-mlp.py to generativeMLP.py 190 | # so that it can be imported as a module. 191 | 192 | # We are in the lasagne-demo venv. 193 | brew install mongodb 194 | pip install pymongo 195 | git clone git@github.com:HIPS/Spearmint.git 196 | pip install -e Spearmint 197 | mkdir mongodb 198 | cd mongodb 199 | mongod --fork --logpath ./log --dbpath . 200 | cd .. 201 | mkdir spearmintOutput 202 | 203 | # Reads config.json which references spearmintTask.py 204 | # and writes to a directory named "output". 
205 | # Also spearmintTask.py is set up so that it creates directories for each 206 | # job, dirname ./spearmintOutput/LR0.010-n10 or such. 207 | python Spearmint/spearmint/main.py . 208 | 209 | # Cleanup command resets experiment: 210 | Spearmint/spearmint/cleanup.sh . 211 | # TODO This . was not intended for this, there should be a proper subdir for it. 212 | 213 | # It's running now. spearmintOutput/log.cerr is where the current best is visible. 214 | 215 | # -> Stopped, it has basically converged. Moved everything to spearmintExps/epoch200 216 | # I copied the config.json there as well. 217 | # Turns out the best is the maximal allowed inDim 50 and the maximal allowed minibatchSize (!) 100. 218 | # (We've seen smaller minibatchSizes to be better when inDim was small, haven't we? Not sure anymore.) 219 | # Learning rate converged to ~10, it was constrained to [0.2, 50]. 220 | 221 | Spearmint/spearmint/cleanup.sh . 222 | python Spearmint/spearmint/main.py . > spearmintOutput/log.cout 2> spearmintOutput/log.cerr 223 | # - logs of individual runs are in spearmintOutput/*/log.txt 224 | # - spearmint current best is in spearmintOutput/log.cerr 225 | # - jobs are logged in ./output/*. It's really only useful for two things: 226 | # it has cerrs of my jobs, and it has the running times. 227 | # - if we want to graph or something, mongodb is the way, dbname is in config.json 228 | 229 | # Hideous but mostly harmless hack while I learn to query the mongodb or write better logging: 230 | grep final spearmintOutput/*/log.txt | sed "s/\/log.txt:final performance / /" | sed "s/spearmintOutput\///" | tr ' -' '\t' | awk '{ print $NF "\t" $0 }' | sort -n 231 | 232 | # TODOS 233 | # - run the current best for a large number of epochs. 234 | # - tune the last important untuned parameter: the variance of the input gaussian, 235 | # or more generally, the input distribution. (value is not very sensitive to inDim, 236 | # so we might as well fix it as small.) 
237 | # - figure out a metric that punishes memorizing samples. 238 | # - log running times in log.txt. maybe we can play tricks with taking 239 | # the median of epoch runtimes instead of sum, that would approximate CPU time pretty well. 240 | # - save git sha + diff in exp dir. 241 | # - revive the mainLowDim codepath. 242 | 243 | # Which conf is currently the best? 244 | grep final spearmintOutput/*/log.txt | awk '{ print $NF,$0 }' | grep -v "^nan" | sort -n | head -1 | sed "s/log\.txt.*/conf.txt/" 245 | open `grep final spearmintOutput/*/log.txt | awk '{ print $NF,$0 }' | sort -n | grep -v "^nan" | head -10 | cut -f2 -d' ' | cut -f1 -d':' | sed "s/log\.txt/s400.png/"` 246 | # -> Visually, some of them are more perfect but less diverse, 247 | # some of them are varying a lot in brightness, 248 | # TODO which parameters influence these? 249 | 250 | # Seems like epoch200 and epoch400 does not tell much about the later convergence 251 | # properties of a param-setting. How about epoch1600? 252 | 253 | # I took the current best, rounded the params a bit, and the result is 254 | # deepDives/conf1.txt 255 | # output is deepDives/conf1-hls200-inDim20-lr10-mom0.6-n300-os4.0 256 | # The above is the general workflow: Take promising confs, tune them, 257 | # set expName to deepDives/confN-DETAILED_DESCRIPTION_PATH, 258 | # put them into deepDives/confN.txt, add that to git. 259 | # When it has run, maybe add final round output to git as well. 260 | 261 | # As seen on 262 | # https://docs.google.com/spreadsheets/d/1IWE7_Xeh81Pa9MgaV2QsDKJSHYmkQjBQring_xdC3CY/edit#gid=0 263 | # , overfitting kicks in a epoch12000. 264 | # epoch TMean TMedian VMean VMedian (10-moving averages) 265 | # 12000 4.1544335 4.2569459 4.19644 4.22872 266 | # 87200 4.1218603 4.227801 4.2057856 4.2594421 267 | 268 | # conf2 is same as conf1 except for the smaller learning rate 10->1. 269 | # Surprisingly the convergence is not that much slower. 
270 | # 271 | # Also surprisingly, it seems like it will never reach conf1 accuracy. 272 | # (conf2 vmean settling near 4.27 at epoch24000 but already at 4.28 at epoch1000. 273 | # while conf1 vmean stopped at 4.19 at epoch14000.) 274 | 275 | # I fixed a visualization UX issue: s*.png are now generated from 276 | # the same random numbers, so that they form an animation. 277 | # The flipside is the we now see a smaller part of the generated space. 278 | # spearmintExps/epoch1600/output/00000136.out aka 279 | # spearmintExps/epoch1600/hls117-inDim88-lr2.12805643612-mom0.647445989185-n300-os3.9988942992 280 | # is the first such one. 281 | 282 | mv spearmintOutput spearmintExps/epoch1600 283 | mv output spearmintExps/epoch1600/ 284 | 285 | # Let's try conf1 with layerNum=3, and call it conf3. 286 | # ...Wow. That's amazing. At vmean 4.12 at epoch2800. 287 | # Maybe only the bigger number of parameters? Should check. 288 | 289 | # conf3 vmean plateaued between epoch4400 and epoch12000 at 4.10, 290 | # and then slowly crawled up to 4.13. 291 | 292 | # The new spearmint run epochCount4800_depth3_useReLUFalse_everyNthInput10 293 | # runs on commit 414fb5df9d8bec71f1c05ae199f6f891ca3a5cb1. 294 | # It is different from the parent epochCount1600_useReLUFalse_everyNthInput10 295 | # in the following ways: 296 | # layerNum 2 -> 3, epoch 1600 -> 4800, plotEach 400 -> 800 297 | # learningRate.max 200.0 -> 20.0. 298 | # indim (20,100) -> (10,50) 299 | # and uses vmean instead of vmedian as value. 300 | # Be careful when you compare with the previous spearmint run's vmedians. 301 | # (They move together anyway, but vmedian is super bumpy. A typical difference between the two: 302 | # vmedian is 0.06 larger than vmean, regardless of the current epoch.) 303 | 304 | mkdir spearmintOutput 305 | python Spearmint/spearmint/main.py . 
> spearmintOutput/log.cout 2> spearmintOutput/log.cerr 306 | 307 | # Weirdly, its top contender after 9 runs, 308 | # 4.489997 spearmintOutput/hls300-inDim10-lr6.34021189647-mom0.5-n300-os3.58636953526 309 | # has parameters quite similar to 310 | # 4.106030 deepDives/conf3-d3-hls200-inDim20-lr10-mom0.6-n300-os4.0 311 | # , but the numbers are much-much worse at epoch4800: 312 | # conf3 epoch 4800 trainMean 3.938845 trainMedian 4.025593 validationMean 4.106030 validationMedian 4.138099 313 | # spearmint epoch 4800 trainMean 4.452593 trainMedian 4.579250 validationMean 4.489997 validationMedian 4.532965 314 | # UPDATE: I seriously botched this: layerNum 3 was the main idea, 315 | # but I actually used layer2. Useless, I put it into Attic/botched.depth2insteadofdepth3 316 | # See below notes on epochCount4800_depth3_4_useReLUFalse_everyNthInput10 about how I fixed this. 317 | # UPDATE2, even more important: I inadverently used relu in all deepDives. 318 | 319 | # deepDives/conf4 is the same as the successful conf3, but with the faces dataset. 320 | # One weird thing is that the output has lots of damaged pixels which are always black. 321 | # (UPDATE: I used relu here without knowing it, that's the reason.) 322 | # (Probably always negative, and clipped to 0.) These go away, but very very slowly: 323 | # at epoch5000 we have ~25 damaged pixels, epoch12000 ~10, epoch20000 exactly 2. 324 | # Unfortunately the result of conf4 is not very convincing. Some of the time it's 325 | # just rote learning, other times the nearest generated sample is a linear combination 326 | # of two rote-learned faces. At least it's pretty good at rote learning: 327 | # reproduces quite a few details of the train sample. 328 | # Of course, what did I expect with just 400 training samples and minibatchsize n300? 
329 | 330 | # Motivated by this, I implemented the following benchmark and visualization: 331 | # Same as diff_validation, but with the train dataset taking the place of the generated 332 | # samples. Needs some refactor. I'll call this the nnbaseline, nn as in nearest neighbor. 333 | # It only has to be run once for each dataset, but it's not a big deal if we run it 334 | # once for each traning session. 335 | # Values: 336 | # inputType=mnist, inputDigit=None, everyNthInput=10, gridSizeForSampling=20 337 | # nnbaselineMean 4.863300 nnbaselineMedian 5.040003 338 | # -> Why is gridSizeForSampling relevant? Because of a stupid mixing of 339 | # responsibilities, we use only the first gridSizeForSampling**2 validation points. 340 | # -> mnist() random seed set to 1. We do randomization there, but reproducibly. 341 | 342 | # That sound like good news, and it probably is: our current best is 343 | # epoch 6400 trainMean 3.906343 trainMedian 4.017440 validationMean 4.103766 validationMedian 4.130489 344 | # , which was probably meta-overfitted a bit, but still better. 345 | # But before we start to celebrate, this is probably an artifact: 346 | # Our generated samples are smoothed, less sharp compared to the gold samples, 347 | # so a close but imperfect match is scored higher than when we compare two gold ones. 348 | 349 | # inputType=image, imageDirectory=../face/SCUT-FBP/thumb.big/, everyNthInput=1, gridSizeForSampling=20 350 | # nnbaselineMean 6.403875 nnbaselineMedian 6.177893 351 | # nnbaselineMean 5.891883 nnbaselineMedian 5.616794 (different seed: 1) 352 | # nnbaselineMean 5.928859 nnbaselineMedian 5.845743 (another seed: 2) 353 | # our current best: (bestish, didn't want to meta-overfit by picking the specific best) 354 | # epoch 28000 trainMean 3.643037 trainMedian 3.592704 validationMean 4.875953 validationMedian 4.736241 355 | # -> This is impressive, but not directly comparable, I forgot to fix the random seed. 
356 | # (Fixed now, but don't know the seed for conf3. Ouch. 357 | # TODO Should be a parameter to make the whole run reproducible.) 358 | 359 | # What about visual comparison? mnist looks okay to me. If it's rote learning, it's 360 | # at least quite convincing. The samples conf4 generates are evil, with all this mixing, 361 | # but the diffs look okay when compared to the nnbaseline (which is bad, not enough data points). 362 | # Side remark: Even though the individual s*.png-s are shitty, s.gif is pretty cool, 363 | # mixing looks like constant smooth crossfading there, and the details slowly emerging look great, 364 | # rote learning or not. 365 | 366 | for dir in diff_validation diff_train s xy yz xz ; do convert input.png $dir[1-9]00.png $dir[0-9][0-9]00.png $dir[0-9][0-9][0-9]00.png -delay 20 -loop 0 $dir.gif ; done 367 | 368 | 369 | ##### 370 | 371 | # Turns out, I seriously botched the epoch4800 spearmint run: used layerNum 2 instead of 3. 372 | # Try again: epochCount4800_depth3_4_useReLUFalse_everyNthInput10 373 | # It is different from the parent epochCount1600_useReLUFalse_everyNthInput10 aka spearmintExps/epoch1600 374 | # in the following ways: 375 | # layerNum 2 -> [3,4], epoch 1600 -> 4800, plotEach 400 -> 800 376 | # learningRate.max 200.0 -> 20.0. 377 | # indim [20,100] -> [10,50] 378 | 379 | # Oh god I botched something even more serious: 380 | # False is not turned into bool, stays str. That means that deepDives used relu even though 381 | # the conf explicitely said don't use relu. That's what made conf3 perform better than any 382 | # of the spearmint runs. 383 | # I changed the conf[1234].txts to say userelu True. Serialization bug is fixed now. 384 | # tanh spearmint run moved to spearmintExps/epoch4800-tanh, restarting with relu, 385 | # expname epochCount4800_depth3_4_useReLUTrue_everyNthInput10 386 | # BTW relu is not just better than tanh, it's also 30% faster. (I assume they got the same amount 387 | # of CPU cycles. 
388 | 389 | # Turns out the cubemixture does not help with the newer models. 390 | # (If I had the time, I would investigate where did it stop helping, 391 | # but relu+layer3 is capable of harder transitions, that's for sure.) 392 | 393 | # Here is the current best epoch4800 spearmintOutput compared with its straight gaussian child-experiment: 394 | cat /Users/daniel/experiments/rbm/daniel-experiments/kohonen/spearmintOutput/hls300-inDim12-layerNum4-lr20.0-mom0.5-n300-os3.99999999824/log.txt | grep train | awk '($2%800==0)' 395 | epoch 800 trainMean 3.942084 trainMedian 4.023556 validationMean 4.120661 validationMedian 4.135574 396 | epoch 1600 trainMean 3.819828 trainMedian 3.863694 validationMean 4.091879 validationMedian 4.141805 397 | epoch 2400 trainMean 3.764334 trainMedian 3.825936 validationMean 4.100879 validationMedian 4.125267 398 | epoch 3200 trainMean 3.727745 trainMedian 3.769446 validationMean 4.115094 validationMedian 4.149114 399 | epoch 4000 trainMean 3.688223 trainMedian 3.761931 validationMean 4.101507 validationMedian 4.165966 400 | epoch 4800 trainMean 3.699963 trainMedian 3.767960 validationMean 4.109944 validationMedian 4.142234 401 | 402 | cat deepDives/conf7-gauss/log.txt | grep train | awk '($2%800==0)' 403 | epoch 800 trainMean 3.946862 trainMedian 4.045183 validationMean 4.104929 validationMedian 4.164662 404 | epoch 1600 trainMean 3.825675 trainMedian 3.930816 validationMean 4.087788 validationMedian 4.111501 405 | epoch 2400 trainMean 3.775109 trainMedian 3.883820 validationMean 4.086608 validationMedian 4.099182 406 | epoch 3200 trainMean 3.747717 trainMedian 3.784954 validationMean 4.099656 validationMedian 4.120957 407 | epoch 4000 trainMean 3.741761 trainMedian 3.797170 validationMean 4.103545 validationMedian 4.097180 408 | epoch 4800 trainMean 3.690358 trainMedian 3.760918 validationMean 4.082099 validationMedian 4.111330 409 | # -> Note the validationMedian being close to the validationMean, that's unusual. 
410 | 411 | # Balazs observes that the left bump on the histogram is NOT caused by 412 | # rote learning: it's simply an artifact of the allDigit mnist task: 413 | # 1s are easier to learn, and they also have smaller area. They are the bump. 414 | 415 | # A better task-specific measure of closeness of samples is the relative improvement 416 | # over the all-black baseline, that is d(gold,generated)/d(gold,0). 417 | # (1s are easier to learn, so they are still on the left, but the bimodality goes away.) 418 | # Let's not forget that this is NOT what our algorithm optimizes, nor should it. 419 | # (Unless we want to make it super mnist-specific, which we don't.) 420 | # This metric causes another big inconvenience as well: We can't compare the logged 421 | # aggregate numbers to the histogram numbers. 422 | # So I won't use it in the histogram, and I will use it on the diff. 423 | # Hope that won't cause confusion. 424 | 425 | ######### 426 | 427 | # Trying to port the slow distanceMatrix calculation from numpy to theano. 428 | # I start with a modest goal: 429 | 430 | # A cool little toy learning problem: 431 | # We want to learn a translated 2D standard normal's translation, that's a 2D vector. 432 | # We generate batchSize samples from this target distribution. 433 | # We generate sampleSize samples from our current best bet for the distribution. 434 | # We find the closest generated sample to each target sample. 435 | # We calculate the sum of distances. 436 | # That's the loss that we optimize by gradient descent. 437 | # Note that Theano doesn't even break a sweat when doing backprop 438 | # through a layer of distance minimization. 439 | # Of course that's less impressive than it first sounds, because 440 | # locally, the identity of the nearest target sample never changes. 441 | 442 | # UPDATE: Maybe it does break a sweat after all: it diverges if we multiply the loss by 100. 
443 | 444 | ########## 445 | # geforce machine installation notes 446 | 447 | # NVIDIA Drivers 448 | # https://access.redhat.com/solutions/64300 449 | # -> Careful, it hardwires an old driver, I changed it to 450 | # http://http.download.nvidia.com/XFree86/Linux-x86_64/358.16/NVIDIA-Linux-x86_64-358.16.run 451 | 452 | # CUDA 453 | # http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/index.html 454 | # http://developer.download.nvidia.com/compute/cuda/repos/fedora21/x86_64/cuda-repo-fedora21-7.5-18.x86_64.rpm 455 | 456 | # The nvcc compiler needs gcc, and needs <=4.9 gcc. On our fedora 5.1.1 is the default. 457 | # So I've built and installed gcc-4.9.3. 458 | # Standard procedure described in https://gcc.gnu.org/wiki/InstallingGCC 459 | # But the default mirrors in ./contrib/download_prerequisites 460 | # are too slow, replaced them with ftp://ftp.fu-berlin.de/unix/languages/gcc/infrastructure 461 | # After make install, new/old gcc was in /usr/local/gcc/4.9.3/, but not on PATH. 462 | # We only need it for nvcc anyway, so the best way is to add this to ~/.theanorc : 463 | # [nvcc] 464 | # compiler_bindir=/usr/local/gcc/4.9.3/bin/ 465 | 466 | # This is what my ~/.theanorc looks like now on geforce: 467 | [global] 468 | floatX = float32 469 | device = gpu0 470 | warn_float64 = raise 471 | assert_no_cpu_op = raise 472 | cxx = /usr/local/gcc/4.9.3/bin/g++ 473 | [nvcc] 474 | fastmath = True 475 | compiler_bindir = /usr/local/gcc/4.9.3/bin/ 476 | 477 | # On the laptop, compiler_bindir and cxx are not there, and device=cpu, 478 | # the rest is the same. 479 | 480 | ########## 481 | 482 | # Very important note, already mentioned in lasagne-demo/readme.sh : 483 | # I had to patch layers/conv.py 484 | # /usr/lib/python2.7/site-packages/lasagne/layers/conv.py 485 | # Specifically, I added as a first line of Conv2DLayer.__init__() this: 486 | # del kwargs['border_mode'] 487 | # I don't know where this incompatibility is coming from. 
488 | 489 | ########## 490 | # Benchmarks 491 | 492 | # testNumpyToTheano.py:testSampleInitial() 10000 epoch 1000 data 1000 generated: 493 | # laptop: 55 sec including compilation. 494 | # geforce: 76 sec including compilation. 495 | 496 | # testNumpyToTheano.py:test() 497 | laptop cpu: 498 | minimal distances theano finished in 2.422537 seconds. 499 | all distances theano finished in 1.913697 seconds. 500 | all distances slow numpy finished in 2.907862 seconds. 501 | all distances fast numpy finished in 2.942749 seconds. 502 | 503 | geforce gpu: 504 | minimal distances theano finished in 0.594864 seconds. 505 | all distances theano finished in 0.094942 seconds. 506 | all distances slow numpy finished in 27.137307 seconds. 507 | all distances fast numpy finished in 27.065705 seconds. 508 | 509 | geforce cpu: 510 | minimal distances theano finished in 25.903046 seconds. 511 | all distances theano finished in 25.355256 seconds. 512 | (numpy are the same.) 513 | 514 | # -> Wow, numpy dot product is dead slow on geforce. 515 | # I manage to run generativeMLP.py on the GPU, but the bottleneck is that stupid dot product. 516 | 517 | # Super cool tip from http://deeplearning.net/software/theano/install_ubuntu.html 518 | python `python -c "import os, theano; print os.path.dirname(theano.__file__)"`/misc/check_blas.py 519 | 520 | ###### 521 | 522 | # I managed to compile this gist on laptop: 523 | open https://gist.github.com/xianyi/6930656 524 | gcc -o a.out test_cblas_dgemm.c -I /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers -L /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current -lblas -lpthread 525 | 526 | ###### 527 | # Set up geforce machine with ubuntu. 528 | 529 | # See ./install.txt for every detail. 530 | 531 | # deepDives/conf8.txt benchmark, 4800 epochs: 532 | # 250 mins on laptop 533 | # 44 mins on geforce 534 | # -> yay! 
535 | 536 | # I tried allow_gc = False, but it didn't give real improvement, less than 10% for sure, probably even less. 537 | 538 | ####### 539 | # Make spearmint work. 540 | 541 | # File "/usr/local/lib/python2.7/dist-packages/pymongo/collection.py", line 393, in _legacy_write 542 | # rqst_id, msg, max_size = func(*args) 543 | # bson.errors.InvalidDocument: Cannot encode object: 5.6012859 544 | # -> Solution is to cast from np.float32 to float. 545 | 546 | for f in spearmintOutput/*/log.txt ; do grep "train" $f | tail -1 | cut -f8 -d' ' | tr '\n' ' ' ; echo $f ; done | sort -n 547 | 548 | ####### 549 | # Some parallel run benchmarks. 550 | 551 | # adhoc/speedtest.txt does not scale, running two in parallel takes twice as long, 552 | # even if they get gpu0 and gpu1 respectively. 553 | # 1 GPU 1 proc 554 | for GPU in 0 1 ; do for a in 1 ; do ( time THEANO_FLAGS="device=gpu$GPU" python generativeMLP.py adhoc/speedtest.txt & ) ; done ; done 555 | 556 | # 1 GPU 1 proc: 33.0 = 33.0/process 557 | # 1 GPU 2 proc: 64.0 = 32.0/proc 558 | # 2 GPU 2 proc: 64.0 = 32.0/proc 559 | # :( 560 | # Not very surprising, if I press Ctrl-C it always stops inside numpy, 561 | # and numpy presumably already uses all the CPU cores. (Does it?) 562 | # Let's do a less CPU-intensive speedtest. This one always breaks inside theano.function: 563 | # adhoc/speedtestgpu.txt 564 | # 1 GPU 1 proc: 31.0 = 31.0/process 565 | # 1 GPU 2 proc: 59.0 = 29.5/proc 566 | # 2 GPU 2 proc: 63.0 = 31.5/proc 567 | # :( Now that's somewhat more surprising. 
568 | 569 | 570 | # testNumpyToTheano.py:testSampleInitial() 10000 epoch 1000 data 1000 generated: 571 | # This one does scale nicely to 8 processes: 572 | for GPU in 0 1 ; do for a in 1 2 3 4 5 6 7 8 ; do ( time THEANO_FLAGS="device=gpu$GPU" python testNumpyToTheano.py > /dev/null & ) ; done ; done 573 | 574 | # 1 GPU 1 proc: 20.8 = 20.8/process 575 | # 1 GPU 2 proc: 21.6 = 10.8/proc 576 | # 1 GPU 4 proc: 27.0 = 6.7/proc (actually, the real runtimes were 23.1, 24.4, 25.6, 27.0) 577 | # 1 GPU 8 proc: 44.0 = 5.5/proc (actually, there was one outlier with 53.0 and the rest around 43.0) 578 | # 2 GPU 2 proc: 21.6 = 10.8/proc 579 | # 2 GPU 4 proc: 26.2 = 6.5/proc 580 | # 2 GPU 8 proc: 43.4 = 5.4/proc 581 | # 2 GPU 16proc: 88.0 = 5.5/proc 582 | 583 | # So the bottom line is that if you have a job, it doesn't matter 584 | # which GPU you send it to even if one is completely starving. 585 | # The only model that I have in mind that can explain this is 586 | # a fixed, non-parallelizable cost of sending data towards 587 | # ANY of the two GPUs. Like a Y shape with a bottleneck at the bottom, 588 | # closer to the CPU. 589 | 590 | ####### 591 | # Let's see some simple synthetic generated distributions. 592 | # I've created a pretty general framework to play with those, see nnbase/inputs.py:GENERATOR_FUNCTIONS. 593 | # The coolest one so far is adhoc/plane1.txt , output in ~/tmp/daniel-experiments/kohonen/adhoc/plane1-d2/ 594 | # and http://people.mokk.bme.hu/~daniel/kohonen/plane1.gif 595 | # in my mail titled "op art". 596 | 597 | 598 | ####### 599 | # Meanwhile I've stopped the original spearmint run, archived it to 600 | # spearmintRuns/epochCount4800_depth3_4_useReLUTrue_everyNthInput10 601 | # and rewrote config.json so that it looks for higher values. 602 | # I call this exp epochCount4800_depth3_4_useReLUTrue_everyNthInput10_bigger 603 | 604 | THEANO_FLAGS='device=gpu1' nohup python Spearmint/spearmint/main.py . 
> spearmintOutput/log.cout 2> spearmintOutput/log.cerr & 605 | # From now on gpu1 is the spearmint GPU. (Although if the above benchmarks are good, 606 | # it shouldn't matter, except maybe for OOM.) 607 | 608 | for f in spearmintOutput/*/log.txt ; do grep "train" $f | tail -1 | cut -f8 -d' ' | tr '\n' ' ' ; echo $f ; done | sort -n 609 | 610 | ####### 611 | # Did a less complete but still useful way to put distance matrix calculation on the GPU. 612 | 613 | # Makes large oversampling large minibatchSize runs about 3 times faster on geforce, 614 | # does not make a difference on the laptop. 615 | 616 | 617 | # It's not really a bottleneck now, but this CPU-based argmin is really annoying: 618 | THEANO_FLAGS='config.profile=True' CUDA_LAUNCH_BLOCKING=1 python nearestNeighborsTest.py > cout 2> cerr 619 | 620 | # I asked the theano-users list: 621 | https://groups.google.com/forum/#!topic/theano-users/E7ProqnGUMk 622 | https://gist.github.com/danielvarga/d0eeacea92e65b19188c 623 | 624 | # Later found that this is the relevant ticket: 625 | https://github.com/Theano/Theano/issues/1399 626 | # Implemented lamblin's hack there, see the gist above. 627 | 628 | # 25000 candidate, 5000 target: 629 | lamblinsTrick = False 630 | Time in Function.fn.__call__: 8.231399e-01s (99.995%) 631 | <% time>