├── M1+M2
    ├── analogy.py
    ├── model.py
    ├── test.py
    ├── train.py
    └── visualize.py
├── M1
    ├── model.py
    ├── train.py
    └── visualize.py
├── M2
    ├── analogy.py
    ├── model.py
    ├── test.py
    ├── train.py
    └── visualize.py
├── README.md
├── args.py
├── mnist_tools.py
├── util.py
├── vae_m1.py
└── vae_m2.py


/M1+M2/analogy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time, pylab
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | import matplotlib.patches as mpatches
 6 | sys.path.append(os.path.split(os.getcwd())[0])
 7 | import util
 8 | from args import args
 9 | from model import conf1, vae1, conf2, vae2
10 | from vae_m1 import GaussianM1VAE
11 | 
# Create the output directory for the figure. An OSError (e.g. the directory
# already exists) is tolerated so the script can be re-run; anything else,
# such as KeyboardInterrupt, is no longer swallowed.
try:
	os.mkdir(args.vis_dir)
except OSError:
	pass

# The image loader is told which pixel distribution the M1 model uses.
dist = "bernoulli"
if isinstance(vae1, GaussianM1VAE):
	dist = "gaussian"
dataset = util.load_images(args.test_image_dir, dist=dist)

n_analogies = 10
n_image_channels = 1
image_width = 28
image_height = 28
# Grid layout: one row per source image. Column 1 shows the source image,
# column 2 is left empty, and the remaining ndim_y columns show one decoded
# image per one-hot label. The row stride is derived from ndim_y instead of
# the former hard-coded 12, which was only correct for ndim_y == 10.
n_columns = conf2.ndim_y + 2

# x -> z1 through the M1 encoder, then y and z2 through the M2 encoders.
x = util.sample_x_variable(n_analogies, conf1.ndim_x, dataset, gpu_enabled=conf1.gpu_enabled)
z1 = vae1.encoder(x, test=True)
y = vae2.sample_x_y(z1, test=True)
z2 = vae2.encode_xy_z(z1, y, test=True)

fig = pylab.gcf()
fig.set_size_inches(16.0, 16.0)
pylab.clf()
if n_image_channels == 1:
	pylab.gray()
# xp is the array module used to build the decoder inputs below.
xp = np
if conf1.gpu_enabled:
	x.to_cpu()
	xp = cuda.cupy

# First column: the sampled source images.
for m in xrange(n_analogies):
	pylab.subplot(n_analogies, n_columns, m * n_columns + 1)
	if n_image_channels == 1:
		pylab.imshow(x.data[m].reshape((image_width, image_height)), interpolation="none")
	elif n_image_channels == 3:
		pylab.imshow(x.data[m].reshape((n_image_channels, image_width, image_height)), interpolation="none")
	pylab.axis("off")

# Identity matrix: row n is the one-hot vector for label n.
analogy_y = xp.identity(conf2.ndim_y, dtype=xp.float32)
analogy_y = Variable(analogy_y)
for m in xrange(n_analogies):
	# Repeat the m-th z2 once per label so it is decoded under every label.
	base_z2 = xp.empty((conf2.ndim_y, z2.data.shape[1]), dtype=xp.float32)
	for n in xrange(conf2.ndim_y):
		base_z2[n] = z2.data[m]
	base_z2 = Variable(base_z2)
	# M2 decodes (z2, y) back to z1; M1 decodes z1 back to an image.
	_z1 = vae2.decode_zy_x(base_z2, analogy_y, test=True, apply_f=True)
	_x = vae1.decoder(_z1, test=True, apply_f=True)
	if conf1.gpu_enabled:
		_x.to_cpu()
	for n in xrange(conf2.ndim_y):
		pylab.subplot(n_analogies, n_columns, m * n_columns + 3 + n)
		if n_image_channels == 1:
			pylab.imshow(_x.data[n].reshape((image_width, image_height)), interpolation="none")
		elif n_image_channels == 3:
			pylab.imshow(_x.data[n].reshape((n_image_channels, image_width, image_height)), interpolation="none")
		pylab.axis("off")

pylab.savefig("{:s}/analogy.png".format(args.vis_dir))
67 | 
68 | 


--------------------------------------------------------------------------------
/M1+M2/model.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from args import args
 3 | from vae_m1 import BernoulliM1VAE, GaussianM1VAE, Conf as Conf1
 4 | from vae_m2 import BernoulliM2VAE, GaussianM2VAE, Conf as Conf2
 5 | 
 6 | # M1
 7 | conf1 = Conf1()
 8 | conf1.gpu_enabled = True if args.gpu_enabled == 1 else False
 9 | conf1.ndim_x = 28 * 28
10 | conf1.ndim_z = 50
11 | conf1.encoder_apply_dropout = False
12 | conf1.decoder_apply_dropout = False
13 | conf1.encoder_apply_batchnorm = True
14 | conf1.decoder_apply_batchnorm = True
15 | conf1.encoder_apply_batchnorm_to_input = True
16 | conf1.decoder_apply_batchnorm_to_input = True
17 | conf1.gradient_clipping = 1.0
18 | conf1.encoder_hidden_units = [600, 600]
19 | conf1.decoder_hidden_units = [600, 600]
20 | vae1 = BernoulliM1VAE(conf1, name="m1")
21 | vae1.load(args.model_dir)
22 | 
23 | # M2
24 | conf2 = Conf2()
25 | conf2.gpu_enabled = True if args.gpu_enabled == 1 else False
26 | conf2.ndim_x = 50
27 | conf2.ndim_z = 50
28 | conf2.encoder_xy_z_hidden_units = [500]
29 | conf2.encoder_x_y_hidden_units = [500]
30 | conf2.decoder_hidden_units = [500]
31 | conf2.encoder_xy_z_apply_dropout = False
32 | conf2.encoder_x_y_apply_dropout = False
33 | conf2.decoder_apply_dropout = False
34 | conf2.encoder_xy_z_apply_batchnorm = True
35 | conf2.encoder_x_y_apply_batchnorm = True
36 | conf2.decoder_apply_batchnorm = True
37 | conf2.encoder_xy_z_apply_batchnorm_to_input = True
38 | conf2.encoder_x_y_apply_batchnorm_to_input = True
39 | conf2.decoder_apply_batchnorm_to_input = True
40 | conf2.gradient_clipping = 5.0
41 | vae2 = GaussianM2VAE(conf2, name="m2")
42 | vae2.load(args.model_dir)


--------------------------------------------------------------------------------
/M1+M2/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | sys.path.append(os.path.split(os.getcwd())[0])
 6 | import util
 7 | from args import args
 8 | from model import conf1, vae1, conf2, vae2
 9 | from vae_m1 import GaussianM1VAE
10 | 
11 | dist = "bernoulli"
12 | if isinstance(vae1, GaussianM1VAE):
13 | 	dist = "gaussian"
14 | dataset, labels = util.load_labeled_images(args.test_image_dir, dist=dist)
15 | num_data = len(dataset)
16 | 
17 | x_labeled, _, label_ids = util.sample_x_and_label_variables(num_data, conf1.ndim_x, conf2.ndim_y, dataset, labels, gpu_enabled=False)
18 | if conf1.gpu_enabled:
19 | 	x_labeled.to_gpu()
20 | z_labeled = vae1.encoder(x_labeled, test=True)
21 | prediction = vae2.sample_x_label(z_labeled, test=True, argmax=True)
22 | 
23 | correct = 0
24 | for i in xrange(num_data):
25 | 	if prediction[i] == label_ids.data[i]:
26 | 		correct += 1
27 | 
28 | print "test:: classification accuracy: {:.3f}".format(correct / float(num_data))
29 | 
30 | 


--------------------------------------------------------------------------------
/M1+M2/train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os, sys, time
  3 | import numpy as np
  4 | from chainer import cuda, Variable
  5 | import pandas as pd
  6 | sys.path.append(os.path.split(os.getcwd())[0])
  7 | import util
  8 | from args import args
  9 | from model import conf1, vae1, conf2, vae2
 10 | from vae_m1 import GaussianM1VAE
 11 | 
 12 | dist = "bernoulli"
 13 | if isinstance(vae1, GaussianM1VAE):
 14 | 	dist = "gaussian"
 15 | dataset, labels = util.load_labeled_images(args.train_image_dir, dist=dist)
 16 | 
 17 | max_epoch = 1000
 18 | vae1_num_trains_per_epoch = 5000
 19 | vae2_num_trains_per_epoch = 5000
 20 | batchsize = 100
 21 | 
 22 | # Create labeled/unlabeled split in training set
 23 | num_types_of_label = 10
 24 | num_labeled_data = args.num_labeled_data
 25 | if num_labeled_data < batchsize:
 26 | 	batchsize = num_labeled_data
 27 | num_validation_data = 10000
 28 | labeled_dataset, labels, unlabeled_dataset, validation_dataset, validation_labels = util.create_semisupervised(dataset, labels, num_validation_data, num_labeled_data, num_types_of_label)
 29 | print "labels:", labels
 30 | # alpha = 0.1 * len(dataset)
 31 | # alpha = 0.1 * len(dataset) / len(labeled_dataset)
 32 | alpha = 1
 33 | print "alpha:", alpha
 34 | print "dataset:: labeled: {:d} unlabeled: {:d} validation: {:d}".format(len(labeled_dataset), len(unlabeled_dataset), len(validation_dataset))
 35 | 
 36 | # Export result to csv
 37 | csv_epoch = []
 38 | 
 39 | total_time = 0
 40 | for epoch in xrange(max_epoch):
 41 | 	# Train M1
 42 | 	# sum_loss = 0
 43 | 	# epoch_time = time.time()
 44 | 	# for t in xrange(vae1_num_trains_per_epoch):
 45 | 	# 	x = util.sample_x_variable(batchsize, conf1.ndim_x, dataset, gpu_enabled=conf1.gpu_enabled)
 46 | 
 47 | 	# 	# train
 48 | 	# 	loss = vae1.train(x, L=1)
 49 | 
 50 | 	# 	sum_loss += loss
 51 | 	# 	if t % 10 == 0:
 52 | 	# 		sys.stdout.write("\rTraining M1 in progress...(%d / %d)" % (t, vae1_num_trains_per_epoch))
 53 | 	# 		sys.stdout.flush()
 54 | 	# epoch_time = time.time() - epoch_time
 55 | 	# total_time += epoch_time
 56 | 	# sys.stdout.write("\r")
 57 | 	# print "[M1] epoch:", epoch, "loss: {:.3f}".format(sum_loss / vae1_num_trains_per_epoch), "time: {:d} min".format(int(epoch_time / 60)), "total: {:d} min".format(int(total_time / 60))
 58 | 	# sys.stdout.flush()
 59 | 	# vae1.save(args.model_dir)
 60 | 
 61 | 	# Train M2
 62 | 	sum_loss_labeled = 0
 63 | 	sum_loss_unlabeled = 0
 64 | 	sum_loss_classifier = 0
 65 | 	epoch_time = time.time()
 66 | 	for t in xrange(vae2_num_trains_per_epoch):
 67 | 		x_labeled, y_labeled, label_ids = util.sample_x_and_label_variables(batchsize, conf1.ndim_x, conf2.ndim_y, labeled_dataset, labels, gpu_enabled=conf2.gpu_enabled)
 68 | 		x_unlabeled = util.sample_x_variable(batchsize, conf1.ndim_x, unlabeled_dataset, gpu_enabled=conf2.gpu_enabled)
 69 | 		z_labeled = Variable(vae1.encoder(x_labeled, test=True, apply_f=True).data)
 70 | 		z_unlabeled = Variable(vae1.encoder(x_unlabeled, test=True, apply_f=True).data)
 71 | 
 72 | 		# train
 73 | 		# loss_labeled, loss_unlabeled, loss_classifier = vae2.train_jointly(z_labeled, y_labeled, label_ids, z_unlabeled, alpha=alpha, test=False)
 74 | 		
 75 | 		# train
 76 | 		loss_labeled, loss_unlabeled = vae2.train(z_labeled, y_labeled, label_ids, z_unlabeled)
 77 | 		loss_classifier = vae2.train_classification(z_labeled, label_ids, alpha=alpha)
 78 | 		
 79 | 		sum_loss_labeled += loss_labeled
 80 | 		sum_loss_unlabeled += loss_unlabeled
 81 | 		sum_loss_classifier += loss_classifier
 82 | 		if t % 10 == 0:
 83 | 			sys.stdout.write("\rTraining M2 in progress...({:d} / {:d})".format(t, vae2_num_trains_per_epoch))
 84 | 			sys.stdout.flush()
 85 | 	epoch_time = time.time() - epoch_time
 86 | 	total_time += epoch_time
 87 | 	sys.stdout.write("\r")
 88 | 	print "[M2] epoch:", epoch, "loss::", "labeled: {:.3f}".format(sum_loss_labeled / vae2_num_trains_per_epoch), "unlabeled: {:.3f}".format(sum_loss_unlabeled / vae2_num_trains_per_epoch), "classifier: {:.3f}".format(sum_loss_classifier / vae2_num_trains_per_epoch), "time: {:d} min".format(int(epoch_time / 60)), "total: {:d} min".format(int(total_time / 60))
 89 | 	sys.stdout.flush()
 90 | 	vae2.save(args.model_dir)
 91 | 
 92 | 	# validation
 93 | 	x_labeled, _, label_ids = util.sample_x_and_label_variables(num_validation_data, conf1.ndim_x, conf2.ndim_y, validation_dataset, validation_labels, gpu_enabled=False)
 94 | 	if conf1.gpu_enabled:
 95 | 		x_labeled.to_gpu()
 96 | 	z_labeled = vae1.encoder(x_labeled, test=True)
 97 | 	prediction = vae2.sample_x_label(z_labeled, test=True, argmax=True)
 98 | 	correct = 0
 99 | 	for i in xrange(num_validation_data):
100 | 		if prediction[i] == label_ids.data[i]:
101 | 			correct += 1
102 | 	print "validation:: classification accuracy: {:f}".format(correct / float(num_validation_data))
103 | 
104 | 	# Export to csv
105 | 	csv_epoch.append([epoch, int(total_time / 60), correct / float(num_validation_data)])
106 | 	data = pd.DataFrame(csv_epoch)
107 | 	data.columns = ["epoch", "min", "accuracy"]
108 | 	data.to_csv("{:s}/epoch.csv".format(args.model_dir))
109 | 
110 | 


--------------------------------------------------------------------------------
/M1+M2/visualize.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time, pylab
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | import matplotlib.patches as mpatches
 6 | sys.path.append(os.path.split(os.getcwd())[0])
 7 | import util
 8 | from args import args
 9 | from model import conf1, vae1, conf2, vae2
10 | from vae_m1 import GaussianM1VAE
11 | 
# Create the output directory. An OSError (e.g. the directory already exists)
# is tolerated; other exceptions are no longer silently swallowed.
try:
	os.mkdir(args.vis_dir)
except OSError:
	pass

# Tell the loader which pixel distribution the M1 model uses.
dist = "bernoulli"
if isinstance(vae1, GaussianM1VAE):
	dist = "gaussian"
dataset, labels = util.load_labeled_images(args.test_image_dir, dist=dist)

num_plot = 10000
# Sample the images together with their label ids, as M1/visualize.py does,
# so the labeled latent plot receives the labels of the sampled images.
# The previous code sampled with util.sample_x_variable and then passed the
# full `labels` list, which is only aligned with z2 if that sampler returns
# the images in dataset order.
# NOTE(review): confirm against util.sample_x_variable.
x, _, label_ids = util.sample_x_and_label_variables(num_plot, conf1.ndim_x, conf2.ndim_y, dataset, labels, gpu_enabled=False)
if conf1.gpu_enabled:
	x.to_gpu()

# Encode: x -> z1 (M1), then y and z2 (M2).
z1 = vae1.encoder(x, test=True)
y = vae2.sample_x_y(z1, test=True)
z2 = vae2.encode_xy_z(z1, y, test=True)

# Decode: (z2, y) -> z1 (M2), then z1 -> x (M1).
_z1 = vae2.decode_zy_x(z2, y, test=True, apply_f=True)
# NOTE(review): M1+M2/analogy.py passes apply_f=True to vae1.decoder but
# this call does not -- confirm which output is wanted here.
_x = vae1.decoder(_z1, test=True)
if conf1.gpu_enabled:
	z2.to_cpu()
	_x.to_cpu()
_x = _x.data

util.visualize_x(_x, dir=args.vis_dir)
util.visualize_z(z2.data, dir=args.vis_dir)
util.visualize_labeled_z(z2.data, label_ids.data, dir=args.vis_dir)


--------------------------------------------------------------------------------
/M1/model.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from args import args
 3 | from vae_m1 import BernoulliM1VAE, GaussianM1VAE, Conf
 4 | 
 5 | conf = Conf()
 6 | conf.gpu_enabled = True if args.gpu_enabled == 1 else False
 7 | conf.ndim_z = 2
 8 | conf.encoder_apply_dropout = False
 9 | conf.decoder_apply_dropout = False
10 | conf.encoder_apply_batchnorm = True
11 | conf.decoder_apply_batchnorm = True
12 | conf.encoder_apply_batchnorm_to_input = True
13 | conf.decoder_apply_batchnorm_to_input = True
14 | conf.encoder_units = [600, 600]
15 | conf.decoder_units = [600, 600]
16 | vae = BernoulliM1VAE(conf, name="m1")
17 | vae.load(args.model_dir)


--------------------------------------------------------------------------------
/M1/train.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | sys.path.append(os.path.split(os.getcwd())[0])
 6 | import util
 7 | from args import args
 8 | from model import conf, vae
 9 | from vae_m1 import GaussianM1VAE
10 | 
11 | dist = "bernoulli"
12 | if isinstance(vae, GaussianM1VAE):
13 | 	dist = "gaussian"
14 | dataset = util.load_images(args.train_image_dir, dist=dist)
15 | 
16 | max_epoch = 1000
17 | num_trains_per_epoch = 2000
18 | batchsize = 100
19 | total_time = 0
20 | 
21 | for epoch in xrange(max_epoch):
22 | 	sum_loss = 0
23 | 	epoch_time = time.time()
24 | 	for t in xrange(num_trains_per_epoch):
25 | 		x = util.sample_x_variable(batchsize, conf.ndim_x, dataset, gpu_enabled=conf.gpu_enabled)
26 | 
27 | 		# train
28 | 		loss = vae.train(x, L=1)
29 | 
30 | 		sum_loss += loss
31 | 		if t % 10 == 0:
32 | 			sys.stdout.write("\rTraining M1 in progress...(%d / %d)" % (t, num_trains_per_epoch))
33 | 			sys.stdout.flush()
34 | 	epoch_time = time.time() - epoch_time
35 | 	total_time += epoch_time
36 | 	sys.stdout.write("\r")
37 | 	print "epoch:", epoch, "loss: {:.3f}".format(sum_loss / num_trains_per_epoch), "time: {:d} min".format(int(epoch_time / 60)), "total: {:d} min".format(int(total_time / 60))
38 | 	sys.stdout.flush()
39 | 	vae.save(args.model_dir)
40 | 
41 | 


--------------------------------------------------------------------------------
/M1/visualize.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time, pylab
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | import matplotlib.patches as mpatches
 6 | sys.path.append(os.path.split(os.getcwd())[0])
 7 | import util
 8 | from args import args
 9 | from model import conf, vae
10 | from vae_m1 import GaussianM1VAE
11 | from chainer import functions as F
12 | from PIL import Image
13 | 
14 | try:
15 | 	os.mkdir(args.vis_dir)
16 | except:
17 | 	pass
18 | 
19 | dist = "bernoulli"
20 | if isinstance(vae, GaussianM1VAE):
21 | 	dist = "gaussian"
22 | dataset, labels = util.load_labeled_images(args.test_image_dir, dist=dist)
23 | 
24 | num_images = 5000
25 | x, y_labeled, label_ids = util.sample_x_and_label_variables(num_images, conf.ndim_x, 10, dataset, labels, gpu_enabled=False)
26 | if conf.gpu_enabled:
27 | 	x.to_gpu()
28 | z = vae.encoder(x, test=True)
29 | _x = vae.decoder(z, True, True)
30 | if conf.gpu_enabled:
31 | 	z.to_cpu()
32 | 	_x.to_cpu()
33 | util.visualize_x(_x.data, dir=args.vis_dir)
34 | print "visualizing x"
35 | util.visualize_z(z.data, dir=args.vis_dir)
36 | print "visualizing z"
37 | util.visualize_labeled_z(z.data, label_ids.data, dir=args.vis_dir)
38 | print "visualizing labeled z"


--------------------------------------------------------------------------------
/M2/analogy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time, pylab
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | import matplotlib.patches as mpatches
 6 | sys.path.append(os.path.split(os.getcwd())[0])
 7 | import util
 8 | from args import args
 9 | from model import conf, vae
10 | from vae_m2 import GaussianM2VAE
11 | 
# Create the output directory for the figure. An OSError (e.g. the directory
# already exists) is tolerated; other exceptions are no longer swallowed.
try:
	os.mkdir(args.vis_dir)
except OSError:
	pass

# Tell the loader which pixel distribution the model uses.
dist = "bernoulli"
if isinstance(vae, GaussianM2VAE):
	dist = "gaussian"
dataset = util.load_images(args.test_image_dir, dist=dist)

n_analogies = 10
n_image_channels = 1
image_width = 28
image_height = 28
# Grid layout: one row per source image. Column 1 shows the source image,
# column 2 is left empty, and the remaining ndim_y columns show one decoded
# image per one-hot label. The row stride is derived from ndim_y instead of
# the former hard-coded 12, which was only correct for ndim_y == 10.
n_columns = conf.ndim_y + 2

# Sample exactly as many images as there are rows (was a separate literal 10).
x = util.sample_x_variable(n_analogies, conf.ndim_x, dataset, gpu_enabled=conf.gpu_enabled)
y = vae.sample_x_y(x, test=True)
z = vae.encode_xy_z(x, y, test=True)

fig = pylab.gcf()
fig.set_size_inches(16.0, 16.0)
pylab.clf()
if n_image_channels == 1:
	pylab.gray()
# xp is the array module used to build the decoder inputs below.
xp = np
if conf.gpu_enabled:
	x.to_cpu()
	xp = cuda.cupy

# First column: the sampled source images.
for m in xrange(n_analogies):
	pylab.subplot(n_analogies, n_columns, m * n_columns + 1)
	if n_image_channels == 1:
		pylab.imshow(x.data[m].reshape((image_width, image_height)), interpolation="none")
	elif n_image_channels == 3:
		pylab.imshow(x.data[m].reshape((n_image_channels, image_width, image_height)), interpolation="none")
	pylab.axis("off")

# Identity matrix: row n is the one-hot vector for label n.
all_y = xp.identity(conf.ndim_y, dtype=xp.float32)
all_y = Variable(all_y)
for m in xrange(n_analogies):
	# Repeat the m-th z once per label so it is decoded under every label.
	base_z = xp.empty((conf.ndim_y, z.data.shape[1]), dtype=xp.float32)
	for n in xrange(conf.ndim_y):
		base_z[n] = z.data[m]
	base_z = Variable(base_z)
	_x = vae.decode_zy_x(base_z, all_y, test=True, apply_f=True)
	if conf.gpu_enabled:
		_x.to_cpu()
	for n in xrange(conf.ndim_y):
		pylab.subplot(n_analogies, n_columns, m * n_columns + 3 + n)
		if n_image_channels == 1:
			pylab.imshow(_x.data[n].reshape((image_width, image_height)), interpolation="none")
		elif n_image_channels == 3:
			pylab.imshow(_x.data[n].reshape((n_image_channels, image_width, image_height)), interpolation="none")
		pylab.axis("off")

pylab.savefig("{:s}/analogy.png".format(args.vis_dir))
65 | 
66 | 


--------------------------------------------------------------------------------
/M2/model.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from args import args
 3 | from vae_m2 import BernoulliM2VAE, GaussianM2VAE, Conf
 4 | 
 5 | conf = Conf()
 6 | conf.gpu_enabled = True if args.gpu_enabled == 1 else False
 7 | conf.ndim_z = 50
 8 | conf.encoder_xy_z_apply_dropout = False
 9 | conf.encoder_x_y_apply_dropout = False
10 | conf.decoder_apply_dropout = False
11 | conf.encoder_xy_z_apply_batchnorm_to_input = True
12 | conf.encoder_x_y_apply_batchnorm_to_input = True
13 | conf.decoder_apply_batchnorm_to_input = True
14 | conf.encoder_xy_z_apply_batchnorm = True
15 | conf.encoder_x_y_apply_batchnorm = True
16 | conf.decoder_apply_batchnorm = True
17 | conf.encoder_xy_z_hidden_units = [500]
18 | conf.encoder_x_y_hidden_units = [500]
19 | conf.decoder_hidden_units = [500]
20 | conf.batchnorm_before_activation = True if args.batchnorm_before_activation == 1 else False
21 | 
22 | if args.vae_type == "gaussian":
23 | 	vae = GaussianM2VAE(conf, name="m2")
24 | elif args.vae_type == "bernoulli":
25 | 	vae = BernoulliM2VAE(conf, name="m2")
26 | else:
27 | 	raise Exception()
28 | 	
29 | vae.load(args.model_dir)


--------------------------------------------------------------------------------
/M2/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | sys.path.append(os.path.split(os.getcwd())[0])
 6 | import util
 7 | from args import args
 8 | from model import conf, vae
 9 | from vae_m2 import GaussianM2VAE
10 | 
11 | dist = "bernoulli"
12 | if isinstance(vae, GaussianM2VAE):
13 | 	dist = "gaussian"
14 | dataset, labels = util.load_labeled_images(args.test_image_dir, dist=dist)
15 | num_data = len(dataset)
16 | 
17 | x_labeled, _, label_ids = util.sample_x_and_label_variables(num_data, conf.ndim_x, conf.ndim_y, dataset, labels, gpu_enabled=False)
18 | if conf.gpu_enabled:
19 | 	x_labeled.to_gpu()
20 | prediction = vae.sample_x_label(x_labeled, test=True, argmax=True)
21 | correct = 0
22 | for i in xrange(num_data):
23 | 	if prediction[i] == label_ids.data[i]:
24 | 		correct += 1
25 | print "test:: classification accuracy: {:f}".format(correct / float(num_data))


--------------------------------------------------------------------------------
/M2/train.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | import pandas as pd
 6 | sys.path.append(os.path.split(os.getcwd())[0])
 7 | import util
 8 | from args import args
 9 | from model import conf, vae
10 | from vae_m2 import GaussianM2VAE
11 | 
12 | dist = "bernoulli"
13 | if isinstance(vae, GaussianM2VAE):
14 | 	dist = "gaussian"
15 | dataset, labels = util.load_labeled_images(args.train_image_dir, dist=dist)
16 | 
17 | max_epoch = 1000
18 | num_trains_per_epoch = 2000
19 | batchsize_l = 100
20 | batchsize_u = 100
21 | 
22 | # Create labeled/unlabeled split in training set
23 | num_types_of_label = 10
24 | num_labeled_data = args.num_labeled_data
25 | num_validation_data = 10000
26 | labeled_dataset, labels, unlabeled_dataset, validation_dataset, validation_labels = util.create_semisupervised(dataset, labels, num_validation_data, num_labeled_data, num_types_of_label)
27 | print "labels:", labels
28 | alpha = 0.1 * len(dataset) / len(labeled_dataset)
29 | alpha = 1.0
30 | print "alpha:", alpha
31 | print "dataset:: labeled: {:d} unlabeled: {:d} validation: {:d}".format(len(labeled_dataset), len(unlabeled_dataset), len(validation_dataset))
32 | 
33 | if num_labeled_data < batchsize_l:
34 | 	batchsize_l = num_labeled_data
35 | 	
36 | if len(unlabeled_dataset) < batchsize_u:
37 | 	batchsize_u = len(unlabeled_dataset)
38 | 
39 | # from PIL import Image
40 | # for i in xrange(len(labeled_dataset)):
41 | # 	image = Image.fromarray(np.uint8(labeled_dataset[i].reshape(28, 28) * 255))
42 | # 	image.save("labeled_images/{:d}.bmp".format(i))
43 | 
44 | # Export result to csv
45 | csv_epoch = []
46 | 
47 | total_time = 0
48 | for epoch in xrange(max_epoch):
49 | 	sum_loss_labeled = 0
50 | 	sum_loss_unlabeled = 0
51 | 	sum_loss_classifier = 0
52 | 	epoch_time = time.time()
53 | 	for t in xrange(num_trains_per_epoch):
54 | 		x_labeled, y_labeled, label_ids = util.sample_x_and_label_variables(batchsize_l, conf.ndim_x, conf.ndim_y, labeled_dataset, labels, gpu_enabled=conf.gpu_enabled)
55 | 		x_unlabeled = util.sample_x_variable(batchsize_u, conf.ndim_x, unlabeled_dataset, gpu_enabled=conf.gpu_enabled)
56 | 
57 | 		# train
58 | 		loss_labeled, loss_unlabeled = vae.train(x_labeled, y_labeled, label_ids, x_unlabeled)
59 | 		loss_classifier = vae.train_classification(x_labeled, label_ids, alpha=alpha)
60 | 
61 | 		sum_loss_labeled += loss_labeled
62 | 		sum_loss_unlabeled += loss_unlabeled
63 | 		sum_loss_classifier += loss_classifier
64 | 		if t % 10 == 0:
65 | 			sys.stdout.write("\rTraining in progress...({:d} / {:d})".format(t, num_trains_per_epoch))
66 | 			sys.stdout.flush()
67 | 	epoch_time = time.time() - epoch_time
68 | 	total_time += epoch_time
69 | 	sys.stdout.write("\r")
70 | 	print "epoch: {:d} loss:: labeled: {:.3f} unlabeled: {:.3f} classifier: {:.3f} time: {:d} min total: {:d} min".format(epoch + 1, sum_loss_labeled / num_trains_per_epoch, sum_loss_unlabeled / num_trains_per_epoch, sum_loss_classifier / num_trains_per_epoch, int(epoch_time / 60), int(total_time / 60))
71 | 	sys.stdout.flush()
72 | 	vae.save(args.model_dir)
73 | 
74 | 	# validation
75 | 	x_labeled, _, label_ids = util.sample_x_and_label_variables(num_validation_data, conf.ndim_x, conf.ndim_y, validation_dataset, validation_labels, gpu_enabled=False)
76 | 	if conf.gpu_enabled:
77 | 		x_labeled.to_gpu()
78 | 	prediction = vae.sample_x_label(x_labeled, test=True, argmax=True)
79 | 	correct = 0
80 | 	for i in xrange(num_validation_data):
81 | 		if prediction[i] == label_ids.data[i]:
82 | 			correct += 1
83 | 	print "validation:: classification accuracy: {:f}".format(correct / float(num_validation_data))
84 | 
85 | 	# Export to csv
86 | 	csv_epoch.append([epoch, int(total_time / 60), correct / float(num_validation_data)])
87 | 	data = pd.DataFrame(csv_epoch)
88 | 	data.columns = ["epoch", "min", "accuracy"]
89 | 	data.to_csv("{:s}/epoch.csv".format(args.model_dir))
90 | 
91 | 


--------------------------------------------------------------------------------
/M2/visualize.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os, sys, time, pylab
 3 | import numpy as np
 4 | from chainer import cuda, Variable
 5 | import matplotlib.patches as mpatches
 6 | sys.path.append(os.path.split(os.getcwd())[0])
 7 | import util
 8 | from args import args
 9 | from model import conf, vae
10 | from vae_m2 import GaussianM2VAE
11 | 
# Create the output directory. An OSError (e.g. the directory already exists)
# is tolerated; other exceptions are no longer silently swallowed.
try:
	os.mkdir(args.vis_dir)
except OSError:
	pass
# Tell the loader which pixel distribution the model uses.
dist = "bernoulli"
if isinstance(vae, GaussianM2VAE):
	dist = "gaussian"
dataset, labels = util.load_labeled_images(args.test_image_dir, dist=dist)
20 | 
def forward_one_step(num_images):
	"""Sample a labeled batch, encode it with the M2 model and decode it again.

	Args:
		num_images: batch size passed to util.sample_x_and_label_variables.

	Returns:
		A tuple (z, _x, label_ids): z is the object returned by
		vae.encode_xy_z (moved to the CPU when the GPU is enabled), _x is the
		`.data` of the decoder output, and label_ids is returned as sampled.
	"""
	x, _, label_ids = util.sample_x_and_label_variables(num_images, conf.ndim_x, conf.ndim_y, dataset, labels, gpu_enabled=False)
	# Only move the batch to the GPU when it is enabled; the unconditional
	# x.to_gpu() was at odds with the guarded transfers in the other scripts.
	if conf.gpu_enabled:
		x.to_gpu()
	y = vae.sample_x_y(x, test=True)
	# encode_xy_z is the method M2/analogy.py calls on the same model object;
	# the previous spelling here was `encoder_xy_z`.
	z = vae.encode_xy_z(x, y, test=True)
	# NOTE(review): M2/analogy.py passes apply_f=True to decode_zy_x but
	# this call does not -- confirm which output is wanted here.
	_x = vae.decode_zy_x(z, y, test=True)
	if conf.gpu_enabled:
		z.to_cpu()
		_x.to_cpu()
	return z, _x.data, label_ids
32 | 
# Reconstructions are drawn from a small batch of 100 images.
small_batch = forward_one_step(100)
z, _x = small_batch[0], small_batch[1]
util.visualize_x(_x, dir=args.vis_dir)

# The latent-space plots use a larger batch of 5000 images.
z, _x, label_ids = forward_one_step(5000)
latent = z.data
util.visualize_z(latent, dir=args.vis_dir)
util.visualize_labeled_z(latent, label_ids.data, dir=args.vis_dir)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## Semi-Supervised Learning with Deep Generative Models
  2 | 
  3 | Chainer implementation of the Variational AutoEncoder (VAE) models M1, M2, and M1+M2.
  4 | 
  5 | [この記事](http://musyoku.github.io/2016/07/02/semi-supervised-learning-with-deep-generative-models/)で実装したコードです。
  6 | 
  7 | ### Requirements
  8 | 
  9 | - Chainer 1.8+
 10 | - sklearn
 11 | 
 12 | To visualize results, you need
 13 | 
 14 | - matplotlib.patches
 15 | - PIL
 16 | - pandas
 17 | 
 18 | #### Download MNIST
 19 | 
 20 | Run `mnist_tools.py` to download and extract MNIST.
 21 | 
 22 | #### How to label my own dataset? 
 23 | 
 24 | You can provide label information by filename.
 25 | 
 26 | format:
 27 | 
 28 | `{label_id}_{unique_filename}.{extension}`
 29 | 
 30 | regex:
 31 | 
 32 | `([0-9]+)_.+\.(bmp|png|jpg)`
 33 | 
 34 | e.g. MNIST
 35 | 
 36 | ![labeling](http://musyoku.github.io/images/post/2016-07-02/labeling.png)
 37 | 
 38 | 
 39 | ## M1
 40 | 
 41 | #### Parameters
 42 | 
 43 | | params | value |
 44 | |:-----------|------------:|
 45 | | OS | Windows 7 |
 46 | | GPU | GeForce GTX 970M |
 47 | | ndim_z | 2 |
 48 | | encoder_apply_dropout | False |
 49 | | decoder_apply_dropout | False |
 50 | | encoder_apply_batchnorm | True |
 51 | | decoder_apply_batchnorm | True |
 52 | | encoder_apply_batchnorm_to_input | True |
 53 | | decoder_apply_batchnorm_to_input | True |
 54 | | encoder_units | [600, 600] |
 55 | | decoder_units | [600, 600] |
 56 | | gradient_clipping | 1.0 |
 57 | | learning_rate | 0.0003 |
 58 | | gradient_momentum | 0.9 |
 59 | | gradient_clipping | 1.0 |
 60 | | nonlinear | softplus|
 61 | 
 62 | #### Result
 63 | 
 64 | ##### Latent space
 65 | 
 66 | ![M1](http://musyoku.github.io/images/post/2016-07-02/m1_latent_space.png)
 67 | 
 68 | ## M2
 69 | 
 70 | ##### Parameters
 71 | 
 72 | | params | value |
 73 | |:-----------|------------:|
 74 | | OS | Windows 7 |
 75 | | GPU | GeForce GTX 970M |
 76 | | ndim_z | 50 |
 77 | | encoder_xy_z_apply_dropout | False |
 78 | | encoder_x_y_apply_dropout | False |
 79 | | decoder_apply_dropout | False |
 80 | | encoder_xy_z_apply_batchnorm_to_input | True |
 81 | | encoder_x_y_apply_batchnorm_to_input | True |
 82 | | decoder_apply_batchnorm_to_input | True |
 83 | | encoder_xy_z_apply_batchnorm | True |
 84 | | encoder_x_y_apply_batchnorm | True |
 85 | | decoder_apply_batchnorm | True |
 86 | | encoder_xy_z_hidden_units | [500] |
 87 | | encoder_x_y_hidden_units | [500] |
 88 | | decoder_hidden_units | [500] |
 89 | | batchnorm_before_activation | True |
 90 | | gradient_clipping | 5.0 |
 91 | | learning_rate | 0.0003 |
 92 | | gradient_momentum | 0.9 |
 93 | | gradient_clipping | 1.0 |
 94 | | nonlinear | softplus|
 95 | 
 96 | #### Result
 97 | 
 98 | ##### Classification
 99 | 
100 | ######  Training details
101 | 
102 | | data | # |
103 | |:-----------|------------:|
104 | | labeled | 100 |
105 | | unlabeled | 49900 |
106 | | validation | 10000 |
107 | | test | 10000 |
108 | 
109 | | * | # |
110 | |:-----------|------------:|
111 | | epochs | 490 |
112 | | minutes | 1412 |
113 | | weight updates per epoch | 2000 |
114 | 
115 | ###### Validation accuracy:
116 | 
117 | ![M2](http://musyoku.github.io/images/post/2016-07-02/m2_validation_accuracy.png)
118 | 
119 | ###### Test accuracy: **0.9018**
120 | 
121 | ##### Analogies
122 | 
123 | Run `analogy.py` after training.
124 | 
125 | Model was trained with...
126 | 
127 | | data | # |
128 | |:-----------|------------:|
129 | | labeled | 100 |
130 | | unlabeled | 49900 |
131 | 
132 | ![M2](http://musyoku.github.io/images/post/2016-07-02/m2_analogy_100.png)
133 | 
134 | | data | # |
135 | |:-----------|------------:|
136 | | labeled | 10000 |
137 | | unlabeled | 40000 |
138 | 
139 | ![M2](http://musyoku.github.io/images/post/2016-07-02/m2_analogy_10000.png)
140 | 
141 | | data | # |
142 | |:-----------|------------:|
143 | | labeled | 50000 |
144 | | unlabeled | 0 |
145 | 
146 | ![M2](http://musyoku.github.io/images/post/2016-07-02/m2_analogy_50000.png)
147 | 
148 | 
149 | ## M1+M2
150 | 
151 | ##### Parameters
152 | 
153 | ##### M1
154 | 
155 | | params | value |
156 | |:-----------|------------:|
157 | | OS | Windows 7 |
158 | | GPU | GeForce GTX 970M |
159 | | ndim_z | 50 |
160 | | encoder_apply_dropout | False |
161 | | decoder_apply_dropout | False |
162 | | encoder_apply_batchnorm | True |
163 | | decoder_apply_batchnorm | True |
164 | | encoder_apply_batchnorm_to_input | True |
165 | | decoder_apply_batchnorm_to_input | True |
166 | | encoder_units | [600, 600] |
167 | | decoder_units | [600, 600] |
168 | | gradient_clipping | 1.0 |
169 | | learning_rate | 0.0003 |
170 | | gradient_momentum | 0.9 |
171 | | gradient_clipping | 1.0 |
172 | | nonlinear | softplus|
173 | 
174 | We trained M1 for 500 epochs before starting training of M2.
175 | 
176 | | * | # |
177 | |:-----------|------------:|
178 | | epochs | 500 |
179 | | minutes | 860 |
180 | | weight updates per epoch | 2000 |
181 | 
182 | ##### M2
183 | 
184 | | params | value |
185 | |:-----------|------------:|
186 | | OS | Windows 7 |
187 | | GPU | GeForce GTX 970M |
188 | | ndim_z | 50 |
189 | | encoder_xy_z_apply_dropout | False |
190 | | encoder_x_y_apply_dropout | False |
191 | | decoder_apply_dropout | False |
192 | | encoder_xy_z_apply_batchnorm_to_input | True |
193 | | encoder_x_y_apply_batchnorm_to_input | True |
194 | | decoder_apply_batchnorm_to_input | True |
195 | | encoder_xy_z_apply_batchnorm | True |
196 | | encoder_x_y_apply_batchnorm | True |
197 | | decoder_apply_batchnorm | True |
198 | | encoder_xy_z_hidden_units | [500] |
199 | | encoder_x_y_hidden_units | [500] |
200 | | decoder_hidden_units | [500] |
201 | | batchnorm_before_activation | True |
202 | | gradient_clipping | 5.0 |
203 | | learning_rate | 0.0003 |
204 | | gradient_momentum | 0.9 |
205 | | gradient_clipping | 1.0 |
206 | | nonlinear | softplus |
207 | 
208 | #### Result
209 | 
210 | ##### Classification
211 | 
212 | ###### Training details
213 | 
214 | | data | # |
215 | |:-----------|------------:|
216 | | labeled | 100 |
217 | | unlabeled | 49900 |
218 | | validation | 10000 |
219 | | test | 10000 |
220 | 
221 | | * | # |
222 | |:-----------|------------:|
223 | | epochs | 600 |
224 | | minutes | 4920 |
225 | | weight updates per epoch | 5000 |
226 | 
227 | ###### Validation accuracy:
228 | 
229 | ![M1+M2](http://musyoku.github.io/images/post/2016-07-02/m1+m2_validation_accuracy.png)
230 | 
231 | ###### Test accuracy
232 | 
233 | seed1: **0.954**
234 | 
235 | seed2: **0.951**
236 | 
237 | 


--------------------------------------------------------------------------------
/args.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import argparse
 3 | 
 4 | # Arguments
 5 | parser = argparse.ArgumentParser()
 6 | parser.add_argument("--gpu_enabled", type=int, default=1)
 7 | parser.add_argument("--train_image_dir", type=str, default="../train_images")
 8 | parser.add_argument("--test_image_dir", type=str, default="../test_images")
 9 | parser.add_argument("--model_dir", type=str, default="model")
10 | parser.add_argument("--vis_dir", type=str, default="visualization")
11 | parser.add_argument("--vae_type", type=str, default="bernoulli")
12 | parser.add_argument("--num_labeled_data", type=int, default=100)
13 | parser.add_argument("--batchnorm_before_activation", type=int, default=1)
14 | args = parser.parse_args()


--------------------------------------------------------------------------------
/mnist_tools.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import gzip, os, six
 3 | from six.moves.urllib import request
 4 | from PIL import Image
 5 | import numpy as np
 6 | 
 7 | parent = "http://yann.lecun.com/exdb/mnist"
 8 | train_images_filename = "train-images-idx3-ubyte.gz"
 9 | train_labels_filename = "train-labels-idx1-ubyte.gz"
10 | test_images_filename = "t10k-images-idx3-ubyte.gz"
11 | test_labels_filename = "t10k-labels-idx1-ubyte.gz"
12 | n_train = 60000
13 | n_test = 10000
14 | dim = 28 * 28
15 | 
16 | train_dir = "train_images"
17 | test_dir = "test_images"
18 | 
19 | try:
20 | 	os.mkdir(train_dir)
21 | 	os.mkdir(test_dir)
22 | except:
23 | 	pass
24 | 
def load_mnist(data_filename, label_filename, num):
	"""Read `num` images and labels from gzip-compressed MNIST files.

	Args:
		data_filename: path of the gzip archive holding the images.
		label_filename: path of the gzip archive holding the labels.
		num: number of samples to read from both archives.

	Returns:
		A tuple `(data, label)`: `data` is a uint8 array of shape
		`(num, dim)` and `label` is a uint8 array of shape `(num,)`.

	Raises:
		ValueError: if either archive holds fewer than `num` samples.
	"""
	with gzip.open(data_filename, "rb") as f_images, gzip.open(label_filename, "rb") as f_labels:
		# Skip the headers: 16 bytes for the image file, 8 for the label file.
		f_images.read(16)
		f_labels.read(8)
		# Read each payload in one call instead of one byte at a time.
		image_bytes = f_images.read(num * dim)
		label_bytes = f_labels.read(num)

	if len(image_bytes) != num * dim:
		raise ValueError("{} holds fewer than {} images".format(data_filename, num))
	if len(label_bytes) != num:
		raise ValueError("{} holds fewer than {} labels".format(label_filename, num))

	# np.frombuffer returns a read-only view of the bytes; copy so that callers
	# get ordinary writable arrays, as before.
	data = np.frombuffer(image_bytes, dtype=np.uint8).reshape((num, dim)).copy()
	label = np.frombuffer(label_bytes, dtype=np.uint8).copy()
	return data, label
38 | 
def download_mnist_data():
	"""Download the four MNIST archives from `parent`, saving each under its own name."""
	archives = (
		train_images_filename,
		train_labels_filename,
		test_images_filename,
		test_labels_filename,
	)
	for archive in archives:
		print("Downloading {}...".format(archive))
		request.urlretrieve("{}/{}".format(parent, archive), archive)
	print("Done")
49 | 
def extract_mnist_data():
	"""Load the MNIST training and test sets, downloading the archives if needed.

	Returns:
		A tuple `(data_train, label_train, data_test, label_test)` as
		produced by `load_mnist`.
	"""
	archives = (
		train_images_filename,
		train_labels_filename,
		test_images_filename,
		test_labels_filename,
	)
	# Download when any archive is missing; checking only the first one left a
	# partially downloaded set undetected until load_mnist failed to open a file.
	if not all(os.path.exists(archive) for archive in archives):
		download_mnist_data()
	print("Extracting training data...")
	data_train, label_train = load_mnist(train_images_filename, train_labels_filename, n_train)
	print("Extracting test data...")
	data_test, label_test = load_mnist(test_images_filename, test_labels_filename, n_test)
	print("Done")
	return data_train, label_train, data_test, label_test
59 | 
def _save_images(data, labels, out_dir):
	"""Write every row of `data` as a 28x28 bitmap named "<label>_<index>.bmp" in `out_dir`."""
	for i in six.moves.range(data.shape[0]):
		image = Image.fromarray(data[i].reshape(28, 28))
		image.save("{}/{}_{}.bmp".format(out_dir, labels[i], i))

# Script body: extract the data sets and dump them as image files.
# print() and six.moves.range are used, like in the rest of this file, so the
# script runs on both Python 2 and Python 3.
data_train, label_train, data_test, label_test = extract_mnist_data()
print("Saving training images...")
_save_images(data_train, label_train, train_dir)
print("Saving test images...")
_save_images(data_test, label_test, test_dir)
print("Done")
70 | 	
71 | 


--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os, re, math, pylab, sys
  3 | from math import *
  4 | import numpy as np
  5 | from StringIO import StringIO
  6 | from PIL import Image
  7 | from chainer import cuda, Variable, function
  8 | from chainer.utils import type_check
  9 | from sklearn import preprocessing
 10 | import matplotlib.patches as mpatches
 11 | 
 12 | def load_images(image_dir, convert_to_grayscale=True, dist="bernoulli"):
 13 | 	dataset = []
 14 | 	fs = os.listdir(image_dir)
 15 | 	print "loading", len(fs), "images..."
 16 | 	for fn in fs:
 17 | 		f = open("%s/%s" % (image_dir, fn), "rb")
 18 | 		if convert_to_grayscale:
 19 | 			img = np.asarray(Image.open(StringIO(f.read())).convert("L"), dtype=np.float32) / 255.0
 20 | 		else:
 21 | 			img = np.asarray(Image.open(StringIO(f.read())).convert("RGB"), dtype=np.float32).transpose(2, 0, 1) / 255.0
 22 | 		if dist == "bernoulli":
 23 | 			# Sampling
 24 | 			img = preprocessing.binarize(img, threshold=0.5)
 25 | 			pass
 26 | 		elif dist == "gaussian":
 27 | 			pass
 28 | 		else:
 29 | 			raise Exception()
 30 | 		dataset.append(img)
 31 | 		f.close()
 32 | 	return dataset
 33 | 
 34 | def load_labeled_images(image_dir, convert_to_grayscale=True, dist="bernoulli"):
 35 | 	dataset = []
 36 | 	labels = []
 37 | 	fs = os.listdir(image_dir)
 38 | 	i = 0
 39 | 	for fn in fs:
 40 | 		m = re.match("([0-9]+)_.+", fn)
 41 | 		label = int(m.group(1))
 42 | 		f = open("%s/%s" % (image_dir, fn), "rb")
 43 | 		if convert_to_grayscale:
 44 | 			img = np.asarray(Image.open(StringIO(f.read())).convert("L"), dtype=np.float32) / 255.0
 45 | 		else:
 46 | 			img = np.asarray(Image.open(StringIO(f.read())).convert("RGB"), dtype=np.float32).transpose(2, 0, 1) / 255.0
 47 | 		if dist == "bernoulli":
 48 | 			# Sampling
 49 | 			img = preprocessing.binarize(img, threshold=0.5)
 50 | 			pass
 51 | 		elif dist == "gaussian":
 52 | 			pass
 53 | 		else:
 54 | 			raise Exception()
 55 | 		dataset.append(img)
 56 | 		labels.append(label)
 57 | 		f.close()
 58 | 		i += 1
 59 | 		if i % 100 == 0:
 60 | 			sys.stdout.write("\rloading images...({:d} / {:d})".format(i, len(fs)))
 61 | 			sys.stdout.flush()
 62 | 	sys.stdout.write("\n")
 63 | 	return dataset, labels
 64 | 
 65 | def create_semisupervised(dataset, labels, num_validation_data=10000, num_labeled_data=100, num_types_of_label=10):
 66 | 	if len(dataset) < num_validation_data + num_labeled_data:
 67 | 		raise Exception("len(dataset) < num_validation_data + num_labeled_data")
 68 | 	training_labeled_x = []
 69 | 	training_unlabeled_x = []
 70 | 	validation_x = []
 71 | 	validation_labels = []
 72 | 	training_labels = []
 73 | 	indices_for_label = {}
 74 | 	num_data_per_label = int(num_labeled_data / num_types_of_label)
 75 | 	num_unlabeled_data = len(dataset) - num_validation_data - num_labeled_data
 76 | 
 77 | 	indices = np.arange(len(dataset))
 78 | 	np.random.shuffle(indices)
 79 | 
 80 | 	def check(index):
 81 | 		label = labels[index]
 82 | 		if label not in indices_for_label:
 83 | 			indices_for_label[label] = []
 84 | 			return True
 85 | 		if len(indices_for_label[label]) < num_data_per_label:
 86 | 			for i in indices_for_label[label]:
 87 | 				if i == index:
 88 | 					return False
 89 | 			return True
 90 | 		return False
 91 | 
 92 | 	for n in xrange(len(dataset)):
 93 | 		index = indices[n]
 94 | 		if check(index):
 95 | 			indices_for_label[labels[index]].append(index)
 96 | 			training_labeled_x.append(dataset[index])
 97 | 			training_labels.append(labels[index])
 98 | 		else:
 99 | 			if len(training_unlabeled_x) < num_unlabeled_data:
100 | 				training_unlabeled_x.append(dataset[index])
101 | 			else:
102 | 				validation_x.append(dataset[index])
103 | 				validation_labels.append(labels[index])
104 | 
105 | 	return training_labeled_x, training_labels, training_unlabeled_x, validation_x, validation_labels
106 | 
def sample_x_variable(batchsize, ndim_x, dataset, gpu_enabled=True):
	"""Draw `batchsize` distinct samples from `dataset` as one chainer Variable.

	Each chosen sample is flattened to `ndim_x` values and stored as a row of
	a float32 `(batchsize, ndim_x)` array. The Variable is moved to the GPU
	when `gpu_enabled` is true.
	"""
	# Sampling without replacement: no sample appears twice in a batch.
	chosen = np.random.choice(np.arange(len(dataset), dtype=np.int32), size=batchsize, replace=False)
	batch = np.zeros((batchsize, ndim_x), dtype=np.float32)
	for row, data_index in enumerate(chosen):
		batch[row] = dataset[data_index].reshape((ndim_x,))
	variable = Variable(batch)
	if gpu_enabled:
		variable.to_gpu()
	return variable
118 | 
def sample_x_and_label_variables(batchsize, ndim_x, ndim_y, dataset, labels, gpu_enabled=True):
	"""Draw `batchsize` distinct samples together with their labels.

	Returns three chainer Variables: the flattened images as a float32
	`(batchsize, ndim_x)` array, the labels one-hot encoded as a float32
	`(batchsize, ndim_y)` array, and the label ids as an int32 `(batchsize,)`
	array. All three are moved to the GPU when `gpu_enabled` is true.
	"""
	# Sampling without replacement: no sample appears twice in a batch.
	chosen = np.random.choice(np.arange(len(dataset), dtype=np.int32), size=batchsize, replace=False)
	images = np.zeros((batchsize, ndim_x), dtype=np.float32)
	one_hot = np.zeros((batchsize, ndim_y), dtype=np.float32)
	label_ids = np.zeros((batchsize,), dtype=np.int32)
	for row, data_index in enumerate(chosen):
		label = labels[data_index]
		images[row] = dataset[data_index].reshape((ndim_x,))
		one_hot[row, label] = 1
		label_ids[row] = label
	variables = (Variable(images), Variable(one_hot), Variable(label_ids))
	if gpu_enabled:
		for variable in variables:
			variable.to_gpu()
	return variables
140 | 
def visualize_x(reconstructed_x_batch, image_width=28, image_height=28, image_channel=1, dir=None):
	"""Save up to 100 images of a batch as a 10x10 grid in `<dir>/reconstructed_x.png`.

	Args:
		reconstructed_x_batch: array whose rows are flattened images.
		image_width: image width in pixels.
		image_height: image height in pixels.
		image_channel: 1 for grayscale or 3 for color images.
		dir: output directory; created if it does not exist.

	Raises:
		Exception: if `dir` is None.
	"""
	if dir is None:
		raise Exception()
	if not os.path.isdir(dir):
		os.mkdir(dir)
	fig = pylab.gcf()
	fig.set_size_inches(16.0, 16.0)
	pylab.clf()
	if image_channel == 1:
		pylab.gray()
	# Never index past the end of a batch that holds fewer than 100 images.
	n_images = min(100, len(reconstructed_x_batch))
	for m in range(n_images):
		pylab.subplot(10, 10, m + 1)
		if image_channel == 1:
			# Image arrays are (rows, columns), i.e. (height, width).
			pylab.imshow(reconstructed_x_batch[m].reshape((image_height, image_width)), interpolation="none")
		elif image_channel == 3:
			# load_images stores color images channel-first, while imshow
			# expects the channel axis last.
			image = reconstructed_x_batch[m].reshape((image_channel, image_height, image_width))
			pylab.imshow(image.transpose(1, 2, 0), interpolation="none")
		pylab.axis("off")
	pylab.savefig("%s/reconstructed_x.png" % dir)
161 | 
def visualize_z(z_batch, dir=None):
	"""Scatter-plot the first two columns of `z_batch` into `<dir>/latent_code.png`.

	Args:
		z_batch: 2-D array with at least two columns; one row per point.
		dir: output directory; created if it does not exist.

	Raises:
		Exception: if `dir` is None.
	"""
	if dir is None:
		raise Exception()
	if not os.path.isdir(dir):
		os.mkdir(dir)
	fig = pylab.gcf()
	fig.set_size_inches(20.0, 16.0)
	pylab.clf()
	# One scatter call per point, as before, so the rendering is unchanged.
	for n in range(z_batch.shape[0]):
		pylab.scatter(z_batch[n, 0], z_batch[n, 1], s=40, marker="o", edgecolors='none')
	pylab.xlabel("z1")
	pylab.ylabel("z2")
	pylab.savefig("%s/latent_code.png" % dir)
177 | 
def visualize_labeled_z(z_batch, label_batch, dir=None):
	"""Scatter-plot the first two columns of `z_batch`, colored by label.

	The figure is written to `<dir>/labeled_z.png` with a legend for the ten
	classes "0" to "9".

	Args:
		z_batch: 2-D array with at least two columns; one row per point.
		label_batch: label of each row, used as an index into the color table.
		dir: output directory; created if it does not exist.

	Raises:
		Exception: if `dir` is None.
	"""
	# Same precondition and directory handling as visualize_x and visualize_z;
	# without it a missing `dir` was formatted into the path as "None".
	if dir is None:
		raise Exception()
	if not os.path.isdir(dir):
		os.mkdir(dir)
	fig = pylab.gcf()
	fig.set_size_inches(20.0, 16.0)
	pylab.clf()
	colors = ["#2103c8", "#0e960e", "#e40402","#05aaa8","#ac02ab","#aba808","#151515","#94a169", "#bec9cd", "#6a6551"]
	for n in range(z_batch.shape[0]):
		pylab.scatter(z_batch[n, 0], z_batch[n, 1], c=colors[label_batch[n]], s=40, marker="o", edgecolors='none')

	# Legend: one colored rectangle per class.
	classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
	recs = [mpatches.Rectangle((0, 0), 1, 1, fc=color) for color in colors]

	# Shrink the axes to 80% of their width to leave room for the legend.
	ax = pylab.subplot(111)
	box = ax.get_position()
	ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
	ax.legend(recs, classes, loc="center left", bbox_to_anchor=(1.1, 0.5))
	pylab.xticks(pylab.arange(-4, 5))
	pylab.yticks(pylab.arange(-4, 5))
	pylab.xlabel("z1")
	pylab.ylabel("z2")
	pylab.savefig("%s/labeled_z.png" % dir)
200 | 


--------------------------------------------------------------------------------
/vae_m1.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import math
  3 | import numpy as np
  4 | import chainer, os, collections, six
  5 | from chainer import cuda, Variable, optimizers, serializers, optimizer
  6 | from chainer import functions as F
  7 | from chainer import links as L
  8 | 
  9 | activations = {
 10 | 	"sigmoid": F.sigmoid, 
 11 | 	"tanh": F.tanh, 
 12 | 	"softplus": F.softplus, 
 13 | 	"relu": F.relu, 
 14 | 	"leaky_relu": F.leaky_relu, 
 15 | 	"elu": F.elu
 16 | }
 17 | 
 18 | class Conf():
 19 | 	def __init__(self):
 20 | 		self.image_width = 28
 21 | 		self.image_height = 28
 22 | 		self.ndim_x = 28 * 28
 23 | 		self.ndim_z = 100
 24 | 		self.batchnorm_before_activation = True
 25 | 
 26 | 		# gaussianmarg | gaussian
 27 | 		# We recommend you to use "gaussianmarg" when decoder is gaussian.
 28 | 		self.type_pz = "gaussianmarg"
 29 | 		self.type_qz = "gaussianmarg"
 30 | 
 31 | 		# e.g.
 32 | 		# ndim_x (input) -> 2000 -> 1000 -> 100 (output)
 33 | 		# encoder_hidden_units = [2000, 1000]
 34 | 		self.encoder_hidden_units = [600, 600]
 35 | 		self.encoder_activation_function = "softplus"
 36 | 		self.encoder_apply_dropout = True
 37 | 		self.encoder_apply_batchnorm = True
 38 | 		self.encoder_apply_batchnorm_to_input = True
 39 | 
 40 | 		self.decoder_hidden_units = [600, 600]
 41 | 		self.decoder_activation_function = "softplus"
 42 | 		self.decoder_apply_dropout = True
 43 | 		self.decoder_apply_batchnorm = True
 44 | 		self.decoder_apply_batchnorm_to_input = True
 45 | 
 46 | 		self.gpu_enabled = True
 47 | 		self.learning_rate = 0.0003
 48 | 		self.gradient_momentum = 0.9
 49 | 		self.gradient_clipping = 1.0
 50 | 
 51 | 	def check(self):
 52 | 		pass
 53 | 
 54 | def sum_sqnorm(arr):
 55 | 	sq_sum = collections.defaultdict(float)
 56 | 	for x in arr:
 57 | 		with cuda.get_device(x) as dev:
 58 | 			x = x.ravel()
 59 | 			s = x.dot(x)
 60 | 			sq_sum[int(dev)] += s
 61 | 	return sum([float(i) for i in six.itervalues(sq_sum)])
 62 | 	
 63 | class GradientClipping(object):
 64 | 	name = "GradientClipping"
 65 | 
 66 | 	def __init__(self, threshold):
 67 | 		self.threshold = threshold
 68 | 
 69 | 	def __call__(self, opt):
 70 | 		norm = np.sqrt(sum_sqnorm([p.grad for p in opt.target.params()]))
 71 | 		if norm < 1:
 72 | 			return
 73 | 		rate = self.threshold / norm
 74 | 		if rate < 1:
 75 | 			for param in opt.target.params():
 76 | 				grad = param.grad
 77 | 				with cuda.get_device(grad):
 78 | 					grad = cuda.cupy.clip(grad, -self.threshold, self.threshold)
 79 | 
 80 | class VAE():
 81 | 	# name is used for the filename when you save the model
 82 | 	def __init__(self, conf, name="vae"):
 83 | 		conf.check()
 84 | 		self.encoder, self.decoder = self.build(conf)
 85 | 		self.name = name
 86 | 
 87 | 		self.optimizer_encoder = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
 88 | 		self.optimizer_encoder.setup(self.encoder)
 89 | 		# self.optimizer_encoder.add_hook(optimizer.WeightDecay(0.001))
 90 | 		self.optimizer_encoder.add_hook(GradientClipping(conf.gradient_clipping))
 91 | 
 92 | 		self.optimizer_decoder = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
 93 | 		self.optimizer_decoder.setup(self.decoder)
 94 | 		# self.optimizer_decoder.add_hook(optimizer.WeightDecay(0.001))
 95 | 		self.optimizer_decoder.add_hook(GradientClipping(conf.gradient_clipping))
 96 | 
 97 | 		self.type_pz = conf.type_pz
 98 | 		self.type_qz = conf.type_qz
 99 | 		
100 | 	def build(self, conf):
101 | 		raise Exception()
102 | 
103 | 	def train(self, x, L=1, test=False):
104 | 		raise Exception()
105 | 
106 | 	@property
107 | 	def xp(self):
108 | 		return self.encoder.xp
109 | 
110 | 	@property
111 | 	def gpu(self):
112 | 		if cuda.available is False:
113 | 			return False
114 | 		return True if self.xp is cuda.cupy else False
115 | 
116 | 	def zero_grads(self):
117 | 		self.optimizer_encoder.zero_grads()
118 | 		self.optimizer_decoder.zero_grads()
119 | 
120 | 	def update(self):
121 | 		self.optimizer_encoder.update()
122 | 		self.optimizer_decoder.update()
123 | 
124 | 	def bernoulli_nll_keepbatch(self, x, y):
125 | 		nll = F.softplus(y) - x * y
126 | 		return F.sum(nll, axis=1)
127 | 
128 | 	def gaussian_nll_keepbatch(self, x, mean, ln_var):
129 | 		x_prec = F.exp(-ln_var)
130 | 		x_diff = x - mean
131 | 		x_power = x_diff ** 2 * x_prec * 0.5
132 | 		return F.sum((math.log(2.0 * math.pi) + ln_var) * 0.5 + x_power, axis=1)
133 | 
134 | 	def gaussian_kl_divergence_keepbatch(self, mean, ln_var):
135 | 		var = F.exp(ln_var)
136 | 		kld = F.sum(mean ** 2 + var - ln_var - 1, axis=1) * 0.5
137 | 		return kld
138 | 
139 | 	def log_px_z(self, x, z, test=False):
140 | 		if isinstance(self.decoder, BernoulliDecoder):
141 | 			# do not apply F.sigmoid to the output of the decoder
142 | 			raw_output = self.decoder(z, test=test, apply_f=False)
143 | 			negative_log_likelihood = self.bernoulli_nll_keepbatch(x, raw_output)
144 | 			log_px_z = -negative_log_likelihood
145 | 		else:
146 | 			x_mean, x_ln_var = self.decoder(z, test=test, apply_f=False)
147 | 			negative_log_likelihood = self.gaussian_nll_keepbatch(x, x_mean, x_ln_var)
148 | 			log_px_z = -negative_log_likelihood
149 | 		return log_px_z
150 | 
151 | 	# this will not be used for bernoulli decoder
152 | 	def log_pz(self, z, mean, ln_var):
153 | 		if self.type_pz == "gaussianmarg":
154 | 			# \int q(z)logp(z)dz = -(J/2)*log2pi - (1/2)*sum_{j=1}^{J} (mu^2 + var)
155 | 			# See Appendix B [Auto-Encoding Variational Bayes](http://arxiv.org/abs/1312.6114)
156 | 			# See https://github.com/dpkingma/nips14-ssl/blob/master/anglepy/models/VAE_YZ_X.py line 106
157 | 			log_pz = -0.5 * (math.log(2.0 * math.pi) + mean * mean + F.exp(ln_var))
158 | 		elif self.type_pz == "gaussian":
159 | 			log_pz = -0.5 * math.log(2.0 * math.pi) - 0.5 * z ** 2
160 | 		return F.sum(log_pz, axis=1)
161 | 
162 | 	# this will not be used for bernoulli decoder
163 | 	def log_qz_x(self, z, mean, ln_var):
164 | 		if self.type_qz == "gaussianmarg":
165 | 			# \int q(z)logq(z)dz = -(J/2)*log2pi - (1/2)*sum_{j=1}^{J} (1 + logvar)
166 | 			# See Appendix B [Auto-Encoding Variational Bayes](http://arxiv.org/abs/1312.6114)
167 | 			# See https://github.com/dpkingma/nips14-ssl/blob/master/anglepy/models/VAE_YZ_X.py line 118
168 | 			log_qz_x = -0.5 * F.sum((math.log(2.0 * math.pi) + 1 + ln_var), axis=1)
169 | 		elif self.type_qz == "gaussian":
170 | 			log_qz_x = -self.gaussian_nll_keepbatch(z, mean, ln_var)
171 | 		return log_qz_x
172 | 
173 | 	def load(self, dir=None):
174 | 		if dir is None:
175 | 			raise Exception()
176 | 		for attr in vars(self):
177 | 			prop = getattr(self, attr)
178 | 			if isinstance(prop, chainer.Chain) or isinstance(prop, chainer.optimizer.GradientMethod):
179 | 				filename = dir + "/%s_%s.hdf5" % (self.name, attr)
180 | 				if os.path.isfile(filename):
181 | 					serializers.load_hdf5(filename, prop)
182 | 				else:
183 | 					print filename, "missing."
184 | 		print "model loaded."
185 | 
186 | 	def save(self, dir=None):
187 | 		if dir is None:
188 | 			raise Exception()
189 | 		try:
190 | 			os.mkdir(dir)
191 | 		except:
192 | 			pass
193 | 		for attr in vars(self):
194 | 			prop = getattr(self, attr)
195 | 			if isinstance(prop, chainer.Chain) or isinstance(prop, chainer.optimizer.GradientMethod):
196 | 				serializers.save_hdf5(dir + "/%s_%s.hdf5" % (self.name, attr), prop)
197 | 		print "model saved."
198 | 
class GaussianM1VAE(VAE):
	"""M1 VAE whose decoder outputs the mean and log-variance of a Gaussian over x."""

	def build(self, conf):
		"""Create the encoder and the Gaussian decoder described by `conf`."""
		wscale = 0.1

		def make_links(n_input, hidden_units, n_output):
			# Hidden Linear / BatchNormalization pairs followed by the mean and
			# variance output layers. Returns the links and the hidden layer count.
			units = [(n_input, hidden_units[0])]
			units += zip(hidden_units[:-1], hidden_units[1:])
			links = {}
			for i, (n_in, n_out) in enumerate(units):
				links["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
				# Batchnorm runs on the layer output when applied before the
				# activation, otherwise on the layer input.
				bn_size = n_out if conf.batchnorm_before_activation else n_in
				links["batchnorm_%i" % i] = L.BatchNormalization(bn_size)
			links["layer_mean"] = L.Linear(hidden_units[-1], n_output, wscale=wscale)
			links["layer_var"] = L.Linear(hidden_units[-1], n_output, wscale=wscale)
			return links, len(units)

		encoder_links, n_encoder_layers = make_links(conf.ndim_x, conf.encoder_hidden_units, conf.ndim_z)
		encoder = Encoder(**encoder_links)
		encoder.n_layers = n_encoder_layers
		encoder.activation_function = conf.encoder_activation_function
		encoder.apply_dropout = conf.encoder_apply_dropout
		encoder.apply_batchnorm = conf.encoder_apply_batchnorm
		encoder.apply_batchnorm_to_input = conf.encoder_apply_batchnorm_to_input
		encoder.batchnorm_before_activation = conf.batchnorm_before_activation

		decoder_links, n_decoder_layers = make_links(conf.ndim_z, conf.decoder_hidden_units, conf.ndim_x)
		decoder = GaussianDecoder(**decoder_links)
		decoder.n_layers = n_decoder_layers
		decoder.activation_function = conf.decoder_activation_function
		decoder.apply_dropout = conf.decoder_apply_dropout
		decoder.apply_batchnorm = conf.decoder_apply_batchnorm
		decoder.apply_batchnorm_to_input = conf.decoder_apply_batchnorm_to_input
		decoder.batchnorm_before_activation = conf.batchnorm_before_activation

		if conf.gpu_enabled:
			for network in (encoder, decoder):
				network.to_gpu()
		return encoder, decoder

	def train(self, x, L=1, test=False):
		"""Run one update on batch `x` using `L` samples of z per input.

		Returns the loss (negative lower bound averaged over samples and
		batch) as array data on the CPU.
		"""
		batchsize = x.data.shape[0]
		z_mean, z_ln_var = self.encoder(x, test=test, apply_f=False)
		loss = 0
		for _ in six.moves.range(L):
			# Draw z ~ q(z|x), then evaluate the lower bound for this sample.
			z = F.gaussian(z_mean, z_ln_var)
			lower_bound = (
				self.log_px_z(x, z, test=test)
				+ self.log_pz(z, z_mean, z_ln_var)
				- self.log_qz_x(z, z_mean, z_ln_var)
			)
			loss += -lower_bound

		loss = F.sum(loss) / L / batchsize

		self.zero_grads()
		loss.backward()
		self.update()

		if self.gpu:
			loss.to_cpu()
		return loss.data
271 | 
class BernoulliM1VAE(VAE):
	"""M1 VAE whose decoder outputs the parameters of a Bernoulli over x."""

	def build(self, conf):
		"""Create the encoder and the Bernoulli decoder described by `conf`."""
		wscale = 0.1

		def make_hidden_links(units):
			# One Linear / BatchNormalization pair per (n_in, n_out) entry.
			links = {}
			for i, (n_in, n_out) in enumerate(units):
				links["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
				# Batchnorm runs on the layer output when applied before the
				# activation, otherwise on the layer input.
				bn_size = n_out if conf.batchnorm_before_activation else n_in
				links["batchnorm_%i" % i] = L.BatchNormalization(bn_size)
			return links

		# Encoder: hidden stack plus the mean and variance output layers.
		encoder_hidden = conf.encoder_hidden_units
		encoder_units = [(conf.ndim_x, encoder_hidden[0])]
		encoder_units += zip(encoder_hidden[:-1], encoder_hidden[1:])
		encoder_links = make_hidden_links(encoder_units)
		encoder_links["layer_mean"] = L.Linear(encoder_hidden[-1], conf.ndim_z, wscale=wscale)
		encoder_links["layer_var"] = L.Linear(encoder_hidden[-1], conf.ndim_z, wscale=wscale)
		encoder = Encoder(**encoder_links)
		encoder.n_layers = len(encoder_units)
		encoder.activation_function = conf.encoder_activation_function
		encoder.apply_dropout = conf.encoder_apply_dropout
		encoder.apply_batchnorm = conf.encoder_apply_batchnorm
		encoder.apply_batchnorm_to_input = conf.encoder_apply_batchnorm_to_input
		encoder.batchnorm_before_activation = conf.batchnorm_before_activation

		# Decoder: the output layer is simply the last numbered layer.
		decoder_hidden = conf.decoder_hidden_units
		decoder_units = [(conf.ndim_z, decoder_hidden[0])]
		decoder_units += zip(decoder_hidden[:-1], decoder_hidden[1:])
		decoder_units += [(decoder_hidden[-1], conf.ndim_x)]
		decoder = BernoulliDecoder(**make_hidden_links(decoder_units))
		decoder.n_layers = len(decoder_units)
		decoder.activation_function = conf.decoder_activation_function
		decoder.apply_dropout = conf.decoder_apply_dropout
		decoder.apply_batchnorm = conf.decoder_apply_batchnorm
		decoder.apply_batchnorm_to_input = conf.decoder_apply_batchnorm_to_input
		decoder.batchnorm_before_activation = conf.batchnorm_before_activation

		if conf.gpu_enabled:
			for network in (encoder, decoder):
				network.to_gpu()
		return encoder, decoder

	def train(self, x, L=1, test=False):
		"""Run one update on batch `x` using `L` samples of z per input.

		Returns the loss (reconstruction term plus KL divergence, averaged
		over the batch) as array data on the CPU.
		"""
		batchsize = x.data.shape[0]
		z_mean, z_ln_var = self.encoder(x, test=test, apply_f=False)
		loss = 0
		for _ in six.moves.range(L):
			# Draw z ~ q(z|x) and decode it without the final sigmoid.
			z = F.gaussian(z_mean, z_ln_var)
			raw_x = self.decoder(z, test=test, apply_f=False)
			# Monte Carlo estimate of -E_q(z|x)[log p(x|z)]
			loss += self.bernoulli_nll_keepbatch(x, raw_x)
		if L > 1:
			loss /= L
		# The KL term is available in closed form and added once.
		loss += self.gaussian_kl_divergence_keepbatch(z_mean, z_ln_var)
		loss = F.sum(loss) / batchsize

		self.zero_grads()
		loss.backward()
		self.update()

		if self.gpu:
			loss.to_cpu()
		return loss.data
342 | 
class Encoder(chainer.Chain):
	"""Feed-forward network producing the mean and log-variance of a Gaussian.

	The links are passed in as keyword arguments: `layer_<i>` and
	`batchnorm_<i>` for every hidden layer, plus `layer_mean` and
	`layer_var`. `n_layers` must be set by the caller before use.
	"""

	def __init__(self, **layers):
		super(Encoder, self).__init__(**layers)
		# Defaults; the code that builds the network overwrites them.
		self.activation_function = "softplus"
		self.apply_batchnorm_to_input = True
		self.apply_batchnorm = True
		self.apply_dropout = True
		self.batchnorm_before_activation = True

	@property
	def xp(self):
		"""numpy while the chain is on the CPU, cupy otherwise."""
		return np if self._cpu else cuda.cupy

	def forward_one_step(self, x, test=False, apply_f=True):
		"""Propagate `x` through the hidden layers and return `(mean, ln_var)`.

		`apply_f` is accepted for interface compatibility and is not used here.
		"""
		f = activations[self.activation_function]

		h = x
		for i in range(self.n_layers):
			# The first layer has its own batchnorm switch.
			use_batchnorm = self.apply_batchnorm_to_input if i == 0 else self.apply_batchnorm
			# The linear layer runs either before or after batchnorm.
			if self.batchnorm_before_activation:
				h = getattr(self, "layer_%i" % i)(h)
			if use_batchnorm:
				h = getattr(self, "batchnorm_%d" % i)(h, test=test)
			if self.batchnorm_before_activation == False:
				h = getattr(self, "layer_%i" % i)(h)
			h = f(h)
			if self.apply_dropout:
				h = F.dropout(h, train=not test)

		# Two heads on the last hidden output: mean and log(sigma^2).
		return self.layer_mean(h), self.layer_var(h)

	def __call__(self, x, test=False, apply_f=True):
		"""Return a sample drawn from N(mean, exp(ln_var)), or `(mean, ln_var)` when `apply_f` is false."""
		mean, ln_var = self.forward_one_step(x, test=test, apply_f=apply_f)
		return F.gaussian(mean, ln_var) if apply_f else (mean, ln_var)
393 | 
# Same network structure as the Encoder; only the default of `apply_f` differs.
class GaussianDecoder(Encoder):
	"""Decoder producing the mean and log-variance of a Gaussian over x."""

	def __call__(self, x, test=False, apply_f=False):
		"""Return `(mean, ln_var)`, or a sample drawn from that Gaussian when `apply_f` is true."""
		mean, ln_var = self.forward_one_step(x, test=test, apply_f=False)
		return F.gaussian(mean, ln_var) if apply_f else (mean, ln_var)
402 | 
class BernoulliDecoder(chainer.Chain):
	"""Feed-forward decoder whose last layer yields the raw (pre-sigmoid) output.

	The links are passed in as keyword arguments: `layer_<i>` and
	`batchnorm_<i>` for every layer. `n_layers` must be set by the caller
	before use.
	"""

	def __init__(self, **layers):
		super(BernoulliDecoder, self).__init__(**layers)
		# Defaults; the code that builds the network overwrites them.
		self.activation_function = "softplus"
		self.apply_batchnorm_to_input = True
		self.apply_batchnorm = True
		self.apply_dropout = True
		self.batchnorm_before_activation = True

	@property
	def xp(self):
		"""numpy while the chain is on the CPU, cupy otherwise."""
		return np if self._cpu else cuda.cupy

	def forward_one_step(self, x, test=False):
		"""Propagate `x` through all layers and return the raw output of the last one."""
		f = activations[self.activation_function]
		last = self.n_layers - 1

		h = x
		for i in range(self.n_layers):
			# Decide whether batchnorm is used for this layer.
			if i == 0:
				use_batchnorm = self.apply_batchnorm_to_input
			elif i == last:
				# The output layer is normalized only in the "batchnorm on
				# the layer input" configuration.
				use_batchnorm = self.apply_batchnorm_to_input and self.batchnorm_before_activation == False
			else:
				use_batchnorm = self.apply_batchnorm

			# The linear layer runs either before or after batchnorm.
			if self.batchnorm_before_activation:
				h = getattr(self, "layer_%i" % i)(h)
			if use_batchnorm:
				h = getattr(self, "batchnorm_%d" % i)(h, test=test)
			if self.batchnorm_before_activation == False:
				h = getattr(self, "layer_%i" % i)(h)

			# No activation or dropout on the output layer.
			if i != last:
				h = f(h)
				if self.apply_dropout:
					h = F.dropout(h, train=not test)

		return h

	def __call__(self, x, test=False, apply_f=False):
		"""Return the raw output, or its sigmoid when `apply_f` is true."""
		output = self.forward_one_step(x, test=test)
		return F.sigmoid(output) if apply_f else output


--------------------------------------------------------------------------------
/vae_m2.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import math
  3 | import numpy as np
  4 | import chainer, os, collections, six
  5 | from chainer import cuda, Variable, optimizers, serializers, function, optimizer
  6 | from chainer.utils import type_check
  7 | from chainer import functions as F
  8 | from chainer import links as L
  9 | 
 10 | activations = {
 11 | 	"sigmoid": F.sigmoid, 
 12 | 	"tanh": F.tanh, 
 13 | 	"softplus": F.softplus, 
 14 | 	"relu": F.relu, 
 15 | 	"leaky_relu": F.leaky_relu, 
 16 | 	"elu": F.elu
 17 | }
 18 | 
 19 | class Conf():
 20 | 	def __init__(self):
 21 | 		self.image_width = 28
 22 | 		self.image_height = 28
 23 | 		self.ndim_x = 28 * 28
 24 | 		self.ndim_y = 10
 25 | 		self.ndim_z = 50
 26 | 
 27 | 		# True : y = f(BN(Wx + b))
 28 | 		# False: y = f(W*BN(x) + b)
 29 | 		self.batchnorm_before_activation = True
 30 | 
 31 | 		# gaussianmarg | gaussian
 32 | 		self.type_pz = "gaussianmarg"
 33 | 		self.type_qz = "gaussianmarg"
 34 | 
 35 | 		self.encoder_xy_z_hidden_units = [500]
 36 | 		self.encoder_xy_z_activation_function = "softplus"
 37 | 		self.encoder_xy_z_apply_dropout = False
 38 | 		self.encoder_xy_z_apply_batchnorm = True
 39 | 		self.encoder_xy_z_apply_batchnorm_to_input = True
 40 | 
 41 | 		self.encoder_x_y_hidden_units = [500]
 42 | 		self.encoder_x_y_activation_function = "softplus"
 43 | 		self.encoder_x_y_apply_dropout = False
 44 | 		self.encoder_x_y_apply_batchnorm = True
 45 | 		self.encoder_x_y_apply_batchnorm_to_input = True
 46 | 
 47 | 		self.decoder_hidden_units = [500]
 48 | 		self.decoder_activation_function = "softplus"
 49 | 		self.decoder_apply_dropout = False
 50 | 		self.decoder_apply_batchnorm = True
 51 | 		self.decoder_apply_batchnorm_to_input = True
 52 | 
 53 | 		self.gpu_enabled = True
 54 | 		self.learning_rate = 0.0003
 55 | 		self.gradient_momentum = 0.9
 56 | 		self.gradient_clipping = 5.0
 57 | 
 58 | 	def check(self):
 59 | 		pass
 60 | 
 61 | def sum_sqnorm(arr):
 62 | 	sq_sum = collections.defaultdict(float)
 63 | 	for x in arr:
 64 | 		with cuda.get_device(x) as dev:
 65 | 			x = x.ravel()
 66 | 			s = x.dot(x)
 67 | 			sq_sum[int(dev)] += s
 68 | 	return sum([float(i) for i in six.itervalues(sq_sum)])
 69 | 	
 70 | class GradientClipping(object):
 71 | 	name = "GradientClipping"
 72 | 
 73 | 	def __init__(self, threshold):
 74 | 		self.threshold = threshold
 75 | 
 76 | 	def __call__(self, opt):
 77 | 		norm = np.sqrt(sum_sqnorm([p.grad for p in opt.target.params()]))
 78 | 		if norm == 0:
 79 | 			return
 80 | 		rate = self.threshold / norm
 81 | 		if rate < 1:
 82 | 			for param in opt.target.params():
 83 | 				grad = param.grad
 84 | 				with cuda.get_device(grad):
 85 | 					grad *= rate
 86 | 
 87 | class VAE():
 88 | 	# name is used for the filename when you save the model
	def __init__(self, conf, name="vae"):
		"""Build the three networks from ``conf`` and attach one Adam optimizer to each.

		``name`` becomes part of the file names written by ``save`` and read by ``load``.
		"""
		conf.check()
		self.encoder_xy_z, self.encoder_x_y, self.decoder = self.build(conf)
		self.name = name

		def make_optimizer(network):
			# Adam with the configured step size / momentum plus gradient clipping.
			opt = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
			opt.setup(network)
			opt.add_hook(GradientClipping(conf.gradient_clipping))
			return opt

		self.optimizer_encoder_xy_z = make_optimizer(self.encoder_xy_z)
		self.optimizer_encoder_x_y = make_optimizer(self.encoder_x_y)
		self.optimizer_decoder = make_optimizer(self.decoder)

		self.type_pz = conf.type_pz
		self.type_qz = conf.type_qz
111 | 
112 | 	def build(self, conf):
113 | 		raise Exception()
114 | 
	def train(self, x, L=1, test=False):
		# Placeholder that always raises.
		# NOTE(review): a second ``train`` with a different signature is defined
		# further down in this class body and rebinds the name, so this stub
		# appears to be unreachable -- confirm before relying on it.
		raise Exception()
117 | 
118 | 	@property
119 | 	def xp(self):
120 | 		return self.encoder_xy_z.xp
121 | 
122 | 	@property
123 | 	def gpu(self):
124 | 		if cuda.available is False:
125 | 			return False
126 | 		return True if self.xp is cuda.cupy else False
127 | 
128 | 	def zero_grads(self):
129 | 		self.optimizer_encoder_xy_z.zero_grads()
130 | 		self.optimizer_encoder_x_y.zero_grads()
131 | 		self.optimizer_decoder.zero_grads()
132 | 
133 | 	def update(self):
134 | 		self.optimizer_encoder_xy_z.update()
135 | 		self.optimizer_encoder_x_y.update()
136 | 		self.optimizer_decoder.update()
137 | 
138 | 	def update_classifier(self):
139 | 		self.optimizer_encoder_x_y.update()
140 | 
141 | 	def encode_x_z(self, x, test=False):
142 | 		y = self.sample_x_y(x, argmax=False, test=test)
143 | 		z = self.encoder_xy_z(x, y, test=test)
144 | 		return z
145 | 
146 | 	def encode_xy_z(self, x, y, test=False):
147 | 		z = self.encoder_xy_z(x, y, test=test)
148 | 		return z
149 | 
150 | 	def decode_zy_x(self, z, y, test=False, apply_f=True):
151 | 		x = self.decoder(z, y, test=test, apply_f=apply_f)
152 | 		return x
153 | 
154 | 	def sample_x_y(self, x, argmax=False, test=False):
155 | 		batchsize = x.data.shape[0]
156 | 		y_distribution = self.encoder_x_y(x, test=test, softmax=True).data
157 | 		n_labels = y_distribution.shape[1]
158 | 		if self.gpu:
159 | 			y_distribution = cuda.to_cpu(y_distribution)
160 | 		sampled_y = np.zeros((batchsize, n_labels), dtype=np.float32)
161 | 		if argmax:
162 | 			args = np.argmax(y_distribution, axis=1)
163 | 			for b in xrange(batchsize):
164 | 				sampled_y[b, args[b]] = 1
165 | 		else:
166 | 			for b in xrange(batchsize):
167 | 				label_id = np.random.choice(np.arange(n_labels), p=y_distribution[b])
168 | 				sampled_y[b, label_id] = 1
169 | 		sampled_y = Variable(sampled_y)
170 | 		if self.gpu:
171 | 			sampled_y.to_gpu()
172 | 		return sampled_y
173 | 
174 | 	def sample_x_label(self, x, argmax=True, test=False):
175 | 		batchsize = x.data.shape[0]
176 | 		y_distribution = self.encoder_x_y(x, test=test, softmax=True).data
177 | 		n_labels = y_distribution.shape[1]
178 | 		if self.gpu:
179 | 			y_distribution = cuda.to_cpu(y_distribution)
180 | 		if argmax:
181 | 			sampled_label = np.argmax(y_distribution, axis=1)
182 | 		else:
183 | 			sampled_label = np.zeros((batchsize,), dtype=np.int32)
184 | 			labels = np.arange(n_labels)
185 | 			for b in xrange(batchsize):
186 | 				label_id = np.random.choice(labels, p=y_distribution[b])
187 | 				sampled_label[b] = 1
188 | 		return sampled_label
189 | 
190 | 	def bernoulli_nll_keepbatch(self, x, y):
191 | 		nll = F.softplus(y) - x * y
192 | 		return F.sum(nll, axis=1)
193 | 
194 | 	def gaussian_nll_keepbatch(self, x, mean, ln_var, clip=True):
195 | 		if clip:
196 | 			clip_min = math.log(0.001)
197 | 			clip_max = math.log(10)
198 | 			ln_var = F.clip(ln_var, clip_min, clip_max)
199 | 		x_prec = F.exp(-ln_var)
200 | 		x_diff = x - mean
201 | 		x_power = (x_diff * x_diff) * x_prec * 0.5
202 | 		return F.sum((math.log(2.0 * math.pi) + ln_var) * 0.5 + x_power, axis=1)
203 | 
204 | 	def gaussian_kl_divergence_keepbatch(self, mean, ln_var):
205 | 		var = F.exp(ln_var)
206 | 		kld = F.sum(mean * mean + var - ln_var - 1, axis=1) * 0.5
207 | 		return kld
208 | 
209 | 	def log_px_zy(self, x, z, y, test=False):
210 | 		if isinstance(self.decoder, BernoulliDecoder):
211 | 			# do not apply F.sigmoid to the output of the decoder
212 | 			raw_output = self.decoder(z, y, test=test, apply_f=False)
213 | 			negative_log_likelihood = self.bernoulli_nll_keepbatch(x, raw_output)
214 | 			log_px_zy = -negative_log_likelihood
215 | 		else:
216 | 			x_mean, x_ln_var = self.decoder(z, y, test=test, apply_f=False)
217 | 			negative_log_likelihood = self.gaussian_nll_keepbatch(x, x_mean, x_ln_var)
218 | 			log_px_zy = -negative_log_likelihood
219 | 		return log_px_zy
220 | 
221 | 	def log_py(self, y, test=False):
222 | 		xp = self.xp
223 | 		num_types_of_label = y.data.shape[1]
224 | 		# prior p(y) expecting that all classes are evenly distributed
225 | 		constant = math.log(1.0 / num_types_of_label)
226 | 		log_py = xp.full((y.data.shape[0],), constant, xp.float32)
227 | 		return Variable(log_py)
228 | 
229 | 	# this will not be used
230 | 	def log_pz(self, z, mean, ln_var, test=False):
231 | 		if self.type_pz == "gaussianmarg":
232 | 			# \int q(z)logp(z)dz = -(J/2)*log2pi - (1/2)*sum_{j=1}^{J} (mu^2 + var)
233 | 			# See Appendix B [Auto-Encoding Variational Bayes](http://arxiv.org/abs/1312.6114)
234 | 			log_pz = -0.5 * (math.log(2.0 * math.pi) + mean * mean + F.exp(ln_var))
235 | 		elif self.type_pz == "gaussian":
236 | 			log_pz = -0.5 * math.log(2.0 * math.pi) - 0.5 * z ** 2
237 | 		return F.sum(log_pz, axis=1)
238 | 
239 | 	# this will not be used
240 | 	def log_qz_xy(self, z, mean, ln_var, test=False):
241 | 		if self.type_qz == "gaussianmarg":
242 | 			# \int q(z)logq(z)dz = -(J/2)*log2pi - (1/2)*sum_{j=1}^{J} (1 + logvar)
243 | 			# See Appendix B [Auto-Encoding Variational Bayes](http://arxiv.org/abs/1312.6114)
244 | 			log_qz_xy = -0.5 * F.sum((math.log(2.0 * math.pi) + 1 + ln_var), axis=1)
245 | 		elif self.type_qz == "gaussian":
246 | 			log_qz_xy = -self.gaussian_nll_keepbatch(z, mean, ln_var)
247 | 		return log_qz_xy
248 | 
249 | 	def train(self, labeled_x, labeled_y, label_ids, unlabeled_x, test=False):
250 | 		loss, loss_labeled, loss_unlabeled = self.compute_lower_bound_loss(labeled_x, labeled_y, label_ids, unlabeled_x, test=test)
251 | 		self.zero_grads()
252 | 		loss.backward()
253 | 		self.update()
254 | 
255 | 		if self.gpu:
256 | 			loss_labeled.to_cpu()
257 | 			if loss_unlabeled is not None:
258 | 				loss_unlabeled.to_cpu()
259 | 
260 | 		if loss_unlabeled is None:
261 | 			return loss_labeled.data, 0
262 | 
263 | 		return loss_labeled.data, loss_unlabeled.data
264 | 
265 | 	# Extended objective eq.9
266 | 	def train_classification(self, labeled_x, label_ids, alpha=1.0, test=False):
267 | 		loss = alpha * self.compute_classification_loss(labeled_x, label_ids, test=test)
268 | 		self.zero_grads()
269 | 		loss.backward()
270 | 		self.update_classifier()
271 | 		if self.gpu:
272 | 			loss.to_cpu()
273 | 		return loss.data
274 | 
275 | 	def train_jointly(self, labeled_x, labeled_y, label_ids, unlabeled_x, alpha=1.0, test=False):
276 | 		loss_lower_bound, loss_lb_labled, loss_lb_unlabled = self.compute_lower_bound_loss(labeled_x, labeled_y, label_ids, unlabeled_x, test=test)
277 | 		loss_classification = alpha * self.compute_classification_loss(labeled_x, label_ids, test=test)
278 | 		loss = loss_lower_bound + loss_classification
279 | 		self.zero_grads()
280 | 		loss.backward()
281 | 		self.update()
282 | 		if self.gpu:
283 | 			loss_lb_labled.to_cpu()
284 | 			if loss_lb_unlabled is not None:
285 | 				loss_lb_unlabled.to_cpu()
286 | 			loss_classification.to_cpu()
287 | 
288 | 		if loss_lb_unlabled is None:
289 | 			return loss_lb_labled.data, 0, loss_classification.data
290 | 
291 | 		return loss_lb_labled.data, loss_lb_unlabled.data, loss_classification.data
292 | 
293 | 	def compute_lower_bound_loss(self, labeled_x, labeled_y, label_ids, unlabeled_x, test=False):
294 | 
295 | 		def lower_bound(log_px_zy, log_py, log_pz, log_qz_xy):
296 | 			lb = log_px_zy + log_py + log_pz - log_qz_xy
297 | 			return lb
298 | 
299 | 		# _l: labeled
300 | 		# _u: unlabeled
301 | 		batchsize_l = labeled_x.data.shape[0]
302 | 		batchsize_u = unlabeled_x.data.shape[0]
303 | 		num_types_of_label = labeled_y.data.shape[1]
304 | 		xp = self.xp
305 | 
306 | 		### Lower bound for labeled data ###
307 | 		# Compute eq.6 -L(x,y)
308 | 		z_mean_l, z_ln_var_l = self.encoder_xy_z(labeled_x, labeled_y, test=test, apply_f=False)
309 | 		z_l = F.gaussian(z_mean_l, z_ln_var_l)
310 | 		log_px_zy_l = self.log_px_zy(labeled_x, z_l, labeled_y, test=test)
311 | 		log_py_l = self.log_py(labeled_y, test=test)
312 | 		if False:
313 | 			log_pz_l = self.log_pz(z_l, z_mean_l, z_ln_var_l, test=test)
314 | 			log_qz_xy_l = self.log_qz_xy(z_l, z_mean_l, z_ln_var_l, test=test)
315 | 			lower_bound_l = lower_bound(log_px_zy_l, log_py_l, log_pz_l, log_qz_xy_l)
316 | 		else:
317 | 			lower_bound_l = log_px_zy_l + log_py_l - self.gaussian_kl_divergence_keepbatch(z_mean_l, z_ln_var_l)
318 | 
319 | 		if batchsize_u > 0:
320 | 			### Lower bound for unlabeled data ###
321 | 			# To marginalize y, we repeat unlabeled x, and construct a target (batchsize_u * num_types_of_label) x num_types_of_label
322 | 			# Example of n-dimensional x and target matrix for a 3 class problem and batch_size=2.
323 | 			#         unlabeled_x_ext                 y_ext
324 | 			#  [[x0[0], x0[1], ..., x0[n]]         [[1, 0, 0]
325 | 			#   [x1[0], x1[1], ..., x1[n]]          [1, 0, 0]
326 | 			#   [x0[0], x0[1], ..., x0[n]]          [0, 1, 0]
327 | 			#   [x1[0], x1[1], ..., x1[n]]          [0, 1, 0]
328 | 			#   [x0[0], x0[1], ..., x0[n]]          [0, 0, 1]
329 | 			#   [x1[0], x1[1], ..., x1[n]]]         [0, 0, 1]]
330 | 
331 | 			unlabeled_x_ext = xp.zeros((batchsize_u * num_types_of_label, unlabeled_x.data.shape[1]), dtype=xp.float32)
332 | 			y_ext = xp.zeros((batchsize_u * num_types_of_label, num_types_of_label), dtype=xp.float32)
333 | 			for n in xrange(num_types_of_label):
334 | 				y_ext[n * batchsize_u:(n + 1) * batchsize_u,n] = 1
335 | 				unlabeled_x_ext[n * batchsize_u:(n + 1) * batchsize_u] = unlabeled_x.data
336 | 			y_ext = Variable(y_ext)
337 | 			unlabeled_x_ext = Variable(unlabeled_x_ext)
338 | 
339 | 			# Compute eq.6 -L(x,y) for unlabeled data
340 | 			z_mean_u_ext, z_mean_ln_var_u_ext = self.encoder_xy_z(unlabeled_x_ext, y_ext, test=test, apply_f=False)
341 | 			z_u_ext = F.gaussian(z_mean_u_ext, z_mean_ln_var_u_ext)
342 | 			log_px_zy_u = self.log_px_zy(unlabeled_x_ext, z_u_ext, y_ext, test=test)
343 | 			log_py_u = self.log_py(y_ext, test=test)
344 | 			if False:
345 | 				log_pz_u = self.log_pz(z_u_ext, z_mean_u_ext, z_mean_ln_var_u_ext, test=test)
346 | 				log_qz_xy_u = self.log_qz_xy(z_u_ext, z_mean_u_ext, z_mean_ln_var_u_ext, test=test)
347 | 				lower_bound_u = lower_bound(log_px_zy_u, log_py_u, log_pz_u, log_qz_xy_u)
348 | 			else:
349 | 				lower_bound_u = log_px_zy_u + log_py_u - self.gaussian_kl_divergence_keepbatch(z_mean_u_ext, z_mean_ln_var_u_ext)
350 | 
351 | 			# Compute eq.7 sum_y{q(y|x){-L(x,y) + H(q(y|x))}}
352 | 			# Let LB(xn, y) be the lower bound for an input image xn and a label y (y = 0, 1, ..., 9).
353 | 			# Let bs be the batchsize.
354 | 			# 
355 | 			# lower_bound_u is a vector and it looks like...
356 | 			# [LB(x0,0), LB(x1,0), ..., LB(x_bs,0), LB(x0,1), LB(x1,1), ..., LB(x_bs,1), ..., LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]
357 | 			# 
358 | 			# After reshaping. (axis 1 corresponds to label, axis 2 corresponds to batch)
359 | 			# [[LB(x0,0), LB(x1,0), ..., LB(x_bs,0)],
360 | 			#  [LB(x0,1), LB(x1,1), ..., LB(x_bs,1)],
361 | 			#                   .
362 | 			#                   .
363 | 			#                   .
364 | 			#  [LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]]
365 | 			# 
366 | 			# After transposing. (axis 1 corresponds to batch)
367 | 			# [[LB(x0,0), LB(x0,1), ..., LB(x0,9)],
368 | 			#  [LB(x1,0), LB(x1,1), ..., LB(x1,9)],
369 | 			#                   .
370 | 			#                   .
371 | 			#                   .
372 | 			#  [LB(x_bs,0), LB(x_bs,1), ..., LB(x_bs,9)]]
373 | 			lower_bound_u = F.transpose(F.reshape(lower_bound_u, (num_types_of_label, batchsize_u)))
374 | 			
375 | 			y_distribution = self.encoder_x_y(unlabeled_x, test=test, softmax=True)
376 | 			lower_bound_u = y_distribution * (lower_bound_u - F.log(y_distribution + 1e-6))
377 | 
378 | 			loss_labeled = -F.sum(lower_bound_l) / batchsize_l
379 | 			loss_unlabeled = -F.sum(lower_bound_u) / batchsize_u
380 | 			loss = loss_labeled + loss_unlabeled
381 | 		else:
382 | 			loss_unlabeled = None
383 | 			loss_labeled = -F.sum(lower_bound_l) / batchsize_l
384 | 			loss = loss_labeled
385 | 
386 | 		return loss, loss_labeled, loss_unlabeled
387 | 
388 | 	# Extended objective eq.9
389 | 	def compute_classification_loss(self, labeled_x, label_ids, test=False):
390 | 		y_distribution = self.encoder_x_y(labeled_x, softmax=False, test=test)
391 | 		batchsize = labeled_x.data.shape[0]
392 | 		num_types_of_label = y_distribution.data.shape[1]
393 | 
394 | 		loss = F.softmax_cross_entropy(y_distribution, label_ids)
395 | 		return loss
396 | 
397 | 	def load(self, dir=None):
398 | 		if dir is None:
399 | 			raise Exception()
400 | 		for attr in vars(self):
401 | 			prop = getattr(self, attr)
402 | 			if isinstance(prop, chainer.Chain) or isinstance(prop, chainer.optimizer.GradientMethod):
403 | 				filename = dir + "/%s_%s.hdf5" % (self.name, attr)
404 | 				if os.path.isfile(filename):
405 | 					print "loading",  filename
406 | 					serializers.load_hdf5(filename, prop)
407 | 				else:
408 | 					print filename, "missing."
409 | 		print "model loaded."
410 | 
411 | 	def save(self, dir=None):
412 | 		if dir is None:
413 | 			raise Exception()
414 | 		try:
415 | 			os.mkdir(dir)
416 | 		except:
417 | 			pass
418 | 		for attr in vars(self):
419 | 			prop = getattr(self, attr)
420 | 			if isinstance(prop, chainer.Chain) or isinstance(prop, chainer.optimizer.GradientMethod):
421 | 				serializers.save_hdf5(dir + "/%s_%s.hdf5" % (self.name, attr), prop)
422 | 		print "model saved."
423 | 
class GaussianM2VAE(VAE):
	"""M2 model whose decoder is a ``GaussianDecoder`` (it outputs a mean and a log-variance for x)."""

	def build(self, conf):
		"""Create the three networks described by ``conf``.

		Returns:
			``(encoder_xy_z, encoder_x_y, decoder)``: a ``GaussianEncoder``, a
			``SoftmaxEncoder`` and a ``GaussianDecoder``, moved to the GPU when
			``conf.gpu_enabled`` is set.
		"""
		wscale = 0.1
		bn_after_linear = conf.batchnorm_before_activation

		def hidden_links(units):
			# One Linear/BatchNormalization pair per (n_in, n_out). The normalizer is
			# sized for the side of the Linear layer it is applied to.
			links = {}
			for i, (n_in, n_out) in enumerate(units):
				links["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
				links["batchnorm_%i" % i] = L.BatchNormalization(n_out if bn_after_linear else n_in)
			return links

		def add_merge_links(links, n_input, n_hidden):
			# Input layer that merges the main input with the label y.
			links["layer_merge_x"] = L.Linear(n_input, n_hidden, wscale=wscale)
			links["layer_merge_y"] = L.Linear(conf.ndim_y, n_hidden, wscale=wscale)
			# With batchnorm_before_activation=False the forward pass normalizes the
			# raw input before layer_merge_x, so the normalizer must match the input
			# width; it used to be sized n_hidden unconditionally.
			links["batchnorm_merge"] = L.BatchNormalization(n_hidden if bn_after_linear else n_input)

		def add_output_links(links, n_hidden, n_output):
			links["layer_output_mean"] = L.Linear(n_hidden, n_output, wscale=wscale)
			links["layer_output_var"] = L.Linear(n_hidden, n_output, wscale=wscale)

		def configure(network, n_layers, prefix):
			# Copy the per-network settings named "<prefix>_*" from conf.
			network.n_layers = n_layers
			network.activation_function = getattr(conf, prefix + "_activation_function")
			network.apply_dropout = getattr(conf, prefix + "_apply_dropout")
			network.apply_batchnorm = getattr(conf, prefix + "_apply_batchnorm")
			network.apply_batchnorm_to_input = getattr(conf, prefix + "_apply_batchnorm_to_input")
			network.batchnorm_before_activation = conf.batchnorm_before_activation
			return network

		# q(z|x,y)
		hidden = conf.encoder_xy_z_hidden_units
		# list(...) keeps len() working where zip returns an iterator.
		units = list(zip(hidden[:-1], hidden[1:]))
		links = hidden_links(units)
		add_merge_links(links, conf.ndim_x, hidden[0])
		add_output_links(links, hidden[-1], conf.ndim_z)
		encoder_xy_z = configure(GaussianEncoder(**links), len(units), "encoder_xy_z")

		# q(y|x)
		hidden = conf.encoder_x_y_hidden_units
		units = [(conf.ndim_x, hidden[0])]
		units += list(zip(hidden[:-1], hidden[1:]))
		units += [(hidden[-1], conf.ndim_y)]
		encoder_x_y = configure(SoftmaxEncoder(**hidden_links(units)), len(units), "encoder_x_y")

		# p(x|z,y); GaussianDecoder reuses the encoder's forward pass, so the merge
		# layer for z is registered under the name "layer_merge_x".
		hidden = conf.decoder_hidden_units
		units = list(zip(hidden[:-1], hidden[1:]))
		links = hidden_links(units)
		add_merge_links(links, conf.ndim_z, hidden[0])
		add_output_links(links, hidden[-1], conf.ndim_x)
		decoder = configure(GaussianDecoder(**links), len(units), "decoder")

		if conf.gpu_enabled:
			encoder_xy_z.to_gpu()
			encoder_x_y.to_gpu()
			decoder.to_gpu()
		return encoder_xy_z, encoder_x_y, decoder
494 | 
class BernoulliM2VAE(VAE):
	"""M2 model whose decoder is a ``BernoulliDecoder`` (it outputs one raw score per dimension of x)."""

	def build(self, conf):
		"""Create the three networks described by ``conf``.

		Returns:
			``(encoder_xy_z, encoder_x_y, decoder)``: a ``GaussianEncoder``, a
			``SoftmaxEncoder`` and a ``BernoulliDecoder``, moved to the GPU when
			``conf.gpu_enabled`` is set.
		"""
		wscale = 0.1
		bn_after_linear = conf.batchnorm_before_activation

		def hidden_links(units):
			# One Linear/BatchNormalization pair per (n_in, n_out). The normalizer is
			# sized for the side of the Linear layer it is applied to.
			links = {}
			for i, (n_in, n_out) in enumerate(units):
				links["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
				links["batchnorm_%i" % i] = L.BatchNormalization(n_out if bn_after_linear else n_in)
			return links

		def add_merge_links(links, input_key, n_input, n_hidden):
			# Input layer that merges the main input (x or z) with the label y.
			links[input_key] = L.Linear(n_input, n_hidden, wscale=wscale)
			links["layer_merge_y"] = L.Linear(conf.ndim_y, n_hidden, wscale=wscale)
			# With batchnorm_before_activation=False the forward pass normalizes the
			# raw input before the merge layer, so the normalizer must match the
			# input width; it used to be sized n_hidden unconditionally.
			links["batchnorm_merge"] = L.BatchNormalization(n_hidden if bn_after_linear else n_input)

		def configure(network, n_layers, prefix):
			# Copy the per-network settings named "<prefix>_*" from conf.
			network.n_layers = n_layers
			network.activation_function = getattr(conf, prefix + "_activation_function")
			network.apply_dropout = getattr(conf, prefix + "_apply_dropout")
			network.apply_batchnorm = getattr(conf, prefix + "_apply_batchnorm")
			network.apply_batchnorm_to_input = getattr(conf, prefix + "_apply_batchnorm_to_input")
			network.batchnorm_before_activation = conf.batchnorm_before_activation
			return network

		# q(z|x,y)
		hidden = conf.encoder_xy_z_hidden_units
		# list(...) keeps len() and += working where zip returns an iterator.
		units = list(zip(hidden[:-1], hidden[1:]))
		links = hidden_links(units)
		add_merge_links(links, "layer_merge_x", conf.ndim_x, hidden[0])
		links["layer_output_mean"] = L.Linear(hidden[-1], conf.ndim_z, wscale=wscale)
		links["layer_output_var"] = L.Linear(hidden[-1], conf.ndim_z, wscale=wscale)
		encoder_xy_z = configure(GaussianEncoder(**links), len(units), "encoder_xy_z")

		# q(y|x)
		hidden = conf.encoder_x_y_hidden_units
		units = [(conf.ndim_x, hidden[0])]
		units += list(zip(hidden[:-1], hidden[1:]))
		units += [(hidden[-1], conf.ndim_y)]
		encoder_x_y = configure(SoftmaxEncoder(**hidden_links(units)), len(units), "encoder_x_y")

		# p(x|z,y); the last unit maps the final hidden layer onto x.
		hidden = conf.decoder_hidden_units
		units = list(zip(hidden[:-1], hidden[1:]))
		units += [(hidden[-1], conf.ndim_x)]
		links = hidden_links(units)
		add_merge_links(links, "layer_merge_z", conf.ndim_z, hidden[0])
		decoder = configure(BernoulliDecoder(**links), len(units), "decoder")

		if conf.gpu_enabled:
			encoder_xy_z.to_gpu()
			encoder_x_y.to_gpu()
			decoder.to_gpu()
		return encoder_xy_z, encoder_x_y, decoder
563 | 
class SoftmaxEncoder(chainer.Chain):
	"""Feed-forward network over child links named ``layer_%i`` / ``batchnorm_%i``; used as q(y|x).

	``n_layers`` is not set here; it has to be assigned on the instance before
	the network is called (the build() methods in this file do so).
	"""

	def __init__(self, **layers):
		super(SoftmaxEncoder, self).__init__(**layers)
		# Defaults for the behaviour switches read by forward_one_step.
		self.activation_function = "softplus"
		self.apply_batchnorm_to_input = True
		self.apply_batchnorm = True
		self.apply_dropout = False
		self.batchnorm_before_activation = True

	@property
	def xp(self):
		"""``np`` while ``self._cpu`` is truthy, ``cuda.cupy`` otherwise."""
		if self._cpu:
			return np
		return cuda.cupy

	def forward_one_step(self, x, test):
		"""Run ``x`` through all layers and return the un-normalized output of the last one.

		Every layer but the last is followed by the activation and, when
		``apply_dropout`` is set, by dropout.
		"""
		f = activations[self.activation_function]
		last = self.n_layers - 1
		h = x

		for i in range(self.n_layers):
			# Decide whether this layer is batch-normalized.
			if i == 0:
				normalize = self.apply_batchnorm_to_input
			elif i == last:
				normalize = self.apply_batchnorm and self.batchnorm_before_activation == False
			else:
				normalize = self.apply_batchnorm

			if self.batchnorm_before_activation:
				h = getattr(self, "layer_%i" % i)(h)
			if normalize:
				h = getattr(self, "batchnorm_%d" % i)(h, test=test)
			if self.batchnorm_before_activation == False:
				h = getattr(self, "layer_%i" % i)(h)

			if i != last:
				h = f(h)
				if self.apply_dropout:
					h = F.dropout(h, train=not test)

		return h

	def __call__(self, x, test=False, softmax=True):
		"""Return the network output, passed through ``F.softmax`` when ``softmax`` is set."""
		scores = self.forward_one_step(x, test=test)
		return F.softmax(scores) if softmax else scores
611 | 
class GaussianEncoder(chainer.Chain):
	"""Network with two inputs (x and a label y) that outputs the mean and log-variance of a Gaussian.

	``n_layers`` is not set here; it has to be assigned on the instance before
	the network is called (the build() methods in this file do so).
	"""

	def __init__(self, **layers):
		super(GaussianEncoder, self).__init__(**layers)
		# Defaults for the behaviour switches read by forward_one_step.
		self.activation_function = "softplus"
		self.apply_batchnorm_to_input = True
		self.apply_batchnorm = True
		self.apply_dropout = False
		self.batchnorm_before_activation = True

	@property
	def xp(self):
		"""``np`` while ``self._cpu`` is truthy, ``cuda.cupy`` otherwise."""
		if self._cpu:
			return np
		return cuda.cupy

	def forward_one_step(self, x, y, test=False, apply_f=True):
		"""Return ``(mean, ln_var)`` for inputs ``x`` and ``y``.

		``x`` and ``y`` are first merged by summing their linear projections, then
		passed through the ``n_layers`` hidden layers. ``apply_f`` is accepted
		but not read here.
		"""
		f = activations[self.activation_function]

		# Merge layer, with batch normalization placed according to the switches.
		if not self.apply_batchnorm_to_input:
			merged = self.layer_merge_x(x) + self.layer_merge_y(y)
		elif self.batchnorm_before_activation:
			merged = self.batchnorm_merge(self.layer_merge_x(x) + self.layer_merge_y(y), test=test)
		else:
			merged = self.layer_merge_x(self.batchnorm_merge(x, test=test)) + self.layer_merge_y(y)
		h = f(merged)

		# Hidden layers
		for i in range(self.n_layers):
			if self.batchnorm_before_activation:
				h = getattr(self, "layer_%i" % i)(h)
			if self.apply_batchnorm:
				h = getattr(self, "batchnorm_%d" % i)(h, test=test)
			if self.batchnorm_before_activation == False:
				h = getattr(self, "layer_%i" % i)(h)
			h = f(h)
			if self.apply_dropout:
				h = F.dropout(h, train=not test)

		mean = self.layer_output_mean(h)
		# log(sd^2)
		ln_var = self.layer_output_var(h)
		return mean, ln_var

	def __call__(self, x, y, test=False, apply_f=True):
		"""Return a sample drawn with ``F.gaussian`` when ``apply_f`` is set, else ``(mean, ln_var)``."""
		mean, ln_var = self.forward_one_step(x, y, test=test, apply_f=apply_f)
		if not apply_f:
			return mean, ln_var
		return F.gaussian(mean, ln_var)
666 | 
667 | # Network structure is same as the GaussianEncoder
class GaussianDecoder(GaussianEncoder):
	"""Decoder that reuses ``GaussianEncoder``'s forward pass with (z, y) as inputs."""

	def __call__(self, z, y, test=False, apply_f=False):
		"""Return ``(mean, ln_var)`` by default, or a sample drawn with ``F.gaussian`` when ``apply_f`` is set."""
		mean, ln_var = self.forward_one_step(z, y, test=test, apply_f=False)
		if not apply_f:
			return mean, ln_var
		return F.gaussian(mean, ln_var)
675 | 
class BernoulliDecoder(SoftmaxEncoder):
	"""Decoder with two inputs (z and a label y) that outputs one raw score per output dimension."""

	def forward_one_step(self, z, y, test):
		"""Return the un-squashed output of the last layer for inputs ``z`` and ``y``.

		``z`` and ``y`` are first merged by summing their linear projections;
		every following layer but the last is followed by the activation and,
		when ``apply_dropout`` is set, by dropout.
		"""
		f = activations[self.activation_function]

		# Merge layer, with batch normalization placed according to the switches.
		if not self.apply_batchnorm_to_input:
			merged = self.layer_merge_z(z) + self.layer_merge_y(y)
		elif self.batchnorm_before_activation:
			merged = self.batchnorm_merge(self.layer_merge_z(z) + self.layer_merge_y(y), test=test)
		else:
			merged = self.layer_merge_z(self.batchnorm_merge(z, test=test)) + self.layer_merge_y(y)
		h = f(merged)

		# Hidden layers
		last = self.n_layers - 1
		for i in range(self.n_layers):
			if i == last:
				normalize = self.apply_batchnorm and self.batchnorm_before_activation == False
			else:
				normalize = self.apply_batchnorm

			if self.batchnorm_before_activation:
				h = getattr(self, "layer_%i" % i)(h)
			if normalize:
				h = getattr(self, "batchnorm_%d" % i)(h, test=test)
			if self.batchnorm_before_activation == False:
				h = getattr(self, "layer_%i" % i)(h)

			if i != last:
				h = f(h)
				if self.apply_dropout:
					h = F.dropout(h, train=not test)

		return h

	def __call__(self, z, y, test=False, apply_f=False):
		"""Return the raw output, passed through ``F.sigmoid`` when ``apply_f`` is set."""
		raw = self.forward_one_step(z, y, test=test)
		return F.sigmoid(raw) if apply_f else raw


--------------------------------------------------------------------------------