├── download_data.sh
├── download_model.sh
├── experiments
├── plotting
│ └── plot_pairewise.py
├── run_DCC_global.py
├── run_DCC_instance.py
├── run_DCC_pairwise.py
├── run_DCC_triplets.py
├── run_DEC.py
├── run_improved_DEC.py
└── run_sdae.py
├── lib
├── __pycache__
│ ├── datasets.cpython-36.pyc
│ ├── dcc.cpython-36.pyc
│ ├── dec.cpython-36.pyc
│ ├── denoisingAutoencoder.cpython-36.pyc
│ ├── ops.cpython-36.pyc
│ ├── stackedDAE.cpython-36.pyc
│ └── utils.cpython-36.pyc
├── datasets.py
├── dcc.py
├── dec.py
├── denoisingAutoencoder.py
├── ops.py
├── stackedDAE.py
└── utils.py
├── readme.md
└── requirements.txt
/download_data.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Download the Reuters-10k train/test arrays into ./experiments/dataset/reuters/.
#
# Every file is attempted even when an earlier download fails; the script
# exits non-zero if at least one download failed.

# Keep the URL on a single line: a backslash-newline continuation inside the
# quotes would embed any leading indentation of the next line into the URL.
BASE_URL="https://s3-us-west-1.amazonaws.com/deep-constrained-clustering/Data-Reuters"
DEST_DIR="./experiments/dataset/reuters/"

FILES=(
    "reutersidf10k_train.npy"
    "reutersidf10k_test.npy"
)

status=0
for f in "${FILES[@]}"; do
    echo "Downloading data ${f}."
    if ! wget "${BASE_URL}/${f}" -P "${DEST_DIR}"; then
        echo "Failed to download ${f}." >&2
        status=1
    fi
done

exit "${status}"
--------------------------------------------------------------------------------
/download_model.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Download the pre-trained SDAE weights and triplet embeddings into ./model/.
#
# Every file is attempted even when an earlier download fails; the script
# exits non-zero if at least one download failed.

# Keep the URL on a single line: a backslash-newline continuation inside the
# quotes would embed any leading indentation of the next line into the URL.
BASE_URL="https://s3-us-west-1.amazonaws.com/deep-constrained-clustering/model-log-final"
DEST_DIR="./model/"

FILES=(
    "fashion_sdae_weights.pt"
    "fashion_triplet_embedding.npy"
    "mnist_sdae_weights.pt"
    "mnist_triplet_embedding.npy"
    "reuters10k_sdae_weights.pt"
)

status=0
for f in "${FILES[@]}"; do
    echo "Downloading model ${f}."
    if ! wget "${BASE_URL}/${f}" -P "${DEST_DIR}"; then
        echo "Failed to download ${f}." >&2
        status=1
    fi
done

exit "${status}"
--------------------------------------------------------------------------------
/experiments/plotting/plot_pairewise.py:
--------------------------------------------------------------------------------

import os
import sys
import time
import random
import re
import json
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from collections import defaultdict


def _load_links(path, seed, n_endpoints=20):
    """Load a pickled constraint table and draw a reproducible sample from it.

    Returns ``(links, order, endpoints)``: the unpickled table, a seeded
    random permutation of its row positions, and the values found in
    columns 0 and 1 of the first ``n_endpoints`` rows of that permutation.
    Raises whatever ``pd.read_pickle`` raises when the file is missing or
    unreadable.
    """
    # NOTE: unpickling can execute arbitrary code; only run this script on
    # folders written by trusted training runs.
    links = pd.read_pickle(path)
    random.seed(seed)
    order = random.sample(range(links.shape[0]), links.shape[0])
    head = order[:n_endpoints]
    endpoints = links.iloc[head, 0].tolist() + links.iloc[head, 1].tolist()
    return links, order, endpoints


def _tsne_point(latent, label):
    """Return the (tsne-1, tsne-2) coordinates of the row labelled ``label``.

    The plotted frame can contain the same index label twice (once from the
    random subsample, once from the appended link endpoints); in that case
    the first occurrence is used. Raises ``KeyError`` if the label is absent.
    """
    row = latent.loc[label]
    if isinstance(row, pd.DataFrame):
        row = row.iloc[0]
    return row["tsne-1"], row["tsne-2"]


if __name__ == "__main__":

    # Every sub-directory of the working directory is treated as one run.
    folders = [d for d in os.listdir(".") if os.path.isdir(d) and d != "Legend" and d != "Util"]

    # Class names, selected by the first letter of the run folder's name.
    label_dict = {
        "M": ["0","1","2","3","4","5","6","7","8","9"],
        "F": ["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"],
        "R": ["corporate/industrial", "government/social", "markets", "economics"]
    }

    for folder in folders:

        print("\nStarting "+folder)

        try:
            latent_files = [f for f in os.listdir(folder) if f.startswith("save")]
        except OSError:
            latent_files = []
        print(latent_files)
        if not latent_files:
            print("No latent files, Skipping Folder")
            continue

        # Clean constraints are mandatory: without both files the folder is skipped.
        try:
            must_links, ml_sample, ml_points = _load_links(
                os.path.join(folder, "mustlinks.pkl"), seed=1)
            cannot_links, cl_sample, cl_points = _load_links(
                os.path.join(folder, "cannotlinks.pkl"), seed=2)
        except Exception:
            print("No must link / cannot link, Skipping Folder")
            continue
        link_points = ml_points + cl_points

        # Noisy constraints are optional; an empty sample draws nothing below.
        try:
            noisy_must_links, noisy_ml_sample, noisy_ml_points = _load_links(
                os.path.join(folder, "noisymustlinks.pkl"), seed=3)
            link_points += noisy_ml_points
        except Exception:
            noisy_must_links, noisy_ml_sample = [], []

        try:
            noisy_cannot_links, noisy_cl_sample, noisy_cl_points = _load_links(
                os.path.join(folder, "noisycannotlinks.pkl"), seed=4)
            link_points += noisy_cl_points
        except Exception:
            noisy_cannot_links, noisy_cl_sample = [], []

        # Per-epoch metrics are optional; missing values are shown as 0.0.
        try:
            with open(os.path.join(folder, "intermediate_results.json"), "r") as fp:
                intermediate_results = json.load(fp)
        except (OSError, ValueError):
            intermediate_results = {}
        if not isinstance(intermediate_results, dict):
            intermediate_results = {}

        link_points = list(set(link_points))

        # Start Plotting
        for file in latent_files:

            df = pd.read_pickle(os.path.join(folder, file))
            epoch = re.sub('[^0-9]', '', file)

            # Plot a random subsample plus every sampled link endpoint, so the
            # link lines drawn below have both ends available.
            frac = 0.75 if folder.startswith("Reuters") else 0.25
            latent_full = pd.concat(
                [df.sample(frac=frac, random_state=7), df.iloc[link_points, :]])

            # Only the first ten columns are fed to t-SNE.
            latent = latent_full.iloc[:, 0:10].copy()

            time_start = time.time()
            # NOTE(review): recent scikit-learn releases rename n_iter to
            # max_iter; confirm against the installed version.
            tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=350)
            tsne_results = tsne.fit_transform(latent)
            print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

            latent['tsne-1'] = tsne_results[:, 0]
            latent['tsne-2'] = tsne_results[:, 1]
            latent["class"] = np.array([label_dict[folder[0]][x] for x in latent_full["y"].tolist()])

            acc = intermediate_results.get("acc", {}).get(epoch, 0.0)
            nmi = intermediate_results.get("nmi", {}).get(epoch, 0.0)

            fig = plt.figure(figsize=(16, 10))
            plt.title("Accuracy: %.2f, NMI: %.2f" % (acc, nmi))

            sns.scatterplot(
                x="tsne-1", y="tsne-2",
                hue="class",
                palette=sns.color_palette("hls", latent["class"].nunique()),
                data=latent,
                legend="full",
                alpha=0.8,
                s=20
            )

            # plot links
            plot_links = [
                {"sample": ml_sample, "link": must_links, "count": 10, "style": 'b-', "label": "must link"},
                {"sample": cl_sample, "link": cannot_links, "count": 10, "style": 'r-', "label": "cannot link"},
                {"sample": noisy_ml_sample, "link": noisy_must_links, "count": 10, "style": 'k-', "label": "noisy must link"},
                {"sample": noisy_cl_sample, "link": noisy_cannot_links, "count": 10, "style": 'k:', "label": "noisy cannot link"},
            ]

            for plot_link in plot_links:
                count = 0
                for i in plot_link["sample"]:
                    if count >= plot_link["count"]:
                        break
                    try:
                        x1, y1 = _tsne_point(latent, plot_link["link"].loc[i][0])
                        x2, y2 = _tsne_point(latent, plot_link["link"].loc[i][1])
                    except (KeyError, IndexError, TypeError, ValueError):
                        # An endpoint is not among the plotted rows; try the next link.
                        continue
                    plt.plot([x1, x2], [y1, y2], plot_link["style"], label=plot_link["label"])
                    count += 1

            # remove duplicate label for lines (keep the first handle per label)
            handles, labels = plt.gca().get_legend_handles_labels()
            unique = {}
            for handle, label in zip(handles, labels):
                unique.setdefault(label, handle)
            newLabels, newHandles = list(unique.keys()), list(unique.values())

            lgd = plt.gca().legend(newHandles, newLabels, loc='center', bbox_to_anchor=(0.5, -0.10),
                                   fancybox=True, ncol=len(newLabels), columnspacing=1.0, handlelength=1.0)

            plt.savefig(os.path.join(folder, folder+"_"+epoch+".png"),
                        bbox_extra_artists=(lgd,), bbox_inches='tight')
            # Close the figure so figures do not accumulate across files and folders.
            plt.close(fig)
--------------------------------------------------------------------------------
/experiments/run_DCC_global.py:
--------------------------------------------------------------------------------
import sys
sys.path.append("..")
import torch.utils.data
import numpy as np
import argparse
from lib.dcc import IDEC
from lib.datasets import MNIST, FashionMNIST

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Global MNIST Example')
    parser.add_argument('--lr', type=float, default=0.001, metavar='N',
                        help='learning rate for training (default: 0.001)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--update-interval', type=int, default=1, metavar='N',
                        help='update-interval (default: 1)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
                        help='directory for pre-trained weights')
    parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion)')
    # Compared against the literal string "True" below.
    parser.add_argument('--use_pretrain', type=str, default="True")
    args = parser.parse_args()

    # Set parameters. The defaults are assigned BEFORE the dataset branch so
    # the Fashion-specific ml_penalty below is not overwritten afterwards.
    ml_penalty, cl_penalty = 0.1, 1

    # Load data: only the requested dataset is loaded (and downloaded);
    # any value other than "Fashion" selects MNIST.
    if args.data == "Fashion":
        train_set = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
        test_set = FashionMNIST('./dataset/fashion_mnist', train=False)
        # The Fashion weights replace whatever --pretrain was given.
        args.pretrain = "../model/fashion_sdae_weights.pt"
        ml_penalty = 1
    else:
        train_set = MNIST('./dataset/mnist', train=True, download=True)
        test_set = MNIST('./dataset/mnist', train=False)
    X = train_set.train_data
    y = train_set.train_labels
    test_X = test_set.test_data
    test_y = test_set.test_labels

    idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
                encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)

    # Print Network Structure
    print(idec)
    if args.use_pretrain == "True":
        idec.load_model(args.pretrain)

    # Construct constraints: none of the pairwise/triplet kinds are used here,
    # only the global flag is switched on.
    ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
    anchor, positive, negative = np.array([]), np.array([]), np.array([])
    instance_guidance = torch.zeros(X.shape[0]).cuda()
    use_global = True

    # Train the network
    train_acc, train_nmi, epo = idec.fit(
        anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2,
        instance_guidance, use_global, ml_penalty, cl_penalty, X, y,
        lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
        update_interval=args.update_interval, tol=1*1e-3)

    # Make predictions on test set
    test_acc, test_nmi = idec.predict(test_X, test_y)

    # Report results
    print("Training Accuracy:", train_acc)
    print("Training NMI;", train_nmi)
    print("Training Epochs:", epo)
    print("Test Accuracy:", test_acc)
    print("Test NMI:", test_nmi)
--------------------------------------------------------------------------------
/experiments/run_DCC_instance.py:
--------------------------------------------------------------------------------
import sys
sys.path.append("..")
import torch.utils.data
import numpy as np
import argparse
from lib.dcc import IDEC
from lib.datasets import MNIST, FashionMNIST, Reuters
from sklearn.cluster import KMeans
from lib.utils import detect_wrong


def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    ``type=bool`` cannot be used with argparse: ``bool("False")`` is True, so
    every non-empty value would enable the option. Raises
    ``argparse.ArgumentTypeError`` for unrecognised text.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Instance Difficulty Constrained Clustering Example')
    parser.add_argument('--lr', type=float, default=0.001, metavar='N',
                        help='learning rate for training (default: 0.001)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--update-interval', type=int, default=1, metavar='N',
                        help='update-interval (default: 1)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
                        help='directory for pre-trained weights')
    parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)')
    parser.add_argument('--use_pretrain', type=_str2bool, default=True)
    args = parser.parse_args()

    # Set parameters (dataset branches below override what differs).
    ml_penalty, cl_penalty = 0.1, 1
    input_dim, n_clusters = 784, 10

    # Load data: only the requested dataset is loaded; unknown names select MNIST.
    if args.data == "Fashion":
        train_set = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
        test_set = FashionMNIST('./dataset/fashion_mnist', train=False)
        args.pretrain = "../model/fashion_sdae_weights.pt"
        ml_penalty = 1
    elif args.data == "Reuters":
        train_set = Reuters('./dataset/reuters', train=True, download=False)
        test_set = Reuters('./dataset/reuters', train=False)
        args.pretrain = "../model/reuters10k_sdae_weights.pt"
        input_dim, n_clusters = 2000, 4
    else:
        train_set = MNIST('./dataset/mnist', train=True, download=True)
        test_set = MNIST('./dataset/mnist', train=False)
    X = train_set.train_data
    y = train_set.train_labels
    test_X = test_set.test_data
    test_y = test_set.test_labels

    idec = IDEC(input_dim=input_dim, z_dim=10, n_clusters=n_clusters,
                encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
    if args.use_pretrain:
        idec.load_model(args.pretrain)

    # Print network structure
    print(idec)

    # Construct Constraints (no pairwise or triplet constraints in this experiment)
    ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
    anchor, positive, negative = np.array([]), np.array([]), np.array([])

    # Provide instance guidance based on k-means results. High confidence (1) for correct instances.
    # Low confidence (0.1) for incorrect instances since k-means + AE does not achieve good results.
    latent = idec.encodeBatch(X).cpu().numpy()
    # Cluster count follows the dataset (4 for Reuters) instead of a fixed 10.
    kmeans = KMeans(n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(latent)
    instance_guidance = detect_wrong(y.cpu().numpy(), y_pred)
    instance_guidance = torch.tensor(instance_guidance, dtype=torch.float32).cuda()
    use_global = False

    # Train the network
    train_acc, train_nmi, epo = idec.fit(
        anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2,
        instance_guidance, use_global, ml_penalty, cl_penalty, X, y,
        lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
        update_interval=args.update_interval, tol=1*1e-3)

    # Make prediction
    test_acc, test_nmi = idec.predict(test_X, test_y)

    # Report results
    print("Training Accuracy:", train_acc)
    print("Training NMI;", train_nmi)
    print("Training Epochs:", epo)
    print("Test Accuracy:", test_acc)
    print("Test NMI:", test_nmi)
--------------------------------------------------------------------------------
/experiments/run_DCC_pairwise.py:
--------------------------------------------------------------------------------
import sys
import os
sys.path.append("..")
import torch.utils.data
import numpy as np
import pandas as pd
import argparse
import time

from lib.dcc import IDEC
from lib.datasets import MNIST, FashionMNIST, Reuters
from lib.utils import transitive_closure, generate_random_pair


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Pairwise MNIST Example')
    parser.add_argument('--lr', type=float, default=0.001, metavar='N',
                        help='learning rate for training (default: 0.001)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--update-interval', type=int, default=1, metavar='N',
                        help='update-interval (default: 1)')
    parser.add_argument('--epochs', type=int, default=500, metavar='N',
                        help='number of epochs to train (default: 500)')
    parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
                        help='directory for pre-trained weights')
    parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)')
    # Both switches are store_false: the attribute is True unless the flag is
    # given, so it is stored under a positively named destination.
    parser.add_argument('--without_pretrain', dest='use_pretrain', action='store_false',
                        help='do not load the pre-trained weights')
    parser.add_argument('--without_kmeans', dest='use_kmeans', action='store_false',
                        help='pass use_kmeans=False to IDEC.fit')
    parser.add_argument('--noisy', type=float, default=0.0, metavar='N',
                        help='noisy constraints rate for training (default: 0.0)')
    parser.add_argument('--plotting', action='store_true')
    args = parser.parse_args()

    # Set parameters (dataset branches below override what differs).
    ml_penalty, cl_penalty = 0.1, 1
    input_dim, n_clusters = 784, 10

    # Load data: only the requested dataset is loaded; unknown names select MNIST.
    if args.data == "Fashion":
        train_set = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
        test_set = FashionMNIST('./dataset/fashion_mnist', train=False)
        args.pretrain = "../model/fashion_sdae_weights.pt"
        ml_penalty = 1
    elif args.data == "Reuters":
        train_set = Reuters('./dataset/reuters', train=True, download=False)
        test_set = Reuters('./dataset/reuters', train=False)
        args.pretrain = "../model/reuters10k_sdae_weights.pt"
        input_dim, n_clusters = 2000, 4
    else:
        train_set = MNIST('./dataset/mnist', train=True, download=True)
        test_set = MNIST('./dataset/mnist', train=False)
    X = train_set.train_data
    y = train_set.train_labels
    test_X = test_set.test_data
    test_y = test_set.test_labels

    idec = IDEC(input_dim=input_dim, z_dim=10, n_clusters=n_clusters,
                encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)

    # Tags are only used to name the plotting directory.
    model_tag = "Raw"
    if args.use_pretrain:
        model_tag = "Pretrain"
        idec.load_model(args.pretrain)

    init_tag = "KMeans" if args.use_kmeans else "Random"

    # Print Network Structure
    print(idec)

    # Construct Constraints: draw twice as many random pairs as needed, take
    # the transitive closure, then keep the first num_constraints of each kind.
    num_constraints = 6000
    ml_ind1, ml_ind2, cl_ind1, cl_ind2 = generate_random_pair(y, num_constraints*2)
    ml_ind1, ml_ind2, cl_ind1, cl_ind2 = transitive_closure(ml_ind1, ml_ind2, cl_ind1, cl_ind2, X.shape[0])

    ml_ind1 = ml_ind1[:num_constraints]
    ml_ind2 = ml_ind2[:num_constraints]
    cl_ind1 = cl_ind1[:num_constraints]
    cl_ind2 = cl_ind2[:num_constraints]

    plotting_dir = ""
    if args.plotting:

        dir_name = args.data+"_"+model_tag+"_"+init_tag+"_%d"%num_constraints
        if args.noisy > 0:
            dir_name += "_Noisy_%d%%"%(int(args.noisy*100))
        dir_name += "_"+time.strftime("%Y%m%d-%H%M")
        plotting_dir = "./plotting/%s"%dir_name
        # makedirs also creates ./plotting when it is missing and tolerates an
        # already existing run directory.
        os.makedirs(plotting_dir, exist_ok=True)

        mldf = pd.DataFrame(data=[ml_ind1, ml_ind2]).T
        mldf.to_pickle(os.path.join(plotting_dir, "mustlinks.pkl"))
        cldf = pd.DataFrame(data=[cl_ind1, cl_ind2]).T
        cldf.to_pickle(os.path.join(plotting_dir, "cannotlinks.pkl"))

    if args.noisy > 0:
        nml_ind1, nml_ind2, ncl_ind1, ncl_ind2 = generate_random_pair(y, num_constraints*2)
        # NOTE(review): the closure results are unpacked in swapped order
        # (must-link outputs into the ncl_* names and vice versa). This looks
        # deliberate, producing wrong-on-purpose "noisy" constraints; confirm.
        ncl_ind1, ncl_ind2, nml_ind1, nml_ind2 = transitive_closure(nml_ind1, nml_ind2, ncl_ind1, ncl_ind2, X.shape[0])

        # Keep a fraction args.noisy of the clean constraint counts.
        nml_ind1 = nml_ind1[:int(ml_ind1.size*args.noisy)]
        nml_ind2 = nml_ind2[:int(ml_ind2.size*args.noisy)]
        ncl_ind1 = ncl_ind1[:int(cl_ind1.size*args.noisy)]
        ncl_ind2 = ncl_ind2[:int(cl_ind2.size*args.noisy)]

        if plotting_dir:
            nmldf = pd.DataFrame(data=[nml_ind1, nml_ind2]).T
            nmldf.to_pickle(os.path.join(plotting_dir, "noisymustlinks.pkl"))
            ncldf = pd.DataFrame(data=[ncl_ind1, ncl_ind2]).T
            ncldf.to_pickle(os.path.join(plotting_dir, "noisycannotlinks.pkl"))

        ml_ind1 = np.append(ml_ind1, nml_ind1)
        ml_ind2 = np.append(ml_ind2, nml_ind2)
        cl_ind1 = np.append(cl_ind1, ncl_ind1)
        cl_ind2 = np.append(cl_ind2, ncl_ind2)

    anchor, positive, negative = np.array([]), np.array([]), np.array([])
    instance_guidance = torch.zeros(X.shape[0]).cuda()
    use_global = False

    # Train Neural Network
    train_acc, train_nmi, epo = idec.fit(
        anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2,
        instance_guidance, use_global, ml_penalty, cl_penalty, X, y,
        lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
        update_interval=args.update_interval, tol=1*1e-3,
        use_kmeans=args.use_kmeans, plotting=plotting_dir)

    # Make Predictions
    test_acc, test_nmi = idec.predict(test_X, test_y)

    # Report Results
    print("ACC:", train_acc)
    print("NMI;", train_nmi)
    print("Epochs:", epo)
    print("testAcc:", test_acc)
    print("testNMI:", test_nmi)
    print("ML Closure:", ml_ind1.shape[0])
    print("CL Closure:", cl_ind1.shape[0])
--------------------------------------------------------------------------------
/experiments/run_DCC_triplets.py:
--------------------------------------------------------------------------------
import sys
sys.path.append("..")
import torch.utils.data
import numpy as np
import argparse
from lib.dcc import IDEC
from lib.datasets import MNIST, FashionMNIST
from lib.utils import generate_mnist_triplets, generate_triplet_constraints_continuous


def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    ``type=bool`` cannot be used with argparse: ``bool("False")`` is True, so
    every non-empty value would enable the option. Raises
    ``argparse.ArgumentTypeError`` for unrecognised text.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Triplet Constraints Example')
    parser.add_argument('--lr', type=float, default=0.001, metavar='N',
                        help='learning rate for training (default: 0.001)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--update-interval', type=int, default=1, metavar='N',
                        help='update-interval (default: 1)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
                        help='directory for pre-trained weights')
    parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion)')
    parser.add_argument('--use_pretrain', type=_str2bool, default=True)
    args = parser.parse_args()

    # Set parameters (the Fashion branch below raises ml_penalty).
    ml_penalty, cl_penalty = 0.1, 1

    # Load data: only the requested dataset is loaded; unknown names select MNIST.
    use_fashion = args.data == "Fashion"
    if use_fashion:
        train_set = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
        test_set = FashionMNIST('./dataset/fashion_mnist', train=False)
        args.pretrain = "../model/fashion_sdae_weights.pt"
        ml_penalty = 1
    else:
        train_set = MNIST('./dataset/mnist', train=True, download=True)
        test_set = MNIST('./dataset/mnist', train=False)
    X = train_set.train_data
    y = train_set.train_labels
    test_X = test_set.test_data
    test_y = test_set.test_labels

    idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
                encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
    if args.use_pretrain:
        idec.load_model(args.pretrain)

    # Print Network Structure
    print(idec)

    # Construct constraints: triplets only, no pairwise constraints.
    ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
    if use_fashion:
        anchor, positive, negative = generate_triplet_constraints_continuous(y, 6000)
    else:
        anchor, positive, negative = generate_mnist_triplets(y, 6000)
    instance_guidance = torch.zeros(X.shape[0]).cuda()
    use_global = False

    # Train the network
    train_acc, train_nmi, epo = idec.fit(
        anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2,
        instance_guidance, use_global, ml_penalty, cl_penalty, X, y,
        lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
        update_interval=args.update_interval, tol=2*1e-3)

    # Make predictions
    test_acc, test_nmi = idec.predict(test_X, test_y)

    # Print the result
    print("ACC:", train_acc)
    print("NMI;", train_nmi)
    print("Epochs:", epo)
    print("testAcc:", test_acc)
    print("testNMI:", test_nmi)
    print("ML Closure:", ml_ind1.shape[0])
    print("CL Closure:", cl_ind1.shape[0])
--------------------------------------------------------------------------------
/experiments/run_DEC.py:
--------------------------------------------------------------------------------
import sys
sys.path.append("..")
import argparse
from lib.dec import DEC
from lib.datasets import MNIST

if __name__ == "__main__":
    # One row per command-line option: (flag, type, default, help text).
    # Every option shares the same metavar.
    option_table = (
        ('--lr', float, 0.01, 'learning rate for training (default: 0.01)'),
        ('--batch-size', int, 256, 'input batch size for training (default: 256)'),
        ('--update-interval', int, 1, 'update-interval (default: 1)'),
        ('--epochs', int, 200, 'number of epochs to train (default: 200)'),
        ('--pretrain', str, "../model/sdae.pt", 'use pre-trained weights'),
    )
    parser = argparse.ArgumentParser(description='DEC MNIST Example')
    for flag, value_type, fallback, description in option_table:
        parser.add_argument(flag, type=value_type, default=fallback, metavar='N',
                            help=description)
    args = parser.parse_args()

    # Both MNIST splits are constructed; only the training split is used below.
    data_root = './dataset/mnist'
    mnist_train = MNIST(data_root, train=True, download=True)
    mnist_test = MNIST(data_root, train=False)
    X, y = mnist_train.train_data, mnist_train.train_labels

    # Build the model, show its structure, load the weights, then train.
    model_config = dict(input_dim=784, z_dim=10, n_clusters=10,
                        encodeLayer=[500, 500, 2000], activation="relu", dropout=0)
    dec = DEC(**model_config)
    print(dec)
    dec.load_model(args.pretrain)

    fit_options = dict(lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
                       update_interval=args.update_interval)
    dec.fit(X, y, **fit_options)
--------------------------------------------------------------------------------
/experiments/run_improved_DEC.py:
--------------------------------------------------------------------------------
import sys
sys.path.append("..")
import torch.utils.data
import numpy as np
import argparse
from lib.dcc import IDEC
from lib.datasets import MNIST, FashionMNIST, Reuters


def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    ``type=bool`` cannot be used with argparse: ``bool("False")`` is True, so
    every non-empty value would enable the option. Raises
    ``argparse.ArgumentTypeError`` for unrecognised text.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='IDEC MNIST Example')
    parser.add_argument('--lr', type=float, default=0.001, metavar='N',
                        help='learning rate for training (default: 0.001)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--update-interval', type=int, default=1, metavar='N',
                        help='update-interval (default: 1)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
                        help='directory for pre-trained weights')
    parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)')
    parser.add_argument('--use_pretrain', type=_str2bool, default=True)
    args = parser.parse_args()

    # Set parameters (dataset branches below override what differs).
    ml_penalty, cl_penalty = 0.1, 1
    input_dim, n_clusters = 784, 10

    # Load data: only the requested dataset is loaded; unknown names select MNIST.
    if args.data == "Fashion":
        train_set = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
        test_set = FashionMNIST('./dataset/fashion_mnist', train=False)
        args.pretrain = "../model/fashion_sdae_weights.pt"
        ml_penalty = 1
    elif args.data == "Reuters":
        train_set = Reuters('./dataset/reuters', train=True, download=False)
        test_set = Reuters('./dataset/reuters', train=False)
        args.pretrain = "../model/reuters10k_sdae_weights.pt"
        input_dim, n_clusters = 2000, 4
    else:
        train_set = MNIST('./dataset/mnist', train=True, download=True)
        test_set = MNIST('./dataset/mnist', train=False)
    X = train_set.train_data
    y = train_set.train_labels
    test_X = test_set.test_data
    test_y = test_set.test_labels

    idec = IDEC(input_dim=input_dim, z_dim=10, n_clusters=n_clusters,
                encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
    if args.use_pretrain:
        idec.load_model(args.pretrain)

    # Print network structure
    print(idec)

    # Construct constraints (here is the baseline so no constraints are provided).
    ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
    anchor, positive, negative = np.array([]), np.array([]), np.array([])
    instance_guidance = torch.zeros(X.shape[0]).cuda()
    use_global = False

    # Train the clustering model
    train_acc, train_nmi, epo = idec.fit(
        anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2,
        instance_guidance, use_global, ml_penalty, cl_penalty, X, y,
        lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
        update_interval=args.update_interval, tol=1*1e-3)

    # Test on the test data
    test_acc, test_nmi = idec.predict(test_X, test_y)

    # Print the result
    print("Training Accuracy:", train_acc)
    print("Training NMI;", train_nmi)
    print("Training Epochs:", epo)
    print("Test Accuracy:", test_acc)
    print("Test NMI:", test_nmi)
--------------------------------------------------------------------------------
/experiments/run_sdae.py:
--------------------------------------------------------------------------------
import os
import sys

sys.path.append("..")
import torch.utils.data
import argparse
from lib.stackedDAE import StackedDAE
from lib.datasets import MNIST

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='SDAE MNIST Example')
    parser.add_argument('--lr', type=float, default=0.1, metavar='N',
                        help='learning rate for training (default: 0.1)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--pretrainepochs', type=int, default=300, metavar='N',
                        help='number of epochs to train (default: 300)')
    parser.add_argument('--epochs', type=int, default=500, metavar='N',
                        help='number of epochs to train (default: 500)')
    args = parser.parse_args()

    # Create the output directory up front so a missing folder is found before
    # training starts rather than when the weights are saved.
    # NOTE(review): the other experiment scripts read weights from
    # "../model/mnist_sdae_weights.pt"; confirm whether this path and file
    # name are meant to differ.
    weights_path = "model/sdae_mnist_weights.pt"
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)

    # Load data for pre-training
    train_loader = torch.utils.data.DataLoader(
        MNIST('./dataset/mnist', train=True, download=True),
        batch_size=args.batch_size, shuffle=True, num_workers=0)
    test_loader = torch.utils.data.DataLoader(
        MNIST('./dataset/mnist', train=False),
        batch_size=args.batch_size, shuffle=False, num_workers=0)

    sdae = StackedDAE(input_dim=784, z_dim=10, binary=False,
                      encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu",
                      dropout=0)

    # Print the pre-train model structure
    print(sdae)
    sdae.pretrain(train_loader, test_loader, lr=args.lr, batch_size=args.batch_size,
                  num_epochs=args.pretrainepochs, corrupt=0.2, loss_type="mse")

    # Train the stacked denoising autoencoder
    sdae.fit(train_loader, test_loader, lr=args.lr, num_epochs=args.epochs, corrupt=0.2, loss_type="mse")

    # Save the weights as pre-trained model for IDEC/DEC/DCC
    sdae.save_model(weights_path)
--------------------------------------------------------------------------------
/lib/__pycache__/datasets.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/datasets.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/dcc.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/dcc.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/dec.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/dec.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/denoisingAutoencoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/denoisingAutoencoder.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/ops.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/ops.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/stackedDAE.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/stackedDAE.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/datasets.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path
3 | import errno
4 | import numpy as np
5 | import gzip
6 | import torch
7 | import pickle
8 | import torch.utils.data as data
9 | import codecs
10 | import urllib
11 |
12 |
class MNIST(data.Dataset):
    """MNIST dataset loaded fully into memory as flattened float tensors.

    Images are reshaped to one row per sample, converted to float and
    multiplied by 0.02; labels are converted with ``.int()``.  Both tensors
    are moved to the GPU when CUDA is available.

    Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): Stored on the instance; it is not
            applied by ``__getitem__``.
        target_transform (callable, optional): Stored on the instance; it is
            not applied by ``__getitem__``.
    """
    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]
    raw_folder = 'raw'
    processed_folder = 'processed'
    training_file = 'training.pt'
    test_file = 'test.pt'
    classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
               '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
    class_to_idx = {_class: i for i, _class in enumerate(classes)}

    @property
    def targets(self):
        """Labels of the split selected by ``self.train``."""
        if self.train:
            return self.train_labels
        else:
            return self.test_labels

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set
        self.use_cuda = torch.cuda.is_available()

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        # NOTE: pixel values are scaled by 0.02 rather than by 1/255.
        if self.train:
            self.train_data, self.train_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
            self.train_data = self.train_data.view(self.train_data.size(0), -1).float()*0.02
            self.train_labels = self.train_labels.int()
            if self.use_cuda:
                self.train_data = self.train_data.cuda()
                self.train_labels = self.train_labels.cuda()
        else:
            self.test_data, self.test_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))
            self.test_data = self.test_data.view(self.test_data.size(0), -1).float()*0.02
            self.test_labels = self.test_labels.int()
            if self.use_cuda:
                self.test_data = self.test_data.cuda()
                self.test_labels = self.test_labels.cuda()

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.train:
            img, target = self.train_data[index], self.train_labels[index]
        else:
            img, target = self.test_data[index], self.test_labels[index]

        return img, target

    def __len__(self):
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)

    def _check_exists(self):
        """Return True when both processed split files are present under ``root``."""
        return os.path.exists(os.path.join(self.root, self.processed_folder, self.training_file)) and \
            os.path.exists(os.path.join(self.root, self.processed_folder, self.test_file))

    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""
        import gzip
        import shutil
        import urllib.request

        if self._check_exists():
            return

        raw_dir = os.path.join(self.root, self.raw_folder)
        processed_dir = os.path.join(self.root, self.processed_folder)
        # Create each directory independently: with both calls in one
        # try-block an already existing raw folder would skip the creation
        # of the processed folder.
        os.makedirs(raw_dir, exist_ok=True)
        os.makedirs(processed_dir, exist_ok=True)

        # download files
        for url in self.urls:
            print('Downloading ' + url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_dir, filename)
            # Stream to disk and close the HTTP response deterministically.
            with urllib.request.urlopen(url) as response, open(file_path, 'wb') as f:
                shutil.copyfileobj(response, f)
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                shutil.copyfileobj(zip_f, out_f)
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(raw_dir, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_dir, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_dir, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_dir, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += ' Number of datapoints: {}\n'.format(self.__len__())
        tmp = 'train' if self.train is True else 'test'
        fmt_str += ' Split: {}\n'.format(tmp)
        fmt_str += ' Root Location: {}\n'.format(self.root)
        tmp = ' Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = ' Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str
165 |
166 |
def read_label_file(path):
    """Parse an IDX label file into a 1-D ``torch.long`` tensor.

    The file starts with two big-endian 32-bit integers: the magic number
    2049 and the number of labels; one unsigned byte per label follows.

    Args:
        path: Path of the uncompressed label file.

    Returns:
        Tensor of shape ``(length,)`` and dtype ``torch.long``.

    Raises:
        ValueError: If the magic number is not 2049.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    magic = int.from_bytes(raw[:4], 'big')
    # Explicit check instead of ``assert`` so it is not stripped under -O.
    if magic != 2049:
        raise ValueError('%s is not an IDX label file (magic number %d, expected 2049)' % (path, magic))
    length = int.from_bytes(raw[4:8], 'big')
    parsed = np.frombuffer(raw, dtype=np.uint8, offset=8)
    return torch.from_numpy(parsed).view(length).long()
174 |
175 |
def get_int(b):
    """Interpret the bytes ``b`` as a big-endian unsigned integer."""
    # int.from_bytes replaces the hex round-trip through ``codecs``.
    return int.from_bytes(b, 'big')
178 |
179 |
def read_image_file(path):
    """Parse an IDX image file into a ``uint8`` tensor of images.

    The file starts with four big-endian 32-bit integers: the magic number
    2051, the number of images, the row count and the column count; one
    unsigned byte per pixel follows.

    Args:
        path: Path of the uncompressed image file.

    Returns:
        Tensor of shape ``(length, num_rows, num_cols)`` and dtype
        ``torch.uint8``.

    Raises:
        ValueError: If the magic number is not 2051.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    magic = int.from_bytes(raw[:4], 'big')
    # Explicit check instead of ``assert`` so it is not stripped under -O.
    if magic != 2051:
        raise ValueError('%s is not an IDX image file (magic number %d, expected 2051)' % (path, magic))
    length = int.from_bytes(raw[4:8], 'big')
    num_rows = int.from_bytes(raw[8:12], 'big')
    num_cols = int.from_bytes(raw[12:16], 'big')
    parsed = np.frombuffer(raw, dtype=np.uint8, offset=16)
    return torch.from_numpy(parsed).view(length, num_rows, num_cols)
190 |
191 |
class FashionMNIST(MNIST):
    """Fashion-MNIST dataset; shares all loading logic with :class:`MNIST`.

    Only the download URLs and the class names differ from the parent.

    Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): Stored on the instance by the parent
            constructor.
        target_transform (callable, optional): Stored on the instance by the
            parent constructor.
    """
    urls = [
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
    ]
    classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
    # Rebuild the mapping: the one inherited from MNIST is keyed by the digit
    # class names and would not match ``classes`` above.
    class_to_idx = {_class: i for i, _class in enumerate(classes)}
214 |
215 |
class Reuters(data.Dataset):
    """Pre-processed Reuters dataset stored as ``.npy`` files.

    Each file holds a dict with the keys ``'data'`` and ``'label'``.  The
    selected split is converted to float32 / int tensors and moved to the GPU
    when CUDA is available.

    To download the processed reuters data, please run the script named as
    "download_data.sh".

    Args:
        root (string): Directory containing the ``.npy`` files.
        train (bool, optional): If True load the training file, otherwise the
            test file.
        transform (callable, optional): Stored on the instance; it is not
            applied by ``__getitem__``.
        target_transform (callable, optional): Stored on the instance; it is
            not applied by ``__getitem__``.
        download (bool, optional): If True, :meth:`download` is called, which
            only verifies that the requested file is already present.
    """
    training_file = "reutersidf10k_train.npy"
    test_file = "reutersidf10k_test.npy"

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set
        self.use_cuda = torch.cuda.is_available()

        if download:
            self.download()

        # The file stores a pickled dict, so allow_pickle=True is required by
        # current NumPy releases.
        # NOTE(security): unpickling executes arbitrary code; only load files
        # obtained from a trusted source.
        payload = np.load(self._split_path(), allow_pickle=True).item()
        samples = torch.tensor(payload['data'], dtype=torch.float32)
        labels = torch.tensor(payload['label'], dtype=torch.int)
        if self.use_cuda:
            samples = samples.cuda()
            labels = labels.cuda()

        if self.train:
            self.train_data, self.train_labels = samples, labels
        else:
            self.test_data, self.test_labels = samples, labels

    def _split_path(self):
        """Return the path of the file backing the selected split."""
        filename = self.training_file if self.train else self.test_file
        return os.path.join(self.root, filename)

    def download(self):
        """Verify that the data file for the selected split exists.

        There is no automatic download for this dataset.

        Raises:
            RuntimeError: If the file is missing.
        """
        path = self._split_path()
        if not os.path.exists(path):
            raise RuntimeError('Dataset file not found: ' + path +
                               '. Please run download_data.sh to fetch the processed Reuters data.')

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (sample, target) taken from the selected split.
        """
        if self.train:
            img, target = self.train_data[index], self.train_labels[index]
        else:
            img, target = self.test_data[index], self.test_labels[index]

        return img, target

    def __len__(self):
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)
264 |
--------------------------------------------------------------------------------
/lib/dcc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import Parameter
4 | import torch.optim as optim
5 | from torch.autograd import Variable
6 |
7 | import numpy as np
8 | import os
9 | import math
10 | import collections
11 | import pickle
12 | import json
13 |
14 | from lib.utils import acc
15 | from sklearn.metrics.cluster import normalized_mutual_info_score
16 | from sklearn.cluster import KMeans
17 | import pandas as pd
18 |
class MSELoss(nn.Module):
    """Mean squared error averaged over every element of the input."""

    def __init__(self):
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()

    def forward(self, input, target):
        """Return ``mean((input - target) ** 2)`` as a scalar tensor."""
        return torch.mean((input-target)**2)
25 |
26 |
def buildNetwork(layers, activation="relu", dropout=0):
    """Stack fully connected layers into an ``nn.Sequential``.

    For every consecutive pair of sizes in ``layers`` a ``Linear`` module is
    added, followed by a ReLU or Sigmoid when ``activation`` is ``"relu"`` or
    ``"sigmoid"`` (any other value adds no activation), and by a ``Dropout``
    when ``dropout`` is greater than zero.
    """
    modules = []
    for in_features, out_features in zip(layers[:-1], layers[1:]):
        modules.append(nn.Linear(in_features, out_features))
        if activation == "relu":
            modules.append(nn.ReLU())
        elif activation == "sigmoid":
            modules.append(nn.Sigmoid())
        if dropout > 0:
            modules.append(nn.Dropout(dropout))
    return nn.Sequential(*modules)
38 |
39 |
class IDEC(nn.Module):
    """Autoencoder with a soft cluster-assignment head and constraint losses.

    The encoder maps an input to a latent vector ``z``, the decoder
    reconstructs the input from ``z``, and ``mu`` holds one learnable centre
    per cluster.  ``fit`` optimizes clustering and reconstruction losses
    together with optional pairwise (must-link / cannot-link), triplet,
    instance-difficulty and global cluster-size terms.

    Args:
        input_dim: Size of one input vector.
        z_dim: Size of the latent space.
        n_clusters: Number of cluster centres.
        encodeLayer: Hidden layer sizes of the encoder.
        decodeLayer: Hidden layer sizes of the decoder.
        activation: ``"relu"`` or ``"sigmoid"``; passed to ``buildNetwork``.
        dropout: Dropout probability; passed to ``buildNetwork``.
        alpha: Degrees of freedom of the Student's t kernel in ``soft_assign``.
        gamma: Weight of the clustering (KL) loss in ``cluster_loss``.
    """

    def __init__(self, input_dim=784, z_dim=10, n_clusters=10,
                 encodeLayer=[400], decodeLayer=[400], activation="relu", dropout=0, alpha=1., gamma=0.1):
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()
        # Copy so the (shared) default lists are never aliased or mutated.
        encodeLayer = list(encodeLayer)
        decodeLayer = list(decodeLayer)
        self.z_dim = z_dim
        self.layers = [input_dim] + encodeLayer + [z_dim]
        self.activation = activation
        self.dropout = dropout
        self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
        self.decoder = buildNetwork([z_dim] + decodeLayer, activation=activation, dropout=dropout)
        self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)
        self._dec = nn.Linear(decodeLayer[-1], input_dim)

        self.n_clusters = n_clusters
        self.alpha = alpha
        self.gamma = gamma
        # Left uninitialized here; fit() overwrites it with k-means centres.
        self.mu = Parameter(torch.Tensor(n_clusters, z_dim))

    def save_model(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load the entries of the checkpoint at ``path`` whose names exist in this model.

        Tensors are mapped to CPU storage while loading; checkpoint keys that
        the model does not have are ignored.
        """
        pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
        model_dict = self.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)

    def forward(self, x):
        """Return ``(z, q, xrecon)``: latent code, soft assignment (NxK) and reconstruction."""
        h = self.encoder(x)
        z = self._enc_mu(h)
        h = self.decoder(z)
        xrecon = self._dec(h)
        # compute q -> NxK
        q = self.soft_assign(z)
        return z, q, xrecon

    def soft_assign(self, z):
        """Return the row-normalized Student's t similarity of ``z`` to every centre.

        ``q_ij`` is proportional to
        ``(1 + ||z_i - mu_j||^2 / alpha) ** (-(alpha + 1) / 2)``.
        """
        dist_sq = torch.sum((z.unsqueeze(1) - self.mu)**2, dim=2)
        # The exponent is (alpha + 1) / 2.  Writing ``q**(alpha+1.0)/2.0``
        # parses as ``(q**(alpha+1)) / 2``, which applies the wrong exponent.
        q = (1.0 + dist_sq / self.alpha) ** (-(self.alpha + 1.0) / 2.0)
        q = q / torch.sum(q, dim=1, keepdim=True)
        return q

    def encodeBatch(self, X, batch_size=256):
        """Encode ``X`` in mini-batches and return the concatenated latent codes.

        Moves the model to the GPU when CUDA is available and switches it to
        eval mode; the mode is not restored afterwards.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        encoded = []
        self.eval()
        num = X.shape[0]
        num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
        # Pure inference: skip building autograd graphs.
        with torch.no_grad():
            for batch_idx in range(num_batch):
                xbatch = X[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
                z, _, _ = self.forward(xbatch)
                encoded.append(z.data)

        encoded = torch.cat(encoded, dim=0)
        return encoded

    def cluster_loss(self, p, q):
        """Return ``gamma`` times the batch-mean KL divergence between target ``p`` and ``q``."""
        def kld(target, pred):
            return torch.mean(torch.sum(target*torch.log(target/(pred+1e-6)), dim=1))
        kldloss = kld(p, q)
        return self.gamma*kldloss

    def recon_loss(self, x, xrecon):
        """Return the mean squared reconstruction error."""
        recon_loss = torch.mean((xrecon-x)**2)
        return recon_loss

    def pairwise_loss(self, p1, p2, cons_type):
        """Return the must-link loss when ``cons_type == "ML"``, else the cannot-link loss.

        Both are built from the row-wise inner product of the two assignment
        matrices: ``-log(s)`` for must-link and ``-log(1 - s)`` otherwise.
        """
        if cons_type == "ML":
            ml_loss = torch.mean(-torch.log(torch.sum(p1 * p2, dim=1)))
            return ml_loss
        else:
            cl_loss = torch.mean(-torch.log(1.0 - torch.sum(p1 * p2, dim=1)))
            return cl_loss

    def global_size_loss(self, p, cons_detail):
        """Return the squared distance between the normalized mean assignment and ``cons_detail``."""
        m_p = torch.mean(p, dim=0)
        m_p = m_p / torch.sum(m_p)
        return torch.sum((m_p-cons_detail)*(m_p-cons_detail))

    def difficulty_loss(self, q, mask):
        """Return ``-0.1`` times the L2 norm of ``q`` weighted row-wise by ``mask``.

        ``mask`` holds one weight per row of ``q`` and is left unmodified.
        """
        # Out-of-place unsqueeze: the in-place variant reshaped the caller's
        # tensor, so a second call with the same mask failed.
        row_weights = mask.unsqueeze(-1).expand(q.shape[0], q.shape[1])
        mask_q = q * row_weights
        diff_loss = -torch.norm(mask_q, 2)
        penalty_degree = 0.1
        return penalty_degree * diff_loss

    def target_distribution(self, q):
        """Return the sharpened target ``p``: ``q**2`` over column sums, rows re-normalized."""
        p = q**2 / torch.sum(q, dim=0)
        p = p / torch.sum(p, dim=1, keepdim=True)
        return p

    def triplet_loss(self, anchor, positive, negative, margin_constant):
        """Return the mean hinge loss over triplets of soft assignments.

        loss = max(d(anchor, negative) - d(anchor, positive) + margin, 0)
        with d(x, y) = sum(q(x) * q(y)).
        """
        negative_dis = torch.sum(anchor * negative, dim=1)
        positive_dis = torch.sum(anchor * positive, dim=1)
        penalty = negative_dis - positive_dis + margin_constant
        # clamp keeps the computation on the inputs' device instead of
        # allocating helper tensors with a hard-coded .cuda().
        triplet_loss = torch.clamp(penalty, min=0)

        return torch.mean(triplet_loss)

    def satisfied_constraints(self, ml_ind1, ml_ind2, cl_ind1, cl_ind2, y_pred):
        """Return the fraction of pairwise constraints that ``y_pred`` satisfies.

        Returns the sentinel ``1.1`` when any of the four index arrays is
        empty.
        """
        if ml_ind1.size == 0 or ml_ind2.size == 0 or cl_ind1.size == 0 or cl_ind2.size == 0:
            return 1.1

        count = 0
        satisfied = 0
        for (i, j) in zip(ml_ind1, ml_ind2):
            count += 1
            if y_pred[i] == y_pred[j]:
                satisfied += 1
        for (i, j) in zip(cl_ind1, cl_ind2):
            count += 1
            if y_pred[i] != y_pred[j]:
                satisfied += 1

        return float(satisfied)/count

    def predict(self, X, y):
        """Cluster ``X`` and return ``(accuracy, nmi)`` measured against labels ``y``.

        Raises:
            ValueError: If ``y`` is None, since both metrics need labels.
        """
        if y is None:
            # Fail with a clear message instead of an AttributeError below.
            raise ValueError("predict() needs ground-truth labels y to compute accuracy and NMI")
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        latent = self.encodeBatch(X)
        q = self.soft_assign(latent)

        # evaluate the clustering performance
        y_pred = torch.argmax(q, dim=1).data.cpu().numpy()
        y = y.data.cpu().numpy()
        final_acc = acc(y, y_pred)
        final_nmi = normalized_mutual_info_score(y, y_pred)
        print("acc: %.5f, nmi: %.5f" % (final_acc, final_nmi))
        return final_acc, final_nmi

    def _save_plot_snapshot(self, plotting, epoch, latent, y, y_pred, intermediate_results):
        """Pickle the latent codes and update the metrics JSON in directory ``plotting``."""
        df = pd.DataFrame(latent.cpu().numpy())
        df["y"] = y
        df.to_pickle(os.path.join(plotting, "save_model_%d.pkl" % epoch))

        intermediate_results["acc"][str(epoch)] = acc(y, y_pred)
        intermediate_results["nmi"][str(epoch)] = normalized_mutual_info_score(y, y_pred)
        with open(os.path.join(plotting, "intermediate_results.json"), "w") as fp:
            json.dump(intermediate_results, fp)

    def fit(self, anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, mask, use_global, ml_p, cl_p, X, y=None,
            lr=0.001, batch_size=256, num_epochs=10, update_interval=1, tol=1e-3, use_kmeans=True, plotting="",
            clustering_loss_weight=1):
        """Train the model on tensor data ``X`` with the supplied constraints.

        Args:
            anchor, positive, negative: Index arrays into ``X`` describing
                triplet constraints (empty arrays disable them).
            ml_ind1, ml_ind2: Index arrays of must-link pairs.
            cl_ind1, cl_ind2: Index arrays of cannot-link pairs.
            mask: One weight per sample, used by ``difficulty_loss`` when
                ``use_global`` is false.
            use_global: If true, add the global cluster-size loss (uniform
                target over the clusters) instead of the difficulty loss.
            ml_p, cl_p: Weights of the must-link / cannot-link losses.
            X: Input tensor, one sample per row.
            y: Optional ground-truth label tensor, used for reporting only.
            lr: Learning rate of the Adam optimizer.
            batch_size: Mini-batch size for every loss.
            num_epochs: Maximum number of epochs.
            update_interval: Recompute the target distribution every this
                many epochs.
            tol: Stop once the fraction of changed predictions is below this.
            use_kmeans: If true, initialize centres with a 20-restart k-means,
                otherwise with a single one-iteration k-means run.
            plotting: Directory for intermediate snapshots; empty disables it.
            clustering_loss_weight: Weight applied to the clustering loss in
                the *reported* total only.

        Returns:
            ``(final_acc, final_nmi, final_epoch)`` from the last evaluation
            (all zero when ``y`` is None).
        """
        # save intermediate results for plotting
        intermediate_results = collections.defaultdict(dict)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Training IDEC=======")
        optimizer = optim.Adam((param for param in self.parameters() if param.requires_grad), lr=lr)

        if use_kmeans:
            print("Initializing cluster centers with kmeans.")
            kmeans = KMeans(self.n_clusters, n_init=20)
        else:
            # use kmeans to randomly initialize cluster centers
            print("Randomly initializing cluster centers.")
            kmeans = KMeans(self.n_clusters, n_init=1, max_iter=1)
        initial_latent = self.encodeBatch(X)
        y_pred = kmeans.fit_predict(initial_latent.data.cpu().numpy())
        y_pred_last = y_pred
        self.mu.data.copy_(torch.Tensor(kmeans.cluster_centers_))

        if y is not None:
            y = y.cpu().numpy()

        num = X.shape[0]
        num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
        ml_num_batch = int(math.ceil(1.0*ml_ind1.shape[0]/batch_size))
        cl_num_batch = int(math.ceil(1.0*cl_ind1.shape[0]/batch_size))
        tri_num_batch = int(math.ceil(1.0*anchor.shape[0]/batch_size))
        cl_num = cl_ind1.shape[0]
        ml_num = ml_ind1.shape[0]
        tri_num = anchor.shape[0]

        # Uniform cluster-size target, built once on the model's device
        # (1 / n_clusters per cluster, i.e. 0.1 for the 10-cluster setup).
        global_cons = None
        if use_global:
            global_cons = torch.full((self.n_clusters,), 1.0 / self.n_clusters, device=self.mu.device)

        final_acc, final_nmi, final_epoch = 0, 0, 0
        update_ml = 1
        update_cl = 1
        update_triplet = 1
        for epoch in range(num_epochs):
            if epoch % update_interval == 0:
                # update the target distribution p
                latent = self.encodeBatch(X)
                q = self.soft_assign(latent)
                p = self.target_distribution(q).data

                # evaluate the clustering performance
                y_pred = torch.argmax(q, dim=1).data.cpu().numpy()
                if use_global and y is not None:
                    for key, total in collections.Counter(y_pred.tolist()).items():
                        print("predicted class: ", key, " total: ", total)

                if y is not None:
                    final_acc = acc(y, y_pred)
                    final_nmi = normalized_mutual_info_score(y, y_pred)
                    final_epoch = epoch
                    print("acc: %.5f, nmi: %.5f" % (final_acc, final_nmi))
                    print("satisfied constraints: %.5f" % self.satisfied_constraints(ml_ind1, ml_ind2, cl_ind1, cl_ind2, y_pred))

                # save model for plotting
                if plotting and (epoch in [10, 20, 30, 40] or epoch % 50 == 0 or epoch == num_epochs-1):
                    self._save_plot_snapshot(plotting, epoch, latent, y, y_pred, intermediate_results)

                # check stop criterion
                try:
                    delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / num
                    y_pred_last = y_pred
                    if epoch > 0 and delta_label < tol:
                        print('delta_label ', delta_label, '< tol ', tol)
                        print("Reach tolerance threshold. Stopping training.")

                        # save model for plotting
                        if plotting:
                            self._save_plot_snapshot(plotting, epoch, latent, y, y_pred, intermediate_results)
                        break
                except Exception as err:
                    # Best effort as before (training goes on), but report the
                    # problem and let KeyboardInterrupt/SystemExit propagate.
                    print("Warning: stop-criterion check failed:", err)

            # encodeBatch() leaves the model in eval mode; switch back so that
            # dropout (when configured) is active during the updates below.
            self.train()

            # train 1 epoch for clustering loss
            recon_loss_val = 0.0
            cluster_loss_val = 0.0
            instance_constraints_loss_val = 0.0
            global_loss_val = 0.0
            for batch_idx in range(num_batch):
                start = batch_idx*batch_size
                end = min(start + batch_size, num)
                inputs = X[start:end]
                target = p[start:end]
                optimizer.zero_grad()

                z, qbatch, xrecon = self.forward(inputs)
                cluster_loss = self.cluster_loss(target, qbatch)
                recon_loss = self.recon_loss(inputs, xrecon)
                if use_global:
                    global_loss = self.global_size_loss(qbatch, global_cons)
                    loss = cluster_loss + recon_loss + global_loss
                else:
                    instance_constraints_loss = self.difficulty_loss(qbatch, mask[start:end])
                    loss = cluster_loss + recon_loss + instance_constraints_loss
                loss.backward()
                optimizer.step()

                cluster_loss_val += cluster_loss.data * len(inputs)
                recon_loss_val += recon_loss.data * len(inputs)
                if use_global:
                    # Accumulated per batch; reported below averaged over num_batch.
                    global_loss_val += global_loss.data
                else:
                    instance_constraints_loss_val += instance_constraints_loss.data * len(inputs)

            # NOTE: clustering_loss_weight only scales the reported total; the
            # optimized loss above uses the unweighted clustering loss.
            train_loss = clustering_loss_weight*cluster_loss_val + recon_loss_val
            if not use_global:
                train_loss = train_loss + instance_constraints_loss_val

            if instance_constraints_loss_val != 0.0:
                print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f Instance Difficulty Loss: %.4f" % (
                    epoch + 1, train_loss / num, cluster_loss_val / num, recon_loss_val / num, instance_constraints_loss_val / num))
            elif global_loss_val != 0.0 and use_global:
                print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f Global Loss: %.4f" % (
                    epoch + 1, train_loss / num + global_loss_val/num_batch, cluster_loss_val / num, recon_loss_val / num, global_loss_val / num_batch))
            else:
                print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f" % (
                    epoch + 1, train_loss / num, cluster_loss_val / num, recon_loss_val / num))

            # must-link constraints
            ml_loss = 0.0
            if epoch % update_ml == 0:
                for ml_batch_idx in range(ml_num_batch):
                    start = ml_batch_idx*batch_size
                    end = min(ml_num, start + batch_size)
                    inputs1 = X[ml_ind1[start:end]]
                    inputs2 = X[ml_ind2[start:end]]
                    optimizer.zero_grad()
                    z1, q1, xr1 = self.forward(inputs1)
                    z2, q2, xr2 = self.forward(inputs2)
                    # ml_p: 0.1 for mnist/reuters, 1 for fashion, the parameters are tuned via grid search on validation set
                    loss = (ml_p*self.pairwise_loss(q1, q2, "ML") + self.recon_loss(inputs1, xr1) + self.recon_loss(inputs2, xr2))
                    ml_loss += loss.data
                    loss.backward()
                    optimizer.step()

            # cannot-link constraints
            cl_loss = 0.0
            if epoch % update_cl == 0:
                for cl_batch_idx in range(cl_num_batch):
                    start = cl_batch_idx*batch_size
                    end = min(cl_num, start + batch_size)
                    inputs1 = X[cl_ind1[start:end]]
                    inputs2 = X[cl_ind2[start:end]]
                    optimizer.zero_grad()
                    z1, q1, xr1 = self.forward(inputs1)
                    z2, q2, xr2 = self.forward(inputs2)
                    loss = cl_p*self.pairwise_loss(q1, q2, "CL")
                    cl_loss += loss.data
                    loss.backward()
                    optimizer.step()

            if ml_num_batch > 0 and cl_num_batch > 0:
                print("Pairwise Total:", round(float(ml_loss.cpu()), 2) + float(cl_loss.cpu()), "ML loss", float(ml_loss.cpu()), "CL loss:", float(cl_loss.cpu()))

            # triplet constraints
            triplet_loss = 0.0
            if epoch % update_triplet == 0:
                for tri_batch_idx in range(tri_num_batch):
                    start = tri_batch_idx*batch_size
                    end = min(tri_num, start + batch_size)
                    inputs1 = X[anchor[start:end]]
                    inputs2 = X[positive[start:end]]
                    inputs3 = X[negative[start:end]]
                    optimizer.zero_grad()
                    z1, q1, xr1 = self.forward(inputs1)
                    z2, q2, xr2 = self.forward(inputs2)
                    z3, q3, xr3 = self.forward(inputs3)
                    loss = self.triplet_loss(q1, q2, q3, 0.1)
                    triplet_loss += loss.data
                    loss.backward()
                    optimizer.step()
            if tri_num_batch > 0:
                print("Triplet Loss:", triplet_loss)
        return final_acc, final_nmi, final_epoch
406 |
--------------------------------------------------------------------------------
/lib/dec.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import Parameter
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | import torchvision
7 | from torchvision import datasets, transforms
8 | from torch.autograd import Variable
9 |
10 | import numpy as np
11 | import math
12 | from lib.utils import acc
13 | from sklearn.metrics.cluster import normalized_mutual_info_score
14 | from sklearn.cluster import KMeans
15 |
16 |
def buildNetwork(layers, activation="relu", dropout=0):
    """Build an ``nn.Sequential`` of fully connected layers.

    ``layers`` lists the layer widths.  Each consecutive pair yields a
    ``Linear`` module, then a ReLU / Sigmoid when ``activation`` is
    ``"relu"`` / ``"sigmoid"`` (other values add nothing), then a ``Dropout``
    when ``dropout`` is positive.
    """
    use_dropout = dropout > 0
    stack = []
    for width_in, width_out in zip(layers, layers[1:]):
        stack.append(nn.Linear(width_in, width_out))
        if activation == "relu":
            stack.append(nn.ReLU())
        elif activation == "sigmoid":
            stack.append(nn.Sigmoid())
        if use_dropout:
            stack.append(nn.Dropout(dropout))
    return nn.Sequential(*stack)
28 |
29 |
class DEC(nn.Module):
    """Deep Embedded Clustering network.

    An MLP encoder (``encoder`` followed by the linear layer ``_enc_mu``)
    maps inputs to a ``z_dim``-dimensional embedding. ``mu`` holds one
    learnable centre per cluster; ``forward`` returns the embedding together
    with the soft cluster assignment computed from the distances to ``mu``.
    """

    def __init__(self, input_dim=784, z_dim=10, n_clusters=10,
                 encodeLayer=[400], activation="relu", dropout=0, alpha=1.):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as DEC is subclassed.
        super(DEC, self).__init__()
        encodeLayer = list(encodeLayer)  # never alias the shared default list
        self.z_dim = z_dim
        self.layers = [input_dim] + encodeLayer + [z_dim]
        self.activation = activation
        self.dropout = dropout
        # f(x) = z
        self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
        self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)

        self.n_clusters = n_clusters
        self.alpha = alpha
        # Allocated uninitialised; fit() overwrites it with k-means centres.
        self.mu = Parameter(torch.Tensor(n_clusters, z_dim))

    def save_model(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load weights from ``path`` onto CPU storage.

        Only entries whose key also exists in this model's ``state_dict``
        are applied; all other entries in the file are ignored.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
        model_dict = self.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)

    def forward(self, x):
        """Return ``(z, q)``: the embedding and the N x K soft assignment."""
        h = self.encoder(x)
        z = self._enc_mu(h)
        # Student's t kernel between every embedding and every centre.
        dist_sq = torch.sum((z.unsqueeze(1) - self.mu) ** 2, dim=2)
        q = 1.0 / (1.0 + dist_sq / self.alpha)
        # The exponent is (alpha + 1) / 2. The previous form
        # `q**(alpha + 1.0) / 2.0` raised to alpha + 1 and then halved,
        # because ** binds tighter than /.
        q = q ** ((self.alpha + 1.0) / 2.0)
        q = q / torch.sum(q, dim=1, keepdim=True)
        return z, q

    def encodeBatch(self, dataloader, islabel=False):
        """Embed every batch of ``dataloader`` and concatenate the results.

        Switches the model to eval mode (and moves it to the GPU when one is
        available). Returns the CPU embeddings, or ``(embeddings, labels)``
        when ``islabel`` is true.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        encoded = []
        ylabels = []
        self.eval()
        # Inference only: skip building autograd graphs.
        with torch.no_grad():
            for inputs, labels in dataloader:
                z, _ = self.forward(inputs)
                encoded.append(z.data.cpu())
                ylabels.append(labels)

        encoded = torch.cat(encoded, dim=0)
        ylabels = torch.cat(ylabels)
        if islabel:
            return encoded, ylabels
        return encoded

    def loss_function(self, p, q):
        """Return the batch-mean KL divergence KL(p || q)."""
        def kld(target, pred):
            # 1e-6 guards against division by a zero prediction.
            return torch.mean(torch.sum(target*torch.log(target/(pred+1e-6)), dim=1))

        return kld(p, q)

    def target_distribution(self, q):
        """Sharpen ``q`` into the target ``p``: square, divide by the
        per-cluster totals, then renormalise each row."""
        p = q**2 / torch.sum(q, dim=0)
        p = p / torch.sum(p, dim=1, keepdim=True)
        return p

    def fit(self, X, y=None, lr=0.001, batch_size=256, num_epochs=10, update_interval=1, tol=1e-3):
        """Train the clustering objective on the tensor ``X``.

        Cluster centres are initialised with k-means on the current
        embedding. Every ``update_interval`` epochs the target distribution
        is recomputed on the whole of ``X`` and training stops early when
        the fraction of samples whose hard assignment changed falls below
        ``tol``. When ``y`` is given, accuracy and NMI are printed.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Training DEC=======")
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)

        print("Initializing cluster centers with kmeans.")
        kmeans = KMeans(self.n_clusters, n_init=20)
        # Only the values are needed here, so no autograd graph is built.
        with torch.no_grad():
            data, _ = self.forward(X)
        y_pred = kmeans.fit_predict(data.data.cpu().numpy())
        y_pred_last = y_pred
        self.mu.data.copy_(torch.Tensor(kmeans.cluster_centers_))
        if y is not None:
            y = y.cpu().numpy()
            print("Kmeans acc: %.5f, nmi: %.5f" % (acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))

        self.train()
        num = X.shape[0]
        num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
        print("num_batches:", num_batch)
        for epoch in range(num_epochs):
            if epoch % update_interval == 0:
                # Update the target distribution p (used as a constant target).
                with torch.no_grad():
                    _, q = self.forward(X)
                p = self.target_distribution(q).data

                # Evaluate the clustering performance.
                y_pred = torch.argmax(q, dim=1).data.cpu().numpy()
                if y is not None:
                    print("epoch: %.5f, acc: %.5f, nmi: %.5f" % (epoch, acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))

                # Check the stop criterion.
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / num
                y_pred_last = y_pred
                if epoch > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print("Reach tolerance threshold. Stopping training.")
                    break

            # Train one epoch over contiguous (unshuffled) slices of X.
            for batch_idx in range(num_batch):
                start = batch_idx * batch_size
                stop = min(start + batch_size, num)
                inputs = X[start:stop]
                target = p[start:stop]

                optimizer.zero_grad()
                _, qbatch = self.forward(inputs)
                loss = self.loss_function(target, qbatch)
                loss.backward()
                optimizer.step()
158 |
159 |
160 |
161 |
162 |
--------------------------------------------------------------------------------
/lib/denoisingAutoencoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import Parameter
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | import torchvision
7 | from torchvision import datasets, transforms
8 | from torch.autograd import Variable
9 |
10 | import numpy as np
11 | import math
12 | from lib.utils import Dataset, masking_noise
13 | from lib.ops import MSELoss, BCELoss
14 |
def adjust_learning_rate(init_lr, optimizer, epoch):
    """Apply a step schedule: ``init_lr`` scaled by 0.1 for every 100 epochs.

    Each param group whose ``"lr"`` differs from the scheduled value is
    updated; the switch is printed once per call, on the first change.
    """
    target_lr = init_lr * (0.1 ** (epoch//100))
    announced = False
    for group in optimizer.param_groups:
        if group["lr"] == target_lr:
            continue
        group["lr"] = target_lr
        if not announced:
            print("Switching to learning rate %f" % target_lr)
            announced = True
24 |
class DenoisingAutoencoder(nn.Module):
    """Single-layer denoising autoencoder.

    Encoding is ``dropout(act(F.linear(x, weight, bias)))`` and decoding is
    ``F.linear(h, deweight, vbias)``, optionally followed by a sigmoid.
    """

    def __init__(self, in_features, out_features, activation="relu",
                 dropout=0.2, tied=False):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(DenoisingAutoencoder, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if tied:
            # NOTE(review): this stores a transposed view taken at
            # construction time, not a registered Parameter, so it is
            # presumably not moved by .cuda() with the module -- confirm
            # before relying on tied=True.
            self.deweight = self.weight.t()
        else:
            self.deweight = Parameter(torch.Tensor(in_features, out_features))
        self.bias = Parameter(torch.Tensor(out_features))
        self.vbias = Parameter(torch.Tensor(in_features))

        if activation == "relu":
            self.enc_act_func = nn.ReLU()
        elif activation == "sigmoid":
            self.enc_act_func = nn.Sigmoid()
        elif activation == "none":
            self.enc_act_func = None
        else:
            # Previously this left enc_act_func unset and failed later with
            # an AttributeError on the first forward pass.
            raise ValueError("Unknown activation: %r" % (activation,))
        self.dropout = nn.Dropout(p=dropout)

        self.reset_parameters()

    def reset_parameters(self):
        """Draw all weights and biases uniformly from [-0.01, 0.01]."""
        stdv = 0.01
        self.weight.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)
        self.deweight.data.uniform_(-stdv, stdv)
        self.vbias.data.uniform_(-stdv, stdv)

    def _encode_current_mode(self, x):
        """Encode ``x`` using whatever mode the dropout layer is in."""
        hidden = F.linear(x, self.weight, self.bias)
        if self.enc_act_func is not None:
            hidden = self.enc_act_func(hidden)
        return self.dropout(hidden)

    def forward(self, x):
        """Encode ``x`` without touching the dropout layer's mode."""
        return self._encode_current_mode(x)

    def encode(self, x, train=True):
        """Encode ``x`` after switching the dropout layer to train or eval
        mode according to ``train``. The mode stays set afterwards."""
        if train:
            self.dropout.train()
        else:
            self.dropout.eval()
        return self._encode_current_mode(x)

    def encodeBatch(self, dataloader):
        """Encode every batch of ``dataloader`` with dropout disabled.

        Inputs are flattened to 2-D floats and moved to the GPU when one is
        available; the concatenated codes are returned on the CPU.
        """
        use_cuda = torch.cuda.is_available()
        encoded = []
        # Inference only: skip building autograd graphs.
        with torch.no_grad():
            for inputs, _ in dataloader:
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                hidden = self.encode(inputs, train=False)
                encoded.append(hidden.data.cpu())

        return torch.cat(encoded, dim=0)

    def decode(self, x, binary=False):
        """Map a code back to input space; apply a sigmoid when ``binary``."""
        recon = F.linear(x, self.deweight, self.vbias)
        if binary:
            # torch.sigmoid replaces the deprecated F.sigmoid.
            return torch.sigmoid(recon)
        return recon

    def _validation_loss(self, validloader, criterion, binary):
        """Return ``(summed loss, sample count)`` over ``validloader`` with
        dropout disabled and no gradient tracking."""
        total_loss = 0.0
        total_num = 0
        with torch.no_grad():
            for inputs, _ in validloader:
                hidden = self.encode(inputs, train=False)
                outputs = self.decode(hidden, binary=binary)
                total_loss += criterion(outputs, inputs).data * len(inputs)
                total_num += inputs.size()[0]
        return total_loss, total_num

    def fit(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.3,
            loss_type="mse"):
        """Train this layer to reconstruct clean inputs from corrupted ones.

        ``trainloader`` and ``validloader`` yield ``(inputs, _)`` pairs.
        Batches are used exactly as yielded: no flattening or device
        transfer is done here. ``loss_type`` is ``"mse"`` or
        ``"cross-entropy"``; the latter also applies a sigmoid to the
        reconstruction. ``batch_size`` is accepted but not used.

        Raises:
            ValueError: if ``loss_type`` is not one of the two known values.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Denoising Autoencoding layer=======")
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
        if loss_type == "mse":
            criterion = MSELoss()
        elif loss_type == "cross-entropy":
            criterion = BCELoss()
        else:
            # Previously this surfaced as a NameError on first use of
            # `criterion`.
            raise ValueError("Unknown loss_type: %r" % (loss_type,))
        binary = loss_type == "cross-entropy"

        # Baseline validation loss before any training. Dropout is disabled
        # here, as in the per-epoch validation; the original ran this pass
        # with encode()'s default train=True.
        total_loss, total_num = self._validation_loss(validloader, criterion, binary)
        valid_loss = total_loss / total_num
        print("#Epoch 0: Valid Reconstruct Loss: %.4f" % (valid_loss))

        self.train()
        for epoch in range(num_epochs):
            # Train one epoch.
            train_loss = 0.0
            adjust_learning_rate(lr, optimizer, epoch)
            for inputs, _ in trainloader:
                inputs_corr = masking_noise(inputs, corrupt)
                optimizer.zero_grad()

                # encode() defaults to train=True, re-enabling dropout after
                # the previous epoch's validation pass switched it off.
                hidden = self.encode(inputs_corr)
                outputs = self.decode(hidden, binary=binary)
                recon_loss = criterion(outputs, inputs)
                train_loss += recon_loss.data*len(inputs)
                recon_loss.backward()
                optimizer.step()

            # Validate.
            valid_loss, _ = self._validation_loss(validloader, criterion, binary)

            print("#Epoch %3d: Reconstruct Loss: %.4f, Valid Reconstruct Loss: %.4f" % (
                epoch+1, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset)))

    def extra_repr(self):
        """Summarise the layer dimensions for ``print(module)``."""
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
179 |
180 |
--------------------------------------------------------------------------------
/lib/ops.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import Parameter
4 | from torch.autograd import Variable
5 | import torch.nn.functional as F
6 | import math
7 |
8 |
class MSELoss(nn.Module):
    """Half mean-squared error: ``0.5 * mean((input - target) ** 2)``."""

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(MSELoss, self).__init__()

    def forward(self, input, target):
        """Return the scalar loss averaged over all elements."""
        return 0.5 * torch.mean((input-target)**2)
15 |
class BCELoss(nn.Module):
    """Binary cross-entropy summed over dim 1 and averaged over dim 0."""

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(BCELoss, self).__init__()

    def forward(self, input, target):
        """Return the scalar loss; ``input`` holds probabilities.

        Both log arguments are clamped at 1e-10 so that probabilities of
        exactly 0 or 1 do not produce infinities.
        """
        log_p = torch.log(torch.clamp(input, min=1e-10))
        log_not_p = torch.log(torch.clamp(1-input, min=1e-10))
        return -torch.mean(torch.sum(target*log_p + (1-target)*log_not_p, 1))
23 |
--------------------------------------------------------------------------------
/lib/stackedDAE.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import Parameter
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | import torchvision
7 | from torchvision import datasets, transforms
8 | from torch.autograd import Variable
9 |
10 | import numpy as np
11 | import math
12 | from lib.utils import Dataset, masking_noise
13 | from lib.ops import MSELoss, BCELoss
14 | from lib.denoisingAutoencoder import DenoisingAutoencoder
15 |
def buildNetwork(layers, activation="relu", dropout=0):
    """Build an ``nn.Sequential`` MLP from a list of layer widths.

    Each consecutive pair of widths yields a ``nn.Linear``; a ReLU or
    Sigmoid follows it when ``activation`` is ``"relu"`` or ``"sigmoid"``
    (other values add nothing), and a ``nn.Dropout(dropout)`` follows that
    when ``dropout`` is positive.
    """
    stack = []
    for idx, width in enumerate(layers[1:]):
        stack.append(nn.Linear(layers[idx], width))
        if activation == "relu":
            stack.append(nn.ReLU())
        elif activation == "sigmoid":
            stack.append(nn.Sigmoid())
        if dropout > 0:
            stack.append(nn.Dropout(dropout))
    return nn.Sequential(*stack)
27 |
def adjust_learning_rate(init_lr, optimizer, epoch):
    """Set every param group's learning rate to the scheduled value.

    The schedule is ``init_lr * 0.1 ** (epoch // 100)``. Groups already at
    that value are left alone; a message is printed once per call if any
    group had to be changed.
    """
    scheduled = init_lr * (0.1 ** (epoch//100))
    stale_groups = [g for g in optimizer.param_groups if g["lr"] != scheduled]
    for position, group in enumerate(stale_groups):
        group["lr"] = scheduled
        if position == 0:
            print("Switching to learning rate %f" % scheduled)
37 |
class StackedDAE(nn.Module):
    """Stacked denoising autoencoder.

    ``encoder`` + ``_enc_mu`` map the input to a ``z_dim`` code and
    ``decoder`` + ``_dec`` map it back, with a final sigmoid when
    ``binary`` is true. ``pretrain`` trains one ``DenoisingAutoencoder``
    per layer and copies the weights in; ``fit`` fine-tunes end to end.
    """

    def __init__(self, input_dim=784, z_dim=10, binary=True,
                 encodeLayer=[400], decodeLayer=[400], activation="relu",
                 dropout=0, tied=False):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(StackedDAE, self).__init__()
        # Copy so the shared default lists are never aliased.
        encodeLayer = list(encodeLayer)
        decodeLayer = list(decodeLayer)
        self.z_dim = z_dim
        self.layers = [input_dim] + encodeLayer + [z_dim]
        self.activation = activation
        self.dropout = dropout
        self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
        self.decoder = buildNetwork([z_dim] + decodeLayer, activation=activation, dropout=dropout)
        self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)

        self._dec = nn.Linear(decodeLayer[-1], input_dim)
        self._dec_act = nn.Sigmoid() if binary else None

    def decode(self, z):
        """Reconstruct an input from the code ``z``."""
        h = self.decoder(z)
        x = self._dec(h)
        if self._dec_act is not None:
            x = self._dec_act(x)
        return x

    def loss_function(self, recon_x, x):
        """Binary cross-entropy summed over dim 1 and averaged over dim 0.

        Both log arguments are clamped at 1e-10 to avoid infinities.
        """
        loss = -torch.mean(torch.sum(x*torch.log(torch.clamp(recon_x, min=1e-10)) +
                                     (1-x)*torch.log(torch.clamp(1-recon_x, min=1e-10)), 1))
        return loss

    def forward(self, x):
        """Return ``(z, reconstruction)`` for the input ``x``."""
        h = self.encoder(x)
        z = self._enc_mu(h)
        return z, self.decode(z)

    def save_model(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load weights from ``path`` onto CPU storage.

        Only entries whose key also exists in this model's ``state_dict``
        are applied; all other entries in the file are ignored.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
        model_dict = self.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)

    def pretrain(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.2, loss_type="cross-entropy"):
        """Greedy layer-wise pretraining.

        One ``DenoisingAutoencoder`` is trained per entry of ``self.layers``.
        The first uses ``loss_type``; later ones use cross-entropy when the
        activation is sigmoid and MSE otherwise. After each layer the data
        is re-encoded and wrapped in new loaders for the next layer, and the
        learned weights are finally copied into this network.
        """
        trloader = trainloader
        valoader = validloader
        daeLayers = []
        last = len(self.layers) - 1
        for l in range(1, len(self.layers)):
            infeatures = self.layers[l-1]
            outfeatures = self.layers[l]
            if l != last:
                dae = DenoisingAutoencoder(infeatures, outfeatures, activation=self.activation, dropout=corrupt)
            else:
                # The code layer is linear and trained without dropout.
                dae = DenoisingAutoencoder(infeatures, outfeatures, activation="none", dropout=0)
            print(dae)
            if l == 1:
                layer_loss = loss_type
            elif self.activation == "sigmoid":
                layer_loss = "cross-entropy"
            else:
                layer_loss = "mse"
            dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type=layer_loss)

            # The next layer trains on this layer's codes (as input and target).
            data_x = dae.encodeBatch(trloader)
            valid_x = dae.encodeBatch(valoader)
            trainset = Dataset(data_x, data_x)
            trloader = torch.utils.data.DataLoader(
                trainset, batch_size=batch_size, shuffle=True, num_workers=0)
            validset = Dataset(valid_x, valid_x)
            valoader = torch.utils.data.DataLoader(
                validset, batch_size=1000, shuffle=False, num_workers=0)
            daeLayers.append(dae)

        self.copyParam(daeLayers)

    def copyParam(self, daeLayers):
        """Copy pretrained per-layer weights into the encoder and decoder.

        ``daeLayers[i]`` must expose ``weight``/``bias`` (encoder side) and
        ``deweight``/``vbias`` (decoder side) for the i-th layer.
        """
        # Modules emitted per layer by buildNetwork: Linear + activation,
        # plus a Dropout when dropout is enabled.
        every = 2 if self.dropout == 0 else 3

        # Input layer.
        self.encoder[0].weight.data.copy_(daeLayers[0].weight.data)
        self.encoder[0].bias.data.copy_(daeLayers[0].bias.data)
        self._dec.weight.data.copy_(daeLayers[0].deweight.data)
        self._dec.bias.data.copy_(daeLayers[0].vbias.data)

        for l in range(1, len(self.layers)-2):
            # Encoder Linear of layer l.
            self.encoder[l*every].weight.data.copy_(daeLayers[l].weight.data)
            self.encoder[l*every].bias.data.copy_(daeLayers[l].bias.data)

            # Matching decoder Linear, counted from the end. The previous
            # index -(l-1)*every-2 equals -l*every only when every == 2;
            # with dropout enabled it pointed at an activation module.
            self.decoder[-l*every].weight.data.copy_(daeLayers[l].deweight.data)
            self.decoder[-l*every].bias.data.copy_(daeLayers[l].vbias.data)

        # Code (z) layer.
        self._enc_mu.weight.data.copy_(daeLayers[-1].weight.data)
        self._enc_mu.bias.data.copy_(daeLayers[-1].bias.data)
        self.decoder[0].weight.data.copy_(daeLayers[-1].deweight.data)
        self.decoder[0].bias.data.copy_(daeLayers[-1].vbias.data)

    def _validation_loss(self, validloader, criterion, use_cuda):
        """Return ``(summed loss, sample count)`` over ``validloader`` in
        eval mode with no gradient tracking."""
        total_loss = 0.0
        total_num = 0
        self.eval()
        with torch.no_grad():
            for inputs, _ in validloader:
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                _, outputs = self.forward(inputs)
                total_loss += criterion(outputs, inputs).data * len(inputs)
                total_num += inputs.size()[0]
        return total_loss, total_num

    def fit(self, trainloader, validloader, lr=0.001, num_epochs=10, corrupt=0.3,
            loss_type="mse"):
        """Fine-tune the whole autoencoder on masked inputs.

        Both loaders yield ``(inputs, _)``; inputs are flattened to 2-D
        floats and moved to the GPU when one is available. ``loss_type`` is
        ``"mse"`` or ``"cross-entropy"``. The model is left in train mode.

        Raises:
            ValueError: if ``loss_type`` is not one of the two known values.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Stacked Denoising Autoencoding Layer=======")
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
        if loss_type == "mse":
            criterion = MSELoss()
        elif loss_type == "cross-entropy":
            criterion = BCELoss()
        else:
            # Previously this surfaced as a NameError on first use of
            # `criterion`.
            raise ValueError("Unknown loss_type: %r" % (loss_type,))

        # Baseline validation loss before any training.
        total_loss, total_num = self._validation_loss(validloader, criterion, use_cuda)
        valid_loss = total_loss / total_num
        print("#Epoch 0: Valid Reconstruct Loss: %.4f" % (valid_loss))

        for epoch in range(num_epochs):
            # Train one epoch; validation below switches to eval mode, so
            # train mode is re-enabled at the start of every epoch.
            self.train()
            adjust_learning_rate(lr, optimizer, epoch)
            train_loss = 0.0
            for inputs, _ in trainloader:
                inputs = inputs.view(inputs.size(0), -1).float()
                inputs_corr = masking_noise(inputs, corrupt)
                if use_cuda:
                    inputs = inputs.cuda()
                    inputs_corr = inputs_corr.cuda()
                optimizer.zero_grad()

                # Reconstruct the clean input from its corrupted version.
                _, outputs = self.forward(inputs_corr)
                recon_loss = criterion(outputs, inputs)
                train_loss += recon_loss.data*len(inputs)
                recon_loss.backward()
                optimizer.step()

            # Validate without dropout.
            valid_loss, _ = self._validation_loss(validloader, criterion, use_cuda)

            print("#Epoch %3d: Reconstruct Loss: %.4f, Valid Reconstruct Loss: %.4f" % (
                epoch+1, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset)))

        self.train()
211 |
212 |
213 |
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
1 | '''Some helper functions for PyTorch, including:
2 | - get_mean_and_std: calculate the mean and std value of dataset.
3 | - msr_init: net parameter initialization.
4 | - progress_bar: progress bar mimic xlua.progress.
5 |
6 | '''
7 | import os
8 | import sys
9 | import time
10 | import math
11 | import numpy as np
12 | import random
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.init as init
16 | import torch.utils.data as data
17 | from scipy.linalg import norm
18 | from PIL import Image
19 |
20 |
def weights_xavier_init(m):
    """Xavier-uniform-initialise the weight of an ``nn.Linear`` and zero its bias.

    Modules of any other type are left untouched, so the function can be
    passed to ``Module.apply``.
    """
    if isinstance(m, nn.Linear):
        # The in-place initialisers (trailing underscore) replace the
        # deprecated nn.init.xavier_uniform / nn.init.constant.
        nn.init.xavier_uniform_(m.weight.data)
        # Linear(bias=False) has no bias tensor to fill.
        if m.bias is not None:
            nn.init.constant_(m.bias.data, 0)
25 |
26 |
class Dataset(data.Dataset):
    """In-memory dataset pairing a ``data`` tensor with a ``labels`` tensor.

    Both tensors are moved to the GPU at construction time when CUDA is
    available. Optional ``transform`` / ``target_transform`` callables are
    applied to the sample and the label respectively on every lookup.
    """

    def __init__(self, data, labels, transform=None, target_transform=None):
        on_gpu = torch.cuda.is_available()
        self.data = data.cuda() if on_gpu else data
        self.labels = labels.cuda() if on_gpu else labels
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """Return the (optionally transformed) ``(sample, label)`` pair."""
        sample = self.data[index]
        label = self.labels[index]

        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return sample, label

    def __len__(self):
        """Number of samples, i.e. ``len(self.data)``."""
        return len(self.data)
50 |
51 |
52 | def masking_noise(data, frac):
53 | """
54 | data: Tensor
55 | frac: fraction of unit to be masked out
56 | """
57 | data_noise = data.clone()
58 | rand = torch.rand(data.size())
59 | data_noise[rand 0:
205 | tmp1 = random.randint(0, y.shape[0] - 1)
206 | tmp2 = random.randint(0, y.shape[0] - 1)
207 | if tmp1 == tmp2:
208 | continue
209 | if y[tmp1] == y[tmp2]:
210 | ml_ind1.append(tmp1)
211 | ml_ind2.append(tmp2)
212 | else:
213 | cl_ind1.append(tmp1)
214 | cl_ind2.append(tmp2)
215 | num -= 1
216 | return np.array(ml_ind1), np.array(ml_ind2), np.array(cl_ind1), np.array(cl_ind2)
217 |
218 |
def generate_mnist_triplets(y, num, embedding_path="../model/mnist_triplet_embedding.npy", margin=35):
    """Generate ``num`` random triplet constraints from a trusted MNIST embedding.

    Indices are drawn uniformly from ``range(y.shape[0])``. A triplet
    ``(anchor, pos, neg)`` is kept only when the anchor-negative L2 distance
    in the embedding exceeds the anchor-positive distance by more than
    ``margin``. Sampling repeats until ``num`` triplets are accepted, so the
    call does not return if no triplet can satisfy the margin.

    The default embedding is fetched by ``download_model.sh``; you can also
    create your own trusted embedding by running the pairwise constraints
    model with 100000 randomly generated constraints.

    Args:
        y: array whose first dimension gives the number of samples.
        num: number of triplets to generate.
        embedding_path: ``.npy`` file holding one embedding row per sample.
        margin: required distance gap; the default 35 was selected by grid
            search to produce human-trusted positive/negative pairs.

    Returns:
        Three index arrays ``(anchor_inds, pos_inds, neg_inds)``.
    """
    embedding = np.load(embedding_path)
    last_index = y.shape[0] - 1
    anchor_inds, pos_inds, neg_inds = [], [], []
    while num > 0:
        anchor = random.randint(0, last_index)
        pos = random.randint(0, last_index)
        neg = random.randint(0, last_index)
        pos_distance = norm(embedding[anchor] - embedding[pos], 2)
        neg_distance = norm(embedding[anchor] - embedding[neg], 2)
        if neg_distance <= pos_distance + margin:
            continue
        anchor_inds.append(anchor)
        pos_inds.append(pos)
        neg_inds.append(neg)
        num -= 1
    return np.array(anchor_inds), np.array(pos_inds), np.array(neg_inds)
242 |
243 |
def generate_triplet_constraints_continuous(y, num, embedding_path="../model/fashion_triplet_embedding.npy", margin=80):
    """Generate ``num`` random triplet constraints from a trusted Fashion embedding.

    Indices are drawn uniformly from ``range(y.shape[0])``. A triplet
    ``(anchor, pos, neg)`` is kept only when the anchor-negative L2 distance
    in the embedding exceeds the anchor-positive distance by more than
    ``margin``. Sampling repeats until ``num`` triplets are accepted, so the
    call does not return if no triplet can satisfy the margin.

    The default embedding is fetched by ``download_model.sh``; you can also
    create your own trusted embedding by running the pairwise constraints
    model with 100000 randomly generated constraints.

    Args:
        y: array whose first dimension gives the number of samples.
        num: number of triplets to generate.
        embedding_path: ``.npy`` file holding one embedding row per sample.
        margin: required distance gap; the default 80 was selected by grid
            search to produce human-trusted positive/negative pairs.

    Returns:
        Three index arrays ``(anchor_inds, pos_inds, neg_inds)``.
    """
    embedding = np.load(embedding_path)
    last_index = y.shape[0] - 1
    anchor_inds, pos_inds, neg_inds = [], [], []
    while num > 0:
        anchor = random.randint(0, last_index)
        pos = random.randint(0, last_index)
        neg = random.randint(0, last_index)
        pos_distance = norm(embedding[anchor] - embedding[pos], 2)
        neg_distance = norm(embedding[anchor] - embedding[neg], 2)
        if neg_distance <= pos_distance + margin:
            continue
        anchor_inds.append(anchor)
        pos_inds.append(pos)
        neg_inds.append(neg)
        num -= 1
    return np.array(anchor_inds), np.array(pos_inds), np.array(neg_inds)
267 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Code for ECMLPKDD 2019 Paper: [A Framework for Deep Constrained Clustering - Algorithms and Advances](https://arxiv.org/abs/1901.10061)
2 |
3 | ## Installation
4 |
5 | #### Step 1: Clone the Code from Github
6 |
7 | ```
8 | git clone https://github.com/blueocean92/deep_constrained_clustering
9 | cd deep_constrained_clustering
10 | ```
11 |
12 |
13 |
14 |
15 | #### Step 2: Install Requirements
16 |
17 | **Python**: see [`requirements.txt`](https://github.com/blueocean92/deep_constrained_clustering/blob/master/requirements.txt) for the complete list of required packages. We recommend doing a clean installation of the requirements in a dedicated conda environment:
18 | ```bash
19 | conda create -n testenv python=3.6
20 | source activate testenv
21 | pip install -r requirements.txt
22 | ```
23 |
24 | If you don't want to do the above clean installation in a dedicated environment, you can also install the requirements directly:
25 | ```bash
26 | pip install -r requirements.txt --no-index
27 | ```
28 |
29 | **PyTorch**: Note that you need [PyTorch](https://pytorch.org/). We used version 1.0.0. If you use the above environment setup, PyTorch will be installed automatically.
30 |
31 |
32 | ## Running Constrained Clustering Experiments
33 |
34 | While in `deep_constrained_clustering` folder:
35 |
36 | #### Step 1: Download Pretrained Networks
37 |
38 | ```
39 | sh download_model.sh
40 | ```
41 |
42 | #### Step 2: Download Processed Reuters Data (optional; MNIST and Fashion are available in torchvision.datasets)
43 |
44 | ```
45 | sh download_data.sh
46 | ```
47 |
48 | ```
49 | cd experiments/
50 | ```
51 |
52 | While in `deep_constrained_clustering/experiments` folder:
53 | #### Step 3: Run Experimental Scripts to Reproduce Results
54 |
55 | ###### Option 1: Run Demo Pairwise Constraints Script
56 |
57 | To run the pairwise constrained clustering using pre-trained weights (AE features, 6000 constraints), do:
58 | ```bash
59 | python run_DCC_pairwise.py --data $DATA
60 | ```
61 |
62 | For the `--data` flag which specifies the data set being used, the options are "MNIST", "Fashion" and "Reuters".
63 |
64 | To run the pairwise constrained clustering from raw features (without the pre-trained weights), do:
65 | ```bash
66 | python run_DCC_pairwise.py --data $DATA --without_pretrain
67 | ```
68 |
69 | To run the pairwise constrained clustering without KMeans initialization, do:
70 | ```bash
71 | python run_DCC_pairwise.py --data $DATA --without_kmeans
72 | ```
73 |
74 | To run the pairwise constrained clustering with noisy pairwise constraints, do:
75 | ```bash
76 | python run_DCC_pairwise.py --data $DATA --noisy $NOISE
77 | ```
78 |
79 | For the `--noisy` flag, which specifies the noise level, the value should be a positive float equal to the ratio of noisy constraints to ground truth constraints.
80 |
81 |
82 | To save data for plotting, do:
83 | ```bash
84 | python run_DCC_pairwise.py --data $DATA --plotting
85 | ```
86 |
87 | This will save the experiment data for plotting in folders under ./plotting
88 |
89 | To plot the results, do:
90 | ```bash
91 | python ./plotting/plot_pairewise.py
92 | ```
93 |
94 |
95 | ###### Option 2: Run Demo Instance Constraints Script
96 |
97 | To run the instance difficulty constrained clustering, do:
98 | ```bash
99 | python run_DCC_instance.py --data $DATA
100 | ```
101 |
102 | ###### Option 3: Run Demo Triplets Constraints Script
103 |
104 | To run the triplets constrained clustering (6000 constraints), do:
105 | ```bash
106 | python run_DCC_triplets.py --data $DATA
107 | ```
108 |
109 |
110 | ###### Option 4: Run Demo Global Constraints Script
111 |
112 | To run the global size constrained clustering, do:
113 | ```bash
114 | python run_DCC_global.py --data $DATA
115 | ```
116 |
117 |
118 | ###### Option 5: Run Demo Improved DEC Script
119 |
120 | To run the baseline Improved DEC, do:
121 | ```bash
122 | python run_improved_DEC.py --data $DATA
123 | ```
124 |
125 |
126 |
127 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi==2018.11.29
2 | cffi==1.11.5
3 | numpy==1.15.4
4 | olefile==0.46
5 | Pillow==6.2.0
6 | pycparser==2.19
7 | scikit-learn==0.20.2
8 | scipy==1.1.0
9 | six==1.12.0
10 | torch==1.0.0
11 | torchvision==0.2.1
12 |
--------------------------------------------------------------------------------