├── download_data.sh
├── download_model.sh
├── experiments
    ├── plotting
    │   └── plot_pairewise.py
    ├── run_DCC_global.py
    ├── run_DCC_instance.py
    ├── run_DCC_pairwise.py
    ├── run_DCC_triplets.py
    ├── run_DEC.py
    ├── run_improved_DEC.py
    └── run_sdae.py
├── lib
    ├── __pycache__
    │   ├── datasets.cpython-36.pyc
    │   ├── dcc.cpython-36.pyc
    │   ├── dec.cpython-36.pyc
    │   ├── denoisingAutoencoder.cpython-36.pyc
    │   ├── ops.cpython-36.pyc
    │   ├── stackedDAE.cpython-36.pyc
    │   └── utils.cpython-36.pyc
    ├── datasets.py
    ├── dcc.py
    ├── dec.py
    ├── denoisingAutoencoder.py
    ├── ops.py
    ├── stackedDAE.py
    └── utils.py
├── readme.md
└── requirements.txt


/download_data.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | ##!/usr/bin/env bash
 3 | 
 4 | TASKS="reutersidf10k_train.npy \
 5 | reutersidf10k_test.npy"
 6 | 
 7 | 
 8 | for t in $TASKS; do
 9 |     echo "Downloading model ${t}."
10 |     wget "https://s3-us-west-1.amazonaws.com/deep-constrained-clustering/\
11 | Data-Reuters/${t}" -P ./experiments/dataset/reuters/
12 | done
13 | 


--------------------------------------------------------------------------------
/download_model.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | ##!/usr/bin/env bash
 3 | 
 4 | TASKS="fashion_sdae_weights.pt \
 5 | fashion_triplet_embedding.npy \
 6 | mnist_sdae_weights.pt \
 7 | mnist_triplet_embedding.npy \
 8 | reuters10k_sdae_weights.pt"
 9 | 
10 | 
11 | for t in $TASKS; do
12 |     echo "Downloading model ${t}."
13 |     wget "https://s3-us-west-1.amazonaws.com/deep-constrained-clustering/\
14 | model-log-final/${t}" -P ./model/
15 | done


--------------------------------------------------------------------------------
/experiments/plotting/plot_pairewise.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import sys
  4 | import time
  5 | import random
  6 | import re
  7 | import json
  8 | import pickle
  9 | import pandas as pd
 10 | import seaborn as sns
 11 | import matplotlib.pyplot as plt
 12 | import numpy as np
 13 | from sklearn.manifold import TSNE
 14 | from collections import defaultdict
 15 | 
 16 | 
 17 | if __name__ == "__main__":
 18 |     
 19 |     folders = [d for d in os.listdir(".") if os.path.isdir(d) and d != "Legend" and d != "Util"]
 20 | 
 21 |     label_dict = {
 22 |     	"M": ["0","1","2","3","4","5","6","7","8","9"],
 23 |     	"F": ["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"],
 24 |     	"R": ["corporate/industrial", "government/social", "markets", "economics"]
 25 |     }
 26 | 
 27 |     for folder in folders:
 28 | 
 29 |         print("\nStarting "+folder)
 30 | 
 31 |         try:
 32 |             latent_files = [f for f in os.listdir(folder) if f.startswith("save")]
 33 |             print(latent_files)
 34 |         except:
 35 |             print("No latent files, Skipping Folder")
 36 |             continue
 37 | 
 38 |         link_points = []
 39 | 
 40 |         try:
 41 |             must_links = pd.read_pickle(os.path.join(folder,"mustlinks.pkl"))
 42 |             cannot_links = pd.read_pickle(os.path.join(folder,"cannotlinks.pkl"))
 43 | 
 44 |             random.seed(1)
 45 |             ml_sample = random.sample(range(must_links.shape[0]), must_links.shape[0])
 46 |             link_points += must_links.iloc[ml_sample[:20],0].tolist()
 47 |             link_points += must_links.iloc[ml_sample[:20],1].tolist()
 48 |             random.seed(2)
 49 |             cl_sample = random.sample(range(cannot_links.shape[0]), cannot_links.shape[0])
 50 |             link_points += cannot_links.iloc[cl_sample[:20],0].tolist()
 51 |             link_points += cannot_links.iloc[cl_sample[:20],1].tolist()
 52 |         except:
 53 |             print("No must link / cannot link, Skipping Folder")
 54 |             continue
 55 | 
 56 |         try:
 57 |             noisy_must_links = pd.read_pickle(os.path.join(folder,"noisymustlinks.pkl"))
 58 |             random.seed(3)
 59 |             noisy_ml_sample = random.sample(range(noisy_must_links.shape[0]),noisy_must_links.shape[0])
 60 |             link_points += noisy_must_links.iloc[noisy_ml_sample[:20],0].tolist()
 61 |             link_points += noisy_must_links.iloc[noisy_ml_sample[:20],1].tolist()
 62 |         except:
 63 |             noisy_must_links = []
 64 |             noisy_ml_sample = []
 65 | 
 66 |         try:
 67 |             noisy_cannot_links = pd.read_pickle(os.path.join(folder,"noisycannotlinks.pkl"))
 68 |             random.seed(4)
 69 |             noisy_cl_sample = random.sample(range(noisy_cannot_links.shape[0]),noisy_cannot_links.shape[0])
 70 |             link_points += noisy_cannot_links.iloc[noisy_cl_sample[:20],0].tolist()
 71 |             link_points += noisy_cannot_links.iloc[noisy_cl_sample[:20],1].tolist()
 72 |         except:
 73 |             noisy_cannot_links = []
 74 |             noisy_cl_sample = []
 75 | 
 76 |         try:
 77 |             with open(os.path.join(folder,"intermediate_results.json"), "r") as fp:
 78 |                 intermediate_results = json.load(fp)
 79 |         except:
 80 |             intermediate_results = defaultdict(lambda:defaultdict(lambda:0.0))
 81 | 
 82 |         link_points = list(set(link_points))
 83 | 
 84 |         # Start Plotting
 85 |         for k, file in enumerate(latent_files):
 86 | 
 87 |             df = pd.read_pickle(os.path.join(folder,file))
 88 |             epoch = re.sub('[^0-9]','', file)
 89 | 
 90 |             if folder.startswith("Reuters"):
 91 |                 latent_full = df.sample(frac=0.75, random_state=7).append(df.iloc[link_points,:])
 92 |             else:
 93 |                 latent_full = df.sample(frac=0.25, random_state=7).append(df.iloc[link_points,:])
 94 |             
 95 |             latent = latent_full.iloc[:,0:10].copy()
 96 | 
 97 |             time_start = time.time()
 98 |             tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=350)
 99 |             tsne_results = tsne.fit_transform(latent)
100 |             print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
101 | 
102 |             latent['tsne-1'] = tsne_results[:,0]
103 |             latent['tsne-2'] = tsne_results[:,1]
104 |             latent["class"] = np.array([label_dict[folder[0]][x] for x in latent_full["y"].tolist()])
105 | 
106 |             plt.figure(k,figsize=(16,10))
107 |             plt.title("Accuracy: %.2f, NMI: %.2f"%(intermediate_results["acc"][epoch],intermediate_results["nmi"][epoch]))
108 | 
109 |             sns.scatterplot(
110 |                 x="tsne-1", y="tsne-2",
111 |                 hue="class",
112 |                 palette=sns.color_palette("hls", latent["class"].nunique()),
113 |                 data=latent,
114 |                 legend="full",
115 |                 alpha=0.8,
116 |                 s=20
117 |             )
118 | 
119 | 
120 |             # plot links
121 |             plot_links = [ {"sample": ml_sample, "link": must_links, "count":10, "style": 'b-', "label": "must link"},
122 |                            {"sample": cl_sample, "link": cannot_links, "count":10, "style": 'r-', "label": "cannot link"},
123 |                            {"sample": noisy_ml_sample, "link": noisy_must_links, "count":10, "style": 'k-', "label": "noisy must link"},
124 |                            {"sample": noisy_cl_sample, "link": noisy_cannot_links, "count":10, "style": 'k:', "label": "noisy cannot link"},
125 |                          ]
126 | 
127 |             for plot_link in plot_links:
128 |                 count = 0
129 |                 for i in plot_link["sample"]:
130 |                     if count >= plot_link["count"]:
131 |                         break
132 |                     try:
133 |                         p1 = latent.loc[plot_link["link"].loc[i][0]]
134 |                         p2 = latent.loc[plot_link["link"].loc[i][1]]
135 |                         plt.plot([p1["tsne-1"],p2["tsne-1"]], [p1["tsne-2"],p2["tsne-2"]], plot_link["style"], label=plot_link["label"])
136 |                         count += 1
137 |                     except:
138 |                         pass
139 | 
140 |             # remove duplicate label for lines
141 |             handles, labels = plt.gca().get_legend_handles_labels()
142 |             newLabels, newHandles = [], []
143 |             for handle, label in zip(handles, labels):
144 |               if label not in newLabels:
145 |                 newLabels.append(label)
146 |                 newHandles.append(handle)
147 |             
148 |             #lgd = plt.gca().legend(newHandles, newLabels, loc='center left', bbox_to_anchor=(1, 0.5))
149 |             lgd = plt.gca().legend(newHandles, newLabels, loc='center', bbox_to_anchor=(0.5, -0.10),fancybox=True, ncol=len(newLabels), columnspacing=1.0,handlelength=1.0)
150 | 
151 |             plt.savefig(os.path.join(folder,folder+"_"+epoch+".png"), bbox_extra_artists=(lgd,), bbox_inches='tight')
152 |             plt.clf()
153 | 


--------------------------------------------------------------------------------
/experiments/run_DCC_global.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("..")
 3 | import torch.utils.data
 4 | import numpy as np
 5 | import argparse
 6 | from lib.dcc import IDEC
 7 | from lib.datasets import MNIST, FashionMNIST
 8 | 
 9 | if __name__ == "__main__":
10 |     parser = argparse.ArgumentParser(description='Global MNIST Example')
11 |     parser.add_argument('--lr', type=float, default=0.001, metavar='N',
12 |                         help='learning rate for training (default: 0.001)')
13 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
14 |                         help='input batch size for training (default: 256)')
15 |     parser.add_argument('--update-interval', type=int, default=1, metavar='N',
16 |                         help='number of epochs to train (default: 1)')
17 |     parser.add_argument('--epochs', type=int, default=200, metavar='N',
18 |                         help='number of epochs to train (default: 200)')
19 |     parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
20 |                         help='directory for pre-trained weights')
21 |     parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion)')
22 |     parser.add_argument('--use_pretrain', type=str, default="True")
23 |     args = parser.parse_args()
24 |     
25 |     # Load data
26 |     mnist_train = MNIST('./dataset/mnist', train=True, download=True)
27 |     mnist_test = MNIST('./dataset/mnist', train=False)
28 |     X = mnist_train.train_data
29 |     y = mnist_train.train_labels
30 |     test_X = mnist_test.test_data
31 |     test_y = mnist_test.test_labels
32 |     if args.data == "Fashion":
33 |         fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
34 |         fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False)
35 |         X = fashionmnist_train.train_data
36 |         y = fashionmnist_train.train_labels
37 |         test_X = fashionmnist_test.test_data
38 |         test_y = fashionmnist_test.test_labels
39 |         args.pretrain="../model/fashion_sdae_weights.pt"
40 |         ml_penalty = 1
41 |         
42 |     # Set parameters
43 |     ml_penalty, cl_penalty = 0.1, 1
44 |     idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
45 |                 encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
46 |     
47 |     # Print Network Structure
48 |     print(idec)
49 |     if args.use_pretrain == "True":
50 |         idec.load_model(args.pretrain)
51 |         
52 |     # Construct constriants
53 |     ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
54 |     anchor, positive, negative = np.array([]), np.array([]), np.array([])
55 |     instance_guidance = torch.zeros(X.shape[0]).cuda()
56 |     use_global = True
57 |     
58 |     # Train the network
59 |     train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global, ml_penalty, cl_penalty, X, y,
60 |                              lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
61 |                              update_interval=args.update_interval,tol=1*1e-3)
62 |     
63 |     # Make predictions on test set
64 |     test_acc, test_nmi = idec.predict(test_X, test_y)
65 | 
66 |     # Report results
67 |     print("Training Accuracy:", train_acc)
68 |     print("Training NMI;", train_nmi)
69 |     print("Training Epochs:", epo)
70 |     print("Test Accuracy:", test_acc)
71 |     print("Test NMI:", test_nmi)
72 | 


--------------------------------------------------------------------------------
/experiments/run_DCC_instance.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("..")
 3 | import torch.utils.data
 4 | import numpy as np
 5 | import argparse
 6 | from lib.dcc import IDEC
 7 | from lib.datasets import MNIST, FashionMNIST, Reuters
 8 | from sklearn.cluster import KMeans
 9 | from lib.utils import detect_wrong
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     parser = argparse.ArgumentParser(description='Instance Difficulty Constrained Clustering Example')
14 |     parser.add_argument('--lr', type=float, default=0.001, metavar='N',
15 |                         help='learning rate for training (default: 0.001)')
16 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
17 |                         help='input batch size for training (default: 256)')
18 |     parser.add_argument('--update-interval', type=int, default=1, metavar='N',
19 |                         help='number of epochs to train (default: 1)')
20 |     parser.add_argument('--epochs', type=int, default=200, metavar='N',
21 |                         help='number of epochs to train (default: 200)')
22 |     parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
23 |                         help='directory for pre-trained weights')
24 |     parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)')
25 |     parser.add_argument('--use_pretrain', type=bool, default=True)
26 |     args = parser.parse_args()
27 | 
28 |     # Load data
29 |     mnist_train = MNIST('./dataset/mnist', train=True, download=True)
30 |     mnist_test = MNIST('./dataset/mnist', train=False)
31 |     X = mnist_train.train_data
32 |     y = mnist_train.train_labels
33 |     test_X = mnist_test.test_data
34 |     test_y = mnist_test.test_labels
35 |     
36 |     # Set parameters
37 |     ml_penalty, cl_penalty = 0.1, 1
38 |     
39 |     idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
40 |                 encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
41 |     if args.data == "Fashion":
42 |         fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
43 |         fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False)
44 |         X = fashionmnist_train.train_data
45 |         y = fashionmnist_train.train_labels
46 |         test_X = fashionmnist_test.test_data
47 |         test_y = fashionmnist_test.test_labels
48 |         args.pretrain="../model/fashion_sdae_weights.pt"
49 |         ml_penalty = 1
50 |     elif args.data == "Reuters":
51 |         reuters_train = Reuters('./dataset/reuters', train=True, download=False)
52 |         reuters_test = Reuters('./dataset/reuters', train=False)
53 |         X = reuters_train.train_data
54 |         y = reuters_train.train_labels
55 |         test_X = reuters_test.test_data
56 |         test_y = reuters_test.test_labels
57 |         args.pretrain="../model/reuters10k_sdae_weights.pt"
58 |         idec = IDEC(input_dim=2000, z_dim=10, n_clusters=4,
59 |                     encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
60 |     if args.use_pretrain:
61 |         idec.load_model(args.pretrain)
62 |         
63 |     # Print netowrk structure
64 |     print(idec)
65 | 
66 |     # Construct Constraints
67 |     ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
68 |     anchor, positive, negative = np.array([]), np.array([]), np.array([])
69 |     
70 |     # Provide instance guidance based on k-means results. High confidence (1) for correct instances.
71 |     # Low confidence (0.1) for incorrect instances since k-means + AE does not achieve good results.
72 |     latent = idec.encodeBatch(X).cpu().numpy()
73 |     kmeans = KMeans(10, n_init=20)
74 |     y_pred = kmeans.fit_predict(latent)
75 |     instance_guidance = detect_wrong(y.cpu().numpy(), y_pred)
76 |     instance_guidance = torch.tensor(instance_guidance, dtype=torch.float32).cuda()
77 |     use_global = False
78 |     
79 |     # Train the network
80 |     train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global,  ml_penalty, cl_penalty, X, y,
81 |                              lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
82 |                              update_interval=args.update_interval, tol=1*1e-3)
83 |     
84 |     # Make prediction
85 |     test_acc, test_nmi = idec.predict(test_X, test_y)
86 | 
87 |     # Report results
88 |     print("Training Accuracy:", train_acc)
89 |     print("Training NMI;", train_nmi)
90 |     print("Training Epochs:", epo)
91 |     print("Test Accuracy:", test_acc)
92 |     print("Test NMI:", test_nmi)
93 | 


--------------------------------------------------------------------------------
/experiments/run_DCC_pairwise.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | sys.path.append("..")
  4 | import torch.utils.data
  5 | import numpy as np
  6 | import pandas as pd
  7 | import argparse
  8 | import time
  9 | 
 10 | from lib.dcc import IDEC
 11 | from lib.datasets import MNIST, FashionMNIST, Reuters
 12 | from lib.utils import transitive_closure, generate_random_pair
 13 | 
 14 | 
 15 | if __name__ == "__main__":
 16 |     parser = argparse.ArgumentParser(description='Pairwise MNIST Example')
 17 |     parser.add_argument('--lr', type=float, default=0.001, metavar='N',
 18 |                         help='learning rate for training (default: 0.001)')
 19 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
 20 |                         help='input batch size for training (default: 256)')
 21 |     parser.add_argument('--update-interval', type=int, default=1, metavar='N',
 22 |                         help='number of epochs to train (default: 1)')
 23 |     parser.add_argument('--epochs', type=int, default=500, metavar='N',
 24 |                         help='number of epochs to train (default: 500)')
 25 |     parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
 26 |                         help='directory for pre-trained weights')
 27 |     parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)')
 28 |     parser.add_argument('--without_pretrain', action='store_false')
 29 |     parser.add_argument('--without_kmeans', action='store_false')
 30 |     parser.add_argument('--noisy', type=float, default=0.0, metavar='N',
 31 |                         help='noisy constraints rate for training (default: 0.0)')
 32 |     parser.add_argument('--plotting', action='store_true')
 33 |     args = parser.parse_args()
 34 | 
 35 |     # Load data
 36 |     mnist_train = MNIST('./dataset/mnist', train=True, download=True)
 37 |     mnist_test = MNIST('./dataset/mnist', train=False)
 38 |     X = mnist_train.train_data
 39 |     y = mnist_train.train_labels
 40 |     test_X = mnist_test.test_data
 41 |     test_y = mnist_test.test_labels
 42 |     
 43 |     # Set parameters
 44 |     ml_penalty, cl_penalty = 0.1, 1
 45 |     idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
 46 |                 encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
 47 |     if args.data == "Fashion":
 48 |         fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
 49 |         fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False)
 50 |         X = fashionmnist_train.train_data
 51 |         y = fashionmnist_train.train_labels
 52 |         test_X = fashionmnist_test.test_data
 53 |         test_y = fashionmnist_test.test_labels
 54 |         args.pretrain="../model/fashion_sdae_weights.pt"
 55 |         ml_penalty = 1
 56 |     elif args.data == "Reuters":
 57 |         reuters_train = Reuters('./dataset/reuters', train=True, download=False)
 58 |         reuters_test = Reuters('./dataset/reuters', train=False)
 59 |         X = reuters_train.train_data
 60 |         y = reuters_train.train_labels
 61 |         test_X = reuters_test.test_data
 62 |         test_y = reuters_test.test_labels
 63 |         args.pretrain="../model/reuters10k_sdae_weights.pt"
 64 |         idec = IDEC(input_dim=2000, z_dim=10, n_clusters=4,
 65 |                     encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
 66 |     
 67 | 
 68 |     model_tag = "Raw"
 69 |     if args.without_pretrain:
 70 |         model_tag = "Pretrain"
 71 |         idec.load_model(args.pretrain)
 72 |     
 73 |     init_tag = "Random"
 74 |     if args.without_kmeans:
 75 |         init_tag = "KMeans"
 76 | 
 77 |     # Print Network Structure
 78 |     print(idec)
 79 | 
 80 |     # Construct Constraints
 81 |     num_constraints = 6000
 82 |     ml_ind1, ml_ind2, cl_ind1, cl_ind2 = generate_random_pair(y, num_constraints*2)
 83 |     ml_ind1, ml_ind2, cl_ind1, cl_ind2 = transitive_closure(ml_ind1, ml_ind2, cl_ind1, cl_ind2, X.shape[0])
 84 | 
 85 |     ml_ind1 = ml_ind1[:num_constraints]
 86 |     ml_ind2 = ml_ind2[:num_constraints]
 87 |     cl_ind1 = cl_ind1[:num_constraints]
 88 |     cl_ind2 = cl_ind2[:num_constraints]
 89 | 
 90 |     plotting_dir = ""
 91 |     if args.plotting:
 92 |         
 93 |         dir_name = args.data+"_"+model_tag+"_"+init_tag+"_%d"%num_constraints
 94 |         if args.noisy > 0:
 95 |             dir_name += "_Noisy_%d%%"%(int(args.noisy*100))
 96 |         dir_name += "_"+time.strftime("%Y%m%d-%H%M")
 97 |         plotting_dir = "./plotting/%s"%dir_name
 98 |         if not os.path.exists(plotting_dir):
 99 |             os.mkdir(plotting_dir) 
100 | 
101 |         mldf = pd.DataFrame(data = [ml_ind1,ml_ind2]).T
102 |         mldf.to_pickle(os.path.join(plotting_dir,"mustlinks.pkl"))
103 |         cldf = pd.DataFrame(data = [cl_ind1,cl_ind2]).T
104 |         cldf.to_pickle(os.path.join(plotting_dir,"cannotlinks.pkl"))
105 | 
106 |     if args.noisy > 0:
107 |         nml_ind1, nml_ind2, ncl_ind1, ncl_ind2 = generate_random_pair(y, num_constraints*2)
108 |         ncl_ind1, ncl_ind2, nml_ind1, nml_ind2 = transitive_closure(nml_ind1, nml_ind2, ncl_ind1, ncl_ind2, X.shape[0])
109 | 
110 |         nml_ind1 = nml_ind1[:int(ml_ind1.size*args.noisy)]
111 |         nml_ind2 = nml_ind2[:int(ml_ind2.size*args.noisy)]
112 |         ncl_ind1 = ncl_ind1[:int(cl_ind1.size*args.noisy)]
113 |         ncl_ind2 = ncl_ind2[:int(cl_ind2.size*args.noisy)]
114 | 
115 |         if plotting_dir:
116 |             nmldf = pd.DataFrame(data = [nml_ind1,nml_ind2]).T
117 |             nmldf.to_pickle(os.path.join(plotting_dir,"noisymustlinks.pkl"))
118 |             ncldf = pd.DataFrame(data = [ncl_ind1,ncl_ind2]).T
119 |             ncldf.to_pickle(os.path.join(plotting_dir,"noisycannotlinks.pkl"))
120 | 
121 |         ml_ind1 = np.append(ml_ind1,nml_ind1)
122 |         ml_ind2 = np.append(ml_ind2,nml_ind2)
123 |         cl_ind1 = np.append(cl_ind1,ncl_ind1)
124 |         cl_ind2 = np.append(cl_ind2,ncl_ind2)
125 | 
126 |     anchor, positive, negative = np.array([]), np.array([]), np.array([])
127 |     instance_guidance = torch.zeros(X.shape[0]).cuda()
128 |     use_global = False
129 |     
130 |     # Train Neural Network
131 |     train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global,  ml_penalty, cl_penalty, X, y,
132 |                              lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
133 |                              update_interval=args.update_interval,tol=1*1e-3,use_kmeans=args.without_kmeans,plotting=plotting_dir)
134 |     
135 |     # Make Predictions
136 |     test_acc, test_nmi = idec.predict(test_X, test_y)
137 | 
138 |     # Report Results
139 |     print("ACC:", train_acc)
140 |     print("NMI;", train_nmi)
141 |     print("Epochs:", epo)
142 |     print("testAcc:", test_acc)
143 |     print("testNMI:", test_nmi)
144 |     print("ML Closure:", ml_ind1.shape[0])
145 |     print("CL Closure:", cl_ind1.shape[0])
146 | 


--------------------------------------------------------------------------------
/experiments/run_DCC_triplets.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("..")
 3 | import torch.utils.data
 4 | import numpy as np
 5 | import argparse
 6 | from lib.dcc import IDEC
 7 | from lib.datasets import MNIST, FashionMNIST
 8 | from lib.utils import generate_mnist_triplets, generate_triplet_constraints_continuous
 9 | 
10 | 
11 | if __name__ == "__main__":
12 |     parser = argparse.ArgumentParser(description='Triplet Constraints Example')
13 |     parser.add_argument('--lr', type=float, default=0.001, metavar='N',
14 |                         help='learning rate for training (default: 0.001)')
15 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
16 |                         help='input batch size for training (default: 256)')
17 |     parser.add_argument('--update-interval', type=int, default=1, metavar='N',
18 |                         help='number of epochs to train (default: 1)')
19 |     parser.add_argument('--epochs', type=int, default=200, metavar='N',
20 |                         help='number of epochs to train (default: 200)')
21 |     parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
22 |                         help='directory for pre-trained weights')
23 |     parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion)')
24 |     parser.add_argument('--use_pretrain', type=bool, default=True)
25 |     args = parser.parse_args()
26 | 
27 |     # Load data
28 |     mnist_train = MNIST('./dataset/mnist', train=True, download=True)
29 |     mnist_test = MNIST('./dataset/mnist', train=False)
30 |     X = mnist_train.train_data
31 |     y = mnist_train.train_labels
32 |     test_X = mnist_test.test_data
33 |     test_y = mnist_test.test_labels
34 |     
35 |     # Set parameters
36 |     ml_penalty, cl_penalty = 0.1, 1
37 |     if args.data == "Fashion":
38 |         fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
39 |         fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False)
40 |         X = fashionmnist_train.train_data
41 |         y = fashionmnist_train.train_labels
42 |         test_X = fashionmnist_test.test_data
43 |         test_y = fashionmnist_test.test_labels
44 |         args.pretrain="../model/fashion_sdae_weights.pt"
45 |         ml_penalty = 1
46 |     idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
47 |                 encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
48 |     if args.use_pretrain:
49 |         idec.load_model(args.pretrain)
50 |     
51 |     # Print Network Structure
52 |     print(idec)
53 | 
54 |     # Construct constraints
55 |     ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
56 |     if args.data != "Fashion":
57 |         anchor, positive, negative = generate_mnist_triplets(y, 6000)
58 |     else:
59 |         anchor, positive, negative = generate_triplet_constraints_continuous(y,6000)
60 |     instance_guidance = torch.zeros(X.shape[0]).cuda()
61 |     use_global = False
62 | 
63 |     # Train the network
64 |     train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global,  ml_penalty, cl_penalty, X, y,
65 |                              lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
66 |                              update_interval=args.update_interval, tol=2*1e-3)
67 | 
68 |     # Make predictions
69 |     test_acc, test_nmi = idec.predict(test_X, test_y)
70 | 
71 |     # Print the result
72 |     print("ACC:", train_acc)
73 |     print("NMI;", train_nmi)
74 |     print("Epochs:", epo)
75 |     print("testAcc:", test_acc)
76 |     print("testNMI:", test_nmi)
77 |     print("ML Closure:", ml_ind1.shape[0])
78 |     print("CL Closure:", cl_ind1.shape[0])
79 | 


--------------------------------------------------------------------------------
/experiments/run_DEC.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("..")
 3 | import argparse
 4 | from lib.dec import DEC
 5 | from lib.datasets import MNIST
 6 | 
 7 | if __name__ == "__main__":
 8 |     parser = argparse.ArgumentParser(description='DEC MNIST Example')
 9 |     parser.add_argument('--lr', type=float, default=0.01, metavar='N',
10 |                         help='learning rate for training (default: 0.01)')
11 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
12 |                         help='input batch size for training (default: 256)')
13 |     parser.add_argument('--update-interval', type=int, default=1, metavar='N',
14 |                         help='update-interval  (default: 1)')
15 |     parser.add_argument('--epochs', type=int, default=200, metavar='N',
16 |                         help='number of epochs to train (default: 200)')
17 |     parser.add_argument('--pretrain', type=str, default="../model/sdae.pt", metavar='N',
18 |                         help='use pre-trained weights')
19 |     args = parser.parse_args()
20 | 
21 | 
22 |     mnist_train = MNIST('./dataset/mnist', train=True, download=True)
23 |     mnist_test = MNIST('./dataset/mnist', train=False)
24 |     X = mnist_train.train_data
25 |     y = mnist_train.train_labels
26 | 
27 |     dec = DEC(input_dim=784, z_dim=10, n_clusters=10,
28 |               encodeLayer=[500, 500, 2000], activation="relu", dropout=0)
29 |     print(dec)
30 |     dec.load_model(args.pretrain)
31 |     dec.fit(X, y, lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
32 |             update_interval=args.update_interval)
33 | 
34 | 


--------------------------------------------------------------------------------
/experiments/run_improved_DEC.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("..")
 3 | import torch.utils.data
 4 | import numpy as np
 5 | import argparse
 6 | from lib.dcc import IDEC
 7 | from lib.datasets import MNIST, FashionMNIST, Reuters
 8 | 
 9 | if __name__ == "__main__":
10 |     parser = argparse.ArgumentParser(description='IDEC MNIST Example')
11 |     parser.add_argument('--lr', type=float, default=0.001, metavar='N',
12 |                         help='learning rate for training (default: 0.001)')
13 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
14 |                         help='input batch size for training (default: 256)')
15 |     parser.add_argument('--update-interval', type=int, default=1, metavar='N',
16 |                         help='number of epochs to train (default: 1)')
17 |     parser.add_argument('--epochs', type=int, default=200, metavar='N',
18 |                         help='number of epochs to train (default: 200)')
19 |     parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N',
20 |                         help='directory for pre-trained weights')
21 |     parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)')
22 |     parser.add_argument('--use_pretrain', type=bool, default=True)
23 |     args = parser.parse_args()
24 | 
25 |     # Load data
26 |     mnist_train = MNIST('./dataset/mnist', train=True, download=True)
27 |     mnist_test = MNIST('./dataset/mnist', train=False)
28 |     X = mnist_train.train_data
29 |     y = mnist_train.train_labels
30 |     test_X = mnist_test.test_data
31 |     test_y = mnist_test.test_labels
32 |     
33 |     # Set parameters
34 |     ml_penalty, cl_penalty = 0.1, 1
35 |     idec = IDEC(input_dim=784, z_dim=10, n_clusters=10,
36 |                 encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
37 |     if args.data == "Fashion":
38 |         fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True)
39 |         fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False)
40 |         X = fashionmnist_train.train_data
41 |         y = fashionmnist_train.train_labels
42 |         test_X = fashionmnist_test.test_data
43 |         test_y = fashionmnist_test.test_labels
44 |         args.pretrain="../model/fashion_sdae_weights.pt"
45 |         ml_penalty = 1
46 |     elif args.data == "Reuters":
47 |         reuters_train = Reuters('./dataset/reuters', train=True, download=False)
48 |         reuters_test = Reuters('./dataset/reuters', train=False)
49 |         X = reuters_train.train_data
50 |         y = reuters_train.train_labels
51 |         test_X = reuters_test.test_data
52 |         test_y = reuters_test.test_labels
53 |         args.pretrain="../model/reuters10k_sdae_weights.pt"
54 |         idec = IDEC(input_dim=2000, z_dim=10, n_clusters=4,
55 |                     encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0)
56 |     if args.use_pretrain:
57 |         idec.load_model(args.pretrain)
58 |         
59 |     # Print network structure
60 |     print(idec)
61 |     
62 |     # Construct constraints (here is the baseline so no constraints are provided).
63 |     ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
64 |     anchor, positive, negative = np.array([]), np.array([]), np.array([])
65 |     instance_guidance = torch.zeros(X.shape[0]).cuda()
66 |     use_global = False
67 | 
68 |     # Train the clustering model
69 |     train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global,  ml_penalty, cl_penalty, X, y,
70 |                              lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs,
71 |                              update_interval=args.update_interval,tol=1*1e-3)
72 | 
73 |     # Test on the test data
74 |     test_acc, test_nmi = idec.predict(test_X, test_y)
75 | 
76 |     # Print the result
77 |     print("Training Accuracy:", train_acc)
78 |     print("Training NMI;", train_nmi)
79 |     print("Training Epochs:", epo)
80 |     print("Test Accuracy:", test_acc)
81 |     print("Test NMI:", test_nmi)
82 | 


--------------------------------------------------------------------------------
/experiments/run_sdae.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | sys.path.append("..")
 4 | import torch.utils.data
 5 | import argparse
 6 | from lib.stackedDAE import StackedDAE
 7 | from lib.datasets import MNIST
 8 | 
 9 | if __name__ == "__main__":
10 |     parser = argparse.ArgumentParser(description='VAE MNIST Example')
11 |     parser.add_argument('--lr', type=float, default=0.1, metavar='N',
12 |                         help='learning rate for training (default: 0.1)')
13 |     parser.add_argument('--batch-size', type=int, default=256, metavar='N',
14 |                         help='input batch size for training (default: 256)')
15 |     parser.add_argument('--pretrainepochs', type=int, default=300, metavar='N',
16 |                         help='number of epochs to train (default: 300)')
17 |     parser.add_argument('--epochs', type=int, default=500, metavar='N',
18 |                         help='number of epochs to train (default: 500)')
19 |     args = parser.parse_args()
20 | 
21 |     # Load data for pre-training
22 |     train_loader = torch.utils.data.DataLoader(
23 |         MNIST('./dataset/mnist', train=True, download=True),
24 |         batch_size=args.batch_size, shuffle=True, num_workers=0)
25 |     test_loader = torch.utils.data.DataLoader(
26 |         MNIST('./dataset/mnist', train=False),
27 |         batch_size=args.batch_size, shuffle=False, num_workers=0)
28 | 
29 |     sdae = StackedDAE(input_dim=784, z_dim=10, binary=False,
30 |                       encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu",
31 |                       dropout=0)
32 |     
33 |     # Print the pre-train model structure
34 |     print(sdae)
35 |     sdae.pretrain(train_loader, test_loader, lr=args.lr, batch_size=args.batch_size,
36 |                   num_epochs=args.pretrainepochs, corrupt=0.2, loss_type="mse")
37 |     
38 |     # Train the stacked denoising autoencoder
39 |     sdae.fit(train_loader, test_loader, lr=args.lr, num_epochs=args.epochs, corrupt=0.2, loss_type="mse")
40 |     
41 |     # Save the weights as pre-trained model for IDEC/DEC/DCC
42 |     sdae.save_model("model/sdae_mnist_weights.pt")
43 | 


--------------------------------------------------------------------------------
/lib/__pycache__/datasets.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/datasets.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/__pycache__/dcc.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/dcc.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/__pycache__/dec.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/dec.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/__pycache__/denoisingAutoencoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/denoisingAutoencoder.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/__pycache__/ops.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/ops.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/__pycache__/stackedDAE.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/stackedDAE.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/lib/datasets.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import os.path
  3 | import errno
  4 | import numpy as np
  5 | import gzip
  6 | import torch
  7 | import pickle
  8 | import torch.utils.data as data
  9 | import codecs
 10 | import urllib
 11 | 
 12 | 
 13 | class MNIST(data.Dataset):
 14 |     """`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.
 15 |     Args:
 16 |         root (string): Root directory of dataset where ``processed/training.pt``
 17 |             and  ``processed/test.pt`` exist.
 18 |         train (bool, optional): If True, creates dataset from ``training.pt``,
 19 |             otherwise from ``test.pt``.
 20 |         download (bool, optional): If true, downloads the dataset from the internet and
 21 |             puts it in root directory. If dataset is already downloaded, it is not
 22 |             downloaded again.
 23 |         transform (callable, optional): A function/transform that  takes in an PIL image
 24 |             and returns a transformed version. E.g, ``transforms.RandomCrop``
 25 |         target_transform (callable, optional): A function/transform that takes in the
 26 |             target and transforms it.
 27 |     """
 28 |     urls = [
 29 |         'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
 30 |         'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
 31 |         'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
 32 |         'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
 33 |     ]
 34 |     raw_folder = 'raw'
 35 |     processed_folder = 'processed'
 36 |     training_file = 'training.pt'
 37 |     test_file = 'test.pt'
 38 |     classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
 39 |                '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
 40 |     class_to_idx = {_class: i for i, _class in enumerate(classes)}
 41 | 
 42 |     @property
 43 |     def targets(self):
 44 |         if self.train:
 45 |             return self.train_labels
 46 |         else:
 47 |             return self.test_labels
 48 | 
 49 |     def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
 50 |         self.root = os.path.expanduser(root)
 51 |         self.transform = transform
 52 |         self.target_transform = target_transform
 53 |         self.train = train  # training set or test set
 54 |         self.use_cuda = torch.cuda.is_available()
 55 | 
 56 |         if download:
 57 |             self.download()
 58 | 
 59 |         if not self._check_exists():
 60 |             raise RuntimeError('Dataset not found.' +
 61 |                                ' You can use download=True to download it')
 62 | 
 63 |         if self.train:
 64 |             self.train_data, self.train_labels = torch.load(
 65 |                 os.path.join(self.root, self.processed_folder, self.training_file))
 66 |             self.train_data = self.train_data.view(self.train_data.size(0), -1).float()*0.02
 67 |             # self.train_data = self.train_data.view(self.train_data.size(0), -1).float()/255
 68 |             self.train_labels = self.train_labels.int()
 69 |             if self.use_cuda:
 70 |                 self.train_data = self.train_data.cuda()
 71 |                 self.train_labels = self.train_labels.cuda()
 72 |         else:
 73 |             self.test_data, self.test_labels = torch.load(
 74 |                 os.path.join(self.root, self.processed_folder, self.test_file))
 75 |             self.test_data = self.test_data.view(self.test_data.size(0), -1).float()*0.02
 76 |             # self.test_data = self.test_data.view(self.test_data.size(0), -1).float()/255
 77 |             self.test_labels = self.test_labels.int()
 78 |             if self.use_cuda:
 79 |                 self.test_data = self.test_data.cuda()
 80 |                 self.test_labels = self.test_labels.cuda()
 81 | 
 82 |     def __getitem__(self, index):
 83 |         """
 84 |         Args:
 85 |             index (int): Index
 86 |         Returns:
 87 |             tuple: (image, target) where target is index of the target class.
 88 |         """
 89 |         if self.train:
 90 |             img, target = self.train_data[index], self.train_labels[index]
 91 |         else:
 92 |             img, target = self.test_data[index], self.test_labels[index]
 93 | 
 94 |         return img, target
 95 | 
 96 |     def __len__(self):
 97 |         if self.train:
 98 |             return len(self.train_data)
 99 |         else:
100 |             return len(self.test_data)
101 | 
102 |     def _check_exists(self):
103 |         return os.path.exists(os.path.join(self.root, self.processed_folder, self.training_file)) and \
104 |             os.path.exists(os.path.join(self.root, self.processed_folder, self.test_file))
105 | 
106 |     def download(self):
107 |         """Download the MNIST data if it doesn't exist in processed_folder already."""
108 |         from six.moves import urllib
109 |         import gzip
110 | 
111 |         if self._check_exists():
112 |             return
113 | 
114 |         # download files
115 |         try:
116 |             os.makedirs(os.path.join(self.root, self.raw_folder))
117 |             os.makedirs(os.path.join(self.root, self.processed_folder))
118 |         except OSError as e:
119 |             if e.errno == errno.EEXIST:
120 |                 pass
121 |             else:
122 |                 raise
123 | 
124 |         for url in self.urls:
125 |             print('Downloading ' + url)
126 |             data = urllib.request.urlopen(url)
127 |             filename = url.rpartition('/')[2]
128 |             file_path = os.path.join(self.root, self.raw_folder, filename)
129 |             with open(file_path, 'wb') as f:
130 |                 f.write(data.read())
131 |             with open(file_path.replace('.gz', ''), 'wb') as out_f, \
132 |                     gzip.GzipFile(file_path) as zip_f:
133 |                 out_f.write(zip_f.read())
134 |             os.unlink(file_path)
135 | 
136 |         # process and save as torch files
137 |         print('Processing...')
138 | 
139 |         training_set = (
140 |             read_image_file(os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte')),
141 |             read_label_file(os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte'))
142 |         )
143 |         test_set = (
144 |             read_image_file(os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte')),
145 |             read_label_file(os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte'))
146 |         )
147 |         with open(os.path.join(self.root, self.processed_folder, self.training_file), 'wb') as f:
148 |             torch.save(training_set, f)
149 |         with open(os.path.join(self.root, self.processed_folder, self.test_file), 'wb') as f:
150 |             torch.save(test_set, f)
151 | 
152 |         print('Done!')
153 | 
154 |     def __repr__(self):
155 |         fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
156 |         fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
157 |         tmp = 'train' if self.train is True else 'test'
158 |         fmt_str += '    Split: {}\n'.format(tmp)
159 |         fmt_str += '    Root Location: {}\n'.format(self.root)
160 |         tmp = '    Transforms (if any): '
161 |         fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
162 |         tmp = '    Target Transforms (if any): '
163 |         fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
164 |         return fmt_str
165 | 
166 | 
167 | def read_label_file(path):
168 |     with open(path, 'rb') as f:
169 |         data = f.read()
170 |         assert get_int(data[:4]) == 2049
171 |         length = get_int(data[4:8])
172 |         parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
173 |         return torch.from_numpy(parsed).view(length).long()
174 | 
175 | 
176 | def get_int(b):  # interpret the bytes `b` as one big-endian unsigned integer
177 |     return int(codecs.encode(b, 'hex'), 16)  # hex-encode the bytes, then parse the digits in base 16
178 | 
179 | 
180 | def read_image_file(path):
181 |     with open(path, 'rb') as f:
182 |         data = f.read()
183 |         assert get_int(data[:4]) == 2051
184 |         length = get_int(data[4:8])
185 |         num_rows = get_int(data[8:12])
186 |         num_cols = get_int(data[12:16])
187 |         images = []
188 |         parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
189 |         return torch.from_numpy(parsed).view(length, num_rows, num_cols)
190 | 
191 | 
192 | class FashionMNIST(MNIST):
193 |     """`Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ Dataset.
194 |     Args:
195 |         root (string): Root directory of dataset where ``processed/training.pt``
196 |             and  ``processed/test.pt`` exist.
197 |         train (bool, optional): If True, creates dataset from ``training.pt``,
198 |             otherwise from ``test.pt``.
199 |         download (bool, optional): If true, downloads the dataset from the internet and
200 |             puts it in root directory. If dataset is already downloaded, it is not
201 |             downloaded again.
202 |         transform (callable, optional): A function/transform that  takes in an PIL image
203 |             and returns a transformed version. E.g, ``transforms.RandomCrop``
204 |         target_transform (callable, optional): A function/transform that takes in the
205 |             target and transforms it.
206 |     """
207 |     urls = [
208 |         'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
209 |         'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
210 |         'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
211 |         'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
212 |     ]
213 |     classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal','Shirt', 'Sneaker', 'Bag', 'Ankle boot']
214 | 
215 | 
216 | class Reuters(data.Dataset):
217 |     # To download the processed reuters data, please run the script named as "download_data.sh"
218 |     training_file = "reutersidf10k_train.npy"
219 |     test_file = "reutersidf10k_test.npy"
220 | 
221 |     def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
222 |         self.root = os.path.expanduser(root)
223 |         self.transform = transform
224 |         self.target_transform = target_transform
225 |         self.train = train  # training set or test set
226 |         self.use_cuda = torch.cuda.is_available()
227 | 
228 |         if download:
229 |             self.download()
230 | 
231 |         if self.train:
232 |             rtk10k_train = np.load(os.path.join(self.root, self.training_file)).item()
233 |             self.train_data, self.train_labels = torch.tensor(rtk10k_train['data'], dtype=torch.float32), torch.tensor(rtk10k_train['label'], dtype=torch.int)
234 |             if self.use_cuda:
235 |                 self.train_data = self.train_data.cuda()
236 |                 self.train_labels = self.train_labels.cuda()
237 |         else:
238 |             rtk10k_test = np.load(os.path.join(self.root, self.test_file)).item()
239 |             self.test_data, self.test_labels = torch.tensor(rtk10k_test['data'], dtype=torch.float32), torch.tensor(
240 |                 rtk10k_test['label'], dtype=torch.int)
241 |             if self.use_cuda:
242 |                 self.test_data = self.test_data.cuda()
243 |                 self.test_labels = self.test_labels.cuda()
244 | 
245 |     def __getitem__(self, index):
246 |         """
247 |         Args:
248 |             index (int): Index
249 |         Returns:
250 |             tuple: (image, target) where target is index of the target class.
251 |         """
252 |         if self.train:
253 |             img, target = self.train_data[index], self.train_labels[index]
254 |         else:
255 |             img, target = self.test_data[index], self.test_labels[index]
256 | 
257 |         return img, target
258 | 
259 |     def __len__(self):
260 |         if self.train:
261 |             return len(self.train_data)
262 |         else:
263 |             return len(self.test_data)
264 | 


--------------------------------------------------------------------------------
/lib/dcc.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.nn import Parameter
  4 | import torch.optim as optim
  5 | from torch.autograd import Variable
  6 | 
  7 | import numpy as np
  8 | import os
  9 | import math
 10 | import collections
 11 | import pickle
 12 | import json
 13 | 
 14 | from lib.utils import acc
 15 | from sklearn.metrics.cluster import normalized_mutual_info_score
 16 | from sklearn.cluster import KMeans
 17 | import pandas as pd
 18 | 
 19 | class MSELoss(nn.Module):
 20 |     def __init__(self):
 21 |         super(self.__class__, self).__init__()
 22 | 
 23 |     def forward(self, input, target):
 24 |         return torch.mean((input-target)**2)
 25 | 
 26 | 
 27 | def buildNetwork(layers, activation="relu", dropout=0):
 28 |     net = []
 29 |     for i in range(1, len(layers)):
 30 |         net.append(nn.Linear(layers[i-1], layers[i]))
 31 |         if activation=="relu":
 32 |             net.append(nn.ReLU())
 33 |         elif activation=="sigmoid":
 34 |             net.append(nn.Sigmoid())
 35 |         if dropout > 0:
 36 |             net.append(nn.Dropout(dropout))
 37 |     return nn.Sequential(*net)
 38 | 
 39 | 
 40 | class IDEC(nn.Module):
 41 |     def __init__(self, input_dim=784, z_dim=10, n_clusters=10,
 42 |         encodeLayer=[400], decodeLayer=[400], activation="relu", dropout=0, alpha=1., gamma=0.1):
 43 |         super(self.__class__, self).__init__()
 44 |         self.z_dim = z_dim
 45 |         self.layers = [input_dim] + encodeLayer + [z_dim]
 46 |         self.activation = activation
 47 |         self.dropout = dropout
 48 |         self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
 49 |         self.decoder = buildNetwork([z_dim] + decodeLayer, activation=activation, dropout=dropout)
 50 |         self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)
 51 |         self._dec = nn.Linear(decodeLayer[-1], input_dim)
 52 | 
 53 |         self.n_clusters = n_clusters
 54 |         self.alpha = alpha
 55 |         self.gamma = gamma
 56 |         self.mu = Parameter(torch.Tensor(n_clusters, z_dim))
 57 | 
 58 |     def save_model(self, path):  # serialise every parameter and buffer of the model to `path`
 59 |         torch.save(self.state_dict(), path)
 60 | 
 61 |     def load_model(self, path):
 62 |         pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
 63 |         model_dict = self.state_dict()
 64 |         pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
 65 |         model_dict.update(pretrained_dict) 
 66 |         self.load_state_dict(model_dict)
 67 | 
 68 |     def forward(self, x):
 69 |         h = self.encoder(x)
 70 |         z = self._enc_mu(h)
 71 |         h = self.decoder(z)
 72 |         xrecon = self._dec(h)
 73 |         # compute q -> NxK
 74 |         q = self.soft_assign(z)
 75 |         return z, q, xrecon
 76 | 
 77 |     def soft_assign(self, z):
 78 |         q = 1.0 / (1.0 + torch.sum((z.unsqueeze(1) - self.mu)**2, dim=2) / self.alpha)
 79 |         q = q**(self.alpha+1.0)/2.0
 80 |         q = q / torch.sum(q, dim=1, keepdim=True)
 81 |         return q
 82 | 
 83 |     def encodeBatch(self, X, batch_size=256):
 84 |         use_cuda = torch.cuda.is_available()
 85 |         if use_cuda:
 86 |             self.cuda()
 87 |         
 88 |         encoded = []
 89 |         self.eval()
 90 |         num = X.shape[0]
 91 |         num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
 92 |         for batch_idx in range(num_batch):
 93 |             xbatch = X[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
 94 |             inputs = Variable(xbatch)
 95 |             z,_, _ = self.forward(inputs)
 96 |             encoded.append(z.data)
 97 | 
 98 |         encoded = torch.cat(encoded, dim=0)
 99 |         return encoded
100 | 
101 |     def cluster_loss(self, p, q):
102 |         def kld(target, pred):
103 |             return torch.mean(torch.sum(target*torch.log(target/(pred+1e-6)), dim=1))
104 |         kldloss = kld(p, q)
105 |         return self.gamma*kldloss
106 | 
107 |     def recon_loss(self, x, xrecon):
108 |         recon_loss = torch.mean((xrecon-x)**2)
109 |         return recon_loss
110 | 
111 |     def pairwise_loss(self, p1, p2, cons_type):
112 |         if cons_type == "ML":
113 |             ml_loss = torch.mean(-torch.log(torch.sum(p1 * p2, dim=1)))
114 |             return ml_loss
115 |         else:
116 |             cl_loss = torch.mean(-torch.log(1.0 - torch.sum(p1 * p2, dim=1)))
117 |             return cl_loss
118 | 
119 |     def global_size_loss(self, p, cons_detail):
120 |         m_p = torch.mean(p, dim=0)
121 |         m_p = m_p / torch.sum(m_p)
122 |         return torch.sum((m_p-cons_detail)*(m_p-cons_detail))
123 | 
124 |     def difficulty_loss(self, q, mask):
125 |         mask = mask.unsqueeze_(-1)
126 |         mask = mask.expand(q.shape[0], q.shape[1])
127 |         mask_q = q * mask
128 |         diff_loss = -torch.norm(mask_q, 2)
129 |         penalty_degree = 0.1
130 |         return penalty_degree * diff_loss
131 | 
132 |     def target_distribution(self, q):
133 |         p = q**2 / torch.sum(q, dim=0)
134 |         p = p / torch.sum(p, dim=1, keepdim=True)
135 |         return p
136 | 
137 |     def triplet_loss(self, anchor, positive, negative, margin_constant):
138 |         # loss = max(d(anchor, negative) - d(anchor, positve) + margin, 0), margin > 0
139 |         # d(x, y) = q(x) * q(y)
140 |         negative_dis = torch.sum(anchor * negative, dim=1)
141 |         positive_dis = torch.sum(anchor * positive, dim=1)
142 |         margin = margin_constant * torch.ones(negative_dis.shape).cuda()
143 |         diff_dis = negative_dis - positive_dis
144 |         penalty = diff_dis + margin
145 |         triplet_loss = 1*torch.max(penalty, torch.zeros(negative_dis.shape).cuda())
146 | 
147 |         return torch.mean(triplet_loss)
148 | 
149 |     def satisfied_constraints(self,ml_ind1,ml_ind2,cl_ind1, cl_ind2,y_pred):
150 |         
151 |         if ml_ind1.size == 0 or ml_ind2.size == 0 or cl_ind1.size == 0 or cl_ind2.size == 0:
152 |             return 1.1
153 | 
154 |         count = 0
155 |         satisfied = 0
156 |         for (i, j) in zip(ml_ind1, ml_ind2):
157 |             count += 1
158 |             if y_pred[i] == y_pred[j]:
159 |                 satisfied += 1
160 |         for (i, j) in zip(cl_ind1, cl_ind2):
161 |             count += 1
162 |             if y_pred[i] != y_pred[j]:
163 |                 satisfied += 1
164 | 
165 |         return float(satisfied)/count
166 | 
167 | 
168 |     def predict(self, X, y):
169 |         use_cuda = torch.cuda.is_available()
170 |         if use_cuda:
171 |             self.cuda()
172 |         latent = self.encodeBatch(X)
173 |         q = self.soft_assign(latent)
174 | 
175 |         # evalute the clustering performance
176 |         y_pred = torch.argmax(q, dim=1).data.cpu().numpy()
177 |         y = y.data.cpu().numpy()
178 |         if y is not None:
179 |             print("acc: %.5f, nmi: %.5f" % (acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))
180 |             final_acc = acc(y, y_pred)
181 |             final_nmi = normalized_mutual_info_score(y, y_pred)
182 |         return final_acc, final_nmi
183 | 
184 |     def fit(self,anchor, positive, negative, ml_ind1,ml_ind2,cl_ind1, cl_ind2, mask, use_global, ml_p, cl_p, X,y=None, lr=0.001, batch_size=256, num_epochs=10, update_interval=1, tol=1e-3, use_kmeans=True, plotting="",clustering_loss_weight=1):    
185 |         
186 |         # save intermediate results for plotting
187 |         intermediate_results = collections.defaultdict(lambda:{})
188 |         
189 |         '''X: tensor data'''
190 |         use_cuda = torch.cuda.is_available()
191 |         if use_cuda:
192 |             self.cuda()
193 |         print("=====Training IDEC=======")
194 |         optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=lr)
195 | 
196 |         if use_kmeans:
197 |             print("Initializing cluster centers with kmeans.")
198 |             kmeans = KMeans(self.n_clusters, n_init=20)
199 |             data = self.encodeBatch(X)
200 |             y_pred = kmeans.fit_predict(data.data.cpu().numpy())
201 |             y_pred_last = y_pred
202 |             self.mu.data.copy_(torch.Tensor(kmeans.cluster_centers_))
203 |         else:
204 |             # use kmeans to randomly initialize cluster ceters
205 |             print("Randomly initializing cluster centers.")
206 |             kmeans = KMeans(self.n_clusters, n_init=1, max_iter=1)
207 |             data = self.encodeBatch(X)
208 |             y_pred = kmeans.fit_predict(data.data.cpu().numpy())
209 |             y_pred_last = y_pred
210 |             self.mu.data.copy_(torch.Tensor(kmeans.cluster_centers_))
211 | 
212 |         if y is not None:
213 |             y = y.cpu().numpy()
214 |             # print("Kmeans acc: %.5f, nmi: %.5f" % (acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))
215 |         self.train()
216 |         num = X.shape[0]
217 |         num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
218 |         ml_num_batch = int(math.ceil(1.0*ml_ind1.shape[0]/batch_size))
219 |         cl_num_batch = int(math.ceil(1.0*cl_ind1.shape[0]/batch_size))
220 |         tri_num_batch = int(math.ceil(1.0*anchor.shape[0]/batch_size))
221 |         cl_num = cl_ind1.shape[0]
222 |         ml_num = ml_ind1.shape[0]
223 |         tri_num = anchor.shape[0]
224 | 
225 |         final_acc, final_nmi, final_epoch = 0, 0, 0
226 |         update_ml = 1
227 |         update_cl = 1
228 |         update_triplet = 1
229 |         for epoch in range(num_epochs):
230 |             if epoch%update_interval == 0:
231 |                 # update the targe distribution p
232 |                 latent = self.encodeBatch(X)
233 |                 q = self.soft_assign(latent)
234 |                 p = self.target_distribution(q).data
235 | 
236 |                 # evalute the clustering performance
237 |                 y_pred = torch.argmax(q, dim=1).data.cpu().numpy()
238 |                 if use_global:
239 |                     y_dict = collections.defaultdict(list)
240 |                     ind1, ind2 = [], []
241 |                     for i in range(y_pred.shape[0]):
242 |                         y_dict[y_pred[i]].append(i)
243 |                     for key in y_dict.keys():
244 |                         if y is not None:
245 |                             print("predicted class: ", key, " total: ", len(y_dict[key]))
246 |                             #, " mapped index(ground truth): ", np.bincount(y[y_dict[key]]).argmax())
247 | 
248 |                 if y is not None:
249 |                     print("acc: %.5f, nmi: %.5f" % (acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))
250 |                     print("satisfied constraints: %.5f"%self.satisfied_constraints(ml_ind1,ml_ind2,cl_ind1, cl_ind2,y_pred))
251 |                     final_acc = acc(y, y_pred)
252 |                     final_nmi = normalized_mutual_info_score(y, y_pred)
253 |                     final_epoch = epoch
254 | 
255 |                 # save model for plotting
256 |                 if plotting and (epoch in [10,20,30,40] or epoch%50 == 0 or epoch == num_epochs-1):
257 |                     
258 |                     df = pd.DataFrame(latent.cpu().numpy())
259 |                     df["y"] = y
260 |                     df.to_pickle(os.path.join(plotting,"save_model_%d.pkl"%(epoch)))
261 |                     
262 |                     intermediate_results["acc"][str(epoch)] = acc(y, y_pred)
263 |                     intermediate_results["nmi"][str(epoch)] = normalized_mutual_info_score(y, y_pred)
264 |                     with open(os.path.join(plotting,"intermediate_results.json"), "w") as fp:
265 |                         json.dump(intermediate_results, fp)
266 | 
267 |                 # check stop criterion
268 |                 try:
269 |                     delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / num
270 |                     y_pred_last = y_pred
271 |                     if epoch>0 and delta_label < tol:
272 |                         print('delta_label ', delta_label, '< tol ', tol)
273 |                         print("Reach tolerance threshold. Stopping training.")
274 | 
275 |                         # save model for plotting
276 |                         if plotting:
277 |                             
278 |                             df = pd.DataFrame(latent.cpu().numpy())
279 |                             df["y"] = y
280 |                             df.to_pickle(os.path.join(plotting,"save_model_%d.pkl"%epoch))
281 |                             
282 |                             intermediate_results["acc"][str(epoch)] = acc(y, y_pred)
283 |                             intermediate_results["nmi"][str(epoch)] = normalized_mutual_info_score(y, y_pred)
284 |                             with open(os.path.join(plotting,"intermediate_results.json"), "w") as fp:
285 |                                 json.dump(intermediate_results, fp)
286 |                         break
287 |                 except:
288 |                     pass
289 | 
290 |             # train 1 epoch for clustering loss
291 |             train_loss = 0.0
292 |             recon_loss_val = 0.0
293 |             cluster_loss_val = 0.0
294 |             instance_constraints_loss_val = 0.0
295 |             global_loss_val = 0.0
296 |             for batch_idx in range(num_batch):
297 |                 xbatch = X[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
298 |                 pbatch = p[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
299 |                 mask_batch = mask[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
300 |                 optimizer.zero_grad()
301 |                 inputs = Variable(xbatch)
302 |                 target = Variable(pbatch)
303 |                 cons_detail = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
304 |                 global_cons = torch.from_numpy(cons_detail).float().to("cuda")
305 | 
306 |                 z, qbatch, xrecon = self.forward(inputs)
307 |                 if use_global == False:
308 |                     cluster_loss = self.cluster_loss(target, qbatch)
309 |                     recon_loss = self.recon_loss(inputs, xrecon)
310 |                     instance_constraints_loss = self.difficulty_loss(qbatch, mask_batch)
311 |                     loss = cluster_loss + recon_loss + instance_constraints_loss
312 |                     loss.backward()
313 |                     optimizer.step()
314 |                     cluster_loss_val += cluster_loss.data * len(inputs)
315 |                     recon_loss_val += recon_loss.data * len(inputs)
316 |                     instance_constraints_loss_val += instance_constraints_loss.data * len(inputs)
317 |                     train_loss = clustering_loss_weight*cluster_loss_val + recon_loss_val + instance_constraints_loss_val
318 |                 else:
319 |                     cluster_loss = self.cluster_loss(target, qbatch)
320 |                     recon_loss = self.recon_loss(inputs, xrecon)
321 |                     global_loss = self.global_size_loss(qbatch, global_cons)
322 |                     loss = cluster_loss + recon_loss + global_loss
323 |                     loss.backward()
324 |                     optimizer.step()
325 |                     cluster_loss_val += cluster_loss.data * len(inputs)
326 |                     recon_loss_val += recon_loss.data * len(inputs)
327 |                     train_loss = clustering_loss_weight*cluster_loss_val + recon_loss_val
328 | 
329 | 
330 |             if instance_constraints_loss_val != 0.0:
331 |                 print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f Instance Difficulty Loss: %.4f"% (
332 |                     epoch + 1, train_loss / num, cluster_loss_val / num, recon_loss_val / num, instance_constraints_loss_val / num))
333 |             elif global_loss_val != 0.0 and use_global:
334 |                 print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f Global Loss: %.4f"% (
335 |                     epoch + 1, train_loss / num + global_loss_val/num_batch, cluster_loss_val / num, recon_loss_val / num, global_loss_val / num_batch))
336 |             else:
337 |                 print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f" % (
338 |                     epoch + 1, train_loss / num, cluster_loss_val / num, recon_loss_val / num))
339 |             ml_loss = 0.0
340 |             if epoch % update_ml == 0:
341 |                 for ml_batch_idx in range(ml_num_batch):
342 |                     px1 = X[ml_ind1[ml_batch_idx*batch_size : min(ml_num, (ml_batch_idx+1)*batch_size)]]
343 |                     px2 = X[ml_ind2[ml_batch_idx*batch_size : min(ml_num, (ml_batch_idx+1)*batch_size)]]
344 |                     pbatch1 = p[ml_ind1[ml_batch_idx*batch_size : min(ml_num, (ml_batch_idx + 1)*batch_size)]]
345 |                     pbatch2 = p[ml_ind2[ml_batch_idx*batch_size : min(ml_num, (ml_batch_idx+1)*batch_size)]]
346 |                     optimizer.zero_grad()
347 |                     inputs1 = Variable(px1)
348 |                     inputs2 = Variable(px2)
349 |                     target1 = Variable(pbatch1)
350 |                     target2 = Variable(pbatch2)
351 |                     z1, q1, xr1 = self.forward(inputs1)
352 |                     z2, q2, xr2 = self.forward(inputs2)
353 |                     loss = (ml_p*self.pairwise_loss(q1, q2, "ML")+self.recon_loss(inputs1, xr1) + self.recon_loss(inputs2, xr2))
354 |                     # 0.1 for mnist/reuters, 1 for fashion, the parameters are tuned via grid search on validation set
355 |                     ml_loss += loss.data
356 |                     loss.backward()
357 |                     optimizer.step()
358 | 
359 |             cl_loss = 0.0
360 |             if epoch % update_cl == 0:
361 |                 for cl_batch_idx in range(cl_num_batch):
362 |                     px1 = X[cl_ind1[cl_batch_idx*batch_size : min(cl_num, (cl_batch_idx+1)*batch_size)]]
363 |                     px2 = X[cl_ind2[cl_batch_idx*batch_size : min(cl_num, (cl_batch_idx+1)*batch_size)]]
364 |                     pbatch1 = p[cl_ind1[cl_batch_idx*batch_size : min(cl_num, (cl_batch_idx + 1)*batch_size)]]
365 |                     pbatch2 = p[cl_ind2[cl_batch_idx*batch_size : min(cl_num, (cl_batch_idx+1)*batch_size)]]
366 |                     optimizer.zero_grad()
367 |                     inputs1 = Variable(px1)
368 |                     inputs2 = Variable(px2)
369 |                     target1 = Variable(pbatch1)
370 |                     target2 = Variable(pbatch2)
371 |                     z1, q1, xr1 = self.forward(inputs1)
372 |                     z2, q2, xr2 = self.forward(inputs2)
373 |                     loss = cl_p*self.pairwise_loss(q1, q2, "CL")
374 |                     cl_loss += loss.data
375 |                     loss.backward()
376 |                     optimizer.step()
377 | 
378 |             if ml_num_batch >0 and cl_num_batch > 0:
379 |                 print("Pairwise Total:", round(float(ml_loss.cpu()), 2) + float(cl_loss.cpu()), "ML loss", float(ml_loss.cpu()), "CL loss:", float(cl_loss.cpu()))
380 |             triplet_loss = 0.0
381 |             if epoch % update_triplet == 0:
382 |                 for tri_batch_idx in range(tri_num_batch):
383 |                     px1 = X[anchor[tri_batch_idx*batch_size : min(tri_num, (tri_batch_idx+1)*batch_size)]]
384 |                     px2 = X[positive[tri_batch_idx*batch_size : min(tri_num, (tri_batch_idx+1)*batch_size)]]
385 |                     px3 = X[negative[tri_batch_idx*batch_size : min(tri_num, (tri_batch_idx+1)*batch_size)]]
386 |                     pbatch1 = p[anchor[tri_batch_idx*batch_size : min(tri_num, (tri_batch_idx + 1)*batch_size)]]
387 |                     pbatch2 = p[positive[tri_batch_idx*batch_size : min(tri_num, (tri_batch_idx+1)*batch_size)]]
388 |                     pbatch3 = p[negative[tri_batch_idx*batch_size : min(tri_num, (tri_batch_idx+1)*batch_size)]]
389 |                     optimizer.zero_grad()
390 |                     inputs1 = Variable(px1)
391 |                     inputs2 = Variable(px2)
392 |                     inputs3 = Variable(px3)
393 |                     target1 = Variable(pbatch1)
394 |                     target2 = Variable(pbatch2)
395 |                     target3 = Variable(pbatch3)
396 |                     z1, q1, xr1 = self.forward(inputs1)
397 |                     z2, q2, xr2 = self.forward(inputs2)
398 |                     z3, q3, xr3 = self.forward(inputs3)
399 |                     loss = self.triplet_loss(q1, q2, q3, 0.1)
400 |                     triplet_loss += loss.data
401 |                     loss.backward()
402 |                     optimizer.step()
403 |             if tri_num_batch > 0:
404 |                 print("Triplet Loss:", triplet_loss)
405 |         return final_acc, final_nmi, final_epoch
406 | 


--------------------------------------------------------------------------------
/lib/dec.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.nn import Parameter
  4 | import torch.nn.functional as F
  5 | import torch.optim as optim
  6 | import torchvision
  7 | from torchvision import datasets, transforms
  8 | from torch.autograd import Variable
  9 | 
 10 | import numpy as np
 11 | import math
 12 | from lib.utils import acc
 13 | from sklearn.metrics.cluster import normalized_mutual_info_score
 14 | from sklearn.cluster import KMeans
 15 | 
 16 | 
 17 | def buildNetwork(layers, activation="relu", dropout=0):
 18 |     net = []
 19 |     for i in range(1, len(layers)):
 20 |         net.append(nn.Linear(layers[i-1], layers[i]))
 21 |         if activation=="relu":
 22 |             net.append(nn.ReLU())
 23 |         elif activation=="sigmoid":
 24 |             net.append(nn.Sigmoid())
 25 |         if dropout > 0:
 26 |             net.append(nn.Dropout(dropout))
 27 |     return nn.Sequential(*net)
 28 | 
 29 | 
 30 | class DEC(nn.Module):
 31 |     def __init__(self, input_dim=784, z_dim=10, n_clusters=10,
 32 |         encodeLayer=[400], activation="relu", dropout=0, alpha=1.):
 33 |         super(self.__class__, self).__init__()
 34 |         self.z_dim = z_dim
 35 |         self.layers = [input_dim] + encodeLayer + [z_dim]
 36 |         self.activation = activation
 37 |         self.dropout = dropout
 38 |         self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout) # f(x) = z
 39 |         self._enc_mu = nn.Linear(encodeLayer[-1], z_dim) # clustering layer -> q
 40 | 
 41 |         self.n_clusters = n_clusters
 42 |         self.alpha = alpha
 43 |         self.mu = Parameter(torch.Tensor(n_clusters, z_dim))
 44 | 
 45 |     def save_model(self, path):
 46 |         torch.save(self.state_dict(), path)
 47 | 
 48 |     def load_model(self, path):
 49 |         pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
 50 |         model_dict = self.state_dict()
 51 |         pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
 52 |         model_dict.update(pretrained_dict) 
 53 |         self.load_state_dict(model_dict)
 54 | 
 55 |     def forward(self, x):
 56 |         h = self.encoder(x)
 57 |         z = self._enc_mu(h)
 58 |         # compute q -> NxK
 59 |         q = 1.0 / (1.0 + torch.sum((z.unsqueeze(1) - self.mu)**2, dim=2) / self.alpha)
 60 |         q = q**(self.alpha+1.0)/2.0
 61 |         q = q / torch.sum(q, dim=1, keepdim=True)
 62 |         return z, q
 63 | 
 64 |     def encodeBatch(self, dataloader, islabel=False):
 65 |         use_cuda = torch.cuda.is_available()
 66 |         if use_cuda:
 67 |             self.cuda()
 68 |         
 69 |         encoded = []
 70 |         ylabels = []
 71 |         self.eval()
 72 |         for batch_idx, (inputs, labels) in enumerate(dataloader):
 73 |             inputs = Variable(inputs)
 74 |             z,_ = self.forward(inputs)
 75 |             encoded.append(z.data.cpu())
 76 |             ylabels.append(labels)
 77 | 
 78 |         encoded = torch.cat(encoded, dim=0)
 79 |         ylabels = torch.cat(ylabels)
 80 |         if islabel:
 81 |             out = (encoded, ylabels)
 82 |         else:
 83 |             out = encoded
 84 |         return out
 85 | 
 86 |     def loss_function(self, p, q):
 87 |         def kld(target, pred):
 88 |             return torch.mean(torch.sum(target*torch.log(target/(pred+1e-6)), dim=1))
 89 |         
 90 |         loss = kld(p, q)
 91 |         return loss
 92 | 
 93 |     def target_distribution(self, q):
 94 |         p = q**2 / torch.sum(q, dim=0)
 95 |         p = p / torch.sum(p, dim=1, keepdim=True)
 96 |         return p
 97 | 
 98 |     def fit(self, X, y=None, lr=0.001, batch_size=256, num_epochs=10, update_interval=1, tol=1e-3):
 99 |         '''X: tensor data'''
100 |         use_cuda = torch.cuda.is_available()
101 |         if use_cuda:
102 |             self.cuda()
103 |         print("=====Training DEC=======")
104 |         #optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=lr)
105 |         optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
106 | 
107 |         print("Initializing cluster centers with kmeans.")
108 |         kmeans = KMeans(self.n_clusters, n_init=20)
109 |         data, _ = self.forward(X)
110 |         y_pred = kmeans.fit_predict(data.data.cpu().numpy())
111 |         y_pred_last = y_pred
112 |         self.mu.data.copy_(torch.Tensor(kmeans.cluster_centers_))
113 |         if y is not None:
114 |             y = y.cpu().numpy()
115 |             print("Kmeans acc: %.5f, nmi: %.5f" % (acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))
116 | 
117 |         self.train()
118 |         num = X.shape[0]
119 |         num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
120 |         print("num_batches:", num_batch)
121 |         for epoch in range(num_epochs):
122 |             if epoch % update_interval == 0:
123 |                 # update the targe distribution p
124 |                 _, q = self.forward(X)
125 |                 p = self.target_distribution(q).data
126 | 
127 |                 # evalute the clustering performance
128 |                 y_pred = torch.argmax(q, dim=1).data.cpu().numpy()
129 |                 if y is not None:
130 |                     print("epoch: %.5f, acc: %.5f, nmi: %.5f" % (epoch, acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))
131 | 
132 |                 # check stop criterion
133 |                 delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / num
134 |                 y_pred_last = y_pred
135 |                 if epoch>0 and delta_label < tol:
136 |                     print('delta_label ', delta_label, '< tol ', tol)
137 |                     print("Reach tolerance threshold. Stopping training.")
138 |                     break
139 | 
140 |             # train 1 epoch
141 |             train_loss = 0.0
142 |             for batch_idx in range(num_batch):
143 |                 xbatch = X[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
144 |                 pbatch = p[batch_idx*batch_size : min((batch_idx+1)*batch_size, num)]
145 |                 
146 |                 optimizer.zero_grad()
147 |                 inputs = Variable(xbatch)
148 |                 target = Variable(pbatch)
149 | 
150 |                 z, qbatch = self.forward(inputs)
151 |                 loss = self.loss_function(target, qbatch)
152 |                 train_loss += loss.data*len(inputs)
153 |                 loss.backward()
154 |                 optimizer.step()
155 | 
156 |             #print("#Epoch %3d: Loss: %.4f" % (
157 |             #    epoch+1, train_loss / num))
158 | 
159 | 
160 | 
161 | 
162 | 


--------------------------------------------------------------------------------
/lib/denoisingAutoencoder.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.nn import Parameter
  4 | import torch.nn.functional as F
  5 | import torch.optim as optim
  6 | import torchvision
  7 | from torchvision import datasets, transforms
  8 | from torch.autograd import Variable
  9 | 
 10 | import numpy as np
 11 | import math
 12 | from lib.utils import Dataset, masking_noise
 13 | from lib.ops import MSELoss, BCELoss
 14 | 
 15 | def adjust_learning_rate(init_lr, optimizer, epoch):
 16 |     lr = init_lr * (0.1 ** (epoch//100))
 17 |     toprint = True
 18 |     for param_group in optimizer.param_groups:
 19 |         if param_group["lr"]!=lr:
 20 |             param_group["lr"] = lr
 21 |             if toprint:
 22 |                 print("Switching to learning rate %f" % lr)
 23 |                 toprint = False
 24 | 
 25 | class DenoisingAutoencoder(nn.Module):
 26 |     def __init__(self, in_features, out_features, activation="relu", 
 27 |         dropout=0.2, tied=False):
 28 |         super(self.__class__, self).__init__()
 29 |         self.in_features = in_features
 30 |         self.out_features = out_features
 31 |         self.weight = Parameter(torch.Tensor(out_features, in_features))
 32 |         if tied:
 33 |             self.deweight = self.weight.t()
 34 |         else:
 35 |             self.deweight = Parameter(torch.Tensor(in_features, out_features))
 36 |         self.bias = Parameter(torch.Tensor(out_features))
 37 |         self.vbias = Parameter(torch.Tensor(in_features))
 38 |         
 39 |         if activation=="relu":
 40 |             self.enc_act_func = nn.ReLU()
 41 |         elif activation=="sigmoid":
 42 |             self.enc_act_func = nn.Sigmoid()
 43 |         elif activation=="none":
 44 |             self.enc_act_func = None
 45 |         self.dropout = nn.Dropout(p=dropout)
 46 | 
 47 |         self.reset_parameters()
 48 | 
 49 |     def reset_parameters(self):
 50 |         stdv = 0.01
 51 |         self.weight.data.uniform_(-stdv, stdv)
 52 |         self.bias.data.uniform_(-stdv, stdv)
 53 |         stdv = 0.01
 54 |         self.deweight.data.uniform_(-stdv, stdv)
 55 |         self.vbias.data.uniform_(-stdv, stdv)
 56 | 
 57 |     def forward(self, x):
 58 |         if self.enc_act_func is not None:
 59 |             return self.dropout(self.enc_act_func(F.linear(x, self.weight, self.bias)))
 60 |         else:
 61 |             return self.dropout(F.linear(x, self.weight, self.bias))
 62 | 
 63 |     def encode(self, x, train=True):
 64 |         if train:
 65 |             self.dropout.train()
 66 |         else:
 67 |             self.dropout.eval()
 68 |         if self.enc_act_func is not None:
 69 |             return self.dropout(self.enc_act_func(F.linear(x, self.weight, self.bias)))
 70 |         else:
 71 |             return self.dropout(F.linear(x, self.weight, self.bias))
 72 | 
 73 |     def encodeBatch(self, dataloader):
 74 |         use_cuda = torch.cuda.is_available()
 75 |         encoded = []
 76 |         for batch_idx, (inputs, _) in enumerate(dataloader):
 77 |             inputs = inputs.view(inputs.size(0), -1).float()
 78 |             if use_cuda:
 79 |                 inputs = inputs.cuda()
 80 |             inputs = Variable(inputs)
 81 |             hidden = self.encode(inputs, train=False)
 82 |             encoded.append(hidden.data.cpu())
 83 | 
 84 |         encoded = torch.cat(encoded, dim=0)
 85 |         return encoded
 86 | 
 87 |     def decode(self, x, binary=False):
 88 |         if not binary:
 89 |             return F.linear(x, self.deweight, self.vbias)
 90 |         else:
 91 |             return F.sigmoid(F.linear(x, self.deweight, self.vbias))
 92 | 
 93 |     def fit(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.3,
 94 |         loss_type="mse"):
 95 |         """
 96 |         data_x: FloatTensor
 97 |         valid_x: FloatTensor
 98 |         """
 99 |         use_cuda = torch.cuda.is_available()
100 |         if use_cuda:
101 |             self.cuda()
102 |         print("=====Denoising Autoencoding layer=======")
103 |         # optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=lr)
104 |         optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
105 |         if loss_type=="mse":
106 |             criterion = MSELoss()
107 |         elif loss_type=="cross-entropy":
108 |             criterion = BCELoss()
109 | 
110 |         # validate
111 |         total_loss = 0.0
112 |         total_num = 0
113 |         for batch_idx, (inputs, _) in enumerate(validloader):
114 |             # inputs = inputs.view(inputs.size(0), -1).float()
115 |             # if use_cuda:
116 |             #     inputs = inputs.cuda()
117 |             inputs = Variable(inputs)
118 |             hidden = self.encode(inputs)
119 |             if loss_type=="cross-entropy":
120 |                 outputs = self.decode(hidden, binary=True)
121 |             else:
122 |                 outputs = self.decode(hidden)
123 | 
124 |             valid_recon_loss = criterion(outputs, inputs)
125 |             total_loss += valid_recon_loss.data * len(inputs)
126 |             total_num += inputs.size()[0]
127 | 
128 |         valid_loss = total_loss / total_num
129 |         print("#Epoch 0: Valid Reconstruct Loss: %.4f" % (valid_loss))
130 | 
131 |         self.train()
132 |         for epoch in range(num_epochs):
133 |             # train 1 epoch
134 |             train_loss = 0.0
135 |             adjust_learning_rate(lr, optimizer, epoch)
136 |             for batch_idx, (inputs, _) in enumerate(trainloader):
137 |                 # inputs = inputs.view(inputs.size(0), -1).float()
138 |                 inputs_corr = masking_noise(inputs, corrupt)
139 |                 # if use_cuda:
140 |                 #     inputs = inputs.cuda()
141 |                 #     inputs_corr = inputs_corr.cuda()
142 |                 optimizer.zero_grad()
143 |                 inputs = Variable(inputs)
144 |                 inputs_corr = Variable(inputs_corr)
145 | 
146 |                 hidden = self.encode(inputs_corr)
147 |                 if loss_type=="cross-entropy":
148 |                     outputs = self.decode(hidden, binary=True)
149 |                 else:
150 |                     outputs = self.decode(hidden)
151 |                 recon_loss = criterion(outputs, inputs)
152 |                 train_loss += recon_loss.data*len(inputs)
153 |                 recon_loss.backward()
154 |                 optimizer.step()
155 | 
156 |             # validate
157 |             valid_loss = 0.0
158 |             for batch_idx, (inputs, _) in enumerate(validloader):
159 |                 # inputs = inputs.view(inputs.size(0), -1).float()
160 |                 # if use_cuda:
161 |                 #     inputs = inputs.cuda()
162 |                 inputs = Variable(inputs)
163 |                 hidden = self.encode(inputs, train=False)
164 |                 if loss_type=="cross-entropy":
165 |                     outputs = self.decode(hidden, binary=True)
166 |                 else:
167 |                     outputs = self.decode(hidden)
168 | 
169 |                 valid_recon_loss = criterion(outputs, inputs)
170 |                 valid_loss += valid_recon_loss.data * len(inputs)
171 | 
172 |             print("#Epoch %3d: Reconstruct Loss: %.4f, Valid Reconstruct Loss: %.4f" % (
173 |                 epoch+1, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset)))
174 | 
175 |     def extra_repr(self):
176 |         return 'in_features={}, out_features={}, bias={}'.format(
177 |             self.in_features, self.out_features, self.bias is not None
178 |         )
179 | 
180 | 


--------------------------------------------------------------------------------
/lib/ops.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.nn import Parameter
 4 | from torch.autograd import Variable
 5 | import torch.nn.functional as F
 6 | import math
 7 | 
 8 | 
 9 | class MSELoss(nn.Module):
10 |     def __init__(self):
11 |         super(self.__class__, self).__init__()
12 | 
13 |     def forward(self, input, target):
14 |         return 0.5 * torch.mean((input-target)**2)
15 | 
class BCELoss(nn.Module):
    """Binary cross-entropy summed over dim 1 and averaged over samples."""

    def __init__(self):
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()

    def forward(self, input, target):
        """Return the mean over rows of the per-row summed cross-entropy.

        ``input`` and ``1 - input`` are clamped to at least 1e-10 before the
        logarithm so that exact 0/1 predictions do not yield ``-inf``.
        """
        return -torch.mean(torch.sum(target*torch.log(torch.clamp(input, min=1e-10))+
            (1-target)*torch.log(torch.clamp(1-input, min=1e-10)), 1))
23 | 


--------------------------------------------------------------------------------
/lib/stackedDAE.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.nn import Parameter
  4 | import torch.nn.functional as F
  5 | import torch.optim as optim
  6 | import torchvision
  7 | from torchvision import datasets, transforms
  8 | from torch.autograd import Variable
  9 | 
 10 | import numpy as np
 11 | import math
 12 | from lib.utils import Dataset, masking_noise
 13 | from lib.ops import MSELoss, BCELoss
 14 | from lib.denoisingAutoencoder import DenoisingAutoencoder
 15 | 
 16 | def buildNetwork(layers, activation="relu", dropout=0):
 17 |     net = []
 18 |     for i in range(1, len(layers)):
 19 |         net.append(nn.Linear(layers[i-1], layers[i]))
 20 |         if activation=="relu":
 21 |             net.append(nn.ReLU())
 22 |         elif activation=="sigmoid":
 23 |             net.append(nn.Sigmoid())
 24 |         if dropout > 0:
 25 |             net.append(nn.Dropout(dropout))
 26 |     return nn.Sequential(*net)
 27 | 
 28 | def adjust_learning_rate(init_lr, optimizer, epoch):
 29 |     lr = init_lr * (0.1 ** (epoch//100))
 30 |     toprint = True
 31 |     for param_group in optimizer.param_groups:
 32 |         if param_group["lr"]!=lr:
 33 |             param_group["lr"] = lr
 34 |             if toprint:
 35 |                 print("Switching to learning rate %f" % lr)
 36 |                 toprint = False
 37 | 
 38 | class StackedDAE(nn.Module):
 39 |     def __init__(self, input_dim=784, z_dim=10, binary=True,
 40 |         encodeLayer=[400], decodeLayer=[400], activation="relu", 
 41 |         dropout=0, tied=False):
 42 |         super(self.__class__, self).__init__()
 43 |         self.z_dim = z_dim
 44 |         self.layers = [input_dim] + encodeLayer + [z_dim]
 45 |         self.activation = activation
 46 |         self.dropout = dropout
 47 |         self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
 48 |         self.decoder = buildNetwork([z_dim] + decodeLayer, activation=activation, dropout=dropout)
 49 |         self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)
 50 |         
 51 |         self._dec = nn.Linear(decodeLayer[-1], input_dim)
 52 |         self._dec_act = None
 53 |         if binary:
 54 |             self._dec_act = nn.Sigmoid()
 55 | 
 56 |     def decode(self, z):
 57 |         h = self.decoder(z)
 58 |         x = self._dec(h)
 59 |         if self._dec_act is not None:
 60 |             x = self._dec_act(x)
 61 |         return x
 62 | 
 63 |     def loss_function(self, recon_x, x):
 64 |         loss = -torch.mean(torch.sum(x*torch.log(torch.clamp(recon_x, min=1e-10))+
 65 |             (1-x)*torch.log(torch.clamp(1-recon_x, min=1e-10)), 1))
 66 | 
 67 |         return loss
 68 | 
 69 |     def forward(self, x):
 70 |         h = self.encoder(x)
 71 |         z = self._enc_mu(h)
 72 | 
 73 |         return z, self.decode(z)
 74 | 
 75 |     def save_model(self, path):
 76 |         torch.save(self.state_dict(), path)
 77 | 
 78 |     def load_model(self, path):
 79 |         pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
 80 |         model_dict = self.state_dict()
 81 |         pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
 82 |         model_dict.update(pretrained_dict) 
 83 |         self.load_state_dict(model_dict)
 84 | 
 85 |     def pretrain(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.2, loss_type="cross-entropy"):
 86 |         trloader = trainloader
 87 |         valoader = validloader
 88 |         daeLayers = []
 89 |         for l in range(1, len(self.layers)):
 90 |             infeatures = self.layers[l-1]
 91 |             outfeatures = self.layers[l]
 92 |             if l!= len(self.layers)-1:
 93 |                 dae = DenoisingAutoencoder(infeatures, outfeatures, activation=self.activation, dropout=corrupt)
 94 |             else:
 95 |                 dae = DenoisingAutoencoder(infeatures, outfeatures, activation="none", dropout=0)
 96 |             print(dae)
 97 |             if l==1:
 98 |                 dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type=loss_type)
 99 |             else:
100 |                 if self.activation=="sigmoid":
101 |                     dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type="cross-entropy")
102 |                 else:
103 |                     dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type="mse")
104 |             data_x = dae.encodeBatch(trloader)
105 |             valid_x = dae.encodeBatch(valoader)
106 |             trainset = Dataset(data_x, data_x)
107 |             trloader = torch.utils.data.DataLoader(
108 |                 trainset, batch_size=batch_size, shuffle=True, num_workers=0)
109 |             validset = Dataset(valid_x, valid_x)
110 |             valoader = torch.utils.data.DataLoader(
111 |                 validset, batch_size=1000, shuffle=False, num_workers=0)
112 |             daeLayers.append(dae)
113 | 
114 |         self.copyParam(daeLayers)
115 | 
116 |     def copyParam(self, daeLayers):
117 |         if self.dropout==0:
118 |             every = 2
119 |         else:
120 |             every = 3
121 |         # input layer
122 |         # copy encoder weight
123 |         self.encoder[0].weight.data.copy_(daeLayers[0].weight.data)
124 |         self.encoder[0].bias.data.copy_(daeLayers[0].bias.data)
125 |         self._dec.weight.data.copy_(daeLayers[0].deweight.data)
126 |         self._dec.bias.data.copy_(daeLayers[0].vbias.data)
127 | 
128 |         for l in range(1, len(self.layers)-2):
129 |             # copy encoder weight
130 |             self.encoder[l*every].weight.data.copy_(daeLayers[l].weight.data)
131 |             self.encoder[l*every].bias.data.copy_(daeLayers[l].bias.data)
132 | 
133 |             # copy decoder weight
134 |             self.decoder[-(l-1)*every-2].weight.data.copy_(daeLayers[l].deweight.data)
135 |             self.decoder[-(l-1)*every-2].bias.data.copy_(daeLayers[l].vbias.data)
136 | 
137 |         # z layer
138 |         self._enc_mu.weight.data.copy_(daeLayers[-1].weight.data)
139 |         self._enc_mu.bias.data.copy_(daeLayers[-1].bias.data)
140 |         self.decoder[0].weight.data.copy_(daeLayers[-1].deweight.data)
141 |         self.decoder[0].bias.data.copy_(daeLayers[-1].vbias.data)
142 | 
143 |     def fit(self, trainloader, validloader, lr=0.001, num_epochs=10, corrupt=0.3,
144 |         loss_type="mse"):
145 |         """
146 |         data_x: FloatTensor
147 |         valid_x: FloatTensor
148 |         """
149 |         use_cuda = torch.cuda.is_available()
150 |         if use_cuda:
151 |             self.cuda()
152 |         print("=====Stacked Denoising Autoencoding Layer=======")
153 |         # optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=lr)
154 |         optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
155 |         if loss_type=="mse":
156 |             criterion = MSELoss()
157 |         elif loss_type=="cross-entropy":
158 |             criterion = BCELoss()
159 | 
160 |         # validate
161 |         total_loss = 0.0
162 |         total_num = 0
163 |         for batch_idx, (inputs, _) in enumerate(validloader):
164 |             inputs = inputs.view(inputs.size(0), -1).float()
165 |             if use_cuda:
166 |                 inputs = inputs.cuda()
167 |             inputs = Variable(inputs)
168 |             z, outputs = self.forward(inputs)
169 | 
170 |             valid_recon_loss = criterion(outputs, inputs)
171 |             total_loss += valid_recon_loss.data * len(inputs)
172 |             total_num += inputs.size()[0]
173 | 
174 |         valid_loss = total_loss / total_num
175 |         print("#Epoch 0: Valid Reconstruct Loss: %.4f" % (valid_loss))
176 |         self.train()
177 |         for epoch in range(num_epochs):
178 |             # train 1 epoch
179 |             adjust_learning_rate(lr, optimizer, epoch)
180 |             train_loss = 0.0
181 |             for batch_idx, (inputs, _) in enumerate(trainloader):
182 |                 inputs = inputs.view(inputs.size(0), -1).float()
183 |                 inputs_corr = masking_noise(inputs, corrupt)
184 |                 if use_cuda:
185 |                     inputs = inputs.cuda()
186 |                     inputs_corr = inputs_corr.cuda()
187 |                 optimizer.zero_grad()
188 |                 inputs = Variable(inputs)
189 |                 inputs_corr = Variable(inputs_corr)
190 | 
191 |                 z, outputs = self.forward(inputs_corr)
192 |                 recon_loss = criterion(outputs, inputs)
193 |                 train_loss += recon_loss.data*len(inputs)
194 |                 recon_loss.backward()
195 |                 optimizer.step()
196 | 
197 |             # validate
198 |             valid_loss = 0.0
199 |             for batch_idx, (inputs, _) in enumerate(validloader):
200 |                 inputs = inputs.view(inputs.size(0), -1).float()
201 |                 if use_cuda:
202 |                     inputs = inputs.cuda()
203 |                 inputs = Variable(inputs)
204 |                 z, outputs = self.forward(inputs)
205 | 
206 |                 valid_recon_loss = criterion(outputs, inputs)
207 |                 valid_loss += valid_recon_loss.data * len(inputs)
208 | 
209 |             print("#Epoch %3d: Reconstruct Loss: %.4f, Valid Reconstruct Loss: %.4f" % (
210 |                 epoch+1, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset)))
211 | 
212 | 
213 | 


--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
  1 | '''Some helper functions for PyTorch, including:
  2 |     - weights_xavier_init, Dataset, masking_noise: model initialization and data helpers.
  3 |     - acc, detect_wrong: clustering evaluation against ground-truth labels.
  4 |     - transitive_closure, generate_*: pairwise and triplet constraint generation.
  5 |     
  6 | '''
  7 | import os
  8 | import sys
  9 | import time
 10 | import math
 11 | import numpy as np
 12 | import random
 13 | import torch
 14 | import torch.nn as nn
 15 | import torch.nn.init as init
 16 | import torch.utils.data as data
 17 | from scipy.linalg import norm
 18 | from PIL import Image
 19 | 
 20 | 
 21 | def weights_xavier_init(m):
 22 |     if isinstance(m, nn.Linear):
 23 |         nn.init.xavier_uniform(m.weight.data)
 24 |         nn.init.constant(m.bias.data, 0)
 25 | 
 26 | 
 27 | class Dataset(data.Dataset):
 28 |     def __init__(self, data, labels, transform=None, target_transform=None):
 29 |         self.transform = transform
 30 |         self.target_transform = target_transform
 31 |         self.data = data
 32 |         self.labels = labels
 33 |         if torch.cuda.is_available():
 34 |             self.data = self.data.cuda()
 35 |             self.labels = self.labels.cuda()
 36 | 
 37 |     def __getitem__(self, index):
 38 |         img, target = self.data[index], self.labels[index]
 39 |         # img = Image.fromarray(img)
 40 |         if self.transform is not None:
 41 |             img = self.transform(img)
 42 | 
 43 |         if self.target_transform is not None:
 44 |             target = self.target_transform(target)
 45 | 
 46 |         return img, target
 47 | 
 48 |     def __len__(self):
 49 |         return len(self.data)
 50 | 
 51 | 
 52 | def masking_noise(data, frac):
 53 |     """
 54 |     data: Tensor
 55 |     frac: fraction of unit to be masked out
 56 |     """
 57 |     data_noise = data.clone()
 58 |     rand = torch.rand(data.size())
 59 |     data_noise[rand<frac] = 0
 60 |     return data_noise
 61 | 
 62 | 
 63 | def acc(y_true, y_pred):
 64 |     """
 65 |     Calculate clustering accuracy. Require scikit-learn installed
 66 | 
 67 |     # Arguments
 68 |         y: true labels, numpy.array with shape `(n_samples,)`
 69 |         y_pred: predicted labels, numpy.array with shape `(n_samples,)`
 70 | 
 71 |     # Return
 72 |         accuracy, in [0,1]
 73 |     """
 74 |     y_true = y_true.astype(np.int64)
 75 |     assert y_pred.size == y_true.size
 76 |     D = max(y_pred.max(), y_true.max()) + 1
 77 |     w = np.zeros((D, D), dtype=np.int64)
 78 |     for i in range(y_pred.size):
 79 |         w[y_pred[i], y_true[i]] += 1
 80 |     from sklearn.utils.linear_assignment_ import linear_assignment
 81 |     ind = linear_assignment(w.max() - w)
 82 |     return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size
 83 | 
 84 | 
 85 | def detect_wrong(y_true, y_pred):
 86 |     """
 87 |     Simulating instance difficulty constraints. Require scikit-learn installed
 88 |     
 89 |     # Arguments
 90 |         y: true labels, numpy.array with shape `(n_samples,)`
 91 |         y_pred: predicted labels, numpy.array with shape `(n_samples,)`
 92 | 
 93 |     # Return
 94 |         A mask vector M =  1xn which indicates the difficulty degree
 95 |         We treat k-means as weak learner and set low confidence (0.1) for incorrect instances.
 96 |         Set high confidence (1) for correct instances.
 97 |     """
 98 |     y_true = y_true.astype(np.int64)
 99 |     assert y_pred.size == y_true.size
100 |     D = max(y_pred.max(), y_true.max()) + 1
101 |     w = np.zeros((D, D), dtype=np.int64)
102 |     for i in range(y_pred.size):
103 |         w[y_pred[i], y_true[i]] += 1
104 |     from sklearn.utils.linear_assignment_ import linear_assignment
105 |     ind = linear_assignment(w.max() - w)
106 |     mapping_dict = {}
107 |     for pair in ind:
108 |         mapping_dict[pair[0]] = pair[1]
109 |     wrong_preds = []
110 |     for i in range(y_pred.size):
111 |         if mapping_dict[y_pred[i]] != y_true[i]:
112 |             wrong_preds.append(-0.1)   # low confidence -0.1 set for k-means weak learner
113 |         else:
114 |             wrong_preds.append(1)
115 |     return np.array(wrong_preds)
116 | 
117 | 
def transitive_closure(ml_ind1, ml_ind2, cl_ind1, cl_ind2, n):
    """
    This function calculates the total transitive closure for must-links and
    the full entailment for cannot-links.

    # Arguments
        ml_ind1, ml_ind2 = instances within a pair of must-link constraints
        cl_ind1, cl_ind2 = instances within a pair of cannot-link constraints
        n = total training instance number (all indices must be in range(n))

    # Return
        four numpy arrays (ml_res1, ml_res2, cl_res1, cl_res2):
        transitive closure (must-links) and entailment of cannot-links, each
        pair stored once with the smaller index first; pair order is arbitrary

    # Raises
        Exception if a pair is entailed to be both must-link and cannot-link
    """
    ml_graph = {i: set() for i in range(n)}
    cl_graph = {i: set() for i in range(n)}

    def add_both(d, i, j):
        d[i].add(j)
        d[j].add(i)

    for (i, j) in zip(ml_ind1, ml_ind2):
        add_both(ml_graph, i, j)

    # Connected components of the must-link graph. The traversal uses an
    # explicit stack: a recursive DFS exceeds Python's recursion limit on
    # long must-link chains.
    visited = [False] * n
    for start in range(n):
        if visited[start]:
            continue
        visited[start] = True
        component = []
        stack = [start]
        while stack:
            node = stack.pop()
            component.append(node)
            for neighbour in ml_graph[node]:
                if not visited[neighbour]:
                    visited[neighbour] = True
                    stack.append(neighbour)
        # Every member must-links to every other member of its component.
        members = set(component)
        for x in component:
            ml_graph[x] |= members - {x}

    # A cannot-link between i and j extends to everything must-linked to
    # either endpoint.
    for (i, j) in zip(cl_ind1, cl_ind2):
        add_both(cl_graph, i, j)
        for y in ml_graph[j]:
            add_both(cl_graph, i, y)
        for x in ml_graph[i]:
            add_both(cl_graph, x, j)
            for y in ml_graph[j]:
                add_both(cl_graph, x, y)

    ml_res_set = set()
    for i in ml_graph:
        for j in ml_graph[i]:
            if j != i and j in cl_graph[i]:
                raise Exception('inconsistent constraints between %d and %d' % (i, j))
            ml_res_set.add((i, j) if i <= j else (j, i))
    cl_res_set = set()
    for i in cl_graph:
        for j in cl_graph[i]:
            cl_res_set.add((i, j) if i <= j else (j, i))

    ml_res1 = [x for (x, _) in ml_res_set]
    ml_res2 = [y for (_, y) in ml_res_set]
    cl_res1 = [x for (x, _) in cl_res_set]
    cl_res2 = [y for (_, y) in cl_res_set]
    return np.array(ml_res1), np.array(ml_res2), np.array(cl_res1), np.array(cl_res2)
194 | 
195 | 
def generate_random_pair(y, num):
    """
    Generate random pairwise constraints.

    # Arguments
        y: labels, a torch tensor (on any device) or anything numpy can turn
           into an array; the first dimension indexes the instances
        num: number of constraints to generate

    # Return
        ml_ind1, ml_ind2: index arrays of must-link pairs (equal labels)
        cl_ind1, cl_ind2: index arrays of cannot-link pairs (different labels)
        The two groups together contain exactly `num` pairs; a pair never
        links an instance to itself.

    # Raises
        ValueError if `num` > 0 and fewer than two instances are available
    """
    ml_ind1, ml_ind2 = [], []
    cl_ind1, cl_ind2 = [], []
    if torch.is_tensor(y):
        y = y.to(torch.device("cpu"))
        y = y.numpy()
    else:
        y = np.asarray(y)
    # With a single instance both draws are always equal and the loop below
    # would never terminate.
    if num > 0 and y.shape[0] < 2:
        raise ValueError("need at least two instances to generate pairwise constraints")
    last = y.shape[0] - 1
    while num > 0:
        tmp1 = random.randint(0, last)
        tmp2 = random.randint(0, last)
        if tmp1 == tmp2:
            continue
        if y[tmp1] == y[tmp2]:
            ml_ind1.append(tmp1)
            ml_ind2.append(tmp2)
        else:
            cl_ind1.append(tmp1)
            cl_ind2.append(tmp2)
        num -= 1
    return np.array(ml_ind1), np.array(ml_ind2), np.array(cl_ind1), np.array(cl_ind2)
217 | 
218 | 
def generate_mnist_triplets(y, num, embedding_path="../model/mnist_triplet_embedding.npy", margin=35):
    """
    Generate random triplet constraints

    # Arguments
        y: labels; only `y.shape[0]` (the number of instances) is used
        num: number of triplets to generate
        embedding_path: .npy file holding one trusted embedding row per
            instance (default: the MNIST embedding under ../model)
        margin: a triplet is kept only when the anchor-negative distance
            exceeds the anchor-positive distance by more than this value

    # Return
        anchor, positive and negative index arrays, each of length `num`

    Note: the sampling loop only stops once `num` triplets satisfied the
    margin, so it does not terminate if no such triplet exists.
    """
    # To download the trusted_embedding for mnist data, run the script download_model.sh
    # Or you can create your own trusted embedding by running our pairwise constraints model
    # with 100000 randomly generated constraints.
    mnist_embedding = np.load(embedding_path)
    anchor_inds, pos_inds, neg_inds = [], [], []
    last = y.shape[0] - 1
    while num > 0:
        tmp_anchor_index = random.randint(0, last)
        tmp_pos_index = random.randint(0, last)
        tmp_neg_index = random.randint(0, last)
        pos_distance = norm(mnist_embedding[tmp_anchor_index]-mnist_embedding[tmp_pos_index], 2)
        neg_distance = norm(mnist_embedding[tmp_anchor_index]-mnist_embedding[tmp_neg_index], 2)
        # The default of 35 is selected by grid search which produce human
        # trusted positive/negative pairs
        if neg_distance <= pos_distance + margin:
            continue
        anchor_inds.append(tmp_anchor_index)
        pos_inds.append(tmp_pos_index)
        neg_inds.append(tmp_neg_index)
        num -= 1
    return np.array(anchor_inds), np.array(pos_inds), np.array(neg_inds)
242 | 
243 | 
def generate_triplet_constraints_continuous(y, num, embedding_path="../model/fashion_triplet_embedding.npy", margin=80):
    """
    Generate random triplet constraints

    # Arguments
        y: labels; only `y.shape[0]` (the number of instances) is used
        num: number of triplets to generate
        embedding_path: .npy file holding one trusted embedding row per
            instance (default: the Fashion embedding under ../model)
        margin: a triplet is kept only when the anchor-negative distance
            exceeds the anchor-positive distance by more than this value

    # Return
        anchor, positive and negative index arrays, each of length `num`

    Note: the sampling loop only stops once `num` triplets satisfied the
    margin, so it does not terminate if no such triplet exists.
    """
    # To download the trusted_embedding for fashion data, run the script download_model.sh
    # Or you can create your own trusted embedding by running our pairwise constraints model
    # with 100000 randomly generated constraints.
    fashion_embedding = np.load(embedding_path)
    anchor_inds, pos_inds, neg_inds = [], [], []
    last = y.shape[0] - 1
    while num > 0:
        tmp_anchor_index = random.randint(0, last)
        tmp_pos_index = random.randint(0, last)
        tmp_neg_index = random.randint(0, last)
        pos_distance = norm(fashion_embedding[tmp_anchor_index]-fashion_embedding[tmp_pos_index], 2)
        neg_distance = norm(fashion_embedding[tmp_anchor_index]-fashion_embedding[tmp_neg_index], 2)
        # The default of 80 is selected by grid search which produce human
        # trusted positive/negative pairs
        if neg_distance <= pos_distance + margin:
            continue
        anchor_inds.append(tmp_anchor_index)
        pos_inds.append(tmp_pos_index)
        neg_inds.append(tmp_neg_index)
        num -= 1
    return np.array(anchor_inds), np.array(pos_inds), np.array(neg_inds)
267 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # Code for ECMLPKDD 2019 Paper: [A Framework for Deep Constrained Clustering - Algorithms and Advances](https://arxiv.org/abs/1901.10061)
  2 | 
  3 | ## Installation
  4 | 
  5 | #### Step 1: Clone the Code from Github
  6 | 
  7 | ```
  8 | git clone https://github.com/blueocean92/deep_constrained_clustering
  9 | cd deep_constrained_clustering
 10 | ```
 11 | 
 12 | 
 13 | 
 14 | 
 15 | #### Step 2: Install Requirements
 16 | 
 17 | **Python**: see [`requirements.txt`](https://github.com/blueocean92/deep_constrained_clustering/blob/master/requirements.txt) for the complete list of packages used. We recommend doing a clean installation of the requirements in a fresh conda environment:
 18 | ```bash
 19 | conda create -n testenv python=3.6
 20 | source activate testenv
 21 | pip install -r requirements.txt 
 22 | ```
 23 | 
 24 | If you don't want to do the above clean installation in a separate environment, you can also install the requirements directly with:
 25 | ```bash
 26 | pip install -r requirements.txt --no-index
 27 | ```
 28 | 
 29 | **PyTorch**: Note that you need [PyTorch](https://pytorch.org/). We used version 1.0.0. If you install the requirements into a clean environment as described above, PyTorch will be installed automatically.
 30 | 
 31 | 
 32 | ## Running Constrained Clustering Experiments
 33 | 
 34 | While in `deep_constrained_clustering` folder:
 35 | 
 36 | #### Step 1: Download Pretrained Networks
 37 | 
 38 | ```
 39 | sh download_model.sh
 40 | ```
 41 | 
 42 | #### Step 2: Download Processed Reuters Data (optional; MNIST and Fashion are available in torchvision.datasets)
 43 | 
 44 | ```
 45 | sh download_data.sh
 46 | ```
 47 | 
 48 | ```
 49 | cd experiments/
 50 | ```
 51 | 
 52 | While in `deep_constrained_clustering/experiments` folder:
 53 | #### Step 3: Run Experimental Scripts to Reproduce Results
 54 | 
 55 | ###### Option 1: Run Demo Pairwise Constraints Script
 56 | 
 57 | To run the pairwise constrained clustering using pre-trained weights (AE features, 6000 constraints), do:
 58 | ```bash
 59 | python run_DCC_pairwise.py --data $DATA
 60 | ```
 61 | 
 62 | For the `--data` flag which specifies the data set being used, the options are "MNIST", "Fashion" and "Reuters".
 63 | 
 64 | To run the pairwise constrained clustering from raw features (without the pre-trained weights), do:
 65 | ```bash
 66 | python run_DCC_pairwise.py --data $DATA --without_pretrain
 67 | ```
 68 | 
 69 | To run the pairwise constrained clustering without KMeans initialization, do:
 70 | ```bash
 71 | python run_DCC_pairwise.py --data $DATA --without_kmeans
 72 | ```
 73 | 
 74 | To run the pairwise constrained clustering with noisy pairwise constraints, do:
 75 | ```bash
 76 | python run_DCC_pairwise.py --data $DATA --noisy $NOISE
 77 | ```
 78 | 
 79 | For the `--noisy` flag which specifies the noisy degree, the option should be a positive float equal to the ratio of noisy constraints to ground truth constraints.
 80 | 
 81 | 
 82 | To save data for plotting, do:
 83 | ```bash
 84 | python run_DCC_pairwise.py --data $DATA --plotting
 85 | ```
 86 | 
 87 | This will save the experiment data for plotting in folders under ./plotting
 88 | 
 89 | To plot the results, do:
 90 | ```bash
 91 | python ./plotting/plot_pairewise.py
 92 | ```
 93 | 
 94 | 
 95 | ###### Option 2: Run Demo Instance Constraints Script
 96 | 
 97 | To run the instance difficulty constrained clustering, do:
 98 | ```bash
 99 | python run_DCC_instance.py --data $DATA
100 | ```
101 | 
102 | ###### Option 3: Run Demo Triplets Constraints Script
103 | 
104 | To run the triplets constrained clustering (6000 constraints), do:
105 | ```bash
106 | python run_DCC_triplets.py --data $DATA
107 | ```
108 | 
109 | 
110 | ###### Option 4: Run Demo Global Constraints Script
111 | 
112 | To run the global size constrained clustering, do:
113 | ```bash
114 | python run_DCC_global.py --data $DATA
115 | ```
116 | 
117 | 
118 | ###### Option 5: Run Demo Improved DEC Script
119 | 
120 | To run the baseline Improved DEC, do:
121 | ```bash
122 | python run_improved_DEC.py --data $DATA
123 | ```
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2018.11.29
 2 | cffi==1.11.5
 3 | numpy==1.15.4
 4 | olefile==0.46
 5 | Pillow==6.2.0
 6 | pycparser==2.19
 7 | scikit-learn==0.20.2
 8 | scipy==1.1.0
 9 | six==1.12.0
10 | torch==1.0.0
11 | torchvision==0.2.1
12 | 


--------------------------------------------------------------------------------