├── download_data.sh ├── download_model.sh ├── experiments ├── plotting │ └── plot_pairewise.py ├── run_DCC_global.py ├── run_DCC_instance.py ├── run_DCC_pairwise.py ├── run_DCC_triplets.py ├── run_DEC.py ├── run_improved_DEC.py └── run_sdae.py ├── lib ├── __pycache__ │ ├── datasets.cpython-36.pyc │ ├── dcc.cpython-36.pyc │ ├── dec.cpython-36.pyc │ ├── denoisingAutoencoder.cpython-36.pyc │ ├── ops.cpython-36.pyc │ ├── stackedDAE.cpython-36.pyc │ └── utils.cpython-36.pyc ├── datasets.py ├── dcc.py ├── dec.py ├── denoisingAutoencoder.py ├── ops.py ├── stackedDAE.py └── utils.py ├── readme.md └── requirements.txt /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ##!/usr/bin/env bash 3 | 4 | TASKS="reutersidf10k_train.npy \ 5 | reutersidf10k_test.npy" 6 | 7 | 8 | for t in $TASKS; do 9 | echo "Downloading model ${t}." 10 | wget "https://s3-us-west-1.amazonaws.com/deep-constrained-clustering/\ 11 | Data-Reuters/${t}" -P ./experiments/dataset/reuters/ 12 | done 13 | -------------------------------------------------------------------------------- /download_model.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ##!/usr/bin/env bash 3 | 4 | TASKS="fashion_sdae_weights.pt \ 5 | fashion_triplet_embedding.npy \ 6 | mnist_sdae_weights.pt \ 7 | mnist_triplet_embedding.npy \ 8 | reuters10k_sdae_weights.pt" 9 | 10 | 11 | for t in $TASKS; do 12 | echo "Downloading model ${t}." 
13 | wget "https://s3-us-west-1.amazonaws.com/deep-constrained-clustering/\ 14 | model-log-final/${t}" -P ./model/ 15 | done -------------------------------------------------------------------------------- /experiments/plotting/plot_pairewise.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import time 5 | import random 6 | import re 7 | import json 8 | import pickle 9 | import pandas as pd 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | from sklearn.manifold import TSNE 14 | from collections import defaultdict 15 | 16 | 17 | if __name__ == "__main__": 18 | 19 | folders = [d for d in os.listdir(".") if os.path.isdir(d) and d != "Legend" and d != "Util"] 20 | 21 | label_dict = { 22 | "M": ["0","1","2","3","4","5","6","7","8","9"], 23 | "F": ["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"], 24 | "R": ["corporate/industrial", "government/social", "markets", "economics"] 25 | } 26 | 27 | for folder in folders: 28 | 29 | print("\nStarting "+folder) 30 | 31 | try: 32 | latent_files = [f for f in os.listdir(folder) if f.startswith("save")] 33 | print(latent_files) 34 | except: 35 | print("No latent files, Skipping Folder") 36 | continue 37 | 38 | link_points = [] 39 | 40 | try: 41 | must_links = pd.read_pickle(os.path.join(folder,"mustlinks.pkl")) 42 | cannot_links = pd.read_pickle(os.path.join(folder,"cannotlinks.pkl")) 43 | 44 | random.seed(1) 45 | ml_sample = random.sample(range(must_links.shape[0]), must_links.shape[0]) 46 | link_points += must_links.iloc[ml_sample[:20],0].tolist() 47 | link_points += must_links.iloc[ml_sample[:20],1].tolist() 48 | random.seed(2) 49 | cl_sample = random.sample(range(cannot_links.shape[0]), cannot_links.shape[0]) 50 | link_points += cannot_links.iloc[cl_sample[:20],0].tolist() 51 | link_points += cannot_links.iloc[cl_sample[:20],1].tolist() 52 | except: 53 | print("No 
must link / cannot link, Skipping Folder") 54 | continue 55 | 56 | try: 57 | noisy_must_links = pd.read_pickle(os.path.join(folder,"noisymustlinks.pkl")) 58 | random.seed(3) 59 | noisy_ml_sample = random.sample(range(noisy_must_links.shape[0]),noisy_must_links.shape[0]) 60 | link_points += noisy_must_links.iloc[noisy_ml_sample[:20],0].tolist() 61 | link_points += noisy_must_links.iloc[noisy_ml_sample[:20],1].tolist() 62 | except: 63 | noisy_must_links = [] 64 | noisy_ml_sample = [] 65 | 66 | try: 67 | noisy_cannot_links = pd.read_pickle(os.path.join(folder,"noisycannotlinks.pkl")) 68 | random.seed(4) 69 | noisy_cl_sample = random.sample(range(noisy_cannot_links.shape[0]),noisy_cannot_links.shape[0]) 70 | link_points += noisy_cannot_links.iloc[noisy_cl_sample[:20],0].tolist() 71 | link_points += noisy_cannot_links.iloc[noisy_cl_sample[:20],1].tolist() 72 | except: 73 | noisy_cannot_links = [] 74 | noisy_cl_sample = [] 75 | 76 | try: 77 | with open(os.path.join(folder,"intermediate_results.json"), "r") as fp: 78 | intermediate_results = json.load(fp) 79 | except: 80 | intermediate_results = defaultdict(lambda:defaultdict(lambda:0.0)) 81 | 82 | link_points = list(set(link_points)) 83 | 84 | # Start Plotting 85 | for k, file in enumerate(latent_files): 86 | 87 | df = pd.read_pickle(os.path.join(folder,file)) 88 | epoch = re.sub('[^0-9]','', file) 89 | 90 | if folder.startswith("Reuters"): 91 | latent_full = df.sample(frac=0.75, random_state=7).append(df.iloc[link_points,:]) 92 | else: 93 | latent_full = df.sample(frac=0.25, random_state=7).append(df.iloc[link_points,:]) 94 | 95 | latent = latent_full.iloc[:,0:10].copy() 96 | 97 | time_start = time.time() 98 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=350) 99 | tsne_results = tsne.fit_transform(latent) 100 | print('t-SNE done! 
Time elapsed: {} seconds'.format(time.time()-time_start)) 101 | 102 | latent['tsne-1'] = tsne_results[:,0] 103 | latent['tsne-2'] = tsne_results[:,1] 104 | latent["class"] = np.array([label_dict[folder[0]][x] for x in latent_full["y"].tolist()]) 105 | 106 | plt.figure(k,figsize=(16,10)) 107 | plt.title("Accuracy: %.2f, NMI: %.2f"%(intermediate_results["acc"][epoch],intermediate_results["nmi"][epoch])) 108 | 109 | sns.scatterplot( 110 | x="tsne-1", y="tsne-2", 111 | hue="class", 112 | palette=sns.color_palette("hls", latent["class"].nunique()), 113 | data=latent, 114 | legend="full", 115 | alpha=0.8, 116 | s=20 117 | ) 118 | 119 | 120 | # plot links 121 | plot_links = [ {"sample": ml_sample, "link": must_links, "count":10, "style": 'b-', "label": "must link"}, 122 | {"sample": cl_sample, "link": cannot_links, "count":10, "style": 'r-', "label": "cannot link"}, 123 | {"sample": noisy_ml_sample, "link": noisy_must_links, "count":10, "style": 'k-', "label": "noisy must link"}, 124 | {"sample": noisy_cl_sample, "link": noisy_cannot_links, "count":10, "style": 'k:', "label": "noisy cannot link"}, 125 | ] 126 | 127 | for plot_link in plot_links: 128 | count = 0 129 | for i in plot_link["sample"]: 130 | if count >= plot_link["count"]: 131 | break 132 | try: 133 | p1 = latent.loc[plot_link["link"].loc[i][0]] 134 | p2 = latent.loc[plot_link["link"].loc[i][1]] 135 | plt.plot([p1["tsne-1"],p2["tsne-1"]], [p1["tsne-2"],p2["tsne-2"]], plot_link["style"], label=plot_link["label"]) 136 | count += 1 137 | except: 138 | pass 139 | 140 | # remove duplicate label for lines 141 | handles, labels = plt.gca().get_legend_handles_labels() 142 | newLabels, newHandles = [], [] 143 | for handle, label in zip(handles, labels): 144 | if label not in newLabels: 145 | newLabels.append(label) 146 | newHandles.append(handle) 147 | 148 | #lgd = plt.gca().legend(newHandles, newLabels, loc='center left', bbox_to_anchor=(1, 0.5)) 149 | lgd = plt.gca().legend(newHandles, newLabels, loc='center', 
bbox_to_anchor=(0.5, -0.10),fancybox=True, ncol=len(newLabels), columnspacing=1.0,handlelength=1.0) 150 | 151 | plt.savefig(os.path.join(folder,folder+"_"+epoch+".png"), bbox_extra_artists=(lgd,), bbox_inches='tight') 152 | plt.clf() 153 | -------------------------------------------------------------------------------- /experiments/run_DCC_global.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import torch.utils.data 4 | import numpy as np 5 | import argparse 6 | from lib.dcc import IDEC 7 | from lib.datasets import MNIST, FashionMNIST 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser(description='Global MNIST Example') 11 | parser.add_argument('--lr', type=float, default=0.001, metavar='N', 12 | help='learning rate for training (default: 0.001)') 13 | parser.add_argument('--batch-size', type=int, default=256, metavar='N', 14 | help='input batch size for training (default: 256)') 15 | parser.add_argument('--update-interval', type=int, default=1, metavar='N', 16 | help='number of epochs to train (default: 1)') 17 | parser.add_argument('--epochs', type=int, default=200, metavar='N', 18 | help='number of epochs to train (default: 200)') 19 | parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N', 20 | help='directory for pre-trained weights') 21 | parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion)') 22 | parser.add_argument('--use_pretrain', type=str, default="True") 23 | args = parser.parse_args() 24 | 25 | # Load data 26 | mnist_train = MNIST('./dataset/mnist', train=True, download=True) 27 | mnist_test = MNIST('./dataset/mnist', train=False) 28 | X = mnist_train.train_data 29 | y = mnist_train.train_labels 30 | test_X = mnist_test.test_data 31 | test_y = mnist_test.test_labels 32 | if args.data == "Fashion": 33 | fashionmnist_train = 
FashionMNIST('./dataset/fashion_mnist', train=True, download=True) 34 | fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False) 35 | X = fashionmnist_train.train_data 36 | y = fashionmnist_train.train_labels 37 | test_X = fashionmnist_test.test_data 38 | test_y = fashionmnist_test.test_labels 39 | args.pretrain="../model/fashion_sdae_weights.pt" 40 | ml_penalty = 1 41 | 42 | # Set parameters 43 | ml_penalty, cl_penalty = 0.1, 1 44 | idec = IDEC(input_dim=784, z_dim=10, n_clusters=10, 45 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 46 | 47 | # Print Network Structure 48 | print(idec) 49 | if args.use_pretrain == "True": 50 | idec.load_model(args.pretrain) 51 | 52 | # Construct constraints 53 | ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([]) 54 | anchor, positive, negative = np.array([]), np.array([]), np.array([]) 55 | instance_guidance = torch.zeros(X.shape[0]).cuda() 56 | use_global = True 57 | 58 | # Train the network 59 | train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global, ml_penalty, cl_penalty, X, y, 60 | lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, 61 | update_interval=args.update_interval,tol=1*1e-3) 62 | 63 | # Make predictions on test set 64 | test_acc, test_nmi = idec.predict(test_X, test_y) 65 | 66 | # Report results 67 | print("Training Accuracy:", train_acc) 68 | print("Training NMI;", train_nmi) 69 | print("Training Epochs:", epo) 70 | print("Test Accuracy:", test_acc) 71 | print("Test NMI:", test_nmi) 72 | -------------------------------------------------------------------------------- /experiments/run_DCC_instance.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import torch.utils.data 4 | import numpy as np 5 | import argparse 6 | from lib.dcc import IDEC 7 | from
lib.datasets import MNIST, FashionMNIST, Reuters 8 | from sklearn.cluster import KMeans 9 | from lib.utils import detect_wrong 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description='Instance Difficulty Constrained Clustering Example') 14 | parser.add_argument('--lr', type=float, default=0.001, metavar='N', 15 | help='learning rate for training (default: 0.001)') 16 | parser.add_argument('--batch-size', type=int, default=256, metavar='N', 17 | help='input batch size for training (default: 256)') 18 | parser.add_argument('--update-interval', type=int, default=1, metavar='N', 19 | help='number of epochs to train (default: 1)') 20 | parser.add_argument('--epochs', type=int, default=200, metavar='N', 21 | help='number of epochs to train (default: 200)') 22 | parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N', 23 | help='directory for pre-trained weights') 24 | parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)') 25 | parser.add_argument('--use_pretrain', type=bool, default=True) 26 | args = parser.parse_args() 27 | 28 | # Load data 29 | mnist_train = MNIST('./dataset/mnist', train=True, download=True) 30 | mnist_test = MNIST('./dataset/mnist', train=False) 31 | X = mnist_train.train_data 32 | y = mnist_train.train_labels 33 | test_X = mnist_test.test_data 34 | test_y = mnist_test.test_labels 35 | 36 | # Set parameters 37 | ml_penalty, cl_penalty = 0.1, 1 38 | 39 | idec = IDEC(input_dim=784, z_dim=10, n_clusters=10, 40 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 41 | if args.data == "Fashion": 42 | fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True) 43 | fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False) 44 | X = fashionmnist_train.train_data 45 | y = fashionmnist_train.train_labels 46 | test_X = fashionmnist_test.test_data 47 
| test_y = fashionmnist_test.test_labels 48 | args.pretrain="../model/fashion_sdae_weights.pt" 49 | ml_penalty = 1 50 | elif args.data == "Reuters": 51 | reuters_train = Reuters('./dataset/reuters', train=True, download=False) 52 | reuters_test = Reuters('./dataset/reuters', train=False) 53 | X = reuters_train.train_data 54 | y = reuters_train.train_labels 55 | test_X = reuters_test.test_data 56 | test_y = reuters_test.test_labels 57 | args.pretrain="../model/reuters10k_sdae_weights.pt" 58 | idec = IDEC(input_dim=2000, z_dim=10, n_clusters=4, 59 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 60 | if args.use_pretrain: 61 | idec.load_model(args.pretrain) 62 | 63 | # Print network structure 64 | print(idec) 65 | 66 | # Construct Constraints 67 | ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([]) 68 | anchor, positive, negative = np.array([]), np.array([]), np.array([]) 69 | 70 | # Provide instance guidance based on k-means results. High confidence (1) for correct instances. 71 | # Low confidence (0.1) for incorrect instances since k-means + AE does not achieve good results.
72 | latent = idec.encodeBatch(X).cpu().numpy() 73 | kmeans = KMeans(10, n_init=20) 74 | y_pred = kmeans.fit_predict(latent) 75 | instance_guidance = detect_wrong(y.cpu().numpy(), y_pred) 76 | instance_guidance = torch.tensor(instance_guidance, dtype=torch.float32).cuda() 77 | use_global = False 78 | 79 | # Train the network 80 | train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global, ml_penalty, cl_penalty, X, y, 81 | lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, 82 | update_interval=args.update_interval, tol=1*1e-3) 83 | 84 | # Make prediction 85 | test_acc, test_nmi = idec.predict(test_X, test_y) 86 | 87 | # Report results 88 | print("Training Accuracy:", train_acc) 89 | print("Training NMI;", train_nmi) 90 | print("Training Epochs:", epo) 91 | print("Test Accuracy:", test_acc) 92 | print("Test NMI:", test_nmi) 93 | -------------------------------------------------------------------------------- /experiments/run_DCC_pairwise.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append("..") 4 | import torch.utils.data 5 | import numpy as np 6 | import pandas as pd 7 | import argparse 8 | import time 9 | 10 | from lib.dcc import IDEC 11 | from lib.datasets import MNIST, FashionMNIST, Reuters 12 | from lib.utils import transitive_closure, generate_random_pair 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser(description='Pairwise MNIST Example') 17 | parser.add_argument('--lr', type=float, default=0.001, metavar='N', 18 | help='learning rate for training (default: 0.001)') 19 | parser.add_argument('--batch-size', type=int, default=256, metavar='N', 20 | help='input batch size for training (default: 256)') 21 | parser.add_argument('--update-interval', type=int, default=1, metavar='N', 22 | help='number of epochs to train (default: 1)') 23 | parser.add_argument('--epochs', 
type=int, default=500, metavar='N', 24 | help='number of epochs to train (default: 500)') 25 | parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N', 26 | help='directory for pre-trained weights') 27 | parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)') 28 | parser.add_argument('--without_pretrain', action='store_false') 29 | parser.add_argument('--without_kmeans', action='store_false') 30 | parser.add_argument('--noisy', type=float, default=0.0, metavar='N', 31 | help='noisy constraints rate for training (default: 0.0)') 32 | parser.add_argument('--plotting', action='store_true') 33 | args = parser.parse_args() 34 | 35 | # Load data 36 | mnist_train = MNIST('./dataset/mnist', train=True, download=True) 37 | mnist_test = MNIST('./dataset/mnist', train=False) 38 | X = mnist_train.train_data 39 | y = mnist_train.train_labels 40 | test_X = mnist_test.test_data 41 | test_y = mnist_test.test_labels 42 | 43 | # Set parameters 44 | ml_penalty, cl_penalty = 0.1, 1 45 | idec = IDEC(input_dim=784, z_dim=10, n_clusters=10, 46 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 47 | if args.data == "Fashion": 48 | fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True) 49 | fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False) 50 | X = fashionmnist_train.train_data 51 | y = fashionmnist_train.train_labels 52 | test_X = fashionmnist_test.test_data 53 | test_y = fashionmnist_test.test_labels 54 | args.pretrain="../model/fashion_sdae_weights.pt" 55 | ml_penalty = 1 56 | elif args.data == "Reuters": 57 | reuters_train = Reuters('./dataset/reuters', train=True, download=False) 58 | reuters_test = Reuters('./dataset/reuters', train=False) 59 | X = reuters_train.train_data 60 | y = reuters_train.train_labels 61 | test_X = reuters_test.test_data 62 | test_y = reuters_test.test_labels 63 | 
args.pretrain="../model/reuters10k_sdae_weights.pt" 64 | idec = IDEC(input_dim=2000, z_dim=10, n_clusters=4, 65 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 66 | 67 | 68 | model_tag = "Raw" 69 | if args.without_pretrain: 70 | model_tag = "Pretrain" 71 | idec.load_model(args.pretrain) 72 | 73 | init_tag = "Random" 74 | if args.without_kmeans: 75 | init_tag = "KMeans" 76 | 77 | # Print Network Structure 78 | print(idec) 79 | 80 | # Construct Constraints 81 | num_constraints = 6000 82 | ml_ind1, ml_ind2, cl_ind1, cl_ind2 = generate_random_pair(y, num_constraints*2) 83 | ml_ind1, ml_ind2, cl_ind1, cl_ind2 = transitive_closure(ml_ind1, ml_ind2, cl_ind1, cl_ind2, X.shape[0]) 84 | 85 | ml_ind1 = ml_ind1[:num_constraints] 86 | ml_ind2 = ml_ind2[:num_constraints] 87 | cl_ind1 = cl_ind1[:num_constraints] 88 | cl_ind2 = cl_ind2[:num_constraints] 89 | 90 | plotting_dir = "" 91 | if args.plotting: 92 | 93 | dir_name = args.data+"_"+model_tag+"_"+init_tag+"_%d"%num_constraints 94 | if args.noisy > 0: 95 | dir_name += "_Noisy_%d%%"%(int(args.noisy*100)) 96 | dir_name += "_"+time.strftime("%Y%m%d-%H%M") 97 | plotting_dir = "./plotting/%s"%dir_name 98 | if not os.path.exists(plotting_dir): 99 | os.mkdir(plotting_dir) 100 | 101 | mldf = pd.DataFrame(data = [ml_ind1,ml_ind2]).T 102 | mldf.to_pickle(os.path.join(plotting_dir,"mustlinks.pkl")) 103 | cldf = pd.DataFrame(data = [cl_ind1,cl_ind2]).T 104 | cldf.to_pickle(os.path.join(plotting_dir,"cannotlinks.pkl")) 105 | 106 | if args.noisy > 0: 107 | nml_ind1, nml_ind2, ncl_ind1, ncl_ind2 = generate_random_pair(y, num_constraints*2) 108 | ncl_ind1, ncl_ind2, nml_ind1, nml_ind2 = transitive_closure(nml_ind1, nml_ind2, ncl_ind1, ncl_ind2, X.shape[0]) 109 | 110 | nml_ind1 = nml_ind1[:int(ml_ind1.size*args.noisy)] 111 | nml_ind2 = nml_ind2[:int(ml_ind2.size*args.noisy)] 112 | ncl_ind1 = ncl_ind1[:int(cl_ind1.size*args.noisy)] 113 | ncl_ind2 = ncl_ind2[:int(cl_ind2.size*args.noisy)] 114 | 115 | 
if plotting_dir: 116 | nmldf = pd.DataFrame(data = [nml_ind1,nml_ind2]).T 117 | nmldf.to_pickle(os.path.join(plotting_dir,"noisymustlinks.pkl")) 118 | ncldf = pd.DataFrame(data = [ncl_ind1,ncl_ind2]).T 119 | ncldf.to_pickle(os.path.join(plotting_dir,"noisycannotlinks.pkl")) 120 | 121 | ml_ind1 = np.append(ml_ind1,nml_ind1) 122 | ml_ind2 = np.append(ml_ind2,nml_ind2) 123 | cl_ind1 = np.append(cl_ind1,ncl_ind1) 124 | cl_ind2 = np.append(cl_ind2,ncl_ind2) 125 | 126 | anchor, positive, negative = np.array([]), np.array([]), np.array([]) 127 | instance_guidance = torch.zeros(X.shape[0]).cuda() 128 | use_global = False 129 | 130 | # Train Neural Network 131 | train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global, ml_penalty, cl_penalty, X, y, 132 | lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, 133 | update_interval=args.update_interval,tol=1*1e-3,use_kmeans=args.without_kmeans,plotting=plotting_dir) 134 | 135 | # Make Predictions 136 | test_acc, test_nmi = idec.predict(test_X, test_y) 137 | 138 | # Report Results 139 | print("ACC:", train_acc) 140 | print("NMI;", train_nmi) 141 | print("Epochs:", epo) 142 | print("testAcc:", test_acc) 143 | print("testNMI:", test_nmi) 144 | print("ML Closure:", ml_ind1.shape[0]) 145 | print("CL Closure:", cl_ind1.shape[0]) 146 | -------------------------------------------------------------------------------- /experiments/run_DCC_triplets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import torch.utils.data 4 | import numpy as np 5 | import argparse 6 | from lib.dcc import IDEC 7 | from lib.datasets import MNIST, FashionMNIST 8 | from lib.utils import generate_mnist_triplets, generate_triplet_constraints_continuous 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser(description='Triplet Constraints Example') 13 | 
parser.add_argument('--lr', type=float, default=0.001, metavar='N', 14 | help='learning rate for training (default: 0.001)') 15 | parser.add_argument('--batch-size', type=int, default=256, metavar='N', 16 | help='input batch size for training (default: 256)') 17 | parser.add_argument('--update-interval', type=int, default=1, metavar='N', 18 | help='number of epochs to train (default: 1)') 19 | parser.add_argument('--epochs', type=int, default=200, metavar='N', 20 | help='number of epochs to train (default: 200)') 21 | parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N', 22 | help='directory for pre-trained weights') 23 | parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion)') 24 | parser.add_argument('--use_pretrain', type=bool, default=True) 25 | args = parser.parse_args() 26 | 27 | # Load data 28 | mnist_train = MNIST('./dataset/mnist', train=True, download=True) 29 | mnist_test = MNIST('./dataset/mnist', train=False) 30 | X = mnist_train.train_data 31 | y = mnist_train.train_labels 32 | test_X = mnist_test.test_data 33 | test_y = mnist_test.test_labels 34 | 35 | # Set parameters 36 | ml_penalty, cl_penalty = 0.1, 1 37 | if args.data == "Fashion": 38 | fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True) 39 | fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False) 40 | X = fashionmnist_train.train_data 41 | y = fashionmnist_train.train_labels 42 | test_X = fashionmnist_test.test_data 43 | test_y = fashionmnist_test.test_labels 44 | args.pretrain="../model/fashion_sdae_weights.pt" 45 | ml_penalty = 1 46 | idec = IDEC(input_dim=784, z_dim=10, n_clusters=10, 47 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 48 | if args.use_pretrain: 49 | idec.load_model(args.pretrain) 50 | 51 | # Print Network Structure 52 | print(idec) 53 | 54 | # Construct constraints 55 | ml_ind1, ml_ind2, 
cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([]) 56 | if args.data != "Fashion": 57 | anchor, positive, negative = generate_mnist_triplets(y, 6000) 58 | else: 59 | anchor, positive, negative = generate_triplet_constraints_continuous(y,6000) 60 | instance_guidance = torch.zeros(X.shape[0]).cuda() 61 | use_global = False 62 | 63 | # Train the network 64 | train_acc, train_nmi, epo = idec.fit(anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2, instance_guidance, use_global, ml_penalty, cl_penalty, X, y, 65 | lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, 66 | update_interval=args.update_interval, tol=2*1e-3) 67 | 68 | # Make predictions 69 | test_acc, test_nmi = idec.predict(test_X, test_y) 70 | 71 | # Print the result 72 | print("ACC:", train_acc) 73 | print("NMI;", train_nmi) 74 | print("Epochs:", epo) 75 | print("testAcc:", test_acc) 76 | print("testNMI:", test_nmi) 77 | print("ML Closure:", ml_ind1.shape[0]) 78 | print("CL Closure:", cl_ind1.shape[0]) 79 | -------------------------------------------------------------------------------- /experiments/run_DEC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import argparse 4 | from lib.dec import DEC 5 | from lib.datasets import MNIST 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser(description='DEC MNIST Example') 9 | parser.add_argument('--lr', type=float, default=0.01, metavar='N', 10 | help='learning rate for training (default: 0.01)') 11 | parser.add_argument('--batch-size', type=int, default=256, metavar='N', 12 | help='input batch size for training (default: 256)') 13 | parser.add_argument('--update-interval', type=int, default=1, metavar='N', 14 | help='update-interval (default: 1)') 15 | parser.add_argument('--epochs', type=int, default=200, metavar='N', 16 | help='number of epochs to train (default: 200)') 17 | parser.add_argument('--pretrain', 
type=str, default="../model/sdae.pt", metavar='N', 18 | help='use pre-trained weights') 19 | args = parser.parse_args() 20 | 21 | 22 | mnist_train = MNIST('./dataset/mnist', train=True, download=True) 23 | mnist_test = MNIST('./dataset/mnist', train=False) 24 | X = mnist_train.train_data 25 | y = mnist_train.train_labels 26 | 27 | dec = DEC(input_dim=784, z_dim=10, n_clusters=10, 28 | encodeLayer=[500, 500, 2000], activation="relu", dropout=0) 29 | print(dec) 30 | dec.load_model(args.pretrain) 31 | dec.fit(X, y, lr=args.lr, batch_size=args.batch_size, num_epochs=args.epochs, 32 | update_interval=args.update_interval) 33 | 34 | -------------------------------------------------------------------------------- /experiments/run_improved_DEC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import torch.utils.data 4 | import numpy as np 5 | import argparse 6 | from lib.dcc import IDEC 7 | from lib.datasets import MNIST, FashionMNIST, Reuters 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser(description='IDEC MNIST Example') 11 | parser.add_argument('--lr', type=float, default=0.001, metavar='N', 12 | help='learning rate for training (default: 0.001)') 13 | parser.add_argument('--batch-size', type=int, default=256, metavar='N', 14 | help='input batch size for training (default: 256)') 15 | parser.add_argument('--update-interval', type=int, default=1, metavar='N', 16 | help='number of epochs to train (default: 1)') 17 | parser.add_argument('--epochs', type=int, default=200, metavar='N', 18 | help='number of epochs to train (default: 200)') 19 | parser.add_argument('--pretrain', type=str, default="../model/mnist_sdae_weights.pt", metavar='N', 20 | help='directory for pre-trained weights') 21 | parser.add_argument('--data', type=str, default="MNIST", metavar='N', help='dataset(MNIST, Fashion, Reuters)') 22 | parser.add_argument('--use_pretrain', type=bool, default=True) 23 
| args = parser.parse_args() 24 | 25 | # Load data 26 | mnist_train = MNIST('./dataset/mnist', train=True, download=True) 27 | mnist_test = MNIST('./dataset/mnist', train=False) 28 | X = mnist_train.train_data 29 | y = mnist_train.train_labels 30 | test_X = mnist_test.test_data 31 | test_y = mnist_test.test_labels 32 | 33 | # Set parameters 34 | ml_penalty, cl_penalty = 0.1, 1 35 | idec = IDEC(input_dim=784, z_dim=10, n_clusters=10, 36 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 37 | if args.data == "Fashion": 38 | fashionmnist_train = FashionMNIST('./dataset/fashion_mnist', train=True, download=True) 39 | fashionmnist_test = FashionMNIST('./dataset/fashion_mnist', train=False) 40 | X = fashionmnist_train.train_data 41 | y = fashionmnist_train.train_labels 42 | test_X = fashionmnist_test.test_data 43 | test_y = fashionmnist_test.test_labels 44 | args.pretrain="../model/fashion_sdae_weights.pt" 45 | ml_penalty = 1 46 | elif args.data == "Reuters": 47 | reuters_train = Reuters('./dataset/reuters', train=True, download=False) 48 | reuters_test = Reuters('./dataset/reuters', train=False) 49 | X = reuters_train.train_data 50 | y = reuters_train.train_labels 51 | test_X = reuters_test.test_data 52 | test_y = reuters_test.test_labels 53 | args.pretrain="../model/reuters10k_sdae_weights.pt" 54 | idec = IDEC(input_dim=2000, z_dim=10, n_clusters=4, 55 | encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500], activation="relu", dropout=0) 56 | if args.use_pretrain: 57 | idec.load_model(args.pretrain) 58 | 59 | # Print network structure 60 | print(idec) 61 | 62 | # Construct constraints (here is the baseline so no constraints are provided). 
"""Pre-train and fine-tune a stacked denoising autoencoder (SDAE) on MNIST.

The saved weights are meant to be loaded as the pre-trained model by the
IDEC/DEC/DCC experiment scripts.
"""
import os
import sys

sys.path.append("..")
import torch.utils.data
import argparse
from lib.stackedDAE import StackedDAE
from lib.datasets import MNIST

if __name__ == "__main__":
    # The original description said "VAE MNIST Example"; this script trains
    # a stacked denoising autoencoder, not a VAE.
    parser = argparse.ArgumentParser(
        description='Stacked denoising autoencoder pre-training on MNIST')
    parser.add_argument('--lr', type=float, default=0.1, metavar='N',
                        help='learning rate for training (default: 0.1)')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--pretrainepochs', type=int, default=300, metavar='N',
                        help='number of layer-wise pre-training epochs (default: 300)')
    parser.add_argument('--epochs', type=int, default=500, metavar='N',
                        help='number of epochs to train (default: 500)')
    parser.add_argument('--save-path', type=str,
                        default="model/sdae_mnist_weights.pt", metavar='PATH',
                        help='where to store the trained weights '
                             '(default: model/sdae_mnist_weights.pt)')
    args = parser.parse_args()

    # Create the output directory *before* training so that a missing folder
    # is detected immediately instead of after hundreds of epochs.
    save_dir = os.path.dirname(args.save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Load data for pre-training
    train_loader = torch.utils.data.DataLoader(
        MNIST('./dataset/mnist', train=True, download=True),
        batch_size=args.batch_size, shuffle=True, num_workers=0)
    test_loader = torch.utils.data.DataLoader(
        MNIST('./dataset/mnist', train=False),
        batch_size=args.batch_size, shuffle=False, num_workers=0)

    sdae = StackedDAE(input_dim=784, z_dim=10, binary=False,
                      encodeLayer=[500, 500, 2000], decodeLayer=[2000, 500, 500],
                      activation="relu", dropout=0)

    # Print the pre-train model structure
    print(sdae)
    sdae.pretrain(train_loader, test_loader, lr=args.lr, batch_size=args.batch_size,
                  num_epochs=args.pretrainepochs, corrupt=0.2, loss_type="mse")

    # Train the stacked denoising autoencoder
    sdae.fit(train_loader, test_loader, lr=args.lr, num_epochs=args.epochs,
             corrupt=0.2, loss_type="mse")

    # Save the weights as pre-trained model for IDEC/DEC/DCC
    sdae.save_model(args.save_path)
https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/dec.cpython-36.pyc -------------------------------------------------------------------------------- /lib/__pycache__/denoisingAutoencoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/denoisingAutoencoder.cpython-36.pyc -------------------------------------------------------------------------------- /lib/__pycache__/ops.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/ops.cpython-36.pyc -------------------------------------------------------------------------------- /lib/__pycache__/stackedDAE.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/stackedDAE.cpython-36.pyc -------------------------------------------------------------------------------- /lib/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blueocean92/deep_constrained_clustering/22e5c98a726b0e48f48c4dbf601e6f1a0199c083/lib/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /lib/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import errno 4 | import numpy as np 5 | import gzip 6 | import torch 7 | import pickle 8 | import torch.utils.data as data 9 | import codecs 10 | import urllib 11 | 12 | 13 | class 
class MNIST(data.Dataset):
    """MNIST dataset loaded fully into memory as flattened float vectors.

    The processed split (``processed/training.pt`` or ``processed/test.pt``)
    is loaded at construction time, every image is flattened to one row,
    converted to float and multiplied by 0.02, and the labels are cast to
    int32.  When CUDA is available both tensors are moved to the GPU.

    Depending on ``train`` the instance exposes either ``train_data`` /
    ``train_labels`` or ``test_data`` / ``test_labels``.

    Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the
            internet and puts it in root directory. If dataset is already
            downloaded, it is not downloaded again.
        transform (callable, optional): Stored on the instance; note that
            ``__getitem__`` does not apply it.
        target_transform (callable, optional): Stored on the instance; note
            that ``__getitem__`` does not apply it.

    Raises:
        RuntimeError: If the processed files are missing after the optional
            download step.
    """
    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]
    raw_folder = 'raw'
    processed_folder = 'processed'
    training_file = 'training.pt'
    test_file = 'test.pt'
    classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
               '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
    class_to_idx = {_class: i for i, _class in enumerate(classes)}

    def __init_subclass__(cls, **kwargs):
        # A subclass that overrides ``classes`` must not inherit the digit
        # mapping computed above, so rebuild it from the subclass' own list.
        super().__init_subclass__(**kwargs)
        cls.class_to_idx = {name: idx for idx, name in enumerate(cls.classes)}

    @property
    def targets(self):
        """Labels of the loaded split."""
        if self.train:
            return self.train_labels
        return self.test_labels

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set
        self.use_cuda = torch.cuda.is_available()

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        if self.train:
            self.train_data, self.train_labels = self._load_split(self.training_file)
        else:
            self.test_data, self.test_labels = self._load_split(self.test_file)

    def _load_split(self, file_name):
        """Load one processed file and return ``(samples, labels)`` tensors."""
        samples, labels = torch.load(
            os.path.join(self.root, self.processed_folder, file_name))
        # Flatten each image to a vector; 0.02 is the scaling used by this
        # project (instead of the more common division by 255).
        samples = samples.view(samples.size(0), -1).float() * 0.02
        labels = labels.int()
        if self.use_cuda:
            samples = samples.cuda()
            labels = labels.cuda()
        return samples, labels

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.train:
            return self.train_data[index], self.train_labels[index]
        return self.test_data[index], self.test_labels[index]

    def __len__(self):
        if self.train:
            return len(self.train_data)
        return len(self.test_data)

    def _check_exists(self):
        """Return True when both processed split files are present."""
        processed_dir = os.path.join(self.root, self.processed_folder)
        return os.path.exists(os.path.join(processed_dir, self.training_file)) and \
            os.path.exists(os.path.join(processed_dir, self.test_file))

    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""
        import shutil
        import urllib.request

        if self._check_exists():
            return

        raw_dir = os.path.join(self.root, self.raw_folder)
        processed_dir = os.path.join(self.root, self.processed_folder)
        # Create each directory independently: with a shared try/except an
        # already existing raw folder would skip creating the processed one.
        os.makedirs(raw_dir, exist_ok=True)
        os.makedirs(processed_dir, exist_ok=True)

        for url in self.urls:
            print('Downloading ' + url)
            filename = url.rpartition('/')[2]
            archive_path = os.path.join(raw_dir, filename)
            # Stream to disk and make sure the HTTP response is closed.
            with urllib.request.urlopen(url) as response, \
                    open(archive_path, 'wb') as archive_f:
                shutil.copyfileobj(response, archive_f)
            if archive_path.endswith('.gz'):
                # Strip only the trailing suffix; str.replace would also
                # rewrite a '.gz' occurring elsewhere in the path.
                with gzip.open(archive_path, 'rb') as zip_f, \
                        open(archive_path[:-len('.gz')], 'wb') as out_f:
                    shutil.copyfileobj(zip_f, out_f)
                os.unlink(archive_path)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(raw_dir, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_dir, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_dir, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_dir, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += ' Number of datapoints: {}\n'.format(self.__len__())
        # Use truthiness so the label matches the split that was loaded.
        tmp = 'train' if self.train else 'test'
        fmt_str += ' Split: {}\n'.format(tmp)
        fmt_str += ' Root Location: {}\n'.format(self.root)
        tmp = ' Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = ' Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str
class Reuters(data.Dataset):
    """Pre-processed Reuters (10k, tf-idf) dataset held fully in memory.

    Each split is a ``.npy`` file containing a pickled dict with the keys
    ``'data'`` and ``'label'``.  To obtain the files run the script named
    "download_data.sh".

    Depending on ``train`` the instance exposes either ``train_data`` /
    ``train_labels`` or ``test_data`` / ``test_labels`` (float32 / int32
    tensors, moved to the GPU when CUDA is available).

    Args:
        root (string): Directory containing the ``.npy`` files.
        train (bool, optional): Load the training split if True, else the
            test split.
        transform (callable, optional): Stored on the instance; not applied
            by ``__getitem__``.
        target_transform (callable, optional): Stored on the instance; not
            applied by ``__getitem__``.
        download (bool, optional): If True, verify that the file for the
            requested split is present (see ``download``).
    """
    training_file = "reutersidf10k_train.npy"
    test_file = "reutersidf10k_test.npy"

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set
        self.use_cuda = torch.cuda.is_available()

        if download:
            self.download()

        if self.train:
            self.train_data, self.train_labels = self._load_split(self.training_file)
        else:
            self.test_data, self.test_labels = self._load_split(self.test_file)

    def _load_split(self, file_name):
        """Read one ``.npy`` split and return ``(samples, labels)`` tensors."""
        path = os.path.join(self.root, file_name)
        # The file stores a pickled dict, which NumPy >= 1.16.3 refuses to
        # load unless allow_pickle=True is passed explicitly.
        # NOTE(security): unpickling can execute arbitrary code -- only load
        # files obtained from a trusted source.
        content = np.load(path, allow_pickle=True).item()
        samples = torch.tensor(content['data'], dtype=torch.float32)
        labels = torch.tensor(content['label'], dtype=torch.int)
        if self.use_cuda:
            samples = samples.cuda()
            labels = labels.cuda()
        return samples, labels

    def download(self):
        """Check that the requested split is available locally.

        The class has no downloader of its own (previously ``download=True``
        failed with ``AttributeError``); the data is fetched by
        "download_data.sh".

        Raises:
            RuntimeError: If the file for the requested split is missing.
        """
        file_name = self.training_file if self.train else self.test_file
        path = os.path.join(self.root, file_name)
        if not os.path.exists(path):
            raise RuntimeError(
                'Dataset file not found: ' + path +
                '. Please run the script named as "download_data.sh".')

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.train:
            return self.train_data[index], self.train_labels[index]
        return self.test_data[index], self.test_labels[index]

    def __len__(self):
        if self.train:
            return len(self.train_data)
        return len(self.test_data)
class IDEC(nn.Module):
    """Improved Deep Embedded Clustering autoencoder with constraint losses.

    The network is an MLP autoencoder (``encoder`` -> ``_enc_mu`` ->
    ``decoder`` -> ``_dec``) together with ``n_clusters`` trainable centroids
    ``mu`` in the ``z_dim``-dimensional latent space.

    Args:
        input_dim: Size of one flattened input sample.
        z_dim: Size of the latent embedding.
        n_clusters: Number of cluster centroids.
        encodeLayer: Hidden layer widths of the encoder (default ``[400]``).
        decodeLayer: Hidden layer widths of the decoder (default ``[400]``).
        activation: Forwarded to ``buildNetwork``.
        dropout: Forwarded to ``buildNetwork``.
        alpha: Degrees of freedom of the Student's t kernel in ``soft_assign``.
        gamma: Weight of the KL clustering loss in ``cluster_loss``.
    """

    def __init__(self, input_dim=784, z_dim=10, n_clusters=10,
                 encodeLayer=None, decodeLayer=None, activation="relu",
                 dropout=0, alpha=1., gamma=0.1):
        # Zero-argument super(): super(self.__class__, self) recurses
        # infinitely as soon as the class is subclassed.
        super().__init__()
        encodeLayer = [400] if encodeLayer is None else list(encodeLayer)
        decodeLayer = [400] if decodeLayer is None else list(decodeLayer)
        self.z_dim = z_dim
        self.layers = [input_dim] + encodeLayer + [z_dim]
        self.activation = activation
        self.dropout = dropout
        self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
        self.decoder = buildNetwork([z_dim] + decodeLayer, activation=activation, dropout=dropout)
        self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)
        self._dec = nn.Linear(decodeLayer[-1], input_dim)

        self.n_clusters = n_clusters
        self.alpha = alpha
        self.gamma = gamma
        # Zeros instead of uninitialised memory: the centroids are overwritten
        # by k-means in ``fit`` but must not contain NaN/inf garbage before.
        self.mu = nn.Parameter(torch.zeros(n_clusters, z_dim))

    def save_model(self, path):
        """Write the state dict to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load every entry of the checkpoint whose key exists in this model.

        Keys that are absent from the checkpoint keep their current values.
        """
        pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
        model_dict = self.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)

    def forward(self, x):
        """Return ``(z, q, xrecon)``: embedding, soft assignment (NxK), reconstruction."""
        z = self._enc_mu(self.encoder(x))
        xrecon = self._dec(self.decoder(z))
        q = self.soft_assign(z)
        return z, q, xrecon

    def soft_assign(self, z):
        """Return the row-normalised Student's t similarity of ``z`` to ``mu``.

        ``q_ij`` is proportional to ``(1 + ||z_i - mu_j||^2 / alpha) ** (-(alpha + 1) / 2)``.
        """
        sq_dist = torch.sum((z.unsqueeze(1) - self.mu) ** 2, dim=2)
        q = 1.0 / (1.0 + sq_dist / self.alpha)
        # The exponent must be parenthesised: ``q**(alpha+1.0)/2.0`` parses as
        # ``(q**(alpha+1.0)) / 2.0`` and silently used the exponent alpha+1.
        q = q ** ((self.alpha + 1.0) / 2.0)
        return q / torch.sum(q, dim=1, keepdim=True)

    def encodeBatch(self, X, batch_size=256):
        """Embed ``X`` batch by batch and return the concatenated embeddings.

        Switches the module to eval mode (callers that keep training must
        call ``train()`` again) and moves it to the GPU when available.
        """
        if torch.cuda.is_available():
            self.cuda()
        device = self.mu.device

        self.eval()
        num = X.shape[0]
        encoded = []
        with torch.no_grad():  # inference only: do not build autograd graphs
            for start in range(0, num, batch_size):
                xbatch = X[start:start + batch_size].to(device)
                encoded.append(self._enc_mu(self.encoder(xbatch)))
        return torch.cat(encoded, dim=0)

    def cluster_loss(self, p, q):
        """Return ``gamma * KL(p || q)`` averaged over the batch."""
        kldloss = torch.mean(torch.sum(p * torch.log(p / (q + 1e-6)), dim=1))
        return self.gamma * kldloss

    def recon_loss(self, x, xrecon):
        """Mean squared reconstruction error."""
        return torch.mean((xrecon - x) ** 2)

    def pairwise_loss(self, p1, p2, cons_type):
        """Must-link (``"ML"``) or cannot-link (any other value) loss.

        Uses the inner product of the two assignment distributions; the
        argument of the logarithm is clamped away from zero so that degenerate
        assignments give a large finite loss instead of ``inf``.
        """
        eps = 1e-12
        agreement = torch.sum(p1 * p2, dim=1)
        if cons_type == "ML":
            return torch.mean(-torch.log(torch.clamp(agreement, min=eps)))
        return torch.mean(-torch.log(torch.clamp(1.0 - agreement, min=eps)))

    def global_size_loss(self, p, cons_detail):
        """Squared distance between normalised mean assignment and ``cons_detail``."""
        m_p = torch.mean(p, dim=0)
        m_p = m_p / torch.sum(m_p)
        return torch.sum((m_p - cons_detail) * (m_p - cons_detail))

    def difficulty_loss(self, q, mask):
        """Return ``-0.1 * ||q * mask||_2`` with ``mask`` broadcast over clusters.

        ``mask`` holds one weight per sample and is no longer modified in place.
        """
        mask_q = q * mask.unsqueeze(-1)
        penalty_degree = 0.1
        return penalty_degree * -torch.norm(mask_q, 2)

    def target_distribution(self, q):
        """Sharpened target distribution ``p`` derived from ``q``."""
        p = q ** 2 / torch.sum(q, dim=0)
        return p / torch.sum(p, dim=1, keepdim=True)

    def triplet_loss(self, anchor, positive, negative, margin_constant):
        """Mean hinge loss ``max(s(a, n) - s(a, p) + margin, 0)``.

        ``s(x, y)`` is the inner product of the assignment vectors.  Works on
        whatever device the inputs live on (the original required CUDA).
        """
        negative_dis = torch.sum(anchor * negative, dim=1)
        positive_dis = torch.sum(anchor * positive, dim=1)
        penalty = negative_dis - positive_dis + margin_constant
        return torch.mean(torch.clamp(penalty, min=0))

    def satisfied_constraints(self, ml_ind1, ml_ind2, cl_ind1, cl_ind2, y_pred):
        """Return the fraction of pairwise constraints satisfied by ``y_pred``.

        Returns the sentinel ``1.1`` when there are no constraints at all.
        """
        count = 0
        satisfied = 0
        for (i, j) in zip(ml_ind1, ml_ind2):
            count += 1
            if y_pred[i] == y_pred[j]:
                satisfied += 1
        for (i, j) in zip(cl_ind1, cl_ind2):
            count += 1
            if y_pred[i] != y_pred[j]:
                satisfied += 1

        if count == 0:
            return 1.1
        return float(satisfied) / count

    def predict(self, X, y):
        """Cluster ``X`` and return ``(accuracy, nmi)`` against labels ``y``.

        Returns ``(None, None)`` when ``y`` is None (previously this crashed).
        """
        if torch.cuda.is_available():
            self.cuda()
        latent = self.encodeBatch(X)
        with torch.no_grad():
            q = self.soft_assign(latent)

        if y is None:
            return None, None

        # evaluate the clustering performance
        y_pred = torch.argmax(q, dim=1).cpu().numpy()
        y = y.data.cpu().numpy()
        final_acc = acc(y, y_pred)
        final_nmi = normalized_mutual_info_score(y, y_pred)
        print("acc: %.5f, nmi: %.5f" % (final_acc, final_nmi))
        return final_acc, final_nmi

    def _save_plot_snapshot(self, plotting, epoch, latent, y, y_pred, intermediate_results):
        """Dump the latent space and running metrics into the ``plotting`` folder."""
        df = pd.DataFrame(latent.cpu().numpy())
        df["y"] = y
        df.to_pickle(os.path.join(plotting, "save_model_%d.pkl" % epoch))

        if y is not None:
            intermediate_results["acc"][str(epoch)] = acc(y, y_pred)
            intermediate_results["nmi"][str(epoch)] = normalized_mutual_info_score(y, y_pred)
        with open(os.path.join(plotting, "intermediate_results.json"), "w") as fp:
            json.dump(intermediate_results, fp)

    def fit(self, anchor, positive, negative, ml_ind1, ml_ind2, cl_ind1, cl_ind2,
            mask, use_global, ml_p, cl_p, X, y=None, lr=0.001, batch_size=256,
            num_epochs=10, update_interval=1, tol=1e-3, use_kmeans=True,
            plotting="", clustering_loss_weight=1):
        """Train the clustering model with optional constraints.

        Args:
            anchor, positive, negative: Index arrays of triplet constraints.
            ml_ind1, ml_ind2: Index arrays of must-link pairs.
            cl_ind1, cl_ind2: Index arrays of cannot-link pairs.
            mask: Per-sample weights for ``difficulty_loss`` (used when
                ``use_global`` is false).
            use_global: If true, use the cluster-size loss with a uniform
                prior over clusters instead of the instance loss.
            ml_p, cl_p: Weights of the must-link / cannot-link losses.
                (0.1 for mnist/reuters, 1 for fashion; tuned via grid search
                on a validation set.)
            X: Tensor of training samples.
            y: Optional tensor of ground-truth labels, used for reporting.
            lr, batch_size, num_epochs: Adam learning rate and loop sizes.
            update_interval: Recompute the target distribution every this
                many epochs.
            tol: Stop when the fraction of changed labels drops below it.
            use_kmeans: Full k-means init (True) or a single k-means
                iteration (False).
            plotting: Folder for intermediate dumps; empty disables them.
            clustering_loss_weight: Weight used in the *reported* total loss.
                NOTE(review): it is not applied to the optimised loss, same
                as before -- confirm whether that is intended.

        Returns:
            ``(final_acc, final_nmi, final_epoch)`` from the last evaluation
            (zeros when ``y`` is None).
        """
        # save intermediate results for plotting
        intermediate_results = collections.defaultdict(dict)

        if torch.cuda.is_available():
            self.cuda()
        device = self.mu.device
        X = X.to(device)
        mask = mask.to(device)
        print("=====Training IDEC=======")
        optimizer = torch.optim.Adam(
            [param for param in self.parameters() if param.requires_grad], lr=lr)

        if use_kmeans:
            print("Initializing cluster centers with kmeans.")
            kmeans = KMeans(self.n_clusters, n_init=20)
        else:
            # a single k-means iteration acts as a random initialisation
            print("Randomly initializing cluster centers.")
            kmeans = KMeans(self.n_clusters, n_init=1, max_iter=1)
        y_pred = kmeans.fit_predict(self.encodeBatch(X).cpu().numpy())
        y_pred_last = y_pred
        self.mu.data.copy_(torch.as_tensor(kmeans.cluster_centers_, dtype=self.mu.dtype))

        if y is not None:
            y = y.cpu().numpy()

        # Uniform prior over clusters for the global size loss; built once on
        # the model's device instead of a hard-coded 10-entry CUDA tensor.
        global_cons = torch.full((self.n_clusters,), 1.0 / self.n_clusters, device=device)

        self.train()
        num = X.shape[0]
        num_batch = int(math.ceil(1.0 * num / batch_size))
        ml_num = ml_ind1.shape[0]
        cl_num = cl_ind1.shape[0]
        tri_num = anchor.shape[0]

        final_acc, final_nmi, final_epoch = 0, 0, 0
        p = None
        for epoch in range(num_epochs):
            if epoch % update_interval == 0:
                # update the target distribution p
                latent = self.encodeBatch(X)
                with torch.no_grad():
                    q = self.soft_assign(latent)
                    p = self.target_distribution(q)
                # encodeBatch switched to eval mode; resume training mode.
                self.train()

                # evaluate the clustering performance
                y_pred = torch.argmax(q, dim=1).cpu().numpy()
                if use_global and y is not None:
                    for key, total in collections.Counter(y_pred.tolist()).items():
                        print("predicted class: ", key, " total: ", total)

                if y is not None:
                    final_acc = acc(y, y_pred)
                    final_nmi = normalized_mutual_info_score(y, y_pred)
                    final_epoch = epoch
                    print("acc: %.5f, nmi: %.5f" % (final_acc, final_nmi))
                    print("satisfied constraints: %.5f" % self.satisfied_constraints(
                        ml_ind1, ml_ind2, cl_ind1, cl_ind2, y_pred))

                # save model for plotting
                if plotting and (epoch in (10, 20, 30, 40) or epoch % 50 == 0
                                 or epoch == num_epochs - 1):
                    self._save_plot_snapshot(plotting, epoch, latent, y, y_pred,
                                             intermediate_results)

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / num
                y_pred_last = y_pred
                if epoch > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print("Reach tolerance threshold. Stopping training.")
                    if plotting:
                        # Best effort: a failed dump must not prevent stopping
                        # (the former bare ``except`` swallowed the break too).
                        try:
                            self._save_plot_snapshot(plotting, epoch, latent, y, y_pred,
                                                     intermediate_results)
                        except Exception as exc:
                            print("Could not save plotting snapshot:", exc)
                    break

            # train 1 epoch for clustering loss
            cluster_loss_val = 0.0
            recon_loss_val = 0.0
            instance_constraints_loss_val = 0.0
            global_loss_val = 0.0
            for start in range(0, num, batch_size):
                inputs = X[start:start + batch_size]
                target = p[start:start + batch_size]
                optimizer.zero_grad()

                _, qbatch, xrecon = self(inputs)
                cluster_loss = self.cluster_loss(target, qbatch)
                recon_loss = self.recon_loss(inputs, xrecon)
                if use_global:
                    extra_loss = self.global_size_loss(qbatch, global_cons)
                    # previously never accumulated, so it was never reported
                    global_loss_val += extra_loss.detach()
                else:
                    extra_loss = self.difficulty_loss(qbatch, mask[start:start + batch_size])
                    instance_constraints_loss_val += extra_loss.detach() * len(inputs)
                loss = cluster_loss + recon_loss + extra_loss
                loss.backward()
                optimizer.step()
                cluster_loss_val += cluster_loss.detach() * len(inputs)
                recon_loss_val += recon_loss.detach() * len(inputs)
            train_loss = (clustering_loss_weight * cluster_loss_val + recon_loss_val
                          + instance_constraints_loss_val)

            if instance_constraints_loss_val != 0.0:
                print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f Instance Difficulty Loss: %.4f" % (
                    epoch + 1, train_loss / num, cluster_loss_val / num, recon_loss_val / num, instance_constraints_loss_val / num))
            elif global_loss_val != 0.0 and use_global:
                print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f Global Loss: %.4f" % (
                    epoch + 1, train_loss / num + global_loss_val / num_batch, cluster_loss_val / num, recon_loss_val / num, global_loss_val / num_batch))
            else:
                print("#Epoch %3d: Total: %.4f Clustering Loss: %.4f Reconstruction Loss: %.4f" % (
                    epoch + 1, train_loss / num, cluster_loss_val / num, recon_loss_val / num))

            # must-link constraints
            ml_loss = 0.0
            for start in range(0, ml_num, batch_size):
                inputs1 = X[ml_ind1[start:start + batch_size]]
                inputs2 = X[ml_ind2[start:start + batch_size]]
                optimizer.zero_grad()
                _, q1, xr1 = self(inputs1)
                _, q2, xr2 = self(inputs2)
                loss = (ml_p * self.pairwise_loss(q1, q2, "ML")
                        + self.recon_loss(inputs1, xr1) + self.recon_loss(inputs2, xr2))
                ml_loss += loss.detach()
                loss.backward()
                optimizer.step()

            # cannot-link constraints
            cl_loss = 0.0
            for start in range(0, cl_num, batch_size):
                inputs1 = X[cl_ind1[start:start + batch_size]]
                inputs2 = X[cl_ind2[start:start + batch_size]]
                optimizer.zero_grad()
                _, q1, _ = self(inputs1)
                _, q2, _ = self(inputs2)
                loss = cl_p * self.pairwise_loss(q1, q2, "CL")
                cl_loss += loss.detach()
                loss.backward()
                optimizer.step()

            if ml_num > 0 and cl_num > 0:
                print("Pairwise Total:", round(float(ml_loss), 2) + float(cl_loss),
                      "ML loss", float(ml_loss), "CL loss:", float(cl_loss))

            # triplet constraints
            triplet_loss = 0.0
            for start in range(0, tri_num, batch_size):
                optimizer.zero_grad()
                _, q1, _ = self(X[anchor[start:start + batch_size]])
                _, q2, _ = self(X[positive[start:start + batch_size]])
                _, q3, _ = self(X[negative[start:start + batch_size]])
                loss = self.triplet_loss(q1, q2, q3, 0.1)
                triplet_loss += loss.detach()
                loss.backward()
                optimizer.step()
            if tri_num > 0:
                print("Triplet Loss:", triplet_loss)
        return final_acc, final_nmi, final_epoch
class DEC(nn.Module):
    """Deep Embedded Clustering: an MLP encoder plus a soft cluster layer.

    ``forward`` maps an input batch to an embedding ``z`` and to soft
    assignments ``q`` over ``n_clusters`` centroids stored in ``self.mu``.
    ``self.mu`` is allocated uninitialised; ``fit`` fills it from k-means.
    """

    def __init__(self, input_dim=784, z_dim=10, n_clusters=10,
                 encodeLayer=None, activation="relu", dropout=0, alpha=1.):
        """Build the encoder and allocate the centroid parameter.

        ``encodeLayer`` lists the hidden widths of the encoder and defaults
        to ``[400]``; ``alpha`` is the degrees of freedom used in ``forward``.
        """
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()
        # None sentinel instead of a shared mutable default list.
        encodeLayer = [400] if encodeLayer is None else list(encodeLayer)
        self.z_dim = z_dim
        self.layers = [input_dim] + encodeLayer + [z_dim]
        self.activation = activation
        self.dropout = dropout
        self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)  # f(x) = h
        self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)  # h -> z

        self.n_clusters = n_clusters
        self.alpha = alpha
        # Cluster centres, one row per cluster (values set later by fit).
        self.mu = Parameter(torch.Tensor(n_clusters, z_dim))

    def save_model(self, path):
        """Write this module's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load the entries of the checkpoint at ``path`` whose keys exist here.

        Keys present in the checkpoint but not in this model are ignored;
        tensors are mapped to CPU storage while loading.
        """
        pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
        model_dict = self.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)

    def forward(self, x):
        """Return ``(z, q)``: the embedding and the N x K soft assignments.

        ``q`` is the row-normalised Student's t kernel
        ``(1 + ||z - mu||^2 / alpha) ** (-(alpha + 1) / 2)``.
        """
        h = self.encoder(x)
        z = self._enc_mu(h)
        dist_sq = torch.sum((z.unsqueeze(1) - self.mu) ** 2, dim=2)
        q = 1.0 / (1.0 + dist_sq / self.alpha)
        # The whole (alpha + 1) / 2 is the exponent.  The previous
        # ``q**(alpha+1.0)/2.0`` parsed as ``(q**(alpha+1)) / 2``.
        q = q ** ((self.alpha + 1.0) / 2.0)
        q = q / torch.sum(q, dim=1, keepdim=True)
        return z, q

    def encodeBatch(self, dataloader, islabel=False):
        """Embed every batch yielded by ``dataloader``.

        Returns the concatenated embeddings on CPU, or an
        ``(embeddings, labels)`` tuple when ``islabel`` is true.  Moves the
        model to CUDA when available and leaves it in eval mode.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        encoded = []
        ylabels = []
        self.eval()
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for inputs, labels in dataloader:
                z, _ = self.forward(inputs)
                encoded.append(z.cpu())
                ylabels.append(labels)

        encoded = torch.cat(encoded, dim=0)
        ylabels = torch.cat(ylabels)
        if islabel:
            return encoded, ylabels
        return encoded

    def loss_function(self, p, q):
        """Return the batch-mean KL divergence KL(p || q)."""
        def kld(target, pred):
            # 1e-6 keeps the ratio finite when a predicted probability is 0.
            return torch.mean(torch.sum(target*torch.log(target/(pred+1e-6)), dim=1))

        return kld(p, q)

    def target_distribution(self, q):
        """Return the sharpened target ``p``: ``q**2`` over per-cluster mass, row-normalised."""
        p = q**2 / torch.sum(q, dim=0)
        p = p / torch.sum(p, dim=1, keepdim=True)
        return p

    def fit(self, X, y=None, lr=0.001, batch_size=256, num_epochs=10, update_interval=1, tol=1e-3):
        """Train on the tensor ``X`` by matching ``q`` to the target ``p``.

        Centroids are initialised with k-means on the current embeddings.
        Every ``update_interval`` epochs the target distribution is
        recomputed, accuracy/NMI are printed when labels ``y`` are given,
        and training stops once the fraction of changed hard assignments
        drops below ``tol``.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Training DEC=======")
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)

        print("Initializing cluster centers with kmeans.")
        kmeans = KMeans(self.n_clusters, n_init=20)
        with torch.no_grad():
            data, _ = self.forward(X)
        y_pred = kmeans.fit_predict(data.cpu().numpy())
        y_pred_last = y_pred
        self.mu.data.copy_(torch.Tensor(kmeans.cluster_centers_))
        if y is not None:
            y = y.cpu().numpy()
            print("Kmeans acc: %.5f, nmi: %.5f" % (acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))

        self.train()
        num = X.shape[0]
        num_batch = int(math.ceil(1.0*X.shape[0]/batch_size))
        print("num_batches:", num_batch)
        for epoch in range(num_epochs):
            if epoch % update_interval == 0:
                # Refresh the target distribution p; it is a fixed target,
                # so it is computed without tracking gradients.
                with torch.no_grad():
                    _, q = self.forward(X)
                    p = self.target_distribution(q)

                # Evaluate the current hard assignments.
                y_pred = torch.argmax(q, dim=1).cpu().numpy()
                if y is not None:
                    print("epoch: %.5f, acc: %.5f, nmi: %.5f" % (epoch, acc(y, y_pred), normalized_mutual_info_score(y, y_pred)))

                # Stop when (almost) no sample changed cluster.
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / num
                y_pred_last = y_pred
                if epoch > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print("Reach tolerance threshold. Stopping training.")
                    break

            # Train one epoch over contiguous mini-batches.
            for batch_idx in range(num_batch):
                start = batch_idx * batch_size
                stop = min(start + batch_size, num)
                xbatch = X[start:stop]
                pbatch = p[start:stop]

                optimizer.zero_grad()
                _, qbatch = self.forward(xbatch)
                loss = self.loss_function(pbatch, qbatch)
                loss.backward()
                optimizer.step()
def adjust_learning_rate(init_lr, optimizer, epoch):
    """Set every param group's rate to ``init_lr * 0.1 ** (epoch // 100)``.

    Groups already at that rate are left untouched.  The switch is
    announced on stdout at most once per call, however many groups change.
    """
    target_lr = init_lr * (0.1 ** (epoch//100))
    announced = False
    for group in optimizer.param_groups:
        if group["lr"] == target_lr:
            continue
        group["lr"] = target_lr
        if not announced:
            print("Switching to learning rate %f" % target_lr)
            announced = True
class DenoisingAutoencoder(nn.Module):
    """One-layer denoising autoencoder.

    The encoder is ``dropout(act(x @ weight.T + bias))``; the decoder is
    ``h @ deweight.T + vbias`` with an optional sigmoid on top.
    """

    def __init__(self, in_features, out_features, activation="relu",
                 dropout=0.2, tied=False):
        """Allocate and initialise the encoder/decoder parameters.

        ``activation`` must be ``"relu"``, ``"sigmoid"`` or ``"none"``;
        any other value raises ``ValueError``.
        """
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if tied:
            # NOTE(review): this stores a transposed view as a plain tensor
            # attribute, not a registered parameter - verify before relying
            # on tied=True (e.g. after moving the module to another device).
            self.deweight = self.weight.t()
        else:
            self.deweight = Parameter(torch.Tensor(in_features, out_features))
        self.bias = Parameter(torch.Tensor(out_features))
        self.vbias = Parameter(torch.Tensor(in_features))

        if activation == "relu":
            self.enc_act_func = nn.ReLU()
        elif activation == "sigmoid":
            self.enc_act_func = nn.Sigmoid()
        elif activation == "none":
            self.enc_act_func = None
        else:
            # Previously this left enc_act_func unset and failed later with
            # an AttributeError on the first forward pass.
            raise ValueError("unknown activation: %r" % (activation,))
        self.dropout = nn.Dropout(p=dropout)

        self.reset_parameters()

    def reset_parameters(self):
        """Draw all weights and biases uniformly from [-0.01, 0.01]."""
        stdv = 0.01
        self.weight.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)
        self.deweight.data.uniform_(-stdv, stdv)
        self.vbias.data.uniform_(-stdv, stdv)

    def _hidden(self, x):
        """Apply the linear map, the optional activation, then dropout."""
        pre = F.linear(x, self.weight, self.bias)
        if self.enc_act_func is not None:
            pre = self.enc_act_func(pre)
        return self.dropout(pre)

    def forward(self, x):
        """Encode ``x`` using the dropout layer's current train/eval mode."""
        return self._hidden(x)

    def encode(self, x, train=True):
        """Encode ``x``, first switching dropout to train or eval mode."""
        if train:
            self.dropout.train()
        else:
            self.dropout.eval()
        return self._hidden(x)

    def encodeBatch(self, dataloader):
        """Encode every batch of ``dataloader`` without dropout.

        Inputs are flattened to 2-D floats (and moved to CUDA when
        available); the concatenated codes are returned on CPU.
        """
        use_cuda = torch.cuda.is_available()
        encoded = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for inputs, _ in dataloader:
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                hidden = self.encode(inputs, train=False)
                encoded.append(hidden.cpu())

        encoded = torch.cat(encoded, dim=0)
        return encoded

    def decode(self, x, binary=False):
        """Reconstruct from a code; apply a sigmoid when ``binary`` is true."""
        out = F.linear(x, self.deweight, self.vbias)
        if binary:
            # torch.sigmoid replaces the deprecated F.sigmoid.
            out = torch.sigmoid(out)
        return out

    def fit(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.3,
            loss_type="mse"):
        """Train with SGD to reconstruct clean inputs from corrupted ones.

        Both loaders must yield ``(inputs, _)`` pairs.  ``loss_type`` is
        ``"mse"`` or ``"cross-entropy"`` (the latter decodes through a
        sigmoid); anything else raises ``ValueError``.  ``batch_size`` is
        accepted for interface compatibility but not used here.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Denoising Autoencoding layer=======")
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
        if loss_type == "mse":
            criterion = MSELoss()
        elif loss_type == "cross-entropy":
            criterion = BCELoss()
        else:
            # Previously an unknown loss surfaced later as a NameError.
            raise ValueError("unknown loss_type: %r" % (loss_type,))
        binary = loss_type == "cross-entropy"

        # Validation before any training.  Dropout is disabled here; the
        # previous code used encode()'s default train=True, so the epoch-0
        # figure was computed on randomly dropped codes.
        total_loss = 0.0
        total_num = 0
        with torch.no_grad():
            for inputs, _ in validloader:
                hidden = self.encode(inputs, train=False)
                outputs = self.decode(hidden, binary=binary)
                total_loss += criterion(outputs, inputs).item() * len(inputs)
                total_num += inputs.size()[0]

        valid_loss = total_loss / total_num
        print("#Epoch 0: Valid Reconstruct Loss: %.4f" % (valid_loss))

        self.train()
        for epoch in range(num_epochs):
            # Train one epoch.
            train_loss = 0.0
            adjust_learning_rate(lr, optimizer, epoch)
            for inputs, _ in trainloader:
                inputs_corr = masking_noise(inputs, corrupt)
                optimizer.zero_grad()

                # encode() re-enables dropout (train=True by default).
                hidden = self.encode(inputs_corr)
                outputs = self.decode(hidden, binary=binary)
                recon_loss = criterion(outputs, inputs)
                train_loss += recon_loss.item() * len(inputs)
                recon_loss.backward()
                optimizer.step()

            # Validate without dropout.
            valid_loss = 0.0
            with torch.no_grad():
                for inputs, _ in validloader:
                    hidden = self.encode(inputs, train=False)
                    outputs = self.decode(hidden, binary=binary)
                    valid_loss += criterion(outputs, inputs).item() * len(inputs)

            print("#Epoch %3d: Reconstruct Loss: %.4f, Valid Reconstruct Loss: %.4f" % (
                epoch+1, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset)))

    def extra_repr(self):
        """Return the feature sizes and bias flag shown in ``repr(module)``."""
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
class MSELoss(nn.Module):
    """Half the mean squared error over all elements."""

    def __init__(self):
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()

    def forward(self, input, target):
        """Return ``0.5 * mean((input - target) ** 2)``."""
        return 0.5 * torch.mean((input-target)**2)


class BCELoss(nn.Module):
    """Binary cross-entropy summed over dim 1, then averaged over the batch."""

    def __init__(self):
        # See MSELoss.__init__ for why zero-argument super() is used.
        super().__init__()

    def forward(self, input, target):
        """Return the mean per-sample binary cross-entropy.

        Probabilities are clamped to at least 1e-10 before the logarithm so
        that exact 0/1 predictions do not produce ``-inf``.
        """
        log_p = torch.log(torch.clamp(input, min=1e-10))
        log_not_p = torch.log(torch.clamp(1-input, min=1e-10))
        return -torch.mean(torch.sum(target*log_p + (1-target)*log_not_p, 1))
def buildNetwork(layers, activation="relu", dropout=0):
    """Return an ``nn.Sequential`` of linear layers sized by ``layers``.

    Layer ``i`` maps ``layers[i-1]`` to ``layers[i]`` features.  It is
    followed by ReLU (``activation == "relu"``) or Sigmoid
    (``activation == "sigmoid"``), or by nothing for any other value, and
    then by a Dropout module when ``dropout`` is greater than zero.
    """
    stack = []
    for idx in range(len(layers) - 1):
        group = [nn.Linear(layers[idx], layers[idx + 1])]
        if activation == "relu":
            group.append(nn.ReLU())
        elif activation == "sigmoid":
            group.append(nn.Sigmoid())
        if dropout > 0:
            group.append(nn.Dropout(dropout))
        stack.extend(group)
    return nn.Sequential(*stack)


def adjust_learning_rate(init_lr, optimizer, epoch):
    """Decay the rate tenfold per 100 epochs and write it into ``optimizer``.

    Only param groups whose rate differs are updated, and a single
    message is printed per call when at least one group changed.
    """
    new_lr = init_lr * (0.1 ** (epoch//100))
    stale = [group for group in optimizer.param_groups if group["lr"] != new_lr]
    for group in stale:
        group["lr"] = new_lr
    if stale:
        print("Switching to learning rate %f" % new_lr)
class StackedDAE(nn.Module):
    """Stacked denoising autoencoder with an MLP encoder and decoder.

    ``pretrain`` trains one ``DenoisingAutoencoder`` per layer and copies
    the learned weights into this network; ``fit`` then trains the whole
    stack end to end.
    """

    def __init__(self, input_dim=784, z_dim=10, binary=True,
                 encodeLayer=None, decodeLayer=None, activation="relu",
                 dropout=0, tied=False):
        """Build encoder, bottleneck, decoder and output layer.

        ``encodeLayer`` / ``decodeLayer`` list the hidden widths and both
        default to ``[400]``.  With ``binary`` the reconstruction goes
        through a sigmoid.  ``tied`` is accepted but not used here.
        """
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()
        # None sentinels instead of shared mutable default lists.
        encodeLayer = [400] if encodeLayer is None else list(encodeLayer)
        decodeLayer = [400] if decodeLayer is None else list(decodeLayer)
        self.z_dim = z_dim
        self.layers = [input_dim] + encodeLayer + [z_dim]
        self.activation = activation
        self.dropout = dropout
        self.encoder = buildNetwork([input_dim] + encodeLayer, activation=activation, dropout=dropout)
        self.decoder = buildNetwork([z_dim] + decodeLayer, activation=activation, dropout=dropout)
        self._enc_mu = nn.Linear(encodeLayer[-1], z_dim)

        self._dec = nn.Linear(decodeLayer[-1], input_dim)
        self._dec_act = None
        if binary:
            self._dec_act = nn.Sigmoid()

    def decode(self, z):
        """Map a code ``z`` back to input space."""
        h = self.decoder(z)
        x = self._dec(h)
        if self._dec_act is not None:
            x = self._dec_act(x)
        return x

    def loss_function(self, recon_x, x):
        """Return the binary cross-entropy, summed over dim 1 and batch-averaged.

        Probabilities are clamped to at least 1e-10 before the logarithm.
        """
        loss = -torch.mean(torch.sum(x*torch.log(torch.clamp(recon_x, min=1e-10)) +
                                     (1-x)*torch.log(torch.clamp(1-recon_x, min=1e-10)), 1))
        return loss

    def forward(self, x):
        """Return ``(z, reconstruction)`` for the batch ``x``."""
        h = self.encoder(x)
        z = self._enc_mu(h)
        return z, self.decode(z)

    def save_model(self, path):
        """Write this module's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load the entries of the checkpoint at ``path`` whose keys exist here.

        Keys present in the checkpoint but not in this model are ignored;
        tensors are mapped to CPU storage while loading.
        """
        pretrained_dict = torch.load(path, map_location=lambda storage, loc: storage)
        model_dict = self.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)

    def pretrain(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.2, loss_type="cross-entropy"):
        """Greedy layer-wise pretraining.

        One ``DenoisingAutoencoder`` is trained per consecutive pair in
        ``self.layers``.  Each trained layer encodes the current data, and
        those codes become the training data of the next layer.  The first
        layer uses ``loss_type``; deeper layers use cross-entropy for
        sigmoid activations and MSE otherwise.
        """
        trloader = trainloader
        valoader = validloader
        daeLayers = []
        last = len(self.layers) - 1
        for l in range(1, len(self.layers)):
            infeatures = self.layers[l-1]
            outfeatures = self.layers[l]
            if l != last:
                dae = DenoisingAutoencoder(infeatures, outfeatures, activation=self.activation, dropout=corrupt)
            else:
                # The bottleneck layer is linear and uses no dropout.
                dae = DenoisingAutoencoder(infeatures, outfeatures, activation="none", dropout=0)
            print(dae)
            if l == 1:
                layer_loss = loss_type
            elif self.activation == "sigmoid":
                layer_loss = "cross-entropy"
            else:
                layer_loss = "mse"
            dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type=layer_loss)
            data_x = dae.encodeBatch(trloader)
            valid_x = dae.encodeBatch(valoader)
            trainset = Dataset(data_x, data_x)
            trloader = torch.utils.data.DataLoader(
                trainset, batch_size=batch_size, shuffle=True, num_workers=0)
            validset = Dataset(valid_x, valid_x)
            valoader = torch.utils.data.DataLoader(
                validset, batch_size=1000, shuffle=False, num_workers=0)
            daeLayers.append(dae)

        self.copyParam(daeLayers)

    def copyParam(self, daeLayers):
        """Copy pretrained layer weights into the encoder and decoder.

        ``daeLayers[l]`` provides ``weight``/``bias`` (encoding direction)
        and ``deweight``/``vbias`` (decoding direction) for layer ``l``.
        The index arithmetic assumes each linear layer in ``encoder`` and
        ``decoder`` is followed by an activation, plus a dropout module
        when ``self.dropout`` is non-zero, and that the decoder's linear
        layers mirror the encoder's in reverse order.
        """
        # Modules per layer: Linear + activation (+ Dropout).
        every = 2 if self.dropout == 0 else 3

        # Input layer: first encoder Linear and the final output Linear.
        self.encoder[0].weight.data.copy_(daeLayers[0].weight.data)
        self.encoder[0].bias.data.copy_(daeLayers[0].bias.data)
        self._dec.weight.data.copy_(daeLayers[0].deweight.data)
        self._dec.bias.data.copy_(daeLayers[0].vbias.data)

        for l in range(1, len(self.layers)-2):
            # Encoder Linear number l.
            self.encoder[l*every].weight.data.copy_(daeLayers[l].weight.data)
            self.encoder[l*every].bias.data.copy_(daeLayers[l].bias.data)

            # Matching decoder Linear, counted from the end.  The previous
            # index -(l-1)*every-2 is only right for every == 2; with
            # dropout (every == 3) it selected an activation module.
            self.decoder[-l*every].weight.data.copy_(daeLayers[l].deweight.data)
            self.decoder[-l*every].bias.data.copy_(daeLayers[l].vbias.data)

        # Bottleneck (z) layer.
        self._enc_mu.weight.data.copy_(daeLayers[-1].weight.data)
        self._enc_mu.bias.data.copy_(daeLayers[-1].bias.data)
        self.decoder[0].weight.data.copy_(daeLayers[-1].deweight.data)
        self.decoder[0].bias.data.copy_(daeLayers[-1].vbias.data)

    def _validation_loss(self, validloader, criterion, use_cuda):
        """Return ``(sum of per-sample losses, sample count)`` in eval mode."""
        total_loss = 0.0
        total_num = 0
        self.eval()
        with torch.no_grad():
            for inputs, _ in validloader:
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                _, outputs = self.forward(inputs)
                total_loss += criterion(outputs, inputs).item() * len(inputs)
                total_num += inputs.size()[0]
        return total_loss, total_num

    def fit(self, trainloader, validloader, lr=0.001, num_epochs=10, corrupt=0.3,
            loss_type="mse"):
        """Train the whole stack end to end as a denoising autoencoder.

        Both loaders must yield ``(inputs, _)`` pairs; inputs are flattened
        to 2-D floats.  ``loss_type`` is ``"mse"`` or ``"cross-entropy"``;
        anything else raises ``ValueError``.  Validation runs in eval mode
        so that dropout does not distort the reported loss (it previously
        ran in training mode).
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Stacked Denoising Autoencoding Layer=======")
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
        if loss_type == "mse":
            criterion = MSELoss()
        elif loss_type == "cross-entropy":
            criterion = BCELoss()
        else:
            # Previously an unknown loss surfaced later as a NameError.
            raise ValueError("unknown loss_type: %r" % (loss_type,))

        # Validation before any training.
        total_loss, total_num = self._validation_loss(validloader, criterion, use_cuda)
        valid_loss = total_loss / total_num
        print("#Epoch 0: Valid Reconstruct Loss: %.4f" % (valid_loss))

        for epoch in range(num_epochs):
            # Train one epoch; validation left the model in eval mode.
            self.train()
            adjust_learning_rate(lr, optimizer, epoch)
            train_loss = 0.0
            for inputs, _ in trainloader:
                inputs = inputs.view(inputs.size(0), -1).float()
                inputs_corr = masking_noise(inputs, corrupt)
                if use_cuda:
                    inputs = inputs.cuda()
                    inputs_corr = inputs_corr.cuda()
                optimizer.zero_grad()

                _, outputs = self.forward(inputs_corr)
                recon_loss = criterion(outputs, inputs)
                train_loss += recon_loss.item() * len(inputs)
                recon_loss.backward()
                optimizer.step()

            valid_loss, _ = self._validation_loss(validloader, criterion, use_cuda)

            print("#Epoch %3d: Reconstruct Loss: %.4f, Valid Reconstruct Loss: %.4f" % (
                epoch+1, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset)))
        # Leave the module in training mode, as the original loop did.
        self.train()
def weights_xavier_init(m):
    """Xavier-initialise an ``nn.Linear``: uniform weights, zero bias.

    Modules that are not ``nn.Linear`` are left untouched, which makes the
    function suitable for ``model.apply(weights_xavier_init)``.
    """
    if isinstance(m, nn.Linear):
        # In-place initialisers; the variants without the trailing
        # underscore are deprecated.
        nn.init.xavier_uniform_(m.weight.data)
        # A layer built with bias=False has no bias tensor to reset.
        if m.bias is not None:
            nn.init.constant_(m.bias.data, 0)


class Dataset(data.Dataset):
    """In-memory dataset over a data tensor and a parallel label tensor.

    Both tensors are moved to CUDA at construction time when a GPU is
    available, so items come back already on the device.
    """

    def __init__(self, data, labels, transform=None, target_transform=None):
        """Store the tensors and the optional per-item transforms."""
        self.transform = transform
        self.target_transform = target_transform
        self.data = data
        self.labels = labels
        if torch.cuda.is_available():
            self.data = self.data.cuda()
            self.labels = self.labels.cuda()

    def __getitem__(self, index):
        """Return ``(item, label)`` at ``index`` after applying the transforms."""
        img, target = self.data[index], self.labels[index]
        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        """Return the number of items (length of the data tensor)."""
        return len(self.data)
def generate_mnist_triplets(y, num, embedding_path="../model/mnist_triplet_embedding.npy", margin=35):
    """Sample ``num`` random (anchor, positive, negative) index triplets.

    Indices are drawn uniformly from ``range(y.shape[0])``.  A draw is kept
    only when, in the trusted embedding loaded from ``embedding_path``, the
    anchor-negative distance exceeds the anchor-positive distance by more
    than ``margin``.  Only ``y.shape[0]`` is read from ``y``.

    To get the trusted embedding, run download_model.sh, or create your own
    by running the pairwise-constraints model with 100000 randomly
    generated constraints.

    The defaults reproduce the previously hard-coded path and margin (35
    was selected by grid search to give human-trusted positive/negative
    pairs).  Sampling is by rejection, so the call does not return if no
    triplet can satisfy the margin.

    Returns three ``np.ndarray`` objects: anchor, positive and negative
    indices.
    """
    embedding = np.load(embedding_path)
    n_points = y.shape[0]
    anchor_inds, pos_inds, neg_inds = [], [], []
    while num > 0:
        anchor = random.randint(0, n_points - 1)
        pos = random.randint(0, n_points - 1)
        neg = random.randint(0, n_points - 1)
        pos_distance = norm(embedding[anchor] - embedding[pos], 2)
        neg_distance = norm(embedding[anchor] - embedding[neg], 2)
        # Reject draws whose negative is not clearly farther than the positive.
        if neg_distance <= pos_distance + margin:
            continue
        anchor_inds.append(anchor)
        pos_inds.append(pos)
        neg_inds.append(neg)
        num -= 1
    return np.array(anchor_inds), np.array(pos_inds), np.array(neg_inds)
def generate_triplet_constraints_continuous(y, num, embedding_path="../model/fashion_triplet_embedding.npy", margin=80):
    """Sample ``num`` random (anchor, positive, negative) index triplets.

    Indices are drawn uniformly from ``range(y.shape[0])``.  A draw is kept
    only when, in the trusted embedding loaded from ``embedding_path``, the
    anchor-negative distance exceeds the anchor-positive distance by more
    than ``margin``.  Only ``y.shape[0]`` is read from ``y``.

    To get the trusted embedding, run download_model.sh, or create your own
    by running the pairwise-constraints model with 100000 randomly
    generated constraints.

    The defaults reproduce the previously hard-coded fashion embedding
    path and margin (80 was selected by grid search to give human-trusted
    positive/negative pairs).  Sampling is by rejection, so the call does
    not return if no triplet can satisfy the margin.

    Returns three ``np.ndarray`` objects: anchor, positive and negative
    indices.
    """
    embedding = np.load(embedding_path)
    n_points = y.shape[0]
    anchor_inds, pos_inds, neg_inds = [], [], []
    while num > 0:
        anchor = random.randint(0, n_points - 1)
        pos = random.randint(0, n_points - 1)
        neg = random.randint(0, n_points - 1)
        pos_distance = norm(embedding[anchor] - embedding[pos], 2)
        neg_distance = norm(embedding[anchor] - embedding[neg], 2)
        # Reject draws whose negative is not clearly farther than the positive.
        if neg_distance <= pos_distance + margin:
            continue
        anchor_inds.append(anchor)
        pos_inds.append(pos)
        neg_inds.append(neg)
        num -= 1
    return np.array(anchor_inds), np.array(pos_inds), np.array(neg_inds)
We recommend doing a clean installation of the requirements in a dedicated conda environment: 18 | ```bash 19 | conda create -n testenv python=3.6 20 | source activate testenv 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | If you don't want to do the above clean installation in a separate environment, you can also install the requirements directly with: 25 | ```bash 26 | pip install -r requirements.txt --no-index 27 | ``` 28 | 29 | **PyTorch**: Note that you need [PyTorch](https://pytorch.org/). We used version 1.0.0. If you use the above environment, PyTorch will be installed automatically from `requirements.txt`. 30 | 31 | 32 | ## Running Constrained Clustering Experiments 33 | 34 | While in the `deep_constrained_clustering` folder: 35 | 36 | #### Step 1: Download Pretrained Networks 37 | 38 | ``` 39 | sh download_model.sh 40 | ``` 41 | 42 | #### Step 2: Download Processed Reuters Data (optional; MNIST and Fashion are available in torchvision.datasets) 43 | 44 | ``` 45 | sh download_data.sh 46 | ``` 47 | 48 | ``` 49 | cd experiments/ 50 | ``` 51 | 52 | While in the `deep_constrained_clustering/experiments` folder: 53 | #### Step 3: Run Experimental Scripts to Reproduce Results 54 | 55 | ###### Option 1: Run Demo Pairwise Constraints Script 56 | 57 | To run the pairwise constrained clustering using pre-trained weights (AE features, 6000 constraints), do: 58 | ```bash 59 | python run_DCC_pairwise.py --data $DATA 60 | ``` 61 | 62 | The `--data` flag specifies the data set being used; the options are "MNIST", "Fashion" and "Reuters". 
 63 | 64 | To run the pairwise constrained clustering from raw features (without the pre-trained weights), do: 65 | ```bash 66 | python run_DCC_pairwise.py --data $DATA --without_pretrain 67 | ``` 68 | 69 | To run the pairwise constrained clustering without KMeans initialization, do: 70 | ```bash 71 | python run_DCC_pairwise.py --data $DATA --without_kmeans 72 | ``` 73 | 74 | To run the pairwise constrained clustering with noisy pairwise constraints, do: 75 | ```bash 76 | python run_DCC_pairwise.py --data $DATA --noisy $NOISE 77 | ``` 78 | 79 | The `--noisy` flag specifies the noise level; its value should be a positive float equal to the ratio of noisy constraints to ground-truth constraints. 80 | 81 | 82 | To save data for plotting, do: 83 | ```bash 84 | python run_DCC_pairwise.py --data $DATA --plotting 85 | ``` 86 | 87 | This will save the experiment data for plotting in folders under `./plotting`. 88 | 89 | To plot the results, do: 90 | ```bash 91 | python ./plotting/plot_pairewise.py 92 | ``` 93 | 94 | 95 | ###### Option 2: Run Demo Instance Constraints Script 96 | 97 | To run the instance difficulty constrained clustering, do: 98 | ```bash 99 | python run_DCC_instance.py --data $DATA 100 | ``` 101 | 102 | ###### Option 3: Run Demo Triplets Constraints Script 103 | 104 | To run the triplets constrained clustering (6000 constraints), do: 105 | ```bash 106 | python run_DCC_triplets.py --data $DATA 107 | ``` 108 | 109 | 110 | ###### Option 4: Run Demo Global Constraints Script 111 | 112 | To run the global size constrained clustering, do: 113 | ```bash 114 | python run_DCC_global.py --data $DATA 115 | ``` 116 | 117 | 118 | ###### Option 5: Run Demo Improved DEC Script 119 | 120 | To run the baseline Improved DEC, do: 121 | ```bash 122 | python run_improved_DEC.py --data $DATA 123 | ``` 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 
certifi==2018.11.29 2 | cffi==1.11.5 3 | numpy==1.15.4 4 | olefile==0.46 5 | Pillow==6.2.0 6 | pycparser==2.19 7 | scikit-learn==0.20.2 8 | scipy==1.1.0 9 | six==1.12.0 10 | torch==1.0.0 11 | torchvision==0.2.1 12 | --------------------------------------------------------------------------------