├── .gitignore ├── image_folder ├── accuracy.png ├── process.png ├── visualize.png └── visualization.png ├── requirments.txt ├── train.py ├── embedding_patent.py ├── preprocessing.py ├── README.md ├── utils.py ├── config.py ├── doc2vec.py └── dec.py /.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /image_folder/accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamLab/pdcde2018/HEAD/image_folder/accuracy.png -------------------------------------------------------------------------------- /image_folder/process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamLab/pdcde2018/HEAD/image_folder/process.png -------------------------------------------------------------------------------- /image_folder/visualize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamLab/pdcde2018/HEAD/image_folder/visualize.png -------------------------------------------------------------------------------- /image_folder/visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamLab/pdcde2018/HEAD/image_folder/visualization.png -------------------------------------------------------------------------------- /requirments.txt: -------------------------------------------------------------------------------- 1 | numpy==1.13.3 2 | pandas==0.23.0 3 | scikit-learn==0.19.1 4 | scipy==1.0.0 5 | tensorflow_gpu>=1.4.0 6 | keras>=2.2.0 7 | nltk>=3.3 8 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from dec import DeepEmbeddingClustering 2 | import numpy as np 3 | import pickle 4 | import config 5 | import utils 6 | import os 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | args = config.get_args() 12 | 13 | # embedding vector data 14 | data_path = os.path.join(args.save_embedding_vector, args.dataset) 15 | 16 | if os.path.isfile(data_path): 17 | data = pickle.load(open(data_path, "rb")) 18 | embedding_vector, label = data 19 | 20 | # normalize 21 | normalized_doc_embeddings = utils.normalization_vector(embedding_vector) 22 | 23 | # check accuracy utilized k-mean 24 | utils.check_kmean_accuracy(normalized_doc_embeddings, label) 25 | dec = DeepEmbeddingClustering(args, n_clusters=len(np.unique(label))) 26 | 27 | # greedy-layer wise auto-encoder 28 | dec.initialize(args, 29 | normalized_doc_embeddings, 30 | finetune_iters=args.finetune_iters, 31 | layerwise_pretrain_iters=args.layerwise_pretrain_iters) 32 | 33 | # update z space of patent document vector 34 | dec.cluster(args, x_data=normalized_doc_embeddings, y_data=label, test=args.task) 35 | 36 | else: 37 | 38 | print("embedding patent document first!") 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /embedding_patent.py: -------------------------------------------------------------------------------- 1 | import config 2 | import preprocessing 3 | import utils 4 | import doc2vec 5 | import numpy as np 6 | import os 7 | import pickle 8 | 9 | 10 | if __name__ == "__main__": 11 | 12 | args = config.get_args() 13 | 14 | # make directory 15 | 
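# utils.make_directory_doc creates the save_weight_path, save_log_path and save_embedding_vector
# directories if they do not exist (./checkpoint, ./train_log, ./embedding_vector by default)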
utils.make_directory_doc(args) 16 | 17 | # load dataset 18 | abstract, label = preprocessing.load_dataset(args) 19 | 20 | # stemming process. we used Snowball stemming of nltk package. 21 | abstract = preprocessing.stemming(abstract) 22 | 23 | # convert word text to idx. 24 | sequences, word2idx, vocab_size, instances = preprocessing.get_sequences(abstract, args) 25 | 26 | # get context words, target word and document idx 27 | context, target, document = preprocessing.get_trainable_data(sequences, instances, args) 28 | 29 | num_document = np.max(document)+1 30 | 31 | # model load and compile 32 | model = doc2vec.load_model(args, vocab_size, args.embedding_size, num_document) 33 | 34 | # get callbacks 35 | callbacks = utils.get_callbacks_doc(args) 36 | 37 | if not os.path.isfile(os.path.join(args.save_weight_path, "doc2vec_weights_{}.h5".format(args.dataset))): 38 | # train 39 | model.fit(x=[context, target, document], y=target, shuffle=True, 40 | batch_size=args.doc_batch_size, epochs=args.doc_epochs, callbacks=callbacks) 41 | 42 | # save patent abstract vector 43 | model.load_weights(os.path.join(args.save_weight_path, "doc2vec_weights_{}.h5".format(args.dataset))) 44 | embedding_vector = model.get_weights()[0] 45 | 46 | if not os.path.isfile(os.path.join(args.save_embedding_vector, args.dataset)): 47 | pickle.dump([embedding_vector, label], open(os.path.join(args.save_embedding_vector, args.dataset), "wb")) 48 | 49 | print("Finish embedding process!") 50 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import numpy as np 4 | 5 | from sklearn import preprocessing 6 | from nltk.stem.snowball import SnowballStemmer 7 | 8 | from keras.preprocessing.text import Tokenizer 9 | 10 | 11 | def load_dataset(args): 12 | 13 | df = pd.read_csv(os.path.join(args.dataset_path, "kpris_data.csv")) 14 | 15 | # label encoder 16 | le = preprocessing.LabelEncoder() 17 | 18 | # categories select 19 | if args.dataset != "5_categories": 20 | 21 | categories = args.dataset.split("_") 22 | target_index = np.where(np.isin(df.target.values, categories) == 1)[0] 23 | 24 | x_data = df.abstract.values[target_index] 25 | y_data = le.fit_transform(df.target.values[target_index]) 26 | 27 | # use all categories of KPRIS dataset 28 | else: 29 | x_data = df.abstract.values 30 | y_data = le.fit_transform(df.target.values) 31 | 32 | assert len(x_data) == len(y_data) 33 | print("Number of Abstract : {} , Target : {}".format(len(x_data), len(y_data))) 34 | 35 | return x_data, y_data 36 | 37 | 38 | def stemming(sentences): 39 | 40 | stemmer = SnowballStemmer("english") 41 | 42 | stemming_sentences = [] 43 | for i, sent in enumerate(sentences): 44 | stem_sent = " ".join([stemmer.stem(word) for word in sent.split()]) 45 | stemming_sentences.append(stem_sent) 46 | 47 | print("Stemming process done") 48 | return stemming_sentences 49 | 50 | 51 | def get_sequences(sentences, args): 52 | 53 | tokenizer = Tokenizer() 54 | tokenizer.fit_on_texts(sentences) 55 | sequences = tokenizer.texts_to_sequences(sentences) 56 | 57 | # get word index 58 | word_index = tokenizer.word_index 59 | print('Found %s unique tokens.' % len(word_index)) 60 | 61 | # get vocab size, use index zero to padding 62 | vocab_size = len(word_index) + 1 63 | print("Vocab size is {}".format(vocab_size)) 64 | 65 | # padding sequence same as window size. 
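# e.g. with window_size = 2, a sequence [12, 7, 41] becomes [0, 0, 12, 7, 41, 0, 0],
# so every real token can serve as a prediction target with a full context window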
66 |     sequences = [[0]*args.window_size+sequence+[0]*args.window_size for sequence in sequences]
67 | 
68 |     # check how many training samples we get
69 |     instances = np.sum([len(sequence)-2*args.window_size for sequence in sequences])
70 |     print("Training samples : {}".format(instances))
71 | 
72 |     return sequences, word_index, vocab_size, instances
73 | 
74 | 
75 | def get_trainable_data(sequences, instances, args):
76 | 
77 |     context = np.zeros(shape=(instances, args.window_size*2+1), dtype=np.int32)
78 |     target = np.zeros(shape=(instances, 1), dtype=np.int32)
79 |     document = np.zeros(shape=(instances, 1), dtype=np.int32)
80 | 
81 |     k = 0
82 |     for doc_id, sequence in enumerate(sequences):
83 |         for i in range(args.window_size, len(sequence)-args.window_size):
84 | 
85 |             context[k] = sequence[i-args.window_size:i+args.window_size+1]
86 |             target[k] = sequence[i]
87 |             document[k] = doc_id
88 |             k += 1
89 | 
90 |     # delete the target word from its context window
91 |     context = np.delete(context, args.window_size, axis=1)
92 | 
93 |     print("trainable data setting finished")
94 |     return context, target, document
95 | 
96 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Patent Document Clustering with Deep Embeddings
2 | 
3 | Code for the paper
4 | 
5 | **PDCDE : Patent Document Clustering with Deep Embeddings**
6 | Jaeyoung Kim, Janghyeok Yoon, Eunjeong Park, Sungchul Choi
7 | 
8 | https://www.researchgate.net/publication/325251122_Patent_Document_Clustering_with_Deep_Embeddings
9 | 
10 | ![process](image_folder/process.png)
11 | 
12 | ## Dataset
13 | 
14 | * [KIPRIS](http://www.kipris.or.kr/khome/main.jsp) dataset
15 |     - The KIPRIS dataset consists of abstracts from five categories of US patents
16 |     - Categories : car, camera, CPU, memory, graphics
17 | 
18 | * The combinations used in the paper
19 | 
20 |     - Task 1 : car-camera (less relevant classes)
21 |     - Task 2 : memory-cpu (relevant classes)
22 |     - Task 3 : car, camera, cpu, memory, graphics
23 | 
24 | 
25 | ## Results
26 | 
27 | #### Visualization
28 | 
29 | ![visualize](image_folder/visualize.png)
30 | 
31 | #### Unsupervised accuracy
32 | 
33 | ![accuracy](image_folder/accuracy.png)
34 | * The 3-categories task uses the [KISTA](http://biz.kista.re.kr/patentmap) dataset;
35 | we will add this dataset soon.
36 | 
37 | ## Prerequisites
38 | 
39 | * TensorFlow 1.4.0
40 | * Keras 2.2.0
41 | * nltk 3.3
42 | * pandas 0.23.0
43 | * scikit-learn 0.19.1
44 | 
45 | ## Usage
46 | 
47 | #### Requirements
48 | 
49 | ~~~
50 | # python2
51 | $ pip install -r requirments.txt
52 | 
53 | # python3
54 | $ pip3 install -r requirments.txt
55 | ~~~
56 | 
57 | #### Embedding patent abstracts from scratch
58 | * **category : car_camera, memory_cpu, 5_categories**
59 | 
60 | ~~~
61 | $ python embedding_patent.py --dataset "category"
62 | ~~~
63 | 
64 | 
65 | #### Train DEC
66 | 
67 | ~~~
68 | $ python train.py --dataset "category"
69 | ~~~
70 | 
71 | 
72 | #### To test with an existing model
73 | 
74 | ~~~
75 | $ python train.py --dataset "category" --task test
76 | ~~~
77 | 
78 | ##### **Common Options**
79 | 
80 | * `dataset` : dataset categories. You can select `{"car_camera", "memory_cpu", "5_categories"}`
81 | * `save_embedding_vector` : path to the embedding vectors.
82 | * `save_weight_path` : path to the trained weights.
83 | * `dataset_path` : path to the KIPRIS dataset. Default is `./dataset`
84 | 
85 | 
86 | ##### **Doc2Vec Options**
87 | 
88 | * `window_size` : Doc2Vec window size. Default is `5`.
89 | * `embedding_size` : Embedding vector dimension. Default is `50`.
90 | * `doc_initializer` : Doc2Vec word and document embedding initializer. Default is `uniform`.
91 | * `negative_sample` : Number of negative samples used in the NCE loss. Default is `5`.
92 | * `doc_lr` : Doc2Vec initial learning rate. Default is `0.001`.
93 | * `doc_batch_size` : Doc2Vec batch size. Default is `256`.
94 | * `doc_epochs` : Doc2Vec epochs. Default is `500`.
95 | 
96 | ##### **DEC Options**
97 | 
98 | * `dec_batch_size` : DEC model batch size. Default is `256`.
99 | * `dec_lr` : DEC initial learning rate. Default is `0.001`.
100 | * `dec_decay_step` : learning rate step decay every **n** epochs. Default is `20`.
101 | * `layerwise_pretrain_iters` : layer-wise pretraining iterations for the greedy layer-wise auto-encoder.
102 | Default is `10000`.
103 | * `finetune_iters` : fine-tuning iterations after layer-wise pretraining.
104 | Default is `20000`.
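
#### Loading the saved embeddings

`embedding_patent.py` pickles the document vectors and their labels as a two-element list at `<save_embedding_vector>/<dataset>`. A minimal sketch for inspecting them outside of `train.py`, assuming the default `./embedding_vector` path and the `car_camera` subset as an example:

~~~
import pickle
import numpy as np

# default location written by embedding_patent.py: ./embedding_vector/<dataset>
with open("./embedding_vector/car_camera", "rb") as f:
    embedding_vector, label = pickle.load(f)

print(embedding_vector.shape)  # document vectors, e.g. (num_documents, 50)
print(np.unique(label))        # label-encoded category ids
~~~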
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import numpy as np
4 | from sklearn.cluster import KMeans
5 | from keras import callbacks
6 | from sklearn.utils.linear_assignment_ import linear_assignment
7 | 
8 | 
9 | def make_directory_doc(args):
10 | 
11 |     if not os.path.exists(args.save_weight_path):
12 |         os.makedirs(args.save_weight_path)
13 |     if not os.path.exists(args.save_log_path):
14 |         os.makedirs(args.save_log_path)
15 |     if not os.path.exists(args.save_embedding_vector):
16 |         os.makedirs(args.save_embedding_vector)
17 | 
18 | 
19 | def get_callbacks_doc(args):
20 |     lr_decay = callbacks.LearningRateScheduler(schedule=lambda epoch: args.doc_lr * math.pow(0.5, math.floor((1+epoch)/args.doc_decay_step)))
21 |     log = callbacks.CSVLogger(os.path.join(args.save_log_path, "doc2vec_log_{}.csv".format(args.dataset)))
22 |     checkpoint = callbacks.ModelCheckpoint(os.path.join(args.save_weight_path,
23 |                                                         "doc2vec_weights_{}.h5".format(args.dataset)),
24 |                                            monitor='loss',
25 |                                            save_best_only=True,
26 |                                            save_weights_only=True,
27 |                                            verbose=1,
28 |                                            mode='min')
29 |     early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', patience=30)
30 |     return [log, lr_decay, checkpoint, early_stopping]
31 | 
32 | 
33 | def get_callbacks_ae(args):
34 |     lr_decay = callbacks.LearningRateScheduler(schedule=lambda epoch: args.dec_lr * math.pow(0.5, math.floor((1+epoch)/args.dec_decay_step)))
35 |     log = callbacks.CSVLogger(os.path.join(args.save_log_path, "ae_log_{}.csv".format(args.dataset)))
36 |     checkpoint = callbacks.ModelCheckpoint(os.path.join(args.save_weight_path,
37 |                                                         "ae_weights_{}.h5".format(args.dataset)),
38 |                                            monitor='loss',
39 |                                            save_best_only=True,
40 |                                            save_weights_only=True,
41 |                                            verbose=1,
42 |                                            mode='min')
43 |     early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', patience=50)
44 |     return [log, lr_decay, checkpoint, early_stopping]
45 | 
46 | 
47 | def cluster_acc(y_true, y_pred):
48 | 
49 |     assert y_pred.size == y_true.size
50 |     D = max(y_pred.max(), y_true.max())+1
51 |     w = np.zeros((D, D), dtype=np.int64)
52 |     for i in range(y_pred.size):
53 |         w[y_pred[i], y_true[i]] += 1
54 |     ind = linear_assignment(w.max() - w)
55 |     accuracy = sum([w[i, j] for i, j in ind])*1.0/y_pred.size
56 |     return accuracy
57 | 
58 | 
59 | def normalization_vector(embedding_vector):
60 | 
61 |     norm = np.sqrt(np.sum(np.square(embedding_vector), axis=1))
62 |     norm = np.expand_dims(norm, axis=-1)
63 |     normalized_doc_embeddings = embedding_vector / norm
64 | 
65 |     return
normalized_doc_embeddings 66 | 67 | 68 | def check_kmean_accuracy(normalized_doc_embeddings, label): 69 | 70 | kmeans = KMeans(n_clusters=len(np.unique(label)), n_init=20) 71 | y_pred = kmeans.fit_predict(normalized_doc_embeddings) 72 | 73 | accuracy = cluster_acc(label, y_pred) 74 | print("K-means Accuracy of Doc2Vec : {}".format(accuracy)) -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--dataset', 7 | required=True, 8 | choices='car_camera memory_cpu 5_categories'.split(), 9 | help='select categories "car_camera, memory_cpu, 3_categories, 5_categories"') 10 | 11 | parser.add_argument('--gpu_id', 12 | type=str, 13 | default="0", 14 | help='select gpu id. default setting is "0"' 15 | ) 16 | 17 | parser.add_argument('--save_embedding_vector', 18 | default='./embedding_vector', 19 | type=str, 20 | help='save path of patent embedding vectors' 21 | ) 22 | 23 | parser.add_argument('--save_log_path', 24 | default='./train_log', 25 | type=str, 26 | help='save path of train log csv file' 27 | ) 28 | 29 | parser.add_argument('--save_weight_path', 30 | default='./checkpoint', 31 | type=str, 32 | help='save weights path' 33 | ) 34 | 35 | parser.add_argument('--dataset_path', 36 | default='./dataset', 37 | type=str, 38 | help='KPRIS and KISTA dataset path') 39 | 40 | parser.add_argument('--window_size', 41 | default=5, 42 | type=int, 43 | help='doc2vec window size. default is 5.') 44 | 45 | parser.add_argument('--embedding_size', 46 | default=50, 47 | type=int, 48 | help='embedding vector dimension') 49 | 50 | parser.add_argument('--doc_initializer', 51 | default='uniform', 52 | type=str, 53 | help='Doc2Vec word and document initializer' 54 | ) 55 | 56 | parser.add_argument('--negative_sample', 57 | default=5, 58 | type=int, 59 | help='number of negative sampling used nce loss.') 60 | 61 | parser.add_argument('--doc_lr', 62 | default=0.001, 63 | type=float, 64 | help='Doc2Vec initial learning rate') 65 | 66 | parser.add_argument('--doc_batch_size', 67 | default=256, 68 | type=int, 69 | help='Doc2Vec batch size') 70 | 71 | parser.add_argument('--doc_epochs', 72 | default=500, 73 | type=int, 74 | help='Doc2Vec epochs') 75 | 76 | parser.add_argument('--doc_decay_step', 77 | default=50, 78 | type=float, 79 | help='decay step. 
Default 0.5 decay every 200 epochs') 80 | 81 | parser.add_argument('--dec_batch_size', 82 | default=256, 83 | type=int, 84 | help='deep cluster embedding model batch size') 85 | 86 | parser.add_argument('--dec_lr', 87 | default=0.001, 88 | type=float, 89 | help='deep cluster embedding model initial learning rate') 90 | 91 | parser.add_argument('--dec_decay_step', 92 | default=20, 93 | type=int, 94 | help='deep cluster embedding model learning rate decay') 95 | 96 | parser.add_argument('--task', 97 | default='train', 98 | type=str, 99 | help='select train test') 100 | 101 | parser.add_argument('--layerwise_pretrain_iters', 102 | default=10000, 103 | type=int, 104 | help='layer-wise pretrain weight for greedy layer wise auto encoder') 105 | 106 | parser.add_argument('--finetune_iters', 107 | default=20000, 108 | type=int, 109 | help='fine-tunning iteration') 110 | 111 | return parser.parse_args() -------------------------------------------------------------------------------- /doc2vec.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.optimizers import * 3 | from keras.layers import * 4 | from keras.initializers import * 5 | from keras import objectives 6 | 7 | import keras.backend as K 8 | import tensorflow as tf 9 | 10 | 11 | class NceLogit(Layer): 12 | 13 | """ 14 | implementation Tensorflow nce-loss function. 15 | original source code of Tensorflow : 16 | https://github.com/tensorflow/tensorflow/blob/r1.11/tensorflow/python/ops/nn_impl.py 17 | original paper : http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf 18 | """ 19 | 20 | def __init__(self, target, vocab_size, num_true, embedding_size, num_sampled, **kwargs): 21 | super(NceLogit, self).__init__(**kwargs) 22 | self.target = target 23 | self.vocab_size = vocab_size 24 | self.embedding_size = embedding_size 25 | self.num_sampled = num_sampled 26 | self.num_true = num_true 27 | 28 | def build(self, input_shape): 29 | 30 | self.input_batch = input_shape[0] 31 | self.input_dim = input_shape[1] 32 | 33 | self.W = self.add_weight(shape=(self.vocab_size, self.embedding_size), 34 | initializer=TruncatedNormal(mean=0.0, stddev=1.0/self.embedding_size), 35 | name='softmax_weights') 36 | 37 | self.bias = self.add_weight(shape=(self.vocab_size,), initializer='zeros', name='softmax_bias') 38 | 39 | self.built = True 40 | 41 | def call(self, inputs, training=None): 42 | 43 | target = K.cast(self.target, "int64") 44 | 45 | # label_flat shape [batch_size * num_true] tensor 46 | label_flat = K.reshape(target, [-1]) 47 | sampled_value = tf.nn.log_uniform_candidate_sampler(true_classes=target, 48 | num_true=self.num_true, num_sampled=self.num_sampled, 49 | unique=True, range_max=self.vocab_size) 50 | 51 | # sampled shape : [num_sampled] tensor 52 | sampled, true_expected_count, sampled_expected_count = (K.stop_gradient(s) for s in sampled_value) 53 | sampled = K.cast(sampled, tf.int64) 54 | 55 | all_ids = K.concatenate([label_flat, sampled], axis=0) 56 | 57 | # Retrieve the true weights and the logits of the sampled weights. 
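# all_ids stacks the true target ids in front of the sampled negative ids, so a single
# embedding_lookup below gathers the output-layer weights for both sets at once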
58 | # weights shape is [vocab_size, embedding_size] 59 | all_w = tf.nn.embedding_lookup(self.W, all_ids, partition_strategy='mod') 60 | 61 | # true_w shape is [batch_size * num_true, embedding_size] 62 | true_w = K.slice(all_w, [0, 0], K.stack([K.shape(label_flat)[0], -1])) 63 | sampled_w = K.slice(all_w, K.stack([K.shape(label_flat)[0], 0]), [-1, -1]) 64 | 65 | # inputs has shape [batch_size, embedding_size] 66 | # sampled_w has shape [num_sampled, embedding_size] 67 | # Apply matmul, which yields [batch_size, num_sampled] 68 | sampled_logits = K.dot(inputs, K.transpose(sampled_w)) 69 | 70 | # Retrieve the true and sampled biases, compute the true logits, and 71 | # add the biases to the true and sampled logits. 72 | all_b = tf.nn.embedding_lookup(self.bias, all_ids, partition_strategy='mod') 73 | 74 | # true_b is a [batch_size * num_true] tensor 75 | # sampled_b is a [num_sampled] float tensor 76 | true_b = K.slice(all_b, [0], K.shape(label_flat)) 77 | sampled_b = K.slice(all_b, K.shape(label_flat), [-1]) 78 | 79 | # inputs shape is [batch_size, embedding_size] 80 | # true_w shape is [batch_size * num_true, embedding_size] 81 | # row_wise_dots is [batch_size, num_true, embedding_size] 82 | dim = K.shape(true_w)[1:2] 83 | new_true_w_shape = K.concatenate([[-1, self.num_true], dim], axis=0) 84 | row_wise_dots = Multiply()([K.expand_dims(inputs, axis=1), K.reshape(true_w, new_true_w_shape)]) 85 | 86 | # [batch_size, num_true] tensor of true_logits. 87 | dots_as_matrix = K.reshape(row_wise_dots, K.concatenate([[-1], dim], 0)) 88 | 89 | # true_logits = [batch_size, num_true] 90 | true_logits = K.reshape(K.sum(dots_as_matrix, axis=1), [-1, self.num_true]) 91 | true_b = K.reshape(true_b, [-1, self.num_true]) 92 | true_logits += true_b 93 | 94 | sampled_logits += sampled_b 95 | 96 | # out_logits = [batch_size, num_true+num_sampled] 97 | out_logits = K.concatenate([true_logits, sampled_logits], axis=1) 98 | 99 | return out_logits 100 | 101 | def compute_output_shape(self, input_shape): 102 | return tuple([self.input_batch, self.num_true+self.num_sampled]) 103 | 104 | def get_config(self): 105 | config = { 106 | 'target': self.target, 107 | 'vocab_size': self.vocab_size, 108 | 'embedding_size': self.embedding_size, 109 | 'num_sampled': self.num_sampled, 110 | 'num_true': self.num_true 111 | } 112 | base_config = super(NceLogit, self).get_config() 113 | return dict(list(base_config.items()) + list(config.items())) 114 | 115 | 116 | def load_model(args, vocab_size, embedding_size, num_document, summary=True, num_true=1): 117 | 118 | def nce_loss(y_true_, y_pred): 119 | """ 120 | :param y_true: y_true does not use because we calculate true logits and smaple logits using NceLogit. 121 | :param nce_logit: nce_logit consist of [batch_size, num_true+num_sampled]. 
122 | [batch_size, 0] is true_logits and [batch_size, 1:] is sample_logits 123 | :return: binary-crossentropy 124 | """ 125 | true_logit = K.expand_dims(y_pred[:, num_true-1], axis=-1) 126 | sample_logit = y_pred[:, num_true:] 127 | 128 | y_true = K.concatenate([K.ones_like(true_logit), K.zeros_like(sample_logit)], axis=1) 129 | loss = K.mean(objectives.binary_crossentropy(y_true, K.sigmoid(y_pred))) 130 | 131 | return loss 132 | 133 | # context inputs size [batch_size* window_size * 2] 134 | context_inputs = Input(shape=(args.window_size*2,), name='context_inputs') 135 | target_inputs = Input(shape=(1,), name='target_inputs') 136 | document_inputs = Input(shape=(1,), name='document_inputs') 137 | 138 | word_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_size, 139 | embeddings_initializer=args.doc_initializer, name='word_embedding') 140 | 141 | # context_embedding shape [batch_size*window_size*2, embedding_size] 142 | context_embedding = word_embedding(context_inputs) 143 | 144 | document_embedding = Embedding(input_dim=num_document, output_dim=embedding_size, name='document_embedding', 145 | embeddings_initializer=args.doc_initializer)(document_inputs) 146 | 147 | document_embedding = Reshape(target_shape=(embedding_size,))(document_embedding) 148 | 149 | # mean of word embedding vector 150 | mean_context_embedding = Lambda(lambda x: K.mean(x, axis=1))(context_embedding) 151 | 152 | average_embedding = Average(name='document_vector')([document_embedding, mean_context_embedding]) 153 | 154 | # Keras does not exists NCE-loss, so implementation NCE loss. 155 | nce_logits = NceLogit(target=target_inputs, vocab_size=vocab_size, num_true=num_true, 156 | embedding_size=embedding_size, num_sampled=args.negative_sample)(average_embedding) 157 | model = Model([context_inputs, target_inputs, document_inputs], nce_logits) 158 | 159 | if summary: 160 | model.summary() 161 | 162 | model.compile(loss=[nce_loss], optimizer=Adam(lr=args.doc_lr)) 163 | return model -------------------------------------------------------------------------------- /dec.py: -------------------------------------------------------------------------------- 1 | from keras.initializers import * 2 | from keras.models import * 3 | from keras.layers import * 4 | from keras.optimizers import * 5 | from sklearn.cluster import KMeans 6 | 7 | import os 8 | import numpy as np 9 | import sys 10 | import utils 11 | 12 | 13 | class ClusterLayer(Layer): 14 | 15 | def __init__(self, output_dim, input_dim=None, weights=None, alpha=1.0, **kwargs): 16 | self.output_dim = output_dim 17 | self.input_dim = input_dim 18 | self.alpha = alpha 19 | 20 | # k-means cluster centre locations 21 | self.initial_weights = weights 22 | self.input_spec = [InputSpec(ndim=2)] 23 | 24 | if self.input_dim: 25 | kwargs['input_shape'] = (self.input_dim,) 26 | super(ClusterLayer, self).__init__(**kwargs) 27 | 28 | def build(self, input_shape): 29 | assert len(input_shape) == 2 30 | input_dim = input_shape[1] 31 | self.input_spec = [InputSpec(dtype=K.floatx(), 32 | shape=(None, input_dim))] 33 | 34 | self.W = K.variable(self.initial_weights) 35 | self.trainable_weights = [self.W] 36 | 37 | def call(self, x, mask=None): 38 | q = 1.0 / (1.0 + K.sqrt(K.sum(K.square(K.expand_dims(x, 1) - self.W), axis=2)) ** 2 / self.alpha) 39 | q = q ** ((self.alpha + 1.0) / 2.0) 40 | q = K.transpose(K.transpose(q) / K.sum(q, axis=1)) 41 | return q 42 | 43 | def get_output_shape_for(self, input_shape): 44 | assert input_shape and len(input_shape) == 2 45 | return 
(input_shape[0], self.output_dim) 46 | 47 | def compute_output_shape(self, input_shape): 48 | assert input_shape and len(input_shape) == 2 49 | return (input_shape[0], self.output_dim) 50 | 51 | def get_config(self): 52 | config = {'output_dim': self.output_dim, 53 | 'input_dim': self.input_dim} 54 | base_config = super(ClusterLayer, self).get_config() 55 | return dict(list(base_config.items()) + list(config.items())) 56 | 57 | 58 | class DeepEmbeddingClustering(object): 59 | def __init__(self, args, n_clusters, alpha=1.0, cluster_centroid=None): 60 | ''' 61 | :param n_clusters: number of cluster(classes) 62 | :param alpha: soft-clustering hyper parameter. 63 | :param cluster_centroid: centroid each cluster. 64 | ''' 65 | 66 | super(DeepEmbeddingClustering, self).__init__() 67 | 68 | self.n_clusters = n_clusters 69 | self.input_dim = args.embedding_size 70 | self.alpha = alpha 71 | self.pretrained_weights_path = os.path.join(args.save_weight_path, "ae_weights_{}.h5".format(args.dataset)) 72 | self.cluster_centroid = cluster_centroid 73 | self.batch_size = args.dec_batch_size 74 | self.learning_rate = args.dec_lr 75 | 76 | # encoder layer dimensions. Decoder dimension is the opposite. 77 | self.encoders_dims = [self.input_dim, 500, 500, 2000, 50] 78 | self.input_layer = Input(shape=(self.input_dim,), name='input') 79 | self.dropout_fraction = 0.2 80 | 81 | self.layer_wise_autoencoders = [] 82 | self.encoders = [] 83 | self.decoders = [] 84 | for i in range(1, len(self.encoders_dims)): 85 | encoder_activation = 'linear' if i == (len(self.encoders_dims) - 1) else 'selu' 86 | encoder = Dense(self.encoders_dims[i], activation=encoder_activation, 87 | input_shape=(self.encoders_dims[i - 1],), 88 | kernel_initializer=RandomNormal(mean=0.0, stddev=0.01, seed=None), 89 | bias_initializer='zeros', name='encoder_dense_%d' % i) 90 | self.encoders.append(encoder) 91 | 92 | decoder_index = len(self.encoders_dims) - i 93 | decoder_activation = 'linear' if i == 1 else 'selu' 94 | decoder = Dense(self.encoders_dims[i - 1], activation=decoder_activation, 95 | kernel_initializer=RandomNormal(mean=0.0, stddev=0.01, seed=None), 96 | bias_initializer='zeros', 97 | name='decoder_dense_%d' % decoder_index) 98 | self.decoders.append(decoder) 99 | 100 | autoencoder = Sequential([ 101 | Dropout(self.dropout_fraction, input_shape=(self.encoders_dims[i - 1],), 102 | name='encoder_dropout_%d' % i), 103 | encoder, 104 | Dropout(self.dropout_fraction, name='decoder_dropout_%d' % decoder_index), 105 | decoder 106 | ]) 107 | autoencoder.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 108 | self.layer_wise_autoencoders.append(autoencoder) 109 | 110 | # build the end-to-end autoencoder for fine-tuning 111 | # Note that at this point dropout is discarded 112 | self.encoder = Sequential(self.encoders) 113 | self.encoder.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 114 | self.decoders.reverse() 115 | self.autoencoder = Sequential(self.encoders + self.decoders) 116 | self.autoencoder.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 117 | 118 | if cluster_centroid is not None: 119 | assert cluster_centroid.shape[0] == self.n_clusters 120 | assert cluster_centroid.shape[1] == self.encoder.layers[-1].output_dim 121 | 122 | if os.path.isfile(self.pretrained_weights_path): 123 | self.autoencoder.load_weights(self.pretrained_weights_path) 124 | print("Load pre-trained AE") 125 | 126 | def p_mat(self, q): 127 | weight = q ** 2 / q.sum(0) 128 | return (weight.T / weight.sum(1)).T 129 | 130 | def 
initialize(self, args, x_data, layerwise_pretrain_iters=50000, finetune_iters=100000): 131 | if not os.path.isfile(self.pretrained_weights_path): 132 | 133 | iters_per_epoch = int(len(x_data) / self.batch_size) 134 | layerwise_epochs = max(int(layerwise_pretrain_iters / iters_per_epoch), 1) 135 | finetune_epochs = max(int(finetune_iters / iters_per_epoch), 1) 136 | 137 | print('layer-wise pre-train') 138 | current_input = x_data 139 | [train_log, lr_schedule, checkpoint, early_stopping] = utils.get_callbacks_ae(args) 140 | 141 | # greedy-layer wise training 142 | for i, autoencoder in enumerate(self.layer_wise_autoencoders): 143 | if i > 0: 144 | weights = self.encoders[i - 1].get_weights() 145 | dense_layer = Dense(self.encoders_dims[i], input_shape=(current_input.shape[1],), 146 | activation='selu', weights=weights, 147 | name='encoder_dense_copy_%d' % i) 148 | encoder_model = Sequential([dense_layer]) 149 | encoder_model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 150 | current_input = encoder_model.predict(current_input) 151 | 152 | autoencoder.fit(current_input, current_input, 153 | batch_size=self.batch_size, epochs=layerwise_epochs, callbacks=[lr_schedule]) 154 | self.autoencoder.layers[i].set_weights(autoencoder.layers[1].get_weights()) 155 | self.autoencoder.layers[len(self.autoencoder.layers) - i - 1].set_weights( 156 | autoencoder.layers[-1].get_weights()) 157 | 158 | print('Fine-tuning auto-encoder') 159 | 160 | # update encoder and decoder weights: 161 | self.autoencoder.fit(x_data, x_data, 162 | batch_size=self.batch_size, 163 | epochs=finetune_epochs, 164 | callbacks=[train_log, lr_schedule, checkpoint, early_stopping]) 165 | 166 | else: 167 | print('Loading pre-trained weights for auto-encoder.') 168 | self.autoencoder.load_weights(self.pretrained_weights_path) 169 | 170 | # update encoder, decoder 171 | 172 | for i in range(len(self.encoder.layers)): 173 | self.encoder.layers[i].set_weights(self.autoencoder.layers[i].get_weights()) 174 | 175 | # initialize cluster centres using k-means 176 | print('Initializing cluster centres with k-means.') 177 | if self.cluster_centroid is None: 178 | kmeans = KMeans(n_clusters=self.n_clusters, n_init=20) 179 | self.y_pred = kmeans.fit_predict(self.encoder.predict(x_data)) 180 | self.cluster_centroid = kmeans.cluster_centers_ 181 | 182 | # initial centroid using K-mean 183 | self.dec_model = Sequential([self.encoder, 184 | ClusterLayer(self.n_clusters, weights=self.cluster_centroid, name='dec')]) 185 | self.dec_model.compile(loss='kullback_leibler_divergence', optimizer=Adam(lr=0.0001)) 186 | return 187 | 188 | def cluster(self, args, x_data, y_data=None, test="train", tol=0.01, iter_max=1e6, **kwargs): 189 | 190 | save_path = os.path.join(args.save_weight_path, "dec_weights_{}.h5".format(args.dataset)) 191 | 192 | if os.path.isfile(save_path): 193 | self.dec_model.load_weights(save_path) 194 | print('Restored Model weight') 195 | 196 | if test=="test": 197 | y_pred = self.dec_model.predict(x_data, verbose=0).argmax(1) 198 | acc = utils.cluster_acc(y_data, y_pred) 199 | print('Accuracy ' + str(np.round(acc, 5))) 200 | return 201 | 202 | update_interval = x_data.shape[0] / self.batch_size 203 | print('Update interval', update_interval) 204 | 205 | train = True 206 | iteration, index = 0, 0 207 | current_acc = 0 208 | self.accuracy = 0 209 | 210 | while train: 211 | sys.stdout.write('\r') 212 | # cut off iteration 213 | if iter_max < iteration: 214 | print('Reached maximum iteration limit. 
Stopping training.') 215 | return self.y_pred 216 | 217 | # update (or initialize) probability distributions and propagate weight changes 218 | # from DEC model to encoder. 219 | if iteration % update_interval == 0: 220 | self.q = self.dec_model.predict(x_data, verbose=0) 221 | self.p = self.p_mat(self.q) 222 | 223 | y_pred = self.q.argmax(1) 224 | delta_label = (np.sum((y_pred == self.y_pred)).astype(np.float32) / y_pred.shape[0]) 225 | if y_data is not None: 226 | current_acc = utils.cluster_acc(y_data, y_pred) 227 | print('Iteration ' + str(iteration) + ', Accuracy ' + str(np.round(current_acc, 5))) 228 | 229 | else: 230 | print(str(np.round(delta_label * 100, 5)) + '% change in label assignment') 231 | 232 | if delta_label < tol: 233 | print('Reached tolerance threshold.') 234 | train = False 235 | continue 236 | else: 237 | self.y_pred = y_pred 238 | 239 | # weight changes if current 240 | if self.accuracy < current_acc: 241 | for i in range(len(self.encoder.layers)): 242 | self.encoder.layers[i].set_weights(self.dec_model.layers[0].layers[i].get_weights()) 243 | self.cluster_centroid = self.dec_model.layers[-1].get_weights()[0] 244 | 245 | # save checkpoint 246 | self.dec_model.save(save_path) 247 | self.accuracy = current_acc 248 | print("update weight and save checkpoint") 249 | 250 | # train on batch 251 | sys.stdout.write('Iteration %d, ' % iteration) 252 | if (index + 1) * self.batch_size > x_data.shape[0]: 253 | loss = self.dec_model.train_on_batch(x_data[index * self.batch_size::], 254 | self.p[index * self.batch_size::]) 255 | index = 0 256 | sys.stdout.write('Loss %f' % loss) 257 | else: 258 | loss = self.dec_model.train_on_batch(x_data[index * self.batch_size:(index + 1) * self.batch_size], 259 | self.p[index * self.batch_size:(index + 1) * self.batch_size]) 260 | sys.stdout.write('Loss %f' % loss) 261 | index += 1 262 | 263 | iteration += 1 264 | sys.stdout.flush() 265 | 266 | return --------------------------------------------------------------------------------
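
A note on the clustering objective above: the soft assignment computed by `ClusterLayer` and the sharpened target distribution computed by `p_mat` reduce to the following minimal NumPy sketch (with `alpha = 1.0` as in the defaults; the array shapes below are toy values chosen only for illustration):

~~~
import numpy as np

def soft_assignment(x, centroids, alpha=1.0):
    # Student's t kernel between embeddings and cluster centres, as in ClusterLayer.call
    dist_sq = np.sum((x[:, None, :] - centroids[None, :, :]) ** 2, axis=2)
    q = (1.0 + dist_sq / alpha) ** (-(alpha + 1.0) / 2.0)
    return q / q.sum(axis=1, keepdims=True)        # normalise over clusters

def target_distribution(q):
    # square q and re-weight by cluster frequency, as in DeepEmbeddingClustering.p_mat
    weight = q ** 2 / q.sum(axis=0)
    return weight / weight.sum(axis=1, keepdims=True)

# toy shapes: 8 documents with 50-dim embeddings, 3 clusters
x = np.random.rand(8, 50)
mu = np.random.rand(3, 50)
q = soft_assignment(x, mu)
p = target_distribution(q)
assert np.allclose(q.sum(axis=1), 1.0) and np.allclose(p.sum(axis=1), 1.0)
~~~

During training, `cluster()` refreshes `p` every `update_interval` batches and minimises the KL divergence between `p` and `q` via the `kullback_leibler_divergence` loss compiled on `dec_model`.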