├── LICENSE ├── README.md ├── __pycache__ └── utils.cpython-37.pyc ├── bert_weights └── put_your_bert_weights_in_this_dir.txt ├── datasets ├── datasets.z01 ├── datasets.zip └── just_unzip_the_zip_file.txt ├── images ├── LCM.png ├── curve.png └── paper_title.png ├── lcm_exp_on_bert.py ├── lcm_exp_on_lstm.py ├── models ├── __pycache__ │ └── lstm.cpython-37.pyc ├── bert.py └── lstm.py ├── output └── placeholder.txt └── utils.py
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 beyond_guo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | ### Description: 2 | 3 | This is the official implementation of our AAAI-21 paper *Label Confusion Learning to Enhance Text Classification Models*. 4 | 5 | 6 | 7 | 8 | 9 | The structure of LCM looks like this: 10 | 11 | 12 | 13 | 14 | 15 | Here we provide demo code and datasets for the experiments. 16 | 17 | ### Environment: 18 | 19 | python 3.6 20 | tensorflow 2.2.0 21 | keras 2.3.1 22 | 23 | ### Run a Demo: 24 | 25 | **LCM-based LSTM:** 26 | 27 | Run `python lcm_exp_on_lstm.py` to compare the performance of LSTM, LSTM with label smoothing (LS), and LSTM with LCM. 28 | 29 | **LCM-based BERT:** 30 | 31 | Run `python lcm_exp_on_bert.py` to compare the performance of BERT, BERT with label smoothing (LS), and BERT with LCM. 32 | 33 | 34 | 35 | The final results will be written to the `output/` directory. 36 | 37 | 38 | 39 | The curve below shows our results on 20NG with LSTM as the basic predictor. By changing α, we can control the influence of LCM on the original model.
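For reference, the heart of LCM is the loss below: a condensed sketch of the `lcm_loss` closures defined in `models/lstm.py` and `models/bert.py` (here `num_classes` and `alpha` are passed in explicitly for readability, whereas the repo captures them inside the model classes):

```python
import keras.backend as K

def lcm_loss(y_true, y_pred, num_classes, alpha=4):
    # y_pred concatenates the classifier's predicted distribution
    # (first num_classes entries) with the label-confusion distribution (the rest).
    pred_probs = y_pred[:, :num_classes]
    label_sim_dist = y_pred[:, num_classes:]
    # alpha controls how strongly the one-hot target dominates the simulated label distribution.
    simulated_y_true = K.softmax(label_sim_dist + alpha * y_true)
    # KL(simulated || predicted), written as cross-entropy minus entropy:
    loss1 = -K.categorical_crossentropy(simulated_y_true, simulated_y_true)
    loss2 = K.categorical_crossentropy(simulated_y_true, pred_probs)
    return loss1 + loss2
```

A larger α pushes the simulated target back toward the original one-hot label, so LCM's effect fades; a smaller α lets the learned label-confusion distribution reshape the target more.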
40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /bert_weights/put_your_bert_weights_in_this_dir.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/bert_weights/put_your_bert_weights_in_this_dir.txt -------------------------------------------------------------------------------- /datasets/datasets.z01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/datasets/datasets.z01 -------------------------------------------------------------------------------- /datasets/datasets.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/datasets/datasets.zip -------------------------------------------------------------------------------- /datasets/just_unzip_the_zip_file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/datasets/just_unzip_the_zip_file.txt -------------------------------------------------------------------------------- /images/LCM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/images/LCM.png -------------------------------------------------------------------------------- /images/curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/images/curve.png -------------------------------------------------------------------------------- /images/paper_title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/images/paper_title.png -------------------------------------------------------------------------------- /lcm_exp_on_bert.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | from utils import load_dataset, create_asy_noise_labels 4 | from sklearn.utils import shuffle 5 | from bert4keras.tokenizers import Tokenizer 6 | from bert4keras.snippets import sequence_padding 7 | from models import bert 8 | import matplotlib.pyplot as plt 9 | import time 10 | 11 | 12 | #%% 13 | # ========== parameters: ========== 14 | maxlen = 128 15 | hidden_size = 64 16 | batch_size = 512 17 | epochs = 150 18 | 19 | # ========== bert config: ========== 20 | # for English, use bert_tiny: 21 | bert_type = 'bert' 22 | config_path = 'bert_weights/bert_tiny_uncased_L-2_H-128_A-2/bert_config.json' 23 | checkpoint_path = 
'bert_weights/bert_tiny_uncased_L-2_H-128_A-2/bert_model.ckpt' 24 | vocab_path = 'bert_weights/bert_tiny_uncased_L-2_H-128_A-2/vocab.txt' 25 | 26 | # for Chinese, use albert_tiny: 27 | # bert_type = 'albert' 28 | # config_path = '../bert_weights/albert_tiny_google_zh_489k/albert_config.json' 29 | # checkpoint_path = '../bert_weights/albert_tiny_google_zh_489k/albert_model.ckpt' 30 | # vocab_path = '../bert_weights/albert_tiny_google_zh_489k/vocab.txt' 31 | 32 | tokenizer = Tokenizer(vocab_path, do_lower_case=True) 33 | 34 | # ========== dataset: ========== 35 | dataset_name = '20NG' 36 | group_noise_rate = 0.3 37 | df,num_classes,label_groups = load_dataset(dataset_name) 38 | # define log file name: 39 | log_txt_name = '%s_BERT_log(group_noise=%s,comp+rec+talk)' % (dataset_name,group_noise_rate) 40 | 41 | df = df.dropna(axis=0,how='any') 42 | df = shuffle(df)[:50000] 43 | print('data size:',len(df)) 44 | 45 | #%% 46 | # ========== data preparation: ========== 47 | labels = sorted(list(set(df.label))) 48 | assert len(labels) == num_classes,'wrong num of classes!' 49 | label2idx = {name:i for name,i in zip(labels,range(num_classes))} 50 | #%% 51 | print('start tokenizing...') 52 | t = time.time() 53 | X_token = [] 54 | X_seg = [] 55 | y = [] 56 | i = 0 57 | for content,label in zip(list(df.content),list(df.label)): 58 | i += 1 59 | if i%1000 == 0: 60 | print(i) 61 | token_ids, seg_ids = tokenizer.encode(content, maxlen=maxlen) 62 | X_token.append(token_ids) 63 | X_seg.append(seg_ids) 64 | y.append(label2idx[label]) 65 | 66 | # the sequences we obtained from above may have different length, so use Padding: 67 | X_token = sequence_padding(X_token) 68 | X_seg = sequence_padding(X_seg) 69 | y = np.array(y) 70 | print('tokenizing time cost:',time.time()-t,'s.') 71 | 72 | #%% 73 | # ========== model traing: ========== 74 | old_list = [] 75 | ls_list = [] 76 | lcm_list = [] 77 | N = 5 78 | for n in range(N): 79 | # randomly split train and test each time: 80 | np.random.seed(n) # 这样保证了每次试验的seed一致 81 | random_indexs = np.random.permutation(range(len(X_token))) 82 | train_size = int(len(X_token)*0.6) 83 | val_size = int(len(X_token)*0.15) 84 | X_token_train = X_token[random_indexs][:train_size] 85 | X_token_val = X_token[random_indexs][train_size:train_size+val_size] 86 | X_token_test = X_token[random_indexs][train_size+val_size:] 87 | X_seg_train = X_seg[random_indexs][:train_size] 88 | X_seg_val = X_seg[random_indexs][train_size:train_size + val_size] 89 | X_seg_test = X_seg[random_indexs][train_size + val_size:] 90 | y_train = y[random_indexs][:train_size] 91 | y_val = y[random_indexs][train_size:train_size+val_size] 92 | y_test = y[random_indexs][train_size+val_size:] 93 | data_package = [X_token_train, X_seg_train, y_train, X_token_val, X_seg_val, y_val, X_token_test, X_seg_test, y_test] 94 | 95 | # apply noise only on train set: 96 | if group_noise_rate>0: 97 | _, overall_noise_rate, y_train = create_asy_noise_labels(y_train,label_groups,label2idx,group_noise_rate) 98 | data_package = [X_token_train, X_seg_train, y_train, X_token_val, X_seg_val, y_val, X_token_test, X_seg_test, 99 | y_test] 100 | with open('output/%s.txt' % log_txt_name, 'a') as f: 101 | print('-'*30,'\nNOITCE: overall_noise_rate=%s'%round(overall_noise_rate,2), file=f) 102 | 103 | 104 | with open('output/%s.txt'%log_txt_name,'a') as f: 105 | print('\n',str(datetime.datetime.now()),file=f) 106 | print('\n ROUND & SEED = ',n,'-'*20,file=f) 107 | 108 | model_to_run = [""] 109 | print('====Original:============') 110 | model = 
bert.BERT_Basic(config_path,checkpoint_path,hidden_size,num_classes,bert_type) 111 | train_score_list, val_score_list, best_val_score, test_score = model.train_val(data_package,batch_size,epochs) 112 | plt.plot(train_score_list, label='train') 113 | plt.plot(val_score_list, label='val') 114 | plt.title('BERT') 115 | plt.legend() 116 | plt.show() 117 | old_list.append(test_score) 118 | with open('output/%s.txt'%log_txt_name,'a') as f: 119 | print('\n*** Orig BERT ***:',file=f) 120 | print('test acc:', str(test_score), file=f) 121 | print('best val acc:',str(best_val_score),file=f) 122 | print('train acc list:\n',str(train_score_list),file=f) 123 | print('val acc list:\n',str(val_score_list),'\n',file=f) 124 | 125 | print('====Label Smooth:============') 126 | ls_e = 0.1 127 | model = bert.BERT_LS(config_path,checkpoint_path,hidden_size,num_classes,ls_e,bert_type) 128 | train_score_list, val_score_list, best_val_score, test_score = model.train_val(data_package,batch_size,epochs) 129 | plt.plot(train_score_list, label='train') 130 | plt.plot(val_score_list, label='val') 131 | plt.title('BERT with LS') 132 | plt.legend() 133 | plt.show() 134 | ls_list.append(test_score) 135 | with open('output/%s.txt'%log_txt_name,'a') as f: 136 | print('\n*** BERT with LS (e=%s) ***:'%ls_e, file=f) 137 | print('test acc:', str(test_score), file=f) 138 | print('best val acc:', str(best_val_score), file=f) 139 | print('train acc list:\n', str(train_score_list), file=f) 140 | print('val acc list:\n', str(val_score_list), '\n', file=f) 141 | 142 | print('====LCM:============') 143 | # alpha = 3 144 | for alpha in [3,4,5]: 145 | wvdim = 256 146 | lcm_stop = 100 147 | params_str = 'a=%s, wvdim=%s, lcm_stop=%s'%(alpha,wvdim,lcm_stop) 148 | model = bert.BERT_LCM(config_path,checkpoint_path,hidden_size,num_classes,alpha,wvdim,bert_type) 149 | train_score_list, val_score_list, best_val_score, test_score = model.train_val(data_package, batch_size,epochs,lcm_stop) 150 | plt.plot(train_score_list, label='train') 151 | plt.plot(val_score_list, label='val') 152 | plt.title('BERT with LCM') 153 | plt.legend() 154 | plt.show() 155 | lcm_list.append(test_score) 156 | with open('output/%s.txt'%log_txt_name,'a') as f: 157 | print('\n*** BERT with LCM (%s) ***:'%params_str,file=f) 158 | print('test acc:', str(test_score), file=f) 159 | print('best val acc:', str(best_val_score), file=f) 160 | print('train acc list:\n', str(train_score_list), file=f) 161 | print('val acc list:\n', str(val_score_list), '\n', file=f) --------------------------------------------------------------------------------
/lcm_exp_on_lstm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import sys 3 | import pandas as pd 4 | import numpy as np 5 | from utils import * 6 | from sklearn.utils import shuffle 7 | from models import lstm 8 | 9 | # ========== parameters & dataset: ========== 10 | vocab_size = 20000 11 | maxlen = 100 12 | wvdim = 64 13 | hidden_size = 64 14 | # num_filters = 100 15 | # filter_sizes = [3,10,25] # must not exceed maxlen 16 | alpha = 4 # alpha in the LCM loss 17 | batch_size = 512 18 | epochs = 40 19 | emb_type = '' 20 | 21 | log_txt_name = '20NG_log' 22 | num_classes = 20 23 | df1 = pd.read_csv('datasets/20NG/20ng-train-all-terms.csv') 24 | df2 = pd.read_csv('datasets/20NG/20ng-test-all-terms.csv') 25 | df = pd.concat([df1,df2]) 26 | 27 | # log_txt_name = 'DBPedia' # AG_log 28 | # num_classes = 14 29 | # df1 =
pd.read_csv('../datasets/DBPedia/train.csv',header=None,index_col=None) 30 | # df2 = pd.read_csv('../datasets/DBPedia/test.csv',header=None,index_col=None) 31 | # df1.columns = ['label','title','content'] 32 | # df2.columns = ['label','title','content'] 33 | # df = pd.concat([df1,df2]) 34 | 35 | # log_txt_name = 'Fudan_log' 36 | # num_classes = 20 37 | # # df = pd.read_csv('../datasets/thucnews_subset.csv') 38 | # df = pd.read_csv('../datasets/fudan_news.csv') 39 | 40 | 41 | df = df.dropna(axis=0,how='any') 42 | df = shuffle(df)[:50000] 43 | print('data size:',len(df)) 44 | 45 | 46 | # ========== data pre-processing: ========== 47 | labels = sorted(list(set(df.label))) 48 | assert len(labels) == num_classes,'wrong num of classes' 49 | label2idx = {name:i for name,i in zip(labels,range(num_classes))} 50 | num_classes = len(label2idx) 51 | 52 | corpus = [] 53 | X_words = [] 54 | Y = [] 55 | i = 0 56 | for content,y in zip(list(df.content),list(df.label)): 57 | i += 1 58 | if i%1000 == 0: 59 | print(i) 60 | # English: 61 | content_words = content.split(' ') 62 | # Chinese: 63 | # content_words = jieba.lcut(content) 64 | corpus += content_words 65 | X_words.append(content_words) 66 | Y.append(label2idx[y]) 67 | 68 | tokenizer, word_index, freq_word_index = fit_corpus(corpus,vocab_size=vocab_size) 69 | X = text2idx(tokenizer,X_words,maxlen=maxlen) 70 | y = np.array(Y) 71 | 72 | 73 | 74 | # ========== model traing: ========== 75 | old_list = [] 76 | ls_list = [] 77 | lcm_list = [] 78 | N = 10 79 | for n in range(N): 80 | # randomly split train and test each time: 81 | np.random.seed(n) # 这样保证了每次试验的seed一致 82 | random_indexs = np.random.permutation(range(len(X))) 83 | train_size = int(len(X)*0.6) 84 | val_size = int(len(X)*0.15) 85 | X_train = X[random_indexs][:train_size] 86 | X_val = X[random_indexs][train_size:train_size+val_size] 87 | X_test = X[random_indexs][train_size+val_size:] 88 | y_train = y[random_indexs][:train_size] 89 | y_val = y[random_indexs][train_size:train_size+val_size] 90 | y_test = y[random_indexs][train_size+val_size:] 91 | data_package = [X_train,y_train,X_val,y_val,X_test,y_test] 92 | 93 | 94 | with open('output/%s.txt'%log_txt_name,'a') as f: 95 | print(str(datetime.datetime.now()),file=f) 96 | print('--Round',n+1,file=f) 97 | 98 | print('====Original:============') 99 | basic_model = lstm.LSTM_Basic(maxlen,vocab_size,wvdim,hidden_size,num_classes,None) 100 | best_val_score,val_score_list,final_test_score,final_train_score = basic_model.train_val(data_package,batch_size,epochs) 101 | old_list.append(final_test_score) 102 | with open('output/%s.txt'%log_txt_name,'a') as f: 103 | print(n,'old:',final_train_score,best_val_score,final_test_score,file=f) 104 | 105 | print('====LS:============') 106 | ls_model = lstm.LSTM_LS(maxlen,vocab_size,wvdim,hidden_size,num_classes,None) 107 | best_val_score,val_score_list,final_test_score,final_train_score = ls_model.train_val(data_package,batch_size,epochs) 108 | ls_list.append(final_test_score) 109 | with open('output/%s.txt'%log_txt_name,'a') as f: 110 | print(n,'ls:',final_train_score,best_val_score,final_test_score,file=f) 111 | 112 | print('====LCM:============') 113 | dy_lcm_model = lstm.LSTM_LCM_dynamic(maxlen,vocab_size,wvdim,hidden_size,num_classes,alpha,None,None) 114 | best_val_score,val_score_list,final_test_score,final_train_score = dy_lcm_model.train_val(data_package,batch_size,epochs,lcm_stop=30) 115 | lcm_list.append(final_test_score) 116 | with open('output/%s.txt'%log_txt_name,'a') as f: 117 | 
print(n,'lcm:',final_train_score,best_val_score,final_test_score,file=f) 118 | -------------------------------------------------------------------------------- /models/__pycache__/lstm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/models/__pycache__/lstm.cpython-37.pyc -------------------------------------------------------------------------------- /models/bert.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import tensorflow as tf 4 | import time 5 | from keras.models import Sequential,Model 6 | from keras.layers import Input,Dense,LSTM,Embedding,Conv1D,MaxPooling1D 7 | from keras.layers import Flatten,Dropout,Concatenate,Lambda,Multiply,Reshape,Dot,Bidirectional 8 | import keras.backend as K 9 | from keras.utils import to_categorical 10 | from gensim.models import Word2Vec 11 | from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score 12 | from bert4keras.models import build_transformer_model 13 | from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr 14 | 15 | 16 | class BERT_Basic: 17 | 18 | def __init__(self,config_path,checkpoint_path,hidden_size,num_classes,model_type='bert'): 19 | self.num_classes = num_classes 20 | bert = build_transformer_model(config_path=config_path,checkpoint_path=checkpoint_path, 21 | model=model_type,return_keras_model=False) 22 | text_emb = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) 23 | text_emb = Dense(hidden_size,activation='tanh')(text_emb) 24 | output = Dense(num_classes,activation='softmax')(text_emb) 25 | self.model = Model(bert.model.input,output) 26 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 27 | self.model.compile(loss='categorical_crossentropy',optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1,2000: 0.1})) 28 | 29 | def train_val(self,data_package,batch_size,epochs,save_best=False): 30 | X_token_train, X_seg_train, y_train, X_token_val, X_seg_val, y_val, X_token_test, X_seg_test, y_test = data_package 31 | best_val_score = 0 32 | test_score = 0 33 | train_score_list = [] 34 | val_socre_list = [] 35 | """实验说明: 36 | 每一轮train完,在val上测试,记录其accuracy, 37 | 每当val-acc达到新高,就立马在test上测试,得到test-acc, 38 | 这样最终保留下来的test-acc就是在val上表现最好的模型在test上的accuracy 39 | """ 40 | learning_curve = [] 41 | for i in range(epochs): 42 | t1 = time.time() 43 | self.model.fit([X_token_train,X_seg_train],to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 44 | # record train set result: 45 | pred_probs = self.model.predict([X_token_train, X_seg_train]) 46 | predictions = np.argmax(pred_probs, axis=1) 47 | train_score = round(accuracy_score(y_train, predictions), 5) 48 | train_score_list.append(train_score) 49 | # validation: 50 | pred_probs = self.model.predict([X_token_val,X_seg_val]) 51 | predictions = np.argmax(pred_probs,axis=1) 52 | val_score = round(accuracy_score(y_val, predictions), 5) 53 | val_socre_list.append(val_score) 54 | t2 = time.time() 55 | print('(Orig)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), ' | train acc:', train_score, ' | val acc:', val_score) 56 | # save best model according to validation & test result: 57 | if val_score>best_val_score: 58 | best_val_score = val_score 59 | print('Current Best model!','current epoch:',i+1) 60 | # test on best model: 61 | pred_probs = self.model.predict([X_token_test,X_seg_test]) 62 | 
predictions = np.argmax(pred_probs, axis=1) 63 | test_score = round(accuracy_score(y_test, predictions), 5) 64 | print(' Current Best model! Test score:', test_score) 65 | if save_best: 66 | self.model.save('best_model_bert.h5') 67 | print(' best model saved!') 68 | return train_score_list, val_socre_list, best_val_score, test_score 69 | 70 | 71 | class BERT_LS: 72 | def __init__(self, config_path, checkpoint_path, hidden_size, num_classes, ls_e=0.1, model_type='bert'): 73 | 74 | def ls_loss(y_true, y_pred, e=ls_e): 75 | loss1 = K.categorical_crossentropy(y_true, y_pred) 76 | loss2 = K.categorical_crossentropy(K.ones_like(y_pred) / num_classes, y_pred) 77 | return (1 - e) * loss1 + e * loss2 78 | 79 | self.num_classes = num_classes 80 | bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, 81 | model=model_type, return_keras_model=False) 82 | text_emb = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) 83 | text_emb = Dense(hidden_size, activation='tanh')(text_emb) 84 | output = Dense(num_classes, activation='softmax')(text_emb) 85 | self.model = Model(bert.model.input, output) 86 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 87 | self.model.compile(loss=ls_loss, optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1, 2000: 0.1})) 88 | 89 | def train_val(self, data_package, batch_size, epochs, save_best=False): 90 | X_token_train, X_seg_train, y_train, X_token_val, X_seg_val, y_val, X_token_test, X_seg_test, y_test = data_package 91 | best_val_score = 0 92 | test_score = 0 93 | train_score_list = [] 94 | val_socre_list = [] 95 | """实验说明: 96 | 每一轮train完,在val上测试,记录其accuracy, 97 | 每当val-acc达到新高,就立马在test上测试,得到test-acc, 98 | 这样最终保留下来的test-acc就是在val上表现最好的模型在test上的accuracy 99 | """ 100 | for i in range(epochs): 101 | t1 = time.time() 102 | self.model.fit([X_token_train, X_seg_train], to_categorical(y_train), batch_size=batch_size, verbose=0, 103 | epochs=1) 104 | # record train set result: 105 | pred_probs = self.model.predict([X_token_train, X_seg_train]) 106 | predictions = np.argmax(pred_probs, axis=1) 107 | train_score = round(accuracy_score(y_train, predictions), 5) 108 | train_score_list.append(train_score) 109 | # validation: 110 | pred_probs = self.model.predict([X_token_val, X_seg_val]) 111 | predictions = np.argmax(pred_probs, axis=1) 112 | val_score = round(accuracy_score(y_val, predictions), 5) 113 | val_socre_list.append(val_score) 114 | t2 = time.time() 115 | print('(LS)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), ' | train acc:', train_score, ' | val acc:', 116 | val_score) 117 | # save best model according to validation & test result: 118 | if val_score > best_val_score: 119 | best_val_score = val_score 120 | print('Current Best model!', 'current epoch:', i + 1) 121 | # test on best model: 122 | pred_probs = self.model.predict([X_token_test, X_seg_test]) 123 | predictions = np.argmax(pred_probs, axis=1) 124 | test_score = round(accuracy_score(y_test, predictions), 5) 125 | print(' Current Best model! 
Test score:', test_score) 126 | if save_best: 127 | self.model.save('best_model_bert_ls.h5') 128 | print(' best model saved!') 129 | return train_score_list, val_socre_list, best_val_score, test_score 130 | 131 | 132 | class BERT_LCM: 133 | def __init__(self,config_path,checkpoint_path,hidden_size,num_classes,alpha,wvdim=768,model_type='bert',label_embedding_matrix=None): 134 | self.num_classes = num_classes 135 | 136 | def lcm_loss(y_true,y_pred,alpha=alpha): 137 | pred_probs = y_pred[:,:num_classes] 138 | label_sim_dist = y_pred[:,num_classes:] 139 | simulated_y_true = K.softmax(label_sim_dist+alpha*y_true) 140 | loss1 = -K.categorical_crossentropy(simulated_y_true,simulated_y_true) 141 | loss2 = K.categorical_crossentropy(simulated_y_true,pred_probs) 142 | return loss1+loss2 143 | 144 | def ls_loss(y_true, y_pred, e=0.1): 145 | loss1 = K.categorical_crossentropy(y_true, y_pred) 146 | loss2 = K.categorical_crossentropy(K.ones_like(y_pred)/num_classes, y_pred) 147 | return (1-e)*loss1 + e*loss2 148 | 149 | # text_encoder: 150 | bert = build_transformer_model(config_path=config_path,checkpoint_path=checkpoint_path, 151 | model=model_type,return_keras_model=False) 152 | text_emb = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) 153 | text_emb = Dense(hidden_size,activation='tanh')(text_emb) 154 | pred_probs = Dense(num_classes,activation='softmax')(text_emb) 155 | self.basic_predictor = Model(bert.model.input,pred_probs) 156 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 157 | self.basic_predictor.compile(loss='categorical_crossentropy',optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1,2000: 0.1})) 158 | 159 | 160 | # label_encoder: 161 | label_input = Input(shape=(num_classes,),name='label_input') 162 | if label_embedding_matrix is None: # 不使用pretrained embedding 163 | label_emb = Embedding(num_classes,wvdim,input_length=num_classes,name='label_emb1')(label_input) # (n,wvdim) 164 | else: 165 | label_emb = Embedding(num_classes,wvdim,input_length=num_classes,weights=[label_embedding_matrix],name='label_emb1')(label_input) 166 | # label_emb = Bidirectional(LSTM(hidden_size,return_sequences=True),merge_mode='ave')(label_emb) # (n,d) 167 | label_emb = Dense(hidden_size,activation='tanh',name='label_emb2')(label_emb) 168 | 169 | # similarity part: 170 | doc_product = Dot(axes=(2,1))([label_emb,text_emb]) # (n,d) dot (d,1) --> (n,1) 171 | label_sim_dict = Dense(num_classes,activation='softmax',name='label_sim_dict')(doc_product) 172 | # concat output: 173 | concat_output = Concatenate()([pred_probs,label_sim_dict]) 174 | # compile; 175 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 176 | self.model = Model(bert.model.input+[label_input],concat_output) 177 | self.model.compile(loss=lcm_loss, optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1,2000: 0.1})) 178 | 179 | 180 | def lcm_evaluate(self,model,inputs,y_true): 181 | outputs = model.predict(inputs) 182 | pred_probs = outputs[:,:self.num_classes] 183 | predictions = np.argmax(pred_probs,axis=1) 184 | acc = round(accuracy_score(y_true,predictions),5) 185 | return acc 186 | 187 | def train_val(self, data_package, batch_size,epochs,lcm_stop=50,save_best=False): 188 | X_token_train, X_seg_train, y_train, X_token_val, X_seg_val, y_val, X_token_test, X_seg_test, y_test = data_package 189 | best_val_score = 0 190 | test_score = 0 191 | train_score_list = [] 192 | val_socre_list = [] 193 | """实验说明: 194 | 每一轮train完,在val上测试,记录其accuracy, 195 | 每当val-acc达到新高,就立马在test上测试,得到test-acc, 196 | 
这样最终保留下来的test-acc就是在val上表现最好的模型在test上的accuracy 197 | """ 198 | 199 | L_train = np.array([np.array(range(self.num_classes)) for i in range(len(X_token_train))]) 200 | L_val = np.array([np.array(range(self.num_classes)) for i in range(len(X_token_val))]) 201 | L_test = np.array([np.array(range(self.num_classes)) for i in range(len(X_token_test))]) 202 | 203 | for i in range(epochs): 204 | t1 = time.time() 205 | if i < lcm_stop: 206 | self.model.fit([X_token_train,X_seg_train,L_train],to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 207 | # record train set result: 208 | train_score = self.lcm_evaluate(self.model,[X_token_train,X_seg_train,L_train],y_train) 209 | train_score_list.append(train_score) 210 | # validation: 211 | val_score = self.lcm_evaluate(self.model,[X_token_val,X_seg_val,L_val],y_val) 212 | val_socre_list.append(val_score) 213 | t2 = time.time() 214 | print('(LCM)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), ' | train acc:', train_score, ' | val acc:', 215 | val_score) 216 | # save best model according to validation & test result: 217 | if val_score > best_val_score: 218 | best_val_score = val_score 219 | print('Current Best model!', 'current epoch:', i + 1) 220 | # test on best model: 221 | test_score = self.lcm_evaluate(self.model,[X_token_test,X_seg_test,L_test],y_test) 222 | print(' Current Best model! Test score:', test_score) 223 | if save_best: 224 | self.model.save('best_model_bert_lcm.h5') 225 | print(' best model saved!') 226 | else: 227 | self.basic_predictor.fit([X_token_train,X_seg_train],to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 228 | # record train set result: 229 | pred_probs = self.basic_predictor.predict([X_token_train, X_seg_train]) 230 | predictions = np.argmax(pred_probs, axis=1) 231 | train_score = round(accuracy_score(y_train, predictions),5) 232 | train_score_list.append(train_score) 233 | # validation: 234 | pred_probs = self.basic_predictor.predict([X_token_val, X_seg_val]) 235 | predictions = np.argmax(pred_probs, axis=1) 236 | val_score = round(accuracy_score(y_val, predictions),5) 237 | val_socre_list.append(val_score) 238 | t2 = time.time() 239 | print('(LCM_stopped)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), ' | train acc:', train_score, ' | val acc:', 240 | val_score) 241 | # save best model according to validation & test result: 242 | if val_score > best_val_score: 243 | best_val_score = val_score 244 | print('Current Best model!', 'current epoch:', i + 1) 245 | # test on best model: 246 | pred_probs = self.basic_predictor.predict([X_token_test, X_seg_test]) 247 | predictions = np.argmax(pred_probs, axis=1) 248 | test_score = round(accuracy_score(y_test, predictions),5) 249 | print(' Current Best model! 
Test score:', test_score) 250 | if save_best: 251 | self.model.save('best_model_bert_lcm.h5') 252 | print(' best model saved!') 253 | return train_score_list, val_socre_list, best_val_score, test_score 254 | 255 | 256 | 257 | class BERT_LCM_Att: 258 | """ 259 | 使用self-attention构建的LCM 260 | """ 261 | def __init__(self, config_path, checkpoint_path, hidden_size, num_classes, alpha, wvdim=768, model_type='bert', 262 | label_embedding_matrix=None): 263 | self.num_classes = num_classes 264 | 265 | def lcm_loss(y_true, y_pred, alpha=alpha): 266 | pred_probs = y_pred[:, :num_classes] 267 | label_sim_dist = y_pred[:, num_classes:] 268 | simulated_y_true = K.softmax(label_sim_dist + alpha * y_true) 269 | loss1 = -K.categorical_crossentropy(simulated_y_true, simulated_y_true) 270 | loss2 = K.categorical_crossentropy(simulated_y_true, pred_probs) 271 | return loss1 + loss2 272 | 273 | def ls_loss(y_true, y_pred, e=0.1): 274 | loss1 = K.categorical_crossentropy(y_true, y_pred) 275 | loss2 = K.categorical_crossentropy(K.ones_like(y_pred) / num_classes, y_pred) 276 | return (1 - e) * loss1 + e * loss2 277 | 278 | # text_encoder: 279 | bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, 280 | model=model_type, return_keras_model=False) 281 | text_emb = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) 282 | text_emb = Dense(hidden_size, activation='tanh')(text_emb) 283 | pred_probs = Dense(num_classes, activation='softmax')(text_emb) 284 | self.basic_predictor = Model(bert.model.input, pred_probs) 285 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 286 | self.basic_predictor.compile(loss='categorical_crossentropy', 287 | optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1, 2000: 0.1})) 288 | 289 | # label_encoder by attention: 290 | label_input = Input(shape=(num_classes,), name='label_input') 291 | if label_embedding_matrix is None: # 不使用pretrained embedding 292 | label_emb = Embedding(num_classes, wvdim, input_length=num_classes, name='label_emb1')( 293 | label_input) # (n,wvdim) 294 | else: 295 | label_emb = Embedding(num_classes, wvdim, input_length=num_classes, weights=[label_embedding_matrix], 296 | name='label_emb1')(label_input) 297 | # self-attention part: 298 | doc_attention = Lambda(lambda pair: tf.matmul(pair[0],pair[1],transpose_b=True))([label_emb,label_emb]) # (n,d) * (d,n) -> (n,n) 299 | attention_scores = Lambda(lambda x:tf.nn.softmax(x))(doc_attention) # (n,n) 300 | self_attention_seq = Lambda(lambda pair: tf.matmul(pair[0],pair[1]))([attention_scores, label_emb]) # (n,n)*(n,d)->(n,d) 301 | 302 | # similarity part: 303 | doc_product = Dot(axes=(2, 1))([self_attention_seq, text_emb]) # (n,d) dot (d,1) --> (n,1) 304 | label_sim_dist = Lambda(lambda x: tf.nn.softmax(x), name='label_sim_dict')(doc_product) 305 | # concat output: 306 | concat_output = Concatenate()([pred_probs, label_sim_dist]) 307 | # compile; 308 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 309 | self.model = Model(bert.model.input + [label_input], concat_output) 310 | self.model.compile(loss=lcm_loss, optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1, 2000: 0.1})) 311 | 312 | def lcm_evaluate(self, model, inputs, y_true): 313 | outputs = model.predict(inputs) 314 | pred_probs = outputs[:, :self.num_classes] 315 | predictions = np.argmax(pred_probs, axis=1) 316 | acc = round(accuracy_score(y_true, predictions), 5) 317 | return acc 318 | 319 | def train_val(self, data_package, batch_size, epochs, lcm_stop=50, save_best=False): 
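        # Two-phase training, mirroring BERT_LCM.train_val: for the first `lcm_stop` epochs the
        # full LCM model (classifier + label-confusion branch) is trained with lcm_loss; after
        # that, only self.basic_predictor is trained further with plain categorical cross-entropy.
        # Each epoch is evaluated on the validation set, and whenever a new best validation
        # accuracy appears the model is re-evaluated on the test set, so the returned test_score
        # belongs to the best-on-validation epoch.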
320 | X_token_train, X_seg_train, y_train, X_token_val, X_seg_val, y_val, X_token_test, X_seg_test, y_test = data_package 321 | best_val_score = 0 322 | test_score = 0 323 | train_score_list = [] 324 | val_socre_list = [] 325 | """实验说明: 326 | 每一轮train完,在val上测试,记录其accuracy, 327 | 每当val-acc达到新高,就立马在test上测试,得到test-acc, 328 | 这样最终保留下来的test-acc就是在val上表现最好的模型在test上的accuracy 329 | """ 330 | 331 | L_train = np.array([np.array(range(self.num_classes)) for i in range(len(X_token_train))]) 332 | L_val = np.array([np.array(range(self.num_classes)) for i in range(len(X_token_val))]) 333 | L_test = np.array([np.array(range(self.num_classes)) for i in range(len(X_token_test))]) 334 | 335 | for i in range(epochs): 336 | t1 = time.time() 337 | if i < lcm_stop: 338 | self.model.fit([X_token_train, X_seg_train, L_train], to_categorical(y_train), batch_size=batch_size, 339 | verbose=0, epochs=1) 340 | # record train set result: 341 | train_score = self.lcm_evaluate(self.model, [X_token_train, X_seg_train, L_train], y_train) 342 | train_score_list.append(train_score) 343 | # validation: 344 | val_score = self.lcm_evaluate(self.model, [X_token_val, X_seg_val, L_val], y_val) 345 | val_socre_list.append(val_score) 346 | t2 = time.time() 347 | print('(LCM)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), ' | train acc:', train_score, ' | val acc:', 348 | val_score) 349 | # save best model according to validation & test result: 350 | if val_score > best_val_score: 351 | best_val_score = val_score 352 | print('Current Best model!', 'current epoch:', i + 1) 353 | # test on best model: 354 | test_score = self.lcm_evaluate(self.model, [X_token_test, X_seg_test, L_test], y_test) 355 | print(' Current Best model! Test score:', test_score) 356 | if save_best: 357 | self.model.save('best_model_bert_lcm.h5') 358 | print(' best model saved!') 359 | else: 360 | self.basic_predictor.fit([X_token_train, X_seg_train], to_categorical(y_train), batch_size=batch_size, 361 | verbose=0, epochs=1) 362 | # record train set result: 363 | pred_probs = self.basic_predictor.predict([X_token_train, X_seg_train]) 364 | predictions = np.argmax(pred_probs, axis=1) 365 | train_score = round(accuracy_score(y_train, predictions), 5) 366 | train_score_list.append(train_score) 367 | # validation: 368 | pred_probs = self.basic_predictor.predict([X_token_val, X_seg_val]) 369 | predictions = np.argmax(pred_probs, axis=1) 370 | val_score = round(accuracy_score(y_val, predictions), 5) 371 | val_socre_list.append(val_score) 372 | t2 = time.time() 373 | print('(LCM_stopped)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), ' | train acc:', train_score, 374 | ' | val acc:', 375 | val_score) 376 | # save best model according to validation & test result: 377 | if val_score > best_val_score: 378 | best_val_score = val_score 379 | print('Current Best model!', 'current epoch:', i + 1) 380 | # test on best model: 381 | pred_probs = self.basic_predictor.predict([X_token_test, X_seg_test]) 382 | predictions = np.argmax(pred_probs, axis=1) 383 | test_score = round(accuracy_score(y_test, predictions), 5) 384 | print(' Current Best model! 
Test score:', test_score) 385 | if save_best: 386 | self.model.save('best_model_bert_lcm.h5') 387 | print(' best model saved!') 388 | return train_score_list, val_socre_list, best_val_score, test_score -------------------------------------------------------------------------------- /models/lstm.py: -------------------------------------------------------------------------------- 1 | # checked at 2020.9.14 2 | import numpy as np 3 | import time 4 | import keras 5 | import tensorflow as tf 6 | from keras.models import Sequential,Model 7 | from keras.layers import Input,Dense,LSTM,Embedding,Conv1D,MaxPooling1D 8 | from keras.layers import Flatten,Dropout,Concatenate,Lambda,Multiply,Reshape,Dot,Bidirectional 9 | import keras.backend as K 10 | from keras.utils import to_categorical 11 | from gensim.models import Word2Vec 12 | from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score 13 | 14 | 15 | class LSTM_Basic: 16 | """ 17 | input->embedding->lstm->softmax_dense 18 | """ 19 | def __init__(self,maxlen,vocab_size,wvdim,hidden_size,num_classes,embedding_matrix=None): 20 | text_input = Input(shape=(maxlen,),name='text_input') 21 | if embedding_matrix is None: # 不使用pretrained embedding 22 | input_emb = Embedding(vocab_size+1,wvdim,input_length=maxlen,name='text_emb')(text_input) #(V,wvdim) 23 | else: 24 | input_emb = Embedding(vocab_size+1,wvdim,input_length=maxlen,weights=[embedding_matrix],name='text_emb')(text_input) #(V,wvdim) 25 | input_vec = LSTM(hidden_size)(input_emb) 26 | pred_probs = Dense(num_classes,activation='softmax',name='pred_probs')(input_vec) 27 | self.model = Model(inputs=text_input,outputs=pred_probs) 28 | self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 29 | 30 | def train_val(self,data_package,batch_size,epochs,save_best=False): 31 | X_train,y_train,X_val,y_val,X_test,y_test = data_package 32 | best_val_score = 0 33 | final_test_score = 0 34 | final_train_score = 0 35 | val_socre_list = [] 36 | """实验说明: 37 | 每一轮train完,在val上测试,记录其accuracy, 38 | 每当val-acc达到新高,就立马在test上测试,得到test-acc, 39 | 这样最终保留下来的test-acc就是在val上表现最好的模型在test上的accuracy 40 | """ 41 | for i in range(epochs): 42 | t1 = time.time() 43 | self.model.fit(X_train,to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 44 | pred_probs = self.model.predict(X_val) 45 | predictions = np.argmax(pred_probs,axis=1) 46 | val_score = round(accuracy_score(y_val,predictions),5) 47 | t2 = time.time() 48 | print('(Orig)Epoch',i+1,'| time: %.3f s'%(t2-t1),'| current val accuracy:',val_score) 49 | if val_score>best_val_score: 50 | best_val_score = val_score 51 | # 使用当前val上最好模型进行test: 52 | pred_probs = self.model.predict(X_test) 53 | predictions = np.argmax(pred_probs,axis=1) 54 | final_test_score = round(accuracy_score(y_test,predictions),5) 55 | print(' Current Best model! Test score:',final_test_score) 56 | # 同时记录一下train上的score: 57 | pred_probs = self.model.predict(X_train) 58 | predictions = np.argmax(pred_probs, axis=1) 59 | final_train_score = round(accuracy_score(y_train, predictions),5) 60 | print(' Current Best model! 
Train score:', final_train_score) 61 | if save_best: 62 | self.model.save('best_model_lstm.h5') 63 | print(' best model saved!') 64 | val_socre_list.append(val_score) 65 | return best_val_score,val_socre_list,final_test_score,final_train_score 66 | 67 | 68 | class LSTM_LS: 69 | """ 70 | input->embedding->lstm->softmax_dense 71 | """ 72 | def __init__(self,maxlen,vocab_size,wvdim,hidden_size,num_classes,embedding_matrix=None): 73 | def ls_loss(y_true, y_pred, e=0.1): 74 | loss1 = K.categorical_crossentropy(y_true, y_pred) 75 | loss2 = K.categorical_crossentropy(K.ones_like(y_pred)/num_classes, y_pred) 76 | return (1-e)*loss1 + e*loss2 77 | 78 | text_input = Input(shape=(maxlen,),name='text_input') 79 | if embedding_matrix is None: # 不使用pretrained embedding 80 | input_emb = Embedding(vocab_size+1,wvdim,input_length=maxlen,name='text_emb')(text_input) #(V,wvdim) 81 | else: 82 | input_emb = Embedding(vocab_size+1,wvdim,input_length=maxlen,weights=[embedding_matrix],name='text_emb')(text_input) #(V,wvdim) 83 | input_vec = LSTM(hidden_size)(input_emb) 84 | pred_probs = Dense(num_classes,activation='softmax',name='pred_probs')(input_vec) 85 | self.model = Model(inputs=text_input,outputs=pred_probs) 86 | self.model.compile(loss=ls_loss, optimizer='adam', metrics=['accuracy']) 87 | 88 | 89 | def train_val(self,data_package,batch_size,epochs,save_best=False): 90 | X_train,y_train,X_val,y_val,X_test,y_test = data_package 91 | best_val_score = 0 92 | final_test_score = 0 93 | final_train_score = 0 94 | val_score_list = [] 95 | for i in range(epochs): 96 | t1 = time.time() 97 | self.model.fit(X_train,to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 98 | # val: 99 | pred_probs = self.model.predict(X_val) 100 | predictions = np.argmax(pred_probs,axis=1) 101 | val_score = round(accuracy_score(y_val,predictions),5) 102 | t2 = time.time() 103 | print('(LS)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), '| current val accuracy:', val_score) 104 | if val_score>best_val_score: 105 | best_val_score = val_score 106 | # 使用当前val上最好模型进行test: 107 | pred_probs = self.model.predict(X_test) 108 | predictions = np.argmax(pred_probs, axis=1) 109 | final_test_score = round(accuracy_score(y_test, predictions),5) 110 | print(' Current Best model! Test score:', final_test_score) 111 | # 同时记录一下train上的score: 112 | pred_probs = self.model.predict(X_train) 113 | predictions = np.argmax(pred_probs, axis=1) 114 | final_train_score = round(accuracy_score(y_train, predictions),5) 115 | print(' Current Best model! 
Train score:', final_train_score) 116 | if save_best: 117 | self.model.save('best_model_ls.h5') 118 | print(' best model saved!') 119 | val_score_list.append(val_score) 120 | return best_val_score,val_score_list,final_test_score,final_train_score 121 | 122 | 123 | class LSTM_LCM_dynamic: 124 | """ 125 | LCM dynamic,跟LCM的主要差别在于: 126 | 1.可以设置early stop,即设置在某一个epoch就停止LCM的作用; 127 | 2.在停止使用LCM之后,可以选择是否使用label smoothing来计算loss。 128 | """ 129 | def __init__(self,maxlen,vocab_size,wvdim,hidden_size,num_classes,alpha,default_loss='ls',text_embedding_matrix=None,label_embedding_matrix=None): 130 | self.num_classes = num_classes 131 | 132 | def lcm_loss(y_true,y_pred,alpha=alpha): 133 | pred_probs = y_pred[:,:num_classes] 134 | label_sim_dist = y_pred[:,num_classes:] 135 | simulated_y_true = K.softmax(label_sim_dist+alpha*y_true) 136 | loss1 = -K.categorical_crossentropy(simulated_y_true,simulated_y_true) 137 | loss2 = K.categorical_crossentropy(simulated_y_true,pred_probs) 138 | return loss1+loss2 139 | 140 | def ls_loss(y_true, y_pred, e=0.1): 141 | loss1 = K.categorical_crossentropy(y_true, y_pred) 142 | loss2 = K.categorical_crossentropy(K.ones_like(y_pred)/num_classes, y_pred) 143 | return (1-e)*loss1 + e*loss2 144 | 145 | # basic_predictor: 146 | text_input = Input(shape=(maxlen,),name='text_input') 147 | if text_embedding_matrix is None: 148 | input_emb = Embedding(vocab_size+1,wvdim,input_length=maxlen,name='text_emb')(text_input) #(V,wvdim) 149 | else: 150 | input_emb = Embedding(vocab_size+1,wvdim,input_length=maxlen,weights=[text_embedding_matrix],name='text_emb')(text_input) #(V,wvdim) 151 | input_vec = LSTM(hidden_size)(input_emb) 152 | pred_probs = Dense(num_classes,activation='softmax',name='pred_probs')(input_vec) 153 | self.basic_predictor = Model(input=text_input,output=pred_probs) 154 | if default_loss == 'ls': 155 | self.basic_predictor.compile(loss=ls_loss, optimizer='adam') 156 | else: 157 | self.basic_predictor.compile(loss='categorical_crossentropy', optimizer='adam') 158 | 159 | # LCM: 160 | label_input = Input(shape=(num_classes,),name='label_input') 161 | label_emb = Embedding(num_classes,wvdim,input_length=num_classes,name='label_emb1')(label_input) # (n,wvdim) 162 | label_emb = Dense(hidden_size,activation='tanh',name='label_emb2')(label_emb) 163 | # similarity part: 164 | doc_product = Dot(axes=(2,1))([label_emb,input_vec]) # (n,d) dot (d,1) --> (n,1) 165 | label_sim_dict = Dense(num_classes,activation='softmax',name='label_sim_dict')(doc_product) 166 | # concat output: 167 | concat_output = Concatenate()([pred_probs,label_sim_dict]) 168 | # compile; 169 | self.model = Model(inputs=[text_input,label_input],outputs=concat_output) 170 | self.model.compile(loss=lcm_loss, optimizer='adam') 171 | 172 | def my_evaluator(self,model,inputs,label_list): 173 | outputs = model.predict(inputs) 174 | pred_probs = outputs[:,:self.num_classes] 175 | predictions = np.argmax(pred_probs,axis=1) 176 | acc = round(accuracy_score(label_list,predictions),5) 177 | # recall = recall_score(label_list,predictions,average='weighted') 178 | # f1 = f1_score(label_list,predictions,average='weighted') 179 | return acc 180 | 181 | def train_val(self,data_package,batch_size,epochs,lcm_stop=50,save_best=False): 182 | X_train,y_train,X_val,y_val,X_test,y_test = data_package 183 | L_train = np.array([np.array(range(self.num_classes)) for i in range(len(X_train))]) 184 | L_val = np.array([np.array(range(self.num_classes)) for i in range(len(X_val))]) 185 | L_test = 
np.array([np.array(range(self.num_classes)) for i in range(len(X_test))]) 186 | best_val_score = 0 187 | final_test_score = 0 188 | final_train_score = 0 189 | val_score_list = [] 190 | for i in range(epochs): 191 | if i < lcm_stop: 192 | t1 = time.time() 193 | self.model.fit([X_train,L_train],to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 194 | val_score = self.my_evaluator(self.model,[X_val,L_val],y_val) 195 | t2 = time.time() 196 | print('(LCM)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), '| current val accuracy:', val_score) 197 | if val_score>best_val_score: 198 | best_val_score = val_score 199 | # test: 200 | final_test_score = self.my_evaluator(self.model,[X_test,L_test],y_test) 201 | print(' Current Best model! Test score:',final_test_score) 202 | # train: 203 | final_train_score = self.my_evaluator(self.model, [X_train, L_train], y_train) 204 | print(' Current Best model! Train score:', final_train_score) 205 | if save_best: 206 | self.model.save('best_model.h5') 207 | print('best model saved!') 208 | val_score_list.append(val_score) 209 | else: # 停止LCM的作用 210 | t1 = time.time() 211 | self.basic_predictor.fit(X_train,to_categorical(y_train),batch_size=batch_size,verbose=0,epochs=1) 212 | pred_probs = self.basic_predictor.predict(X_val) 213 | predictions = np.argmax(pred_probs,axis=1) 214 | val_score = round(accuracy_score(y_val,predictions),5) 215 | t2 = time.time() 216 | print('(LCM-stop)Epoch', i + 1, '| time: %.3f s' % (t2 - t1), '| current val accuracy:', val_score) 217 | if val_score>best_val_score: 218 | best_val_score = val_score 219 | # test: 220 | final_test_score = self.my_evaluator(self.model, [X_test, L_test], y_test) 221 | print(' Current Best model! Test score:', final_test_score) 222 | # train: 223 | final_train_score = self.my_evaluator(self.model, [X_train, L_train], y_train) 224 | print(' Current Best model! 
Train score:', final_train_score) 225 | if save_best: 226 | self.model.save('best_model_lcm.h5') 227 | print(' best model saved!') 228 | val_score_list.append(val_score) 229 | return best_val_score,val_score_list,final_test_score,final_train_score 230 | 231 | 232 | 233 | 234 | # ================================= 235 | # class LSTM_LCM: 236 | # 237 | # def __init__(self, maxlen, vocab_size, wvdim, hidden_size, num_classes, alpha, text_embedding_matrix=None, 238 | # label_embedding_matrix=None): 239 | # self.num_classes = num_classes 240 | # 241 | # def lcm_loss(y_true, y_pred, alpha=alpha): 242 | # pred_probs = y_pred[:, :num_classes] 243 | # label_sim_dist = y_pred[:, num_classes:] 244 | # simulated_y_true = K.softmax(label_sim_dist + alpha * y_true) 245 | # loss1 = -K.categorical_crossentropy(simulated_y_true, simulated_y_true) 246 | # loss2 = K.categorical_crossentropy(simulated_y_true, pred_probs) 247 | # return loss1 + loss2 248 | # 249 | # # text_encoder: 250 | # text_input = Input(shape=(maxlen,), name='text_input') 251 | # if text_embedding_matrix is None: 252 | # input_emb = Embedding(vocab_size + 1, wvdim, input_length=maxlen, name='text_emb')(text_input) # (V,wvdim) 253 | # else: 254 | # input_emb = Embedding(vocab_size + 1, wvdim, input_length=maxlen, weights=[text_embedding_matrix], 255 | # name='text_emb')(text_input) # (V,wvdim) 256 | # input_vec = LSTM(hidden_size)(input_emb) 257 | # pred_probs = Dense(num_classes, activation='softmax', name='pred_probs')(input_vec) 258 | # # label_encoder: 259 | # label_input = Input(shape=(num_classes,), name='label_input') 260 | # if label_embedding_matrix is None: # 不使用pretrained embedding 261 | # label_emb = Embedding(num_classes, wvdim, input_length=num_classes, name='label_emb')( 262 | # label_input) # (n,wvdim) 263 | # else: 264 | # label_emb = Embedding(num_classes, wvdim, input_length=num_classes, weights=[label_embedding_matrix], 265 | # name='label_emb')(label_input) 266 | # label_emb = Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode='ave')(label_emb) # (n,d) 267 | # # similarity part: 268 | # doc_product = Dot(axes=(2, 1))([label_emb, input_vec]) # (n,d) dot (d,1) --> (n,1) 269 | # label_sim_dict = Dense(num_classes, activation='softmax', name='label_sim_dict')(doc_product) 270 | # # concat output: 271 | # concat_output = Concatenate()([pred_probs, label_sim_dict]) 272 | # # compile; 273 | # self.model = Model(inputs=[text_input, label_input], outputs=concat_output) 274 | # self.model.compile(loss=lcm_loss, optimizer='adam') 275 | # 276 | # def my_evaluator(self, model, inputs, label_list): 277 | # outputs = model.predict(inputs) 278 | # pred_probs = outputs[:, :self.num_classes] 279 | # predictions = np.argmax(pred_probs, axis=1) 280 | # acc = accuracy_score(label_list, predictions) 281 | # recall = recall_score(label_list, predictions, average='weighted') 282 | # f1 = f1_score(label_list, predictions, average='weighted') 283 | # return acc, recall, f1 284 | # 285 | # def train_val(self, data_package, batch_size, epochs, metric='accuracy', save_best=False): 286 | # X_train, y_train, X_val, y_val, X_test, y_test = data_package 287 | # L_train = np.array([np.array(range(self.num_classes)) for i in range(len(X_train))]) 288 | # L_val = np.array([np.array(range(self.num_classes)) for i in range(len(X_val))]) 289 | # L_test = np.array([np.array(range(self.num_classes)) for i in range(len(X_test))]) 290 | # best_val_score = 0 291 | # learning_curve = [] 292 | # for i in range(epochs): 293 | # 
self.model.fit([X_train, L_train], to_categorical(y_train), batch_size=batch_size, verbose=1, epochs=1) 294 | # acc, recall, f1 = self.my_evaluator(self.model, [X_val, L_val], y_val) 295 | # if metric == 'accuracy': 296 | # score = acc 297 | # print('Epoch', i + 1, '| current val %s:' % metric, score) 298 | # if score > best_val_score: 299 | # best_val_score = score 300 | # # test: 301 | # test_score, test_recall, test_f1 = self.my_evaluator(self.model, [X_test, L_test], y_test) 302 | # print('Current Best model! Test score:', test_score, 'current epoch:', i + 1) 303 | # if save_best: 304 | # self.model.save('best_model.h5') 305 | # print('best model saved!') 306 | # learning_curve.append(score) 307 | # return best_val_score, learning_curve, test_score 308 | -------------------------------------------------------------------------------- /output/placeholder.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/label_confusion_learning/89a4eebfd5d947ea5ff1e87854209a43c7b8fde5/output/placeholder.txt -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from keras.preprocessing.text import text_to_word_sequence,Tokenizer 2 | from keras.preprocessing.sequence import pad_sequences 3 | import numpy as np 4 | from numpy.random import shuffle 5 | import pandas as pd 6 | import jieba 7 | import re 8 | import time 9 | import copy 10 | 11 | # ===================preprocessing:============================== 12 | def load_dataset(name): 13 | assert name in ['20NG','AG','DBP','FDU','THU'], "name only supports '20NG','AG','DBP','FDU','THU', but your input is %s"%name 14 | 15 | if name == '20NG': 16 | num_classes = 20 17 | df1 = pd.read_csv('datasets/20NG/20ng-train-all-terms.csv') 18 | df2 = pd.read_csv('datasets/20NG/20ng-test-all-terms.csv') 19 | df = pd.concat([df1,df2]) 20 | comp_group = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 21 | 'comp.windows.x'] 22 | rec_group = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'] 23 | talk_group = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] 24 | sci_group = ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'] 25 | other_group = ['alt.atheism', 'misc.forsale', 'soc.religion.christian'] 26 | label_groups = [comp_group,rec_group,talk_group] 27 | return df, num_classes,label_groups 28 | if name == 'AG': 29 | num_classes = 4 # DBP:14 AG:4 30 | df1 = pd.read_csv('../datasets/AG_news/train.csv', header=None, index_col=None) 31 | df2 = pd.read_csv('../datasets/AG_news/test.csv', header=None, index_col=None) 32 | df1.columns = ['label', 'title', 'content'] 33 | df2.columns = ['label', 'title', 'content'] 34 | df = pd.concat([df1, df2]) 35 | label_groups = [] 36 | return df,num_classes,label_groups 37 | if name == 'DBP': 38 | num_classes = 14 39 | df1 = pd.read_csv('../datasets/DBPedia/train.csv', header=None, index_col=None) 40 | df2 = pd.read_csv('../datasets/DBPedia/test.csv', header=None, index_col=None) 41 | df1.columns = ['label', 'title', 'content'] 42 | df2.columns = ['label', 'title', 'content'] 43 | df = pd.concat([df1, df2]) 44 | label_groups = [] 45 | return df,num_classes,label_groups 46 | if name == 'FDU': 47 | num_classes = 20 48 | df = pd.read_csv('../datasets/fudan_news.csv') 49 | label_groups = [] 50 | return df, 
num_classes,label_groups 51 | if name == 'THU': 52 | num_classes = 13 53 | df = pd.read_csv('../datasets/thucnews_subset.csv') 54 | label_groups = [] 55 | return df, num_classes,label_groups 56 | 57 | def create_shuffled_labels(y, label_groups, label2idx, rate): 58 | np.random.seed(0) 59 | """ 60 | y: the list of label index 61 | label_groups: each group is a list of label names to shuffle 62 | rate: noise rate, note that this isn't the error rate, since the some labels may remain unchanged after shuffling 63 | """ 64 | y_orig = list(y)[:] 65 | y = np.array(y) 66 | count = 0 67 | for labels in label_groups: 68 | label_ids = [label2idx[l] for l in labels] 69 | # find out the indexes of your target labels to add noise 70 | indexes = [i for i in range(len(y)) if y[i] in label_ids] 71 | shuffle(indexes) 72 | partial_indexes = indexes[:int(rate*len(indexes))] 73 | count += len(partial_indexes) 74 | shuffled_partial_indexes = partial_indexes[:] 75 | shuffle(shuffled_partial_indexes) 76 | y[partial_indexes] = y[shuffled_partial_indexes] 77 | errors = list(np.array(y_orig) & np.array(y)).count(0) 78 | shuffle_rate = count/len(y) 79 | error_rate = errors/len(y) 80 | return shuffle_rate,error_rate,y 81 | 82 | 83 | def create_asy_noise_labels(y, label_groups, label2idx, rate): 84 | np.random.seed(0) 85 | """ 86 | y: the list of label index 87 | label_groups: each group is a list of label names to exchange within groups 88 | rate: noise rate, in this mode, each selected label will change to another random label in the same group 89 | """ 90 | y_orig = list(y)[:] 91 | y = np.array(y) 92 | count = 0 93 | for labels in label_groups: 94 | label_ids = [label2idx[l] for l in labels] 95 | # find out the indexes of your target labels to add noise 96 | indexes = [i for i in range(len(y)) if y[i] in label_ids] 97 | shuffle(indexes) 98 | partial_indexes = indexes[:int(rate * len(indexes))] 99 | count += len(partial_indexes) 100 | # find out the indexes of your target labels to add noise 101 | for idx in partial_indexes: 102 | if y[idx] in label_ids: 103 | # randomly change to another label in the same group 104 | other_label_ids = label_ids[:] 105 | other_label_ids.remove(y[idx]) 106 | y[idx] = np.random.choice(other_label_ids) 107 | errors = len(y) - list(np.array(y_orig) - np.array(y)).count(0) 108 | shuffle_rate = count / len(y) 109 | error_rate = errors / len(y) 110 | return shuffle_rate, error_rate, y 111 | 112 | def remove_punctuations(text): 113 | return re.sub('[,。:;’‘“”?!、,.!?\'\"\n\t]','',text) 114 | 115 | 116 | def fit_corpus(corpus,vocab_size=None): 117 | """ 118 | corpus 为分好词的语料库 119 | """ 120 | print("Start fitting the corpus......") 121 | t = Tokenizer(vocab_size) # 要使得文本向量化时省略掉低频词,就要设置这个参数 122 | tik = time.time() 123 | t.fit_on_texts(corpus) # 在所有的评论数据集上训练,得到统计信息 124 | tok = time.time() 125 | word_index = t.word_index # 不受vocab_size的影响 126 | print('all_vocab_size',len(word_index)) 127 | print("Fitting time: ",(tok-tik),'s') 128 | freq_word_index = {} 129 | if vocab_size is not None: 130 | print("Creating freq-word_index...") 131 | x = list(t.word_counts.items()) 132 | s = sorted(x,key=lambda p:p[1],reverse=True) 133 | freq_word_index = copy.deepcopy(word_index) # 防止原来的字典也被改变了 134 | for item in s[vocab_size:]: 135 | freq_word_index.pop(item[0]) 136 | print("Finished!") 137 | return t,word_index,freq_word_index 138 | 139 | 140 | def text2idx(tokenizer,text,maxlen): 141 | """ 142 | text 是一个列表,每个元素为一个文档的分词 143 | """ 144 | print("Start vectorizing the sentences.......") 145 | X = 
tokenizer.texts_to_sequences(text) # 受vocab_size的影响 146 | print("Start padding......") 147 | pad_X = pad_sequences(X,maxlen=maxlen,padding='post') 148 | print("Finished!") 149 | return pad_X 150 | 151 | 152 | def create_embedding_matrix(wvmodel,vocab_size,emb_dim,word_index): 153 | """ 154 | vocab_size 为词汇表大小,一般为词向量的词汇量 155 | emb_dim 为词向量维度 156 | word_index 为词和其index对应的查询词典 157 | """ 158 | embedding_matrix = np.random.uniform(size=(vocab_size+1,emb_dim)) # +1是要留一个给index=0 159 | print("Transfering to the embedding matrix......") 160 | # sorted_small_index = sorted(list(small_word_index.items()),key=lambda x:x[1]) 161 | for word,index in word_index.items(): 162 | try: 163 | word_vector = wvmodel[word] 164 | embedding_matrix[index] = word_vector 165 | except Exception as e: 166 | print(e,"Use random embedding instead.") 167 | print("Finished!") 168 | print("Embedding matrix shape:\n",embedding_matrix.shape) 169 | return embedding_matrix 170 | 171 | 172 | def label2idx(label_list): 173 | label_dict = {} 174 | unique_labels = list(set(label_list)) 175 | for i,each in enumerate(unique_labels): 176 | label_dict[each] = i 177 | new_label_list = [] 178 | for label in label_list: 179 | new_label_list.append(label_dict[label]) 180 | return new_label_list,label_dict --------------------------------------------------------------------------------
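A short usage sketch (not part of the repository) showing how the preprocessing helpers in `utils.py` fit together, in the same way `lcm_exp_on_lstm.py` uses them; the toy documents and the `vocab_size`/`maxlen` values below are illustrative assumptions only:

    from utils import fit_corpus, text2idx

    docs = ["the cat sat on the mat", "dogs chase cats"]       # toy documents (illustrative)
    X_words = [d.split(' ') for d in docs]                     # one word list per document
    corpus = [w for words in X_words for w in words]           # flat word list used to fit the tokenizer

    tokenizer, word_index, freq_word_index = fit_corpus(corpus, vocab_size=20000)
    X = text2idx(tokenizer, X_words, maxlen=100)               # padded index matrix, shape (len(docs), 100)
    print(X.shape)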