├── README.md ├── config.py ├── dataUtils.py ├── main.py ├── model.py └── torch ├── CM_Model.py ├── ERD.py ├── ERD_CN.py ├── RDM_Model.py ├── dataUtils.py └── dataUtils_CN.py /README.md: -------------------------------------------------------------------------------- 1 | ## Early Rumour Detection 2 | Rumours can spread quickly through social media, and malicious ones 3 | can bring about significant economical and social impact. Motivated by 4 | this, our paper focuses on the task of rumour detection; particularly, 5 | we are interested in understanding how early we can detect 6 | them. To address this, we present a novel methodology for early rumour 7 | detection.Here is the code based on our approach. 8 | 9 | ### Requirement 10 | Python 3.6 11 | 12 | TensorFlow 1.13 13 | 14 | ### DataSet 15 | 16 | Two DataSets can be used to evaluate our model. 17 | 18 | Weibo DataSet: http://alt.qcri.org/~wgao/data/rumdect.zip 19 | 20 | Twitter DataSet: https://figshare.com/articles/PHEME_dataset_of_rumours_and_non-rumours/4010619 21 | 22 | ### Usage 23 | 1. Download Twitter DataSet and extract, set the DataSet path to the `data_file_path` in `config.py`. 24 | 25 | 2. Download glove word vectors: http://nlp.stanford.edu/data/glove.840B.300d.zip, and set the `w2v_file_path` in `config.py`. 26 | 27 | 3. Run `python main.py` to train and evaluate the model. 28 | 29 | ## Early Rumour Detection (Torch) 30 | If there are problems with the codes, you can try the newly uploaded torch codes by Menglong Lu. 31 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | tf.flags.DEFINE_string("w2v_file_path", "w2v_300", "w2v file") 5 | tf.flags.DEFINE_string("data_file_path", "D:\\pheme-rnr-dataset", "data_file") 6 | 7 | tf.flags.DEFINE_integer("post_fn", 2, "Fixed Number of Posts") 8 | tf.flags.DEFINE_integer("time_limit", 48, "Posts Time Limitation (The Posts in 48 Hours)") 9 | 10 | tf.flags.DEFINE_integer("batch_size", 50, "Batch size (default: 50)") 11 | tf.flags.DEFINE_integer("hidden_dim", 100, "Dimensionality of hidden states (default: 100)") 12 | tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embedding (default: 300)") 13 | tf.flags.DEFINE_integer("max_seq_len", 100, "Max length of sequence (default: 300)") 14 | tf.flags.DEFINE_integer("max_sent_len", 100, "Max length of sentence (default: 300)") 15 | 16 | tf.flags.DEFINE_integer("class_num", 2, "#Class (Non-Rumor, Rumor)") 17 | tf.flags.DEFINE_integer("action_num", 2, "#Action (Continue, Stop)") 18 | 19 | # RL parameters; 20 | tf.flags.DEFINE_float("random_rate", 0.01, "RL Random Action Rate") 21 | tf.flags.DEFINE_integer("OBSERVE", 1000, "OBSERVE BEFORE TRAIN") 22 | tf.flags.DEFINE_integer("max_memory", 80000, "Max memory size") 23 | tf.flags.DEFINE_float("reward_rate", 0.2, "reward rate") 24 | 25 | FLAGS = tf.flags.FLAGS 26 | -------------------------------------------------------------------------------- /dataUtils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import datetime 5 | import numpy as np 6 | import gensim 7 | from config import * 8 | import random 9 | import math 10 | 11 | files = [] 12 | data = {} 13 | data_ID = [] 14 | data_len = [] 15 | data_y = [] 16 | word2vec = gensim.models.KeyedVectors.load('word2vec.model') 17 | reward_counter = 0 18 | eval_flag = 0 19 | 20 | 21 | def get_curtime(): 22 | return time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) 23 | 24 | 25 | def list_files(data_path): 26 | global data, files 27 | fs = os.listdir(data_path) 28 | for f1 in fs: 29 | tmp_path = os.path.join(data_path, f1) 30 | if not os.path.isdir(tmp_path): 31 | if tmp_path.split('.')[-1] == 'json': 32 | files.append(tmp_path) 33 | else: 34 | list_files(tmp_path) 35 | 36 | 37 | def str2timestamp(str_time): 38 | month = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 39 | 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 40 | 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'} 41 | ss = str_time.split(' ') 42 | m_time = ss[5] + "-" + month[ss[1]] + '-' + ss[2] + ' ' + ss[3] 43 | d = datetime.datetime.strptime(m_time, "%Y-%m-%d %H:%M:%S") 44 | t = d.timetuple() 45 | timeStamp = int(time.mktime(t)) 46 | return timeStamp 47 | 48 | 49 | def data_process(file_path): 50 | ret = {} 51 | ss = file_path.split("\\") 52 | data = json.load(open(file_path, mode="r", encoding="utf-8")) 53 | 54 | # 'Wed Jan 07 11:14:08 +0000 2015' 55 | ret[ss[4]] = {'label': ss[3], 'text': [data['text'].lower()], 'created_at': [str2timestamp(data['created_at'])]} 56 | 57 | return ret 58 | 59 | 60 | def load_data(data_path): 61 | # get data files path 62 | global data, files, data_ID, data_len, eval_flag 63 | data = {} 64 | files = [] 65 | data_ID = [] 66 | data_len = [] 67 | list_files(data_path) 68 | 69 | # load data to json 70 | for file in files: 71 | td = data_process(file) 72 | for key in td.keys(): 73 | if key in data: 74 | data[key]['text'].append(td[key]['text'][0]) 75 | data[key]['created_at'].append(td[key]['created_at'][0]) 76 | else: 77 | data[key] = td[key] 78 | 79 | # convert to my data style 80 | for key, value in data.items(): 81 | temp_list = [] 82 | for i in range(len(data[key]['text'])): 83 | temp_list.append([data[key]['created_at'][i], data[key]['text'][i]]) 84 | data[key]['text'] = [] 85 | data[key]['created_at'] = [] 86 | 87 | ttext = "" 88 | last = 0 89 | for i in range(len(temp_list)): 90 | if temp_list[i][0] - temp_list[0][0] > FLAGS.time_limit * 3600 or len(data[key]['created_at']) >= 100: 91 | break 92 | if i % FLAGS.post_fn == 0: 93 | if len(ttext) > 0: 94 | data[key]['text'].append(ttext) 95 | data[key]['created_at'].append(temp_list[i][0]) 96 | else: 97 | ttext = temp_list[i][1] 98 | else: 99 | ttext += " " + temp_list[i][1] 100 | last = i 101 | 102 | # keep the last one 103 | if len(ttext) > 0: 104 | data[key]['text'].append(ttext) 105 | data[key]['created_at'].append(temp_list[last][0]) 106 | 107 | for key in data.keys(): 108 | data_ID.append(key) 109 | data_ID = random.sample(data_ID, len(data_ID)) 110 | 111 | for i in range(len(data_ID)): 112 | data_len.append(len(data[data_ID[i]]['text'])) 113 | if data[data_ID[i]]['label'] == "rumours": 114 | data_y.append([1.0, 0.0]) 115 | else: 116 | data_y.append([0.0, 1.0]) 117 | 118 | eval_flag = int(len(data_ID) / 4) * 3 119 | 120 | print("{} data loaded".format(len(data))) 121 | 122 | 123 | def get_df_batch(start, new_data_len=[]): 124 | data_x = np.zeros([FLAGS.batch_size, FLAGS.max_seq_len, FLAGS.max_sent_len, FLAGS.embedding_dim], dtype=np.float32) 125 | m_data_y = np.zeros([FLAGS.batch_size, 2], dtype=np.int32) 126 | m_data_len = np.zeros([FLAGS.batch_size], dtype=np.int32) 127 | 128 | if len(new_data_len) > 0: 129 | t_data_len = new_data_len 130 | else: 131 | t_data_len = data_len 132 | 133 | mts = start * FLAGS.batch_size 134 | if mts >= len(data_ID): 135 | mts = mts % len(data_ID) 136 | 137 | for i in range(FLAGS.batch_size): 138 | m_data_y[i] = data_y[mts] 139 | m_data_len[i] = t_data_len[mts] 140 | for j in range(t_data_len[mts]): 141 | t_words = data[data_ID[mts]]['text'][j].strip().split(" ") 142 | for k in range(len(t_words)): 143 | m_word = t_words[k] 144 | try: 145 | data_x[i][j][k] = word2vec[m_word] 146 | except: 147 | miss_vec = 1 148 | 149 | mts += 1 150 | if mts >= len(data_ID): 151 | mts = mts % len(data_ID) 152 | 153 | return data_x, m_data_len, m_data_y 154 | 155 | 156 | # seq_states is the date_x to get 157 | # max_id is the next corpus to take 158 | def get_rl_batch(ids, seq_states, stop_states, counter_id, start_id, total_data): 159 | input_x = np.zeros([FLAGS.batch_size, FLAGS.max_sent_len, FLAGS.embedding_dim], dtype=np.float32) 160 | input_y = np.zeros([FLAGS.batch_size, FLAGS.class_num], dtype=np.float32) 161 | 162 | for i in range(FLAGS.batch_size): 163 | if stop_states[i] == 1 or seq_states[i] >= data_len[ids[i]]: 164 | ids[i] = counter_id + start_id 165 | seq_states[i] = 0 166 | try: 167 | t_words = data[ids[i]]['text'][seq_states[i]].strip().split(" ") 168 | except: 169 | print(ids[i], seq_states[i]) 170 | for j in range(len(t_words)): 171 | m_word = t_words[j] 172 | try: 173 | input_x[i][j] = word2vec[m_word] 174 | except: 175 | miss_vec = 1 176 | input_y[i] = data_y[ids[i]] 177 | counter_id += 1 178 | counter_id = counter_id % total_data 179 | else: 180 | try: 181 | t_words = data[ids[i]]['text'][seq_states[i]].strip().split(" ") 182 | except: 183 | print(ids[i],seq_states[i]) 184 | for j in range(len(t_words)): 185 | m_word = t_words[j] 186 | try: 187 | input_x[i][j] = word2vec[m_word] 188 | except: 189 | miss_vec = 1 190 | input_y[i] = data_y[ids[i]] 191 | # point to the next sequence 192 | seq_states[i] += 1 193 | 194 | return input_x, input_y, ids, seq_states, counter_id 195 | 196 | 197 | # not to stop -0.1, so that to be early 198 | # DDQN y = r + Q(S, argmax(Q)) 199 | def get_reward(isStop, ss, pys, ids, seq_ids): 200 | global reward_counter 201 | reward = np.zeros([len(isStop)], dtype=np.float32) 202 | for i in range(len(isStop)): 203 | if isStop[i] == 1: 204 | if np.argmax(pys[ids[i]][seq_ids[i]-1]) == np.argmax(data_y[ids[i]]): 205 | r = 1 + FLAGS.reward_rate * math.log(reward_counter) 206 | reward[i] = r 207 | reward_counter += 1 208 | else: 209 | reward[i] = -100 210 | else: 211 | reward[i] = -0.01 + 0.99 * max(ss[i]) 212 | return reward 213 | 214 | 215 | def get_new_len(sess, mm): 216 | new_x_len = np.zeros([len(data_ID)], dtype=np.int32) 217 | 218 | for i in range(len(data_ID)): 219 | init_state = np.zeros([1, FLAGS.hidden_dim], dtype=np.float32) 220 | e_state = sess.run(mm.df_state, feed_dict={mm.topics: init_state}) 221 | for j in range(data_len[i]): 222 | t_words = data[data_ID[i]]['text'][j].strip().split(" ") 223 | e_x = np.zeros([1, FLAGS.max_word_len, FLAGS.hidden_dim], dtype=np.float32) 224 | for k in range(len(t_words)): 225 | m_word = t_words[k] 226 | try: 227 | e_x[0][k] = word2vec[m_word] 228 | except: 229 | miss_word = 1 230 | batch_dic = {mm.rl_state: e_state, mm.rl_input: e_x, mm.dropout_keep_prob: 1.0} 231 | e_isStop, mNewState = sess.run([mm.isStop, mm.rl_new_state], batch_dic) 232 | e_state = mNewState 233 | 234 | if e_isStop == 1: 235 | new_x_len[i] = j+1 236 | break 237 | if new_x_len[i] == 0 or new_x_len[i] > data_len[i]: 238 | new_x_len[i] = data_len[i] 239 | 240 | # print(" Max Length: " + str(max(new_x_len)) + 241 | # " Min Length: " + str(min(new_x_len)) + 242 | # " Ave Length: " + str(np.mean(new_x_len))) + " (" + str(np.mean(data_len)) + ")" 243 | 244 | return new_x_len 245 | 246 | 247 | def get_RL_Train_batch(D): 248 | s_state = np.zeros([FLAGS.batch_size, FLAGS.hidden_dim], dtype=np.float32) 249 | s_x = np.zeros([FLAGS.batch_size, FLAGS.max_sent_len, FLAGS.hidden_dim], dtype=np.float32) 250 | s_isStop = np.zeros([FLAGS.batch_size, FLAGS.action_num], dtype=np.float32) 251 | s_rw = np.zeros([FLAGS.batch_size], dtype=np.float32) 252 | 253 | m_batch = random.sample(D, FLAGS.batch_size) 254 | for i in range(FLAGS.batch_size): 255 | s_state[i] = m_batch[i][0] 256 | s_x[i] = m_batch[i][1] 257 | s_isStop[i][m_batch[i][2]] = 1 258 | s_rw[i] = m_batch[i][3] 259 | 260 | return s_state, s_x, s_isStop, s_rw 261 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import deque 3 | from model import RL_GRU2 4 | from dataUtils import * 5 | 6 | tf.logging.set_verbosity(tf.logging.ERROR) 7 | 8 | 9 | def df_train(sess, mm, t_acc, t_steps, new_data_len=[]): 10 | sum_loss = 0.0 11 | sum_acc = 0.0 12 | ret_acc = 0.0 13 | init_states = np.zeros([FLAGS.batch_size, FLAGS.hidden_dim], dtype=np.float32) 14 | 15 | for i in range(t_steps): 16 | if len(new_data_len) > 0: 17 | x, x_len, y = get_df_batch(i, new_data_len) 18 | else: 19 | x, x_len, y = get_df_batch(i) 20 | feed_dic = {mm.input_x: x, mm.x_len: x_len, mm.input_y: y, mm.init_states: init_states, mm.dropout_keep_prob: 0.5} 21 | _, step, loss, acc = sess.run([df_train_op, df_global_step, mm.loss, mm.accuracy], feed_dic) 22 | sum_loss += loss 23 | sum_acc += acc 24 | 25 | if i % 100 == 99: 26 | sum_loss = sum_loss / 100 27 | sum_acc = sum_acc / 100 28 | ret_acc = sum_acc 29 | print(get_curtime() + " Step: " + str(step) + " Training loss: " + str(sum_loss) + " accuracy: " + str(sum_acc)) 30 | if sum_acc > t_acc: 31 | break 32 | sum_acc = 0.0 33 | sum_loss = 0.0 34 | 35 | print(get_curtime() + " Train df Model End.") 36 | return ret_acc 37 | 38 | 39 | def rl_train(sess, mm, t_rw, t_steps): 40 | ids = np.array(range(FLAGS.batch_size), dtype=np.int32) 41 | seq_states = np.zeros([FLAGS.batch_size], dtype=np.int32) 42 | isStop = np.zeros([FLAGS.batch_size], dtype=np.int32) 43 | max_id = FLAGS.batch_size 44 | init_states = np.zeros([FLAGS.batch_size, FLAGS.hidden_dim], dtype=np.float32) 45 | state = sess.run(mm.df_state) 46 | 47 | D = deque() 48 | ssq = [] 49 | print("in RL the begining") 50 | # get_new_len(sess, mm) 51 | if len(data_ID) % FLAGS.batch_size == 0: 52 | flags = len(data_ID) / FLAGS.batch_size 53 | else: 54 | flags = len(data_ID) / FLAGS.batch_size + 1 55 | for i in range(flags): 56 | x, x_len, y = get_df_batch(i) 57 | feed_dic = {mm.input_x: x, mm.x_len: x_len, mm.input_y: y, mm.dropout_keep_prob: 1.0} 58 | t_ssq = sess.run(mm.out_seq, feed_dic) 59 | if len(ssq) > 0: 60 | ssq = np.append(ssq, t_ssq, axis=0) 61 | else: 62 | ssq = t_ssq 63 | print(get_curtime() + " Now Start RL training ...") 64 | counter = 0 65 | sum_rw = 0.0 66 | while True: 67 | if counter > FLAGS.OBSERVE: 68 | sum_rw += np.mean(rw) 69 | if counter % 200 == 0: 70 | sum_rw = sum_rw / 2000 71 | print(get_curtime() + " Step: " + str(step) + " REWARD IS " + str(sum_rw)) 72 | if sum_rw > t_rw: 73 | print("Retch The Target Reward") 74 | break 75 | if counter > t_steps: 76 | print("Retch The Target Steps") 77 | break 78 | sum_rw = 0.0 79 | s_state, s_x, s_isStop, s_rw = get_RL_Train_batch(D) 80 | feed_dic = {mm.rl_state: s_state, mm.rl_input: s_x, mm.action: s_isStop, mm.reward:s_rw, mm.dropout_keep_prob: 0.5} 81 | _, step = sess.run([rl_train_op, rl_global_step], feed_dic) 82 | 83 | x, y, ids, seq_states, max_id = get_rl_batch(ids, seq_states, isStop, max_id, 0, 3150) 84 | batch_dic = {mm.rl_state: state, mm.rl_input: x, mm.dropout_keep_prob: 1.0} 85 | isStop, mss, mNewState = sess.run([mm.isStop, mm.stopScore, mm.rl_new_state], batch_dic) 86 | 87 | for j in range(FLAGS.batch_size): 88 | if random.random() < FLAGS.random_rate: 89 | isStop[j] = np.argmax(np.random.rand(2)) 90 | if seq_states[j] == data_len[ids[j]]: 91 | isStop[j] = 1 92 | # eval 93 | rw = get_reward(isStop, mss, ssq, ids, seq_states) 94 | 95 | for j in range(FLAGS.batch_size): 96 | D.append((state[j], x[j], isStop[j], rw[j])) 97 | if len(D) > FLAGS.max_memory: 98 | D.popleft() 99 | 100 | state = mNewState 101 | for j in range(FLAGS.batch_size): 102 | if isStop[j] == 1: 103 | init_states = np.zeros([FLAGS.batch_size, FLAGS.hidden_dim], dtype=np.float32) 104 | state[j] = sess.run(mm.df_state) 105 | 106 | counter += 1 107 | 108 | 109 | def eval(sess, mm): 110 | start_ef = int(eval_flag / FLAGS.batch_size) 111 | end_ef = int(len(data_ID) / FLAGS.batch_size) + 1 112 | init_states = np.zeros([FLAGS.batch_size, FLAGS.hidden_dim], dtype=np.float32) 113 | 114 | counter = 0 115 | sum_acc = 0.0 116 | 117 | for i in range(start_ef, end_ef): 118 | x, x_len, y = get_df_batch(i) 119 | feed_dic = {mm.input_x: x, mm.x_len: x_len, mm.input_y: y, mm.init_states: init_states, mm.dropout_keep_prob: 1.0} 120 | _, step, loss, acc = sess.run([df_train_op, df_global_step, mm.loss, mm.accuracy], feed_dic) 121 | counter += 1 122 | sum_acc += acc 123 | 124 | print(sum_acc / counter) 125 | 126 | 127 | if __name__ == "__main__": 128 | print(get_curtime() + " Loading data ...") 129 | load_data(FLAGS.data_file_path) 130 | print(get_curtime() + " Data loaded.") 131 | 132 | with tf.Graph().as_default(): 133 | sess = tf.Session() 134 | with sess.as_default(): 135 | # (self, input_dim, hidden_dim, max_seq_len, max_word_len, class_num, action_num): 136 | print(FLAGS.embedding_dim, FLAGS.hidden_dim, FLAGS.max_seq_len, FLAGS.max_sent_len, FLAGS.class_num, FLAGS.action_num) 137 | mm = RL_GRU2(FLAGS.embedding_dim, FLAGS.hidden_dim, FLAGS.max_seq_len, 138 | FLAGS.max_sent_len, FLAGS.class_num, FLAGS.action_num) 139 | 140 | # df model 141 | df_global_step = tf.Variable(0, name="global_step", trainable=False) 142 | df_train_op = tf.train.AdamOptimizer(0.01).minimize(mm.loss, df_global_step) 143 | 144 | # rl model 145 | rl_global_step = tf.Variable(0, name="global_step", trainable=False) 146 | rl_train_op = tf.train.AdamOptimizer(0.001).minimize(mm.rl_cost, rl_global_step) 147 | 148 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=4) 149 | 150 | sess.run(tf.global_variables_initializer()) 151 | 152 | ckpt_dir = "df_saved" 153 | checkpoint = tf.train.get_checkpoint_state(ckpt_dir) 154 | if checkpoint and checkpoint.model_checkpoint_path: 155 | saver.restore(sess, checkpoint.model_checkpoint_path) 156 | print(checkpoint.model_checkpoint_path+" is restored.") 157 | else: 158 | df_train(sess, mm, 0.80, 2000) 159 | saver.save(sess, "df_saved/model") 160 | print("df_model "+" saved") 161 | 162 | for i in range(20): 163 | rl_train(sess, mm, 0.5, 50000) 164 | saver.save(sess, "rl_saved/model"+str(i)) 165 | print("rl_model "+str(i)+" saved") 166 | new_len = get_new_len(sess, mm) 167 | acc = df_train(sess, mm, 0.9, 500, new_len) 168 | saver.save(sess, "df_saved/model"+str(i)) 169 | print("df_model "+str(i)+" saved") 170 | if acc > 0.9: 171 | break 172 | 173 | print("The End of My Program") 174 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | 4 | 5 | class RL_GRU2: 6 | def __init__(self, input_dim, hidden_dim, max_seq_len, max_word_len, class_num, action_num): 7 | self.input_x = tf.placeholder(tf.float32, [None, max_seq_len, max_word_len, input_dim], name="input_x") 8 | self.input_y = tf.placeholder(tf.float32, [None, class_num], name="input_y") 9 | self.x_len = tf.placeholder(tf.int32, [None], name="x_len") 10 | self.init_states = tf.placeholder(tf.float32, [None, hidden_dim], name="topics") 11 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 12 | 13 | self.rl_state = tf.placeholder(tf.float32, [None, hidden_dim], name="rl_states") 14 | self.rl_input = tf.placeholder(tf.float32, [None, max_word_len, input_dim], name="rl_input") 15 | self.action = tf.placeholder(tf.float32, [None, action_num], name="action") 16 | self.reward = tf.placeholder(tf.float32, [None], name="reward") 17 | 18 | output_dim = hidden_dim 19 | 20 | # shared pooling layer 21 | self.w_t = tf.Variable(tf.random_uniform([input_dim, output_dim], -1.0, 1.0), name="w_t") 22 | self.b_t = tf.Variable(tf.constant(0.01, shape=[output_dim]), name="b_t") 23 | pooled_input_x = self.shared_pooling_layer(self.input_x, input_dim, max_seq_len, max_word_len, output_dim) 24 | pooled_rl_input = self.shared_pooling_layer(self.rl_input, input_dim, 1, max_word_len, output_dim) 25 | pooled_rl_input = tf.reshape(pooled_rl_input, [-1, output_dim]) 26 | 27 | # dropout layer 28 | pooled_input_x_dp = tf.nn.dropout(pooled_input_x, self.dropout_keep_prob) 29 | 30 | # df model 31 | df_cell = rnn.GRUCell(output_dim) 32 | df_cell = rnn.DropoutWrapper(df_cell, output_keep_prob=self.dropout_keep_prob) 33 | 34 | w_tp = tf.constant(0.0, shape=[hidden_dim, output_dim], name="w_tp") 35 | self.df_state = tf.matmul(self.init_states, w_tp, name="df_state") 36 | 37 | df_outputs, df_last_state = tf.nn.dynamic_rnn(df_cell, pooled_input_x_dp, self.x_len, initial_state=self.df_state, dtype=tf.float32) 38 | l2_loss = tf.constant(0.0) 39 | 40 | w_ps = tf.Variable(tf.truncated_normal([output_dim, class_num], stddev=0.1)) 41 | b_ps = tf.Variable(tf.constant(0.01, shape=[class_num])) 42 | l2_loss += tf.nn.l2_loss(w_ps) 43 | l2_loss += tf.nn.l2_loss(b_ps) 44 | 45 | self.pre_scores = tf.nn.xw_plus_b(df_last_state, w_ps, b_ps, name="p_scores") 46 | self.predictions = tf.argmax(self.pre_scores, 1, name="predictions") 47 | 48 | r_outputs = tf.reshape(df_outputs, [-1, output_dim]) 49 | scores_seq = tf.nn.softmax(tf.nn.xw_plus_b(r_outputs, w_ps, b_ps)) 50 | self.out_seq = tf.reshape(scores_seq, [-1, max_seq_len, class_num], name="out_seq") 51 | 52 | df_losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.pre_scores, labels=self.input_y) 53 | self.loss = tf.reduce_mean(df_losses) + 0.1 * l2_loss 54 | 55 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 56 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 57 | 58 | # rl model 59 | self.rl_output, self.rl_new_state = df_cell(pooled_rl_input, self.rl_state) 60 | 61 | w_ss1 = tf.Variable(tf.truncated_normal([output_dim, 64], stddev=0.01)) 62 | b_ss1 = tf.Variable(tf.constant(0.01, shape=[64])) 63 | rl_h1 = tf.nn.relu(tf.nn.xw_plus_b(self.rl_state, w_ss1, b_ss1)) 64 | 65 | w_ss2 = tf.Variable(tf.truncated_normal([64, action_num], stddev=0.01)) 66 | b_ss2 = tf.Variable(tf.constant(0.01, shape=[action_num])) 67 | 68 | self.stopScore = tf.nn.xw_plus_b(rl_h1, w_ss2, b_ss2, name="stopScore") 69 | 70 | self.isStop = tf.argmax(self.stopScore, 1, name="isStop") 71 | 72 | out_action = tf.reduce_sum(tf.multiply(self.stopScore, self.action), reduction_indices=1) 73 | self.rl_cost = tf.reduce_mean(tf.square(self.reward - out_action), name="rl_cost") 74 | 75 | def shared_pooling_layer(self, inputs, input_dim, max_seq_len, max_word_len, output_dim): 76 | t_inputs = tf.reshape(inputs, [-1, input_dim]) 77 | t_h = tf.nn.xw_plus_b(t_inputs, self.w_t, self.b_t) 78 | t_h = tf.reshape(t_h, [-1, max_word_len, output_dim]) 79 | t_h_expended = tf.expand_dims(t_h, -1) 80 | pooled = tf.nn.max_pool( 81 | t_h_expended, 82 | ksize=[1, max_word_len, 1, 1], 83 | strides=[1, 1, 1, 1], 84 | padding="VALID", 85 | name="max_pool" 86 | ) 87 | outs = tf.reshape(pooled, [-1, max_seq_len, output_dim]) 88 | return outs 89 | 90 | def pooling_layer(self, inputs, input_dim, max_seq_len, max_word_len, output_dim): 91 | t_inputs = tf.reshape(inputs, [-1, input_dim]) 92 | w = tf.Variable(tf.truncated_normal([input_dim, output_dim], stddev=0.1)) 93 | b = tf.Variable(tf.constant(0.01, shape=[output_dim])) 94 | 95 | h = tf.nn.xw_plus_b(t_inputs, w, b) 96 | hs = tf.reshape(h, [-1, max_word_len, output_dim]) 97 | 98 | inputs_expended = tf.expand_dims(hs, -1) 99 | 100 | pooled = tf.nn.max_pool( 101 | inputs_expended, 102 | ksize=[1, max_word_len, 1, 1], 103 | strides=[1, 1, 1, 1], 104 | padding="VALID", 105 | name="max_pool" 106 | ) 107 | cnn_outs = tf.reshape(pooled, [-1, max_seq_len, output_dim]) 108 | return cnn_outs 109 | -------------------------------------------------------------------------------- /torch/CM_Model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import torch 4 | import importlib 5 | from tensorboardX import SummaryWriter 6 | import torch.nn.utils.rnn as rnn_utils 7 | import pickle 8 | import tqdm 9 | import os 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.autograd import Variable 13 | from collections import deque 14 | import json 15 | import pdb 16 | from dataUtils_CN import * 17 | import numpy as np 18 | 19 | def TrainCMModel_V0(sent_pooler, rdm_model, rdm_classifier, cm_model, stage, t_rw, t_steps, log_dir, logger, FLAGS, cuda=True): 20 | batch_size = FLAGS.batch_size 21 | t_acc = 0.9 22 | ids = np.array(range(batch_size), dtype=np.int32) 23 | seq_states = np.zeros([batch_size], dtype=np.int32) 24 | isStop = torch.zeros([batch_size], dtype=torch.int32) 25 | max_id = batch_size 26 | df_init_states = torch.zeros([1, batch_size, rdm_model.hidden_dim], dtype=torch.float32).cuda() 27 | writer = SummaryWriter(log_dir, filename_suffix="_ERD_CM_stage_%3d"%stage) 28 | D = deque() 29 | ssq = [] 30 | print("in RL the begining") 31 | rl_optim = torch.optim.Adam([{'params': sent_pooler.parameters(), 'lr': 2e-5}, 32 | {'params': rdm_model.parameters(), 'lr': 2e-5}, 33 | {'params':cm_model.parameters(), 'lr':1e-3}]) 34 | data_ID = get_data_ID() 35 | valid_data_len = get_valid_data_len() 36 | data_len = get_data_len() 37 | 38 | if len(data_ID) % batch_size == 0: # the total number of events 39 | flags = int(len(data_ID) / FLAGS.batch_size) 40 | else: 41 | flags = int(len(data_ID) / FLAGS.batch_size) + 1 42 | 43 | for i in range(flags): 44 | with torch.no_grad(): 45 | x, x_len, y = get_df_batch(i, batch_size) 46 | seq = sent_pooler(x) 47 | rdm_hiddens = rdm_model(seq) 48 | batchsize, _, _ = rdm_hiddens.shape 49 | print("batch %d"%i) 50 | if len(ssq) > 0: 51 | ssq.extend([rdm_classifier(h) for h in rdm_hiddens]) 52 | else: 53 | ssq = [rdm_classifier(h) for h in rdm_hiddens] 54 | torch.cuda.empty_cache() 55 | 56 | print(get_curtime() + " Now Start RL training ...") 57 | counter = 0 58 | sum_rw = 0.0 # sum of rewards 59 | 60 | while True: 61 | # if counter > FLAGS.OBSERVE: 62 | if counter > FLAGS.OBSERVE: 63 | sum_rw += rw.mean() 64 | if counter % 200 == 0: 65 | sum_rw = sum_rw / 2000 66 | print(get_curtime() + " Step: " + str(counter-FLAGS.OBSERVE) + " REWARD IS " + str(sum_rw)) 67 | if counter > t_steps: 68 | print("Retch The Target Steps") 69 | break 70 | sum_rw = 0.0 71 | s_state, s_x, s_isStop, s_rw = get_RL_Train_batch(D) 72 | word_tensors = torch.tensor(s_x) 73 | batchsize, max_sent_len, emb_dim = word_tensors.shape 74 | sent_tensor = sent_pooler.linear(word_tensors.reshape([-1, emb_dim]).cuda()).reshape([batchsize, max_sent_len, emb_dim]).max(axis=1)[0].unsqueeze(1) 75 | df_outs, df_last_state = rdm_model.gru_model(sent_tensor, s_state.unsqueeze(0).cuda()) 76 | batchsize, _, hidden_dim = df_outs.shape 77 | stopScore, isStop = cm_model(df_outs.reshape([-1, hidden_dim])) 78 | out_action = (stopScore*s_isStop.cuda()).sum(axis=1) 79 | rl_cost = torch.pow(s_rw.cuda() - out_action, 2).mean() 80 | rl_optim.zero_grad() 81 | rl_cost.backward() 82 | torch.cuda.empty_cache() 83 | rl_optim.step() 84 | # print("RL Cost:", rl_cost) 85 | writer.add_scalar('RL Cost', rl_cost, counter - FLAGS.OBSERVE) 86 | if (counter - FLAGS.OBSERVE)%100 == 0: 87 | print("*** %6d|%6d *** RL Cost:%8.6f"%(counter, t_steps, rl_cost)) 88 | valid_new_len = get_new_len_on_valid_data(sent_pooler, rdm_model, cm_model, FLAGS, cuda=True) 89 | print("diff len:", np.array(valid_data_len)-np.array(valid_new_len)) 90 | 91 | x, y, ids, seq_states, max_id = get_rl_batch_0(ids, seq_states, isStop, max_id, 0) 92 | for j in range(FLAGS.batch_size): 93 | if seq_states[j] == 1: 94 | df_init_states[0][j].fill_(0.0) 95 | 96 | with torch.no_grad(): 97 | word_tensors = torch.tensor(x) 98 | batchsize, max_sent_len, emb_dim = word_tensors.shape 99 | sent_tensor = sent_pooler.linear(word_tensors.reshape([-1, emb_dim]).cuda()).reshape([batchsize, max_sent_len, emb_dim]).max(axis=1)[0].unsqueeze(1) 100 | df_outs, df_last_state = rdm_model.gru_model(sent_tensor, df_init_states) 101 | batchsize, _, hidden_dim = df_outs.shape 102 | stopScore, isStop = cm_model(df_outs.reshape([-1, hidden_dim])) 103 | 104 | for j in range(batch_size): 105 | if random.random() < FLAGS.random_rate: 106 | isStop[j] = torch.randn(2).argmax() 107 | if seq_states[j] == data_len[ids[j]]: 108 | isStop[j] = 1 109 | rw, Q_val = get_reward_0(isStop, stopScore, ssq, ids, seq_states) 110 | for j in range(FLAGS.batch_size): 111 | D.append((df_init_states[0][j], x[j], isStop[j], rw[j])) 112 | if len(D) > FLAGS.max_memory: 113 | D.popleft() 114 | df_init_states = df_last_state 115 | counter += 1 -------------------------------------------------------------------------------- /torch/ERD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import random 6 | import torch 7 | import importlib 8 | from tensorboardX import SummaryWriter 9 | import torch.nn.utils.rnn as rnn_utils 10 | import pickle 11 | import tqdm 12 | import os 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | from collections import deque 17 | sys.path.append(".") 18 | from dataUtilsV0 import * 19 | import json 20 | from RDM_Model import * 21 | from CM_Model import * 22 | # import pdb 23 | 24 | 25 | 26 | class LayerNormLSTMCell(nn.LSTMCell): 27 | def __init__(self, input_size, hidden_size, dropout=0.0, bias=True, use_layer_norm=True): 28 | super().__init__(input_size, hidden_size, bias) 29 | self.use_layer_norm = use_layer_norm 30 | if self.use_layer_norm: 31 | self.ln_ih = nn.LayerNorm(4 * hidden_size) 32 | self.ln_hh = nn.LayerNorm(4 * hidden_size) 33 | self.ln_ho = nn.LayerNorm(hidden_size) 34 | # DropConnect on the recurrent hidden to hidden weight 35 | self.dropout = dropout 36 | 37 | def forward(self, input, hidden=None): 38 | self.check_forward_input(input) 39 | if hidden is None: 40 | hx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False) 41 | cx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False) 42 | else: 43 | hx, cx = hidden 44 | self.check_forward_hidden(input, hx, '[0]') 45 | self.check_forward_hidden(input, cx, '[1]') 46 | 47 | weight_hh = nn.functional.dropout(self.weight_hh, p=self.dropout, training=self.training) 48 | if self.use_layer_norm: 49 | gates = self.ln_ih(F.linear(input, self.weight_ih, self.bias_ih)) \ 50 | + self.ln_hh(F.linear(hx, weight_hh, self.bias_hh)) 51 | else: 52 | gates = F.linear(input, self.weight_ih, self.bias_ih) \ 53 | + F.linear(hx, weight_hh, self.bias_hh) 54 | 55 | i, f, c, o = gates.chunk(4, 1) 56 | i_ = torch.sigmoid(i) 57 | f_ = torch.sigmoid(f) 58 | c_ = torch.tanh(c) 59 | o_ = torch.sigmoid(o) 60 | cy = (f_ * cx) + (i_ * c_) 61 | if self.use_layer_norm: 62 | hy = o_ * self.ln_ho(torch.tanh(cy)) 63 | else: 64 | hy = o_ * torch.tanh(cy) 65 | return hy, cy 66 | 67 | class LayerNormLSTM(nn.Module): 68 | def __init__(self, 69 | input_size, 70 | hidden_size, 71 | num_layers=1, 72 | dropout=0.0, 73 | weight_dropout=0.0, 74 | bias=True, 75 | bidirectional=False, 76 | use_layer_norm=True): 77 | super().__init__() 78 | self.input_size = input_size 79 | self.hidden_size = hidden_size 80 | self.num_layers = num_layers 81 | # using variational dropout 82 | self.dropout = dropout 83 | self.bidirectional = bidirectional 84 | 85 | num_directions = 2 if bidirectional else 1 86 | self.hidden0 = nn.ModuleList([ 87 | LayerNormLSTMCell(input_size=(input_size if layer == 0 else hidden_size * num_directions), 88 | hidden_size=hidden_size, dropout=weight_dropout, bias=bias, use_layer_norm=use_layer_norm) 89 | for layer in range(num_layers) 90 | ]) 91 | 92 | if self.bidirectional: 93 | self.hidden1 = nn.ModuleList([ 94 | LayerNormLSTMCell(input_size=(input_size if layer == 0 else hidden_size * num_directions), 95 | hidden_size=hidden_size, dropout=weight_dropout, bias=bias, use_layer_norm=use_layer_norm) 96 | for layer in range(num_layers) 97 | ]) 98 | 99 | def copy_parameters(self, rnn_old): 100 | for param in rnn_old.named_parameters(): 101 | name_ = param[0].split("_") 102 | layer = int(name_[2].replace("l", "")) 103 | sub_name = "_".join(name_[:2]) 104 | if len(name_) > 3: 105 | self.hidden1[layer].register_parameter(sub_name, param[1]) 106 | else: 107 | self.hidden0[layer].register_parameter(sub_name, param[1]) 108 | 109 | def forward(self, input, hidden=None, seq_lens=None): 110 | seq_len, batch_size, _ = input.size() 111 | num_directions = 2 if self.bidirectional else 1 112 | if hidden is None: 113 | hx = input.new_zeros(self.num_layers * num_directions, batch_size, self.hidden_size, requires_grad=False) 114 | cx = input.new_zeros(self.num_layers * num_directions, batch_size, self.hidden_size, requires_grad=False) 115 | else: 116 | hx, cx = hidden 117 | 118 | ht = [] 119 | for i in range(seq_len): 120 | ht.append([None] * (self.num_layers * num_directions)) 121 | ct = [] 122 | for i in range(seq_len): 123 | ct.append([None] * (self.num_layers * num_directions)) 124 | 125 | seq_len_mask = input.new_ones(batch_size, seq_len, self.hidden_size, requires_grad=False) 126 | if seq_lens != None: 127 | for i, l in enumerate(seq_lens): 128 | seq_len_mask[i, l:, :] = 0 129 | seq_len_mask = seq_len_mask.transpose(0, 1) 130 | 131 | if self.bidirectional: 132 | # if use cuda, change 'torch.LongTensor' to 'torch.cuda.LongTensor' 133 | indices_ = (torch.LongTensor(seq_lens) - 1).unsqueeze(1).unsqueeze(0).unsqueeze(0).repeat( 134 | [1, 1, 1, self.hidden_size]) 135 | # if use cuda, change 'torch.LongTensor' to 'torch.cuda.LongTensor' 136 | indices_reverse = torch.LongTensor([0] * batch_size).unsqueeze(1).unsqueeze(0).unsqueeze(0).repeat( 137 | [1, 1, 1, self.hidden_size]) 138 | indices = torch.cat((indices_, indices_reverse), dim=1) 139 | hy = [] 140 | cy = [] 141 | xs = input 142 | # Variational Dropout 143 | if not self.training or self.dropout == 0: 144 | dropout_mask = input.new_ones(self.num_layers, 2, batch_size, self.hidden_size) 145 | else: 146 | dropout_mask = input.new(self.num_layers, 2, batch_size, self.hidden_size).bernoulli_(1 - self.dropout) 147 | dropout_mask = Variable(dropout_mask, requires_grad=False) / (1 - self.dropout) 148 | 149 | for l, (layer0, layer1) in enumerate(zip(self.hidden0, self.hidden1)): 150 | l0, l1 = 2 * l, 2 * l + 1 151 | h0, c0, h1, c1 = hx[l0], cx[l0], hx[l1], cx[l1] 152 | for t, (x0, x1) in enumerate(zip(xs, reversed(xs))): 153 | ht_, ct_ = layer0(x0, (h0, c0)) 154 | ht[t][l0] = ht_ * seq_len_mask[t] 155 | ct[t][l0] = ct_ * seq_len_mask[t] 156 | h0, c0 = ht[t][l0], ct[t][l0] 157 | t = seq_len - 1 - t 158 | ht_, ct_ = layer1(x1, (h1, c1)) 159 | ht[t][l1] = ht_ * seq_len_mask[t] 160 | ct[t][l1] = ct_ * seq_len_mask[t] 161 | h1, c1 = ht[t][l1], ct[t][l1] 162 | 163 | xs = [torch.cat((h[l0]*dropout_mask[l][0], h[l1]*dropout_mask[l][1]), dim=1) for h in ht] 164 | ht_temp = torch.stack([torch.stack([h[l0], h[l1]]) for h in ht]) 165 | ct_temp = torch.stack([torch.stack([c[l0], c[l1]]) for c in ct]) 166 | if len(hy) == 0: 167 | hy = torch.stack(list(ht_temp.gather(dim=0, index=indices).squeeze(0))) 168 | else: 169 | hy = torch.cat((hy, torch.stack(list(ht_temp.gather(dim=0, index=indices).squeeze(0)))), dim=0) 170 | if len(cy) == 0: 171 | cy = torch.stack(list(ct_temp.gather(dim=0, index=indices).squeeze(0))) 172 | else: 173 | cy = torch.cat((cy, torch.stack(list(ct_temp.gather(dim=0, index=indices).squeeze(0)))), dim=0) 174 | y = torch.stack(xs) 175 | else: 176 | # if use cuda, change 'torch.LongTensor' to 'torch.cuda.LongTensor' 177 | indices = (torch.cuda.LongTensor(seq_lens) - 1).unsqueeze(1).unsqueeze(0).unsqueeze(0).repeat( 178 | [1, self.num_layers, 1, self.hidden_size]) 179 | h, c = hx, cx 180 | # Variational Dropout 181 | if not self.training or self.dropout == 0: 182 | dropout_mask = input.new_ones(self.num_layers, batch_size, self.hidden_size) 183 | else: 184 | dropout_mask = input.new(self.num_layers, batch_size, self.hidden_size).bernoulli_(1 - self.dropout) 185 | dropout_mask = Variable(dropout_mask, requires_grad=False) / (1 - self.dropout) 186 | 187 | for t, x in enumerate(input): 188 | for l, layer in enumerate(self.hidden0): 189 | ht_, ct_ = layer(x, (h[l], c[l])) 190 | ht[t][l] = ht_ * seq_len_mask[t] 191 | ct[t][l] = ct_ * seq_len_mask[t] 192 | x = ht[t][l] * dropout_mask[l] 193 | ht[t] = torch.stack(ht[t]) 194 | ct[t] = torch.stack(ct[t]) 195 | h, c = ht[t], ct[t] 196 | y = torch.stack([h[-1]*dropout_mask[-1] for h in ht]) 197 | hy = torch.stack(list(torch.stack(ht).gather(dim=0, index=indices).squeeze(0))) 198 | cy = torch.stack(list(torch.stack(ct).gather(dim=0, index=indices).squeeze(0))) 199 | 200 | return y, (hy, cy) 201 | 202 | 203 | # ### 模型训练与测试 204 | class pooling_layer(nn.Module): 205 | def __init__(self, input_dim, output_dim): 206 | super(pooling_layer, self).__init__() 207 | self.linear = nn.Linear(input_dim, output_dim) 208 | self.input_dim = input_dim 209 | self.output_dim = output_dim 210 | 211 | def forward(self, inputs, cuda=True): 212 | inputs_sent = [torch.cat([self.linear(sent_tensor.cuda() if cuda else sent_tensor).max(axis=0)[0].unsqueeze(0) for sent_tensor in seq]) for seq in inputs] 213 | seqs = torch.nn.utils.rnn.pad_sequence(inputs_sent, batch_first=True) 214 | return seqs 215 | 216 | class RDM_Model(nn.Module): 217 | def __init__(self, word_embedding_dim, sent_embedding_dim, hidden_dim, dropout_prob): 218 | super(RDM_Model, self).__init__() 219 | self.embedding_dim = sent_embedding_dim 220 | self.hidden_dim = hidden_dim 221 | self.gru_model = nn.GRU(word_embedding_dim, 222 | self.hidden_dim, 223 | batch_first=True, 224 | dropout=dropout_prob 225 | ) 226 | self.DropLayer = nn.Dropout(dropout_prob) 227 | 228 | def forward(self, input_x): 229 | """ 230 | input_x: [batchsize, max_seq_len, sentence_embedding_dim] 231 | x_len: [batchsize] 232 | init_states: [batchsize, hidden_dim] 233 | """ 234 | batchsize, max_seq_len, emb_dim = input_x.shape 235 | init_states = torch.zeros([1, batchsize, self.hidden_dim], dtype=torch.float32).cuda() 236 | try: 237 | df_outputs, df_last_state = self.gru_model(input_x, init_states) 238 | except: 239 | print("Error:", pool_feature.shape, init_states.shape) 240 | raise 241 | return df_outputs 242 | 243 | class RDM_Model_V1(nn.Module): 244 | def __init__(self, word_embedding_dim, sent_embedding_dim, hidden_dim, dropout_prob): 245 | super(RDM_Model_V1, self).__init__() 246 | self.embedding_dim = sent_embedding_dim 247 | self.hidden_dim = hidden_dim 248 | self.gru_model = LayerNormLSTM(word_embedding_dim, 249 | self.hidden_dim, 250 | dropout=dropout_prob 251 | ) 252 | 253 | def forward(self, input_x, seq_lens): 254 | """ 255 | input_x: [batchsize, max_seq_len, sentence_embedding_dim] 256 | x_len: [batchsize] 257 | init_states: [batchsize, hidden_dim] 258 | """ 259 | batchsize, max_seq_len, emb_dim = input_x.shape 260 | h0 = torch.zeros([1, batchsize, self.hidden_dim], dtype=torch.float32).cuda() 261 | c0 = torch.zeros([1, batchsize, self.hidden_dim], dtype=torch.float32).cuda() 262 | df_outputs, (df_last_state, df_last_cell) = self.gru_model(input_x.transpose(0, 1), (h0, c0), seq_lens) 263 | return df_outputs.transpose(0, 1), df_last_state.transpose(0, 1), df_last_cell.transpose(0, 1) 264 | 265 | class CM_Model_V1(nn.Module): 266 | def __init__(self, hidden_dim, action_num): 267 | super(CM_Model_V1, self).__init__() 268 | self.hidden_dim = hidden_dim 269 | self.action_num = action_num 270 | self.DenseLayer = nn.Linear(self.hidden_dim, 64) 271 | self.Classifier = nn.Linear(64, self.action_num) 272 | 273 | def forward(self, rdm_state): 274 | """ 275 | rdm_state: [batchsize, hidden_dim] 276 | """ 277 | batchsize, hidden_dim = rdm_state.shape 278 | rl_h1 = nn.functional.relu( 279 | self.DenseLayer( 280 | rdm_state 281 | ) 282 | ) 283 | stopScore = self.Classifier(rl_h1) 284 | isStop = stopScore.argmax(axis=1) 285 | return stopScore, isStop 286 | 287 | class CM_Model(nn.Module): 288 | def __init__(self, sentence_embedding_dim, hidden_dim, action_num): 289 | super(CM_Model, self).__init__() 290 | self.sentence_embedding_dim = sentence_embedding_dim 291 | self.hidden_dim = hidden_dim 292 | self.action_num = action_num 293 | # self.PoolLayer = pooling_layer(self.embedding_dim, 294 | # self.hidden_dim) 295 | self.DenseLayer = nn.Linear(self.hidden_dim, 64) 296 | self.Classifier = nn.Linear(64, self.action_num) 297 | 298 | def forward(self, rdm_model, rl_input, rl_state): 299 | """ 300 | rl_input: [batchsize, max_word_num, sentence_embedding_dim] 301 | rl_state: [1, batchsize, hidden_dim] 302 | """ 303 | assert(rl_input.ndim==3) 304 | batchsize, max_word_num, embedding_dim = rl_input.shape 305 | rl_output, rl_new_state = rdm_model.gru_model( 306 | rl_input, 307 | rl_state 308 | ) 309 | rl_h1 = nn.functional.relu( 310 | self.DenseLayer( 311 | # rl_state.reshape([len(rl_input), self.hidden_dim]) #it is not sure to take rl_state , rather than rl_output, as the feature 312 | rl_output.reshape( 313 | [len(rl_input), self.hidden_dim] 314 | ) 315 | ) 316 | ) 317 | stopScore = self.Classifier(rl_h1) 318 | isStop = stopScore.argmax(axis=1) 319 | return stopScore, isStop, rl_new_state 320 | 321 | 322 | # In[13]: 323 | 324 | 325 | load_data_fast() 326 | 327 | rdm_model = RDM_Model(300, 300, 256, 0.2).cuda() 328 | sent_pooler = pooling_layer(300, 300).cuda() 329 | rdm_classifier = nn.Linear(256, 2).cuda() 330 | cm_model = CM_Model_V1(256, 2).cuda() 331 | 332 | log_dir = os.path.join(sys.path[0], "ERDV4/") 333 | 334 | with open("../../config.json", "r") as cr: 335 | dic = json.load(cr) 336 | 337 | class adict(dict): 338 | ''' Attribute dictionary - a convenience data structure, similar to SimpleNamespace in python 3.3 339 | One can use attributes to read/write dictionary content. 340 | ''' 341 | def __init__(self, *av, **kav): 342 | dict.__init__(self, *av, **kav) 343 | self.__dict__ = self 344 | 345 | FLAGS = adict(dic) 346 | 347 | # #### 导入模型预训练参数 348 | # pretrained_file = "%s/ERD_best.pkl"%log_dir 349 | pretrained_file = "ERD/ERD_best.pkl" 350 | if os.path.exists(pretrained_file): 351 | checkpoint = torch.load(pretrained_file) 352 | sent_pooler.load_state_dict(checkpoint['sent_pooler']) 353 | rdm_model.load_state_dict(checkpoint["rmdModel"]) 354 | rdm_classifier.load_state_dict(checkpoint["rdm_classifier"]) 355 | else: 356 | TrainRDMModel(rdm_model, sent_pooler, rdm_classifier, 357 | t_steps=5000, stage=0, new_data_len=[], valid_new_len=[], logger=None, 358 | log_dir=log_dir, cuda=True) 359 | 360 | 361 | 362 | #### 标准ERD模型 363 | for i in range(20): 364 | erd_save_as = '%s/erdModel_epoch%03d.pkl'% (log_dir , i) 365 | if i==0: 366 | TrainCMModel_V3(sent_pooler, rdm_model, rdm_classifier, cm_model, 0, 0.5, 20000, log_dir, None, FLAGS, cuda=True) 367 | else: 368 | TrainCMModel_V3(sent_pooler, rdm_model, rdm_classifier, cm_model, 0, 0.5, 2000, log_dir, None, FLAGS, cuda=True) 369 | torch.save( 370 | { 371 | "sent_pooler":sent_pooler.state_dict(), 372 | "rmdModel":rdm_model.state_dict(), 373 | "rdm_classifier": rdm_classifier.state_dict(), 374 | "cm_model":cm_model.state_dict() 375 | }, 376 | erd_save_as 377 | ) 378 | print("iter:", i, ", train cm model completed!") 379 | new_len, valid_new_len = get_new_len(sent_pooler, rdm_model, cm_model, FLAGS, cuda=True) 380 | print("after new len:") 381 | print("new_data_len:", new_len) 382 | print("valid_new_len:", valid_new_len) 383 | TrainRDMModel(rdm_model, sent_pooler, rdm_classifier, 384 | t_steps=1000, stage=0, new_data_len=new_len, valid_new_len=valid_new_len, logger=None, 385 | log_dir=log_dir, cuda=True) 386 | 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /torch/ERD_CN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import random 6 | import torch 7 | import importlib 8 | from tensorboardX import SummaryWriter 9 | import torch.nn.utils.rnn as rnn_utils 10 | import pickle 11 | import tqdm 12 | import os 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | from collections import deque 17 | sys.path.append(".") 18 | from dataUtils_CN import * 19 | import json 20 | from RDM_Model import * 21 | from CM_Model import * 22 | # import pdb 23 | 24 | 25 | 26 | class LayerNormLSTMCell(nn.LSTMCell): 27 | def __init__(self, input_size, hidden_size, dropout=0.0, bias=True, use_layer_norm=True): 28 | super().__init__(input_size, hidden_size, bias) 29 | self.use_layer_norm = use_layer_norm 30 | if self.use_layer_norm: 31 | self.ln_ih = nn.LayerNorm(4 * hidden_size) 32 | self.ln_hh = nn.LayerNorm(4 * hidden_size) 33 | self.ln_ho = nn.LayerNorm(hidden_size) 34 | # DropConnect on the recurrent hidden to hidden weight 35 | self.dropout = dropout 36 | 37 | def forward(self, input, hidden=None): 38 | self.check_forward_input(input) 39 | if hidden is None: 40 | hx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False) 41 | cx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False) 42 | else: 43 | hx, cx = hidden 44 | self.check_forward_hidden(input, hx, '[0]') 45 | self.check_forward_hidden(input, cx, '[1]') 46 | 47 | weight_hh = nn.functional.dropout(self.weight_hh, p=self.dropout, training=self.training) 48 | if self.use_layer_norm: 49 | gates = self.ln_ih(F.linear(input, self.weight_ih, self.bias_ih)) \ 50 | + self.ln_hh(F.linear(hx, weight_hh, self.bias_hh)) 51 | else: 52 | gates = F.linear(input, self.weight_ih, self.bias_ih) \ 53 | + F.linear(hx, weight_hh, self.bias_hh) 54 | 55 | i, f, c, o = gates.chunk(4, 1) 56 | i_ = torch.sigmoid(i) 57 | f_ = torch.sigmoid(f) 58 | c_ = torch.tanh(c) 59 | o_ = torch.sigmoid(o) 60 | cy = (f_ * cx) + (i_ * c_) 61 | if self.use_layer_norm: 62 | hy = o_ * self.ln_ho(torch.tanh(cy)) 63 | else: 64 | hy = o_ * torch.tanh(cy) 65 | return hy, cy 66 | 67 | class LayerNormLSTM(nn.Module): 68 | def __init__(self, 69 | input_size, 70 | hidden_size, 71 | num_layers=1, 72 | dropout=0.0, 73 | weight_dropout=0.0, 74 | bias=True, 75 | bidirectional=False, 76 | use_layer_norm=True): 77 | super().__init__() 78 | self.input_size = input_size 79 | self.hidden_size = hidden_size 80 | self.num_layers = num_layers 81 | # using variational dropout 82 | self.dropout = dropout 83 | self.bidirectional = bidirectional 84 | 85 | num_directions = 2 if bidirectional else 1 86 | self.hidden0 = nn.ModuleList([ 87 | LayerNormLSTMCell(input_size=(input_size if layer == 0 else hidden_size * num_directions), 88 | hidden_size=hidden_size, dropout=weight_dropout, bias=bias, use_layer_norm=use_layer_norm) 89 | for layer in range(num_layers) 90 | ]) 91 | 92 | if self.bidirectional: 93 | self.hidden1 = nn.ModuleList([ 94 | LayerNormLSTMCell(input_size=(input_size if layer == 0 else hidden_size * num_directions), 95 | hidden_size=hidden_size, dropout=weight_dropout, bias=bias, use_layer_norm=use_layer_norm) 96 | for layer in range(num_layers) 97 | ]) 98 | 99 | def copy_parameters(self, rnn_old): 100 | for param in rnn_old.named_parameters(): 101 | name_ = param[0].split("_") 102 | layer = int(name_[2].replace("l", "")) 103 | sub_name = "_".join(name_[:2]) 104 | if len(name_) > 3: 105 | self.hidden1[layer].register_parameter(sub_name, param[1]) 106 | else: 107 | self.hidden0[layer].register_parameter(sub_name, param[1]) 108 | 109 | def forward(self, input, hidden=None, seq_lens=None): 110 | seq_len, batch_size, _ = input.size() 111 | num_directions = 2 if self.bidirectional else 1 112 | if hidden is None: 113 | hx = input.new_zeros(self.num_layers * num_directions, batch_size, self.hidden_size, requires_grad=False) 114 | cx = input.new_zeros(self.num_layers * num_directions, batch_size, self.hidden_size, requires_grad=False) 115 | else: 116 | hx, cx = hidden 117 | 118 | ht = [] 119 | for i in range(seq_len): 120 | ht.append([None] * (self.num_layers * num_directions)) 121 | ct = [] 122 | for i in range(seq_len): 123 | ct.append([None] * (self.num_layers * num_directions)) 124 | 125 | seq_len_mask = input.new_ones(batch_size, seq_len, self.hidden_size, requires_grad=False) 126 | if seq_lens != None: 127 | for i, l in enumerate(seq_lens): 128 | seq_len_mask[i, l:, :] = 0 129 | seq_len_mask = seq_len_mask.transpose(0, 1) 130 | 131 | if self.bidirectional: 132 | # if use cuda, change 'torch.LongTensor' to 'torch.cuda.LongTensor' 133 | indices_ = (torch.LongTensor(seq_lens) - 1).unsqueeze(1).unsqueeze(0).unsqueeze(0).repeat( 134 | [1, 1, 1, self.hidden_size]) 135 | # if use cuda, change 'torch.LongTensor' to 'torch.cuda.LongTensor' 136 | indices_reverse = torch.LongTensor([0] * batch_size).unsqueeze(1).unsqueeze(0).unsqueeze(0).repeat( 137 | [1, 1, 1, self.hidden_size]) 138 | indices = torch.cat((indices_, indices_reverse), dim=1) 139 | hy = [] 140 | cy = [] 141 | xs = input 142 | # Variational Dropout 143 | if not self.training or self.dropout == 0: 144 | dropout_mask = input.new_ones(self.num_layers, 2, batch_size, self.hidden_size) 145 | else: 146 | dropout_mask = input.new(self.num_layers, 2, batch_size, self.hidden_size).bernoulli_(1 - self.dropout) 147 | dropout_mask = Variable(dropout_mask, requires_grad=False) / (1 - self.dropout) 148 | 149 | for l, (layer0, layer1) in enumerate(zip(self.hidden0, self.hidden1)): 150 | l0, l1 = 2 * l, 2 * l + 1 151 | h0, c0, h1, c1 = hx[l0], cx[l0], hx[l1], cx[l1] 152 | for t, (x0, x1) in enumerate(zip(xs, reversed(xs))): 153 | ht_, ct_ = layer0(x0, (h0, c0)) 154 | ht[t][l0] = ht_ * seq_len_mask[t] 155 | ct[t][l0] = ct_ * seq_len_mask[t] 156 | h0, c0 = ht[t][l0], ct[t][l0] 157 | t = seq_len - 1 - t 158 | ht_, ct_ = layer1(x1, (h1, c1)) 159 | ht[t][l1] = ht_ * seq_len_mask[t] 160 | ct[t][l1] = ct_ * seq_len_mask[t] 161 | h1, c1 = ht[t][l1], ct[t][l1] 162 | 163 | xs = [torch.cat((h[l0]*dropout_mask[l][0], h[l1]*dropout_mask[l][1]), dim=1) for h in ht] 164 | ht_temp = torch.stack([torch.stack([h[l0], h[l1]]) for h in ht]) 165 | ct_temp = torch.stack([torch.stack([c[l0], c[l1]]) for c in ct]) 166 | if len(hy) == 0: 167 | hy = torch.stack(list(ht_temp.gather(dim=0, index=indices).squeeze(0))) 168 | else: 169 | hy = torch.cat((hy, torch.stack(list(ht_temp.gather(dim=0, index=indices).squeeze(0)))), dim=0) 170 | if len(cy) == 0: 171 | cy = torch.stack(list(ct_temp.gather(dim=0, index=indices).squeeze(0))) 172 | else: 173 | cy = torch.cat((cy, torch.stack(list(ct_temp.gather(dim=0, index=indices).squeeze(0)))), dim=0) 174 | y = torch.stack(xs) 175 | else: 176 | # if use cuda, change 'torch.LongTensor' to 'torch.cuda.LongTensor' 177 | indices = (torch.cuda.LongTensor(seq_lens) - 1).unsqueeze(1).unsqueeze(0).unsqueeze(0).repeat( 178 | [1, self.num_layers, 1, self.hidden_size]) 179 | h, c = hx, cx 180 | # Variational Dropout 181 | if not self.training or self.dropout == 0: 182 | dropout_mask = input.new_ones(self.num_layers, batch_size, self.hidden_size) 183 | else: 184 | dropout_mask = input.new(self.num_layers, batch_size, self.hidden_size).bernoulli_(1 - self.dropout) 185 | dropout_mask = Variable(dropout_mask, requires_grad=False) / (1 - self.dropout) 186 | 187 | for t, x in enumerate(input): 188 | for l, layer in enumerate(self.hidden0): 189 | ht_, ct_ = layer(x, (h[l], c[l])) 190 | ht[t][l] = ht_ * seq_len_mask[t] 191 | ct[t][l] = ct_ * seq_len_mask[t] 192 | x = ht[t][l] * dropout_mask[l] 193 | ht[t] = torch.stack(ht[t]) 194 | ct[t] = torch.stack(ct[t]) 195 | h, c = ht[t], ct[t] 196 | y = torch.stack([h[-1]*dropout_mask[-1] for h in ht]) 197 | hy = torch.stack(list(torch.stack(ht).gather(dim=0, index=indices).squeeze(0))) 198 | cy = torch.stack(list(torch.stack(ct).gather(dim=0, index=indices).squeeze(0))) 199 | 200 | return y, (hy, cy) 201 | 202 | 203 | # ### 模型训练与测试 204 | class pooling_layer(nn.Module): 205 | def __init__(self, input_dim, output_dim): 206 | super(pooling_layer, self).__init__() 207 | self.linear = nn.Linear(input_dim, output_dim) 208 | self.input_dim = input_dim 209 | self.output_dim = output_dim 210 | 211 | def forward(self, inputs, cuda=True): 212 | inputs_sent = [torch.cat([self.linear(sent_tensor.cuda() if cuda else sent_tensor).max(axis=0)[0].unsqueeze(0) for sent_tensor in seq]) for seq in inputs] 213 | seqs = torch.nn.utils.rnn.pad_sequence(inputs_sent, batch_first=True) 214 | return seqs 215 | 216 | class RDM_Model(nn.Module): 217 | def __init__(self, word_embedding_dim, sent_embedding_dim, hidden_dim, dropout_prob): 218 | super(RDM_Model, self).__init__() 219 | self.embedding_dim = sent_embedding_dim 220 | self.hidden_dim = hidden_dim 221 | self.gru_model = nn.GRU(word_embedding_dim, 222 | self.hidden_dim, 223 | batch_first=True, 224 | dropout=dropout_prob 225 | ) 226 | self.DropLayer = nn.Dropout(dropout_prob) 227 | 228 | def forward(self, input_x): 229 | """ 230 | input_x: [batchsize, max_seq_len, sentence_embedding_dim] 231 | x_len: [batchsize] 232 | init_states: [batchsize, hidden_dim] 233 | """ 234 | batchsize, max_seq_len, emb_dim = input_x.shape 235 | init_states = torch.zeros([1, batchsize, self.hidden_dim], dtype=torch.float32).cuda() 236 | try: 237 | df_outputs, df_last_state = self.gru_model(input_x, init_states) 238 | except: 239 | print("Error:", input_x.shape, init_states.shape) 240 | raise 241 | return df_outputs 242 | 243 | class RDM_Model_V1(nn.Module): 244 | def __init__(self, word_embedding_dim, sent_embedding_dim, hidden_dim, dropout_prob): 245 | super(RDM_Model_V1, self).__init__() 246 | self.embedding_dim = sent_embedding_dim 247 | self.hidden_dim = hidden_dim 248 | self.gru_model = LayerNormLSTM(word_embedding_dim, 249 | self.hidden_dim, 250 | dropout=dropout_prob 251 | ) 252 | 253 | def forward(self, input_x, seq_lens): 254 | """ 255 | input_x: [batchsize, max_seq_len, sentence_embedding_dim] 256 | x_len: [batchsize] 257 | init_states: [batchsize, hidden_dim] 258 | """ 259 | batchsize, max_seq_len, emb_dim = input_x.shape 260 | h0 = torch.zeros([1, batchsize, self.hidden_dim], dtype=torch.float32).cuda() 261 | c0 = torch.zeros([1, batchsize, self.hidden_dim], dtype=torch.float32).cuda() 262 | df_outputs, (df_last_state, df_last_cell) = self.gru_model(input_x.transpose(0, 1), (h0, c0), seq_lens) 263 | return df_outputs.transpose(0, 1), df_last_state.transpose(0, 1), df_last_cell.transpose(0, 1) 264 | 265 | class CM_Model_V1(nn.Module): 266 | def __init__(self, hidden_dim, action_num): 267 | super(CM_Model_V1, self).__init__() 268 | self.hidden_dim = hidden_dim 269 | self.action_num = action_num 270 | self.DenseLayer = nn.Linear(self.hidden_dim, 64) 271 | self.Classifier = nn.Linear(64, self.action_num) 272 | 273 | def forward(self, rdm_state): 274 | """ 275 | rdm_state: [batchsize, hidden_dim] 276 | """ 277 | batchsize, hidden_dim = rdm_state.shape 278 | rl_h1 = nn.functional.relu( 279 | self.DenseLayer( 280 | rdm_state 281 | ) 282 | ) 283 | stopScore = self.Classifier(rl_h1) 284 | isStop = stopScore.argmax(axis=1) 285 | return stopScore, isStop 286 | 287 | class CM_Model(nn.Module): 288 | def __init__(self, sentence_embedding_dim, hidden_dim, action_num): 289 | super(CM_Model, self).__init__() 290 | self.sentence_embedding_dim = sentence_embedding_dim 291 | self.hidden_dim = hidden_dim 292 | self.action_num = action_num 293 | # self.PoolLayer = pooling_layer(self.embedding_dim, 294 | # self.hidden_dim) 295 | self.DenseLayer = nn.Linear(self.hidden_dim, 64) 296 | self.Classifier = nn.Linear(64, self.action_num) 297 | 298 | def forward(self, rdm_model, rl_input, rl_state): 299 | """ 300 | rl_input: [batchsize, max_word_num, sentence_embedding_dim] 301 | rl_state: [1, batchsize, hidden_dim] 302 | """ 303 | assert(rl_input.ndim==3) 304 | batchsize, max_word_num, embedding_dim = rl_input.shape 305 | rl_output, rl_new_state = rdm_model.gru_model( 306 | rl_input, 307 | rl_state 308 | ) 309 | rl_h1 = nn.functional.relu( 310 | self.DenseLayer( 311 | # rl_state.reshape([len(rl_input), self.hidden_dim]) #it is not sure to take rl_state , rather than rl_output, as the feature 312 | rl_output.reshape( 313 | [len(rl_input), self.hidden_dim] 314 | ) 315 | ) 316 | ) 317 | stopScore = self.Classifier(rl_h1) 318 | isStop = stopScore.argmax(axis=1) 319 | return stopScore, isStop, rl_new_state 320 | 321 | 322 | # In[13]: 323 | 324 | 325 | load_data_fast() 326 | 327 | rdm_model = RDM_Model(300, 300, 256, 0.2).cuda() 328 | sent_pooler = pooling_layer(300, 300).cuda() 329 | rdm_classifier = nn.Linear(256, 2).cuda() 330 | cm_model = CM_Model_V1(256, 2).cuda() 331 | 332 | log_dir = os.path.join(sys.path[0], "ERD_CN/") 333 | 334 | with open("../../config.json", "r") as cr: 335 | dic = json.load(cr) 336 | 337 | class adict(dict): 338 | ''' Attribute dictionary - a convenience data structure, similar to SimpleNamespace in python 3.3 339 | One can use attributes to read/write dictionary content. 340 | ''' 341 | def __init__(self, *av, **kav): 342 | dict.__init__(self, *av, **kav) 343 | self.__dict__ = self 344 | 345 | FLAGS = adict(dic) 346 | 347 | # #### 导入模型预训练参数 348 | # pretrained_file = "%s/ERD_best.pkl"%log_dir 349 | pretrained_file = "ERD/ERD_CN_best.pkl" 350 | if os.path.exists(pretrained_file): 351 | checkpoint = torch.load(pretrained_file) 352 | sent_pooler.load_state_dict(checkpoint['sent_pooler']) 353 | rdm_model.load_state_dict(checkpoint["rmdModel"]) 354 | rdm_classifier.load_state_dict(checkpoint["rdm_classifier"]) 355 | else: 356 | TrainRDMModel_V0(rdm_model, sent_pooler, rdm_classifier, 357 | t_steps=5000, stage=0, new_data_len=[], valid_new_len=[], logger=None, 358 | log_dir=log_dir, cuda=True) 359 | 360 | 361 | 362 | #### 标准ERD模型 363 | for i in range(20): 364 | erd_save_as = '%s/erdModel_epoch%03d.pkl'% (log_dir , i) 365 | if i==0: 366 | TrainCMModel_V3(sent_pooler, rdm_model, rdm_classifier, cm_model, 0, 0.5, 20000, log_dir, None, FLAGS, cuda=True) 367 | else: 368 | TrainCMModel_V3(sent_pooler, rdm_model, rdm_classifier, cm_model, 0, 0.5, 2000, log_dir, None, FLAGS, cuda=True) 369 | torch.save( 370 | { 371 | "sent_pooler":sent_pooler.state_dict(), 372 | "rmdModel":rdm_model.state_dict(), 373 | "rdm_classifier": rdm_classifier.state_dict(), 374 | "cm_model":cm_model.state_dict() 375 | }, 376 | erd_save_as 377 | ) 378 | print("iter:", i, ", train cm model completed!") 379 | new_len, valid_new_len = get_new_len(sent_pooler, rdm_model, cm_model, FLAGS, cuda=True) 380 | print("after new len:") 381 | print("new_data_len:", new_len) 382 | print("valid_new_len:", valid_new_len) 383 | TrainRDMModel(rdm_model, sent_pooler, rdm_classifier, 384 | t_steps=1000, stage=0, new_data_len=new_len, valid_new_len=valid_new_len, logger=None, 385 | log_dir=log_dir, cuda=True) 386 | 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /torch/RDM_Model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import torch 4 | import importlib 5 | from tensorboardX import SummaryWriter 6 | import torch.nn.utils.rnn as rnn_utils 7 | import pickle 8 | import tqdm 9 | import os 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.autograd import Variable 13 | from collections import deque 14 | import json 15 | from dataUtils_CN import * 16 | from sklearn.metrics import accuracy_score 17 | from sklearn.metrics import precision_score 18 | from sklearn.metrics import recall_score 19 | 20 | 21 | def TrainRDMModel_V0(rdm_model, sent_pooler, rdm_classifier, 22 | t_steps=100, stage=0, new_data_len=[], valid_new_len=[], best_valid_acc = 0.0, logger=None, 23 | log_dir="RDMBertTrain", cuda=True): 24 | batch_size = 20 25 | sum_loss = 0.0 26 | sum_acc = 0.0 27 | t_acc = 0.9 28 | ret_acc = 0.0 29 | init_states = torch.zeros([1, batch_size, rdm_model.hidden_dim], dtype=torch.float32).cuda() 30 | weight = torch.tensor([2.0, 1.0], dtype=torch.float32).cuda() 31 | loss_fn = nn.CrossEntropyLoss(weight=weight) 32 | optim = torch.optim.Adagrad([ 33 | {'params': sent_pooler.parameters(), 'lr': 5e-3}, 34 | {'params': rdm_model.parameters(), 'lr': 5e-3}, 35 | {'params': rdm_classifier.parameters(), 'lr': 5e-3} 36 | ] 37 | ) 38 | 39 | writer = SummaryWriter(log_dir, filename_suffix="_ERD_CM_stage_%3d"%stage) 40 | for step in range(499, t_steps): 41 | optim.zero_grad() 42 | try: 43 | x, x_len, y = get_df_batch(step*batch_size, batch_size) 44 | seq = sent_pooler(x) 45 | rdm_hiddens = rdm_model(seq) 46 | # rdm_hiddens, rdm_out, rdm_cell = rdm_model(seq, x_len.tolist()) 47 | batchsize, _, _ = rdm_hiddens.shape 48 | rdm_outs = torch.cat( 49 | [ rdm_hiddens[i][x_len[i]-1].unsqueeze(0) for i in range(batchsize)] 50 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 51 | ) 52 | rdm_scores = rdm_classifier( 53 | rdm_outs 54 | ) 55 | rdm_preds = rdm_scores.argmax(axis=1) 56 | y_label = torch.tensor(y).argmax(axis=1).cuda() if cuda else torch.tensor(y).argmax(axis=1) 57 | acc = accuracy_score(y_label.cpu().numpy(), rdm_preds.cpu().numpy()) 58 | loss = loss_fn(rdm_scores, y_label) 59 | loss.backward() 60 | torch.cuda.empty_cache() 61 | 62 | except RuntimeError as exception: 63 | if "out of memory" in str(exception): 64 | print("WARNING: out of memory") 65 | print("%d, %d | x_len:"%(step, j), x_len) 66 | if hasattr(torch.cuda, 'empty_cache'): 67 | torch.cuda.empty_cache() 68 | # time.sleep(5) 69 | raise exception 70 | else: 71 | raise exception 72 | 73 | optim.step() 74 | writer.add_scalar('Train Loss', loss, step) 75 | writer.add_scalar('Train Accuracy', acc, step) 76 | 77 | sum_loss += loss 78 | sum_acc += acc 79 | 80 | torch.cuda.empty_cache() 81 | 82 | if step % 10 == 9: 83 | sum_loss = sum_loss / 10 84 | sum_acc = sum_acc / 10 85 | print('%3d | %d , train_loss/accuracy = %6.8f/%6.7f' % (step, t_steps, 86 | sum_loss, sum_acc, 87 | )) 88 | if step%500 == 499: 89 | valid_acc = accuracy_on_valid_data(rdm_model, sent_pooler, rdm_classifier) 90 | if valid_acc > best_valid_acc: 91 | print("valid_acc:", valid_acc) 92 | writer.add_scalar('Valid Accuracy', valid_acc, step) 93 | best_valid_acc = valid_acc 94 | if stage != 0: 95 | rdm_save_as = '%s/ERD_best_%d.pkl'% (log_dir, stage) 96 | else: 97 | rdm_save_as = '%s/ERD_best.pkl'% (log_dir) 98 | torch.save( 99 | { 100 | "rmdModel":rdm_model.state_dict(), 101 | "sent_pooler":sent_pooler.state_dict(), 102 | "rdm_classifier": rdm_classifier.state_dict() 103 | }, 104 | rdm_save_as 105 | ) 106 | sum_acc = 0.0 107 | sum_loss = 0.0 108 | print(get_curtime() + " Train df Model End.") 109 | return ret_acc -------------------------------------------------------------------------------- /torch/dataUtils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import datetime 5 | import numpy as np 6 | import gensim 7 | import random 8 | import math 9 | import re 10 | import pickle 11 | import torch 12 | 13 | files = [] 14 | data = {} 15 | data_ID = [] 16 | data_len = [] 17 | data_y = [] 18 | 19 | valid_data_ID = [] 20 | valid_data_y = [] 21 | valid_data_len = [] 22 | # word2vec = gensim.models.KeyedVectors.load_word2vec_format('/home/hadoop/word2vec.model') 23 | with open("/home/hadoop/word2vec.txt", "rb") as handle: 24 | word2vec = pickle.load(handle) 25 | print("load glove finished") 26 | # c2vec = chars2vec.load_model('eng_300') 27 | reward_counter = 0 28 | eval_flag = 0 29 | 30 | def get_data_ID(): 31 | global data_ID 32 | return data_ID 33 | 34 | def get_data_len(): 35 | global data_len 36 | return data_len 37 | 38 | def get_curtime(): 39 | return time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) 40 | 41 | 42 | def list_files(data_path): 43 | global data, files 44 | fs = os.listdir(data_path) 45 | for f1 in fs: 46 | tmp_path = os.path.join(data_path, f1) 47 | if not os.path.isdir(tmp_path): 48 | if tmp_path.split('.')[-1] == 'json': 49 | files.append(tmp_path) 50 | else: 51 | list_files(tmp_path) 52 | 53 | 54 | def str2timestamp(str_time): 55 | month = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 56 | 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 57 | 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'} 58 | ss = str_time.split(' ') 59 | m_time = ss[5] + "-" + month[ss[1]] + '-' + ss[2] + ' ' + ss[3] 60 | d = datetime.datetime.strptime(m_time, "%Y-%m-%d %H:%M:%S") 61 | t = d.timetuple() 62 | timeStamp = int(time.mktime(t)) 63 | return timeStamp 64 | 65 | 66 | def data_process(file_path): 67 | ret = {} 68 | ss = file_path.split("/") 69 | data = json.load(open(file_path, mode="r", encoding="utf-8")) 70 | # 'Wed Jan 07 11:14:08 +0000 2015' 71 | # print("SS:", ss) 72 | ret[ss[6]] = {'label': ss[5], 'text': [data['text'].lower()], 'created_at': [str2timestamp(data['created_at'])]} 73 | return ret 74 | 75 | def transIrregularWord(line): 76 | if not line: 77 | return '' 78 | line.lower() 79 | line = re.sub("@[^ ]*", "{ mention someone }", line) 80 | line = re.sub("#[^ ]*", "{ special topic }", line) 81 | line = re.sub("http(.?)://[^ ]*", "{ a special link }", line) 82 | return line 83 | 84 | 85 | def load_test_data_fast(): 86 | global data, data_ID, data_len, data_y, eval_flag 87 | with open("data/data_dict.txt", "rb") as handle: 88 | data = pickle.load(handle) 89 | data_ID = np.load("data/test_data_ID.npy").tolist() 90 | data_len = np.load("data/test_data_len.npy").tolist() 91 | data_y = np.load("data/test_data_y.npy").tolist() 92 | max_sent = max( map(lambda value: max(map(lambda txt_list: len(txt_list), value['text']) ), list(data.values()) ) ) 93 | print("max_sent:", max_sent, ", max_seq_len:", max(data_len)) 94 | eval_flag = int(len(data_ID) / 4) * 3 95 | print("{} data loaded".format(len(data))) 96 | 97 | def load_data_fast(): 98 | global data, data_ID, data_len, data_y, valid_data_ID, valid_data_y, valid_data_len 99 | with open("data/data_dict.txt", "rb") as handle: 100 | data = pickle.load(handle) 101 | data_ID = np.load("data/data_ID.npy").tolist() 102 | data_len = np.load("data/data_len.npy").tolist() 103 | data_y = np.load("data/data_y.npy").tolist() 104 | # valid_data_ID = np.load("data/valid_data_ID.npy").tolist() 105 | # valid_data_len = np.load("data/valid_data_len.npy").tolist() 106 | # valid_data_y = np.load("data/valid_data_y.npy").tolist() 107 | valid_data_ID = np.load("data/test_data_ID.npy").tolist() 108 | valid_data_len = np.load("data/test_data_len.npy").tolist() 109 | valid_data_y = np.load("data/test_data_y.npy").tolist() 110 | max_sent = max( map(lambda value: max(map(lambda txt_list: len(txt_list), value['text']) ), list(data.values()) ) ) 111 | print("max_sent:", max_sent, ", max_seq_len:", max(data_len)) 112 | eval_flag = int(len(data_ID) / 4) * 3 113 | print("{} data loaded".format(len(data))) 114 | 115 | 116 | 117 | def sortTempList(temp_list): 118 | time = np.array([item[0] for item in temp_list]) 119 | posts = np.array([item[1] for item in temp_list]) 120 | idxs = time.argsort().tolist() 121 | rst = [[t, p] for (t, p) in zip(time[idxs], posts[idxs])] 122 | del time, posts 123 | return rst 124 | 125 | def load_data(data_path, FLAGS): 126 | # get data files path 127 | global data, files, data_ID, data_len, eval_flag 128 | data = {} 129 | files = [] 130 | data_ID = [] 131 | data_len = [] 132 | list_files(data_path) #load all filepath to files 133 | max_sent = 0 134 | # load data to json 135 | for file in files: 136 | td = data_process(file) # read out the information from json file, and organized it as {dataID:{'key':val}} 137 | for key in td.keys(): # use temporary data to organize the final whole data 138 | if key in data: 139 | data[key]['text'].append(td[key]['text'][0]) 140 | data[key]['created_at'].append(td[key]['created_at'][0]) 141 | else: 142 | data[key] = td[key] 143 | # convert to my data style 144 | for key, value in data.items(): 145 | temp_list = [] 146 | for i in range(len(data[key]['text'])): 147 | temp_list.append([data[key]['created_at'][i], data[key]['text'][i]]) 148 | temp_list = sortTempList(temp_list) 149 | data[key]['text'] = [] 150 | data[key]['created_at'] = [] 151 | ttext = "" 152 | last = 0 153 | for i in range(len(temp_list)): 154 | # if temp_list[i][0] - temp_list[0][0] > FLAGS.time_limit * 3600 or len(data[key]['created_at']) >= 100: 155 | # break 156 | if i % FLAGS.post_fn == 0: # merge the fixed number of texts in a time interval 157 | if len(ttext) > 0: # if there are data already in ttext, output it as a new instance 158 | words = transIrregularWord(ttext) 159 | data[key]['text'].append(words) 160 | data[key]['created_at'].append(temp_list[i][0]) 161 | ttext = temp_list[i][1] 162 | else: 163 | ttext += " " + temp_list[i][1] 164 | last = i 165 | # keep the last one 166 | if len(ttext) > 0: 167 | words = transIrregularWord(ttext) 168 | data[key]['text'].append(words) 169 | data[key]['created_at'].append(temp_list[last][0]) 170 | 171 | for key in data.keys(): 172 | data_ID.append(key) 173 | data_ID = random.sample(data_ID, len(data_ID)) #shuffle the data id 174 | for i in range(len(data_ID)): #pre processing the extra informations 175 | data_len.append(len(data[data_ID[i]]['text'])) 176 | if data[data_ID[i]]['label'] == "rumours": 177 | data_y.append([1.0, 0.0]) 178 | else: 179 | data_y.append([0.0, 1.0]) 180 | eval_flag = int(len(data_ID) / 4) * 3 181 | print("{} data loaded".format(len(data))) 182 | 183 | 184 | def get_df_batch(start, batch_size, new_data_len=[], cuda=True): 185 | data_x = [] 186 | m_data_y = np.zeros([batch_size, 2], dtype=np.int32) 187 | m_data_len = np.zeros([batch_size], dtype=np.int32) 188 | miss_vec = 0 189 | hit_vec = 0 190 | if len(new_data_len) > 0: 191 | t_data_len = new_data_len 192 | else: 193 | t_data_len = data_len 194 | mts = start * batch_size 195 | if mts >= len(data_ID): 196 | mts = mts % len(data_ID) 197 | 198 | for i in range(batch_size): 199 | m_data_y[i] = data_y[mts] 200 | m_data_len[i] = t_data_len[mts] 201 | seq = [] 202 | for j in range(t_data_len[mts]): 203 | sent = [] 204 | t_words = transIrregularWord(data[data_ID[mts]]['text'][j]).split(" ") 205 | for k in range(len(t_words)): 206 | m_word = t_words[k] 207 | try: 208 | sent.append( torch.tensor([word2vec[m_word]], dtype=torch.float32) ) 209 | except KeyError: 210 | miss_vec += 1 211 | sent.append( torch.tensor([word2vec['{'] +word2vec['an'] + word2vec['unknown'] + word2vec['word'] + word2vec['}'] ], dtype=torch.float32) ) 212 | except IndexError: 213 | raise 214 | else: 215 | hit_vec += 1 216 | sent_tensor = torch.cat(sent) 217 | seq.append(sent_tensor) 218 | data_x.append(seq) 219 | mts += 1 220 | if mts >= len(data_ID): # read data looply 221 | mts = mts % len(data_ID) 222 | 223 | return data_x, m_data_len, m_data_y 224 | 225 | 226 | # seq_states is the date_x to get 227 | # max_id is the next corpus to take 228 | def get_rl_batch(ids, seq_states, stop_states, counter_id, start_id, total_data): 229 | input_x = np.zeros([FLAGS.batch_size, FLAGS.max_sent_len, FLAGS.embedding_dim], dtype=np.float32) 230 | input_y = np.zeros([FLAGS.batch_size, FLAGS.class_num], dtype=np.float32) 231 | miss_vec = 0 232 | total_data = len(data_len) 233 | 234 | for i in range(FLAGS.batch_size): 235 | # seq_states:records the id of a sentence in a sequence 236 | # stop_states: records whether the sentence is judged by the program 237 | if stop_states[i] == 1 or seq_states[i] >= data_len[ids[i]]: 238 | ids[i] = counter_id + start_id 239 | seq_states[i] = 0 240 | try: 241 | t_words = data[ data_ID[ids[i]] ]['text'][seq_states[i]] 242 | except: 243 | print(ids[i], seq_states[i]) 244 | for j in range(len(t_words)): 245 | m_word = t_words[j] 246 | try: 247 | input_x[i][j] = word2vec[m_word] 248 | except: 249 | miss_vec = 1 250 | # if len(t_words) != 0: 251 | # input_x[i][:len(t_words)] = c2vec.vectorize_words(t_words) 252 | input_y[i] = data_y[ids[i]] 253 | counter_id += 1 254 | counter_id = counter_id % total_data 255 | else: 256 | try: 257 | t_words = data[ data_ID[ids[i]] ]['text'][seq_states[i]] 258 | except: 259 | print("ids and seq_states:", ids[i], seq_states[i]) 260 | t_words = [] 261 | for j in range(len(t_words)): 262 | m_word = t_words[j] 263 | try: 264 | input_x[i][j] = word2vec[m_word] 265 | except: 266 | miss_vec += 1 267 | 268 | # if len(t_words) != 0: 269 | # input_x[i][:len(t_words)] = c2vec.vectorize_words(t_words) 270 | input_y[i] = data_y[ids[i]] 271 | # point to the next sequence 272 | seq_states[i] += 1 273 | 274 | return input_x, input_y, ids, seq_states, counter_id 275 | 276 | def accuracy_on_valid_data_V1(rdm_model = None, sent_pooler = None, rdm_classifier=None, new_data_len=[], cuda=True): 277 | def Count_Acc(ylabel, preds): 278 | correct_preds = np.array( 279 | [1 if y1==y2 else 0 280 | for (y1, y2) in zip(ylabel, preds)] 281 | ) 282 | acc = sum(correct_preds) / (1.0 * len(ylabel)) 283 | return acc 284 | 285 | batch_size = 20 286 | t_steps = int(len(valid_data_ID)/batch_size) 287 | sum_acc = 0.0 288 | miss_vec = 0 289 | mts = 0 290 | hit_vec = 0 291 | if len(new_data_len) > 0: 292 | t_data_len = new_data_len 293 | else: 294 | t_data_len = valid_data_len 295 | 296 | for step in range(t_steps): 297 | data_x = [] 298 | m_data_y = np.zeros([batch_size, 2], dtype=np.int32) 299 | m_data_len = np.zeros([batch_size], dtype=np.int32) 300 | for i in range(batch_size): 301 | m_data_y[i] = valid_data_y[mts] 302 | m_data_len[i] = t_data_len[mts] 303 | seq = [] 304 | for j in range(t_data_len[mts]): 305 | sent = [] 306 | t_words = transIrregularWord(data[valid_data_ID[mts]]['text'][j]).split(" ") 307 | for k in range(len(t_words)): 308 | m_word = t_words[k] 309 | try: 310 | sent.append( torch.tensor([word2vec[m_word]], dtype=torch.float32)) 311 | except KeyError: 312 | miss_vec += 1 313 | sent.append( torch.tensor([word2vec['{'] +word2vec['an'] + word2vec['unknown'] + word2vec['word'] + word2vec['}'] ], dtype=torch.float32) ) 314 | except IndexError: 315 | raise 316 | else: 317 | hit_vec += 1 318 | if len(sent) != 0 : 319 | sent_tensor = torch.cat(sent) 320 | else: 321 | print("empty sentence:", t_words) 322 | seq.append(sent_tensor) 323 | data_x.append(seq) 324 | mts += 1 325 | if mts >= len(valid_data_ID): # read data looply 326 | mts = mts % len(valid_data_ID) 327 | 328 | 329 | if rdm_model is not None and sent_pooler is not None and rdm_classifier is not None: 330 | with torch.no_grad(): 331 | seq = sent_pooler(data_x) 332 | rdm_hiddens, rdm_out, rdm_cell = rdm_model(seq, m_data_len.tolist()) 333 | batchsize, _, _ = rdm_hiddens.shape 334 | rdm_outs = torch.cat( 335 | [ rdm_hiddens[i][m_data_len[i]-1].unsqueeze(0) for i in range(batchsize)] 336 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 337 | ) 338 | rdm_scores = rdm_classifier( 339 | rdm_outs 340 | ) 341 | rdm_preds = rdm_scores.argmax(axis=1) 342 | y_label = torch.tensor(m_data_y).argmax(axis=1).cuda() if cuda else torch.tensor(m_data_y).argmax(axis=1) 343 | acc = Count_Acc(y_label, rdm_preds) 344 | sum_acc += acc 345 | mean_acc = sum_acc / (1.0*t_steps) 346 | return mean_acc 347 | 348 | 349 | def accuracy_on_valid_data(rdm_model = None, sent_pooler = None, rdm_classifier=None, new_data_len=[], cuda=True): 350 | def Count_Acc(ylabel, preds): 351 | correct_preds = np.array( 352 | [1 if y1==y2 else 0 353 | for (y1, y2) in zip(ylabel, preds)] 354 | ) 355 | acc = sum(correct_preds) / (1.0 * len(ylabel)) 356 | return acc 357 | 358 | batch_size = 20 359 | t_steps = int(len(valid_data_ID)/batch_size) 360 | sum_acc = 0.0 361 | miss_vec = 0 362 | mts = 0 363 | hit_vec = 0 364 | if len(new_data_len) > 0: 365 | t_data_len = new_data_len 366 | else: 367 | t_data_len = valid_data_len 368 | 369 | for step in range(t_steps): 370 | data_x = [] 371 | m_data_y = np.zeros([batch_size, 2], dtype=np.int32) 372 | m_data_len = np.zeros([batch_size], dtype=np.int32) 373 | for i in range(batch_size): 374 | m_data_y[i] = valid_data_y[mts] 375 | m_data_len[i] = t_data_len[mts] 376 | seq = [] 377 | for j in range(t_data_len[mts]): 378 | sent = [] 379 | t_words = transIrregularWord(data[valid_data_ID[mts]]['text'][j]).split(" ") 380 | for k in range(len(t_words)): 381 | m_word = t_words[k] 382 | try: 383 | sent.append( torch.tensor([word2vec[m_word]], dtype=torch.float32)) 384 | except KeyError: 385 | miss_vec += 1 386 | sent.append( torch.tensor([word2vec['{'] +word2vec['an'] + word2vec['unknown'] + word2vec['word'] + word2vec['}'] ], dtype=torch.float32) ) 387 | except IndexError: 388 | raise 389 | else: 390 | hit_vec += 1 391 | if len(sent) != 0 : 392 | sent_tensor = torch.cat(sent) 393 | else: 394 | print("empty sentence:", t_words) 395 | seq.append(sent_tensor) 396 | data_x.append(seq) 397 | mts += 1 398 | if mts >= len(valid_data_ID): # read data looply 399 | mts = mts % len(valid_data_ID) 400 | 401 | 402 | if rdm_model is not None and sent_pooler is not None and rdm_classifier is not None: 403 | with torch.no_grad(): 404 | seq = sent_pooler(data_x) 405 | rdm_hiddens = rdm_model(seq) 406 | batchsize, _, _ = rdm_hiddens.shape 407 | rdm_outs = torch.cat( 408 | [ rdm_hiddens[i][m_data_len[i]-1].unsqueeze(0) for i in range(batchsize)] 409 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 410 | ) 411 | rdm_scores = rdm_classifier( 412 | rdm_outs 413 | ) 414 | rdm_preds = rdm_scores.argmax(axis=1) 415 | y_label = torch.tensor(m_data_y).argmax(axis=1).cuda() if cuda else torch.tensor(m_data_y).argmax(axis=1) 416 | acc = Count_Acc(y_label, rdm_preds) 417 | sum_acc += acc 418 | mean_acc = sum_acc / (1.0*t_steps) 419 | return mean_acc 420 | 421 | 422 | # not to stop -0.1, so that to be early 423 | # DDQN y = r + Q(S, argmax(Q)) 424 | def get_rl_batch(ids, seq_states, counter_id, start_id, FLAGS): 425 | # input_x = np.zeros([FLAGS.batch_size, FLAGS.max_sent_len, FLAGS.max_char_num], dtype=np.float32) 426 | input_x = [] # [batch_size, sent_len] 427 | 428 | batch_size = len(ids) 429 | input_y = np.zeros([batch_size, FLAGS.class_num], dtype=np.float32) 430 | miss_vec = 0 431 | total_data = len(data_len) 432 | for i in range(batch_size): 433 | # seq_states:records the id of a sentence in a sequence 434 | # stop_states: records whether the sentence is judged by the program 435 | if seq_states[i] >= data_len[ids[i]]: 436 | # stop之后, 要换一个新的序列,新序列的下标也要重新进行标记,从头开始计数. 437 | ids[i] = counter_id + start_id 438 | seq_states[i] = 0 439 | counter_id += 1 440 | counter_id = counter_id % total_data 441 | # point to the next sequence 442 | else: 443 | seq_states[i] += 1 444 | return ids, seq_states, counter_id 445 | 446 | 447 | def get_reward_0(isStop, ss, pys, ids, seq_ids): 448 | global reward_counter 449 | reward = torch.zeros([len(isStop)], dtype=torch.float32) 450 | Q_Val = torch.zeros([len(isStop)], dtype= torch.float32) 451 | for i in range(len(isStop)): 452 | if isStop[i] == 1: 453 | try: 454 | if pys[ids[i]][seq_ids[i]-1].argmax() == np.argmax(data_y[ids[i]]): 455 | reward_counter += 1 # more number of correct prediction, more rewards 456 | r = 1 + FLAGS.reward_rate * math.log(reward_counter) 457 | reward[i] = r 458 | else: 459 | reward[i] = -100 460 | except: 461 | print("i:", i) 462 | print("ids_i:", ids[i]) 463 | print("seq_ids:", seq_ids[i]) 464 | print("pys:", pys[ids[i]]) 465 | raise 466 | Q_Val[i] = reward[i] 467 | else: 468 | reward[i] = -0.01 469 | Q_Val[i] = reward[i] + 0.99 * max(ss[i]) 470 | return reward, Q_Val 471 | 472 | 473 | def get_reward(isStop, ss, pys, ids, seq_ids): 474 | global reward_counter 475 | reward = torch.zeros([len(isStop)], dtype=torch.float32) 476 | Q_Val = torch.zeros([len(isStop)], dtype= torch.float32) 477 | for i in range(len(isStop)): 478 | if isStop[i] == 1: 479 | if pys[ids[i]][seq_ids[i]-1].argmax() == np.argmax(data_y[ids[i]]): 480 | reward_counter += 1 # more number of correct prediction, more rewards 481 | r = 1 + min(FLAGS.reward_rate * math.log(reward_counter), 10) 482 | reward[i] = r 483 | else: 484 | reward[i] = -100 485 | Q_Val[i] = reward[i] 486 | else: 487 | reward[i] = -0.01 488 | Q_Val[i] = reward[i] + 0.99 * max(ss[i]) 489 | return reward, Q_Val 490 | 491 | 492 | # def get_reward_v1(isStop, ss, pys, ids, seq_ids, cm_model, rdm_outs): 493 | def get_reward_v1(isStop, mss, ssq, ids, seq_states, cm_model, rdm_hiddens_seq): 494 | global reward_counter 495 | reward = torch.zeros([len(isStop)], dtype=torch.float32) 496 | Q_Val = torch.zeros([len(isStop)], dtype= torch.float32) 497 | for i in range(len(isStop)): 498 | if isStop[i] == 1: 499 | if ssq[ids[i]][seq_states[i]-1].argmax() == np.argmax(data_y[ids[i]]): 500 | reward_counter += 1 # more number of correct prediction, more rewards 501 | r = 1 + min(FLAGS.reward_rate * math.log(reward_counter), 10) 502 | reward[i] = r 503 | if data_len[ids[i]] > seq_states[i]: 504 | with torch.no_grad(): 505 | subsequent_score = cm_model.Classifier( 506 | nn.functional.relu( 507 | cm_model.DenseLayer( 508 | rdm_hiddens_seq[ids[i]] 509 | ) 510 | ) 511 | ) 512 | torch.cuda.empty_cache() 513 | for j in range(seq_states[i], data_len[ids[i]]): 514 | if subsequent_score[j][0] > subsequent_score[j][1]: 515 | reward[i] += -20 516 | break 517 | else: 518 | reward[i] += 15.0/data_len[ids[i]] 519 | else: 520 | reward[i] = -100 521 | Q_Val[i] = reward[i] 522 | else: 523 | reward[i] = -0.01 524 | Q_Val[i] = reward[i] + 0.99 * max(mss[i]) 525 | return reward, Q_Val 526 | 527 | 528 | 529 | def get_new_len(sent_pooler, rdm_model, cm_model, FLAGS, cuda): 530 | batch_size = 20 531 | new_len = [] 532 | valid_new_len = [] 533 | if len(data_ID) % batch_size == 0: # the total number of events 534 | flags = int(len(data_ID) / FLAGS.batch_size) 535 | else: 536 | flags = int(len(data_ID) / FLAGS.batch_size) + 1 537 | for i in range(flags): 538 | with torch.no_grad(): 539 | x, x_len, y = get_df_batch(i, batch_size) 540 | seq = sent_pooler(x) 541 | rdm_hiddens = rdm_model(seq) 542 | batchsize, _, _ = rdm_hiddens.shape 543 | rdm_outs = torch.cat( 544 | [ rdm_hiddens[i][x_len[i]-1] for i in range(batchsize)] 545 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 546 | ).reshape( 547 | [-1, rdm_model.hidden_dim] 548 | ) 549 | stopScores = cm_model.Classifier( 550 | nn.functional.relu( 551 | cm_model.DenseLayer( 552 | rdm_hiddens.reshape([-1, rdm_model.hidden_dim]) 553 | ) 554 | ) 555 | ).reshape( 556 | [batchsize, -1, 2] 557 | ) 558 | isStop = stopScores.argmax(axis=-1).cpu().numpy() 559 | 560 | tmp_len = [iS.argmax()+1 if (iS.max() ==1 and (iS.argmax()+1)= len(data_ID): # read data looply 609 | mts = mts % len(data_ID) 610 | with torch.no_grad(): 611 | seq = sent_pooler(data_x) 612 | rdm_hiddens = rdm_model(seq) 613 | batchsize, _, _ = rdm_hiddens.shape 614 | rdm_outs = torch.cat( 615 | [ rdm_hiddens[i][m_data_len[i]-1] for i in range(batchsize)] 616 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 617 | ).reshape( 618 | [-1, rdm_model.hidden_dim] 619 | ) 620 | stopScores = cm_model.Classifier( 621 | nn.functional.relu( 622 | cm_model.DenseLayer( 623 | rdm_hiddens.reshape([-1, rdm_model.hidden_dim]) 624 | ) 625 | ) 626 | ).reshape( 627 | [batchsize, -1, 2] 628 | ) 629 | isStop = stopScores.argmax(axis=-1).cpu().numpy() 630 | 631 | tmp_len = [iS.argmax()+1 if (iS.max() ==1 and (iS.argmax()+1) 0: 203 | t_data_len = new_data_len 204 | else: 205 | t_data_len = data_len 206 | mts = start * batch_size 207 | if mts >= len(data_ID): 208 | mts = mts % len(data_ID) 209 | 210 | for i in range(batch_size): 211 | m_data_y[i] = data_y[mts] 212 | m_data_len[i] = t_data_len[mts] 213 | seq = [] 214 | for j in range(t_data_len[mts]): 215 | sent = [] 216 | t_words = data[data_ID[mts]]['text'][j] 217 | if len(t_words) == 0: 218 | print("ID:%s j:%3d empty sentence:"%(valid_data_ID[mts], j), t_words) 219 | continue 220 | 221 | for k in range(len(t_words)): 222 | m_word = t_words[k] 223 | try: 224 | sent.append( torch.tensor([word2vec[m_word]], dtype=torch.float32) ) 225 | except KeyError: 226 | miss_vec += 1 227 | sent.append( torch.tensor([word2vec['{'] +word2vec['未知'] + word2vec['词'] + word2vec['}'] ], dtype=torch.float32) ) 228 | except IndexError: 229 | raise 230 | else: 231 | hit_vec += 1 232 | sent_tensor = torch.cat(sent) 233 | seq.append(sent_tensor) 234 | data_x.append(seq) 235 | mts += 1 236 | if mts >= len(data_ID): # read data looply 237 | mts = mts % len(data_ID) 238 | 239 | return data_x, m_data_len, m_data_y 240 | 241 | 242 | # ``` python 243 | # with open('./data/weibo_dict.txt', 'wb') as handle: 244 | # pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) 245 | # 246 | # idxs = np.random.randn(len(data_ID)).argsort() 247 | # 248 | # data_ID = [data_ID[idx] for idx in idxs] 249 | # data_y = [data_y[idx] for idx in idxs] 250 | # data_len = [data_len[idx] for idx in idxs] 251 | # 252 | # np.save("./data/weibo_ID.npy", np.array(data_ID)[:4000]) 253 | # np.save("./data/weibo_y.npy", np.array(data_y)[:4000]) 254 | # np.save("./data/weibo_len.npy", np.array(data_len)[:4000]) 255 | # 256 | # np.save("./data/test_weibo_ID.npy", np.array(data_ID)[4000:]) 257 | # np.save("./data/test_weibo_y.npy", np.array(data_y)[4000:]) 258 | # np.save("./data/test_weibo_len.npy", np.array(data_len)[4000:]) 259 | # ``` 260 | 261 | # In[44]: 262 | 263 | 264 | def get_rl_batch(ids, seq_states, stop_states, counter_id, start_id, total_data): 265 | input_x = np.zeros([FLAGS.batch_size, FLAGS.max_sent_len, FLAGS.embedding_dim], dtype=np.float32) 266 | input_y = np.zeros([FLAGS.batch_size, FLAGS.class_num], dtype=np.float32) 267 | miss_vec = 0 268 | total_data = len(data_len) 269 | for i in range(FLAGS.batch_size): 270 | # seq_states:records the id of a sentence in a sequence 271 | # stop_states: records whether the sentence is judged by the program 272 | if stop_states[i] == 1 or seq_states[i] >= data_len[ids[i]]: 273 | ids[i] = counter_id + start_id 274 | seq_states[i] = 0 275 | try: 276 | t_words = data[ data_ID[ids[i]] ]['text'][seq_states[i]] 277 | except: 278 | print(ids[i], seq_states[i]) 279 | for j in range(len(t_words)): 280 | m_word = t_words[j] 281 | try: 282 | input_x[i][j] = word2vec[m_word] 283 | except: 284 | miss_vec = 1 285 | input_y[i] = data_y[ids[i]] 286 | counter_id += 1 287 | counter_id = counter_id % total_data 288 | else: 289 | try: 290 | t_words = data[ data_ID[ids[i]] ]['text'][seq_states[i]] 291 | except: 292 | print("ids and seq_states:", ids[i], seq_states[i]) 293 | t_words = [] 294 | for j in range(len(t_words)): 295 | m_word = t_words[j] 296 | try: 297 | input_x[i][j] = word2vec[m_word] 298 | except: 299 | miss_vec += 1 300 | input_y[i] = data_y[ids[i]] 301 | # point to the next sequence 302 | seq_states[i] += 1 303 | 304 | return input_x, input_y, ids, seq_states, counter_id 305 | 306 | 307 | # In[46]: 308 | 309 | 310 | def accuracy_on_valid_data(rdm_model = None, sent_pooler = None, rdm_classifier=None, new_data_len=[], cuda=True): 311 | batch_size = 20 312 | t_steps = int(len(valid_data_ID)/batch_size) 313 | sum_acc = 0.0 314 | miss_vec = 0 315 | mts = 0 316 | hit_vec = 0 317 | if len(new_data_len) > 0: 318 | t_data_len = new_data_len 319 | else: 320 | t_data_len = valid_data_len 321 | 322 | for step in range(t_steps): 323 | data_x = [] 324 | m_data_y = np.zeros([batch_size, 2], dtype=np.int32) 325 | m_data_len = np.zeros([batch_size], dtype=np.int32) 326 | for i in range(batch_size): 327 | m_data_y[i] = valid_data_y[mts] 328 | m_data_len[i] = t_data_len[mts] 329 | seq = [] 330 | for j in range(t_data_len[mts]): 331 | sent = [] 332 | t_words = data[valid_data_ID[mts]]['text'][j] 333 | if len(t_words) == 0: 334 | print("ID:%s j:%3d empty sentence:"%(valid_data_ID[mts], j), t_words) 335 | continue 336 | 337 | for k in range(len(t_words)): 338 | m_word = t_words[k] 339 | try: 340 | sent.append( torch.tensor([word2vec[m_word]], dtype=torch.float32)) 341 | except KeyError: 342 | miss_vec += 1 343 | sent.append( torch.tensor([word2vec['{'] +word2vec['an'] + word2vec['unknown'] + word2vec['word'] + word2vec['}'] ], dtype=torch.float32) ) 344 | except IndexError: 345 | raise 346 | else: 347 | hit_vec += 1 348 | sent_tensor = torch.cat(sent) 349 | seq.append(sent_tensor) 350 | 351 | data_x.append(seq) 352 | mts += 1 353 | if mts >= len(valid_data_ID): # read data looply 354 | mts = mts % len(valid_data_ID) 355 | 356 | 357 | if rdm_model is not None and sent_pooler is not None and rdm_classifier is not None: 358 | with torch.no_grad(): 359 | seq = sent_pooler(data_x) 360 | rdm_hiddens = rdm_model(seq) 361 | batchsize, _, _ = rdm_hiddens.shape 362 | rdm_outs = torch.cat( 363 | [ rdm_hiddens[i][m_data_len[i]-1].unsqueeze(0) for i in range(batchsize)] 364 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 365 | ) 366 | rdm_scores = rdm_classifier( 367 | rdm_outs 368 | ) 369 | rdm_preds = rdm_scores.argmax(axis=1) 370 | y_label = torch.tensor(m_data_y).argmax(axis=1).cuda() if cuda else torch.tensor(m_data_y).argmax(axis=1) 371 | acc = accuracy_score(y_label.cpu().numpy(), rdm_preds.cpu().numpy()) 372 | torch.cuda.empty_cache() 373 | 374 | sum_acc += acc 375 | mean_acc = sum_acc / (1.0*t_steps) 376 | return mean_acc 377 | 378 | 379 | # In[47]: 380 | 381 | 382 | def get_reward_0(isStop, ss, pys, ids, seq_ids): 383 | global reward_counter 384 | reward = torch.zeros([len(isStop)], dtype=torch.float32) 385 | Q_Val = torch.zeros([len(isStop)], dtype= torch.float32) 386 | for i in range(len(isStop)): 387 | if isStop[i] == 1: 388 | try: 389 | if pys[ids[i]][seq_ids[i]-1].argmax() == np.argmax(data_y[ids[i]]): 390 | reward_counter += 1 # more number of correct prediction, more rewards 391 | r = 1 + FLAGS.reward_rate * math.log(reward_counter) 392 | reward[i] = r 393 | else: 394 | reward[i] = -100 395 | except: 396 | print("i:", i) 397 | print("ids_i:", ids[i]) 398 | print("seq_ids:", seq_ids[i]) 399 | print("pys:", pys[ids[i]]) 400 | raise 401 | Q_Val[i] = reward[i] 402 | else: 403 | reward[i] = -0.01 404 | Q_Val[i] = reward[i] + 0.99 * max(ss[i]) 405 | return reward, Q_Val 406 | 407 | 408 | # In[48]: 409 | 410 | 411 | def get_reward(isStop, ss, pys, ids, seq_ids): 412 | global reward_counter 413 | reward = torch.zeros([len(isStop)], dtype=torch.float32) 414 | Q_Val = torch.zeros([len(isStop)], dtype= torch.float32) 415 | for i in range(len(isStop)): 416 | if isStop[i] == 1: 417 | if pys[ids[i]][seq_ids[i]-1].argmax() == np.argmax(data_y[ids[i]]): 418 | reward_counter += 1 # more number of correct prediction, more rewards 419 | r = 1 + min(FLAGS.reward_rate * math.log(reward_counter), 10) 420 | reward[i] = r 421 | else: 422 | reward[i] = -100 423 | Q_Val[i] = reward[i] 424 | else: 425 | reward[i] = -0.01 426 | Q_Val[i] = reward[i] + 0.99 * max(ss[i]) 427 | return reward, Q_Val 428 | 429 | 430 | # In[49]: 431 | 432 | 433 | def get_reward_v1(isStop, mss, ssq, ids, seq_states, cm_model, rdm_hiddens_seq): 434 | global reward_counter 435 | reward = torch.zeros([len(isStop)], dtype=torch.float32) 436 | Q_Val = torch.zeros([len(isStop)], dtype= torch.float32) 437 | for i in range(len(isStop)): 438 | if isStop[i] == 1: 439 | if ssq[ids[i]][seq_states[i]-1].argmax() == np.argmax(data_y[ids[i]]): 440 | reward_counter += 1 # more number of correct prediction, more rewards 441 | r = 1 + min(FLAGS.reward_rate * math.log(reward_counter), 10) 442 | reward[i] = r 443 | if data_len[ids[i]] > seq_states[i]: 444 | with torch.no_grad(): 445 | subsequent_score = cm_model.Classifier( 446 | nn.functional.relu( 447 | cm_model.DenseLayer( 448 | rdm_hiddens_seq[ids[i]] 449 | ) 450 | ) 451 | ) 452 | torch.cuda.empty_cache() 453 | for j in range(seq_states[i], data_len[ids[i]]): 454 | if subsequent_score[j][0] > subsequent_score[j][1]: 455 | reward[i] += -20 456 | break 457 | else: 458 | reward[i] += 15.0/data_len[ids[i]] 459 | else: 460 | reward[i] = -100 461 | Q_Val[i] = reward[i] 462 | else: 463 | reward[i] = -0.01 464 | Q_Val[i] = reward[i] + 0.99 * max(mss[i]) 465 | return reward, Q_Val 466 | 467 | 468 | # In[51]: 469 | 470 | 471 | def get_new_len(sent_pooler, rdm_model, cm_model, FLAGS, cuda): 472 | batch_size = 20 473 | new_len = [] 474 | valid_new_len = [] 475 | if len(data_ID) % batch_size == 0: # the total number of events 476 | flags = int(len(data_ID) / FLAGS.batch_size) 477 | else: 478 | flags = int(len(data_ID) / FLAGS.batch_size) + 1 479 | for i in range(flags): 480 | with torch.no_grad(): 481 | x, x_len, y = get_df_batch(i, batch_size) 482 | seq = sent_pooler(x) 483 | rdm_hiddens = rdm_model(seq) 484 | batchsize, _, _ = rdm_hiddens.shape 485 | rdm_outs = torch.cat( 486 | [ rdm_hiddens[i][x_len[i]-1] for i in range(batchsize)] 487 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 488 | ).reshape( 489 | [-1, rdm_model.hidden_dim] 490 | ) 491 | stopScores = cm_model.Classifier( 492 | nn.functional.relu( 493 | cm_model.DenseLayer( 494 | rdm_hiddens.reshape([-1, rdm_model.hidden_dim]) 495 | ) 496 | ) 497 | ).reshape( 498 | [batchsize, -1, 2] 499 | ) 500 | isStop = stopScores.argmax(axis=-1).cpu().numpy() 501 | 502 | tmp_len = [iS.argmax()+1 if (iS.max() ==1 and (iS.argmax()+1)= len(data_ID): # read data looply 552 | mts = mts % len(data_ID) 553 | with torch.no_grad(): 554 | seq = sent_pooler(data_x) 555 | rdm_hiddens = rdm_model(seq) 556 | batchsize, _, _ = rdm_hiddens.shape 557 | rdm_outs = torch.cat( 558 | [ rdm_hiddens[i][m_data_len[i]-1] for i in range(batchsize)] 559 | # a list of tensor, where the ndim of tensor is 1 and the shape of tensor is [hidden_size] 560 | ).reshape( 561 | [-1, rdm_model.hidden_dim] 562 | ) 563 | stopScores = cm_model.Classifier( 564 | nn.functional.relu( 565 | cm_model.DenseLayer( 566 | rdm_hiddens.reshape([-1, rdm_model.hidden_dim]) 567 | ) 568 | ) 569 | ).reshape( 570 | [batchsize, -1, 2] 571 | ) 572 | isStop = stopScores.argmax(axis=-1).cpu().numpy() 573 | 574 | tmp_len = [iS.argmax()+1 if (iS.max() ==1 and (iS.argmax()+1)