├── IART ├── bin │ ├── __init__.py │ ├── __init__.pyc │ ├── test_and_evaluate.pyc │ ├── train_and_evaluate.pyc │ ├── test_and_evaluate.py │ └── train_and_evaluate.py ├── models │ ├── __init__.py │ ├── net.pyc │ ├── __init__.pyc │ ├── iadam_attention.pyc │ ├── net.py │ └── iadam_attention.py ├── utils │ ├── __init__.py │ ├── layers.pyc │ ├── reader.pyc │ ├── __init__.pyc │ ├── evaluation.pyc │ ├── operations.pyc │ ├── douban_evaluation.pyc │ ├── evaluation.py │ ├── douban_evaluation.py │ ├── reader.py │ ├── preparation.py │ ├── operations.py │ ├── layers.py │ └── preprocess.py ├── run.sh ├── conqa │ ├── gen_w2v_filtered_helper.py │ ├── test_word_embedding_pkl.py │ ├── gen_query_all_metrics_helper.py │ ├── gen_query_mz_score_file_from_dam_scores.py │ ├── gen_w2v_mikolov.py │ ├── data_preprocess_dam.py │ ├── gen_user_intent_vector.py │ ├── gen_w2v_filtered.py │ ├── gen_query_all_metrics.py │ └── transfer_mz_to_dam_format.py ├── main_udc.py ├── main_ms_v2.py └── main_conversation_qa.py ├── output └── README.md ├── figures └── iart-model.png ├── data └── README.md └── README.md /IART/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /IART/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /IART/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /output/README.md: -------------------------------------------------------------------------------- 1 | Please store model checkpoints and model output here. -------------------------------------------------------------------------------- /IART/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | CUDA_VISIBLE_DEVICES=1 python main_ms_v2.py 3 | 4 | 5 | -------------------------------------------------------------------------------- /IART/models/net.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/models/net.pyc -------------------------------------------------------------------------------- /IART/bin/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/bin/__init__.pyc -------------------------------------------------------------------------------- /IART/utils/layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/layers.pyc -------------------------------------------------------------------------------- /IART/utils/reader.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/reader.pyc -------------------------------------------------------------------------------- /figures/iart-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/figures/iart-model.png -------------------------------------------------------------------------------- /IART/models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/models/__init__.pyc -------------------------------------------------------------------------------- /IART/utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/__init__.pyc -------------------------------------------------------------------------------- /IART/utils/evaluation.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/evaluation.pyc -------------------------------------------------------------------------------- /IART/utils/operations.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/operations.pyc -------------------------------------------------------------------------------- /IART/bin/test_and_evaluate.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/bin/test_and_evaluate.pyc -------------------------------------------------------------------------------- /IART/bin/train_and_evaluate.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/bin/train_and_evaluate.pyc -------------------------------------------------------------------------------- /IART/models/iadam_attention.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/models/iadam_attention.pyc -------------------------------------------------------------------------------- /IART/utils/douban_evaluation.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/douban_evaluation.pyc -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Please download the data from this [Google Drive Folder](https://drive.google.com/drive/folders/1ayXN6pgzxs7DP9iCO6JR-KXbQHPUOLXx?usp=sharing). -------------------------------------------------------------------------------- /IART/conqa/gen_w2v_filtered_helper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | for data in list(['ms_v2']): 5 | for wd in list([200, 300]): 6 | cur_folder = '../../data/' + data +'/' 7 | cmd = 'python gen_w2v_filtered.py ' \ 8 | + cur_folder + 'train_word2vec_mikolov_' + str(wd) +'d.txt ' \ 9 | + cur_folder + 'word_dict.txt ' \ 10 | + cur_folder + 'cut_embed_mikolov_' + str(wd) + 'd.pkl' 11 | print cmd 12 | os.system(cmd) -------------------------------------------------------------------------------- /IART/conqa/test_word_embedding_pkl.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | if len(sys.argv) < 2: 7 | print 'please input params: word_embedding pkl file' 8 | exit(1) 9 | word_embed_file = sys.argv[1] 10 | word_embedding_init = np.array(pickle.load(open(word_embed_file, 'rb'))) 11 | print('shape of word_embedding_init: ', word_embedding_init.shape) 12 | print('init embed vectors of the first 10 words are: ', word_embedding_init[0:10,:]) 13 | 14 | -------------------------------------------------------------------------------- /IART/conqa/gen_query_all_metrics_helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A scripts to help run gen_query_mz_score_file_from_dam_scores.py and 3 | gen_query_all_metrics.py 4 | ''' 5 | 6 | 7 | import os 8 | 9 | dam_score_dict = { 10 | 'ms_v2-iart-att-dot': '../../output/ms_v2/iadam-attention-iadam-att-dot-opd-test-dot-run40/score.test', 11 | 'ms_v2-iart-att-outprod': '../../output/ms_v2/iadam-attention-iadam-att-dot-opd-test-outprod-run40/score.test', 12 | 'udc-iart-att-dot': '../../output/udc/iadam-attention-iadam-att-intentv2-test-dot-intentv2-run44/score.test', 13 | 'udc-iart-att-outprod': '../../output/udc/iadam-attention-iadam-att-intentv2-test-outprod-intentv2-run44/score.test' 14 | } 15 | 16 | for data in ['ms_v2', 'udc']: 17 | for model in ['iart-att-dot', 'iart-att-outprod']: 18 | dam_prediction_score_file = dam_score_dict[data + '-' + model] 19 | cmd = 'python gen_query_mz_score_file_from_dam_scores.py ' + \ 20 | dam_prediction_score_file + ' ' + data 21 | print 'run ', cmd 22 | os.system(cmd) 23 | mz_prediction_file = dam_prediction_score_file + '.mz_score' 24 | cmd = 'python gen_query_all_metrics.py ' + mz_prediction_file 25 | print 'run ', cmd 26 | os.system(cmd) -------------------------------------------------------------------------------- /IART/conqa/gen_query_mz_score_file_from_dam_scores.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Transfer the score files of DAM related models to the format 3 | of MatchZoo score files 4 | The required score file format is as follows: 5 | # Q597901 Q0 D118777 0 2.370282 DMN_CNN 0(ground truth) 6 | # seperated by \t 7 | # qid \t Q0 \t did \t rank \t score \t method \t ground_truth_label 8 | # 030 Q0 ZF08-175-870 0 4238 prise1 9 | # qid iter docno rank sim run_id 10 | # In particular, note that the rank field is ignored here; 11 | # internally ranks are assigned by sorting by the sim field with ties 12 | # broken deterministicly (using docno). 13 | 14 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 15 | ''' 16 | 17 | import sys 18 | 19 | if __name__ == '__main__': 20 | if len(sys.argv) < 3: 21 | print 'please input params: dam_prediction_score_file (absolute path) data_name (udc or ms_v2)' 22 | exit(1) 23 | dam_score_file = sys.argv[1] # path of dam score file 24 | # format of dam score file: score label 25 | # the order is the same with the input data 26 | data_name = sys.argv[2] 27 | relation_file = '../../data/' + data_name + '/relation_test.txt' # use non_fd version to be consistent 28 | with open(dam_score_file) as fin_score, open(relation_file) as fin_rel, \ 29 | open(dam_score_file + '.mz_score', 'w') as fout_score: 30 | for dam_score_line in fin_score: 31 | dam_score_line = dam_score_line.strip().split() 32 | rel_line = fin_rel.readline().strip().split() 33 | label = rel_line[0] 34 | qid = rel_line[1] 35 | did = rel_line[2] 36 | score = dam_score_line[0] 37 | fout_score.write(qid + '\tQ0\t' + did + '\t0\t' + score + '\t' + 38 | dam_score_file + '\t' + label + '\n') 39 | 40 | -------------------------------------------------------------------------------- /IART/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import sys; 2 | from douban_evaluation import mean_average_precision 3 | 4 | def get_p_at_n_in_m(data, n, m, ind): 5 | pos_score = data[ind][0]; 6 | curr = data[ind:ind+m]; 7 | curr = sorted(curr, key = lambda x:x[0], reverse=True) 8 | 9 | if curr[n-1][0] <= pos_score: 10 | return 1; 11 | return 0; 12 | 13 | def get_map(data, m, ind): 14 | curr = data[ind:ind + m]; 15 | sort_data = sorted(curr, key=lambda x: x[0], reverse=True) 16 | m_a_p = mean_average_precision(sort_data) 17 | return m_a_p 18 | 19 | def evaluate(file_path): 20 | data = [] 21 | with open(file_path, 'r') as file: 22 | for line in file: 23 | line = line.strip(); 24 | tokens = line.split("\t") 25 | 26 | if len(tokens) != 2: 27 | continue 28 | 29 | data.append((float(tokens[0]), int(tokens[1]))); 30 | 31 | #assert len(data) % 10 == 0 32 | 33 | p_at_1_in_2 = 0.0 34 | p_at_1_in_10 = 0.0 35 | p_at_2_in_10 = 0.0 36 | p_at_5_in_10 = 0.0 37 | map_sum = 0.0 38 | 39 | length = len(data)/10 # number of queries 40 | print('num of queries: ', length) 41 | 42 | for i in xrange(0, length): 43 | ind = i * 10 # use ind to index the first doc of each query 44 | assert data[ind][1] == 1 45 | 46 | p_at_1_in_2 += get_p_at_n_in_m(data, 1, 2, ind) 47 | p_at_1_in_10 += get_p_at_n_in_m(data, 1, 10, ind) 48 | p_at_2_in_10 += get_p_at_n_in_m(data, 2, 10, ind) 49 | p_at_5_in_10 += get_p_at_n_in_m(data, 5, 10, ind) 50 | map_sum += get_map(data, 10, ind) 51 | # add MAP here for IADAM evaluation 52 | 53 | 54 | 55 | return (p_at_1_in_2/length, p_at_1_in_10/length, p_at_2_in_10/length, 56 | p_at_5_in_10/length, map_sum/length) 57 | 58 | 59 | if __name__ == '__main__': 60 | if 
len(sys.argv) < 2: 61 | print("plean input parameters: score_file") 62 | sys.exit(1) 63 | result = evaluate(sys.argv[1]) 64 | for r in result: 65 | print(r) 66 | # m_line = "\t".join([str(m) for m in result]) 67 | # print('[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t', m_line) 68 | print('[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format( 69 | result[0], result[1], result[2], result[3], result[4])) -------------------------------------------------------------------------------- /IART/main_udc.py: -------------------------------------------------------------------------------- 1 | import models.net as net 2 | import models.iadam_attention as iadam_attention 3 | 4 | import bin.train_and_evaluate as train 5 | 6 | # configure 7 | 8 | # data_small.pkl is the small data for debugging purpose (10K training instances for UDC) 9 | # data.pkl is the whole data (1M training instances for UDC) 10 | 11 | conf = { 12 | "data_name": "udc", 13 | "data_path": "../data/udc/data_small.pkl", # data_small.pkl or data.pkl 14 | "intent_vec_path": "../data/udc/intent_vectors.txt", # path of intent vectors 15 | "intent_size": 12, # dimensions of different intent 16 | "intent_attention_type": "bilinear", # 'dot', 'bilinear', 'outprod' 17 | "intent_ffn_od0": 64, # in iadam-concat ffn 144->64->16 match 576 18 | "intent_ffn_od1": 16, # in iadam-concat ffn 144->64->16 match 576 19 | "intent_loss_weight": 0.2, # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 20 | "model_name": "iadam-attention", # dam, iadam-concat, iadam-attention, iadam-mtl 21 | "save_path": "../output/udc/temp/", 22 | "word_emb_init": None, #"../data/udc/cut_embed_mikolov_200d.pkl", # word_embedding.pkl 23 | "init_model": None, #should be set for test 24 | "rand_seed": None, 25 | "drop_dense": None, 26 | "drop_attention": None, 27 | 28 | "is_mask": True, 29 | "is_layer_norm": True, 30 | "is_positional": False, 31 | 32 | "stack_num": 5, 33 | "attention_type": "dot", 34 | 35 | "learning_rate": 1e-3, 36 | "vocab_size": 429498, 37 | "emb_size": 200, 38 | "batch_size": 128, # for udc/iadam_mtl model, batch_size = 64; others = 128 39 | 40 | "max_turn_num": 9, 41 | "max_turn_len": 50, 42 | 43 | "max_to_keep": 1, 44 | "num_scan_data": 2, # about 16 hours for 2 epoches on udc 45 | "_EOS_": 429498, # 28270, #1 for douban data 46 | "final_n_class": 1, 47 | 48 | "cnn_3d_oc0": 32, 49 | "cnn_3d_oc1": 16 50 | } 51 | 52 | if conf['model_name'] == 'dam': 53 | model = net.Net(conf) # DAM 54 | elif conf['model_name'] == 'iadam-attention': 55 | model = iadam_attention.Net(conf) # IADAM-Attention-V4-2/ IART 56 | else: 57 | raise NameError('model not supported.') 58 | 59 | train.train(conf, model) 60 | 61 | # test and evaluation, init_model in conf should be set 62 | # test.test(conf, model) 63 | -------------------------------------------------------------------------------- /IART/conqa/gen_w2v_mikolov.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Train word embeddings with word2vec tool by mikolov with training data of ms/udc 3 | The script for generate the pretrained word embedding file for DAM model 4 | We directly used the released dataset from sequential-matching-network (Wu et 5 | al., 2017), thus there is no preprocessing phase in our experiments. 6 | 7 | We pre-train word-embeddings using a word2vec toolkit in c++, I can hardly 8 | find the script we used to pre-train word-embeddings. 
Maybe the following 9 | command can help you: 10 | 11 | ./bin/word2vec -train $train_dat -output "$train_dat.w2v" -debug 2 -size 200 \ 12 | -window 10 -sample 1e-4 -negative 25 -hs 0 -binary 1 -cbow 1 -min-count 1 13 | 14 | # default setting cut_embed_mikolov_200d_no_readvocab.txt 15 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 16 | ''' 17 | 18 | import os 19 | import sys 20 | from tqdm import tqdm 21 | 22 | if __name__ == '__main__': 23 | word2vec_path = '/net/home/lyang/PycharmProjects/NLPIRNNMatchZooQA/src-match-zoo-lyang-dev/data/udc/ModelInput/word2vec_mikolov/word2vec/bin/' 24 | 25 | if len(sys.argv) < 3: 26 | print 'please input params: data_name (udc or ms_v2), model_input_folder (folder for corpus.txt)' 27 | exit(1) 28 | data_name = sys.argv[1] # udc or ms_v2 29 | model_input_folder = sys.argv[2] # model_input_folder 30 | corpus_file = model_input_folder + 'corpus.txt' 31 | corpus_text_file = model_input_folder + 'corpus_text.txt' 32 | vocab_file = model_input_folder + 'word_dict.txt' 33 | 34 | print 'generate corpus_file: ', corpus_text_file 35 | # generate corpus_text.txt for training the word vectors by word2vec 36 | with open(corpus_file) as f_in, open(corpus_text_file, 'w') as f_out: 37 | for l in tqdm(f_in): 38 | #print 'l: ', l 39 | f_out.write(' '.join(l.split()[1:])) 40 | 41 | for wd in list([200,300]): 42 | word_embed_file = 'train_word2vec_mikolov_' + str(wd) + 'd.txt' 43 | cmd = word2vec_path + 'word2vec -train ' + corpus_text_file + \ 44 | ' -output ' + model_input_folder + word_embed_file + ' -debug 2 ' \ 45 | '-size ' + str(wd) + ' -window 10 -sample 1e-4 -negative 25 -hs 0 -binary 0 -cbow 1 -min-count 1 ' \ 46 | '-threads 5' 47 | print 'run cmd: ', cmd 48 | os.system(cmd) 49 | 50 | -------------------------------------------------------------------------------- /IART/main_ms_v2.py: -------------------------------------------------------------------------------- 1 | import models.net as net 2 | import models.iadam_attention as iadam_attention 3 | 4 | import bin.train_and_evaluate as train 5 | import bin.test_and_evaluate as test 6 | 7 | # configure 8 | 9 | # data_small.pkl is the small data for debugging purpose (10K training instances) 10 | # data.pkl is the whole data 11 | 12 | conf = { 13 | "data_name": "ms_v2", 14 | "data_path": "../data/ms_v2/data_small.pkl", # data_small.pkl or data.pkl 15 | "intent_vec_path": "../data/ms_v2/intent_vectors.txt", # path of intent vectors 16 | "intent_size": 12, # dimensions of different intent 17 | "intent_attention_type":"bilinear", # in iadam-attention: 'dot', 'bilinear', 'outprod' 18 | "intent_ffn_od0": 128, # in iadam-concat ffn 144->128->64 match 6400 19 | "intent_ffn_od1": 64, # in iadam-concat ffn 144->128->64 match 6400 20 | "intent_loss_weight": 0.2, # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 21 | "model_name": "iadam-attention", # dam, iadam-concat, iadam-attention, iadam-mtl 22 | "save_path": "../output/ms_v2/temp/", 23 | "word_emb_init": None, # "../data/ms_v2/cut_embed_mikolov_200d.pkl", # None (set None during debugging) 24 | "init_model": None, # "../output/ms_v2/iadam-attention-max_turn_len-200-run33/model.ckpt.20", # "../output/ms_v2/dam_default_setting_0412_run29/model.ckpt.36", # Set None for training; Set best ckpt for test 25 | "rand_seed": None, 26 | "drop_dense": None, 27 | "drop_attention": None, 28 | 29 | "is_mask": True, 30 | "is_layer_norm": True, 31 | "is_positional": False, 32 | 33 | "stack_num": 4, 34 | "attention_type": "dot", 35 | 36 | "learning_rate": 
1e-3, 37 | "vocab_size": 167983, 38 | "emb_size": 200, 39 | "batch_size": 32, # for ms_v2/iadam_mtl model, batch_size = 20; others = 32 40 | 41 | "max_turn_num": 6, # 6 is better for ms_v2 42 | "max_turn_len": 200, # default is 180 43 | 44 | "max_to_keep": 1, 45 | "num_scan_data": 5, # about 18 hours for 5 epoches on ms_v2 46 | "_EOS_": 167983, #1 for douban data 47 | "final_n_class": 1, 48 | 49 | "cnn_3d_oc0": 16, 50 | "cnn_3d_oc1": 16 51 | } 52 | 53 | if conf['model_name'] == 'dam': 54 | model = net.Net(conf) # DAM 55 | elif conf['model_name'] == 'iadam-attention': 56 | model = iadam_attention.Net(conf) # IADAM is IART in paper 57 | else: 58 | raise NameError('model not supported.') 59 | 60 | train.train(conf, model) 61 | 62 | # test and evaluation, init_model in conf should be set 63 | # test.test(conf, model) 64 | 65 | -------------------------------------------------------------------------------- /IART/utils/douban_evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from sklearn.metrics import average_precision_score 4 | 5 | def mean_average_precision(sort_data): 6 | #to do 7 | count_1 = 0 8 | sum_precision = 0 9 | for index in range(len(sort_data)): 10 | if sort_data[index][1] == 1: 11 | count_1 += 1 12 | sum_precision += 1.0 * count_1 / (index+1) 13 | return sum_precision / count_1 14 | 15 | def mean_reciprocal_rank(sort_data): 16 | sort_lable = [s_d[1] for s_d in sort_data] 17 | assert 1 in sort_lable 18 | return 1.0 / (1 + sort_lable.index(1)) 19 | 20 | def precision_at_position_1(sort_data): 21 | if sort_data[0][1] == 1: 22 | return 1 23 | else: 24 | return 0 25 | 26 | def recall_at_position_k_in_10(sort_data, k): 27 | sort_lable = [s_d[1] for s_d in sort_data] 28 | select_lable = sort_lable[:k] 29 | return 1.0 * select_lable.count(1) / sort_lable.count(1) 30 | 31 | def evaluation_one_session(data): 32 | sort_data = sorted(data, key=lambda x: x[0], reverse=True) 33 | m_a_p = mean_average_precision(sort_data) 34 | m_r_r = mean_reciprocal_rank(sort_data) 35 | p_1 = precision_at_position_1(sort_data) 36 | r_1 = recall_at_position_k_in_10(sort_data, 1) 37 | r_2 = recall_at_position_k_in_10(sort_data, 2) 38 | r_5 = recall_at_position_k_in_10(sort_data, 5) 39 | return m_a_p, m_r_r, p_1, r_1, r_2, r_5 40 | 41 | def evaluate(file_path): 42 | sum_m_a_p = 0 43 | sum_m_r_r = 0 44 | sum_p_1 = 0 45 | sum_r_1 = 0 46 | sum_r_2 = 0 47 | sum_r_5 = 0 48 | 49 | i = 0 50 | total_num = 0 51 | with open(file_path, 'r') as infile: 52 | for line in infile: 53 | if i % 10 == 0: 54 | data = [] 55 | 56 | tokens = line.strip().split('\t') 57 | data.append((float(tokens[0]), int(tokens[1]))) 58 | 59 | if i % 10 == 9: 60 | total_num += 1 61 | m_a_p, m_r_r, p_1, r_1, r_2, r_5 = evaluation_one_session(data) 62 | sum_m_a_p += m_a_p 63 | sum_m_r_r += m_r_r 64 | sum_p_1 += p_1 65 | sum_r_1 += r_1 66 | sum_r_2 += r_2 67 | sum_r_5 += r_5 68 | 69 | i += 1 70 | 71 | print('total num: %s' %total_num) 72 | print('MAP: %s' %(1.0*sum_m_a_p/total_num)) 73 | print('MRR: %s' %(1.0*sum_m_r_r/total_num)) 74 | print('P@1: %s' %(1.0*sum_p_1/total_num)) 75 | return (1.0*sum_m_a_p/total_num, 1.0*sum_m_r_r/total_num, 1.0*sum_p_1/total_num, 76 | 1.0*sum_r_1/total_num, 1.0*sum_r_2/total_num, 1.0*sum_r_5/total_num) 77 | 78 | if __name__ == '__main__': 79 | result = evaluate(sys.argv[1]) 80 | for r in result: 81 | print(r) 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- 
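Both evaluation utilities above (`utils/evaluation.py` and `utils/douban_evaluation.py`) consume a plain-text score file with one `score<TAB>label` pair per line, where every block of 10 consecutive lines holds the candidates of one conversation context. A minimal usage sketch follows; the toy file name and the `utils` import path are assumptions for illustration, not part of the repository:

```python
# Minimal sketch (not part of the repo): build a toy score file and evaluate it.
# Each block of 10 lines = one context; evaluation.py additionally assumes the
# first line of every block is the positive candidate, so we write it that way.
import random
from utils import evaluation  # assumes running from the IART/ directory (Python 2)

with open('toy_score.test', 'w') as fout:
    for _ in range(5):                      # 5 toy contexts
        for rank in range(10):              # 10 response candidates per context
            label = 1 if rank == 0 else 0   # exactly one positive per block
            fout.write('%.6f\t%d\n' % (random.random(), label))

r2_1, r10_1, r10_2, r10_5, map_ = evaluation.evaluate('toy_score.test')
print('R2@1=%.4f R10@1=%.4f R10@2=%.4f R10@5=%.4f MAP=%.4f'
      % (r2_1, r10_1, r10_2, r10_5, map_))
```

The same file layout is what `bin/test_and_evaluate.py` writes to `score.test`, so the snippet mirrors how the repo scores its own predictions.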
/IART/conqa/data_preprocess_dam.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data preprocess of MS_V2 and UDC data for running DAM model 3 | A good preprocess is very important for good performance 4 | Preprocess UDC for debugging purpose 5 | Preprocess MS_V2 to get results of DAM on MS_V2 6 | The input data format is label \t context (utterances seperated by \t) \t response 7 | 8 | Firstly run data_preprocess_dam.py, then run transfer_mz_to_dam_format.py 9 | 10 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 11 | @homepage: https://sites.google.com/site/lyangwww/ 12 | ''' 13 | 14 | # /bin/python2.7 15 | import os 16 | import sys 17 | 18 | sys.path.append('../utils/') 19 | 20 | from preparation import Preparation 21 | from preprocess import Preprocess, NgramUtil 22 | 23 | def read_dict(infile): 24 | word_dict = {} 25 | for line in open(infile): 26 | r = line.strip().split() 27 | word_dict[r[1]] = r[0] 28 | return word_dict 29 | 30 | if __name__ == '__main__': 31 | if len(sys.argv) < 2: 32 | print 'please input params: data_name (udc or ms_v2)' 33 | exit(1) 34 | data_name = sys.argv[1] # udc or ms_v2 35 | 36 | basedir = '../../data/' + data_name + '/' 37 | 38 | # transform context/response pairs into input pkl file of DAM model 39 | # the input files are train.txt/valid.txt/test.txt 40 | # the format of each line is 'label context response' 41 | prepare = Preparation() 42 | 43 | if data_name == 'udc' or data_name == 'ms_v2': 44 | train_file = 'train.txt' 45 | valid_file = 'valid.txt' 46 | test_file = 'test.txt' 47 | else: 48 | raise ValueError('invalid data name!') 49 | 50 | corpus, rels_train, rels_valid, rels_test = prepare.run_with_train_valid_test_corpus_dmn( 51 | basedir + train_file, basedir + valid_file, 52 | basedir + test_file) 53 | for data_part in list(['train', 'valid', 'test']): 54 | if data_part == 'train': 55 | rels = rels_train 56 | elif data_part == 'valid': 57 | rels = rels_valid 58 | else: 59 | rels = rels_test 60 | print 'total relations in ', data_part, len(rels) 61 | prepare.save_relation(basedir + 'relation_' + data_part + '.txt', 62 | rels) 63 | print 'filter queries with duplicated doc ids...' 64 | prepare.check_filter_query_with_dup_doc( 65 | basedir + 'relation_' + data_part + '.txt') 66 | print 'total corpus ', len(corpus) 67 | prepare.save_corpus_dmn(basedir + 'corpus.txt', corpus, '\t') 68 | print 'preparation finished ...' 69 | 70 | print 'begin preprocess...' 
71 | # Prerpocess corpus file 72 | # Trying not filtering terms by frequency 73 | preprocessor = Preprocess() 74 | dids, docs = preprocessor.run_2d_smn( 75 | basedir+ 'corpus.txt') # docs is [corpus_size, utterance_num, max_text1_len] 76 | preprocessor.save_word_dict(basedir + 'word_dict.txt') 77 | # preprocessor.save_words_df(basedir + 'word_df.txt') 78 | 79 | fout = open(basedir+ 'corpus_preprocessed.txt', 'w') 80 | for inum, did in enumerate(dids): 81 | doc_txt = docs[inum] # 2d list 82 | doc_string = '' 83 | for utt in doc_txt: 84 | for w in utt: 85 | doc_string += str(w) + ' ' 86 | doc_string += '\t' 87 | fout.write('%s\t%s\t%s\n' % ( 88 | did, len(docs[inum]), doc_string)) # id text_len text_ids 89 | fout.close() 90 | print('preprocess finished ...') -------------------------------------------------------------------------------- /IART/conqa/gen_user_intent_vector.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extract the predicted user intent vectors predicted by Chen Qu 3 | Build a dict for user intent vectors 4 | The key is qid_uid or rid (e.g. Q1-0 Q1-1,..., D0, D1, D2,...) 5 | since the context is 2D texts; the response is 1D text 6 | The value is a 12-dimensional intent vector for this context utterance or 7 | response candidate 8 | 9 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 10 | @created on 02/08/2019 11 | ''' 12 | 13 | import sys 14 | import json 15 | import numpy as np 16 | from tqdm import tqdm 17 | import base64 18 | 19 | 20 | def read_dict(infile): 21 | word_dict = {} 22 | for line in open(infile): 23 | r = line.strip().split() 24 | word_dict[r[1]] = r[0] 25 | return word_dict 26 | 27 | 28 | if __name__ == '__main__': 29 | # Extract the 12-dimensional user intent vectors predicted by Chen Qu's 30 | # classifier 31 | if len(sys.argv) < 2: 32 | print ('please input params: data_name (ms_v2 or udc)') 33 | exit(1) 34 | data_name = sys.argv[1] # udc or ms_v2 35 | 36 | basedir = '../../data/' + data_name + '/ModelInput/' 37 | cur_data_dir = basedir + 'dmn_model_input/' 38 | data_name_qc = data_name if data_name != 'ms_v2' else 'ms' 39 | # intent_file_folder = '/mnt/scratch/chenqu/response_intent/output/' 40 | # /mnt/scratch/chenqu/response_intent/udc_v2/output/ 41 | intent_file_folder = '/mnt/scratch/chenqu/response_intent/udc_v2/output/' 42 | 43 | # Extract user intent vectors for each (context_qid, utt_index, candidate_response_id) triple 44 | # in train/valid/test data 45 | intent_dict = {} 46 | for data_part in list(['test', 'train', 'valid']): 47 | # ! Note that here we need to read non-fd version of relation files to be consistant 48 | # ! fd version just filtered queries with duplicated doc ids 49 | # ! the filtering process won't change the qids/ dids 50 | # ! 
thus the keys of qid/did can be used in both filtered version and 51 | # non-filtered version 52 | relation_file = cur_data_dir + 'relation_' + data_part + '.txt' 53 | intent_file = intent_file_folder + data_name_qc + '_' + data_part + '.txt' 54 | with open(relation_file) as fin_relation, open( 55 | intent_file) as fin_intent: 56 | print('preprcess file: ', intent_file) 57 | for rel_line in tqdm(fin_relation): 58 | intent_line = fin_intent.readline() 59 | intent_tokens = intent_line.split('\t') 60 | rel_tokens = rel_line.split() 61 | qid, rid = rel_tokens[1], rel_tokens[2] 62 | # collect intent vectors for context utterances 63 | for i in range(1, len(intent_tokens) - 1): 64 | intent_dict[qid + '-' + str(i - 1)] = intent_tokens[ 65 | i].strip() 66 | # collect intent vectors for response candidates 67 | intent_dict[rid] = intent_tokens[ 68 | len(intent_tokens) - 1].strip() 69 | print('test len of intent_dict: ', len(intent_dict)) 70 | # output to file 71 | intent_file = cur_data_dir + 'intent_vectors_v2.txt' 72 | with open(intent_file, 'w') as fout: 73 | for id in intent_dict: 74 | fout.write(id + '\t' + intent_dict[id] + '\n') 75 | print('intent_dict[0:10]: ', dict(intent_dict.items()[0:10])) 76 | print('done!') 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /IART/conqa/gen_w2v_filtered.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Filter word embddings from pre-trained word embeddings from Mikolov tool 3 | word_embedding_init: a 2-d array with shape [vocab_size+1, emb_size] 4 | there is one dimension in vocab_size which is corresponding to _eos_. 5 | in our preprocessing, _eos_ is always the last dimension 6 | +1 to add one more embedding vector for padding and masking 7 | We add an "all 0" vector in the 0-th row of word_embedding_init in order 8 | to denote the padding word 9 | when call tf.nn.embedding_lookup(), if word_id = 0, then this is a paded 10 | word; if word_id > 0 (from 1 to vocab_size), then this is a real word 11 | 12 | Can double check the relationships between the row index and word ids 13 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 14 | @homepage: https://sites.google.com/site/lyangwww/ 15 | ''' 16 | 17 | import sys 18 | import numpy as np 19 | import pickle 20 | from tqdm import tqdm 21 | 22 | 23 | 24 | w2v_file = open(sys.argv[1]) # pre-trained word embedding file by the mikolov tool 25 | word_dict_file = open(sys.argv[2]) # word_dict file for the vocabulary information 26 | output_file = open(sys.argv[3], 'wb') # the cutted word embedding file with the shape [vocab_size+1, emb_size] 27 | 28 | # In the generated word_embedding.pkl file 29 | # the 0-th row if a "all 0" vector which is corresponding to the padded word 30 | # the 1-vocab_size -th row are word vectors for real words in vocabs 31 | 32 | # word_count, embed_dim = w2v_file.readline().strip().split() 33 | 34 | word_map_w2v = {} 35 | word2id = {} # map word to id 36 | id2word = {} # map id to word 37 | 38 | print 'load word dict ...' 
39 | for line in tqdm(word_dict_file): 40 | line = line.split() 41 | try: 42 | word2id[line[0]] = int(line[1]) 43 | id2word[int(line[1])] = line[0] 44 | except: 45 | print line 46 | continue 47 | 48 | # add one word for _eos_ and _pad_ 49 | # pad_id = 0 50 | eos_id = len(word2id) + 1 51 | 52 | # word2id['_pad_'] = pad_id 53 | word2id['_eos_'] = eos_id 54 | # id2word[pad_id] = '_pad_' 55 | id2word[eos_id] = '_eos_' 56 | vocab_size = len(word2id) 57 | 58 | print 'vocab_size = ', vocab_size 59 | 60 | print 'load word vectors ...' 61 | for line in tqdm(w2v_file): 62 | line = line.split() 63 | if len(line) == 0 or len(line) == 2: # len(len) == 2 for the first line (V WD) 64 | continue 65 | if line[0] in word2id: 66 | word_map_w2v[line[0]] = line[1:] 67 | 68 | emb_size = len(word_map_w2v[word_map_w2v.keys()[0]]) 69 | 70 | print 'emb_size = ', emb_size 71 | 72 | word_diff = list() 73 | for w in word2id.keys(): 74 | if w not in word_map_w2v: 75 | word_diff.append(w) 76 | 77 | # output shared w2v dict 78 | word_embedding_init = np.array(np.zeros((vocab_size+1, emb_size))) # a 2-d array with shape [vocab_size+1, emb_size] 79 | 80 | print 'number of shared word vectors: ', vocab_size-len(word_diff) 81 | 82 | # the first row is an all 0 vector for padding word 83 | 84 | # then add init embedding vectors for real words 85 | 86 | for id in tqdm(range(1, vocab_size+1)): 87 | word = id2word[id] 88 | if word in word_map_w2v: 89 | word_embedding_init[id,:] = [float(s) for s in word_map_w2v[word]] 90 | else: 91 | alpha = 0.5 * (2.0 * np.random.random() - 1.0) 92 | rand_embed = (2.0 * np.random.random_sample([emb_size]) - 1.0) * alpha 93 | rand_embed = ['%.6f' % k for k in rand_embed.tolist()] 94 | word_embedding_init[id, :] = rand_embed 95 | 96 | pickle.dump(word_embedding_init.tolist(), output_file) 97 | 98 | print 'Map word vectors finished ...' 
99 | -------------------------------------------------------------------------------- /IART/bin/test_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import cPickle as pickle 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | import utils.reader as reader 10 | import utils.evaluation as eva 11 | 12 | 13 | def test(conf, _model): 14 | 15 | if not os.path.exists(conf['save_path']): 16 | os.makedirs(conf['save_path']) 17 | 18 | # load data 19 | print('starting loading data') 20 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 21 | train_data, val_data, test_data = pickle.load(open(conf["data_path"], 'rb')) 22 | print('finish loading data') 23 | print('init intent_dict...') 24 | conf['intent_dict'] = reader.read_intent(conf['intent_vec_path']) if conf[ 25 | 'model_name'] != 'dam' else None 26 | test_batches = reader.build_batches(test_data, conf) 27 | print("finish building test batches") 28 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 29 | 30 | # refine conf 31 | test_batch_num = len(test_batches["response"]) 32 | 33 | print('configurations:') 34 | conf_copy = {} 35 | for k in conf: 36 | if k != 'intent_dict': 37 | conf_copy[k] = conf[k] 38 | print(conf_copy) 39 | 40 | _graph = _model.build_graph() 41 | print('build graph sucess') 42 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 43 | 44 | with tf.Session(graph=_graph) as sess: 45 | #_model.init.run(); 46 | _model.saver.restore(sess, conf["init_model"]) 47 | print("sucess init %s" %conf["init_model"]) 48 | 49 | batch_index = 0 50 | step = 0 51 | 52 | score_file_path = conf['save_path'] + 'score.test' 53 | score_file = open(score_file_path, 'w') 54 | attention_file = open(conf['save_path'] + 'attention.test', 'w') 55 | 56 | print('starting test') 57 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 58 | for batch_index in xrange(test_batch_num): 59 | 60 | feed = { 61 | _model.turns: test_batches["turns"][batch_index], 62 | _model.tt_turns_len: test_batches["tt_turns_len"][batch_index], 63 | _model.every_turn_len: test_batches["every_turn_len"][batch_index], 64 | _model.response: test_batches["response"][batch_index], 65 | _model.response_len: test_batches["response_len"][batch_index], 66 | _model.label: test_batches["label"][batch_index] 67 | } 68 | if conf['model_name'] != 'dam': 69 | feed[_model.turns_intent] = \ 70 | test_batches["turns_intent"][batch_index] 71 | feed[_model.response_intent] = \ 72 | test_batches["response_intent"][batch_index] 73 | 74 | scores, attention = sess.run([_model.logits, _model.attention], feed_dict = feed) 75 | # shape of attention [batch, max_turn_num] 76 | # shape of scores [batch] 77 | # also run and print out attention weights to do visualization 78 | #print('print out attention weights over context utterances:', attention) 79 | 80 | # print predicted scores and labels into score file 81 | # print intent aware-attention weights into attention file 82 | for i in xrange(conf["batch_size"]): 83 | score_file.write( 84 | str(scores[i]) + '\t' + 85 | str(test_batches["label"][batch_index][i]) + '\n') 86 | #str(sum(test_batches["every_turn_len"][batch_index][i]) / test_batches['tt_turns_len'][batch_index][i]) + '\t' + 87 | #str(test_batches['tt_turns_len'][batch_index][i]) + '\n') 88 | attention_file.write('\t'.join([str(a) for a in attention[i]]) 89 | + '\n') 90 | score_file.close() 91 | attention_file.close() 
92 | print('finish test') 93 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 94 | 95 | #write evaluation result 96 | result = eva.evaluate(score_file_path) 97 | result_file_path = conf["save_path"] + "result.test" 98 | with open(result_file_path, 'w') as out_file: 99 | for p_at in result: 100 | out_file.write(str(p_at) + '\n') 101 | print('finish evaluation') 102 | # lyang: also print metrics in log file 103 | print('testing_metrics for_model_ckpt:\t{:s}\t[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format( 104 | conf["init_model"], result[0], result[1], result[2], result[3], result[4])) 105 | print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /IART/conqa/gen_query_all_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Generate per-query metrics with trec_eval 3 | to do significance test and case study later 4 | 5 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 6 | ''' 7 | 8 | import os 9 | import sys 10 | import json 11 | import re 12 | from tqdm import tqdm 13 | 14 | def fix_duplicate_doc_id(mz_prediction_file): 15 | ''' 16 | Before going the next steps, we need to fix the "same doc id under the 17 | the same query" problem. Loop the qid under each query, if we find a 18 | duplicated qid, add '-' + dup_times after this did 19 | For example, if D811 is duplidated, we change it to D811-1, D811-2, etc. 20 | ''' 21 | nd_file = mz_prediction_file + '.nd' 22 | dup_times = 1 23 | with open(mz_prediction_file) as f_in, open(nd_file, 'w') as f_out: 24 | cur_qid = 'init' 25 | cache_did_set = set() 26 | cache_q_lines = [] 27 | for l in f_in: 28 | tokens = l.strip().split() 29 | if tokens[0] == cur_qid: 30 | # same qid 31 | if tokens[2] in cache_did_set: 32 | # means we find a duplicate doc id 33 | tokens[2] += ('-' + str(dup_times)) 34 | print('found dup doc_id, gen a new doc_id and qid: ', tokens[2], tokens[0]) 35 | dup_times += 1 36 | cache_did_set.add(tokens[2]) 37 | cache_q_lines.append('\t'.join(tokens) + '\n') # tokens[2] maybe changed 38 | else: 39 | # meet a new qid 40 | f_out.write(''.join(cache_q_lines)) 41 | dup_times = 1 # reset 42 | cache_q_lines = [] 43 | cache_q_lines.append(l) 44 | cache_did_set.clear() 45 | cur_qid = tokens[0] 46 | cache_did_set.add(tokens[2]) 47 | # the last query 48 | # print len(cache_q_lines), len(cache_did_set) 49 | if (len(cache_q_lines) != 0 and len(cache_q_lines) == len( 50 | cache_did_set)): 51 | f_out.write(''.join(cache_q_lines)) 52 | print('write the last query... 
done: ', ''.join(cache_q_lines)) 53 | return nd_file 54 | 55 | if __name__ == '__main__': 56 | 57 | if len(sys.argv) < 2: 58 | print 'please input params: mz_prediction_file (absolute path)' 59 | exit(1) 60 | mz_prediction_file = sys.argv[1] # dmn_cnn_prf_body.predict.test.txt for smn 61 | # dmn_cnn_kd_word-embedsize-500-rid-17-test-iter-250.predict.test.txt for ms 62 | # dmn_cnn_kd_word-embedsize-100-rid-17-test-iter-300.predict.test.txt for udc 63 | # dmn_cnn-pureDMN-contextlen-4-test-iter-500.predict.test.txt for ms 64 | 65 | # DMN for MS_V2 ../data/ms_v2/ModelRes/ms_v2-dmn_cnn_pure-goulburn-weights-best289-04092018-iter-290.predict.test.txt 66 | # DMN-PRF for MS_V2 ../data/ms_v2/ModelRes/ms_v2-dmn_cnn_prf_body-contextlen-10.weights.250.predict.test.txt 67 | # DMN-KD for MS_V2 ../data/ms_v2/ModelRes/ms_v2-dmn_cnn_kd_word-contextlen-6-rid-20.weights.350.predict.test.txt 68 | # SMN for MS_V2 ../data/ms_v2/ModelRes/ms_v2-smn-test.pkl.pred.txt.mz-score-file 69 | 70 | # Before going the next steps, we need to fix the "same doc id under the 71 | # the same query" problem. Loop the qid under each query, if we find a 72 | # duplicated qid, add '-' + dup_times after this did 73 | # For example, if D811 is duplidated, we change it to D811-1, D811-2, etc. 74 | mz_prediction_file = fix_duplicate_doc_id(mz_prediction_file) 75 | 76 | # Q597901 Q0 D118777 0 2.370282 DMN_CNN 0(ground truth) 77 | # seperated by \t 78 | # qid \t Q0 \t did \t rank \t score \t method \t ground_truth_label 79 | # 030 Q0 ZF08-175-870 0 4238 prise1 80 | # qid iter docno rank sim run_id 81 | # In particular, note that the rank field is ignored here; 82 | # internally ranks are assigned by sorting by the sim field with ties 83 | # broken deterministicly (using docno). 84 | with open(mz_prediction_file) as f_in, open(mz_prediction_file + '.score', 'w') as score_out, open(mz_prediction_file + '.qrel', 'w') as qrel_out: 85 | for l in f_in: 86 | to = l.split('\t') 87 | score_out.write(' '.join(to[0:len(to)-1]) + '\n') 88 | qrel_out.write(to[0] + ' Q0 ' + to[2] + ' ' + to[6]) # qid iter docno rel 89 | 90 | # compute per-query metrics with qrel 91 | # use -q to print out metrics for all queries 92 | cmd = '''trec_eval -m 'all_trec' -q ''' + mz_prediction_file + '.qrel ' + mz_prediction_file + '.score > ' \ 93 | + mz_prediction_file + '.metrics' 94 | print 'run ', cmd 95 | os.system(cmd) 96 | 97 | # parse the metrics file to extract the used metrics into a json file 98 | # {'Q101' : {'map':0.876, 'recall5':0.876, 'recall1':0.876, 'recall2':0.876}} 99 | q_metrics_dict = {} 100 | used_metrics = {'map', 'recall_1', 'recall_2', 'recall_5'} 101 | with open(mz_prediction_file + '.metrics') as f_in: 102 | #'\s+' to match 1 to many spaces 103 | for l in tqdm(f_in): 104 | to = re.split('\s+', l) # m_name qid score 105 | if to[0] not in used_metrics: 106 | continue 107 | if to[1] in q_metrics_dict: 108 | q_metrics_dict[to[1]][to[0]] = float(to[2]) 109 | else: 110 | q_metrics_dict[to[1]] = {} 111 | q_metrics_dict[to[1]][to[0]] = float(to[2]) 112 | with open(mz_prediction_file + '.metrics.json', 'w') as outfile: 113 | json.dump(q_metrics_dict, outfile) 114 | 115 | 116 | -------------------------------------------------------------------------------- /IART/conqa/transfer_mz_to_dam_format.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data preprocess of MS_V2 and UDC data for running DAM model 3 | Transfer preprocessed data in MZ format to DAM input format 4 | A good preprocess is very important for 
good performance 5 | Preprocess UDC for debugging purpose 6 | Preprocess MS_V2 to get results of DAM on MS_V2 7 | The input data format is label \t context (utterances seperated by \t) \t response 8 | Add qid/dids in relation files in the pkl file in order to associate the 9 | corresponding intent vectors in the future 10 | 11 | Firstly run data_preprocess_dam.py, then run transfer_mz_to_dam_format.py 12 | 13 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 14 | @homepage: https://sites.google.com/site/lyangwww/ 15 | ''' 16 | 17 | import sys 18 | import pickle 19 | import random 20 | from tqdm import tqdm 21 | 22 | def gen_id2corpus(corpus_pre_file, word_dict_file): 23 | word_dict = dict() 24 | id2corpus = dict() 25 | id2word = dict() 26 | with open(word_dict_file, 'r') as fin: 27 | for l in fin: 28 | tok = l.split() 29 | word_dict[tok[0]] = tok[1].strip() 30 | id2word[int(tok[1].strip())] = tok[0] 31 | # word_id for _eos_ is the maxID+1 in word_dict 32 | word_dict['_eos_'] = str(len(word_dict) + 1) 33 | id2word[len(id2word) + 1] = '_eos_' 34 | print('word id for _eos_: ', word_dict['_eos_']) 35 | #print('len(id2word)', len(id2word)) 36 | with open(corpus_pre_file, 'r') as fin: 37 | for l in tqdm(fin): 38 | tok = l.split('\t') 39 | id = tok[0] 40 | if 'D' in id: 41 | id2corpus[id] = [int(w) for w in tok[2].split()] 42 | else: 43 | utts = [] 44 | for i in range(2, len(tok)): 45 | utts.extend(tok[i].split()) 46 | utts.append(word_dict['_eos_']) 47 | utts = utts[0:len(utts)-2] # remove the last 2 _eos_ 48 | utts = [int(w) for w in utts] 49 | # utts_words = [id2word[w] for w in utts] 50 | # print('test utts_words: ', utts_words) 51 | id2corpus[id] = utts 52 | return id2corpus, word_dict 53 | 54 | def gen_dam_inputs(basedir, data_partition, id2corpus, word_dict, gen_mode, 55 | relation_mode, ag_mode, ag_sample_num): 56 | if relation_mode == 'nofd' \ 57 | or data_partition == 'test' \ 58 | or data_partition == 'valid': # can't filter queries in test/valid 59 | rel_file = basedir + 'relation_' + data_partition + '.txt' # for data_nofd.pkl 60 | elif ag_mode == 'yes': 61 | rel_file = basedir + 'relation_' + data_partition + '.txt.ag' + ag_sample_num # for data_ag2.pkl 62 | else: 63 | rel_file = basedir + 'relation_' + data_partition + '.txt.fd' # for data.pkl 64 | print 'using relation file: ', rel_file 65 | 66 | labels = [] 67 | qids = [] 68 | dids = [] 69 | context = [] 70 | resp = [] 71 | 72 | with open(rel_file) as fin: 73 | ins_num = 0 74 | for l in tqdm(fin): 75 | ins_num += 1 76 | tok = l.strip().split() 77 | labels.append(int(tok[0])) 78 | qids.append(tok[1].strip()) 79 | dids.append(tok[2].strip()) 80 | context.append(id2corpus[tok[1]]) 81 | resp.append(id2corpus[tok[2]]) 82 | if gen_mode == 'small' and ins_num >= 10000: 83 | break 84 | # Add qid/dids in relation files in the pkl file in order to associate the 85 | # corresponding intent vectors in the future 86 | return {'y': labels, 'c': context, 'r': resp, 'qids': qids, 'dids': dids} 87 | 88 | def gen_relation_train_ag_file(basedir, data_name, ag_sample_num): 89 | ''' 90 | Perform data augumentation for training data by ramdonly sampling more 91 | negtive training data 92 | Only need to do this for UDC data 93 | ''' 94 | if data_name != 'udc': 95 | raise NameError('can only do ag for udc!') 96 | rel_train_file = basedir + 'relation_train.txt' 97 | doc_id_pool = set() 98 | with open(rel_train_file) as fin: 99 | for l in fin: 100 | t = l.strip().split() 101 | doc_id_pool.add(t[2]) 102 | # for UDC, each qid has 1 pos did and 
1 neg did 103 | # we further sample k more neg dids for each qid 104 | doc_id_pool = list(doc_id_pool) 105 | total_doc_num = len(doc_id_pool) 106 | print 'test total_doc_num', total_doc_num 107 | with open(rel_train_file) as fin, open( 108 | rel_train_file + '.ag' + ag_sample_num, 'w') as fout: 109 | line_index = -1 110 | for l in tqdm(fin): 111 | t = l.strip().split() 112 | fout.write(l) 113 | line_index += 1 114 | if line_index % 2 == 1: 115 | #print 'test cur line_index and t', line_index, t 116 | sampled_num = 0 117 | while sampled_num < int(ag_sample_num): # transfer to int! 118 | #print 'test sampled_num and ag_sample_num: ', sampled_num, ag_sample_num 119 | pick = random.randint(0,total_doc_num-1) 120 | sdid = doc_id_pool[pick] 121 | #print 'test pick sdid t[2]: ', pick, sdid, t[2] 122 | if sdid != t[2]: 123 | fout.write('0 ' + t[1] + ' ' + sdid + '\n') 124 | #print 'test sampled_num: ', sampled_num 125 | sampled_num += 1 126 | 127 | if __name__ == '__main__': 128 | if len(sys.argv) < 6: 129 | print 'please input params: data_name (udc or ms_v2) \ 130 | gen_mode (full or small) relation_mode(fd or nofd) ag_mode(yes or no) \ 131 | ag_sample_num (2,4,6,8)' 132 | exit(1) 133 | # If gen_mode=small, only use 10000 train/valid/test relations for debug 134 | # If relation_mode=nofd, use original relation files without filtering 135 | # queries with duplicate doc id 136 | # If ag_mode=yes, do data augumentation for training data by sample 137 | # new negative dids from the doc id pool. If the sampled did is already 138 | # covered, do resampling; otherwise add this sampled pair into 139 | # the training data to get a larger training data 140 | # transfer_mz_to_dam_format.py udc full fd yes 2 141 | data_name = sys.argv[1] # udc or ms_v2 142 | gen_mode = sys.argv[2] # full or small 143 | relation_mode = sys.argv[3] # fd or nofd 144 | ag_mode = sys.argv[4] # yes or no 145 | ag_sample_num = sys.argv[5] # 2,4,6,8 146 | basedir = '../../data/' + data_name + '/' 147 | 148 | corpus_pre_file = basedir + 'corpus_preprocessed.txt' 149 | word_dict_file = basedir + 'word_dict.txt' 150 | id2corpus, word_dict = gen_id2corpus(corpus_pre_file, word_dict_file) 151 | 152 | if ag_mode == 'yes': 153 | gen_relation_train_ag_file(basedir, data_name, ag_sample_num) 154 | 155 | # transform context/response pairs into input pkl file of DAM model 156 | train = gen_dam_inputs(basedir, 'train', id2corpus, word_dict, gen_mode, 157 | relation_mode, ag_mode, ag_sample_num) 158 | valid = gen_dam_inputs(basedir, 'valid', id2corpus, word_dict, gen_mode, 159 | relation_mode, 'no', ag_sample_num) # no ag for valid 160 | test = gen_dam_inputs(basedir, 'test', id2corpus, word_dict, gen_mode, 161 | relation_mode, 'no', ag_sample_num) # no ag for test 162 | 163 | if gen_mode == 'small': 164 | data_pkl_name = 'data_small.pkl' 165 | elif relation_mode == 'nofd': 166 | data_pkl_name = 'data_nofd.pkl' 167 | elif ag_mode == 'yes': 168 | data_pkl_name = 'data_ag' + ag_sample_num + '.pkl' 169 | else: 170 | data_pkl_name = 'data.pkl' 171 | print('begin writing data pkl file...', data_pkl_name) 172 | pickle.dump((train,valid,test), open(basedir + data_pkl_name, 'wb')) 173 | print('finish writing data pkl file...', data_pkl_name) 174 | with open(basedir + 'word2id', 'w') as fout: 175 | for w in word_dict: 176 | fout.write(w + '\n') 177 | fout.write(word_dict[w] + '\n') 178 | print('write word_dict done!') 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 2 | 3 | # __IART: Intent-aware Response Ranking with Transformers in Information-seeking Conversation Systems__ 4 | 5 | This is the source code of the IART model (Intent-aware Response Ranking with Transformers), which is proposed for multi-turn response selection in retrieval-based conversation systems. 6 | 7 | IART is built on top of the integration of user intent modeling and 8 | language representation learning with the Transformer architecture, 9 | which relies entirely on a self-attention mechanism instead of recurrent 10 | nets. It incorporates intent-aware utterance attention to derive an 11 | importance weighting scheme over the utterances in the conversation context, with 12 | the aim of better conversation history understanding. IART was 13 | published at WWW 2020. Please see our [paper](https://arxiv.org/pdf/2002.00571.pdf) 14 | for more details of this model. 15 | 16 | ## __Network__ 17 | 18 | IART is built on top of the [DAM](https://github.com/baidu/Dialogue/tree/master/DAM) model. The 19 | model incorporates intent-aware utterance attention to derive the 20 | importance weighting scheme of different context utterances. Given input 21 | context utterances and response candidates, we first generate representations from two different perspectives: user intent representations with a 22 | trained neural classifier and semantic information encoding with Transformers. Then self-attention and cross-attention matching is performed over the 23 | encoded representations from Transformers to extract matching features. 24 | These matching features are weighted by the intent-aware attention mechanism and aggregated into a matching tensor. Finally, a two-layer 3D convolutional neural network distills final representations from 25 | the matching tensor and generates the ranking score for the conversation context/response candidate pair. The main difference between IART and 26 | DAM is that we explicitly define and model user intent in conversations. 27 | We show that the intent-aware attention mechanism can help improve response ranking in conversations. 28 | 29 |
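The attention variant is selected with the `intent_attention_type` option in the training scripts (`dot`, `bilinear`, or `outprod`); the exact TensorFlow implementation is in `models/iadam_attention.py`. The snippet below is only a simplified numpy sketch of the idea, with random stand-ins for the learned parameters, and the `outprod` form in particular is an illustrative assumption rather than the model's exact formulation:

```python
# Simplified numpy illustration of intent-aware utterance attention.
# Random matrices/vectors stand in for parameters learned in the real model;
# see models/iadam_attention.py for the actual TensorFlow implementation.
import numpy as np

def intent_attention(turn_intents, resp_intent, attention_type='bilinear'):
    # turn_intents: [max_turn_num, intent_size], resp_intent: [intent_size]
    if attention_type == 'dot':
        logits = turn_intents.dot(resp_intent)
    elif attention_type == 'bilinear':
        W = 0.1 * np.random.randn(resp_intent.size, resp_intent.size)  # learned in the model
        logits = turn_intents.dot(W).dot(resp_intent)
    elif attention_type == 'outprod':
        v = 0.1 * np.random.randn(resp_intent.size ** 2)               # learned in the model
        outer = np.einsum('ti,j->tij', turn_intents, resp_intent)      # per-turn outer product
        logits = outer.reshape(turn_intents.shape[0], -1).dot(v)
    else:
        raise ValueError('unknown intent_attention_type')
    weights = np.exp(logits - logits.max())
    return weights / weights.sum()  # softmax: one weight per context utterance

att = intent_attention(np.random.rand(6, 12), np.random.rand(12), 'dot')
print(att)  # sums to 1; used to weight the per-utterance matching features
```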
30 | 31 | ![IART model architecture](figures/iart-model.png)
32 | 33 | ## __Usage__ 34 | 35 | First, please download data from this [Google Drive folder](https://drive.google.com/drive/folders/1ayXN6pgzxs7DP9iCO6JR-KXbQHPUOLXx?usp=sharing) and 36 | unzip it: 37 | 38 | ``` 39 | unzip data.zip 40 | ``` 41 | 42 | Train and test the model by: 43 | ``` 44 | sh run.sh 45 | ``` 46 | 47 | ## __Data Preprocessing__ 48 | 49 | The code for data preprocessing is in the folder IART/conqa. The input data format is 50 | 51 | ``` 52 | label \t context (utterances separated by \t) \t response. 53 | ``` 54 | 55 | To generate the pkl data files used for model training/testing, you need to firstly run the script conqa/data_preprocess_dam.py to do data preprocessing. After that you need to run the script conqa/transfer_mz_to_dam_format.py to transfer the preprocessed data in MatchZoo model format to the input data format of DAM/IART model. 56 | 57 | The user intent feature vectors output by the user intent classifier are stored in a separate input text file. The format of the user intent vector should be 58 | 59 | ``` 60 | UtteranceID_or_ResponseID \t intent_vector (12 dimensional real value vector for MS/UDC) 61 | ``` 62 | 63 | The utterance ID is in the format context_query_id-utterance_position. For example: 64 | 65 | ``` 66 | Q494819-0 \t 0.77854055 0.005579375 0.012071793 0.1426655 0.04516567 0.01410749 0.045949493 0.32732058 0.021191863 0.052890584 0.019807862 0.06566066 67 | Q494819-1 \t 0.07301873 0.106960244 0.17646718 0.061297048 0.45176902 0.31784177 0.051918026 0.21009733 0.11450306 0.5177668 0.1812782 0.118079714 68 | Q494819-2 \t 0.20212053 0.1563704 0.16760118 0.102229066 0.356528 0.31535488 0.081880495 0.28093183 0.11076649 0.46562743 0.18301935 0.16115002 69 | D464632 \t 0.0762779 0.0095357755 0.046223667 0.018178586 0.00872361 0.0409651 0.0002578992 0.3250671 4.5165205e-05 0.5883033 0.06031314 0.0021345518 70 | D464633 \t 0.08268682 0.09142208 0.17887363 0.12554155 0.10471701 0.15151422 0.020252429 0.4203747 0.019151673 0.4006856 0.20331828 0.064555936 71 | D464630 \t 0.047641575 0.05804358 0.10583141 0.048851516 0.09105808 0.18841337 0.010175006 0.52949446 0.0072372677 0.38212296 0.13627377 0.037408095 72 | ``` 73 | 74 | Q* denotes context utterances. D* denotes response candidates 75 | The following script can help you generate this vector file 76 | conqa/gen_user_intent_vector.py. You can also modify this script or write your own scripts to transfer the intent feature vectors for your own data sets to this format. 77 | 78 | ## __Further Instructions on Training/Testing IART__ 79 | 80 | The main scripts for training/testing IART model based on UDC and [MSDialog](https://ciir.cs.umass.edu/downloads/msdialog/) data are main_udc.py and main_ms_v2.py respectively. With the right setting on the model configuration, you can start the model training based on [MSDialog](https://ciir.cs.umass.edu/downloads/msdialog/) data by running 81 | 82 | ``` 83 | python main_ms_v2.py 84 | ``` 85 | 86 | * Set intent_vec_path as the path of the intent features vector file you use. 87 | * Set intent_size as the number of different intent (12 for UDC and MS) 88 | * Set intent_attention_type as bilinear for IART-bilinear, as dot for IART-dot, as outprod for IART-outerproduct 89 | * Set model_name as iadam-attention 90 | * Set data_name, data_path, save_path, word_embed_init, vocab_size, embed_size, batch_size, max_turn_num, max_turn_len, _EOS_ according to the specific setting of your own data sets. 
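Putting the settings above together, a typical set of overrides in the `conf` dictionary of main_ms_v2.py might look like the sketch below. Every path is a placeholder for your own data and output layout; the key names match the configuration dictionary already defined in the script:

```python
# Hypothetical conf overrides in main_ms_v2.py for a full training run.
# All paths are placeholders -- point them at your own data/output directories.
conf["data_path"] = "../data/ms_v2/data.pkl"                        # full data instead of data_small.pkl
conf["intent_vec_path"] = "../data/ms_v2/intent_vectors.txt"        # intent feature vectors
conf["intent_size"] = 12                                            # 12 intent classes for MS/UDC
conf["intent_attention_type"] = "bilinear"                          # 'dot', 'bilinear' or 'outprod'
conf["model_name"] = "iadam-attention"                              # IART in the paper
conf["word_emb_init"] = "../data/ms_v2/cut_embed_mikolov_200d.pkl"  # pre-trained word embeddings
conf["save_path"] = "../output/ms_v2/iart-bilinear-run1/"           # hypothetical run directory
```

With these values set, running `python main_ms_v2.py` starts training, and checkpoints and score files should appear under `save_path`.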
91 | 92 | For model testing, you need to additionally set init_model as the path of the best model checkpoint file from your trained model. 93 | 94 | ## __Example Model Training Output__ 95 | 96 | ``` 97 | ..... 98 | [finish building valid batches 99 | 2021-04-17 14:24:46 100 | ('batch_size: ', 32) 101 | ('total number of batches in one epoch: ', 312) 102 | configurations: 103 | {'vocab_size': 167983, 'intent_vec_path': '../data/ms_v2/intent_vectors.txt', 'data_name': 'ms_v2', 'intent_loss_weight': 0.2, 'emb_size': 200, 'is_mask': True, 'train_steps': 1560, 'drop_attention': None, 'word_emb_init': None, 'print_step': 3, 'save_path': '../output/ms_v2/temp/', 'max_turn_num': 6, 'is_positional': False, 'data_path': '../data/ms_v2/data_small.pkl', 'init_model': None, '_EOS_': 167983, 'learning_rate': 0.001, 'intent_attention_type': 'bilinear', 'rand_seed': None, 'drop_dense': None, 'batch_size': 32, 'final_n_class': 1, 'intent_size': 12, 'intent_ffn_od1': 64, 'intent_ffn_od0': 128, 'attention_type': 'dot', 'cnn_3d_oc0': 16, 'cnn_3d_oc1': 16, 'max_turn_len': 200, 'num_scan_data': 5, 'max_to_keep': 1, 'save_step': 31, 'is_layer_norm': True, 'stack_num': 4, 'model_name': 'iadam-attention'} 104 | model sucess 105 | 2021-04-17 14:24:46 106 | ('current turn_index : ', 0) 107 | ('current turn_index : ', 1) 108 | ('current turn_index : ', 2) 109 | ('current turn_index : ', 3) 110 | ('current turn_index : ', 4) 111 | ('current turn_index : ', 5) 112 | [attention_logits] after stack attention_logits.shape: (32, 6) 113 | [attention_mask] attention_mask.shape: (32, 6) 114 | [attention] attention.shape: (32, 6) 115 | [3d cnn aggregation] sim shape: (32, 6, 200, 200, 10) 116 | conv_0 shape: (32, 6, 200, 200, 16) 117 | pooling_0 shape: (32, 2, 67, 67, 16) 118 | conv_1 shape: (32, 2, 67, 67, 16) 119 | pooling_1 shape: (32, 1, 23, 23, 16) 120 | [3d cnn aggregation] final_info: (32, 8464) 121 | build graph sucess 122 | 2021-04-17 14:28:23 123 | 2021-04-17 14:28:23.570120: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA 124 | starting shuffle train data 125 | finish building train data 126 | step: 3 lr: 0.001, epoch: 0 127 | step: 3 processed current epoch: [0.00961538461538] loss: 0.567167798678 128 | step: 6 lr: 0.001, epoch: 0 129 | step: 6 processed current epoch: [0.0192307692308] loss: 0.393668711185 130 | step: 9 lr: 0.001, epoch: 0 131 | step: 9 processed current epoch: [0.0288461538462] loss: 0.467894713084 132 | step: 12 lr: 0.001, epoch: 0 133 | step: 12 processed current epoch: [0.0384615384615] loss: 0.413786088427 134 | step: 15 lr: 0.001, epoch: 0 135 | step: 15 processed current epoch: [0.0480769230769] loss: 0.271514303361]: 136 | ...... 137 | 138 | ``` 139 | 140 | ## __Dependencies__ 141 | 142 | - Python 2.7.3 143 | - Tensorflow == 1.4 144 | 145 | ## __Citation__ 146 | 147 | The following article describes the IART model in detail. 148 | 149 | ``` 150 | @inproceedings{ 151 | title={IART: Intent-aware Response Ranking with Transformers in Information-seeking Conversation Systems}, 152 | author={Liu Yang, Minghui Qiu, Chen Qu, Cen Chen, Jiafeng Guo, Yongfeng Zhang, W. Bruce Croft, Haiqing Chen}, 153 | booktitle={WWW 2020}, 154 | year={2020} 155 | } 156 | ``` 157 | 158 | ## __Acknowledgement__ 159 | 160 | IART is built based on the [DAM model](https://github.com/baidu/Dialogue/tree/master/DAM) released by [Zhou et. al. ACL 2018](https://www.aclweb.org/anthology/P18-1103/). 
We thank the DAM authors for the effort on open sourcing their model code. 161 | 162 | ## __Contact__ 163 | 164 | For help or issues using IART, please submit a GitHub issue. 165 | 166 | For personal communication related to IART, please contact Liu Yang (yangliuyx@gmail.com), Minghui Qiu (minghuiqiu@yeah.net), Chen Qu (quchen0502@gmail.com) or Cen Chen (cecilia.cenchen@gmail.com). 167 | -------------------------------------------------------------------------------- /IART/bin/train_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import cPickle as pickle 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | import utils.reader as reader 10 | import utils.evaluation as eva 11 | 12 | 13 | def train(conf, _model): 14 | 15 | if conf['rand_seed'] is not None: 16 | np.random.seed(conf['rand_seed']) 17 | 18 | if not os.path.exists(conf['save_path']): 19 | os.makedirs(conf['save_path']) 20 | 21 | # load data 22 | print('starting loading data') 23 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 24 | train_data, val_data, test_data = pickle.load(open(conf["data_path"], 'rb')) 25 | print('lyang test: val_data: ', len(val_data), type(val_data), 26 | len(val_data['y'])) 27 | print('lyang test: val_data[y]: ', val_data['y'][0:2]) 28 | print('lyang test: val_data[c]: ', val_data['c'][0:2]) 29 | print('lyang test: val_data[r]: ', val_data['r'][0:2]) 30 | print('lyang test: val_data[qids]: ', val_data['qids'][0:2]) 31 | print('lyang test: val_data[dids]: ', val_data['dids'][0:2]) 32 | print('map id to words ...') 33 | id2word = reader.read_dict('../data/' + conf["data_name"]+ '/word2id') 34 | response_ids = val_data['r'][0:1][0] 35 | context_ids = val_data['c'][0:1][0] 36 | print('lyang test: val_data[c]: ', 37 | [id2word[str(id)] for id in context_ids], val_data.keys()) 38 | print('lyang test: val_data[r]: ', 39 | [id2word[str(id)] for id in response_ids], val_data.keys()) 40 | 41 | print('finish loading data') 42 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 43 | 44 | print('init intent_dict...') 45 | conf['intent_dict'] = reader.read_intent(conf['intent_vec_path']) if conf[ 46 | 'model_name'] != 'dam' else None 47 | print('lyang test len(conf[intent_dict])', len(conf['intent_dict'])) 48 | val_batches = reader.build_batches(val_data, conf) 49 | 50 | # check the example 0 and 1 in batch 0 51 | print('intent of val_batches context: ', val_batches['turns_intent'][0][0:2]) 52 | print('intent of val_batches response: ',val_batches['response_intent'][0][0:2]) 53 | 54 | print("finish building valid batches") 55 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 56 | 57 | # refine conf 58 | batch_num = len(train_data['y']) / conf["batch_size"] 59 | print('batch_size: ', conf["batch_size"]) 60 | print('total number of batches in one epoch: ', batch_num) 61 | val_batch_num = len(val_batches["response"]) 62 | 63 | conf["train_steps"] = conf["num_scan_data"] * batch_num # total number of training steps epoch_num * batch_num 64 | conf["save_step"] = max(1, batch_num / 10) # at most save 10 times 65 | conf["print_step"] = max(1, batch_num / 100) # at most print 100 times 66 | 67 | print('configurations:') 68 | conf_copy = {} 69 | for k in conf: 70 | if k != 'intent_dict': 71 | conf_copy[k] = conf[k] 72 | print(conf_copy) 73 | 74 | print('model sucess') 75 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 76 | 77 | _graph 
= _model.build_graph() 78 | print('build graph sucess') 79 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 80 | 81 | with tf.Session(graph=_graph) as sess: 82 | # train_writer = tf.summary.FileWriter( 83 | # conf["save_path"] + "tensorboard_log/", sess.graph) 84 | # merge = tf.summary.merge_all() # for tensorboard 85 | _model.init.run(); 86 | if conf["init_model"]: 87 | _model.saver.restore(sess, conf["init_model"]) 88 | print("sucess init %s" %conf["init_model"]) 89 | 90 | average_loss = 0.0 91 | batch_index = 0 92 | step = 0 93 | best_result = [0, 0, 0, 0] 94 | 95 | for step_i in xrange(conf["num_scan_data"]): 96 | #for batch_index in rng.permutation(range(batch_num)): 97 | print('starting shuffle train data') 98 | shuffle_train = reader.unison_shuffle(train_data) 99 | train_batches = reader.build_batches(shuffle_train, conf) 100 | print('finish building train data') 101 | for batch_index in range(batch_num): 102 | 103 | feed = { 104 | _model.turns: train_batches["turns"][batch_index], 105 | _model.tt_turns_len: train_batches["tt_turns_len"][batch_index], 106 | _model.every_turn_len: train_batches["every_turn_len"][batch_index], 107 | _model.response: train_batches["response"][batch_index], 108 | _model.response_len: train_batches["response_len"][batch_index], 109 | _model.label: train_batches["label"][batch_index], 110 | } 111 | if conf['model_name'] != 'dam': 112 | feed[_model.turns_intent] = train_batches["turns_intent"][batch_index] 113 | feed[_model.response_intent] = train_batches["response_intent"][batch_index] 114 | 115 | batch_index = (batch_index + 1) % batch_num; 116 | 117 | _, curr_loss = sess.run([_model.g_updates, _model.loss], feed_dict = feed) 118 | # print loss and metrics into tensorboard log 119 | # train_writer.add_summary(summ, global_step=step) 120 | 121 | average_loss += curr_loss 122 | 123 | step += 1 124 | 125 | if step % conf["print_step"] == 0 and step > 0: 126 | g_step, lr = sess.run([_model.global_step, _model.learning_rate]) 127 | print('step: %s lr: %s, epoch: %s ' %(g_step, lr, step_i)) 128 | print("step: " + str(g_step)+ " processed current epoch: [" \ 129 | + str(step * 1.0 / batch_num) + "] loss: " + \ 130 | str(average_loss / conf["print_step"])) 131 | average_loss = 0 132 | 133 | if step % conf["save_step"] == 0 and step > 0: 134 | index = step / conf['save_step'] 135 | score_file_path = conf['save_path'] + 'score.' 
+ str(index) 136 | score_file = open(score_file_path, 'w') 137 | print('save step: %s' %index) 138 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 139 | 140 | for batch_index in xrange(val_batch_num): 141 | 142 | feed = { 143 | _model.turns: val_batches["turns"][batch_index], 144 | _model.tt_turns_len: val_batches["tt_turns_len"][batch_index], 145 | _model.every_turn_len: val_batches["every_turn_len"][batch_index], 146 | _model.response: val_batches["response"][batch_index], 147 | _model.response_len: val_batches["response_len"][batch_index], 148 | _model.label: val_batches["label"][batch_index] 149 | } 150 | if conf['model_name'] != 'dam': 151 | feed[_model.turns_intent] = \ 152 | val_batches["turns_intent"][batch_index] 153 | feed[_model.response_intent] = \ 154 | val_batches["response_intent"][batch_index] 155 | 156 | scores = sess.run(_model.logits, feed_dict = feed) 157 | 158 | for i in xrange(conf["batch_size"]): 159 | score_file.write( 160 | str(scores[i]) + '\t' + 161 | str(val_batches["label"][batch_index][i]) + '\n') 162 | score_file.close() 163 | 164 | #write evaluation result 165 | result = eva.evaluate(score_file_path) 166 | result_file_path = conf["save_path"] + "result." + str(index) 167 | with open(result_file_path, 'w') as out_file: 168 | for m in result: 169 | out_file.write(str(m) + '\n') 170 | print('finish evaluation') 171 | # lyang: also print metrics in log file 172 | print('save step:\t{:d}\t[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format( 173 | index, result[0], result[1], result[2], result[3], result[4])) 174 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 175 | # lyang: also print metrics in tensorboard log file 176 | # metrics = tf.Summary(value=[ 177 | # tf.Summary.Value(tag="R10at1", simple_value=result[1]), 178 | # ]) 179 | # metrics.value.add(tag="MAP", simple_value=result[4]) 180 | # # metrics.value.add(tag="R10at2", simple_value=result[2]) 181 | # # metrics.value.add(tag="R10at5", simple_value=result[3]) 182 | # train_writer.add_summary(metrics, global_step=step) 183 | 184 | if result[1] + result[2] > best_result[1] + best_result[2]: # save model only when find a model better than previously best model 185 | best_result = result 186 | _save_path = _model.saver.save(sess, conf["save_path"] + "model.ckpt." + str(step / conf["save_step"])) 187 | print("succ saving model in " + _save_path) 188 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /IART/models/net.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cPickle as pickle 4 | 5 | import utils.layers as layers 6 | import utils.operations as op 7 | 8 | class Net(object): 9 | '''Add positional encoding(initializer lambda is 0), 10 | cross-attention, cnn integrated and grad clip by value. 11 | 12 | Attributes: 13 | conf: a configuration paramaters dict 14 | word_embedding_init: a 2-d array with shape [vocab_size+1, emb_size] 15 | there is one dimension in vocab_size which is corresponding to _eos_. 
16 | in our preprocessing, _eos_ is always the last dimension 17 | +1 to add one more embedding vector for padding and masking 18 | We add an "all 0" vector in the 0-th row of word_embedding_init in order 19 | to denote the padding word 20 | when call tf.nn.embedding_lookup(), if word_id = 0, then this is a paded 21 | word; if word_id > 0 (from 1 to vocab_size), then this is a real word 22 | ''' 23 | def __init__(self, conf): 24 | self._graph = tf.Graph() 25 | self._conf = conf 26 | 27 | if self._conf['word_emb_init'] is not None: 28 | print('loading word emb init') 29 | self._word_embedding_init = pickle.load(open(self._conf['word_emb_init'], 'rb')) 30 | else: 31 | self._word_embedding_init = None 32 | 33 | def build_graph(self): 34 | with self._graph.as_default(): 35 | if self._conf['rand_seed'] is not None: 36 | rand_seed = self._conf['rand_seed'] 37 | tf.set_random_seed(rand_seed) 38 | print('set tf random seed: %s' %self._conf['rand_seed']) 39 | 40 | #word embedding 41 | if self._word_embedding_init is not None: 42 | word_embedding_initializer = tf.constant_initializer(self._word_embedding_init) 43 | else: 44 | word_embedding_initializer = tf.random_normal_initializer(stddev=0.1) 45 | 46 | self._word_embedding = tf.get_variable( 47 | name='word_embedding', 48 | shape=[self._conf['vocab_size']+1, self._conf['emb_size']], 49 | dtype=tf.float32, 50 | initializer=word_embedding_initializer) 51 | 52 | 53 | #define placehloders 54 | self.turns = tf.placeholder( 55 | tf.int32, 56 | shape=[self._conf["batch_size"], self._conf["max_turn_num"], self._conf["max_turn_len"]]) 57 | 58 | self.tt_turns_len = tf.placeholder( 59 | tf.int32, 60 | shape=[self._conf["batch_size"]]) 61 | 62 | self.every_turn_len = tf.placeholder( 63 | tf.int32, 64 | shape=[self._conf["batch_size"], self._conf["max_turn_num"]]) 65 | 66 | self.response = tf.placeholder( 67 | tf.int32, 68 | shape=[self._conf["batch_size"], self._conf["max_turn_len"]]) 69 | 70 | self.response_len = tf.placeholder( 71 | tf.int32, 72 | shape=[self._conf["batch_size"]]) 73 | 74 | self.label = tf.placeholder( 75 | tf.float32, 76 | shape=[self._conf["batch_size"]]) 77 | 78 | 79 | #define operations 80 | #response part 81 | Hr = tf.nn.embedding_lookup(self._word_embedding, self.response) 82 | 83 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 84 | with tf.variable_scope('positional'): 85 | Hr = op.positional_encoding_vector(Hr, max_timescale=10) 86 | Hr_stack = [Hr] 87 | # lyang comments: self attention 88 | for index in range(self._conf['stack_num']): 89 | with tf.variable_scope('self_stack_' + str(index)): 90 | Hr = layers.block( # attentive module 91 | Hr, Hr, Hr, 92 | Q_lengths=self.response_len, K_lengths=self.response_len, 93 | attention_type=self._conf['attention_type']) 94 | Hr_stack.append(Hr) 95 | 96 | 97 | #context part 98 | #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] 99 | list_turn_t = tf.unstack(self.turns, axis=1) 100 | list_turn_length = tf.unstack(self.every_turn_len, axis=1) 101 | 102 | sim_turns = [] 103 | #for every turn_t calculate matching vector 104 | for turn_t, t_turn_length in zip(list_turn_t, list_turn_length): 105 | Hu = tf.nn.embedding_lookup(self._word_embedding, turn_t) #[batch, max_turn_len, emb_size] 106 | 107 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 108 | with tf.variable_scope('positional', reuse=True): 109 | Hu = op.positional_encoding_vector(Hu, max_timescale=10) 110 | Hu_stack = [Hu] 111 | 112 | # lyang comments: self 
attention 113 | for index in range(self._conf['stack_num']): 114 | 115 | with tf.variable_scope('self_stack_' + str(index), reuse=True): 116 | Hu = layers.block( # attentive module 117 | Hu, Hu, Hu, 118 | Q_lengths=t_turn_length, K_lengths=t_turn_length, 119 | attention_type=self._conf['attention_type']) 120 | 121 | Hu_stack.append(Hu) 122 | 123 | # lyang comments: cross attention 124 | r_a_t_stack = [] 125 | t_a_r_stack = [] 126 | # cross attention 127 | for index in range(self._conf['stack_num']+1): 128 | 129 | with tf.variable_scope('t_attend_r_' + str(index)): 130 | try: 131 | t_a_r = layers.block( # attentive module 132 | Hu_stack[index], Hr_stack[index], Hr_stack[index], 133 | Q_lengths=t_turn_length, K_lengths=self.response_len, 134 | attention_type=self._conf['attention_type']) 135 | except ValueError: 136 | tf.get_variable_scope().reuse_variables() 137 | t_a_r = layers.block( 138 | Hu_stack[index], Hr_stack[index], Hr_stack[index], 139 | Q_lengths=t_turn_length, K_lengths=self.response_len, 140 | attention_type=self._conf['attention_type']) 141 | 142 | 143 | with tf.variable_scope('r_attend_t_' + str(index)): 144 | try: 145 | r_a_t = layers.block( # attentive module 146 | Hr_stack[index], Hu_stack[index], Hu_stack[index], 147 | Q_lengths=self.response_len, K_lengths=t_turn_length, 148 | attention_type=self._conf['attention_type']) 149 | except ValueError: 150 | tf.get_variable_scope().reuse_variables() 151 | r_a_t = layers.block( 152 | Hr_stack[index], Hu_stack[index], Hu_stack[index], 153 | Q_lengths=self.response_len, K_lengths=t_turn_length, 154 | attention_type=self._conf['attention_type']) 155 | 156 | t_a_r_stack.append(t_a_r) 157 | r_a_t_stack.append(r_a_t) 158 | 159 | #lyang comments: 3D aggregation 160 | t_a_r_stack.extend(Hu_stack) 161 | r_a_t_stack.extend(Hr_stack) 162 | 163 | t_a_r = tf.stack(t_a_r_stack, axis=-1) 164 | r_a_t = tf.stack(r_a_t_stack, axis=-1) 165 | 166 | 167 | #calculate similarity matrix 168 | with tf.variable_scope('similarity'): 169 | # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] 170 | # divide sqrt(200) to prevent gradient explosion 171 | sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0) 172 | 173 | sim_turns.append(sim) 174 | 175 | 176 | #cnn and aggregation 177 | #lyang comments aggregation by 3D CNN layer 178 | sim = tf.stack(sim_turns, axis=1) 179 | print('sim shape: %s' %sim.shape) 180 | with tf.variable_scope('cnn_aggregation'): 181 | final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'], 182 | self._conf['cnn_3d_oc1']) 183 | #for udc 184 | #final_info = layers.CNN_3d(sim, 32, 16) 185 | #for douban 186 | #final_info = layers.CNN_3d(sim, 16, 16) 187 | 188 | #loss and train 189 | with tf.variable_scope('loss'): 190 | self.loss, self.logits = layers.loss(final_info, self.label) 191 | 192 | self.global_step = tf.Variable(0, trainable=False) 193 | initial_learning_rate = self._conf['learning_rate'] 194 | self.learning_rate = tf.train.exponential_decay( 195 | initial_learning_rate, 196 | global_step=self.global_step, 197 | decay_steps=400, 198 | decay_rate=0.9, 199 | staircase=True) 200 | 201 | Optimizer = tf.train.AdamOptimizer(self.learning_rate) 202 | self.optimizer = Optimizer.minimize( 203 | self.loss, 204 | global_step=self.global_step) 205 | 206 | self.init = tf.global_variables_initializer() 207 | self.saver = tf.train.Saver(max_to_keep = self._conf["max_to_keep"]) 208 | self.all_variables = tf.global_variables() 209 | self.all_operations = self._graph.get_operations() 210 | self.grads_and_vars = 
Optimizer.compute_gradients(self.loss) 211 | 212 | for grad, var in self.grads_and_vars: 213 | if grad is None: 214 | print var 215 | 216 | self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var) for grad, var in self.grads_and_vars] 217 | self.g_updates = Optimizer.apply_gradients( 218 | self.capped_gvs, 219 | global_step=self.global_step) 220 | 221 | return self._graph 222 | 223 | -------------------------------------------------------------------------------- /IART/main_conversation_qa.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import models.net as net 5 | import models.iadam_attention as iadam_attention 6 | 7 | import bin.train_and_evaluate as train 8 | import bin.test_and_evaluate as test 9 | 10 | def main(argv): 11 | # conf_udc and conf_ms are the default settings for udc and ms_v2 12 | conf_udc = { 13 | "data_name": "udc", 14 | "data_path": "../data/udc/data.pkl", # data_small.pkl or data.pkl or data_nofd.pkl 15 | "intent_vec_path": "../data/udc/intent_vectors.txt", # path of intent vectors 16 | "intent_size": 12, # dimensions of different intent 17 | "intent_attention_type": "bilinear", # 'dot', 'bilinear', 'outprod'. default is bilinear 18 | "intent_ffn_od0": 64, # in iadam-concat ffn 144->64->16 match 576 19 | "intent_ffn_od1": 16, # in iadam-concat ffn 144->64->16 match 576 20 | "intent_loss_weight": 0.2, 21 | # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 22 | "model_name": "iadam-concat", # dam, iadam-concat, iadam-attention, iadam-mtl 23 | "save_path": "../output/udc/temp/", 24 | "word_emb_init": "../data/udc/cut_embed_mikolov_200d.pkl", # word_embedding.pkl 25 | "init_model": None, # should be set for test 26 | "rand_seed": None, 27 | "drop_dense": None, 28 | "drop_attention": None, 29 | 30 | "is_mask": True, 31 | "is_layer_norm": True, 32 | "is_positional": False, 33 | 34 | "stack_num": 5, 35 | "attention_type": "dot", 36 | 37 | "learning_rate": 1e-3, 38 | "vocab_size": 429498, 39 | "emb_size": 200, 40 | "batch_size": 128, # for udc/iadam_mtl model, batch_size = 64; others = 128 41 | 42 | "max_turn_num": 9, 43 | "max_turn_len": 50, 44 | 45 | "max_to_keep": 1, 46 | "num_scan_data": 2, # about 16 hours for 2 epoches on udc 47 | "_EOS_": 429498, # 28270, #1 for douban data 48 | "final_n_class": 1, 49 | 50 | "cnn_3d_oc0": 32, 51 | "cnn_3d_oc1": 16 52 | } 53 | 54 | conf_ms = { 55 | "data_name": "ms_v2", 56 | "data_path": "../data/ms_v2/data.pkl", # data_small.pkl or data.pkl or data_nofd.pkl 57 | "intent_vec_path": "../data/ms_v2/intent_vectors.txt", # path of intent vectors 58 | "intent_size": 12, # dimensions of different intent 59 | "intent_attention_type": "bilinear", # 'dot', 'bilinear', 'outprod'. 
default is bilinear 60 | "intent_ffn_od0": 128, # in iadam-concat ffn 144->128->64 match 6400 61 | "intent_ffn_od1": 64, # in iadam-concat ffn 144->128->64 match 6400 62 | "intent_loss_weight": 0.2, 63 | # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 64 | "model_name": "iadam-concat", # dam, iadam-concat, iadam-attention, iadam-mtl 65 | "save_path": "../output/ms_v2/temp/", 66 | "word_emb_init": "../data/ms_v2/cut_embed_mikolov_200d.pkl", # "../data/ms_v2/cut_embed_mikolov_200d.pkl", # None (set None during debugging) 67 | "init_model": None, # "../output/ms_v2/dam_default_setting_0412_run29/model.ckpt.36", #should be set for test 68 | 69 | "rand_seed": None, 70 | 71 | "drop_dense": None, 72 | "drop_attention": None, 73 | 74 | "is_mask": True, 75 | "is_layer_norm": True, 76 | "is_positional": False, 77 | 78 | "stack_num": 4, 79 | "attention_type": "dot", 80 | 81 | "learning_rate": 1e-3, 82 | "vocab_size": 167983, 83 | "emb_size": 200, 84 | "batch_size": 32, # 200 for test 256 85 | 86 | "max_turn_num": 6, # 6 is better for ms_v2 87 | "max_turn_len": 180, 88 | 89 | "max_to_keep": 1, 90 | "num_scan_data": 5, # about 18 hours for 5 epoches on ms_v2 91 | "_EOS_": 167983, # 1 for douban data 92 | "final_n_class": 1, 93 | 94 | "cnn_3d_oc0": 16, 95 | "cnn_3d_oc1": 16 96 | } 97 | 98 | parser = argparse.ArgumentParser() 99 | # python main_conversation_qa.py --help to print the help messages 100 | # sys.argv includes a list of elements starting with the program 101 | # required parameters 102 | parser.add_argument('--phase', default='train', 103 | help='phase: it can be train or predict, the default \ 104 | value is train.', 105 | required=True) 106 | parser.add_argument('--data_name', default='udc', 107 | help='data_name: name of the data. 
it can be udc or \ 108 | ms_v2', required=True) 109 | parser.add_argument('--model_name', default='dam', 110 | help='model_name: name of the model', required=True) 111 | parser.add_argument('--save_path', default='../output/udc/temp/', 112 | help='save_path: output path for model files, score \ 113 | files and result files', required=True) 114 | parser.add_argument('--or_cmd', default=False, 115 | help='or_cmd: whether want to override config \ 116 | parameters by command line parameters', 117 | required=True) 118 | 119 | # optional parameters 120 | parser.add_argument('--intent_vec_path', 121 | help='intent_vec_path: path of intent vectors.') 122 | parser.add_argument('--intent_attention_type', 123 | help='intent_attention_type: type of intent attention.') 124 | parser.add_argument('--intent_ffn_od0', 125 | help='intent_ffn_od0: output dimension 0 in FFN for \ 126 | intent transformation in IADAM-Concat') 127 | parser.add_argument('--intent_ffn_od1', 128 | help='intent_ffn_od1: output dimension 1 in FFN for \ 129 | intent transformation in IADAM-Concat') 130 | parser.add_argument('--intent_loss_weight', 131 | help='intent_loss_weight: weight of intent loss \ 132 | in IADAM-MTL model') 133 | parser.add_argument('--data_path', 134 | help='data_path: path of input data.') 135 | parser.add_argument('--word_emb_init', 136 | help='data_name: path of word embedding file to \ 137 | initialize the word embeddings.') 138 | parser.add_argument('--init_model', 139 | help='init_model: path of the checkpoints of \ 140 | model initialization during testing phase.') 141 | parser.add_argument('--rand_seed', 142 | help='rand_seed: rand seed used in numpy.') 143 | parser.add_argument('--is_positional', 144 | help='is_positional: whether add positional embeddings.') 145 | parser.add_argument('--stack_num', 146 | help='stack_num: stack number in Transformers.') 147 | parser.add_argument('--attention_type', 148 | help='attention_type: attention_type in attentive module \ 149 | in Transformers (dot or bilinear).') # Added in net.py 150 | parser.add_argument('--learning_rate', 151 | help='learning_rate: initial learning rate in \ 152 | exponential decay learning rate.') 153 | parser.add_argument('--vocab_size', 154 | help='vocab_size: vocabulary size.') 155 | parser.add_argument('--emb_size', 156 | help='emb_size: embedding size.') 157 | parser.add_argument('--batch_size', 158 | help='batch_size: batch size.') 159 | parser.add_argument('--max_turn_num', 160 | help='max_turn_num: max number of turns in conversation \ 161 | context.') 162 | parser.add_argument('--max_turn_len', 163 | help='max_turn_len: max length of conversation turns.') 164 | parser.add_argument('--max_to_keep', 165 | help='max_to_keep: max number of checkpoints file to \ 166 | keep.') 167 | parser.add_argument('--num_scan_data', 168 | help='num_scan_data: number of times to scan the data \ 169 | which is also number of epoches.') 170 | parser.add_argument('--eos', 171 | help='eos: word id for _EOS_, which is the seperator \ 172 | between different turns in context') 173 | parser.add_argument('--cnn_3d_oc0', 174 | help='cnn_3d_oc0: out_channels_0 of 3D CNN layer.') 175 | parser.add_argument('--cnn_3d_oc1', 176 | help='cnn_3d_oc1: out_channels_1 of 3D CNN layer.') 177 | 178 | args = parser.parse_args() 179 | # parse the hyper-parameters from the command lines 180 | phase = args.phase 181 | or_cmd = bool(args.or_cmd) 182 | conf = conf_udc if args.data_name == 'udc' else conf_ms 183 | conf['save_path'] = args.save_path 184 | conf['model_name'] = 
args.model_name 185 | 186 | # load settings from the config file 187 | # then update the hyper-parameters in the config files with the settings 188 | # passed from command lines 189 | if or_cmd: 190 | if args.intent_vec_path != None: 191 | conf['intent_vec_path'] = args.intent_vec_path 192 | if args.intent_ffn_od0 != None: 193 | conf['intent_ffn_od0'] = int(args.intent_ffn_od0) 194 | if args.intent_ffn_od1 != None: 195 | conf['intent_ffn_od1'] = int(args.intent_ffn_od1) 196 | if args.intent_attention_type != None: 197 | conf['intent_attention_type'] = args.intent_attention_type 198 | if args.intent_loss_weight != None: 199 | conf['intent_loss_weight'] = float(args.intent_loss_weight) 200 | if args.data_path != None: 201 | conf['data_path'] = args.data_path 202 | if args.word_emb_init != None: 203 | conf['word_emb_init'] = args.word_emb_init 204 | if args.init_model != None: 205 | conf['init_model'] = args.init_model 206 | if args.rand_seed != None: 207 | conf['rand_seed'] = float(args.rand_seed) 208 | if args.is_positional != None: 209 | conf['is_positional'] = args.is_positional 210 | if args.stack_num != None: 211 | conf['stack_num'] = int(args.stack_num) 212 | if args.attention_type != None: 213 | conf['attention_type'] = args.attention_type 214 | if args.learning_rate != None: 215 | conf['learning_rate'] = float(args.learning_rate) 216 | if args.vocab_size != None: 217 | conf['vocab_size'] = int(args.vocab_size) 218 | if args.emb_size != None: 219 | conf['emb_size'] = int(args.emb_size) 220 | if args.batch_size != None: 221 | conf['batch_size'] = int(args.batch_size) 222 | if args.max_turn_num != None: 223 | conf['max_turn_num'] = int(args.max_turn_num) 224 | if args.max_turn_len != None: 225 | conf['max_turn_len'] = int(args.max_turn_len) 226 | if args.max_to_keep != None: 227 | conf['max_to_keep'] = int(args.max_to_keep) 228 | if args.num_scan_data != None: 229 | conf['num_scan_data'] = int(args.num_scan_data) 230 | if args.eos != None: 231 | conf['_EOS_'] = int(args.eos) 232 | if args.cnn_3d_oc0 != None: 233 | conf['cnn_3d_oc0'] = int(args.cnn_3d_oc0) 234 | if args.cnn_3d_oc1 != None: 235 | conf['cnn_3d_oc1'] = int(args.cnn_3d_oc1) 236 | 237 | if conf['model_name'] == 'dam': 238 | model = net.Net(conf) # DAM 239 | elif conf['model_name'] == 'iadam-attention': 240 | model = iadam_attention.Net(conf) # IADAM-Attention-V4-2 (IART) 241 | else: 242 | raise NameError('model not supported.') 243 | 244 | if phase == 'train': 245 | train.train(conf, model) 246 | elif phase == 'predict': 247 | # test and evaluation, init_model in conf should be set 248 | test.test(conf, model) 249 | else: 250 | print 'Phase Error.' 
251 | return 252 | 253 | 254 | if __name__ == '__main__': 255 | main(sys.argv) 256 | 257 | -------------------------------------------------------------------------------- /IART/utils/reader.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | def unison_shuffle(data, seed=None): 6 | if seed is not None: 7 | np.random.seed(seed) 8 | 9 | y = np.array(data['y']) 10 | c = np.array(data['c']) 11 | r = np.array(data['r']) 12 | qids = [] 13 | dids = [] 14 | 15 | assert len(y) == len(c) == len(r) == len(data['qids']) == len(data['dids']) 16 | p = np.random.permutation(len(y)) 17 | # shuffle qids and dids 18 | for i in range(len(y)): 19 | qids.append(data['qids'][p[i]]) 20 | dids.append(data['dids'][p[i]]) 21 | 22 | shuffle_data = {'y': y[p], 'c': c[p], 'r': r[p], 'qids': qids, 'dids': dids} 23 | # print('test after shuffle: ') 24 | # print('y: ', y[p][0:1]) 25 | # print('c: ', c[p][0:1]) 26 | # print('r: ', r[p][0:1]) 27 | # print('qids: ', qids[0:1]) 28 | # print('dids: ', dids[0:1]) 29 | 30 | return shuffle_data 31 | 32 | def split_c(c, split_id): 33 | '''c is a list, example context 34 | split_id is a integer, conf[_EOS_] 35 | return nested list 36 | ''' 37 | turns = [[]] 38 | for _id in c: 39 | if _id != split_id: 40 | turns[-1].append(_id) 41 | else: 42 | turns.append([]) 43 | if turns[-1] == [] and len(turns) > 1: 44 | turns.pop() 45 | return turns 46 | 47 | def normalize_length(_list, length, cut_type='tail'): 48 | '''_list is a list or nested list, example turns/r/single turn c 49 | cut_type is head or tail, if _list len > length is used 50 | return a list len=length and min(read_length, length) 51 | ''' 52 | real_length = len(_list) 53 | # if real_length == 0, pad 0 54 | if real_length == 0: 55 | return [0]*length, 0 56 | 57 | # if real_length <= length, pad 0s 58 | if real_length <= length: 59 | # 1D list 60 | if not isinstance(_list[0], list): 61 | _list.extend([0]*(length - real_length)) 62 | else: # 2D list 63 | _list.extend([[]]*(length - real_length)) 64 | return _list, real_length 65 | 66 | # if real_length > length, cut extra tokens 67 | if cut_type == 'head': 68 | return _list[:length], length 69 | if cut_type == 'tail': 70 | return _list[-length:], length 71 | 72 | def produce_intent(cid, rid, turns_len_all, intent_dict): 73 | r_intent = intent_dict[rid] 74 | c_intent = [] 75 | for i in range(turns_len_all): # loop all turns in context 76 | c_intent.append(intent_dict[cid + '-' + str(i)]) 77 | return c_intent, r_intent 78 | 79 | def produce_one_sample(data, conf, index, split_id, max_turn_num, max_turn_len, turn_cut_type='tail', term_cut_type='tail'): 80 | '''max_turn_num=10 81 | max_turn_len=50 82 | return y, nor_turns_nor_c, nor_r, turn_len, term_len, r_len 83 | ''' 84 | # print('keys of data: ', data.keys()) 85 | c = data['c'][index] 86 | r = data['r'][index][:] 87 | y = data['y'][index] 88 | cid = data['qids'][index] 89 | rid = data['dids'][index] 90 | c_intent = [] 91 | r_intent = [] 92 | 93 | turns = split_c(c, split_id) 94 | turns_len_all = len(turns) # all turns in context before normalization 95 | 96 | if conf['model_name'] != 'dam': 97 | c_intent, r_intent = produce_intent(cid, rid, turns_len_all, conf['intent_dict']) 98 | 99 | #print('test c_intent: ', c_intent) 100 | #normalize turns_c length, nor_turns length is max_turn_num 101 | # cut extra conversation turns 102 | nor_turns, turn_len = normalize_length(turns, max_turn_num, turn_cut_type) 103 | if 
conf['model_name'] != 'dam': 104 | nor_turns_intent, turn_len_intent = normalize_length(c_intent, max_turn_num, turn_cut_type) 105 | # print('test nor_turns_intent, turn_len_intent: ', nor_turns_intent, 106 | # turn_len_intent) 107 | 108 | nor_turns_nor_c = [] 109 | term_len = [] 110 | #nor_turn_nor_c length is max_turn_num, element of a list length is max_turn_len 111 | # cut extra length for context turn text 112 | for c in nor_turns: 113 | #nor_c length is max_turn_len 114 | nor_c, nor_c_len = normalize_length(c, max_turn_len, term_cut_type) 115 | nor_turns_nor_c.append(nor_c) 116 | term_len.append(nor_c_len) 117 | 118 | nor_turns_intent_nor_it = [] 119 | # pad 0s in nor_turns_intent if there are less than max_turn_num turns 120 | if conf['model_name'] != 'dam': 121 | for it in nor_turns_intent: 122 | # nor_it length is intent_size 123 | nor_it, nor_it_len = normalize_length(it, conf['intent_size'], term_cut_type) 124 | nor_turns_intent_nor_it.append(nor_it) 125 | 126 | # cut extra length for response text 127 | nor_r, r_len = normalize_length(r, max_turn_len, term_cut_type) 128 | 129 | return y, nor_turns_nor_c, nor_r, turn_len, term_len, r_len, nor_turns_intent_nor_it, r_intent 130 | 131 | def build_one_batch(data, batch_index, conf, turn_cut_type='tail', term_cut_type='tail'): 132 | _turns = [] 133 | _tt_turns_len = [] 134 | _every_turn_len = [] 135 | _turns_intent = [] 136 | 137 | _response = [] 138 | _response_len = [] 139 | _response_intent = [] 140 | 141 | _label = [] 142 | 143 | for i in range(conf['batch_size']): 144 | # i is to loop instances in the current batch 145 | # index is a global position for this instance 146 | index = batch_index * conf['batch_size'] + i 147 | y, nor_turns_nor_c, nor_r, turn_len, term_len, r_len, c_intent, r_intent = produce_one_sample(data, conf, index, conf['_EOS_'], conf['max_turn_num'], 148 | conf['max_turn_len'], turn_cut_type, term_cut_type) 149 | 150 | _label.append(y) 151 | _turns.append(nor_turns_nor_c) 152 | _response.append(nor_r) 153 | _every_turn_len.append(term_len) 154 | _tt_turns_len.append(turn_len) 155 | _response_len.append(r_len) 156 | if conf['model_name'] != 'dam': 157 | _turns_intent.append(c_intent) 158 | _response_intent.append(r_intent) 159 | 160 | return _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label, _turns_intent, _response_intent 161 | 162 | def build_one_batch_dict(data, batch_index, conf, turn_cut_type='tail', term_cut_type='tail'): 163 | _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label, _turns_intent, _response_intent = build_one_batch(data, batch_index, conf, turn_cut_type, term_cut_type) 164 | ans = {'turns': _turns, 165 | 'tt_turns_len': _tt_turns_len, 166 | 'every_turn_len': _every_turn_len, 167 | 'response': _response, 168 | 'response_len': _response_len, 169 | 'label': _label, "turns_intent": _turns_intent, 170 | "response_intent": _response_intent} 171 | return ans 172 | 173 | def build_batches(data, conf, turn_cut_type='tail', term_cut_type='tail'): 174 | '''Build batches for DAM and IADAM 175 | for DAM, conf['intent_dict'] == None 176 | for IADAM, conf['intent_dict'] != None 177 | In addition to (c,r,y) for each instance, we also look up the corresponding 178 | predicted intent vectors for (c,r) from the intent_dict in O(1) 179 | ''' 180 | _turns_batches = [] 181 | _tt_turns_len_batches = [] 182 | _every_turn_len_batches = [] 183 | _turns_intent_batches = [] 184 | 185 | _response_batches = [] 186 | _response_len_batches = [] 187 | _response_intent_batches = 
[] 188 | 189 | _label_batches = [] 190 | 191 | batch_len = len(data['y'])/conf['batch_size'] 192 | 193 | for batch_index in range(batch_len): 194 | _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label, _turns_intent, _response_intent = build_one_batch(data, batch_index, conf, turn_cut_type='tail', term_cut_type='tail') 195 | 196 | _turns_batches.append(_turns) 197 | _tt_turns_len_batches.append(_tt_turns_len) 198 | _every_turn_len_batches.append(_every_turn_len) 199 | 200 | _response_batches.append(_response) 201 | _response_len_batches.append(_response_len) 202 | 203 | if conf['model_name'] != 'dam': 204 | _turns_intent_batches.append(_turns_intent) 205 | _response_intent_batches.append(_response_intent) 206 | 207 | _label_batches.append(_label) 208 | 209 | ans = { 210 | "turns": _turns_batches, "tt_turns_len": _tt_turns_len_batches, "every_turn_len":_every_turn_len_batches, 211 | "response": _response_batches, "response_len": _response_len_batches, "label": _label_batches, 212 | "turns_intent" : _turns_intent_batches, "response_intent": _response_intent_batches 213 | } 214 | 215 | return ans 216 | 217 | # def build_batches_iadam(data, conf, intent_dict, turn_cut_type='tail', 218 | # term_cut_type='tail'): 219 | # '''Build batches for intent-aware DAM model 220 | # In addition to (c,r,y) for each instance, we also look up the corresponding 221 | # predicted intent vectors for (c,r) from the intent_dict in O(1) 222 | # ''' 223 | # _turns_batches = [] 224 | # _tt_turns_len_batches = [] 225 | # _every_turn_len_batches = [] 226 | # 227 | # _response_batches = [] 228 | # _response_len_batches = [] 229 | # 230 | # _label_batches = [] 231 | # 232 | # batch_len = len(data['y']) / conf['batch_size'] 233 | # print('number of batches in one epoch', batch_len) 234 | # 235 | # for batch_index in range(batch_len): 236 | # # batch_index is to index the batch in the current epoch 237 | # # if batch_size = 50, and there are 500 instances in data 238 | # # than batch_len = 10, batch_index = 0,1,...9 239 | # _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label = build_one_batch( 240 | # data, batch_index, conf, turn_cut_type='tail', 241 | # term_cut_type='tail') 242 | # 243 | # _turns_batches.append(_turns) 244 | # _tt_turns_len_batches.append(_tt_turns_len) 245 | # _every_turn_len_batches.append(_every_turn_len) 246 | # 247 | # _response_batches.append(_response) 248 | # _response_len_batches.append(_response_len) 249 | # 250 | # _label_batches.append(_label) 251 | # 252 | # ans = { 253 | # "turns": _turns_batches, "tt_turns_len": _tt_turns_len_batches, 254 | # "every_turn_len": _every_turn_len_batches, 255 | # "response": _response_batches, "response_len": _response_len_batches, 256 | # "label": _label_batches 257 | # } 258 | # 259 | # return ans 260 | 261 | def read_dict(word2id_file): 262 | id2word_dict = dict() 263 | with open(word2id_file) as fin: 264 | lines = fin.readlines() 265 | for index in range(0,len(lines)-1, 2): 266 | id2word_dict[lines[index+1].strip()] = lines[index].strip() 267 | print('vocab size: ', len(id2word_dict)) 268 | return id2word_dict 269 | 270 | 271 | # read intent_vectors.txt for intent vectors in DMN_INTENT model 272 | def read_intent(filename): 273 | intent_dict = {} 274 | print('read intent vectors...') 275 | with open(filename) as fin: 276 | for l in tqdm(fin): 277 | to = l.strip().split('\t') 278 | intent_dict[to[0]] = [float(x) for x in 279 | to[1].split()] # str to float 280 | return intent_dict 281 | 282 | if __name__ == 
'__main__': 283 | # conf = { 284 | # "batch_size": 256, 285 | # "max_turn_num": 10, 286 | # "max_turn_len": 50, 287 | # "_EOS_": 28270, 288 | # } 289 | # train, val, test = pickle.load(open('../../data/data_small.pkl', 'rb')) 290 | # print('load data success') 291 | # 292 | # train_batches = build_batches(train, conf) 293 | # val_batches = build_batches(val, conf) 294 | # test_batches = build_batches(test, conf) 295 | # print('build batches success') 296 | # 297 | # pickle.dump([train_batches, val_batches, test_batches], open('../../data/batches_small.pkl', 'wb')) 298 | # print('dump success') 299 | word2id_file = '../../data/ubuntu/word2id' 300 | id2word_dict = read_dict(word2id_file) 301 | print(dict(id2word_dict.items()[0:5])) 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /IART/utils/preparation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import print_function 5 | import sys 6 | import os 7 | import numpy as np 8 | import hashlib 9 | import random 10 | 11 | import preprocess 12 | 13 | 14 | class Preparation(object): 15 | '''Convert dataset of different text matching tasks into a unified format as the input of deep matching modules. Users provide datasets contain pairs of texts along with their labels, and the module produces the following files: 16 | * Word Dictionary: this file records the mapping from each word to a unique identifier. 17 | * Corpus File: this file records the mapping from each text to a unique identifiers, along with a sequence of word identifiers contained in text. 18 | * Relation File: this file records the relationship between two texts, each line containing the label and a pair of ids. 19 | ''' 20 | 21 | def __init__(self): 22 | pass 23 | 24 | def get_text_id(self, hashid, text, idtag='T'): 25 | hash_obj = hashlib.sha1(text.encode('utf8')) # if the text are the same, then the hash_code are also the same 26 | hex_dig = hash_obj.hexdigest() 27 | if hex_dig in hashid: 28 | return hashid[hex_dig] 29 | else: 30 | tid = idtag + str(len(hashid)) # start from 0, 1, 2, ... 
31 | hashid[hex_dig] = tid 32 | return tid 33 | 34 | def parse_line(self, line, delimiter='\t'): 35 | subs = line.split(delimiter) 36 | # print('subs: ', len(subs)) 37 | if 3 != len(subs): 38 | raise ValueError('format of data file wrong, should be \'label,text1,text2\'.') 39 | else: 40 | return subs[0], subs[1], subs[2] 41 | 42 | def parse_line_dmn(self, line, delimiter='\t'): 43 | subs = line.split(delimiter) 44 | # print('subs: ', len(subs)) 45 | if len(subs) < 3: 46 | raise ValueError('format of data file wrong, should be \'label,text1(mulitple utterances septerated by tab),text2\'.') 47 | else: 48 | return subs 49 | 50 | def run_with_one_corpus(self, file_path): 51 | hashid = {} 52 | corpus = {} 53 | rels = [] 54 | f = open(file_path, 'r') 55 | for line in f: 56 | line = line.decode('utf8') 57 | line = line.strip() 58 | label, t1, t2 = self.parse_line(line) 59 | id1 = self.get_text_id(hashid, t1, 'T') 60 | id2 = self.get_text_id(hashid, t2, 'T') 61 | corpus[id1] = t1 62 | corpus[id2] = t2 63 | rels.append((label, id1, id2)) 64 | f.close() 65 | return corpus, rels 66 | 67 | def run_with_two_corpus(self, file_path): 68 | hashid_q = {} 69 | hashid_d = {} 70 | corpus_q = {} 71 | corpus_d = {} 72 | rels = [] 73 | f = open(file_path, 'r') 74 | for line in f: 75 | line = line.decode('utf8') 76 | line = line.strip() 77 | label, t1, t2 = self.parse_line(line) 78 | id1 = self.get_text_id(hashid_q, t1, 'Q') 79 | id2 = self.get_text_id(hashid_d, t2, 'D') 80 | corpus_q[id1] = t1 81 | corpus_d[id2] = t2 82 | rels.append((label, id1, id2)) 83 | f.close() 84 | return corpus_q, corpus_d, rels 85 | 86 | def run_with_train_valid_test_corpus(self, train_file, valid_file, test_file): 87 | ''' 88 | Run with pre-splited train_file, valid_file, test_file 89 | The input format should be label \t text1 \t text2 90 | The query ids can't be duplicated. For the same query 91 | id, the document ids can't be duplicated. 
92 | Note that if we make queries with unique id (fixed 10 candidates for a single query), then it is 93 | possible that multiple queries have different query ids, but with the same text (in rare cases) 94 | :param train_file: train file 95 | :param valid_file: valid file 96 | :param test_file: test file 97 | :return: corpus, rels_train, rels_valid, rels_test 98 | ''' 99 | hashid = {} 100 | corpus = {} 101 | rels = [] 102 | rels_train = [] 103 | rels_valid = [] 104 | rels_test = [] 105 | # merge corpus files, but return rels for train/valid/test seperately 106 | curQ = 'init' 107 | curQid = 0 108 | for file_path in list([train_file, valid_file, test_file]): 109 | if file_path == train_file: 110 | rels = rels_train 111 | elif file_path == valid_file: 112 | rels = rels_valid 113 | if file_path == test_file: 114 | rels = rels_test 115 | f = open(file_path, 'r') 116 | for line in f: 117 | line = line.decode('utf8') 118 | line = line.strip() 119 | label, t1, t2 = self.parse_line(line) 120 | id2 = self.get_text_id(hashid, t2, 'D') 121 | # generate unique query ids 122 | if t1 == curQ: 123 | # same query 124 | id1 = 'Q' + str(curQid) 125 | else: 126 | # new query 127 | curQid += 1 128 | id1 = 'Q' + str(curQid) 129 | curQ = t1 130 | corpus[id1] = t1 131 | corpus[id2] = t2 132 | rels.append((label, id1, id2)) 133 | f.close() 134 | return corpus, rels_train, rels_valid, rels_test 135 | 136 | def run_with_train_valid_test_corpus_dmn(self, train_file, valid_file, test_file): 137 | ''' 138 | Run with pre-splited train_file, valid_file, test_file for dmn model for conversation response ranking 139 | The input format should be label \t text1 (conversation context utterances seperated by \t) \t text2 140 | The query ids can't be duplicated. For the same query 141 | id, the document ids can't be duplicated. 
142 | Note that if we make queries with unique id (fixed 10 candidates for a single query), then it is 143 | possible that multiple queries have different query ids, but with the same text (in rare cases) 144 | :param train_file: train file 145 | :param valid_file: valid file 146 | :param test_file: test file 147 | :return: corpus, rels_train, rels_valid, rels_test 148 | ''' 149 | hashid = {} 150 | corpus = {} 151 | rels = [] 152 | rels_train = [] 153 | rels_valid = [] 154 | rels_test = [] 155 | # merge corpus files, but return rels for train/valid/test seperately 156 | curQ = 'init' 157 | curQid = 0 158 | for file_path in list([train_file, valid_file, test_file]): 159 | if file_path == train_file: 160 | rels = rels_train 161 | elif file_path == valid_file: 162 | rels = rels_valid 163 | if file_path == test_file: 164 | rels = rels_test 165 | f = open(file_path, 'r') 166 | for line in f: 167 | line = line.decode('utf8') 168 | line = line.strip() 169 | subs = self.parse_line_dmn(line) 170 | label = subs[0] 171 | t1 = '\t'.join(subs[1:-1]) 172 | t2 = subs[-1] 173 | id2 = self.get_text_id(hashid, t2, 'D') 174 | # generate unique query ids 175 | if t1 == curQ: 176 | # same query 177 | id1 = 'Q' + str(curQid) 178 | else: 179 | # new query 180 | curQid += 1 181 | id1 = 'Q' + str(curQid) 182 | curQ = t1 183 | corpus[id1] = t1 184 | corpus[id2] = t2 185 | rels.append((label, id1, id2)) 186 | f.close() 187 | return corpus, rels_train, rels_valid, rels_test 188 | 189 | @staticmethod 190 | def save_corpus(file_path, corpus): 191 | f = open(file_path, 'w') 192 | for qid, text in corpus.items(): 193 | f.write('%s %s\n' % (qid, text.encode('utf8'))) 194 | f.close() 195 | 196 | @staticmethod 197 | def save_corpus_dmn(file_path, corpus, delim='\t'): 198 | f = open(file_path, 'w') 199 | for qid, text in corpus.items(): 200 | f.write('%s%s%s\n' % (qid, delim, text.encode('utf8'))) 201 | f.close() 202 | 203 | @staticmethod 204 | def merge_corpus(train_corpus, valid_corpus, test_corpus): 205 | # cat train valid test > corpus.txt 206 | # cat corpus_train.txt corpus_valid.txt corpus_test.txt > corpus.txt 207 | os.system('cat ' + train_corpus + ' ' + valid_corpus + ' ' + test_corpus + ' > corpus.txt') 208 | 209 | @staticmethod 210 | def save_relation(file_path, relations): 211 | f = open(file_path, 'w') 212 | for rel in relations: 213 | f.write('%s %s %s\n' % (rel)) 214 | f.close() 215 | 216 | @staticmethod 217 | def check_filter_query_with_dup_doc(input_file): 218 | ''' Filter queries with duplicated doc ids in the relation files 219 | :param input_file: input file, which could be the relation file for train/valid/test data 220 | The format is "label qid did" 221 | :return: 222 | ''' 223 | with open(input_file) as f_in, open(input_file + '.fd', 'w') as f_out: 224 | cur_qid = 'init' 225 | cache_did_set = set() 226 | cache_q_lines = [] 227 | found_dup_doc = False 228 | for l in f_in: 229 | tokens = l.split() 230 | if tokens[1] == cur_qid: 231 | # same qid 232 | cache_q_lines.append(l) 233 | if tokens[2] in cache_did_set: 234 | found_dup_doc = True 235 | else: 236 | cache_did_set.add(tokens[2]) 237 | else: 238 | # new qid 239 | if not found_dup_doc: 240 | f_out.write(''.join(cache_q_lines)) 241 | else: 242 | print 243 | 'found qid with duplicated doc id/text: ', ''.join(cache_q_lines) 244 | print 245 | 'filtered... 
continue' 246 | cache_q_lines = [] 247 | cache_q_lines.append(l) 248 | found_dup_doc = False 249 | cache_did_set.clear() 250 | cur_qid = tokens[1] 251 | cache_did_set.add(tokens[2]) 252 | # the last query 253 | # print len(cache_q_lines), len(cache_did_set) 254 | if (len(cache_q_lines) != 0 and len(cache_q_lines) == len(cache_did_set)): 255 | f_out.write(''.join(cache_q_lines)) 256 | print 257 | 'write the last query... done: ', ''.join(cache_q_lines) 258 | 259 | @staticmethod 260 | def split_train_valid_test(relations, ratio=[0.8, 0.1, 0.1]): 261 | random.shuffle(relations) 262 | total_rel = len(relations) 263 | num_train = int(total_rel * ratio[0]) 264 | num_valid = int(total_rel * ratio[1]) 265 | valid_end = num_train + num_valid 266 | rel_train = relations[: num_train] 267 | rel_valid = relations[num_train: valid_end] 268 | rel_test = relations[valid_end:] 269 | return rel_train, rel_valid, rel_test 270 | 271 | @staticmethod 272 | def split_train_valid_test_for_ranking(relations, ratio=[0.8, 0.1, 0.1]): 273 | qid_group = set() 274 | for r, q, d in relations: 275 | qid_group.add(q) 276 | qid_group = list(qid_group) 277 | 278 | random.shuffle(qid_group) 279 | total_rel = len(qid_group) 280 | num_train = int(total_rel * ratio[0]) 281 | num_valid = int(total_rel * ratio[1]) 282 | valid_end = num_train + num_valid 283 | 284 | qid_train = qid_group[: num_train] 285 | qid_valid = qid_group[num_train: valid_end] 286 | qid_test = qid_group[valid_end:] 287 | 288 | def select_rel_by_qids(qids): 289 | rels = [] 290 | qids = set(qids) 291 | for r, q, d in relations: 292 | if q in qids: 293 | rels.append((r, q, d)) 294 | return rels 295 | 296 | rel_train = select_rel_by_qids(qid_train) 297 | rel_valid = select_rel_by_qids(qid_valid) 298 | rel_test = select_rel_by_qids(qid_test) 299 | 300 | return rel_train, rel_valid, rel_test 301 | 302 | 303 | if __name__ == '__main__': 304 | prepare = Preparation() 305 | basedir = '../../data/example/ranking/' 306 | corpus, rels = prepare.run_with_one_corpus(basedir + 'sample.txt') 307 | print('total corpus : %d ...' % (len(corpus))) 308 | print('total relations : %d ...' % (len(rels))) 309 | prepare.save_corpus(basedir + 'corpus.txt', corpus) 310 | 311 | rel_train, rel_valid, rel_test = prepare.split_train_valid_test(rels, [0.8, 0.1, 0.1]) 312 | prepare.save_relation(basedir + 'relation_train.txt', rel_train) 313 | prepare.save_relation(basedir + 'relation_valid.txt', rel_valid) 314 | prepare.save_relation(basedir + 'relation_test.txt', rel_test) 315 | print('Done ...') 316 | -------------------------------------------------------------------------------- /IART/utils/operations.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from scipy.stats import multivariate_normal 4 | import tensorflow as tf 5 | 6 | def learning_rate(step_num, d_model=512, warmup_steps=4000): 7 | a = step_num**(-0.5) 8 | b = step_num*warmup_steps**(-1.5) 9 | return a, b, d_model**(-0.5) * min(step_num**(-0.5), step_num*(warmup_steps**(-1.5))) 10 | 11 | def selu(x): 12 | alpha = 1.6732632423543772848170429916717 13 | scale = 1.0507009873554804934193349852946 14 | print('use selu') 15 | return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x)) 16 | 17 | def bilinear_sim_4d(x, y, is_nor=True): 18 | '''calulate bilinear similarity with two 4d tensor. 
19 | 20 | Args: 21 | x: a tensor with shape [batch, time_x, dimension_x, num_stacks] 22 | y: a tensor with shape [batch, time_y, dimension_y, num_stacks] 23 | 24 | Returns: 25 | a tensor with shape [batch, time_x, time_y, num_stacks] 26 | 27 | Raises: 28 | ValueError: if 29 | the shapes of x and y are not match; 30 | bilinear matrix reuse error. 31 | ''' 32 | M = tf.get_variable( 33 | name="bilinear_matrix", 34 | shape=[x.shape[2], y.shape[2], x.shape[3]], 35 | dtype=tf.float32, 36 | initializer=tf.orthogonal_initializer()) 37 | sim = tf.einsum('biks,kls,bjls->bijs', x, M, y) 38 | 39 | if is_nor: 40 | scale = tf.sqrt(tf.cast(x.shape[2] * y.shape[2], tf.float32)) 41 | scale = tf.maximum(1.0, scale) 42 | return sim / scale 43 | else: 44 | return sim 45 | 46 | def dot_sim_2d(x, y): 47 | ''' 48 | calculate dot similarity with two tensor in 2D for intent attention 49 | ''' 50 | M = tf.get_variable( 51 | name="dot_attention_matrix", 52 | shape=[x.shape[-1]*2], # concate x and y 53 | dtype=tf.float32, 54 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 55 | con = tf.concat([x,y], 1) 56 | sim = tf.einsum('bi,i->b', con, M) 57 | return sim 58 | 59 | def bilinear_sim_2d(x, y): 60 | ''' 61 | calculate bilinear similarity with two tensor in 2D for intent attention 62 | ''' 63 | M = tf.get_variable( 64 | name="bilinear_matrix", 65 | shape=[x.shape[-1], y.shape[-1]], 66 | dtype=tf.float32, 67 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 68 | sim = tf.einsum('bi,ij,jb->b', x, M, tf.transpose(y)) 69 | return sim 70 | 71 | def outprod_sim_2d(x, y): 72 | ''' 73 | calculate outprod similarity with two tensor in 2D for intent attention 74 | ''' 75 | M = tf.get_variable( 76 | name="outproduct_matrix", 77 | shape=[x.shape[-1]*y.shape[-1]], 78 | dtype=tf.float32, 79 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 80 | sim = tf.einsum('bi,bj->bij', x, y) 81 | flat = tf.reshape(sim, [x.shape[0],-1]) # [batch, intent_size*intent_size] 82 | sim = tf.einsum('bi,i->b', flat, M) 83 | return sim 84 | 85 | def bilinear_sim(x, y, is_nor=True): 86 | '''calculate bilinear similarity with two tensor. 87 | Args: 88 | x: a tensor with shape [batch, time_x, dimension_x] 89 | y: a tensor with shape [batch, time_y, dimension_y] 90 | 91 | Returns: 92 | a tensor with shape [batch, time_x, time_y] 93 | Raises: 94 | ValueError: if 95 | the shapes of x and y are not match; 96 | bilinear matrix reuse error. 97 | ''' 98 | M = tf.get_variable( 99 | name="bilinear_matrix", 100 | shape=[x.shape[-1], y.shape[-1]], 101 | dtype=tf.float32, 102 | initializer=tf.orthogonal_initializer()) 103 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y) 104 | 105 | if is_nor: 106 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32)) 107 | scale = tf.maximum(1.0, scale) 108 | return sim / scale 109 | else: 110 | return sim 111 | 112 | def dot_sim(x, y, is_nor=True): 113 | '''calculate dot similarity with two tensor. 114 | 115 | Args: 116 | x: a tensor with shape [batch, time_x, dimension] 117 | y: a tensor with shape [batch, time_y, dimension] 118 | 119 | Returns: 120 | a tensor with shape [batch, time_x, time_y] 121 | Raises: 122 | AssertionError: if 123 | the shapes of x and y are not match. 
124 | ''' 125 | assert x.shape[-1] == y.shape[-1] 126 | 127 | sim = tf.einsum('bik,bjk->bij', x, y) 128 | 129 | if is_nor: 130 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32)) 131 | scale = tf.maximum(1.0, scale) 132 | return sim / scale 133 | else: 134 | return sim 135 | 136 | def layer_norm(x, axis=None, epsilon=1e-6): 137 | '''Add layer normalization. 138 | 139 | Args: 140 | x: a tensor 141 | axis: the dimensions to normalize 142 | 143 | Returns: 144 | a tensor the same shape as x. 145 | 146 | Raises: 147 | ''' 148 | print('wrong version of layer_norm') 149 | scale = tf.get_variable( 150 | name='scale', 151 | shape=[1], 152 | dtype=tf.float32, 153 | initializer=tf.ones_initializer()) 154 | bias = tf.get_variable( 155 | name='bias', 156 | shape=[1], 157 | dtype=tf.float32, 158 | initializer=tf.zeros_initializer()) 159 | 160 | if axis is None: 161 | axis = [-1] 162 | 163 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 164 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 165 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 166 | return scale * norm + bias 167 | 168 | def layer_norm_debug(x, axis = None, epsilon=1e-6): 169 | '''Add layer normalization. 170 | 171 | Args: 172 | x: a tensor 173 | axis: the dimensions to normalize 174 | 175 | Returns: 176 | a tensor the same shape as x. 177 | 178 | Raises: 179 | ''' 180 | if axis is None: 181 | axis = [-1] 182 | shape = [x.shape[i] for i in axis] 183 | 184 | scale = tf.get_variable( 185 | name='scale', 186 | shape=shape, 187 | dtype=tf.float32, 188 | initializer=tf.ones_initializer()) 189 | bias = tf.get_variable( 190 | name='bias', 191 | shape=shape, 192 | dtype=tf.float32, 193 | initializer=tf.zeros_initializer()) 194 | 195 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 196 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 197 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 198 | return scale * norm + bias 199 | 200 | def dense(x, out_dimension=None, add_bias=True): 201 | '''Add dense connected layer, Wx + b. 202 | 203 | Args: 204 | x: a tensor with shape [batch, time, dimension] 205 | out_dimension: a number which is the output dimension 206 | 207 | Return: 208 | a tensor with shape [batch, time, out_dimension] 209 | 210 | Raises: 211 | ''' 212 | if out_dimension is None: 213 | out_dimension = x.shape[-1] 214 | 215 | W = tf.get_variable( 216 | name='weights', 217 | shape=[x.shape[-1], out_dimension], 218 | dtype=tf.float32, 219 | initializer=tf.orthogonal_initializer()) 220 | if add_bias: 221 | bias = tf.get_variable( 222 | name='bias', 223 | shape=[1], 224 | dtype=tf.float32, 225 | initializer=tf.zeros_initializer()) 226 | return tf.einsum('bik,kj->bij', x, W) + bias 227 | else: 228 | return tf.einsum('bik,kj->bij', x, W) 229 | 230 | def matmul_2d(x, out_dimension, drop_prob=None): 231 | '''Multiplies 2-d tensor by weights. 
232 | 233 | Args: 234 | x: a tensor with shape [batch, dimension] 235 | out_dimension: a number 236 | 237 | Returns: 238 | a tensor with shape [batch, out_dimension] 239 | 240 | Raises: 241 | ''' 242 | W = tf.get_variable( 243 | name='weights', 244 | shape=[x.shape[1], out_dimension], 245 | dtype=tf.float32, 246 | initializer=tf.orthogonal_initializer()) 247 | if drop_prob is not None: 248 | W = tf.nn.dropout(W, drop_prob) 249 | print('W is dropout') 250 | 251 | return tf.matmul(x, W) 252 | 253 | def gauss_positional_encoding_vector(x, role=0, value=0): 254 | position = int(x.shape[1]) 255 | dimension = int(x.shape[2]) 256 | print('position: %s' %position) 257 | print('dimension: %s' %dimension) 258 | 259 | _lambda = tf.get_variable( 260 | name='lambda', 261 | shape=[position], 262 | dtype=tf.float32, 263 | initializer=tf.constant_initializer(value)) 264 | _lambda = tf.expand_dims(_lambda, axis=-1) 265 | 266 | mean = [position/2.0, dimension/2.0] 267 | 268 | #cov = [[position/3.0, 0], [0, dimension/3.0]] 269 | sigma_x = position/math.sqrt(4.0*dimension) 270 | sigma_y = math.sqrt(dimension/4.0) 271 | cov = [[sigma_x*sigma_x, role*sigma_x*sigma_y], 272 | [role*sigma_x*sigma_y, sigma_y*sigma_y]] 273 | 274 | pos = np.dstack(np.mgrid[0:position, 0:dimension]) 275 | 276 | 277 | rv = multivariate_normal(mean, cov) 278 | signal = rv.pdf(pos) 279 | signal = signal - np.max(signal)/2.0 280 | 281 | signal = tf.multiply(_lambda, signal) 282 | signal = tf.expand_dims(signal, axis=0) 283 | 284 | print('gauss positional encoding') 285 | 286 | return x + _lambda * signal 287 | 288 | def positional_encoding(x, min_timescale=1.0, max_timescale=1.0e4, value=0): 289 | '''Adds a bunch of sinusoids of different frequencies to a tensor. 290 | 291 | Args: 292 | x: a tensor with shape [batch, length, channels] 293 | min_timescale: a float 294 | max_timescale: a float 295 | 296 | Returns: 297 | a tensor the same shape as x. 298 | 299 | Raises: 300 | ''' 301 | length = x.shape[1] 302 | channels = x.shape[2] 303 | _lambda = tf.get_variable( 304 | name='lambda', 305 | shape=[1], 306 | dtype=tf.float32, 307 | initializer=tf.constant_initializer(value)) 308 | 309 | position = tf.to_float(tf.range(length)) 310 | num_timescales = channels // 2 311 | log_timescale_increment = ( 312 | math.log(float(max_timescale) / float(min_timescale)) / 313 | (tf.to_float(num_timescales) - 1)) 314 | inv_timescales = min_timescale * tf.exp( 315 | tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) 316 | scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) 317 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) 318 | signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) 319 | #signal = tf.reshape(signal, [1, length, channels]) 320 | signal = tf.expand_dims(signal, axis=0) 321 | 322 | return x + _lambda * signal 323 | 324 | 325 | def positional_encoding_vector(x, min_timescale=1.0, max_timescale=1.0e4, value=0): 326 | '''Adds a bunch of sinusoids of different frequencies to a tensor. 327 | 328 | Args: 329 | x: a tensor with shape [batch, length, channels] 330 | min_timescale: a float 331 | max_timescale: a float 332 | 333 | Returns: 334 | a tensor the same shape as x. 
335 | 336 | Raises: 337 | ''' 338 | length = x.shape[1] 339 | channels = x.shape[2] 340 | _lambda = tf.get_variable( 341 | name='lambda', 342 | shape=[length], 343 | dtype=tf.float32, 344 | initializer=tf.constant_initializer(value)) 345 | _lambda = tf.expand_dims(_lambda, axis=-1) 346 | 347 | position = tf.to_float(tf.range(length)) 348 | num_timescales = channels // 2 349 | log_timescale_increment = ( 350 | math.log(float(max_timescale) / float(min_timescale)) / 351 | (tf.to_float(num_timescales) - 1)) 352 | inv_timescales = min_timescale * tf.exp( 353 | tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) 354 | scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) 355 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) 356 | signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) 357 | 358 | signal = tf.multiply(_lambda, signal) 359 | signal = tf.expand_dims(signal, axis=0) 360 | 361 | return x + signal 362 | 363 | def mask(row_lengths, col_lengths, max_row_length, max_col_length): 364 | '''Return a mask tensor representing the first N positions of each row and each column. 365 | 366 | Args: 367 | row_lengths: a tensor with shape [batch] 368 | col_lengths: a tensor with shape [batch] 369 | row_lengths and col_lengths are real lengths 370 | max_row_length and max_col_length are max lengths 371 | 372 | Returns: 373 | a mask tensor with shape [batch, max_row_length, max_col_length] 374 | 375 | Raises: 376 | ''' 377 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #return bool, [batch, max_row_len] 378 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #return bool, [batch, max_col_len] 379 | 380 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32) 381 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32) 382 | 383 | return tf.einsum('bik,bjk->bij', row_mask, col_mask) 384 | 385 | def weighted_sum(weight, values): 386 | '''Calcualte the weighted sum. 387 | 388 | Args: 389 | weight: a tensor with shape [batch, time, dimension] 390 | values: a tensor with shape [batch, dimension, values_dimension] 391 | 392 | Return: 393 | a tensor with shape [batch, time, values_dimension] 394 | 395 | Raises: 396 | ''' 397 | return tf.einsum('bij,bjk->bik', weight, values) 398 | 399 | 400 | 401 | 402 | -------------------------------------------------------------------------------- /IART/models/iadam_attention.py: -------------------------------------------------------------------------------- 1 | ''' 2 | IADAM-Attention-V4-2 model which is the IART model in the paper. 3 | Developed based on DAM model 4 | 5 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 6 | @homepage: https://sites.google.com/site/lyangwww/ 7 | ''' 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | import cPickle as pickle 12 | 13 | import utils.layers as layers 14 | import utils.operations as op 15 | 16 | class Net(object): 17 | '''Add positional encoding(initializer lambda is 0), 18 | cross-attention, cnn integrated and grad clip by value. 19 | 20 | Attributes: 21 | conf: a configuration paramaters dict 22 | word_embedding_init: a 2-d array with shape [vocab_size+1, emb_size] 23 | there is one dimension in vocab_size which is corresponding to _eos_. 
24 | in our preprocessing, _eos_ is always the last dimension 25 | +1 to add one more embedding vector for padding and masking 26 | We add an "all 0" vector in the 0-th row of word_embedding_init in order 27 | to denote the padding word 28 | when call tf.nn.embedding_lookup(), if word_id = 0, then this is a paded 29 | word; if word_id > 0 (from 1 to vocab_size), then this is a real word 30 | ''' 31 | 32 | def __init__(self, conf): 33 | self._graph = tf.Graph() 34 | self._conf = conf 35 | 36 | if self._conf['word_emb_init'] is not None: 37 | print('loading word emb init') 38 | self._word_embedding_init = pickle.load( 39 | open(self._conf['word_emb_init'], 'rb')) 40 | else: 41 | self._word_embedding_init = None 42 | 43 | def build_graph(self): 44 | with self._graph.as_default(): 45 | if self._conf['rand_seed'] is not None: 46 | rand_seed = self._conf['rand_seed'] 47 | tf.set_random_seed(rand_seed) 48 | print('set tf random seed: %s' % self._conf['rand_seed']) 49 | 50 | # word embedding 51 | if self._word_embedding_init is not None: 52 | word_embedding_initializer = tf.constant_initializer( 53 | self._word_embedding_init) 54 | else: 55 | word_embedding_initializer = tf.random_normal_initializer( 56 | stddev=0.1) 57 | 58 | self._word_embedding = tf.get_variable( 59 | name='word_embedding', 60 | shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']], 61 | dtype=tf.float32, 62 | initializer=word_embedding_initializer) 63 | 64 | # define placehloders 65 | self.turns = tf.placeholder( 66 | tf.int32, 67 | shape=[self._conf["batch_size"], self._conf["max_turn_num"], 68 | self._conf["max_turn_len"]]) 69 | 70 | self.tt_turns_len = tf.placeholder( # turn_num 71 | tf.int32, 72 | shape=[self._conf["batch_size"]]) 73 | 74 | self.every_turn_len = tf.placeholder( 75 | tf.int32, 76 | shape=[self._conf["batch_size"], self._conf["max_turn_num"]]) 77 | 78 | self.turns_intent = tf.placeholder( 79 | tf.float32, 80 | shape=[self._conf["batch_size"], self._conf["max_turn_num"], 81 | self._conf["intent_size"]]) 82 | 83 | self.response = tf.placeholder( 84 | tf.int32, 85 | shape=[self._conf["batch_size"], self._conf["max_turn_len"]]) 86 | 87 | self.response_len = tf.placeholder( 88 | tf.int32, 89 | shape=[self._conf["batch_size"]]) 90 | 91 | self.response_intent = tf.placeholder( 92 | tf.float32, 93 | shape=[self._conf["batch_size"], self._conf["intent_size"]]) 94 | 95 | self.label = tf.placeholder( 96 | tf.float32, 97 | shape=[self._conf["batch_size"]]) 98 | 99 | # define operations 100 | # response part 101 | Hr = tf.nn.embedding_lookup(self._word_embedding, self.response) 102 | # [batch_size, max_turn_len, embed_size] 103 | 104 | # print('[after embedding_lookup] Hr shape: %s' % Hr.shape) 105 | 106 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 107 | with tf.variable_scope('positional'): 108 | Hr = op.positional_encoding_vector(Hr, max_timescale=10) 109 | Hr_stack = [Hr] # 1st element of Hr_stack is the orginal embedding 110 | # lyang comments: self attention 111 | for index in range(self._conf['stack_num']): 112 | # print('[self attention for response] stack index: %d ' % index) 113 | with tf.variable_scope('self_stack_' + str(index)): 114 | # [batch, max_turn_len, emb_size] 115 | Hr = layers.block( # attentive module 116 | Hr, Hr, Hr, 117 | Q_lengths=self.response_len, 118 | K_lengths=self.response_len) 119 | # print('[after layers.block] Hr shape: %s' % Hr.shape) 120 | # Hr is still [batch_size, max_turn_len, embed_size] 121 | Hr_stack.append(Hr) 122 | 123 | # print('[after self 
attention of response] len(Hr_stack)', 124 | # len(Hr_stack)) # 1+stack_num 125 | # context part 126 | # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] 127 | list_turn_t = tf.unstack(self.turns, axis=1) 128 | list_turn_length = tf.unstack(self.every_turn_len, axis=1) 129 | list_turn_intent = tf.unstack(self.turns_intent, axis=1) 130 | 131 | sim_turns = [] 132 | attention_turns = [] # intent based attention on each turn 133 | # for every turn_t calculate matching vector 134 | turn_index = 0 135 | for turn_t, t_turn_length, t_intent in zip(list_turn_t, list_turn_length, list_turn_intent): 136 | print('current turn_index : ', turn_index) 137 | turn_index += 1 138 | Hu = tf.nn.embedding_lookup(self._word_embedding, 139 | turn_t) # [batch, max_turn_len, emb_size] 140 | # print('[after embedding_lookup] Hu shape: %s' % Hu.shape) 141 | 142 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 143 | with tf.variable_scope('positional', reuse=True): 144 | Hu = op.positional_encoding_vector(Hu, 145 | max_timescale=10) 146 | Hu_stack = [Hu] # 1st element of Hu_stack is the orginal embedding 147 | 148 | # lyang comments: self attention 149 | for index in range(self._conf['stack_num']): 150 | # print('[self attention for context turn] stack index: %d ' % index) 151 | with tf.variable_scope('self_stack_' + str(index), 152 | reuse=True): 153 | # [batch, max_turn_len, emb_size] 154 | Hu = layers.block( # attentive module 155 | Hu, Hu, Hu, 156 | Q_lengths=t_turn_length, K_lengths=t_turn_length) 157 | # print('[after layers.block] Hu shape: %s' % Hu.shape) 158 | Hu_stack.append(Hu) 159 | # print('[after self attention of context turn] len(Hu_stack)', 160 | # len(Hu_stack)) # 1+stack_num 161 | 162 | # lyang comments: cross attention 163 | # print('[cross attention ...]') 164 | r_a_t_stack = [] 165 | t_a_r_stack = [] 166 | # cross attention 167 | for index in range(self._conf['stack_num'] + 1): 168 | # print('[cross attention] stack index = ', index) 169 | with tf.variable_scope('t_attend_r_' + str(index)): 170 | try: 171 | # [batch, max_turn_len, emb_size] 172 | t_a_r = layers.block( # attentive module 173 | Hu_stack[index], Hr_stack[index], 174 | Hr_stack[index], 175 | Q_lengths=t_turn_length, 176 | K_lengths=self.response_len) 177 | except ValueError: 178 | tf.get_variable_scope().reuse_variables() 179 | t_a_r = layers.block( 180 | # [batch, max_turn_len, emb_size] 181 | Hu_stack[index], Hr_stack[index], 182 | Hr_stack[index], 183 | Q_lengths=t_turn_length, 184 | K_lengths=self.response_len) 185 | # print('[cross attention t_attend_r_] stack index: %d, t_a_r.shape: %s' % ( 186 | # index, t_a_r.shape)) 187 | 188 | with tf.variable_scope('r_attend_t_' + str(index)): 189 | try: 190 | # [batch, max_turn_len, emb_size] 191 | r_a_t = layers.block( # attentive module 192 | Hr_stack[index], Hu_stack[index], 193 | Hu_stack[index], 194 | Q_lengths=self.response_len, 195 | K_lengths=t_turn_length) 196 | except ValueError: 197 | tf.get_variable_scope().reuse_variables() 198 | r_a_t = layers.block( 199 | Hr_stack[index], Hu_stack[index], 200 | Hu_stack[index], 201 | Q_lengths=self.response_len, 202 | K_lengths=t_turn_length) 203 | # print('[cross attention r_a_t_] stack index: %d, r_a_t.shape: %s' % ( 204 | # index, r_a_t.shape)) 205 | 206 | t_a_r_stack.append(t_a_r) 207 | r_a_t_stack.append(r_a_t) 208 | # print('[cross attention] len(t_a_r_stack):', len(t_a_r_stack)) 209 | # print('[cross attention] len(r_a_t_stack):', len(r_a_t_stack)) 210 | 211 | # 
print('[before extend] len(t_a_r_stack):', len(t_a_r_stack)) 212 | # print('[before extend] len(r_a_t_stack):', len(r_a_t_stack)) 213 | # lyang comments: 3D aggregation 214 | t_a_r_stack.extend( 215 | Hu_stack) # half from self-attention; half from cross-attention 216 | r_a_t_stack.extend( 217 | Hr_stack) # half from self-attention; half from cross-attention 218 | # after extend, len(t_a_r_stack)) = 2*(stack_num+1) 219 | 220 | # print('[after extend] len(t_a_r_stack):', len(t_a_r_stack)) 221 | # print('[after extend] len(r_a_t_stack):', len(r_a_t_stack)) 222 | 223 | t_a_r = tf.stack(t_a_r_stack, axis=-1) 224 | r_a_t = tf.stack(r_a_t_stack, axis=-1) 225 | 226 | # print('after stack along the last dimension: ') 227 | # print('t_a_r shape: %s' % t_a_r.shape) 228 | # print('r_a_t shape: %s' % r_a_t.shape) 229 | # after stack, t_a_r and r_a_t are (batch, max_turn_len, embed_size, 2*(stack_num+1)) 230 | 231 | with tf.variable_scope('intent_based_attention', 232 | reuse=tf.AUTO_REUSE): # share parameter across different turns 233 | # there are 3 different ways to implement intent based attention 234 | # implement these three different variations and compare the 235 | # effectiveness as model abalation analysis 236 | # let I_u_t and I_r_k are intent vector in [12,1] 237 | # 1. dot: w * [I_u_t, I_r_k], where w is [24,1] 238 | # 2. biliear: I_u_t' * w * I_r_k, where w is [12,12] 239 | # 3. outprod: I_u_t * I_r_k' -> [12,12] out product -> 240 | # flaten to [144,1] outprod -> w*outprod 241 | # where w is [1,144] 242 | attention_logits = layers.attention_intent(t_intent, 243 | self.response_intent, 244 | self._conf['intent_attention_type']) 245 | # print('[intent_based_attention] attention_logits.shape: %s' % attention_logits.shape) 246 | attention_turns.append(attention_logits) 247 | 248 | # calculate similarity matrix 249 | with tf.variable_scope('similarity'): 250 | # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] 251 | # divide sqrt(200) to prevent gradient explosion 252 | # A_biks * B_bjks -> C_bijs 253 | sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt( 254 | 200.0) 255 | # (batch, max_turn_len, embed_size, 2*(stack_num+1)) * 256 | # (batch, max_turn_len, embed_size, 2*(stack_num+1)) -> 257 | # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] 258 | # where k is corresponding to the dimension of embed_size, 259 | # which can be eliminated by dot product with einsum 260 | # print('[similarity] after einsum dot prod sim shape: %s' % sim.shape) 261 | # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] 262 | # ! 
Here we multipy sim by intent based attention weights before 263 | # append sim into sim_turns in order to generate the weighted 264 | # stack in the next step 265 | 266 | sim_turns.append(sim) 267 | # print('[similarity] after append, len(sim_turns):', len(sim_turns)) 268 | 269 | attention_logits = tf.stack(attention_turns, axis=1) # [batch, max_turn_num] 270 | print('[attention_logits] after stack attention_logits.shape: %s' % attention_logits.shape) 271 | # add mask in attention following the way in BERT 272 | # real turn_num is in self.tt_turns_len [batch] 273 | # return a mask tensor with shape [batch, conf['max_turn_num']] 274 | attention_mask = tf.sequence_mask(self.tt_turns_len, self._conf['max_turn_num'], 275 | dtype=tf.float32) 276 | print('[attention_mask] attention_mask.shape: %s' % attention_mask.shape) 277 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 278 | # masked positions, this operation will create a tensor which is 0.0 for 279 | # positions we want to attend and -10000.0 for masked positions. 280 | adder = (1.0 - attention_mask) * -10000.0 281 | 282 | # Since we are adding it to the raw scores before the softmax, this is 283 | # effectively the same as removing these entirely. 284 | attention_logits += adder 285 | attention = tf.nn.softmax(attention_logits) # by default softmax along dim=-1 [batch, max_turn_num] 286 | print('[attention] attention.shape: %s' % attention_mask.shape) 287 | self.attention = attention # will print it for visualization 288 | 289 | # cnn and aggregation 290 | # lyang comments aggregation by 3D CNN layer 291 | # [3d cnn aggregation] sim shape: (32, 9, 180, 180, 10) 292 | # conv_0 shape: (32, 9, 180, 180, 16) 293 | # pooling_0 shape: (32, 3, 60, 60, 16) 294 | # conv_1 shape: (32, 3, 60, 60, 16) 295 | # pooling_1 shape: (32, 1, 20, 20, 16) 296 | # [3d cnn aggregation] final_info: (32, 6400) # [batch * feature_size] 297 | # [batch, max_turn_num, max_turn_len, max_turn_len, 2*(stack_num+1)] 298 | # (32, 9, 180, 180, 10) 299 | sim = tf.stack(sim_turns, axis=1) 300 | # multipy sim by attention score 301 | sim = tf.einsum('bijks,bi->bijks', sim, attention) 302 | print('[3d cnn aggregation] sim shape: %s' % sim.shape) 303 | with tf.variable_scope('cnn_aggregation'): 304 | final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'], 305 | self._conf['cnn_3d_oc1']) 306 | # for udc 307 | # final_info = layers.CNN_3d(sim, 32, 16) 308 | # for douban 309 | # final_info = layers.CNN_3d(sim, 16, 16) 310 | 311 | print('[3d cnn aggregation] final_info: %s' % final_info.shape) 312 | # loss and train 313 | with tf.variable_scope('loss'): 314 | self.loss, self.logits = layers.loss(final_info, self.label) 315 | 316 | self.global_step = tf.Variable(0, trainable=False) 317 | initial_learning_rate = self._conf['learning_rate'] 318 | self.learning_rate = tf.train.exponential_decay( 319 | initial_learning_rate, 320 | global_step=self.global_step, 321 | decay_steps=400, 322 | decay_rate=0.9, 323 | staircase=True) 324 | 325 | Optimizer = tf.train.AdamOptimizer(self.learning_rate) 326 | self.optimizer = Optimizer.minimize( 327 | self.loss, 328 | global_step=self.global_step) 329 | 330 | self.init = tf.global_variables_initializer() 331 | self.saver = tf.train.Saver( 332 | max_to_keep=self._conf["max_to_keep"]) 333 | self.all_variables = tf.global_variables() 334 | self.all_operations = self._graph.get_operations() 335 | self.grads_and_vars = Optimizer.compute_gradients(self.loss) 336 | 337 | for grad, var in self.grads_and_vars: 338 | if grad is 
None: 339 | print var 340 | 341 | self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var) for 342 | grad, var in self.grads_and_vars] 343 | self.g_updates = Optimizer.apply_gradients( 344 | self.capped_gvs, 345 | global_step=self.global_step) 346 | 347 | return self._graph 348 | 349 | 350 | -------------------------------------------------------------------------------- /IART/utils/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import utils.operations as op 3 | def similarity(x, y, x_lengths, y_lengths): 4 | '''calculate similarity with two 3d tensor. 5 | 6 | Args: 7 | x: a tensor with shape [batch, time_x, dimension] 8 | y: a tensor with shape [batch, time_y, dimension] 9 | 10 | Returns: 11 | a tensor with shape [batch, time_x, time_y] 12 | 13 | Raises: 14 | ValueError: if 15 | the dimenisons of x and y are not equal. 16 | ''' 17 | with tf.variable_scope('x_attend_y'): 18 | try: 19 | x_a_y = block( 20 | x, y, y, 21 | Q_lengths=x_lengths, K_lengths=y_lengths) 22 | except ValueError: 23 | tf.get_variable_scope().reuse_variables() 24 | x_a_y = block( 25 | x, y, y, 26 | Q_lengths=x_lengths, K_lengths=y_lengths) 27 | 28 | with tf.variable_scope('y_attend_x'): 29 | try: 30 | y_a_x = block( 31 | y, x, x, 32 | Q_lengths=y_lengths, K_lengths=x_lengths) 33 | except ValueError: 34 | tf.get_variable_scope().reuse_variables() 35 | y_a_x = block( 36 | y, x, x, 37 | Q_lengths=y_lengths, K_lengths=x_lengths) 38 | 39 | return tf.matmul(x + x_a_y, y + y_a_x, transpose_b=True) 40 | 41 | 42 | def dynamic_L(x): 43 | '''Attention machanism to combine the infomation, 44 | from https://arxiv.org/pdf/1612.01627.pdf. 45 | 46 | Args: 47 | x: a tensor with shape [batch, time, dimension] 48 | 49 | Returns: 50 | a tensor with shape [batch, dimension] 51 | 52 | Raises: 53 | ''' 54 | key_0 = tf.get_variable( 55 | name='key', 56 | shape=[x.shape[-1]], 57 | dtype=tf.float32, 58 | initializer=tf.random_uniform_initializer( 59 | -tf.sqrt(6./tf.cast(x.shape[-1], tf.float32)), 60 | tf.sqrt(6./tf.cast(x.shape[-1], tf.float32)))) 61 | 62 | key = op.dense(x, add_bias=False) #[batch, time, dimension] 63 | weight = tf.reduce_sum(tf.multiply(key, key_0), axis=-1) #[batch, time] 64 | weight = tf.expand_dims(tf.nn.softmax(weight), -1) #[batch, time, 1] 65 | 66 | L = tf.reduce_sum(tf.multiply(x, weight), axis=1) #[batch, dimension] 67 | return L 68 | 69 | def mtl_loss(x, y, turns_intent_enc, response_intent_enc, 70 | turns_intent, response_intent, intent_loss_weight, num_classes=2, 71 | is_clip=True, clip_value=10): 72 | ''' 73 | A specialized loss IADAM-MTL-V5 which jointly learn the response ranking 74 | and user intent prediction 75 | For a q or d text, we predict the user intent distributions in q and d 76 | In the case of information-seeking conversation 77 | A q is a dialog context with multiple utterances 78 | A d is a candidate response 79 | Thus for d which is a candidate response text, 80 | y_pred is [batch_size, intent_size] 81 | y_true [batch_size, intent_size] 82 | for q which is a context utterance sequences 83 | y_pred is [batch_size, max_utterance_num, intent_size] 84 | y_true [batch_size, max_utterance_num, intent_size] 85 | Both y_pred and y_true come from a Dense layer with sigmoid as the activation 86 | funtion as in the multi-label classification setting for intent prediction 87 | Thus we need to use tf.nn.sigmoid_cross_entropy_with_logits in tensorflow 88 | 
https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits 89 | ''' 90 | # intent loss 91 | loss_turns_intent = tf.reduce_mean(tf.clip_by_value( 92 | tf.nn.sigmoid_cross_entropy_with_logits( 93 | labels=turns_intent, 94 | logits=turns_intent_enc), -clip_value, clip_value)) 95 | # tf.summary.scalar('loss_turns_intent', loss_turns_intent) 96 | loss_response_intent = tf.reduce_mean(tf.clip_by_value( 97 | tf.nn.sigmoid_cross_entropy_with_logits( 98 | labels=response_intent, 99 | logits=response_intent_enc), -clip_value, clip_value)) 100 | # tf.summary.scalar('loss_response_intent', loss_turns_intent) 101 | 102 | # ranking loss 103 | assert isinstance(num_classes, int) 104 | assert num_classes >= 2 105 | 106 | W = tf.get_variable( 107 | name='weights', 108 | shape=[x.shape[-1], num_classes - 1], 109 | initializer=tf.orthogonal_initializer()) 110 | bias = tf.get_variable( 111 | name='bias', 112 | shape=[num_classes - 1], 113 | initializer=tf.zeros_initializer()) 114 | 115 | logits = tf.reshape(tf.matmul(x, W) + bias, [-1]) 116 | loss = tf.nn.sigmoid_cross_entropy_with_logits( 117 | labels=tf.cast(y, tf.float32), 118 | logits=logits) 119 | loss = tf.reduce_mean(tf.clip_by_value(loss, -clip_value, clip_value)) 120 | # tf.summary.scalar('ranking loss', loss) 121 | # the final loss is a weighted sum of intent loss and ranking loss 122 | loss_final = (1.0 - intent_loss_weight) * loss + intent_loss_weight * ( 123 | loss_turns_intent + loss_response_intent) 124 | # tf.summary.scalar('loss_final', loss_final) 125 | # print('tracking loss in tensorboard...') 126 | return loss_final, logits 127 | 128 | def loss(x, y, num_classes=2, is_clip=True, clip_value=10): 129 | '''From info x calculate logits as return loss. 130 | 131 | Args: 132 | x: a tensor with shape [batch, dimension] 133 | num_classes: a number 134 | 135 | Returns: 136 | loss: a tensor with shape [1], which is the average loss of one batch 137 | logits: a tensor with shape [batch, 1] 138 | 139 | Raises: 140 | AssertionError: if 141 | num_classes is not a int greater equal than 2. 142 | TODO: 143 | num_classes > 2 may be not adapted. 
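    Note (added for clarity, not part of the original code): per the TensorFlow
    documentation, tf.nn.sigmoid_cross_entropy_with_logits computes, for a
    logit z and label y,

        max(z, 0) - z * y + log(1 + exp(-|z|))

    e.g. z = 0, y = 1 gives log(2) ~= 0.693; the per-example value is then
    clipped to [-clip_value, clip_value] and averaged over the batch below.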
144 | ''' 145 | assert isinstance(num_classes, int) 146 | assert num_classes >= 2 147 | 148 | W = tf.get_variable( 149 | name='weights', 150 | shape=[x.shape[-1], num_classes-1], 151 | initializer=tf.orthogonal_initializer()) 152 | bias = tf.get_variable( 153 | name='bias', 154 | shape=[num_classes-1], 155 | initializer=tf.zeros_initializer()) 156 | 157 | logits = tf.reshape(tf.matmul(x, W) + bias, [-1]) 158 | loss = tf.nn.sigmoid_cross_entropy_with_logits( 159 | labels=tf.cast(y, tf.float32), 160 | logits=logits) 161 | loss = tf.reduce_mean(tf.clip_by_value(loss, -clip_value, clip_value)) 162 | # tf.summary.scalar('loss', loss) 163 | # print('tracking loss in tensorboard...') 164 | return loss, logits 165 | 166 | def attention_intent(I_u, I_r, attention_type='dot'): 167 | ''' 168 | Intent based attention layer 169 | The inputs are two intent vectors for a (utterance, response) pair 170 | Returns the attention logits 171 | ''' 172 | assert attention_type in ('dot', 'bilinear', 'outprod') 173 | if attention_type == 'dot': 174 | logits = op.dot_sim_2d(I_u, I_r) 175 | elif attention_type == 'bilinear': 176 | logits = op.bilinear_sim_2d(I_u, I_r) 177 | elif attention_type == 'outprod': 178 | logits = op.outprod_sim_2d(I_u, I_r) 179 | return logits 180 | 181 | def attention( 182 | Q, K, V, 183 | Q_lengths, K_lengths, 184 | attention_type='dot', 185 | is_mask=True, mask_value=-2**32+1, 186 | drop_prob=None): 187 | '''Add attention layer. 188 | Args: 189 | Q: a tensor with shape [batch, Q_time, Q_dimension] 190 | K: a tensor with shape [batch, time, K_dimension] 191 | V: a tensor with shape [batch, time, V_dimension] 192 | 193 | Q_length: a tensor with shape [batch] 194 | K_length: a tensor with shape [batch] 195 | 196 | Returns: 197 | a tensor with shape [batch, Q_time, V_dimension] 198 | 199 | Raises: 200 | AssertionError: if 201 | Q_dimension not equal to K_dimension when attention type is dot. 202 | ''' 203 | assert attention_type in ('dot', 'bilinear') 204 | if attention_type == 'dot': 205 | assert Q.shape[-1] == K.shape[-1] 206 | 207 | Q_time = Q.shape[1] 208 | K_time = K.shape[1] 209 | 210 | if attention_type == 'dot': 211 | logits = op.dot_sim(Q, K) #[batch, Q_time, time] 212 | if attention_type == 'bilinear': 213 | logits = op.bilinear_sim(Q, K) 214 | 215 | if is_mask: 216 | mask = op.mask(Q_lengths, K_lengths, Q_time, K_time) #return [batch, Q_time, K_time] 217 | # mask is a tensor with the same shape with logits 218 | # where the real word location is labeled by 1 219 | # where the padded/masked word location is labeled by 0 220 | # mask * logits is element-wise product 221 | # + (1 - mask) is to add very small negative value on 222 | # masked positions (0). after softmax, this position becomes 0 223 | # similar tricks also used in BERT 224 | logits = mask * logits + (1 - mask) * mask_value 225 | 226 | attention = tf.nn.softmax(logits) 227 | 228 | if drop_prob is not None: 229 | print('use attention drop') 230 | attention = tf.nn.dropout(attention, drop_prob) 231 | 232 | return op.weighted_sum(attention, V) 233 | 234 | def FFN(x, out_dimension_0=None, out_dimension_1=None): 235 | '''Add two dense connected layer, max(0, x*W0+b0)*W1+b1. 
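    Example (an illustrative sketch added for clarity, not part of the original
    code; the shapes are hypothetical):

        with tf.variable_scope('FFN_demo'):
            # both inner and outer dimensions default to x.shape[-1],
            # so the output keeps the input shape
            y = FFN(x)     # x: [batch, time, 200] -> y: [batch, time, 200]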
236 | 237 | Args: 238 | x: a tensor with shape [batch, time, dimension] 239 | out_dimension: a number which is the output dimension 240 | 241 | Returns: 242 | a tensor with shape [batch, time, out_dimension] 243 | 244 | Raises: 245 | ''' 246 | with tf.variable_scope('FFN_1'): 247 | y = op.dense(x, out_dimension_0) 248 | y = tf.nn.relu(y) 249 | with tf.variable_scope('FFN_2'): 250 | z = op.dense(y, out_dimension_1) #, add_bias=False) #!!!! 251 | return z 252 | 253 | def block( 254 | Q, K, V, 255 | Q_lengths, K_lengths, 256 | attention_type='dot', 257 | is_layer_norm=True, 258 | is_mask=True, mask_value=-2**32+1, 259 | drop_prob=None): 260 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf. 261 | Args: 262 | Q: a tensor with shape [batch, Q_time, Q_dimension] 263 | K: a tensor with shape [batch, time, K_dimension] 264 | V: a tensor with shape [batch, time, V_dimension] 265 | 266 | Q_length: a tensor with shape [batch] 267 | K_length: a tensor with shape [batch] 268 | 269 | Returns: 270 | a tensor with shape [batch, time, dimension] 271 | 272 | Raises: 273 | ''' 274 | att = attention(Q, K, V, 275 | Q_lengths, K_lengths, 276 | attention_type='dot', 277 | is_mask=is_mask, mask_value=mask_value, 278 | drop_prob=drop_prob) 279 | if is_layer_norm: 280 | with tf.variable_scope('attention_layer_norm'): 281 | y = op.layer_norm_debug(Q + att) 282 | else: 283 | y = Q + att 284 | 285 | z = FFN(y) 286 | if is_layer_norm: 287 | with tf.variable_scope('FFN_layer_norm'): 288 | w = op.layer_norm_debug(y + z) 289 | else: 290 | w = y + z 291 | return w 292 | 293 | def intent_cnn(x, intent_size): 294 | ''' 295 | A CNN encoder to extract features for intent prediction 296 | Args: 297 | x: a tensor with shape [batch, in_height, in_width, in_channels] 298 | intent_size: size of intent types, which is also output shape 299 | Returns: 300 | a tensor with shape [batch, intent_size] 301 | ''' 302 | # CNN 303 | cnn = CNN(x, 3, 3, 3, add_relu=True) 304 | print('[In intent_cnn]: cnn %s ' % cnn.shape) 305 | 306 | # Dense Layer 0 307 | # dense_0 = tf.layers.dense(inputs=cnn, units=4096, 308 | # activation=tf.nn.relu) 309 | # print('[In intent_cnn]: dense_0 %s ' % dense_0.shape) 310 | # dropout_0 = tf.layers.dropout(inputs=dense_0, rate=0.4) 311 | # print('[In intent_cnn]: dropout_0 %s ' % dropout_0.shape) 312 | 313 | # Dense Layer 1 314 | dense_1 = tf.layers.dense(inputs=cnn, units=1024, 315 | activation=tf.nn.relu) 316 | print('[In intent_cnn]: dense_1 %s ' % dense_1.shape) 317 | dropout_1 = tf.layers.dropout(inputs=dense_1, rate=0.4) 318 | print('[In intent_cnn]: dropout_1 %s ' % dropout_1.shape) 319 | 320 | # Logits Layer 321 | logits = tf.layers.dense(inputs=dropout_1, units=intent_size, 322 | activation=tf.nn.sigmoid) 323 | print('[In intent_cnn]: logits %s ' % logits.shape) 324 | return logits 325 | 326 | def CNN(x, out_channels, filter_size, pooling_size, add_relu=True): 327 | '''Add a convlution layer with relu and max pooling layer. 
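    Example (an illustrative shape walk-through added for clarity, not part of
    the original code; the sizes are hypothetical):

        # x: [batch, 12, 12, 1], out_channels=3, filter_size=3, pooling_size=3
        # conv (SAME, stride 1):       [batch, 12, 12, 3]
        # max pool (SAME, stride 3):   [batch, 4, 4, 3]
        # flatten:                     [batch, 4 * 4 * 3] = [batch, 48]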
328 | 329 | Args: 330 | x: a tensor with shape [batch, in_height, in_width, in_channels] 331 | out_channels: a number 332 | filter_size: a number 333 | pooling_size: a number 334 | 335 | Returns: 336 | a flattened tensor with shape [batch, num_features] 337 | 338 | Raises: 339 | ''' 340 | #calculate the last dimension of return 341 | # num_features = ((tf.shape(x)[1]-filter_size+1)/pooling_size * 342 | # (tf.shape(x)[2]-filter_size+1)/pooling_size) * out_channels 343 | # print('[In Intent CNN Encoder]: num_features ', num_features) 344 | 345 | in_channels = x.shape[-1] 346 | weights = tf.get_variable( 347 | name='filter', 348 | shape=[filter_size, filter_size, in_channels, out_channels], 349 | dtype=tf.float32, 350 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 351 | print('[In Intent CNN Encoder]: filter weights.shape %s ' % weights.shape) 352 | bias = tf.get_variable( 353 | name='bias', 354 | shape=[out_channels], 355 | dtype=tf.float32, 356 | initializer=tf.zeros_initializer()) 357 | 358 | conv = tf.nn.conv2d(x, weights, strides=[1, 1, 1, 1], padding="SAME") 359 | conv = conv + bias 360 | print('[In Intent CNN Encoder]: conv.shape %s ' % conv.shape) 361 | if add_relu: 362 | conv = tf.nn.relu(conv) 363 | 364 | pooling = tf.nn.max_pool( 365 | conv, 366 | ksize=[1, pooling_size, pooling_size, 1], 367 | strides=[1, pooling_size, pooling_size, 1], 368 | padding="SAME") 369 | 370 | print('[In Intent CNN Encoder]: pooling.shape %s ' % pooling.shape) 371 | flat = tf.contrib.layers.flatten(pooling) 372 | print('[In Intent CNN Encoder]: flat.shape %s ' % flat.shape) 373 | return flat 374 | 375 | def CNN_3d(x, out_channels_0, out_channels_1, add_relu=True): 376 | '''Add a 3d convlution layer with relu and max pooling layer. 377 | 378 | Args: 379 | x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels] 380 | out_channels: a number 381 | filter_size: a number 382 | pooling_size: a number 383 | 384 | Returns: 385 | a flattened tensor with shape [batch, num_features] 386 | 387 | Raises: 388 | ''' 389 | in_channels = x.shape[-1] 390 | weights_0 = tf.get_variable( 391 | name='filter_0', 392 | shape=[3, 3, 3, in_channels, out_channels_0], 393 | dtype=tf.float32, 394 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 395 | bias_0 = tf.get_variable( 396 | name='bias_0', 397 | shape=[out_channels_0], 398 | dtype=tf.float32, 399 | initializer=tf.zeros_initializer()) 400 | 401 | conv_0 = tf.nn.conv3d(x, weights_0, strides=[1, 1, 1, 1, 1], padding="SAME") 402 | print('conv_0 shape: %s' %conv_0.shape) 403 | conv_0 = conv_0 + bias_0 404 | 405 | if add_relu: 406 | conv_0 = tf.nn.elu(conv_0) 407 | 408 | pooling_0 = tf.nn.max_pool3d( 409 | conv_0, 410 | ksize=[1, 3, 3, 3, 1], 411 | strides=[1, 3, 3, 3, 1], 412 | padding="SAME") 413 | print('pooling_0 shape: %s' %pooling_0.shape) 414 | 415 | #layer_1 416 | weights_1 = tf.get_variable( 417 | name='filter_1', 418 | shape=[3, 3, 3, out_channels_0, out_channels_1], 419 | dtype=tf.float32, 420 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 421 | bias_1 = tf.get_variable( 422 | name='bias_1', 423 | shape=[out_channels_1], 424 | dtype=tf.float32, 425 | initializer=tf.zeros_initializer()) 426 | 427 | conv_1 = tf.nn.conv3d(pooling_0, weights_1, strides=[1, 1, 1, 1, 1], padding="SAME") 428 | print('conv_1 shape: %s' %conv_1.shape) 429 | conv_1 = conv_1 + bias_1 430 | 431 | if add_relu: 432 | conv_1 = tf.nn.elu(conv_1) 433 | 434 | pooling_1 = tf.nn.max_pool3d( 435 | conv_1, 436 | ksize=[1, 3, 3, 3, 1], 437 | strides=[1, 3, 3, 3, 
1], 438 | padding="SAME") 439 | print('pooling_1 shape: %s' %pooling_1.shape) 440 | 441 | return tf.contrib.layers.flatten(pooling_1) 442 | 443 | def CNN_3d_2d(x, out_channels_0, out_channels_1, add_relu=True): 444 | '''Add a 3d convlution layer with relu and max pooling layer. 445 | 446 | Args: 447 | x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels] 448 | out_channels: a number 449 | filter_size: a number 450 | pooling_size: a number 451 | 452 | Returns: 453 | a flattened tensor with shape [batch, num_features] 454 | 455 | Raises: 456 | ''' 457 | in_channels = x.shape[-1] 458 | weights_0 = tf.get_variable( 459 | name='filter_0', 460 | shape=[1, 3, 3, in_channels, out_channels_0], 461 | dtype=tf.float32, 462 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 463 | bias_0 = tf.get_variable( 464 | name='bias_0', 465 | shape=[out_channels_0], 466 | dtype=tf.float32, 467 | initializer=tf.zeros_initializer()) 468 | 469 | conv_0 = tf.nn.conv3d(x, weights_0, strides=[1, 1, 1, 1, 1], padding="SAME") 470 | print('conv_0 shape: %s' %conv_0.shape) 471 | conv_0 = conv_0 + bias_0 472 | 473 | if add_relu: 474 | conv_0 = tf.nn.elu(conv_0) 475 | 476 | pooling_0 = tf.nn.max_pool3d( 477 | conv_0, 478 | ksize=[1, 1, 3, 3, 1], 479 | strides=[1, 1, 3, 3, 1], 480 | padding="SAME") 481 | print('pooling_0 shape: %s' %pooling_0.shape) 482 | 483 | #layer_1 484 | weights_1 = tf.get_variable( 485 | name='filter_1', 486 | shape=[1, 3, 3, out_channels_0, out_channels_1], 487 | dtype=tf.float32, 488 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 489 | bias_1 = tf.get_variable( 490 | name='bias_1', 491 | shape=[out_channels_1], 492 | dtype=tf.float32, 493 | initializer=tf.zeros_initializer()) 494 | 495 | conv_1 = tf.nn.conv3d(pooling_0, weights_1, strides=[1, 1, 1, 1, 1], padding="SAME") 496 | print('conv_1 shape: %s' %conv_1.shape) 497 | conv_1 = conv_1 + bias_1 498 | 499 | if add_relu: 500 | conv_1 = tf.nn.elu(conv_1) 501 | 502 | pooling_1 = tf.nn.max_pool3d( 503 | conv_1, 504 | ksize=[1, 1, 3, 3, 1], 505 | strides=[1, 1, 3, 3, 1], 506 | padding="SAME") 507 | print('pooling_1 shape: %s' %pooling_1.shape) 508 | 509 | return tf.contrib.layers.flatten(pooling_1) 510 | 511 | def CNN_3d_change(x, out_channels_0, out_channels_1, add_relu=True): 512 | '''Add a 3d convlution layer with relu and max pooling layer. 
513 | 514 | Args: 515 | x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels] 516 | out_channels: a number 517 | filter_size: a number 518 | pooling_size: a number 519 | 520 | Returns: 521 | a flattened tensor with shape [batch, num_features] 522 | 523 | Raises: 524 | ''' 525 | in_channels = x.shape[-1] 526 | weights_0 = tf.get_variable( 527 | name='filter_0', 528 | shape=[3, 3, 3, in_channels, out_channels_0], 529 | dtype=tf.float32, 530 | #initializer=tf.random_normal_initializer(0, 0.05)) 531 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 532 | bias_0 = tf.get_variable( 533 | name='bias_0', 534 | shape=[out_channels_0], 535 | dtype=tf.float32, 536 | initializer=tf.zeros_initializer()) 537 | #Todo 538 | g_0 = tf.get_variable(name='scale_0', 539 | shape = [out_channels_0], 540 | dtype=tf.float32, 541 | initializer=tf.ones_initializer()) 542 | weights_0 = tf.reshape(g_0, [1, 1, 1, out_channels_0]) * tf.nn.l2_normalize(weights_0, [0, 1, 2]) 543 | 544 | conv_0 = tf.nn.conv3d(x, weights_0, strides=[1, 1, 1, 1, 1], padding="VALID") 545 | print('conv_0 shape: %s' %conv_0.shape) 546 | conv_0 = conv_0 + bias_0 547 | ####### 548 | ''' 549 | with tf.variable_scope('layer_0'): 550 | conv_0 = op.layer_norm(conv_0, axis=[1, 2, 3, 4]) 551 | print('layer_norm in cnn') 552 | ''' 553 | if add_relu: 554 | conv_0 = tf.nn.elu(conv_0) 555 | 556 | pooling_0 = tf.nn.max_pool3d( 557 | conv_0, 558 | ksize=[1, 2, 3, 3, 1], 559 | strides=[1, 2, 3, 3, 1], 560 | padding="VALID") 561 | print('pooling_0 shape: %s' %pooling_0.shape) 562 | 563 | #layer_1 564 | weights_1 = tf.get_variable( 565 | name='filter_1', 566 | shape=[2, 2, 2, out_channels_0, out_channels_1], 567 | dtype=tf.float32, 568 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 569 | 570 | bias_1 = tf.get_variable( 571 | name='bias_1', 572 | shape=[out_channels_1], 573 | dtype=tf.float32, 574 | initializer=tf.zeros_initializer()) 575 | 576 | g_1 = tf.get_variable(name='scale_1', 577 | shape = [out_channels_1], 578 | dtype=tf.float32, 579 | initializer=tf.ones_initializer()) 580 | weights_1 = tf.reshape(g_1, [1, 1, 1, out_channels_1]) * tf.nn.l2_normalize(weights_1, [0, 1, 2]) 581 | 582 | conv_1 = tf.nn.conv3d(pooling_0, weights_1, strides=[1, 1, 1, 1, 1], padding="VALID") 583 | print('conv_1 shape: %s' %conv_1.shape) 584 | conv_1 = conv_1 + bias_1 585 | #with tf.variable_scope('layer_1'): 586 | # conv_1 = op.layer_norm(conv_1, axis=[1, 2, 3, 4]) 587 | 588 | if add_relu: 589 | conv_1 = tf.nn.elu(conv_1) 590 | 591 | pooling_1 = tf.nn.max_pool3d( 592 | conv_1, 593 | ksize=[1, 3, 3, 3, 1], 594 | strides=[1, 3, 3, 3, 1], 595 | padding="VALID") 596 | print('pooling_1 shape: %s' %pooling_1.shape) 597 | 598 | return tf.contrib.layers.flatten(pooling_1) 599 | 600 | def RNN_last_state(x, lengths, hidden_size): 601 | '''encode x with a gru cell and return the last state. 
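    Example (an illustrative sketch added for clarity, not part of the original
    code; the shapes are hypothetical):

        outputs, last_states = RNN_last_state(x, lengths, hidden_size=100)
        # x: [batch, time, dim], lengths: [batch]
        # outputs:     [batch, time, 100]  (per-step GRU outputs)
        # last_states: [batch, 100]        (final GRU state)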
602 | 603 | Args: 604 | x: a tensor with shape [batch, time, dimension] 605 | length: a tensor with shape [batch] 606 | 607 | Return: 608 | a tensor with shape [batch, hidden_size] 609 | 610 | Raises: 611 | ''' 612 | cell = tf.nn.rnn_cell.GRUCell(hidden_size) 613 | outputs, last_states = tf.nn.dynamic_rnn(cell, x, lengths, dtype=tf.float32) 614 | return outputs, last_states 615 | 616 | 617 | -------------------------------------------------------------------------------- /IART/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import print_function 5 | from nltk.tokenize import word_tokenize 6 | import jieba 7 | import sys 8 | import numpy as np 9 | from nltk.corpus import stopwords as nltk_stopwords 10 | from nltk.stem import SnowballStemmer 11 | from tqdm import tqdm 12 | 13 | sys.path.append('../inputs') 14 | sys.path.append('../utils') 15 | from preparation import * 16 | # from rank_io import * 17 | 18 | class Preprocess(object): 19 | 20 | _valid_lang = ['en', 'cn'] 21 | _stemmer = SnowballStemmer('english') 22 | 23 | def __init__(self, 24 | word_seg_config = {}, 25 | doc_filter_config = {}, 26 | word_stem_config = {}, 27 | word_lower_config = {}, 28 | word_filter_config = {}, 29 | word_index_config = {} 30 | ): 31 | # set default configuration 32 | self._word_seg_config = { 'enable': True, 'lang': 'en' } 33 | self._doc_filter_config = { 'enable': True, 'min_len': 0, 'max_len': sys.maxint } 34 | self._word_stem_config = { 'enable': True } 35 | self._word_lower_config = { 'enable': True } 36 | self._word_filter_config = { 'enable': True, 'stop_words': nltk_stopwords.words('english'), 37 | 'min_freq': 0, 'max_freq': sys.maxint, 'words_useless': None } 38 | self._word_index_config = { 'word_dict': None } 39 | 40 | self._word_seg_config.update(word_seg_config) 41 | self._doc_filter_config.update(doc_filter_config) 42 | self._word_stem_config.update(word_stem_config) 43 | self._word_lower_config.update(word_lower_config) 44 | self._word_filter_config.update(word_filter_config) 45 | self._word_index_config.update(word_index_config) 46 | 47 | self._word_dict = self._word_index_config['word_dict'] 48 | self._words_stats = dict() 49 | 50 | def run(self, file_path): 51 | print('load...') 52 | dids, docs = Preprocess.load(file_path) 53 | 54 | if self._word_seg_config['enable']: 55 | print('word_seg...') 56 | docs = Preprocess.word_seg(docs, self._word_seg_config) 57 | 58 | if self._doc_filter_config['enable']: 59 | print('doc_filter...') 60 | dids, docs = Preprocess.doc_filter(dids, docs, self._doc_filter_config) 61 | 62 | if self._word_stem_config['enable']: 63 | print('word_stem...') 64 | docs = Preprocess.word_stem(docs) 65 | 66 | if self._word_lower_config['enable']: 67 | print('word_lower...') 68 | docs = Preprocess.word_lower(docs) 69 | 70 | self._words_stats = Preprocess.cal_words_stat(docs) 71 | 72 | if self._word_filter_config['enable']: 73 | print('word_filter...') 74 | docs, self._words_useless = Preprocess.word_filter(docs, self._word_filter_config, self._words_stats) 75 | 76 | print('word_index...') 77 | docs, self._word_dict = Preprocess.word_index(docs, self._word_index_config) 78 | 79 | return dids, docs 80 | 81 | def run_2d(self, file_path): 82 | print('load...') 83 | dids, docs = Preprocess.load_2d(file_path) 84 | # dids: a list of corpus ids 85 | # docs: a list of context/responses. 
The context is seperated by \t 86 | 87 | print('transfer to 2d docs...') 88 | # firstly transfer docs to a 2D list [corpus_text_size, utterance_list] 89 | # a corpus text could be a list of utterances (for context) or 1 utterance (for response) 90 | docs_2d = Preprocess.transfer_to_2ddocs(docs) 91 | 92 | if self._word_seg_config['enable']: 93 | print('word_seg...') 94 | docs_2d = Preprocess.word_seg_2d(docs_2d) 95 | 96 | if self._word_stem_config['enable']: 97 | print('word_stem...') 98 | docs_2d = Preprocess.word_stem_2d(docs_2d) 99 | 100 | if self._word_lower_config['enable']: 101 | print('word_lower...') 102 | docs_2d = Preprocess.word_lower_2d(docs_2d) 103 | 104 | # print ('after word_lower, docs_2d[0:100] = ', docs_2d[0:100]) 105 | print('cal_words_stat...') 106 | self._words_stats = Preprocess.cal_words_stat_2d(docs_2d) 107 | 108 | if self._word_filter_config['enable']: 109 | print('word_filter...') 110 | docs_2d, self._words_useless = Preprocess.word_filter_2d(docs_2d, self._word_filter_config, self._words_stats) 111 | 112 | print('word_index...') 113 | docs_2d, self._word_dict = Preprocess.word_index_2d(docs_2d, self._word_index_config) 114 | return dids, docs_2d 115 | 116 | def run_2d_smn(self, file_path): 117 | ''' 118 | Minimize the preprocess steps to be consistant with Yu Wu's SMN code 119 | Refer to the build_multiturn_data function in PreProcess.py of the 120 | Theano code of Yu Wu's SMN source code 121 | :param file_path: 122 | :return: 123 | ''' 124 | print('load...') 125 | dids, docs = Preprocess.load_2d(file_path) 126 | # removed _ as what Yu Wu did in SMN preprocess code 127 | # dids: a list of corpus ids 128 | # docs: a list of context/responses. The context is seperated by \t 129 | 130 | print('transfer to 2d docs...') 131 | # firstly transfer docs to a 2D list [corpus_text_size, utterance_list] 132 | # a corpus text could be a list of utterances (for context) or 1 utterance (for response) 133 | docs_2d = Preprocess.transfer_to_2ddocs(docs) 134 | 135 | print('word_seg... (necessary for ms_dialog data)') 136 | docs_2d = Preprocess.word_seg_2d(docs_2d) 137 | 138 | print('word_lower... 
(necessary for ms_dialog data)') 139 | docs_2d = Preprocess.word_lower_2d(docs_2d) 140 | 141 | print('following SMN, just split with split() and index...') 142 | print('build word dict...') 143 | words = set() 144 | for c_text in tqdm(docs_2d): 145 | for utt in c_text: 146 | # words.update(set(utt.split())) 147 | words.update(set(utt)) 148 | print('vocab size: ', len(words)) 149 | word_id = 1 150 | self._word_dict = {} 151 | for word in words: 152 | self._word_dict[word] = word_id 153 | word_id += 1 154 | 155 | print('map words to ids ...') 156 | docs_index = [] 157 | for doc in tqdm(docs_2d): 158 | # docs_index.append([[self._word_dict[w] for w in utt.split()] for utt in doc]) 159 | docs_index.append([[self._word_dict[w] for w in utt] for utt in doc]) 160 | return dids, docs_index 161 | 162 | @staticmethod 163 | def transfer_to_2ddocs(docs): 164 | ''' 165 | transfer a docs to a 2 dimensional docs [corpus_text_size, utterance_list] 166 | a corpus text could be a list of utterances (for context) or 1 utterance (for response) 167 | ''' 168 | docs_2d = [] 169 | for c_text in tqdm(docs): 170 | docs_2d.append(list(c_text.split('\t'))) 171 | return docs_2d 172 | 173 | @staticmethod 174 | def parse(line): 175 | subs = line.split(' ', 1) 176 | if 1 == len(subs): 177 | return subs[0], '' 178 | else: 179 | return subs[0], subs[1] 180 | 181 | @staticmethod 182 | def load(file_path): 183 | dids = list() 184 | docs = list() 185 | f = open(file_path, 'r') 186 | for line in tqdm(f): 187 | line = line.decode('utf8') 188 | line = line.strip() 189 | if '' != line: 190 | did, doc = Preprocess.parse(line) 191 | dids.append(did) 192 | docs.append(doc) 193 | f.close() 194 | return dids, docs 195 | 196 | @staticmethod 197 | def load_2d(file_path): 198 | dids = list() 199 | docs = list() 200 | f = open(file_path, 'r') 201 | for line in tqdm(f): 202 | line = line.decode('utf8') 203 | line = line.replace("_", "") # same with SMN code by Yu Wu 204 | line = line.strip() 205 | if '' != line: 206 | subs = line.split('\t') 207 | did, doc = subs[0], '\t'.join(subs[1:len(subs)]) 208 | dids.append(did) 209 | docs.append(doc) 210 | f.close() 211 | return dids, docs 212 | 213 | @staticmethod 214 | def word_seg_2d(docs): 215 | docs_seg = [] 216 | for doc in tqdm(docs): 217 | docs_seg.append([word_tokenize(utt) for utt in doc]) 218 | return docs_seg 219 | 220 | @staticmethod 221 | def word_seg_en(docs): 222 | docs = [word_tokenize(sent) for sent in tqdm(docs)] 223 | # show the progress of word segmentation with tqdm 224 | return docs 225 | 226 | @staticmethod 227 | def word_seg_cn(docs): 228 | docs = [list(jieba.cut(sent)) for sent in docs] 229 | return docs 230 | 231 | @staticmethod 232 | def word_seg(docs, config): 233 | assert config['lang'].lower() in Preprocess._valid_lang, 'Wrong language type: %s' % config['lang'] 234 | docs = getattr(Preprocess, '%s_%s' % (sys._getframe().f_code.co_name, config['lang']))(docs) 235 | return docs 236 | 237 | @staticmethod 238 | def cal_words_stat(docs): 239 | words_stats = {} 240 | docs_num = len(docs) 241 | for ws in docs: 242 | for w in ws: 243 | if w not in words_stats: 244 | words_stats[w] = {} 245 | words_stats[w]['cf'] = 0 246 | words_stats[w]['df'] = 0 247 | words_stats[w]['idf'] = 0 248 | words_stats[w]['cf'] += 1 249 | for w in set(ws): 250 | words_stats[w]['df'] += 1 251 | for w, winfo in words_stats.items(): 252 | words_stats[w]['idf'] = np.log( (1. + docs_num) / (1. 
+ winfo['df'])) 253 | return words_stats 254 | 255 | @staticmethod 256 | def cal_words_stat_2d(docs): 257 | words_stats = {} 258 | docs_num = len(docs) 259 | for ws in tqdm(docs): # for each corpus text 260 | for ww in ws: # for each utterance 261 | for w in ww: # for each word 262 | if w not in words_stats: 263 | words_stats[w] = {} 264 | words_stats[w]['cf'] = 0 265 | words_stats[w]['df'] = 0 266 | words_stats[w]['idf'] = 0 267 | words_stats[w]['cf'] += 1 268 | for w in set(ww): 269 | words_stats[w]['df'] += 1 270 | for w, winfo in words_stats.items(): 271 | words_stats[w]['idf'] = np.log((1. + docs_num) / (1. + winfo['df'])) 272 | return words_stats 273 | 274 | @staticmethod 275 | def word_filter(docs, config, words_stats): 276 | if config['words_useless'] is None: 277 | config['words_useless'] = set() 278 | # filter with stop_words 279 | config['words_useless'].update(config['stop_words']) 280 | # filter with min_freq and max_freq 281 | for w, winfo in words_stats.items(): 282 | # filter too frequent words or rare words 283 | if config['min_freq'] > winfo['df'] or config['max_freq'] < winfo['df']: 284 | config['words_useless'].add(w) 285 | # filter with useless words 286 | docs = [[w for w in ws if w not in config['words_useless']] for ws in tqdm(docs)] 287 | return docs, config['words_useless'] 288 | 289 | @staticmethod 290 | def word_filter_2d(docs, config, words_stats): 291 | if config['words_useless'] is None: 292 | config['words_useless'] = set() 293 | # filter with stop_words 294 | config['words_useless'].update(config['stop_words']) 295 | # filter with min_freq and max_freq 296 | for w, winfo in words_stats.items(): 297 | # filter too frequent words or rare words 298 | if config['min_freq'] > winfo['df'] or config['max_freq'] < winfo['df']: 299 | config['words_useless'].add(w) 300 | # filter with useless words 301 | print('filter useless words: ', len(config['words_useless'])) 302 | docs_filter_word = [] 303 | for doc in tqdm(docs): 304 | docs_filter_word.append([[w for w in ws if w not in config['words_useless']] for ws in doc]) 305 | return docs_filter_word, config['words_useless'] 306 | 307 | @staticmethod 308 | def doc_filter(dids, docs, config): 309 | new_docs = list() 310 | new_dids = list() 311 | for i in tqdm(range(len(docs))): 312 | if config['min_len'] <= len(docs[i]) <= config['max_len']: 313 | new_docs.append(docs[i]) 314 | new_dids.append(dids[i]) 315 | return new_dids, new_docs 316 | 317 | @staticmethod 318 | def word_stem(docs): 319 | docs = [[Preprocess._stemmer.stem(w) for w in ws] for ws in tqdm(docs)] 320 | return docs 321 | 322 | @staticmethod 323 | def word_stem_2d(docs): 324 | docs_stem = [] 325 | for doc in tqdm(docs): 326 | docs_stem.append([[Preprocess._stemmer.stem(w) for w in ws] for ws in doc]) 327 | return docs_stem 328 | 329 | @staticmethod 330 | def word_lower(docs): 331 | docs = [[w.lower() for w in ws] for ws in tqdm(docs)] 332 | return docs 333 | 334 | @staticmethod 335 | def word_lower_2d(docs): 336 | docs_lower = [] 337 | for doc in tqdm(docs): 338 | docs_lower.append([[w.lower() for w in ws] for ws in doc]) 339 | return docs_lower 340 | 341 | @staticmethod 342 | def build_word_dict(docs): 343 | word_dict = dict() 344 | for ws in docs: 345 | for w in ws: 346 | word_dict.setdefault(w, len(word_dict)) 347 | return word_dict 348 | 349 | @staticmethod 350 | def build_word_dict_2d(docs): 351 | word_dict = dict() 352 | for doc in docs: 353 | for ws in doc: 354 | for w in ws: 355 | word_dict.setdefault(w, len(word_dict)) 356 | return word_dict 357 | 
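    # Illustrative sketch (added for clarity, not in the original code) of how
    # the 2-d dictionary building above and the 2-d indexing defined below fit
    # together on a toy input; the words and ids are hypothetical:
    #
    #   docs = [[['hello', 'world'], ['hi']]]            # 1 text, 2 utterances
    #   word_dict = Preprocess.build_word_dict_2d(docs)  # hello->0, world->1, hi->2
    #   docs_index, _ = Preprocess.word_index_2d(docs, {'word_dict': word_dict})
    #   # docs_index == [[[0, 1], [2]]]; words missing from word_dict are dropped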
358 | @staticmethod 359 | def word_index(docs, config): 360 | if config['word_dict'] is None: 361 | config['word_dict'] = Preprocess.build_word_dict(docs) 362 | docs = [[config['word_dict'][w] for w in ws if w in config['word_dict']] for ws in tqdm(docs)] 363 | return docs, config['word_dict'] 364 | 365 | @staticmethod 366 | def word_index_2d(docs, config): 367 | if config['word_dict'] is None: 368 | config['word_dict'] = Preprocess.build_word_dict_2d(docs) 369 | docs_index = [] 370 | for doc in tqdm(docs): 371 | docs_index.append([[config['word_dict'][w] for w in ws if w in config['word_dict']] for ws in doc]) 372 | return docs_index, config['word_dict'] 373 | 374 | @staticmethod 375 | def save_lines(file_path, lines): 376 | f = open(file_path, 'w') 377 | for line in lines: 378 | line = line.encode('utf8') 379 | f.write(line + "\n") 380 | f.close() 381 | 382 | @staticmethod 383 | def load_lines(file_path): 384 | f = open(file_path, 'r') 385 | lines = f.readlines() 386 | f.close() 387 | return lines 388 | 389 | @staticmethod 390 | def save_dict(file_path, dic, sort=False): 391 | if sort: 392 | dic = sorted(dic.items(), key=lambda d:d[1], reverse=False) 393 | lines = ['%s %s' % (k, v) for k, v in dic] 394 | else: 395 | lines = ['%s %s' % (k, v) for k, v in dic.iteritems()] 396 | Preprocess.save_lines(file_path, lines) 397 | 398 | @staticmethod 399 | def load_dict(file_path): 400 | lines = Preprocess.load_lines(file_path) 401 | dic = dict() 402 | for line in lines: 403 | k, v = line.split() 404 | dic[k] = v 405 | return dic 406 | 407 | def save_words_useless(self, words_useless_fp): 408 | Preprocess.save_lines(words_useless_fp, self._words_useless) 409 | 410 | def load_words_useless(self, words_useless_fp): 411 | self._words_useless = set(Preprocess.load_lines(words_useless_fp)) 412 | 413 | def save_word_dict(self, word_dict_fp, sort=False): 414 | Preprocess.save_dict(word_dict_fp, self._word_dict, sort) 415 | 416 | def load_word_dict(self, word_dict_fp): 417 | self._word_dict = Preprocess.load_dict(word_dict_fp) 418 | 419 | def save_words_stats(self, words_stats_fp, sort=False): 420 | if sort: 421 | word_dic = sorted(self._word_dict.items(), key=lambda d:d[1], reverse=False) 422 | lines = ['%s %d %d %f' % (wid, self._words_stats[w]['cf'], self._words_stats[w]['df'], 423 | self._words_stats[w]['idf']) for w, wid in word_dic] 424 | else: 425 | lines = ['%s %d %d %f' % (wid, self._words_stats[w]['cf'], self._words_stats[w]['df'], 426 | self._words_stats[w]['idf']) for w, wid in self._word_dict.items()] 427 | Preprocess.save_lines(words_stats_fp, lines) 428 | 429 | def load_words_stats(self, words_stats_fp): 430 | lines = Preprocess.load_lines(words_stats_fp) 431 | for line in lines: 432 | wid, cf, df, idf = line.split() 433 | self._words_stats[wid] = {} 434 | self._words_stats[wid]['cf'] = int(cf) 435 | self._words_stats[wid]['df'] = int(df) 436 | self._words_stats[wid]['idf'] = float(idf) 437 | 438 | 439 | class NgramUtil(object): 440 | 441 | def __init__(self): 442 | pass 443 | 444 | @staticmethod 445 | def unigrams(words): 446 | """ 447 | Input: a list of words, e.g., ["I", "am", "Denny"] 448 | Output: a list of unigram 449 | """ 450 | assert type(words) == list 451 | return words 452 | 453 | @staticmethod 454 | def bigrams(words, join_string, skip=0): 455 | """ 456 | Input: a list of words, e.g., ["I", "am", "Denny"] 457 | Output: a list of bigram, e.g., ["I_am", "am_Denny"] 458 | """ 459 | assert type(words) == list 460 | L = len(words) 461 | if L > 1: 462 | lst = [] 463 | for i in 
range(L - 1): 464 | for k in range(1, skip + 2): 465 | if i + k < L: 466 | lst.append(join_string.join([words[i], words[i + k]])) 467 | else: 468 | # set it as unigram 469 | lst = NgramUtil.unigrams(words) 470 | return lst 471 | 472 | @staticmethod 473 | def trigrams(words, join_string, skip=0): 474 | """ 475 | Input: a list of words, e.g., ["I", "am", "Denny"] 476 | Output: a list of trigram, e.g., ["I_am_Denny"] 477 | """ 478 | assert type(words) == list 479 | L = len(words) 480 | if L > 2: 481 | lst = [] 482 | for i in range(L - 2): 483 | for k1 in range(1, skip + 2): 484 | for k2 in range(1, skip + 2): 485 | if i + k1 < L and i + k1 + k2 < L: 486 | lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]])) 487 | else: 488 | # set it as bigram 489 | lst = NgramUtil.bigrams(words, join_string, skip) 490 | return lst 491 | 492 | @staticmethod 493 | def fourgrams(words, join_string): 494 | """ 495 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 496 | Output: a list of trigram, e.g., ["I_am_Denny_boy"] 497 | """ 498 | assert type(words) == list 499 | L = len(words) 500 | if L > 3: 501 | lst = [] 502 | for i in xrange(L - 3): 503 | lst.append(join_string.join([words[i], words[i + 1], words[i + 2], words[i + 3]])) 504 | else: 505 | # set it as trigram 506 | lst = NgramUtil.trigrams(words, join_string) 507 | return lst 508 | 509 | @staticmethod 510 | def uniterms(words): 511 | return NgramUtil.unigrams(words) 512 | 513 | @staticmethod 514 | def biterms(words, join_string): 515 | """ 516 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 517 | Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"] 518 | """ 519 | assert type(words) == list 520 | L = len(words) 521 | if L > 1: 522 | lst = [] 523 | for i in range(L - 1): 524 | for j in range(i + 1, L): 525 | lst.append(join_string.join([words[i], words[j]])) 526 | else: 527 | # set it as uniterm 528 | lst = NgramUtil.uniterms(words) 529 | return lst 530 | 531 | @staticmethod 532 | def triterms(words, join_string): 533 | """ 534 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 535 | Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"] 536 | """ 537 | assert type(words) == list 538 | L = len(words) 539 | if L > 2: 540 | lst = [] 541 | for i in xrange(L - 2): 542 | for j in xrange(i + 1, L - 1): 543 | for k in xrange(j + 1, L): 544 | lst.append(join_string.join([words[i], words[j], words[k]])) 545 | else: 546 | # set it as biterm 547 | lst = NgramUtil.biterms(words, join_string) 548 | return lst 549 | 550 | @staticmethod 551 | def fourterms(words, join_string): 552 | """ 553 | Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"] 554 | Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"] 555 | """ 556 | assert type(words) == list 557 | L = len(words) 558 | if L > 3: 559 | lst = [] 560 | for i in xrange(L - 3): 561 | for j in xrange(i + 1, L - 2): 562 | for k in xrange(j + 1, L - 1): 563 | for l in xrange(k + 1, L): 564 | lst.append(join_string.join([words[i], words[j], words[k], words[l]])) 565 | else: 566 | # set it as triterm 567 | lst = NgramUtil.triterms(words, join_string) 568 | return lst 569 | 570 | @staticmethod 571 | def ngrams(words, ngram, join_string=" "): 572 | """ 573 | wrapper for ngram 574 | """ 575 | if ngram == 1: 576 | return NgramUtil.unigrams(words) 577 | elif ngram == 2: 578 | return NgramUtil.bigrams(words, join_string) 579 
| elif ngram == 3: 580 | return NgramUtil.trigrams(words, join_string) 581 | elif ngram == 4: 582 | return NgramUtil.fourgrams(words, join_string) 583 | elif ngram == 12: 584 | unigram = NgramUtil.unigrams(words) 585 | bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2] 586 | return unigram + bigram 587 | elif ngram == 123: 588 | unigram = NgramUtil.unigrams(words) 589 | bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2] 590 | trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3] 591 | return unigram + bigram + trigram 592 | 593 | @staticmethod 594 | def nterms(words, nterm, join_string=" "): 595 | """wrapper for nterm""" 596 | if nterm == 1: 597 | return NgramUtil.uniterms(words) 598 | elif nterm == 2: 599 | return NgramUtil.biterms(words, join_string) 600 | elif nterm == 3: 601 | return NgramUtil.triterms(words, join_string) 602 | elif nterm == 4: 603 | return NgramUtil.fourterms(words, join_string) 604 | 605 | def cal_hist(t1_rep, t2_rep, qnum, hist_size): 606 | #qnum = len(t1_rep) 607 | mhist = np.zeros((qnum, hist_size), dtype=np.float32) 608 | mm = t1_rep.dot(np.transpose(t2_rep)) 609 | for (i,j), v in np.ndenumerate(mm): 610 | if i >= qnum: 611 | break 612 | vid = int((v + 1.) / 2. * (hist_size - 1.)) 613 | mhist[i][vid] += 1. 614 | mhist += 1. 615 | mhist = np.log10(mhist) 616 | return mhist.flatten() 617 | 618 | def cal_binsum(t1_rep, t2_rep, qnum, bin_num): 619 | mbinsum = np.zeros((qnum, bin_num), dtype=np.float32) 620 | mm = t1_rep.dot(np.transpose(t2_rep)) 621 | for (i, j), v in np.ndenumerate(mm): 622 | if i >= qnum: 623 | break 624 | vid = int((v + 1.) / 2. * (bin_num - 1.)) 625 | mbinsum[i][vid] += v 626 | #mhist += 1. # smooth is not needed for computing bin sum 627 | #mhist = np.log10(mhist) # not needed for computing bin sum 628 | return mbinsum.flatten() 629 | 630 | def _test_ngram(): 631 | words = 'hello, world! hello, deep!' 632 | print(NgramUtil.ngrams(list(words), 3, '')) 633 | 634 | # def _test_hist(): 635 | # embedfile = '../../data/mq2007/embed_wiki-pdc_d50_norm' 636 | # queryfile = '../../data/mq2007/qid_query.txt' 637 | # docfile = '../../data/mq2007/docid_doc.txt' 638 | # relfile = '../../data/mq2007/relation.test.fold5.txt' 639 | # histfile = '../../data/mq2007/relation.test.fold5.hist-30.txt' 640 | # embed_dict = read_embedding(filename = embedfile) 641 | # print('after read embedding ...') 642 | # _PAD_ = 193367 643 | # embed_dict[_PAD_] = np.zeros((50, ), dtype=np.float32) 644 | # embed = np.float32(np.random.uniform(-0.2, 0.2, [193368, 50])) 645 | # embed = convert_embed_2_numpy(embed_dict, embed = embed) 646 | # 647 | # query, _ = read_data(queryfile) 648 | # print('after read query ....') 649 | # doc, _ = read_data(docfile) 650 | # print('after read doc ...') 651 | # rel = read_relation(relfile) 652 | # print('after read relation ... 
') 653 | # fout = open(histfile, 'w') 654 | # for label, d1, d2 in rel: 655 | # assert d1 in query 656 | # assert d2 in doc 657 | # qnum = len(query[d1]) 658 | # d1_embed = embed[query[d1]] 659 | # d2_embed = embed[doc[d2]] 660 | # curr_hist = cal_hist(d1_embed, d2_embed, qnum, 30) 661 | # curr_hist = curr_hist.tolist() 662 | # fout.write(' '.join(map(str, curr_hist))) 663 | # fout.write('\n') 664 | # print(qnum) 665 | # #print(curr_hist) 666 | # fout.close() 667 | 668 | 669 | 670 | if __name__ == '__main__': 671 | #_test_ngram() 672 | # test with sample data 673 | basedir = '../../data/example/ranking/' 674 | prepare = Preparation() 675 | sample_file = basedir + 'sample.txt' 676 | corpus, rels = prepare.run_with_one_corpus(sample_file) 677 | print ('total corpus size', len(corpus)) 678 | print ('total relations size', len(rels)) 679 | prepare.save_corpus(basedir + 'corpus.txt', corpus) 680 | prepare.save_relation(basedir + 'relation.txt', rels) 681 | print ('preparation finished ...') 682 | 683 | print ('begin preprocess...') 684 | # Prerpocess corpus file 685 | preprocessor = Preprocess(min_freq=1) 686 | dids, docs = preprocessor.run(basedir + 'corpus.txt') 687 | preprocessor.save_word_dict(basedir + 'word_dict.txt') 688 | preprocessor.save_words_stats(basedir + 'word_stats.txt') 689 | 690 | fout = open(basedir + 'corpus_preprocessed.txt', 'w') 691 | for inum, did in enumerate(dids): 692 | fout.write('%s\t%s\n' % (did, ' '.join(map(str, docs[inum])))) 693 | fout.close() 694 | print('preprocess finished ...') 695 | 696 | 697 | --------------------------------------------------------------------------------
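Note on cal_hist in IART/utils/preprocess.py (an illustrative, hedged walk-through added here for clarity; the numbers below are a toy case, not taken from the repository): each similarity value v in [-1, 1] is mapped to a bin id with vid = int((v + 1.) / 2. * (hist_size - 1.)), counts are accumulated per query term, smoothed by +1 and passed through log10; cal_binsum follows the same binning but accumulates v itself without smoothing.

    # toy check of the binning, assuming hist_size = 30
    hist_size = 30
    for v in (-1.0, 0.0, 0.5, 1.0):
        vid = int((v + 1.) / 2. * (hist_size - 1.))
        print('%s -> %s' % (v, vid))   # -1.0 -> 0, 0.0 -> 14, 0.5 -> 21, 1.0 -> 29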