├── IART ├── bin │ ├── __init__.py │ ├── __init__.pyc │ ├── test_and_evaluate.pyc │ ├── train_and_evaluate.pyc │ ├── test_and_evaluate.py │ └── train_and_evaluate.py ├── models │ ├── __init__.py │ ├── net.pyc │ ├── __init__.pyc │ ├── iadam_attention.pyc │ ├── net.py │ └── iadam_attention.py ├── utils │ ├── __init__.py │ ├── layers.pyc │ ├── reader.pyc │ ├── __init__.pyc │ ├── evaluation.pyc │ ├── operations.pyc │ ├── douban_evaluation.pyc │ ├── evaluation.py │ ├── douban_evaluation.py │ ├── reader.py │ ├── preparation.py │ ├── operations.py │ ├── layers.py │ └── preprocess.py ├── run.sh ├── conqa │ ├── gen_w2v_filtered_helper.py │ ├── test_word_embedding_pkl.py │ ├── gen_query_all_metrics_helper.py │ ├── gen_query_mz_score_file_from_dam_scores.py │ ├── gen_w2v_mikolov.py │ ├── data_preprocess_dam.py │ ├── gen_user_intent_vector.py │ ├── gen_w2v_filtered.py │ ├── gen_query_all_metrics.py │ └── transfer_mz_to_dam_format.py ├── main_udc.py ├── main_ms_v2.py └── main_conversation_qa.py ├── output └── README.md ├── figures └── iart-model.png ├── data └── README.md └── README.md /IART/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /IART/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /IART/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /output/README.md: -------------------------------------------------------------------------------- 1 | Please store model checkpoints and model output here. -------------------------------------------------------------------------------- /IART/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | CUDA_VISIBLE_DEVICES=1 python main_ms_v2.py 3 | 4 | 5 | -------------------------------------------------------------------------------- /IART/models/net.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/models/net.pyc -------------------------------------------------------------------------------- /IART/bin/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/bin/__init__.pyc -------------------------------------------------------------------------------- /IART/utils/layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/layers.pyc -------------------------------------------------------------------------------- /IART/utils/reader.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/reader.pyc -------------------------------------------------------------------------------- /figures/iart-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/figures/iart-model.png -------------------------------------------------------------------------------- /IART/models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/models/__init__.pyc -------------------------------------------------------------------------------- /IART/utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/__init__.pyc -------------------------------------------------------------------------------- /IART/utils/evaluation.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/evaluation.pyc -------------------------------------------------------------------------------- /IART/utils/operations.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/operations.pyc -------------------------------------------------------------------------------- /IART/bin/test_and_evaluate.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/bin/test_and_evaluate.pyc -------------------------------------------------------------------------------- /IART/bin/train_and_evaluate.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/bin/train_and_evaluate.pyc -------------------------------------------------------------------------------- /IART/models/iadam_attention.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/models/iadam_attention.pyc -------------------------------------------------------------------------------- /IART/utils/douban_evaluation.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/Intent-Aware-Ranking-Transformers/HEAD/IART/utils/douban_evaluation.pyc -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Please download the data from this [Google Drive Folder](https://drive.google.com/drive/folders/1ayXN6pgzxs7DP9iCO6JR-KXbQHPUOLXx?usp=sharing). -------------------------------------------------------------------------------- /IART/conqa/gen_w2v_filtered_helper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | for data in list(['ms_v2']): 5 | for wd in list([200, 300]): 6 | cur_folder = '../../data/' + data +'/' 7 | cmd = 'python gen_w2v_filtered.py ' \ 8 | + cur_folder + 'train_word2vec_mikolov_' + str(wd) +'d.txt ' \ 9 | + cur_folder + 'word_dict.txt ' \ 10 | + cur_folder + 'cut_embed_mikolov_' + str(wd) + 'd.pkl' 11 | print cmd 12 | os.system(cmd) -------------------------------------------------------------------------------- /IART/conqa/test_word_embedding_pkl.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | if len(sys.argv) < 2: 7 | print 'please input params: word_embedding pkl file' 8 | exit(1) 9 | word_embed_file = sys.argv[1] 10 | word_embedding_init = np.array(pickle.load(open(word_embed_file, 'rb'))) 11 | print('shape of word_embedding_init: ', word_embedding_init.shape) 12 | print('init embed vectors of the first 10 words are: ', word_embedding_init[0:10,:]) 13 | 14 | -------------------------------------------------------------------------------- /IART/conqa/gen_query_all_metrics_helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A scripts to help run gen_query_mz_score_file_from_dam_scores.py and 3 | gen_query_all_metrics.py 4 | ''' 5 | 6 | 7 | import os 8 | 9 | dam_score_dict = { 10 | 'ms_v2-iart-att-dot': '../../output/ms_v2/iadam-attention-iadam-att-dot-opd-test-dot-run40/score.test', 11 | 'ms_v2-iart-att-outprod': '../../output/ms_v2/iadam-attention-iadam-att-dot-opd-test-outprod-run40/score.test', 12 | 'udc-iart-att-dot': '../../output/udc/iadam-attention-iadam-att-intentv2-test-dot-intentv2-run44/score.test', 13 | 'udc-iart-att-outprod': '../../output/udc/iadam-attention-iadam-att-intentv2-test-outprod-intentv2-run44/score.test' 14 | } 15 | 16 | for data in ['ms_v2', 'udc']: 17 | for model in ['iart-att-dot', 'iart-att-outprod']: 18 | dam_prediction_score_file = dam_score_dict[data + '-' + model] 19 | cmd = 'python gen_query_mz_score_file_from_dam_scores.py ' + \ 20 | dam_prediction_score_file + ' ' + data 21 | print 'run ', cmd 22 | os.system(cmd) 23 | mz_prediction_file = dam_prediction_score_file + '.mz_score' 24 | cmd = 'python gen_query_all_metrics.py ' + mz_prediction_file 25 | print 'run ', cmd 26 | os.system(cmd) -------------------------------------------------------------------------------- /IART/conqa/gen_query_mz_score_file_from_dam_scores.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Transfer the score files of DAM related models to the format 3 | of MatchZoo score files 4 | The required score file format is as follows: 5 | # Q597901 Q0 D118777 0 2.370282 DMN_CNN 0(ground truth) 6 | # seperated by \t 7 | # qid \t Q0 \t did \t rank \t score \t method \t ground_truth_label 8 | # 030 Q0 ZF08-175-870 0 4238 prise1 9 | # qid iter docno rank sim run_id 10 | # In particular, note that the rank field is ignored here; 11 | # internally ranks are assigned by sorting by the sim field with ties 12 | # broken deterministicly (using docno). 13 | 14 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 15 | ''' 16 | 17 | import sys 18 | 19 | if __name__ == '__main__': 20 | if len(sys.argv) < 3: 21 | print 'please input params: dam_prediction_score_file (absolute path) data_name (udc or ms_v2)' 22 | exit(1) 23 | dam_score_file = sys.argv[1] # path of dam score file 24 | # format of dam score file: score label 25 | # the order is the same with the input data 26 | data_name = sys.argv[2] 27 | relation_file = '../../data/' + data_name + '/relation_test.txt' # use non_fd version to be consistent 28 | with open(dam_score_file) as fin_score, open(relation_file) as fin_rel, \ 29 | open(dam_score_file + '.mz_score', 'w') as fout_score: 30 | for dam_score_line in fin_score: 31 | dam_score_line = dam_score_line.strip().split() 32 | rel_line = fin_rel.readline().strip().split() 33 | label = rel_line[0] 34 | qid = rel_line[1] 35 | did = rel_line[2] 36 | score = dam_score_line[0] 37 | fout_score.write(qid + '\tQ0\t' + did + '\t0\t' + score + '\t' + 38 | dam_score_file + '\t' + label + '\n') 39 | 40 | -------------------------------------------------------------------------------- /IART/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import sys; 2 | from douban_evaluation import mean_average_precision 3 | 4 | def get_p_at_n_in_m(data, n, m, ind): 5 | pos_score = data[ind][0]; 6 | curr = data[ind:ind+m]; 7 | curr = sorted(curr, key = lambda x:x[0], reverse=True) 8 | 9 | if curr[n-1][0] <= pos_score: 10 | return 1; 11 | return 0; 12 | 13 | def get_map(data, m, ind): 14 | curr = data[ind:ind + m]; 15 | sort_data = sorted(curr, key=lambda x: x[0], reverse=True) 16 | m_a_p = mean_average_precision(sort_data) 17 | return m_a_p 18 | 19 | def evaluate(file_path): 20 | data = [] 21 | with open(file_path, 'r') as file: 22 | for line in file: 23 | line = line.strip(); 24 | tokens = line.split("\t") 25 | 26 | if len(tokens) != 2: 27 | continue 28 | 29 | data.append((float(tokens[0]), int(tokens[1]))); 30 | 31 | #assert len(data) % 10 == 0 32 | 33 | p_at_1_in_2 = 0.0 34 | p_at_1_in_10 = 0.0 35 | p_at_2_in_10 = 0.0 36 | p_at_5_in_10 = 0.0 37 | map_sum = 0.0 38 | 39 | length = len(data)/10 # number of queries 40 | print('num of queries: ', length) 41 | 42 | for i in xrange(0, length): 43 | ind = i * 10 # use ind to index the first doc of each query 44 | assert data[ind][1] == 1 45 | 46 | p_at_1_in_2 += get_p_at_n_in_m(data, 1, 2, ind) 47 | p_at_1_in_10 += get_p_at_n_in_m(data, 1, 10, ind) 48 | p_at_2_in_10 += get_p_at_n_in_m(data, 2, 10, ind) 49 | p_at_5_in_10 += get_p_at_n_in_m(data, 5, 10, ind) 50 | map_sum += get_map(data, 10, ind) 51 | # add MAP here for IADAM evaluation 52 | 53 | 54 | 55 | return (p_at_1_in_2/length, p_at_1_in_10/length, p_at_2_in_10/length, 56 | p_at_5_in_10/length, map_sum/length) 57 | 58 | 59 | if __name__ == '__main__': 60 | if 
len(sys.argv) < 2: 61 | print("plean input parameters: score_file") 62 | sys.exit(1) 63 | result = evaluate(sys.argv[1]) 64 | for r in result: 65 | print(r) 66 | # m_line = "\t".join([str(m) for m in result]) 67 | # print('[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t', m_line) 68 | print('[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format( 69 | result[0], result[1], result[2], result[3], result[4])) -------------------------------------------------------------------------------- /IART/main_udc.py: -------------------------------------------------------------------------------- 1 | import models.net as net 2 | import models.iadam_attention as iadam_attention 3 | 4 | import bin.train_and_evaluate as train 5 | 6 | # configure 7 | 8 | # data_small.pkl is the small data for debugging purpose (10K training instances for UDC) 9 | # data.pkl is the whole data (1M training instances for UDC) 10 | 11 | conf = { 12 | "data_name": "udc", 13 | "data_path": "../data/udc/data_small.pkl", # data_small.pkl or data.pkl 14 | "intent_vec_path": "../data/udc/intent_vectors.txt", # path of intent vectors 15 | "intent_size": 12, # dimensions of different intent 16 | "intent_attention_type": "bilinear", # 'dot', 'bilinear', 'outprod' 17 | "intent_ffn_od0": 64, # in iadam-concat ffn 144->64->16 match 576 18 | "intent_ffn_od1": 16, # in iadam-concat ffn 144->64->16 match 576 19 | "intent_loss_weight": 0.2, # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 20 | "model_name": "iadam-attention", # dam, iadam-concat, iadam-attention, iadam-mtl 21 | "save_path": "../output/udc/temp/", 22 | "word_emb_init": None, #"../data/udc/cut_embed_mikolov_200d.pkl", # word_embedding.pkl 23 | "init_model": None, #should be set for test 24 | "rand_seed": None, 25 | "drop_dense": None, 26 | "drop_attention": None, 27 | 28 | "is_mask": True, 29 | "is_layer_norm": True, 30 | "is_positional": False, 31 | 32 | "stack_num": 5, 33 | "attention_type": "dot", 34 | 35 | "learning_rate": 1e-3, 36 | "vocab_size": 429498, 37 | "emb_size": 200, 38 | "batch_size": 128, # for udc/iadam_mtl model, batch_size = 64; others = 128 39 | 40 | "max_turn_num": 9, 41 | "max_turn_len": 50, 42 | 43 | "max_to_keep": 1, 44 | "num_scan_data": 2, # about 16 hours for 2 epoches on udc 45 | "_EOS_": 429498, # 28270, #1 for douban data 46 | "final_n_class": 1, 47 | 48 | "cnn_3d_oc0": 32, 49 | "cnn_3d_oc1": 16 50 | } 51 | 52 | if conf['model_name'] == 'dam': 53 | model = net.Net(conf) # DAM 54 | elif conf['model_name'] == 'iadam-attention': 55 | model = iadam_attention.Net(conf) # IADAM-Attention-V4-2/ IART 56 | else: 57 | raise NameError('model not supported.') 58 | 59 | train.train(conf, model) 60 | 61 | # test and evaluation, init_model in conf should be set 62 | # test.test(conf, model) 63 | -------------------------------------------------------------------------------- /IART/conqa/gen_w2v_mikolov.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Train word embeddings with word2vec tool by mikolov with training data of ms/udc 3 | The script for generate the pretrained word embedding file for DAM model 4 | We directly used the released dataset from sequential-matching-network (Wu et 5 | al., 2017), thus there is no preprocessing phase in our experiments. 6 | 7 | We pre-train word-embeddings using a word2vec toolkit in c++, I can hardly 8 | find the script we used to pre-train word-embeddings. 
Maybe the following 9 | command can help you: 10 | 11 | ./bin/word2vec -train $train_dat -output "$train_dat.w2v" -debug 2 -size 200 \ 12 | -window 10 -sample 1e-4 -negative 25 -hs 0 -binary 1 -cbow 1 -min-count 1 13 | 14 | # default setting cut_embed_mikolov_200d_no_readvocab.txt 15 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 16 | ''' 17 | 18 | import os 19 | import sys 20 | from tqdm import tqdm 21 | 22 | if __name__ == '__main__': 23 | word2vec_path = '/net/home/lyang/PycharmProjects/NLPIRNNMatchZooQA/src-match-zoo-lyang-dev/data/udc/ModelInput/word2vec_mikolov/word2vec/bin/' 24 | 25 | if len(sys.argv) < 3: 26 | print 'please input params: data_name (udc or ms_v2), model_input_folder (folder for corpus.txt)' 27 | exit(1) 28 | data_name = sys.argv[1] # udc or ms_v2 29 | model_input_folder = sys.argv[2] # model_input_folder 30 | corpus_file = model_input_folder + 'corpus.txt' 31 | corpus_text_file = model_input_folder + 'corpus_text.txt' 32 | vocab_file = model_input_folder + 'word_dict.txt' 33 | 34 | print 'generate corpus_file: ', corpus_text_file 35 | # generate corpus_text.txt for training the word vectors by word2vec 36 | with open(corpus_file) as f_in, open(corpus_text_file, 'w') as f_out: 37 | for l in tqdm(f_in): 38 | #print 'l: ', l 39 | f_out.write(' '.join(l.split()[1:])) 40 | 41 | for wd in list([200,300]): 42 | word_embed_file = 'train_word2vec_mikolov_' + str(wd) + 'd.txt' 43 | cmd = word2vec_path + 'word2vec -train ' + corpus_text_file + \ 44 | ' -output ' + model_input_folder + word_embed_file + ' -debug 2 ' \ 45 | '-size ' + str(wd) + ' -window 10 -sample 1e-4 -negative 25 -hs 0 -binary 0 -cbow 1 -min-count 1 ' \ 46 | '-threads 5' 47 | print 'run cmd: ', cmd 48 | os.system(cmd) 49 | 50 | -------------------------------------------------------------------------------- /IART/main_ms_v2.py: -------------------------------------------------------------------------------- 1 | import models.net as net 2 | import models.iadam_attention as iadam_attention 3 | 4 | import bin.train_and_evaluate as train 5 | import bin.test_and_evaluate as test 6 | 7 | # configure 8 | 9 | # data_small.pkl is the small data for debugging purpose (10K training instances) 10 | # data.pkl is the whole data 11 | 12 | conf = { 13 | "data_name": "ms_v2", 14 | "data_path": "../data/ms_v2/data_small.pkl", # data_small.pkl or data.pkl 15 | "intent_vec_path": "../data/ms_v2/intent_vectors.txt", # path of intent vectors 16 | "intent_size": 12, # dimensions of different intent 17 | "intent_attention_type":"bilinear", # in iadam-attention: 'dot', 'bilinear', 'outprod' 18 | "intent_ffn_od0": 128, # in iadam-concat ffn 144->128->64 match 6400 19 | "intent_ffn_od1": 64, # in iadam-concat ffn 144->128->64 match 6400 20 | "intent_loss_weight": 0.2, # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 21 | "model_name": "iadam-attention", # dam, iadam-concat, iadam-attention, iadam-mtl 22 | "save_path": "../output/ms_v2/temp/", 23 | "word_emb_init": None, # "../data/ms_v2/cut_embed_mikolov_200d.pkl", # None (set None during debugging) 24 | "init_model": None, # "../output/ms_v2/iadam-attention-max_turn_len-200-run33/model.ckpt.20", # "../output/ms_v2/dam_default_setting_0412_run29/model.ckpt.36", # Set None for training; Set best ckpt for test 25 | "rand_seed": None, 26 | "drop_dense": None, 27 | "drop_attention": None, 28 | 29 | "is_mask": True, 30 | "is_layer_norm": True, 31 | "is_positional": False, 32 | 33 | "stack_num": 4, 34 | "attention_type": "dot", 35 | 36 | "learning_rate": 
1e-3, 37 | "vocab_size": 167983, 38 | "emb_size": 200, 39 | "batch_size": 32, # for ms_v2/iadam_mtl model, batch_size = 20; others = 32 40 | 41 | "max_turn_num": 6, # 6 is better for ms_v2 42 | "max_turn_len": 200, # default is 180 43 | 44 | "max_to_keep": 1, 45 | "num_scan_data": 5, # about 18 hours for 5 epoches on ms_v2 46 | "_EOS_": 167983, #1 for douban data 47 | "final_n_class": 1, 48 | 49 | "cnn_3d_oc0": 16, 50 | "cnn_3d_oc1": 16 51 | } 52 | 53 | if conf['model_name'] == 'dam': 54 | model = net.Net(conf) # DAM 55 | elif conf['model_name'] == 'iadam-attention': 56 | model = iadam_attention.Net(conf) # IADAM is IART in paper 57 | else: 58 | raise NameError('model not supported.') 59 | 60 | train.train(conf, model) 61 | 62 | # test and evaluation, init_model in conf should be set 63 | # test.test(conf, model) 64 | 65 | -------------------------------------------------------------------------------- /IART/utils/douban_evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from sklearn.metrics import average_precision_score 4 | 5 | def mean_average_precision(sort_data): 6 | #to do 7 | count_1 = 0 8 | sum_precision = 0 9 | for index in range(len(sort_data)): 10 | if sort_data[index][1] == 1: 11 | count_1 += 1 12 | sum_precision += 1.0 * count_1 / (index+1) 13 | return sum_precision / count_1 14 | 15 | def mean_reciprocal_rank(sort_data): 16 | sort_lable = [s_d[1] for s_d in sort_data] 17 | assert 1 in sort_lable 18 | return 1.0 / (1 + sort_lable.index(1)) 19 | 20 | def precision_at_position_1(sort_data): 21 | if sort_data[0][1] == 1: 22 | return 1 23 | else: 24 | return 0 25 | 26 | def recall_at_position_k_in_10(sort_data, k): 27 | sort_lable = [s_d[1] for s_d in sort_data] 28 | select_lable = sort_lable[:k] 29 | return 1.0 * select_lable.count(1) / sort_lable.count(1) 30 | 31 | def evaluation_one_session(data): 32 | sort_data = sorted(data, key=lambda x: x[0], reverse=True) 33 | m_a_p = mean_average_precision(sort_data) 34 | m_r_r = mean_reciprocal_rank(sort_data) 35 | p_1 = precision_at_position_1(sort_data) 36 | r_1 = recall_at_position_k_in_10(sort_data, 1) 37 | r_2 = recall_at_position_k_in_10(sort_data, 2) 38 | r_5 = recall_at_position_k_in_10(sort_data, 5) 39 | return m_a_p, m_r_r, p_1, r_1, r_2, r_5 40 | 41 | def evaluate(file_path): 42 | sum_m_a_p = 0 43 | sum_m_r_r = 0 44 | sum_p_1 = 0 45 | sum_r_1 = 0 46 | sum_r_2 = 0 47 | sum_r_5 = 0 48 | 49 | i = 0 50 | total_num = 0 51 | with open(file_path, 'r') as infile: 52 | for line in infile: 53 | if i % 10 == 0: 54 | data = [] 55 | 56 | tokens = line.strip().split('\t') 57 | data.append((float(tokens[0]), int(tokens[1]))) 58 | 59 | if i % 10 == 9: 60 | total_num += 1 61 | m_a_p, m_r_r, p_1, r_1, r_2, r_5 = evaluation_one_session(data) 62 | sum_m_a_p += m_a_p 63 | sum_m_r_r += m_r_r 64 | sum_p_1 += p_1 65 | sum_r_1 += r_1 66 | sum_r_2 += r_2 67 | sum_r_5 += r_5 68 | 69 | i += 1 70 | 71 | print('total num: %s' %total_num) 72 | print('MAP: %s' %(1.0*sum_m_a_p/total_num)) 73 | print('MRR: %s' %(1.0*sum_m_r_r/total_num)) 74 | print('P@1: %s' %(1.0*sum_p_1/total_num)) 75 | return (1.0*sum_m_a_p/total_num, 1.0*sum_m_r_r/total_num, 1.0*sum_p_1/total_num, 76 | 1.0*sum_r_1/total_num, 1.0*sum_r_2/total_num, 1.0*sum_r_5/total_num) 77 | 78 | if __name__ == '__main__': 79 | result = evaluate(sys.argv[1]) 80 | for r in result: 81 | print(r) 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- 
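Both evaluation utilities above (`utils/evaluation.py` and `utils/douban_evaluation.py`) consume a plain-text score file with one `score<TAB>label` pair per line, where every block of 10 consecutive lines holds the candidates of one conversation context. A minimal usage sketch follows; the toy file name and the `utils` import path are assumptions for illustration, not part of the repository:

```python
# Minimal sketch (not part of the repo): build a toy score file and evaluate it.
# Each block of 10 lines = one context; evaluation.py additionally assumes the
# first line of every block is the positive candidate, so we write it that way.
import random
from utils import evaluation  # assumes running from the IART/ directory (Python 2)

with open('toy_score.test', 'w') as fout:
    for _ in range(5):                      # 5 toy contexts
        for rank in range(10):              # 10 response candidates per context
            label = 1 if rank == 0 else 0   # exactly one positive per block
            fout.write('%.6f\t%d\n' % (random.random(), label))

r2_1, r10_1, r10_2, r10_5, map_ = evaluation.evaluate('toy_score.test')
print('R2@1=%.4f R10@1=%.4f R10@2=%.4f R10@5=%.4f MAP=%.4f'
      % (r2_1, r10_1, r10_2, r10_5, map_))
```

The same file layout is what `bin/test_and_evaluate.py` writes to `score.test`, so the snippet mirrors how the repo scores its own predictions.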
/IART/conqa/data_preprocess_dam.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data preprocess of MS_V2 and UDC data for running DAM model 3 | A good preprocess is very important for good performance 4 | Preprocess UDC for debugging purpose 5 | Preprocess MS_V2 to get results of DAM on MS_V2 6 | The input data format is label \t context (utterances seperated by \t) \t response 7 | 8 | Firstly run data_preprocess_dam.py, then run transfer_mz_to_dam_format.py 9 | 10 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 11 | @homepage: https://sites.google.com/site/lyangwww/ 12 | ''' 13 | 14 | # /bin/python2.7 15 | import os 16 | import sys 17 | 18 | sys.path.append('../utils/') 19 | 20 | from preparation import Preparation 21 | from preprocess import Preprocess, NgramUtil 22 | 23 | def read_dict(infile): 24 | word_dict = {} 25 | for line in open(infile): 26 | r = line.strip().split() 27 | word_dict[r[1]] = r[0] 28 | return word_dict 29 | 30 | if __name__ == '__main__': 31 | if len(sys.argv) < 2: 32 | print 'please input params: data_name (udc or ms_v2)' 33 | exit(1) 34 | data_name = sys.argv[1] # udc or ms_v2 35 | 36 | basedir = '../../data/' + data_name + '/' 37 | 38 | # transform context/response pairs into input pkl file of DAM model 39 | # the input files are train.txt/valid.txt/test.txt 40 | # the format of each line is 'label context response' 41 | prepare = Preparation() 42 | 43 | if data_name == 'udc' or data_name == 'ms_v2': 44 | train_file = 'train.txt' 45 | valid_file = 'valid.txt' 46 | test_file = 'test.txt' 47 | else: 48 | raise ValueError('invalid data name!') 49 | 50 | corpus, rels_train, rels_valid, rels_test = prepare.run_with_train_valid_test_corpus_dmn( 51 | basedir + train_file, basedir + valid_file, 52 | basedir + test_file) 53 | for data_part in list(['train', 'valid', 'test']): 54 | if data_part == 'train': 55 | rels = rels_train 56 | elif data_part == 'valid': 57 | rels = rels_valid 58 | else: 59 | rels = rels_test 60 | print 'total relations in ', data_part, len(rels) 61 | prepare.save_relation(basedir + 'relation_' + data_part + '.txt', 62 | rels) 63 | print 'filter queries with duplicated doc ids...' 64 | prepare.check_filter_query_with_dup_doc( 65 | basedir + 'relation_' + data_part + '.txt') 66 | print 'total corpus ', len(corpus) 67 | prepare.save_corpus_dmn(basedir + 'corpus.txt', corpus, '\t') 68 | print 'preparation finished ...' 69 | 70 | print 'begin preprocess...' 
71 | # Prerpocess corpus file 72 | # Trying not filtering terms by frequency 73 | preprocessor = Preprocess() 74 | dids, docs = preprocessor.run_2d_smn( 75 | basedir+ 'corpus.txt') # docs is [corpus_size, utterance_num, max_text1_len] 76 | preprocessor.save_word_dict(basedir + 'word_dict.txt') 77 | # preprocessor.save_words_df(basedir + 'word_df.txt') 78 | 79 | fout = open(basedir+ 'corpus_preprocessed.txt', 'w') 80 | for inum, did in enumerate(dids): 81 | doc_txt = docs[inum] # 2d list 82 | doc_string = '' 83 | for utt in doc_txt: 84 | for w in utt: 85 | doc_string += str(w) + ' ' 86 | doc_string += '\t' 87 | fout.write('%s\t%s\t%s\n' % ( 88 | did, len(docs[inum]), doc_string)) # id text_len text_ids 89 | fout.close() 90 | print('preprocess finished ...') -------------------------------------------------------------------------------- /IART/conqa/gen_user_intent_vector.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extract the predicted user intent vectors predicted by Chen Qu 3 | Build a dict for user intent vectors 4 | The key is qid_uid or rid (e.g. Q1-0 Q1-1,..., D0, D1, D2,...) 5 | since the context is 2D texts; the response is 1D text 6 | The value is a 12-dimensional intent vector for this context utterance or 7 | response candidate 8 | 9 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 10 | @created on 02/08/2019 11 | ''' 12 | 13 | import sys 14 | import json 15 | import numpy as np 16 | from tqdm import tqdm 17 | import base64 18 | 19 | 20 | def read_dict(infile): 21 | word_dict = {} 22 | for line in open(infile): 23 | r = line.strip().split() 24 | word_dict[r[1]] = r[0] 25 | return word_dict 26 | 27 | 28 | if __name__ == '__main__': 29 | # Extract the 12-dimensional user intent vectors predicted by Chen Qu's 30 | # classifier 31 | if len(sys.argv) < 2: 32 | print ('please input params: data_name (ms_v2 or udc)') 33 | exit(1) 34 | data_name = sys.argv[1] # udc or ms_v2 35 | 36 | basedir = '../../data/' + data_name + '/ModelInput/' 37 | cur_data_dir = basedir + 'dmn_model_input/' 38 | data_name_qc = data_name if data_name != 'ms_v2' else 'ms' 39 | # intent_file_folder = '/mnt/scratch/chenqu/response_intent/output/' 40 | # /mnt/scratch/chenqu/response_intent/udc_v2/output/ 41 | intent_file_folder = '/mnt/scratch/chenqu/response_intent/udc_v2/output/' 42 | 43 | # Extract user intent vectors for each (context_qid, utt_index, candidate_response_id) triple 44 | # in train/valid/test data 45 | intent_dict = {} 46 | for data_part in list(['test', 'train', 'valid']): 47 | # ! Note that here we need to read non-fd version of relation files to be consistant 48 | # ! fd version just filtered queries with duplicated doc ids 49 | # ! the filtering process won't change the qids/ dids 50 | # ! 
thus the keys of qid/did can be used in both filtered version and 51 | # non-filtered version 52 | relation_file = cur_data_dir + 'relation_' + data_part + '.txt' 53 | intent_file = intent_file_folder + data_name_qc + '_' + data_part + '.txt' 54 | with open(relation_file) as fin_relation, open( 55 | intent_file) as fin_intent: 56 | print('preprcess file: ', intent_file) 57 | for rel_line in tqdm(fin_relation): 58 | intent_line = fin_intent.readline() 59 | intent_tokens = intent_line.split('\t') 60 | rel_tokens = rel_line.split() 61 | qid, rid = rel_tokens[1], rel_tokens[2] 62 | # collect intent vectors for context utterances 63 | for i in range(1, len(intent_tokens) - 1): 64 | intent_dict[qid + '-' + str(i - 1)] = intent_tokens[ 65 | i].strip() 66 | # collect intent vectors for response candidates 67 | intent_dict[rid] = intent_tokens[ 68 | len(intent_tokens) - 1].strip() 69 | print('test len of intent_dict: ', len(intent_dict)) 70 | # output to file 71 | intent_file = cur_data_dir + 'intent_vectors_v2.txt' 72 | with open(intent_file, 'w') as fout: 73 | for id in intent_dict: 74 | fout.write(id + '\t' + intent_dict[id] + '\n') 75 | print('intent_dict[0:10]: ', dict(intent_dict.items()[0:10])) 76 | print('done!') 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /IART/conqa/gen_w2v_filtered.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Filter word embddings from pre-trained word embeddings from Mikolov tool 3 | word_embedding_init: a 2-d array with shape [vocab_size+1, emb_size] 4 | there is one dimension in vocab_size which is corresponding to _eos_. 5 | in our preprocessing, _eos_ is always the last dimension 6 | +1 to add one more embedding vector for padding and masking 7 | We add an "all 0" vector in the 0-th row of word_embedding_init in order 8 | to denote the padding word 9 | when call tf.nn.embedding_lookup(), if word_id = 0, then this is a paded 10 | word; if word_id > 0 (from 1 to vocab_size), then this is a real word 11 | 12 | Can double check the relationships between the row index and word ids 13 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 14 | @homepage: https://sites.google.com/site/lyangwww/ 15 | ''' 16 | 17 | import sys 18 | import numpy as np 19 | import pickle 20 | from tqdm import tqdm 21 | 22 | 23 | 24 | w2v_file = open(sys.argv[1]) # pre-trained word embedding file by the mikolov tool 25 | word_dict_file = open(sys.argv[2]) # word_dict file for the vocabulary information 26 | output_file = open(sys.argv[3], 'wb') # the cutted word embedding file with the shape [vocab_size+1, emb_size] 27 | 28 | # In the generated word_embedding.pkl file 29 | # the 0-th row if a "all 0" vector which is corresponding to the padded word 30 | # the 1-vocab_size -th row are word vectors for real words in vocabs 31 | 32 | # word_count, embed_dim = w2v_file.readline().strip().split() 33 | 34 | word_map_w2v = {} 35 | word2id = {} # map word to id 36 | id2word = {} # map id to word 37 | 38 | print 'load word dict ...' 
39 | for line in tqdm(word_dict_file): 40 | line = line.split() 41 | try: 42 | word2id[line[0]] = int(line[1]) 43 | id2word[int(line[1])] = line[0] 44 | except: 45 | print line 46 | continue 47 | 48 | # add one word for _eos_ and _pad_ 49 | # pad_id = 0 50 | eos_id = len(word2id) + 1 51 | 52 | # word2id['_pad_'] = pad_id 53 | word2id['_eos_'] = eos_id 54 | # id2word[pad_id] = '_pad_' 55 | id2word[eos_id] = '_eos_' 56 | vocab_size = len(word2id) 57 | 58 | print 'vocab_size = ', vocab_size 59 | 60 | print 'load word vectors ...' 61 | for line in tqdm(w2v_file): 62 | line = line.split() 63 | if len(line) == 0 or len(line) == 2: # len(len) == 2 for the first line (V WD) 64 | continue 65 | if line[0] in word2id: 66 | word_map_w2v[line[0]] = line[1:] 67 | 68 | emb_size = len(word_map_w2v[word_map_w2v.keys()[0]]) 69 | 70 | print 'emb_size = ', emb_size 71 | 72 | word_diff = list() 73 | for w in word2id.keys(): 74 | if w not in word_map_w2v: 75 | word_diff.append(w) 76 | 77 | # output shared w2v dict 78 | word_embedding_init = np.array(np.zeros((vocab_size+1, emb_size))) # a 2-d array with shape [vocab_size+1, emb_size] 79 | 80 | print 'number of shared word vectors: ', vocab_size-len(word_diff) 81 | 82 | # the first row is an all 0 vector for padding word 83 | 84 | # then add init embedding vectors for real words 85 | 86 | for id in tqdm(range(1, vocab_size+1)): 87 | word = id2word[id] 88 | if word in word_map_w2v: 89 | word_embedding_init[id,:] = [float(s) for s in word_map_w2v[word]] 90 | else: 91 | alpha = 0.5 * (2.0 * np.random.random() - 1.0) 92 | rand_embed = (2.0 * np.random.random_sample([emb_size]) - 1.0) * alpha 93 | rand_embed = ['%.6f' % k for k in rand_embed.tolist()] 94 | word_embedding_init[id, :] = rand_embed 95 | 96 | pickle.dump(word_embedding_init.tolist(), output_file) 97 | 98 | print 'Map word vectors finished ...' 
99 | -------------------------------------------------------------------------------- /IART/bin/test_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import cPickle as pickle 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | import utils.reader as reader 10 | import utils.evaluation as eva 11 | 12 | 13 | def test(conf, _model): 14 | 15 | if not os.path.exists(conf['save_path']): 16 | os.makedirs(conf['save_path']) 17 | 18 | # load data 19 | print('starting loading data') 20 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 21 | train_data, val_data, test_data = pickle.load(open(conf["data_path"], 'rb')) 22 | print('finish loading data') 23 | print('init intent_dict...') 24 | conf['intent_dict'] = reader.read_intent(conf['intent_vec_path']) if conf[ 25 | 'model_name'] != 'dam' else None 26 | test_batches = reader.build_batches(test_data, conf) 27 | print("finish building test batches") 28 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 29 | 30 | # refine conf 31 | test_batch_num = len(test_batches["response"]) 32 | 33 | print('configurations:') 34 | conf_copy = {} 35 | for k in conf: 36 | if k != 'intent_dict': 37 | conf_copy[k] = conf[k] 38 | print(conf_copy) 39 | 40 | _graph = _model.build_graph() 41 | print('build graph sucess') 42 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 43 | 44 | with tf.Session(graph=_graph) as sess: 45 | #_model.init.run(); 46 | _model.saver.restore(sess, conf["init_model"]) 47 | print("sucess init %s" %conf["init_model"]) 48 | 49 | batch_index = 0 50 | step = 0 51 | 52 | score_file_path = conf['save_path'] + 'score.test' 53 | score_file = open(score_file_path, 'w') 54 | attention_file = open(conf['save_path'] + 'attention.test', 'w') 55 | 56 | print('starting test') 57 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 58 | for batch_index in xrange(test_batch_num): 59 | 60 | feed = { 61 | _model.turns: test_batches["turns"][batch_index], 62 | _model.tt_turns_len: test_batches["tt_turns_len"][batch_index], 63 | _model.every_turn_len: test_batches["every_turn_len"][batch_index], 64 | _model.response: test_batches["response"][batch_index], 65 | _model.response_len: test_batches["response_len"][batch_index], 66 | _model.label: test_batches["label"][batch_index] 67 | } 68 | if conf['model_name'] != 'dam': 69 | feed[_model.turns_intent] = \ 70 | test_batches["turns_intent"][batch_index] 71 | feed[_model.response_intent] = \ 72 | test_batches["response_intent"][batch_index] 73 | 74 | scores, attention = sess.run([_model.logits, _model.attention], feed_dict = feed) 75 | # shape of attention [batch, max_turn_num] 76 | # shape of scores [batch] 77 | # also run and print out attention weights to do visualization 78 | #print('print out attention weights over context utterances:', attention) 79 | 80 | # print predicted scores and labels into score file 81 | # print intent aware-attention weights into attention file 82 | for i in xrange(conf["batch_size"]): 83 | score_file.write( 84 | str(scores[i]) + '\t' + 85 | str(test_batches["label"][batch_index][i]) + '\n') 86 | #str(sum(test_batches["every_turn_len"][batch_index][i]) / test_batches['tt_turns_len'][batch_index][i]) + '\t' + 87 | #str(test_batches['tt_turns_len'][batch_index][i]) + '\n') 88 | attention_file.write('\t'.join([str(a) for a in attention[i]]) 89 | + '\n') 90 | score_file.close() 91 | attention_file.close() 
92 | print('finish test') 93 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 94 | 95 | #write evaluation result 96 | result = eva.evaluate(score_file_path) 97 | result_file_path = conf["save_path"] + "result.test" 98 | with open(result_file_path, 'w') as out_file: 99 | for p_at in result: 100 | out_file.write(str(p_at) + '\n') 101 | print('finish evaluation') 102 | # lyang: also print metrics in log file 103 | print('testing_metrics for_model_ckpt:\t{:s}\t[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format( 104 | conf["init_model"], result[0], result[1], result[2], result[3], result[4])) 105 | print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /IART/conqa/gen_query_all_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Generate per-query metrics with trec_eval 3 | to do significance test and case study later 4 | 5 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 6 | ''' 7 | 8 | import os 9 | import sys 10 | import json 11 | import re 12 | from tqdm import tqdm 13 | 14 | def fix_duplicate_doc_id(mz_prediction_file): 15 | ''' 16 | Before going the next steps, we need to fix the "same doc id under the 17 | the same query" problem. Loop the qid under each query, if we find a 18 | duplicated qid, add '-' + dup_times after this did 19 | For example, if D811 is duplidated, we change it to D811-1, D811-2, etc. 20 | ''' 21 | nd_file = mz_prediction_file + '.nd' 22 | dup_times = 1 23 | with open(mz_prediction_file) as f_in, open(nd_file, 'w') as f_out: 24 | cur_qid = 'init' 25 | cache_did_set = set() 26 | cache_q_lines = [] 27 | for l in f_in: 28 | tokens = l.strip().split() 29 | if tokens[0] == cur_qid: 30 | # same qid 31 | if tokens[2] in cache_did_set: 32 | # means we find a duplicate doc id 33 | tokens[2] += ('-' + str(dup_times)) 34 | print('found dup doc_id, gen a new doc_id and qid: ', tokens[2], tokens[0]) 35 | dup_times += 1 36 | cache_did_set.add(tokens[2]) 37 | cache_q_lines.append('\t'.join(tokens) + '\n') # tokens[2] maybe changed 38 | else: 39 | # meet a new qid 40 | f_out.write(''.join(cache_q_lines)) 41 | dup_times = 1 # reset 42 | cache_q_lines = [] 43 | cache_q_lines.append(l) 44 | cache_did_set.clear() 45 | cur_qid = tokens[0] 46 | cache_did_set.add(tokens[2]) 47 | # the last query 48 | # print len(cache_q_lines), len(cache_did_set) 49 | if (len(cache_q_lines) != 0 and len(cache_q_lines) == len( 50 | cache_did_set)): 51 | f_out.write(''.join(cache_q_lines)) 52 | print('write the last query... 
done: ', ''.join(cache_q_lines)) 53 | return nd_file 54 | 55 | if __name__ == '__main__': 56 | 57 | if len(sys.argv) < 2: 58 | print 'please input params: mz_prediction_file (absolute path)' 59 | exit(1) 60 | mz_prediction_file = sys.argv[1] # dmn_cnn_prf_body.predict.test.txt for smn 61 | # dmn_cnn_kd_word-embedsize-500-rid-17-test-iter-250.predict.test.txt for ms 62 | # dmn_cnn_kd_word-embedsize-100-rid-17-test-iter-300.predict.test.txt for udc 63 | # dmn_cnn-pureDMN-contextlen-4-test-iter-500.predict.test.txt for ms 64 | 65 | # DMN for MS_V2 ../data/ms_v2/ModelRes/ms_v2-dmn_cnn_pure-goulburn-weights-best289-04092018-iter-290.predict.test.txt 66 | # DMN-PRF for MS_V2 ../data/ms_v2/ModelRes/ms_v2-dmn_cnn_prf_body-contextlen-10.weights.250.predict.test.txt 67 | # DMN-KD for MS_V2 ../data/ms_v2/ModelRes/ms_v2-dmn_cnn_kd_word-contextlen-6-rid-20.weights.350.predict.test.txt 68 | # SMN for MS_V2 ../data/ms_v2/ModelRes/ms_v2-smn-test.pkl.pred.txt.mz-score-file 69 | 70 | # Before going the next steps, we need to fix the "same doc id under the 71 | # the same query" problem. Loop the qid under each query, if we find a 72 | # duplicated qid, add '-' + dup_times after this did 73 | # For example, if D811 is duplidated, we change it to D811-1, D811-2, etc. 74 | mz_prediction_file = fix_duplicate_doc_id(mz_prediction_file) 75 | 76 | # Q597901 Q0 D118777 0 2.370282 DMN_CNN 0(ground truth) 77 | # seperated by \t 78 | # qid \t Q0 \t did \t rank \t score \t method \t ground_truth_label 79 | # 030 Q0 ZF08-175-870 0 4238 prise1 80 | # qid iter docno rank sim run_id 81 | # In particular, note that the rank field is ignored here; 82 | # internally ranks are assigned by sorting by the sim field with ties 83 | # broken deterministicly (using docno). 84 | with open(mz_prediction_file) as f_in, open(mz_prediction_file + '.score', 'w') as score_out, open(mz_prediction_file + '.qrel', 'w') as qrel_out: 85 | for l in f_in: 86 | to = l.split('\t') 87 | score_out.write(' '.join(to[0:len(to)-1]) + '\n') 88 | qrel_out.write(to[0] + ' Q0 ' + to[2] + ' ' + to[6]) # qid iter docno rel 89 | 90 | # compute per-query metrics with qrel 91 | # use -q to print out metrics for all queries 92 | cmd = '''trec_eval -m 'all_trec' -q ''' + mz_prediction_file + '.qrel ' + mz_prediction_file + '.score > ' \ 93 | + mz_prediction_file + '.metrics' 94 | print 'run ', cmd 95 | os.system(cmd) 96 | 97 | # parse the metrics file to extract the used metrics into a json file 98 | # {'Q101' : {'map':0.876, 'recall5':0.876, 'recall1':0.876, 'recall2':0.876}} 99 | q_metrics_dict = {} 100 | used_metrics = {'map', 'recall_1', 'recall_2', 'recall_5'} 101 | with open(mz_prediction_file + '.metrics') as f_in: 102 | #'\s+' to match 1 to many spaces 103 | for l in tqdm(f_in): 104 | to = re.split('\s+', l) # m_name qid score 105 | if to[0] not in used_metrics: 106 | continue 107 | if to[1] in q_metrics_dict: 108 | q_metrics_dict[to[1]][to[0]] = float(to[2]) 109 | else: 110 | q_metrics_dict[to[1]] = {} 111 | q_metrics_dict[to[1]][to[0]] = float(to[2]) 112 | with open(mz_prediction_file + '.metrics.json', 'w') as outfile: 113 | json.dump(q_metrics_dict, outfile) 114 | 115 | 116 | -------------------------------------------------------------------------------- /IART/conqa/transfer_mz_to_dam_format.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data preprocess of MS_V2 and UDC data for running DAM model 3 | Transfer preprocessed data in MZ format to DAM input format 4 | A good preprocess is very important for 
good performance 5 | Preprocess UDC for debugging purpose 6 | Preprocess MS_V2 to get results of DAM on MS_V2 7 | The input data format is label \t context (utterances seperated by \t) \t response 8 | Add qid/dids in relation files in the pkl file in order to associate the 9 | corresponding intent vectors in the future 10 | 11 | Firstly run data_preprocess_dam.py, then run transfer_mz_to_dam_format.py 12 | 13 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 14 | @homepage: https://sites.google.com/site/lyangwww/ 15 | ''' 16 | 17 | import sys 18 | import pickle 19 | import random 20 | from tqdm import tqdm 21 | 22 | def gen_id2corpus(corpus_pre_file, word_dict_file): 23 | word_dict = dict() 24 | id2corpus = dict() 25 | id2word = dict() 26 | with open(word_dict_file, 'r') as fin: 27 | for l in fin: 28 | tok = l.split() 29 | word_dict[tok[0]] = tok[1].strip() 30 | id2word[int(tok[1].strip())] = tok[0] 31 | # word_id for _eos_ is the maxID+1 in word_dict 32 | word_dict['_eos_'] = str(len(word_dict) + 1) 33 | id2word[len(id2word) + 1] = '_eos_' 34 | print('word id for _eos_: ', word_dict['_eos_']) 35 | #print('len(id2word)', len(id2word)) 36 | with open(corpus_pre_file, 'r') as fin: 37 | for l in tqdm(fin): 38 | tok = l.split('\t') 39 | id = tok[0] 40 | if 'D' in id: 41 | id2corpus[id] = [int(w) for w in tok[2].split()] 42 | else: 43 | utts = [] 44 | for i in range(2, len(tok)): 45 | utts.extend(tok[i].split()) 46 | utts.append(word_dict['_eos_']) 47 | utts = utts[0:len(utts)-2] # remove the last 2 _eos_ 48 | utts = [int(w) for w in utts] 49 | # utts_words = [id2word[w] for w in utts] 50 | # print('test utts_words: ', utts_words) 51 | id2corpus[id] = utts 52 | return id2corpus, word_dict 53 | 54 | def gen_dam_inputs(basedir, data_partition, id2corpus, word_dict, gen_mode, 55 | relation_mode, ag_mode, ag_sample_num): 56 | if relation_mode == 'nofd' \ 57 | or data_partition == 'test' \ 58 | or data_partition == 'valid': # can't filter queries in test/valid 59 | rel_file = basedir + 'relation_' + data_partition + '.txt' # for data_nofd.pkl 60 | elif ag_mode == 'yes': 61 | rel_file = basedir + 'relation_' + data_partition + '.txt.ag' + ag_sample_num # for data_ag2.pkl 62 | else: 63 | rel_file = basedir + 'relation_' + data_partition + '.txt.fd' # for data.pkl 64 | print 'using relation file: ', rel_file 65 | 66 | labels = [] 67 | qids = [] 68 | dids = [] 69 | context = [] 70 | resp = [] 71 | 72 | with open(rel_file) as fin: 73 | ins_num = 0 74 | for l in tqdm(fin): 75 | ins_num += 1 76 | tok = l.strip().split() 77 | labels.append(int(tok[0])) 78 | qids.append(tok[1].strip()) 79 | dids.append(tok[2].strip()) 80 | context.append(id2corpus[tok[1]]) 81 | resp.append(id2corpus[tok[2]]) 82 | if gen_mode == 'small' and ins_num >= 10000: 83 | break 84 | # Add qid/dids in relation files in the pkl file in order to associate the 85 | # corresponding intent vectors in the future 86 | return {'y': labels, 'c': context, 'r': resp, 'qids': qids, 'dids': dids} 87 | 88 | def gen_relation_train_ag_file(basedir, data_name, ag_sample_num): 89 | ''' 90 | Perform data augumentation for training data by ramdonly sampling more 91 | negtive training data 92 | Only need to do this for UDC data 93 | ''' 94 | if data_name != 'udc': 95 | raise NameError('can only do ag for udc!') 96 | rel_train_file = basedir + 'relation_train.txt' 97 | doc_id_pool = set() 98 | with open(rel_train_file) as fin: 99 | for l in fin: 100 | t = l.strip().split() 101 | doc_id_pool.add(t[2]) 102 | # for UDC, each qid has 1 pos did and 
1 neg did 103 | # we further sample k more neg dids for each qid 104 | doc_id_pool = list(doc_id_pool) 105 | total_doc_num = len(doc_id_pool) 106 | print 'test total_doc_num', total_doc_num 107 | with open(rel_train_file) as fin, open( 108 | rel_train_file + '.ag' + ag_sample_num, 'w') as fout: 109 | line_index = -1 110 | for l in tqdm(fin): 111 | t = l.strip().split() 112 | fout.write(l) 113 | line_index += 1 114 | if line_index % 2 == 1: 115 | #print 'test cur line_index and t', line_index, t 116 | sampled_num = 0 117 | while sampled_num < int(ag_sample_num): # transfer to int! 118 | #print 'test sampled_num and ag_sample_num: ', sampled_num, ag_sample_num 119 | pick = random.randint(0,total_doc_num-1) 120 | sdid = doc_id_pool[pick] 121 | #print 'test pick sdid t[2]: ', pick, sdid, t[2] 122 | if sdid != t[2]: 123 | fout.write('0 ' + t[1] + ' ' + sdid + '\n') 124 | #print 'test sampled_num: ', sampled_num 125 | sampled_num += 1 126 | 127 | if __name__ == '__main__': 128 | if len(sys.argv) < 6: 129 | print 'please input params: data_name (udc or ms_v2) \ 130 | gen_mode (full or small) relation_mode(fd or nofd) ag_mode(yes or no) \ 131 | ag_sample_num (2,4,6,8)' 132 | exit(1) 133 | # If gen_mode=small, only use 10000 train/valid/test relations for debug 134 | # If relation_mode=nofd, use original relation files without filtering 135 | # queries with duplicate doc id 136 | # If ag_mode=yes, do data augumentation for training data by sample 137 | # new negative dids from the doc id pool. If the sampled did is already 138 | # covered, do resampling; otherwise add this sampled pair into 139 | # the training data to get a larger training data 140 | # transfer_mz_to_dam_format.py udc full fd yes 2 141 | data_name = sys.argv[1] # udc or ms_v2 142 | gen_mode = sys.argv[2] # full or small 143 | relation_mode = sys.argv[3] # fd or nofd 144 | ag_mode = sys.argv[4] # yes or no 145 | ag_sample_num = sys.argv[5] # 2,4,6,8 146 | basedir = '../../data/' + data_name + '/' 147 | 148 | corpus_pre_file = basedir + 'corpus_preprocessed.txt' 149 | word_dict_file = basedir + 'word_dict.txt' 150 | id2corpus, word_dict = gen_id2corpus(corpus_pre_file, word_dict_file) 151 | 152 | if ag_mode == 'yes': 153 | gen_relation_train_ag_file(basedir, data_name, ag_sample_num) 154 | 155 | # transform context/response pairs into input pkl file of DAM model 156 | train = gen_dam_inputs(basedir, 'train', id2corpus, word_dict, gen_mode, 157 | relation_mode, ag_mode, ag_sample_num) 158 | valid = gen_dam_inputs(basedir, 'valid', id2corpus, word_dict, gen_mode, 159 | relation_mode, 'no', ag_sample_num) # no ag for valid 160 | test = gen_dam_inputs(basedir, 'test', id2corpus, word_dict, gen_mode, 161 | relation_mode, 'no', ag_sample_num) # no ag for test 162 | 163 | if gen_mode == 'small': 164 | data_pkl_name = 'data_small.pkl' 165 | elif relation_mode == 'nofd': 166 | data_pkl_name = 'data_nofd.pkl' 167 | elif ag_mode == 'yes': 168 | data_pkl_name = 'data_ag' + ag_sample_num + '.pkl' 169 | else: 170 | data_pkl_name = 'data.pkl' 171 | print('begin writing data pkl file...', data_pkl_name) 172 | pickle.dump((train,valid,test), open(basedir + data_pkl_name, 'wb')) 173 | print('finish writing data pkl file...', data_pkl_name) 174 | with open(basedir + 'word2id', 'w') as fout: 175 | for w in word_dict: 176 | fout.write(w + '\n') 177 | fout.write(word_dict[w] + '\n') 178 | print('write word_dict done!') 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 2 | 3 | # __IART: Intent-aware Response Ranking with Transformers in Information-seeking Conversation Systems__ 4 | 5 | This is the source code of the IART model (Intent-aware Response Ranking with Transformers), which is proposed for multi-turn response selection in retrieval-based conversation systems. 6 | 7 | IART is built on top of the integration of user intent modeling and 8 | language representation learning with the Transformer architecture, 9 | which relies entirely on a self-attention mechanism instead of recurrent 10 | nets. It incorporates intent-aware utterance attention to derive an 11 | importance weighting scheme over the utterances in the conversation context, with 12 | the aim of better conversation history understanding. IART was 13 | published at WWW 2020. Please see our [paper](https://arxiv.org/pdf/2002.00571.pdf) 14 | for more details of this model. 15 | 16 | ## __Network__ 17 | 18 | IART is built on top of the [DAM](https://github.com/baidu/Dialogue/tree/master/DAM) model. The 19 | model incorporates intent-aware utterance attention to derive the 20 | importance weighting scheme of different context utterances. Given input 21 | context utterances and response candidates, we first generate representations from two different perspectives: user intent representations with a 22 | trained neural classifier and semantic information encoding with Transformers. Then self-attention and cross-attention matching is performed over the 23 | encoded representations from Transformers to extract matching features. 24 | These matching features are weighted by the intent-aware attention mechanism and aggregated into a matching tensor. Finally, a two-layer 3D convolutional neural network distills final representations from 25 | the matching tensor and generates the ranking score for the conversation context/response candidate pair. The main difference between IART and 26 | DAM is that we explicitly define and model user intent in conversations. 27 | We show that the intent-aware attention mechanism can help improve response ranking in conversations. 28 | 29 |
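The attention variant is selected with the `intent_attention_type` option in the training scripts (`dot`, `bilinear`, or `outprod`); the exact TensorFlow implementation is in `models/iadam_attention.py`. The snippet below is only a simplified numpy sketch of the idea, with random stand-ins for the learned parameters, and the `outprod` form in particular is an illustrative assumption rather than the model's exact formulation:

```python
# Simplified numpy illustration of intent-aware utterance attention.
# Random matrices/vectors stand in for parameters learned in the real model;
# see models/iadam_attention.py for the actual TensorFlow implementation.
import numpy as np

def intent_attention(turn_intents, resp_intent, attention_type='bilinear'):
    # turn_intents: [max_turn_num, intent_size], resp_intent: [intent_size]
    if attention_type == 'dot':
        logits = turn_intents.dot(resp_intent)
    elif attention_type == 'bilinear':
        W = 0.1 * np.random.randn(resp_intent.size, resp_intent.size)  # learned in the model
        logits = turn_intents.dot(W).dot(resp_intent)
    elif attention_type == 'outprod':
        v = 0.1 * np.random.randn(resp_intent.size ** 2)               # learned in the model
        outer = np.einsum('ti,j->tij', turn_intents, resp_intent)      # per-turn outer product
        logits = outer.reshape(turn_intents.shape[0], -1).dot(v)
    else:
        raise ValueError('unknown intent_attention_type')
    weights = np.exp(logits - logits.max())
    return weights / weights.sum()  # softmax: one weight per context utterance

att = intent_attention(np.random.rand(6, 12), np.random.rand(12), 'dot')
print(att)  # sums to 1; used to weight the per-utterance matching features
```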
30 | 31 | ![IART model architecture](figures/iart-model.png)
32 | 33 | ## __Usage__ 34 | 35 | First, please download data from this [Google Drive folder](https://drive.google.com/drive/folders/1ayXN6pgzxs7DP9iCO6JR-KXbQHPUOLXx?usp=sharing) and 36 | unzip it: 37 | 38 | ``` 39 | unzip data.zip 40 | ``` 41 | 42 | Train and test the model by: 43 | ``` 44 | sh run.sh 45 | ``` 46 | 47 | ## __Data Preprocessing__ 48 | 49 | The code for data preprocessing is in the folder IART/conqa. The input data format is 50 | 51 | ``` 52 | label \t context (utterances separated by \t) \t response. 53 | ``` 54 | 55 | To generate the pkl data files used for model training/testing, you need to firstly run the script conqa/data_preprocess_dam.py to do data preprocessing. After that you need to run the script conqa/transfer_mz_to_dam_format.py to transfer the preprocessed data in MatchZoo model format to the input data format of DAM/IART model. 56 | 57 | The user intent feature vectors output by the user intent classifier are stored in a separate input text file. The format of the user intent vector should be 58 | 59 | ``` 60 | UtteranceID_or_ResponseID \t intent_vector (12 dimensional real value vector for MS/UDC) 61 | ``` 62 | 63 | The utterance ID is in the format context_query_id-utterance_position. For example: 64 | 65 | ``` 66 | Q494819-0 \t 0.77854055 0.005579375 0.012071793 0.1426655 0.04516567 0.01410749 0.045949493 0.32732058 0.021191863 0.052890584 0.019807862 0.06566066 67 | Q494819-1 \t 0.07301873 0.106960244 0.17646718 0.061297048 0.45176902 0.31784177 0.051918026 0.21009733 0.11450306 0.5177668 0.1812782 0.118079714 68 | Q494819-2 \t 0.20212053 0.1563704 0.16760118 0.102229066 0.356528 0.31535488 0.081880495 0.28093183 0.11076649 0.46562743 0.18301935 0.16115002 69 | D464632 \t 0.0762779 0.0095357755 0.046223667 0.018178586 0.00872361 0.0409651 0.0002578992 0.3250671 4.5165205e-05 0.5883033 0.06031314 0.0021345518 70 | D464633 \t 0.08268682 0.09142208 0.17887363 0.12554155 0.10471701 0.15151422 0.020252429 0.4203747 0.019151673 0.4006856 0.20331828 0.064555936 71 | D464630 \t 0.047641575 0.05804358 0.10583141 0.048851516 0.09105808 0.18841337 0.010175006 0.52949446 0.0072372677 0.38212296 0.13627377 0.037408095 72 | ``` 73 | 74 | Q* denotes context utterances. D* denotes response candidates 75 | The following script can help you generate this vector file 76 | conqa/gen_user_intent_vector.py. You can also modify this script or write your own scripts to transfer the intent feature vectors for your own data sets to this format. 77 | 78 | ## __Further Instructions on Training/Testing IART__ 79 | 80 | The main scripts for training/testing IART model based on UDC and [MSDialog](https://ciir.cs.umass.edu/downloads/msdialog/) data are main_udc.py and main_ms_v2.py respectively. With the right setting on the model configuration, you can start the model training based on [MSDialog](https://ciir.cs.umass.edu/downloads/msdialog/) data by running 81 | 82 | ``` 83 | python main_ms_v2.py 84 | ``` 85 | 86 | * Set intent_vec_path as the path of the intent features vector file you use. 87 | * Set intent_size as the number of different intent (12 for UDC and MS) 88 | * Set intent_attention_type as bilinear for IART-bilinear, as dot for IART-dot, as outprod for IART-outerproduct 89 | * Set model_name as iadam-attention 90 | * Set data_name, data_path, save_path, word_embed_init, vocab_size, embed_size, batch_size, max_turn_num, max_turn_len, _EOS_ according to the specific setting of your own data sets. 
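Putting the settings above together, a typical set of overrides in the `conf` dictionary of main_ms_v2.py might look like the sketch below. Every path is a placeholder for your own data and output layout; the key names match the configuration dictionary already defined in the script:

```python
# Hypothetical conf overrides in main_ms_v2.py for a full training run.
# All paths are placeholders -- point them at your own data/output directories.
conf["data_path"] = "../data/ms_v2/data.pkl"                        # full data instead of data_small.pkl
conf["intent_vec_path"] = "../data/ms_v2/intent_vectors.txt"        # intent feature vectors
conf["intent_size"] = 12                                            # 12 intent classes for MS/UDC
conf["intent_attention_type"] = "bilinear"                          # 'dot', 'bilinear' or 'outprod'
conf["model_name"] = "iadam-attention"                              # IART in the paper
conf["word_emb_init"] = "../data/ms_v2/cut_embed_mikolov_200d.pkl"  # pre-trained word embeddings
conf["save_path"] = "../output/ms_v2/iart-bilinear-run1/"           # hypothetical run directory
```

With these values set, running `python main_ms_v2.py` starts training, and checkpoints and score files should appear under `save_path`.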
91 | 92 | For model testing, you need to additionally set init_model as the path of the best model checkpoint file from your trained model. 93 | 94 | ## __Example Model Training Output__ 95 | 96 | ``` 97 | ..... 98 | [finish building valid batches 99 | 2021-04-17 14:24:46 100 | ('batch_size: ', 32) 101 | ('total number of batches in one epoch: ', 312) 102 | configurations: 103 | {'vocab_size': 167983, 'intent_vec_path': '../data/ms_v2/intent_vectors.txt', 'data_name': 'ms_v2', 'intent_loss_weight': 0.2, 'emb_size': 200, 'is_mask': True, 'train_steps': 1560, 'drop_attention': None, 'word_emb_init': None, 'print_step': 3, 'save_path': '../output/ms_v2/temp/', 'max_turn_num': 6, 'is_positional': False, 'data_path': '../data/ms_v2/data_small.pkl', 'init_model': None, '_EOS_': 167983, 'learning_rate': 0.001, 'intent_attention_type': 'bilinear', 'rand_seed': None, 'drop_dense': None, 'batch_size': 32, 'final_n_class': 1, 'intent_size': 12, 'intent_ffn_od1': 64, 'intent_ffn_od0': 128, 'attention_type': 'dot', 'cnn_3d_oc0': 16, 'cnn_3d_oc1': 16, 'max_turn_len': 200, 'num_scan_data': 5, 'max_to_keep': 1, 'save_step': 31, 'is_layer_norm': True, 'stack_num': 4, 'model_name': 'iadam-attention'} 104 | model sucess 105 | 2021-04-17 14:24:46 106 | ('current turn_index : ', 0) 107 | ('current turn_index : ', 1) 108 | ('current turn_index : ', 2) 109 | ('current turn_index : ', 3) 110 | ('current turn_index : ', 4) 111 | ('current turn_index : ', 5) 112 | [attention_logits] after stack attention_logits.shape: (32, 6) 113 | [attention_mask] attention_mask.shape: (32, 6) 114 | [attention] attention.shape: (32, 6) 115 | [3d cnn aggregation] sim shape: (32, 6, 200, 200, 10) 116 | conv_0 shape: (32, 6, 200, 200, 16) 117 | pooling_0 shape: (32, 2, 67, 67, 16) 118 | conv_1 shape: (32, 2, 67, 67, 16) 119 | pooling_1 shape: (32, 1, 23, 23, 16) 120 | [3d cnn aggregation] final_info: (32, 8464) 121 | build graph sucess 122 | 2021-04-17 14:28:23 123 | 2021-04-17 14:28:23.570120: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA 124 | starting shuffle train data 125 | finish building train data 126 | step: 3 lr: 0.001, epoch: 0 127 | step: 3 processed current epoch: [0.00961538461538] loss: 0.567167798678 128 | step: 6 lr: 0.001, epoch: 0 129 | step: 6 processed current epoch: [0.0192307692308] loss: 0.393668711185 130 | step: 9 lr: 0.001, epoch: 0 131 | step: 9 processed current epoch: [0.0288461538462] loss: 0.467894713084 132 | step: 12 lr: 0.001, epoch: 0 133 | step: 12 processed current epoch: [0.0384615384615] loss: 0.413786088427 134 | step: 15 lr: 0.001, epoch: 0 135 | step: 15 processed current epoch: [0.0480769230769] loss: 0.271514303361]: 136 | ...... 137 | 138 | ``` 139 | 140 | ## __Dependencies__ 141 | 142 | - Python 2.7.3 143 | - Tensorflow == 1.4 144 | 145 | ## __Citation__ 146 | 147 | The following article describes the IART model in detail. 148 | 149 | ``` 150 | @inproceedings{ 151 | title={IART: Intent-aware Response Ranking with Transformers in Information-seeking Conversation Systems}, 152 | author={Liu Yang, Minghui Qiu, Chen Qu, Cen Chen, Jiafeng Guo, Yongfeng Zhang, W. Bruce Croft, Haiqing Chen}, 153 | booktitle={WWW 2020}, 154 | year={2020} 155 | } 156 | ``` 157 | 158 | ## __Acknowledgement__ 159 | 160 | IART is built based on the [DAM model](https://github.com/baidu/Dialogue/tree/master/DAM) released by [Zhou et. al. ACL 2018](https://www.aclweb.org/anthology/P18-1103/). 
We thank the DAM authors for the effort on open sourcing their model code. 161 | 162 | ## __Contact__ 163 | 164 | For help or issues using IART, please submit a GitHub issue. 165 | 166 | For personal communication related to IART, please contact Liu Yang (yangliuyx@gmail.com), Minghui Qiu (minghuiqiu@yeah.net), Chen Qu (quchen0502@gmail.com) or Cen Chen (cecilia.cenchen@gmail.com). 167 | -------------------------------------------------------------------------------- /IART/bin/train_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import cPickle as pickle 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | import utils.reader as reader 10 | import utils.evaluation as eva 11 | 12 | 13 | def train(conf, _model): 14 | 15 | if conf['rand_seed'] is not None: 16 | np.random.seed(conf['rand_seed']) 17 | 18 | if not os.path.exists(conf['save_path']): 19 | os.makedirs(conf['save_path']) 20 | 21 | # load data 22 | print('starting loading data') 23 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 24 | train_data, val_data, test_data = pickle.load(open(conf["data_path"], 'rb')) 25 | print('lyang test: val_data: ', len(val_data), type(val_data), 26 | len(val_data['y'])) 27 | print('lyang test: val_data[y]: ', val_data['y'][0:2]) 28 | print('lyang test: val_data[c]: ', val_data['c'][0:2]) 29 | print('lyang test: val_data[r]: ', val_data['r'][0:2]) 30 | print('lyang test: val_data[qids]: ', val_data['qids'][0:2]) 31 | print('lyang test: val_data[dids]: ', val_data['dids'][0:2]) 32 | print('map id to words ...') 33 | id2word = reader.read_dict('../data/' + conf["data_name"]+ '/word2id') 34 | response_ids = val_data['r'][0:1][0] 35 | context_ids = val_data['c'][0:1][0] 36 | print('lyang test: val_data[c]: ', 37 | [id2word[str(id)] for id in context_ids], val_data.keys()) 38 | print('lyang test: val_data[r]: ', 39 | [id2word[str(id)] for id in response_ids], val_data.keys()) 40 | 41 | print('finish loading data') 42 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 43 | 44 | print('init intent_dict...') 45 | conf['intent_dict'] = reader.read_intent(conf['intent_vec_path']) if conf[ 46 | 'model_name'] != 'dam' else None 47 | print('lyang test len(conf[intent_dict])', len(conf['intent_dict'])) 48 | val_batches = reader.build_batches(val_data, conf) 49 | 50 | # check the example 0 and 1 in batch 0 51 | print('intent of val_batches context: ', val_batches['turns_intent'][0][0:2]) 52 | print('intent of val_batches response: ',val_batches['response_intent'][0][0:2]) 53 | 54 | print("finish building valid batches") 55 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 56 | 57 | # refine conf 58 | batch_num = len(train_data['y']) / conf["batch_size"] 59 | print('batch_size: ', conf["batch_size"]) 60 | print('total number of batches in one epoch: ', batch_num) 61 | val_batch_num = len(val_batches["response"]) 62 | 63 | conf["train_steps"] = conf["num_scan_data"] * batch_num # total number of training steps epoch_num * batch_num 64 | conf["save_step"] = max(1, batch_num / 10) # at most save 10 times 65 | conf["print_step"] = max(1, batch_num / 100) # at most print 100 times 66 | 67 | print('configurations:') 68 | conf_copy = {} 69 | for k in conf: 70 | if k != 'intent_dict': 71 | conf_copy[k] = conf[k] 72 | print(conf_copy) 73 | 74 | print('model sucess') 75 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 76 | 77 | _graph 
= _model.build_graph() 78 | print('build graph sucess') 79 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 80 | 81 | with tf.Session(graph=_graph) as sess: 82 | # train_writer = tf.summary.FileWriter( 83 | # conf["save_path"] + "tensorboard_log/", sess.graph) 84 | # merge = tf.summary.merge_all() # for tensorboard 85 | _model.init.run(); 86 | if conf["init_model"]: 87 | _model.saver.restore(sess, conf["init_model"]) 88 | print("sucess init %s" %conf["init_model"]) 89 | 90 | average_loss = 0.0 91 | batch_index = 0 92 | step = 0 93 | best_result = [0, 0, 0, 0] 94 | 95 | for step_i in xrange(conf["num_scan_data"]): 96 | #for batch_index in rng.permutation(range(batch_num)): 97 | print('starting shuffle train data') 98 | shuffle_train = reader.unison_shuffle(train_data) 99 | train_batches = reader.build_batches(shuffle_train, conf) 100 | print('finish building train data') 101 | for batch_index in range(batch_num): 102 | 103 | feed = { 104 | _model.turns: train_batches["turns"][batch_index], 105 | _model.tt_turns_len: train_batches["tt_turns_len"][batch_index], 106 | _model.every_turn_len: train_batches["every_turn_len"][batch_index], 107 | _model.response: train_batches["response"][batch_index], 108 | _model.response_len: train_batches["response_len"][batch_index], 109 | _model.label: train_batches["label"][batch_index], 110 | } 111 | if conf['model_name'] != 'dam': 112 | feed[_model.turns_intent] = train_batches["turns_intent"][batch_index] 113 | feed[_model.response_intent] = train_batches["response_intent"][batch_index] 114 | 115 | batch_index = (batch_index + 1) % batch_num; 116 | 117 | _, curr_loss = sess.run([_model.g_updates, _model.loss], feed_dict = feed) 118 | # print loss and metrics into tensorboard log 119 | # train_writer.add_summary(summ, global_step=step) 120 | 121 | average_loss += curr_loss 122 | 123 | step += 1 124 | 125 | if step % conf["print_step"] == 0 and step > 0: 126 | g_step, lr = sess.run([_model.global_step, _model.learning_rate]) 127 | print('step: %s lr: %s, epoch: %s ' %(g_step, lr, step_i)) 128 | print("step: " + str(g_step)+ " processed current epoch: [" \ 129 | + str(step * 1.0 / batch_num) + "] loss: " + \ 130 | str(average_loss / conf["print_step"])) 131 | average_loss = 0 132 | 133 | if step % conf["save_step"] == 0 and step > 0: 134 | index = step / conf['save_step'] 135 | score_file_path = conf['save_path'] + 'score.' 
+ str(index) 136 | score_file = open(score_file_path, 'w') 137 | print('save step: %s' %index) 138 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 139 | 140 | for batch_index in xrange(val_batch_num): 141 | 142 | feed = { 143 | _model.turns: val_batches["turns"][batch_index], 144 | _model.tt_turns_len: val_batches["tt_turns_len"][batch_index], 145 | _model.every_turn_len: val_batches["every_turn_len"][batch_index], 146 | _model.response: val_batches["response"][batch_index], 147 | _model.response_len: val_batches["response_len"][batch_index], 148 | _model.label: val_batches["label"][batch_index] 149 | } 150 | if conf['model_name'] != 'dam': 151 | feed[_model.turns_intent] = \ 152 | val_batches["turns_intent"][batch_index] 153 | feed[_model.response_intent] = \ 154 | val_batches["response_intent"][batch_index] 155 | 156 | scores = sess.run(_model.logits, feed_dict = feed) 157 | 158 | for i in xrange(conf["batch_size"]): 159 | score_file.write( 160 | str(scores[i]) + '\t' + 161 | str(val_batches["label"][batch_index][i]) + '\n') 162 | score_file.close() 163 | 164 | #write evaluation result 165 | result = eva.evaluate(score_file_path) 166 | result_file_path = conf["save_path"] + "result." + str(index) 167 | with open(result_file_path, 'w') as out_file: 168 | for m in result: 169 | out_file.write(str(m) + '\n') 170 | print('finish evaluation') 171 | # lyang: also print metrics in log file 172 | print('save step:\t{:d}\t[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format( 173 | index, result[0], result[1], result[2], result[3], result[4])) 174 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 175 | # lyang: also print metrics in tensorboard log file 176 | # metrics = tf.Summary(value=[ 177 | # tf.Summary.Value(tag="R10at1", simple_value=result[1]), 178 | # ]) 179 | # metrics.value.add(tag="MAP", simple_value=result[4]) 180 | # # metrics.value.add(tag="R10at2", simple_value=result[2]) 181 | # # metrics.value.add(tag="R10at5", simple_value=result[3]) 182 | # train_writer.add_summary(metrics, global_step=step) 183 | 184 | if result[1] + result[2] > best_result[1] + best_result[2]: # save model only when find a model better than previously best model 185 | best_result = result 186 | _save_path = _model.saver.save(sess, conf["save_path"] + "model.ckpt." + str(step / conf["save_step"])) 187 | print("succ saving model in " + _save_path) 188 | print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /IART/models/net.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cPickle as pickle 4 | 5 | import utils.layers as layers 6 | import utils.operations as op 7 | 8 | class Net(object): 9 | '''Add positional encoding(initializer lambda is 0), 10 | cross-attention, cnn integrated and grad clip by value. 11 | 12 | Attributes: 13 | conf: a configuration paramaters dict 14 | word_embedding_init: a 2-d array with shape [vocab_size+1, emb_size] 15 | there is one dimension in vocab_size which is corresponding to _eos_. 
16 | in our preprocessing, _eos_ is always the last dimension 17 | +1 to add one more embedding vector for padding and masking 18 | We add an "all 0" vector in the 0-th row of word_embedding_init in order 19 | to denote the padding word 20 | when call tf.nn.embedding_lookup(), if word_id = 0, then this is a paded 21 | word; if word_id > 0 (from 1 to vocab_size), then this is a real word 22 | ''' 23 | def __init__(self, conf): 24 | self._graph = tf.Graph() 25 | self._conf = conf 26 | 27 | if self._conf['word_emb_init'] is not None: 28 | print('loading word emb init') 29 | self._word_embedding_init = pickle.load(open(self._conf['word_emb_init'], 'rb')) 30 | else: 31 | self._word_embedding_init = None 32 | 33 | def build_graph(self): 34 | with self._graph.as_default(): 35 | if self._conf['rand_seed'] is not None: 36 | rand_seed = self._conf['rand_seed'] 37 | tf.set_random_seed(rand_seed) 38 | print('set tf random seed: %s' %self._conf['rand_seed']) 39 | 40 | #word embedding 41 | if self._word_embedding_init is not None: 42 | word_embedding_initializer = tf.constant_initializer(self._word_embedding_init) 43 | else: 44 | word_embedding_initializer = tf.random_normal_initializer(stddev=0.1) 45 | 46 | self._word_embedding = tf.get_variable( 47 | name='word_embedding', 48 | shape=[self._conf['vocab_size']+1, self._conf['emb_size']], 49 | dtype=tf.float32, 50 | initializer=word_embedding_initializer) 51 | 52 | 53 | #define placehloders 54 | self.turns = tf.placeholder( 55 | tf.int32, 56 | shape=[self._conf["batch_size"], self._conf["max_turn_num"], self._conf["max_turn_len"]]) 57 | 58 | self.tt_turns_len = tf.placeholder( 59 | tf.int32, 60 | shape=[self._conf["batch_size"]]) 61 | 62 | self.every_turn_len = tf.placeholder( 63 | tf.int32, 64 | shape=[self._conf["batch_size"], self._conf["max_turn_num"]]) 65 | 66 | self.response = tf.placeholder( 67 | tf.int32, 68 | shape=[self._conf["batch_size"], self._conf["max_turn_len"]]) 69 | 70 | self.response_len = tf.placeholder( 71 | tf.int32, 72 | shape=[self._conf["batch_size"]]) 73 | 74 | self.label = tf.placeholder( 75 | tf.float32, 76 | shape=[self._conf["batch_size"]]) 77 | 78 | 79 | #define operations 80 | #response part 81 | Hr = tf.nn.embedding_lookup(self._word_embedding, self.response) 82 | 83 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 84 | with tf.variable_scope('positional'): 85 | Hr = op.positional_encoding_vector(Hr, max_timescale=10) 86 | Hr_stack = [Hr] 87 | # lyang comments: self attention 88 | for index in range(self._conf['stack_num']): 89 | with tf.variable_scope('self_stack_' + str(index)): 90 | Hr = layers.block( # attentive module 91 | Hr, Hr, Hr, 92 | Q_lengths=self.response_len, K_lengths=self.response_len, 93 | attention_type=self._conf['attention_type']) 94 | Hr_stack.append(Hr) 95 | 96 | 97 | #context part 98 | #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] 99 | list_turn_t = tf.unstack(self.turns, axis=1) 100 | list_turn_length = tf.unstack(self.every_turn_len, axis=1) 101 | 102 | sim_turns = [] 103 | #for every turn_t calculate matching vector 104 | for turn_t, t_turn_length in zip(list_turn_t, list_turn_length): 105 | Hu = tf.nn.embedding_lookup(self._word_embedding, turn_t) #[batch, max_turn_len, emb_size] 106 | 107 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 108 | with tf.variable_scope('positional', reuse=True): 109 | Hu = op.positional_encoding_vector(Hu, max_timescale=10) 110 | Hu_stack = [Hu] 111 | 112 | # lyang comments: self 
attention 113 | for index in range(self._conf['stack_num']): 114 | 115 | with tf.variable_scope('self_stack_' + str(index), reuse=True): 116 | Hu = layers.block( # attentive module 117 | Hu, Hu, Hu, 118 | Q_lengths=t_turn_length, K_lengths=t_turn_length, 119 | attention_type=self._conf['attention_type']) 120 | 121 | Hu_stack.append(Hu) 122 | 123 | # lyang comments: cross attention 124 | r_a_t_stack = [] 125 | t_a_r_stack = [] 126 | # cross attention 127 | for index in range(self._conf['stack_num']+1): 128 | 129 | with tf.variable_scope('t_attend_r_' + str(index)): 130 | try: 131 | t_a_r = layers.block( # attentive module 132 | Hu_stack[index], Hr_stack[index], Hr_stack[index], 133 | Q_lengths=t_turn_length, K_lengths=self.response_len, 134 | attention_type=self._conf['attention_type']) 135 | except ValueError: 136 | tf.get_variable_scope().reuse_variables() 137 | t_a_r = layers.block( 138 | Hu_stack[index], Hr_stack[index], Hr_stack[index], 139 | Q_lengths=t_turn_length, K_lengths=self.response_len, 140 | attention_type=self._conf['attention_type']) 141 | 142 | 143 | with tf.variable_scope('r_attend_t_' + str(index)): 144 | try: 145 | r_a_t = layers.block( # attentive module 146 | Hr_stack[index], Hu_stack[index], Hu_stack[index], 147 | Q_lengths=self.response_len, K_lengths=t_turn_length, 148 | attention_type=self._conf['attention_type']) 149 | except ValueError: 150 | tf.get_variable_scope().reuse_variables() 151 | r_a_t = layers.block( 152 | Hr_stack[index], Hu_stack[index], Hu_stack[index], 153 | Q_lengths=self.response_len, K_lengths=t_turn_length, 154 | attention_type=self._conf['attention_type']) 155 | 156 | t_a_r_stack.append(t_a_r) 157 | r_a_t_stack.append(r_a_t) 158 | 159 | #lyang comments: 3D aggregation 160 | t_a_r_stack.extend(Hu_stack) 161 | r_a_t_stack.extend(Hr_stack) 162 | 163 | t_a_r = tf.stack(t_a_r_stack, axis=-1) 164 | r_a_t = tf.stack(r_a_t_stack, axis=-1) 165 | 166 | 167 | #calculate similarity matrix 168 | with tf.variable_scope('similarity'): 169 | # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] 170 | # divide sqrt(200) to prevent gradient explosion 171 | sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0) 172 | 173 | sim_turns.append(sim) 174 | 175 | 176 | #cnn and aggregation 177 | #lyang comments aggregation by 3D CNN layer 178 | sim = tf.stack(sim_turns, axis=1) 179 | print('sim shape: %s' %sim.shape) 180 | with tf.variable_scope('cnn_aggregation'): 181 | final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'], 182 | self._conf['cnn_3d_oc1']) 183 | #for udc 184 | #final_info = layers.CNN_3d(sim, 32, 16) 185 | #for douban 186 | #final_info = layers.CNN_3d(sim, 16, 16) 187 | 188 | #loss and train 189 | with tf.variable_scope('loss'): 190 | self.loss, self.logits = layers.loss(final_info, self.label) 191 | 192 | self.global_step = tf.Variable(0, trainable=False) 193 | initial_learning_rate = self._conf['learning_rate'] 194 | self.learning_rate = tf.train.exponential_decay( 195 | initial_learning_rate, 196 | global_step=self.global_step, 197 | decay_steps=400, 198 | decay_rate=0.9, 199 | staircase=True) 200 | 201 | Optimizer = tf.train.AdamOptimizer(self.learning_rate) 202 | self.optimizer = Optimizer.minimize( 203 | self.loss, 204 | global_step=self.global_step) 205 | 206 | self.init = tf.global_variables_initializer() 207 | self.saver = tf.train.Saver(max_to_keep = self._conf["max_to_keep"]) 208 | self.all_variables = tf.global_variables() 209 | self.all_operations = self._graph.get_operations() 210 | self.grads_and_vars = 
Optimizer.compute_gradients(self.loss) 211 | 212 | for grad, var in self.grads_and_vars: 213 | if grad is None: 214 | print var 215 | 216 | self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var) for grad, var in self.grads_and_vars] 217 | self.g_updates = Optimizer.apply_gradients( 218 | self.capped_gvs, 219 | global_step=self.global_step) 220 | 221 | return self._graph 222 | 223 | -------------------------------------------------------------------------------- /IART/main_conversation_qa.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import models.net as net 5 | import models.iadam_attention as iadam_attention 6 | 7 | import bin.train_and_evaluate as train 8 | import bin.test_and_evaluate as test 9 | 10 | def main(argv): 11 | # conf_udc and conf_ms are the default settings for udc and ms_v2 12 | conf_udc = { 13 | "data_name": "udc", 14 | "data_path": "../data/udc/data.pkl", # data_small.pkl or data.pkl or data_nofd.pkl 15 | "intent_vec_path": "../data/udc/intent_vectors.txt", # path of intent vectors 16 | "intent_size": 12, # dimensions of different intent 17 | "intent_attention_type": "bilinear", # 'dot', 'bilinear', 'outprod'. default is bilinear 18 | "intent_ffn_od0": 64, # in iadam-concat ffn 144->64->16 match 576 19 | "intent_ffn_od1": 16, # in iadam-concat ffn 144->64->16 match 576 20 | "intent_loss_weight": 0.2, 21 | # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 22 | "model_name": "iadam-concat", # dam, iadam-concat, iadam-attention, iadam-mtl 23 | "save_path": "../output/udc/temp/", 24 | "word_emb_init": "../data/udc/cut_embed_mikolov_200d.pkl", # word_embedding.pkl 25 | "init_model": None, # should be set for test 26 | "rand_seed": None, 27 | "drop_dense": None, 28 | "drop_attention": None, 29 | 30 | "is_mask": True, 31 | "is_layer_norm": True, 32 | "is_positional": False, 33 | 34 | "stack_num": 5, 35 | "attention_type": "dot", 36 | 37 | "learning_rate": 1e-3, 38 | "vocab_size": 429498, 39 | "emb_size": 200, 40 | "batch_size": 128, # for udc/iadam_mtl model, batch_size = 64; others = 128 41 | 42 | "max_turn_num": 9, 43 | "max_turn_len": 50, 44 | 45 | "max_to_keep": 1, 46 | "num_scan_data": 2, # about 16 hours for 2 epoches on udc 47 | "_EOS_": 429498, # 28270, #1 for douban data 48 | "final_n_class": 1, 49 | 50 | "cnn_3d_oc0": 32, 51 | "cnn_3d_oc1": 16 52 | } 53 | 54 | conf_ms = { 55 | "data_name": "ms_v2", 56 | "data_path": "../data/ms_v2/data.pkl", # data_small.pkl or data.pkl or data_nofd.pkl 57 | "intent_vec_path": "../data/ms_v2/intent_vectors.txt", # path of intent vectors 58 | "intent_size": 12, # dimensions of different intent 59 | "intent_attention_type": "bilinear", # 'dot', 'bilinear', 'outprod'. 
default is bilinear 60 | "intent_ffn_od0": 128, # in iadam-concat ffn 144->128->64 match 6400 61 | "intent_ffn_od1": 64, # in iadam-concat ffn 144->128->64 match 6400 62 | "intent_loss_weight": 0.2, 63 | # in iadam-mtl weight for intent loss; 1-weight for the ranking loss 64 | "model_name": "iadam-concat", # dam, iadam-concat, iadam-attention, iadam-mtl 65 | "save_path": "../output/ms_v2/temp/", 66 | "word_emb_init": "../data/ms_v2/cut_embed_mikolov_200d.pkl", # "../data/ms_v2/cut_embed_mikolov_200d.pkl", # None (set None during debugging) 67 | "init_model": None, # "../output/ms_v2/dam_default_setting_0412_run29/model.ckpt.36", #should be set for test 68 | 69 | "rand_seed": None, 70 | 71 | "drop_dense": None, 72 | "drop_attention": None, 73 | 74 | "is_mask": True, 75 | "is_layer_norm": True, 76 | "is_positional": False, 77 | 78 | "stack_num": 4, 79 | "attention_type": "dot", 80 | 81 | "learning_rate": 1e-3, 82 | "vocab_size": 167983, 83 | "emb_size": 200, 84 | "batch_size": 32, # 200 for test 256 85 | 86 | "max_turn_num": 6, # 6 is better for ms_v2 87 | "max_turn_len": 180, 88 | 89 | "max_to_keep": 1, 90 | "num_scan_data": 5, # about 18 hours for 5 epoches on ms_v2 91 | "_EOS_": 167983, # 1 for douban data 92 | "final_n_class": 1, 93 | 94 | "cnn_3d_oc0": 16, 95 | "cnn_3d_oc1": 16 96 | } 97 | 98 | parser = argparse.ArgumentParser() 99 | # python main_conversation_qa.py --help to print the help messages 100 | # sys.argv includes a list of elements starting with the program 101 | # required parameters 102 | parser.add_argument('--phase', default='train', 103 | help='phase: it can be train or predict, the default \ 104 | value is train.', 105 | required=True) 106 | parser.add_argument('--data_name', default='udc', 107 | help='data_name: name of the data. 
it can be udc or \ 108 | ms_v2', required=True) 109 | parser.add_argument('--model_name', default='dam', 110 | help='model_name: name of the model', required=True) 111 | parser.add_argument('--save_path', default='../output/udc/temp/', 112 | help='save_path: output path for model files, score \ 113 | files and result files', required=True) 114 | parser.add_argument('--or_cmd', default=False, 115 | help='or_cmd: whether want to override config \ 116 | parameters by command line parameters', 117 | required=True) 118 | 119 | # optional parameters 120 | parser.add_argument('--intent_vec_path', 121 | help='intent_vec_path: path of intent vectors.') 122 | parser.add_argument('--intent_attention_type', 123 | help='intent_attention_type: type of intent attention.') 124 | parser.add_argument('--intent_ffn_od0', 125 | help='intent_ffn_od0: output dimension 0 in FFN for \ 126 | intent transformation in IADAM-Concat') 127 | parser.add_argument('--intent_ffn_od1', 128 | help='intent_ffn_od1: output dimension 1 in FFN for \ 129 | intent transformation in IADAM-Concat') 130 | parser.add_argument('--intent_loss_weight', 131 | help='intent_loss_weight: weight of intent loss \ 132 | in IADAM-MTL model') 133 | parser.add_argument('--data_path', 134 | help='data_path: path of input data.') 135 | parser.add_argument('--word_emb_init', 136 | help='data_name: path of word embedding file to \ 137 | initialize the word embeddings.') 138 | parser.add_argument('--init_model', 139 | help='init_model: path of the checkpoints of \ 140 | model initialization during testing phase.') 141 | parser.add_argument('--rand_seed', 142 | help='rand_seed: rand seed used in numpy.') 143 | parser.add_argument('--is_positional', 144 | help='is_positional: whether add positional embeddings.') 145 | parser.add_argument('--stack_num', 146 | help='stack_num: stack number in Transformers.') 147 | parser.add_argument('--attention_type', 148 | help='attention_type: attention_type in attentive module \ 149 | in Transformers (dot or bilinear).') # Added in net.py 150 | parser.add_argument('--learning_rate', 151 | help='learning_rate: initial learning rate in \ 152 | exponential decay learning rate.') 153 | parser.add_argument('--vocab_size', 154 | help='vocab_size: vocabulary size.') 155 | parser.add_argument('--emb_size', 156 | help='emb_size: embedding size.') 157 | parser.add_argument('--batch_size', 158 | help='batch_size: batch size.') 159 | parser.add_argument('--max_turn_num', 160 | help='max_turn_num: max number of turns in conversation \ 161 | context.') 162 | parser.add_argument('--max_turn_len', 163 | help='max_turn_len: max length of conversation turns.') 164 | parser.add_argument('--max_to_keep', 165 | help='max_to_keep: max number of checkpoints file to \ 166 | keep.') 167 | parser.add_argument('--num_scan_data', 168 | help='num_scan_data: number of times to scan the data \ 169 | which is also number of epoches.') 170 | parser.add_argument('--eos', 171 | help='eos: word id for _EOS_, which is the seperator \ 172 | between different turns in context') 173 | parser.add_argument('--cnn_3d_oc0', 174 | help='cnn_3d_oc0: out_channels_0 of 3D CNN layer.') 175 | parser.add_argument('--cnn_3d_oc1', 176 | help='cnn_3d_oc1: out_channels_1 of 3D CNN layer.') 177 | 178 | args = parser.parse_args() 179 | # parse the hyper-parameters from the command lines 180 | phase = args.phase 181 | or_cmd = bool(args.or_cmd) 182 | conf = conf_udc if args.data_name == 'udc' else conf_ms 183 | conf['save_path'] = args.save_path 184 | conf['model_name'] = 
args.model_name 185 | 186 | # load settings from the config file 187 | # then update the hyper-parameters in the config files with the settings 188 | # passed from command lines 189 | if or_cmd: 190 | if args.intent_vec_path != None: 191 | conf['intent_vec_path'] = args.intent_vec_path 192 | if args.intent_ffn_od0 != None: 193 | conf['intent_ffn_od0'] = int(args.intent_ffn_od0) 194 | if args.intent_ffn_od1 != None: 195 | conf['intent_ffn_od1'] = int(args.intent_ffn_od1) 196 | if args.intent_attention_type != None: 197 | conf['intent_attention_type'] = args.intent_attention_type 198 | if args.intent_loss_weight != None: 199 | conf['intent_loss_weight'] = float(args.intent_loss_weight) 200 | if args.data_path != None: 201 | conf['data_path'] = args.data_path 202 | if args.word_emb_init != None: 203 | conf['word_emb_init'] = args.word_emb_init 204 | if args.init_model != None: 205 | conf['init_model'] = args.init_model 206 | if args.rand_seed != None: 207 | conf['rand_seed'] = float(args.rand_seed) 208 | if args.is_positional != None: 209 | conf['is_positional'] = args.is_positional 210 | if args.stack_num != None: 211 | conf['stack_num'] = int(args.stack_num) 212 | if args.attention_type != None: 213 | conf['attention_type'] = args.attention_type 214 | if args.learning_rate != None: 215 | conf['learning_rate'] = float(args.learning_rate) 216 | if args.vocab_size != None: 217 | conf['vocab_size'] = int(args.vocab_size) 218 | if args.emb_size != None: 219 | conf['emb_size'] = int(args.emb_size) 220 | if args.batch_size != None: 221 | conf['batch_size'] = int(args.batch_size) 222 | if args.max_turn_num != None: 223 | conf['max_turn_num'] = int(args.max_turn_num) 224 | if args.max_turn_len != None: 225 | conf['max_turn_len'] = int(args.max_turn_len) 226 | if args.max_to_keep != None: 227 | conf['max_to_keep'] = int(args.max_to_keep) 228 | if args.num_scan_data != None: 229 | conf['num_scan_data'] = int(args.num_scan_data) 230 | if args.eos != None: 231 | conf['_EOS_'] = int(args.eos) 232 | if args.cnn_3d_oc0 != None: 233 | conf['cnn_3d_oc0'] = int(args.cnn_3d_oc0) 234 | if args.cnn_3d_oc1 != None: 235 | conf['cnn_3d_oc1'] = int(args.cnn_3d_oc1) 236 | 237 | if conf['model_name'] == 'dam': 238 | model = net.Net(conf) # DAM 239 | elif conf['model_name'] == 'iadam-attention': 240 | model = iadam_attention.Net(conf) # IADAM-Attention-V4-2 (IART) 241 | else: 242 | raise NameError('model not supported.') 243 | 244 | if phase == 'train': 245 | train.train(conf, model) 246 | elif phase == 'predict': 247 | # test and evaluation, init_model in conf should be set 248 | test.test(conf, model) 249 | else: 250 | print 'Phase Error.' 
251 | return 252 | 253 | 254 | if __name__ == '__main__': 255 | main(sys.argv) 256 | 257 | -------------------------------------------------------------------------------- /IART/utils/reader.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | def unison_shuffle(data, seed=None): 6 | if seed is not None: 7 | np.random.seed(seed) 8 | 9 | y = np.array(data['y']) 10 | c = np.array(data['c']) 11 | r = np.array(data['r']) 12 | qids = [] 13 | dids = [] 14 | 15 | assert len(y) == len(c) == len(r) == len(data['qids']) == len(data['dids']) 16 | p = np.random.permutation(len(y)) 17 | # shuffle qids and dids 18 | for i in range(len(y)): 19 | qids.append(data['qids'][p[i]]) 20 | dids.append(data['dids'][p[i]]) 21 | 22 | shuffle_data = {'y': y[p], 'c': c[p], 'r': r[p], 'qids': qids, 'dids': dids} 23 | # print('test after shuffle: ') 24 | # print('y: ', y[p][0:1]) 25 | # print('c: ', c[p][0:1]) 26 | # print('r: ', r[p][0:1]) 27 | # print('qids: ', qids[0:1]) 28 | # print('dids: ', dids[0:1]) 29 | 30 | return shuffle_data 31 | 32 | def split_c(c, split_id): 33 | '''c is a list, example context 34 | split_id is a integer, conf[_EOS_] 35 | return nested list 36 | ''' 37 | turns = [[]] 38 | for _id in c: 39 | if _id != split_id: 40 | turns[-1].append(_id) 41 | else: 42 | turns.append([]) 43 | if turns[-1] == [] and len(turns) > 1: 44 | turns.pop() 45 | return turns 46 | 47 | def normalize_length(_list, length, cut_type='tail'): 48 | '''_list is a list or nested list, example turns/r/single turn c 49 | cut_type is head or tail, if _list len > length is used 50 | return a list len=length and min(read_length, length) 51 | ''' 52 | real_length = len(_list) 53 | # if real_length == 0, pad 0 54 | if real_length == 0: 55 | return [0]*length, 0 56 | 57 | # if real_length <= length, pad 0s 58 | if real_length <= length: 59 | # 1D list 60 | if not isinstance(_list[0], list): 61 | _list.extend([0]*(length - real_length)) 62 | else: # 2D list 63 | _list.extend([[]]*(length - real_length)) 64 | return _list, real_length 65 | 66 | # if real_length > length, cut extra tokens 67 | if cut_type == 'head': 68 | return _list[:length], length 69 | if cut_type == 'tail': 70 | return _list[-length:], length 71 | 72 | def produce_intent(cid, rid, turns_len_all, intent_dict): 73 | r_intent = intent_dict[rid] 74 | c_intent = [] 75 | for i in range(turns_len_all): # loop all turns in context 76 | c_intent.append(intent_dict[cid + '-' + str(i)]) 77 | return c_intent, r_intent 78 | 79 | def produce_one_sample(data, conf, index, split_id, max_turn_num, max_turn_len, turn_cut_type='tail', term_cut_type='tail'): 80 | '''max_turn_num=10 81 | max_turn_len=50 82 | return y, nor_turns_nor_c, nor_r, turn_len, term_len, r_len 83 | ''' 84 | # print('keys of data: ', data.keys()) 85 | c = data['c'][index] 86 | r = data['r'][index][:] 87 | y = data['y'][index] 88 | cid = data['qids'][index] 89 | rid = data['dids'][index] 90 | c_intent = [] 91 | r_intent = [] 92 | 93 | turns = split_c(c, split_id) 94 | turns_len_all = len(turns) # all turns in context before normalization 95 | 96 | if conf['model_name'] != 'dam': 97 | c_intent, r_intent = produce_intent(cid, rid, turns_len_all, conf['intent_dict']) 98 | 99 | #print('test c_intent: ', c_intent) 100 | #normalize turns_c length, nor_turns length is max_turn_num 101 | # cut extra conversation turns 102 | nor_turns, turn_len = normalize_length(turns, max_turn_num, turn_cut_type) 103 | if 
conf['model_name'] != 'dam': 104 | nor_turns_intent, turn_len_intent = normalize_length(c_intent, max_turn_num, turn_cut_type) 105 | # print('test nor_turns_intent, turn_len_intent: ', nor_turns_intent, 106 | # turn_len_intent) 107 | 108 | nor_turns_nor_c = [] 109 | term_len = [] 110 | #nor_turn_nor_c length is max_turn_num, element of a list length is max_turn_len 111 | # cut extra length for context turn text 112 | for c in nor_turns: 113 | #nor_c length is max_turn_len 114 | nor_c, nor_c_len = normalize_length(c, max_turn_len, term_cut_type) 115 | nor_turns_nor_c.append(nor_c) 116 | term_len.append(nor_c_len) 117 | 118 | nor_turns_intent_nor_it = [] 119 | # pad 0s in nor_turns_intent if there are less than max_turn_num turns 120 | if conf['model_name'] != 'dam': 121 | for it in nor_turns_intent: 122 | # nor_it length is intent_size 123 | nor_it, nor_it_len = normalize_length(it, conf['intent_size'], term_cut_type) 124 | nor_turns_intent_nor_it.append(nor_it) 125 | 126 | # cut extra length for response text 127 | nor_r, r_len = normalize_length(r, max_turn_len, term_cut_type) 128 | 129 | return y, nor_turns_nor_c, nor_r, turn_len, term_len, r_len, nor_turns_intent_nor_it, r_intent 130 | 131 | def build_one_batch(data, batch_index, conf, turn_cut_type='tail', term_cut_type='tail'): 132 | _turns = [] 133 | _tt_turns_len = [] 134 | _every_turn_len = [] 135 | _turns_intent = [] 136 | 137 | _response = [] 138 | _response_len = [] 139 | _response_intent = [] 140 | 141 | _label = [] 142 | 143 | for i in range(conf['batch_size']): 144 | # i is to loop instances in the current batch 145 | # index is a global position for this instance 146 | index = batch_index * conf['batch_size'] + i 147 | y, nor_turns_nor_c, nor_r, turn_len, term_len, r_len, c_intent, r_intent = produce_one_sample(data, conf, index, conf['_EOS_'], conf['max_turn_num'], 148 | conf['max_turn_len'], turn_cut_type, term_cut_type) 149 | 150 | _label.append(y) 151 | _turns.append(nor_turns_nor_c) 152 | _response.append(nor_r) 153 | _every_turn_len.append(term_len) 154 | _tt_turns_len.append(turn_len) 155 | _response_len.append(r_len) 156 | if conf['model_name'] != 'dam': 157 | _turns_intent.append(c_intent) 158 | _response_intent.append(r_intent) 159 | 160 | return _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label, _turns_intent, _response_intent 161 | 162 | def build_one_batch_dict(data, batch_index, conf, turn_cut_type='tail', term_cut_type='tail'): 163 | _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label, _turns_intent, _response_intent = build_one_batch(data, batch_index, conf, turn_cut_type, term_cut_type) 164 | ans = {'turns': _turns, 165 | 'tt_turns_len': _tt_turns_len, 166 | 'every_turn_len': _every_turn_len, 167 | 'response': _response, 168 | 'response_len': _response_len, 169 | 'label': _label, "turns_intent": _turns_intent, 170 | "response_intent": _response_intent} 171 | return ans 172 | 173 | def build_batches(data, conf, turn_cut_type='tail', term_cut_type='tail'): 174 | '''Build batches for DAM and IADAM 175 | for DAM, conf['intent_dict'] == None 176 | for IADAM, conf['intent_dict'] != None 177 | In addition to (c,r,y) for each instance, we also look up the corresponding 178 | predicted intent vectors for (c,r) from the intent_dict in O(1) 179 | ''' 180 | _turns_batches = [] 181 | _tt_turns_len_batches = [] 182 | _every_turn_len_batches = [] 183 | _turns_intent_batches = [] 184 | 185 | _response_batches = [] 186 | _response_len_batches = [] 187 | _response_intent_batches = 
[] 188 | 189 | _label_batches = [] 190 | 191 | batch_len = len(data['y'])/conf['batch_size'] 192 | 193 | for batch_index in range(batch_len): 194 | _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label, _turns_intent, _response_intent = build_one_batch(data, batch_index, conf, turn_cut_type='tail', term_cut_type='tail') 195 | 196 | _turns_batches.append(_turns) 197 | _tt_turns_len_batches.append(_tt_turns_len) 198 | _every_turn_len_batches.append(_every_turn_len) 199 | 200 | _response_batches.append(_response) 201 | _response_len_batches.append(_response_len) 202 | 203 | if conf['model_name'] != 'dam': 204 | _turns_intent_batches.append(_turns_intent) 205 | _response_intent_batches.append(_response_intent) 206 | 207 | _label_batches.append(_label) 208 | 209 | ans = { 210 | "turns": _turns_batches, "tt_turns_len": _tt_turns_len_batches, "every_turn_len":_every_turn_len_batches, 211 | "response": _response_batches, "response_len": _response_len_batches, "label": _label_batches, 212 | "turns_intent" : _turns_intent_batches, "response_intent": _response_intent_batches 213 | } 214 | 215 | return ans 216 | 217 | # def build_batches_iadam(data, conf, intent_dict, turn_cut_type='tail', 218 | # term_cut_type='tail'): 219 | # '''Build batches for intent-aware DAM model 220 | # In addition to (c,r,y) for each instance, we also look up the corresponding 221 | # predicted intent vectors for (c,r) from the intent_dict in O(1) 222 | # ''' 223 | # _turns_batches = [] 224 | # _tt_turns_len_batches = [] 225 | # _every_turn_len_batches = [] 226 | # 227 | # _response_batches = [] 228 | # _response_len_batches = [] 229 | # 230 | # _label_batches = [] 231 | # 232 | # batch_len = len(data['y']) / conf['batch_size'] 233 | # print('number of batches in one epoch', batch_len) 234 | # 235 | # for batch_index in range(batch_len): 236 | # # batch_index is to index the batch in the current epoch 237 | # # if batch_size = 50, and there are 500 instances in data 238 | # # than batch_len = 10, batch_index = 0,1,...9 239 | # _turns, _tt_turns_len, _every_turn_len, _response, _response_len, _label = build_one_batch( 240 | # data, batch_index, conf, turn_cut_type='tail', 241 | # term_cut_type='tail') 242 | # 243 | # _turns_batches.append(_turns) 244 | # _tt_turns_len_batches.append(_tt_turns_len) 245 | # _every_turn_len_batches.append(_every_turn_len) 246 | # 247 | # _response_batches.append(_response) 248 | # _response_len_batches.append(_response_len) 249 | # 250 | # _label_batches.append(_label) 251 | # 252 | # ans = { 253 | # "turns": _turns_batches, "tt_turns_len": _tt_turns_len_batches, 254 | # "every_turn_len": _every_turn_len_batches, 255 | # "response": _response_batches, "response_len": _response_len_batches, 256 | # "label": _label_batches 257 | # } 258 | # 259 | # return ans 260 | 261 | def read_dict(word2id_file): 262 | id2word_dict = dict() 263 | with open(word2id_file) as fin: 264 | lines = fin.readlines() 265 | for index in range(0,len(lines)-1, 2): 266 | id2word_dict[lines[index+1].strip()] = lines[index].strip() 267 | print('vocab size: ', len(id2word_dict)) 268 | return id2word_dict 269 | 270 | 271 | # read intent_vectors.txt for intent vectors in DMN_INTENT model 272 | def read_intent(filename): 273 | intent_dict = {} 274 | print('read intent vectors...') 275 | with open(filename) as fin: 276 | for l in tqdm(fin): 277 | to = l.strip().split('\t') 278 | intent_dict[to[0]] = [float(x) for x in 279 | to[1].split()] # str to float 280 | return intent_dict 281 | 282 | if __name__ == 
'__main__': 283 | # conf = { 284 | # "batch_size": 256, 285 | # "max_turn_num": 10, 286 | # "max_turn_len": 50, 287 | # "_EOS_": 28270, 288 | # } 289 | # train, val, test = pickle.load(open('../../data/data_small.pkl', 'rb')) 290 | # print('load data success') 291 | # 292 | # train_batches = build_batches(train, conf) 293 | # val_batches = build_batches(val, conf) 294 | # test_batches = build_batches(test, conf) 295 | # print('build batches success') 296 | # 297 | # pickle.dump([train_batches, val_batches, test_batches], open('../../data/batches_small.pkl', 'wb')) 298 | # print('dump success') 299 | word2id_file = '../../data/ubuntu/word2id' 300 | id2word_dict = read_dict(word2id_file) 301 | print(dict(id2word_dict.items()[0:5])) 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /IART/utils/preparation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import print_function 5 | import sys 6 | import os 7 | import numpy as np 8 | import hashlib 9 | import random 10 | 11 | import preprocess 12 | 13 | 14 | class Preparation(object): 15 | '''Convert dataset of different text matching tasks into a unified format as the input of deep matching modules. Users provide datasets contain pairs of texts along with their labels, and the module produces the following files: 16 | * Word Dictionary: this file records the mapping from each word to a unique identifier. 17 | * Corpus File: this file records the mapping from each text to a unique identifiers, along with a sequence of word identifiers contained in text. 18 | * Relation File: this file records the relationship between two texts, each line containing the label and a pair of ids. 19 | ''' 20 | 21 | def __init__(self): 22 | pass 23 | 24 | def get_text_id(self, hashid, text, idtag='T'): 25 | hash_obj = hashlib.sha1(text.encode('utf8')) # if the text are the same, then the hash_code are also the same 26 | hex_dig = hash_obj.hexdigest() 27 | if hex_dig in hashid: 28 | return hashid[hex_dig] 29 | else: 30 | tid = idtag + str(len(hashid)) # start from 0, 1, 2, ... 
31 | hashid[hex_dig] = tid 32 | return tid 33 | 34 | def parse_line(self, line, delimiter='\t'): 35 | subs = line.split(delimiter) 36 | # print('subs: ', len(subs)) 37 | if 3 != len(subs): 38 | raise ValueError('format of data file wrong, should be \'label,text1,text2\'.') 39 | else: 40 | return subs[0], subs[1], subs[2] 41 | 42 | def parse_line_dmn(self, line, delimiter='\t'): 43 | subs = line.split(delimiter) 44 | # print('subs: ', len(subs)) 45 | if len(subs) < 3: 46 | raise ValueError('format of data file wrong, should be \'label,text1(mulitple utterances septerated by tab),text2\'.') 47 | else: 48 | return subs 49 | 50 | def run_with_one_corpus(self, file_path): 51 | hashid = {} 52 | corpus = {} 53 | rels = [] 54 | f = open(file_path, 'r') 55 | for line in f: 56 | line = line.decode('utf8') 57 | line = line.strip() 58 | label, t1, t2 = self.parse_line(line) 59 | id1 = self.get_text_id(hashid, t1, 'T') 60 | id2 = self.get_text_id(hashid, t2, 'T') 61 | corpus[id1] = t1 62 | corpus[id2] = t2 63 | rels.append((label, id1, id2)) 64 | f.close() 65 | return corpus, rels 66 | 67 | def run_with_two_corpus(self, file_path): 68 | hashid_q = {} 69 | hashid_d = {} 70 | corpus_q = {} 71 | corpus_d = {} 72 | rels = [] 73 | f = open(file_path, 'r') 74 | for line in f: 75 | line = line.decode('utf8') 76 | line = line.strip() 77 | label, t1, t2 = self.parse_line(line) 78 | id1 = self.get_text_id(hashid_q, t1, 'Q') 79 | id2 = self.get_text_id(hashid_d, t2, 'D') 80 | corpus_q[id1] = t1 81 | corpus_d[id2] = t2 82 | rels.append((label, id1, id2)) 83 | f.close() 84 | return corpus_q, corpus_d, rels 85 | 86 | def run_with_train_valid_test_corpus(self, train_file, valid_file, test_file): 87 | ''' 88 | Run with pre-splited train_file, valid_file, test_file 89 | The input format should be label \t text1 \t text2 90 | The query ids can't be duplicated. For the same query 91 | id, the document ids can't be duplicated. 
92 | Note that if we make queries with unique id (fixed 10 candidates for a single query), then it is 93 | possible that multiple queries have different query ids, but with the same text (in rare cases) 94 | :param train_file: train file 95 | :param valid_file: valid file 96 | :param test_file: test file 97 | :return: corpus, rels_train, rels_valid, rels_test 98 | ''' 99 | hashid = {} 100 | corpus = {} 101 | rels = [] 102 | rels_train = [] 103 | rels_valid = [] 104 | rels_test = [] 105 | # merge corpus files, but return rels for train/valid/test seperately 106 | curQ = 'init' 107 | curQid = 0 108 | for file_path in list([train_file, valid_file, test_file]): 109 | if file_path == train_file: 110 | rels = rels_train 111 | elif file_path == valid_file: 112 | rels = rels_valid 113 | if file_path == test_file: 114 | rels = rels_test 115 | f = open(file_path, 'r') 116 | for line in f: 117 | line = line.decode('utf8') 118 | line = line.strip() 119 | label, t1, t2 = self.parse_line(line) 120 | id2 = self.get_text_id(hashid, t2, 'D') 121 | # generate unique query ids 122 | if t1 == curQ: 123 | # same query 124 | id1 = 'Q' + str(curQid) 125 | else: 126 | # new query 127 | curQid += 1 128 | id1 = 'Q' + str(curQid) 129 | curQ = t1 130 | corpus[id1] = t1 131 | corpus[id2] = t2 132 | rels.append((label, id1, id2)) 133 | f.close() 134 | return corpus, rels_train, rels_valid, rels_test 135 | 136 | def run_with_train_valid_test_corpus_dmn(self, train_file, valid_file, test_file): 137 | ''' 138 | Run with pre-splited train_file, valid_file, test_file for dmn model for conversation response ranking 139 | The input format should be label \t text1 (conversation context utterances seperated by \t) \t text2 140 | The query ids can't be duplicated. For the same query 141 | id, the document ids can't be duplicated. 
142 | Note that if we make queries with unique id (fixed 10 candidates for a single query), then it is 143 | possible that multiple queries have different query ids, but with the same text (in rare cases) 144 | :param train_file: train file 145 | :param valid_file: valid file 146 | :param test_file: test file 147 | :return: corpus, rels_train, rels_valid, rels_test 148 | ''' 149 | hashid = {} 150 | corpus = {} 151 | rels = [] 152 | rels_train = [] 153 | rels_valid = [] 154 | rels_test = [] 155 | # merge corpus files, but return rels for train/valid/test seperately 156 | curQ = 'init' 157 | curQid = 0 158 | for file_path in list([train_file, valid_file, test_file]): 159 | if file_path == train_file: 160 | rels = rels_train 161 | elif file_path == valid_file: 162 | rels = rels_valid 163 | if file_path == test_file: 164 | rels = rels_test 165 | f = open(file_path, 'r') 166 | for line in f: 167 | line = line.decode('utf8') 168 | line = line.strip() 169 | subs = self.parse_line_dmn(line) 170 | label = subs[0] 171 | t1 = '\t'.join(subs[1:-1]) 172 | t2 = subs[-1] 173 | id2 = self.get_text_id(hashid, t2, 'D') 174 | # generate unique query ids 175 | if t1 == curQ: 176 | # same query 177 | id1 = 'Q' + str(curQid) 178 | else: 179 | # new query 180 | curQid += 1 181 | id1 = 'Q' + str(curQid) 182 | curQ = t1 183 | corpus[id1] = t1 184 | corpus[id2] = t2 185 | rels.append((label, id1, id2)) 186 | f.close() 187 | return corpus, rels_train, rels_valid, rels_test 188 | 189 | @staticmethod 190 | def save_corpus(file_path, corpus): 191 | f = open(file_path, 'w') 192 | for qid, text in corpus.items(): 193 | f.write('%s %s\n' % (qid, text.encode('utf8'))) 194 | f.close() 195 | 196 | @staticmethod 197 | def save_corpus_dmn(file_path, corpus, delim='\t'): 198 | f = open(file_path, 'w') 199 | for qid, text in corpus.items(): 200 | f.write('%s%s%s\n' % (qid, delim, text.encode('utf8'))) 201 | f.close() 202 | 203 | @staticmethod 204 | def merge_corpus(train_corpus, valid_corpus, test_corpus): 205 | # cat train valid test > corpus.txt 206 | # cat corpus_train.txt corpus_valid.txt corpus_test.txt > corpus.txt 207 | os.system('cat ' + train_corpus + ' ' + valid_corpus + ' ' + test_corpus + ' > corpus.txt') 208 | 209 | @staticmethod 210 | def save_relation(file_path, relations): 211 | f = open(file_path, 'w') 212 | for rel in relations: 213 | f.write('%s %s %s\n' % (rel)) 214 | f.close() 215 | 216 | @staticmethod 217 | def check_filter_query_with_dup_doc(input_file): 218 | ''' Filter queries with duplicated doc ids in the relation files 219 | :param input_file: input file, which could be the relation file for train/valid/test data 220 | The format is "label qid did" 221 | :return: 222 | ''' 223 | with open(input_file) as f_in, open(input_file + '.fd', 'w') as f_out: 224 | cur_qid = 'init' 225 | cache_did_set = set() 226 | cache_q_lines = [] 227 | found_dup_doc = False 228 | for l in f_in: 229 | tokens = l.split() 230 | if tokens[1] == cur_qid: 231 | # same qid 232 | cache_q_lines.append(l) 233 | if tokens[2] in cache_did_set: 234 | found_dup_doc = True 235 | else: 236 | cache_did_set.add(tokens[2]) 237 | else: 238 | # new qid 239 | if not found_dup_doc: 240 | f_out.write(''.join(cache_q_lines)) 241 | else: 242 | print 243 | 'found qid with duplicated doc id/text: ', ''.join(cache_q_lines) 244 | print 245 | 'filtered... 
continue' 246 | cache_q_lines = [] 247 | cache_q_lines.append(l) 248 | found_dup_doc = False 249 | cache_did_set.clear() 250 | cur_qid = tokens[1] 251 | cache_did_set.add(tokens[2]) 252 | # the last query 253 | # print len(cache_q_lines), len(cache_did_set) 254 | if (len(cache_q_lines) != 0 and len(cache_q_lines) == len(cache_did_set)): 255 | f_out.write(''.join(cache_q_lines)) 256 | print 257 | 'write the last query... done: ', ''.join(cache_q_lines) 258 | 259 | @staticmethod 260 | def split_train_valid_test(relations, ratio=[0.8, 0.1, 0.1]): 261 | random.shuffle(relations) 262 | total_rel = len(relations) 263 | num_train = int(total_rel * ratio[0]) 264 | num_valid = int(total_rel * ratio[1]) 265 | valid_end = num_train + num_valid 266 | rel_train = relations[: num_train] 267 | rel_valid = relations[num_train: valid_end] 268 | rel_test = relations[valid_end:] 269 | return rel_train, rel_valid, rel_test 270 | 271 | @staticmethod 272 | def split_train_valid_test_for_ranking(relations, ratio=[0.8, 0.1, 0.1]): 273 | qid_group = set() 274 | for r, q, d in relations: 275 | qid_group.add(q) 276 | qid_group = list(qid_group) 277 | 278 | random.shuffle(qid_group) 279 | total_rel = len(qid_group) 280 | num_train = int(total_rel * ratio[0]) 281 | num_valid = int(total_rel * ratio[1]) 282 | valid_end = num_train + num_valid 283 | 284 | qid_train = qid_group[: num_train] 285 | qid_valid = qid_group[num_train: valid_end] 286 | qid_test = qid_group[valid_end:] 287 | 288 | def select_rel_by_qids(qids): 289 | rels = [] 290 | qids = set(qids) 291 | for r, q, d in relations: 292 | if q in qids: 293 | rels.append((r, q, d)) 294 | return rels 295 | 296 | rel_train = select_rel_by_qids(qid_train) 297 | rel_valid = select_rel_by_qids(qid_valid) 298 | rel_test = select_rel_by_qids(qid_test) 299 | 300 | return rel_train, rel_valid, rel_test 301 | 302 | 303 | if __name__ == '__main__': 304 | prepare = Preparation() 305 | basedir = '../../data/example/ranking/' 306 | corpus, rels = prepare.run_with_one_corpus(basedir + 'sample.txt') 307 | print('total corpus : %d ...' % (len(corpus))) 308 | print('total relations : %d ...' % (len(rels))) 309 | prepare.save_corpus(basedir + 'corpus.txt', corpus) 310 | 311 | rel_train, rel_valid, rel_test = prepare.split_train_valid_test(rels, [0.8, 0.1, 0.1]) 312 | prepare.save_relation(basedir + 'relation_train.txt', rel_train) 313 | prepare.save_relation(basedir + 'relation_valid.txt', rel_valid) 314 | prepare.save_relation(basedir + 'relation_test.txt', rel_test) 315 | print('Done ...') 316 | -------------------------------------------------------------------------------- /IART/utils/operations.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from scipy.stats import multivariate_normal 4 | import tensorflow as tf 5 | 6 | def learning_rate(step_num, d_model=512, warmup_steps=4000): 7 | a = step_num**(-0.5) 8 | b = step_num*warmup_steps**(-1.5) 9 | return a, b, d_model**(-0.5) * min(step_num**(-0.5), step_num*(warmup_steps**(-1.5))) 10 | 11 | def selu(x): 12 | alpha = 1.6732632423543772848170429916717 13 | scale = 1.0507009873554804934193349852946 14 | print('use selu') 15 | return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x)) 16 | 17 | def bilinear_sim_4d(x, y, is_nor=True): 18 | '''calulate bilinear similarity with two 4d tensor. 
19 | 20 | Args: 21 | x: a tensor with shape [batch, time_x, dimension_x, num_stacks] 22 | y: a tensor with shape [batch, time_y, dimension_y, num_stacks] 23 | 24 | Returns: 25 | a tensor with shape [batch, time_x, time_y, num_stacks] 26 | 27 | Raises: 28 | ValueError: if 29 | the shapes of x and y are not match; 30 | bilinear matrix reuse error. 31 | ''' 32 | M = tf.get_variable( 33 | name="bilinear_matrix", 34 | shape=[x.shape[2], y.shape[2], x.shape[3]], 35 | dtype=tf.float32, 36 | initializer=tf.orthogonal_initializer()) 37 | sim = tf.einsum('biks,kls,bjls->bijs', x, M, y) 38 | 39 | if is_nor: 40 | scale = tf.sqrt(tf.cast(x.shape[2] * y.shape[2], tf.float32)) 41 | scale = tf.maximum(1.0, scale) 42 | return sim / scale 43 | else: 44 | return sim 45 | 46 | def dot_sim_2d(x, y): 47 | ''' 48 | calculate dot similarity with two tensor in 2D for intent attention 49 | ''' 50 | M = tf.get_variable( 51 | name="dot_attention_matrix", 52 | shape=[x.shape[-1]*2], # concate x and y 53 | dtype=tf.float32, 54 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 55 | con = tf.concat([x,y], 1) 56 | sim = tf.einsum('bi,i->b', con, M) 57 | return sim 58 | 59 | def bilinear_sim_2d(x, y): 60 | ''' 61 | calculate bilinear similarity with two tensor in 2D for intent attention 62 | ''' 63 | M = tf.get_variable( 64 | name="bilinear_matrix", 65 | shape=[x.shape[-1], y.shape[-1]], 66 | dtype=tf.float32, 67 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 68 | sim = tf.einsum('bi,ij,jb->b', x, M, tf.transpose(y)) 69 | return sim 70 | 71 | def outprod_sim_2d(x, y): 72 | ''' 73 | calculate outprod similarity with two tensor in 2D for intent attention 74 | ''' 75 | M = tf.get_variable( 76 | name="outproduct_matrix", 77 | shape=[x.shape[-1]*y.shape[-1]], 78 | dtype=tf.float32, 79 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 80 | sim = tf.einsum('bi,bj->bij', x, y) 81 | flat = tf.reshape(sim, [x.shape[0],-1]) # [batch, intent_size*intent_size] 82 | sim = tf.einsum('bi,i->b', flat, M) 83 | return sim 84 | 85 | def bilinear_sim(x, y, is_nor=True): 86 | '''calculate bilinear similarity with two tensor. 87 | Args: 88 | x: a tensor with shape [batch, time_x, dimension_x] 89 | y: a tensor with shape [batch, time_y, dimension_y] 90 | 91 | Returns: 92 | a tensor with shape [batch, time_x, time_y] 93 | Raises: 94 | ValueError: if 95 | the shapes of x and y are not match; 96 | bilinear matrix reuse error. 97 | ''' 98 | M = tf.get_variable( 99 | name="bilinear_matrix", 100 | shape=[x.shape[-1], y.shape[-1]], 101 | dtype=tf.float32, 102 | initializer=tf.orthogonal_initializer()) 103 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y) 104 | 105 | if is_nor: 106 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32)) 107 | scale = tf.maximum(1.0, scale) 108 | return sim / scale 109 | else: 110 | return sim 111 | 112 | def dot_sim(x, y, is_nor=True): 113 | '''calculate dot similarity with two tensor. 114 | 115 | Args: 116 | x: a tensor with shape [batch, time_x, dimension] 117 | y: a tensor with shape [batch, time_y, dimension] 118 | 119 | Returns: 120 | a tensor with shape [batch, time_x, time_y] 121 | Raises: 122 | AssertionError: if 123 | the shapes of x and y are not match. 
124 | ''' 125 | assert x.shape[-1] == y.shape[-1] 126 | 127 | sim = tf.einsum('bik,bjk->bij', x, y) 128 | 129 | if is_nor: 130 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32)) 131 | scale = tf.maximum(1.0, scale) 132 | return sim / scale 133 | else: 134 | return sim 135 | 136 | def layer_norm(x, axis=None, epsilon=1e-6): 137 | '''Add layer normalization. 138 | 139 | Args: 140 | x: a tensor 141 | axis: the dimensions to normalize 142 | 143 | Returns: 144 | a tensor the same shape as x. 145 | 146 | Raises: 147 | ''' 148 | print('wrong version of layer_norm') 149 | scale = tf.get_variable( 150 | name='scale', 151 | shape=[1], 152 | dtype=tf.float32, 153 | initializer=tf.ones_initializer()) 154 | bias = tf.get_variable( 155 | name='bias', 156 | shape=[1], 157 | dtype=tf.float32, 158 | initializer=tf.zeros_initializer()) 159 | 160 | if axis is None: 161 | axis = [-1] 162 | 163 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 164 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 165 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 166 | return scale * norm + bias 167 | 168 | def layer_norm_debug(x, axis = None, epsilon=1e-6): 169 | '''Add layer normalization. 170 | 171 | Args: 172 | x: a tensor 173 | axis: the dimensions to normalize 174 | 175 | Returns: 176 | a tensor the same shape as x. 177 | 178 | Raises: 179 | ''' 180 | if axis is None: 181 | axis = [-1] 182 | shape = [x.shape[i] for i in axis] 183 | 184 | scale = tf.get_variable( 185 | name='scale', 186 | shape=shape, 187 | dtype=tf.float32, 188 | initializer=tf.ones_initializer()) 189 | bias = tf.get_variable( 190 | name='bias', 191 | shape=shape, 192 | dtype=tf.float32, 193 | initializer=tf.zeros_initializer()) 194 | 195 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 196 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 197 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 198 | return scale * norm + bias 199 | 200 | def dense(x, out_dimension=None, add_bias=True): 201 | '''Add dense connected layer, Wx + b. 202 | 203 | Args: 204 | x: a tensor with shape [batch, time, dimension] 205 | out_dimension: a number which is the output dimension 206 | 207 | Return: 208 | a tensor with shape [batch, time, out_dimension] 209 | 210 | Raises: 211 | ''' 212 | if out_dimension is None: 213 | out_dimension = x.shape[-1] 214 | 215 | W = tf.get_variable( 216 | name='weights', 217 | shape=[x.shape[-1], out_dimension], 218 | dtype=tf.float32, 219 | initializer=tf.orthogonal_initializer()) 220 | if add_bias: 221 | bias = tf.get_variable( 222 | name='bias', 223 | shape=[1], 224 | dtype=tf.float32, 225 | initializer=tf.zeros_initializer()) 226 | return tf.einsum('bik,kj->bij', x, W) + bias 227 | else: 228 | return tf.einsum('bik,kj->bij', x, W) 229 | 230 | def matmul_2d(x, out_dimension, drop_prob=None): 231 | '''Multiplies 2-d tensor by weights. 
232 | 233 | Args: 234 | x: a tensor with shape [batch, dimension] 235 | out_dimension: a number 236 | 237 | Returns: 238 | a tensor with shape [batch, out_dimension] 239 | 240 | Raises: 241 | ''' 242 | W = tf.get_variable( 243 | name='weights', 244 | shape=[x.shape[1], out_dimension], 245 | dtype=tf.float32, 246 | initializer=tf.orthogonal_initializer()) 247 | if drop_prob is not None: 248 | W = tf.nn.dropout(W, drop_prob) 249 | print('W is dropout') 250 | 251 | return tf.matmul(x, W) 252 | 253 | def gauss_positional_encoding_vector(x, role=0, value=0): 254 | position = int(x.shape[1]) 255 | dimension = int(x.shape[2]) 256 | print('position: %s' %position) 257 | print('dimension: %s' %dimension) 258 | 259 | _lambda = tf.get_variable( 260 | name='lambda', 261 | shape=[position], 262 | dtype=tf.float32, 263 | initializer=tf.constant_initializer(value)) 264 | _lambda = tf.expand_dims(_lambda, axis=-1) 265 | 266 | mean = [position/2.0, dimension/2.0] 267 | 268 | #cov = [[position/3.0, 0], [0, dimension/3.0]] 269 | sigma_x = position/math.sqrt(4.0*dimension) 270 | sigma_y = math.sqrt(dimension/4.0) 271 | cov = [[sigma_x*sigma_x, role*sigma_x*sigma_y], 272 | [role*sigma_x*sigma_y, sigma_y*sigma_y]] 273 | 274 | pos = np.dstack(np.mgrid[0:position, 0:dimension]) 275 | 276 | 277 | rv = multivariate_normal(mean, cov) 278 | signal = rv.pdf(pos) 279 | signal = signal - np.max(signal)/2.0 280 | 281 | signal = tf.multiply(_lambda, signal) 282 | signal = tf.expand_dims(signal, axis=0) 283 | 284 | print('gauss positional encoding') 285 | 286 | return x + _lambda * signal 287 | 288 | def positional_encoding(x, min_timescale=1.0, max_timescale=1.0e4, value=0): 289 | '''Adds a bunch of sinusoids of different frequencies to a tensor. 290 | 291 | Args: 292 | x: a tensor with shape [batch, length, channels] 293 | min_timescale: a float 294 | max_timescale: a float 295 | 296 | Returns: 297 | a tensor the same shape as x. 298 | 299 | Raises: 300 | ''' 301 | length = x.shape[1] 302 | channels = x.shape[2] 303 | _lambda = tf.get_variable( 304 | name='lambda', 305 | shape=[1], 306 | dtype=tf.float32, 307 | initializer=tf.constant_initializer(value)) 308 | 309 | position = tf.to_float(tf.range(length)) 310 | num_timescales = channels // 2 311 | log_timescale_increment = ( 312 | math.log(float(max_timescale) / float(min_timescale)) / 313 | (tf.to_float(num_timescales) - 1)) 314 | inv_timescales = min_timescale * tf.exp( 315 | tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) 316 | scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) 317 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) 318 | signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) 319 | #signal = tf.reshape(signal, [1, length, channels]) 320 | signal = tf.expand_dims(signal, axis=0) 321 | 322 | return x + _lambda * signal 323 | 324 | 325 | def positional_encoding_vector(x, min_timescale=1.0, max_timescale=1.0e4, value=0): 326 | '''Adds a bunch of sinusoids of different frequencies to a tensor. 327 | 328 | Args: 329 | x: a tensor with shape [batch, length, channels] 330 | min_timescale: a float 331 | max_timescale: a float 332 | 333 | Returns: 334 | a tensor the same shape as x. 
335 | 336 | Raises: 337 | ''' 338 | length = x.shape[1] 339 | channels = x.shape[2] 340 | _lambda = tf.get_variable( 341 | name='lambda', 342 | shape=[length], 343 | dtype=tf.float32, 344 | initializer=tf.constant_initializer(value)) 345 | _lambda = tf.expand_dims(_lambda, axis=-1) 346 | 347 | position = tf.to_float(tf.range(length)) 348 | num_timescales = channels // 2 349 | log_timescale_increment = ( 350 | math.log(float(max_timescale) / float(min_timescale)) / 351 | (tf.to_float(num_timescales) - 1)) 352 | inv_timescales = min_timescale * tf.exp( 353 | tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) 354 | scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) 355 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) 356 | signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]]) 357 | 358 | signal = tf.multiply(_lambda, signal) 359 | signal = tf.expand_dims(signal, axis=0) 360 | 361 | return x + signal 362 | 363 | def mask(row_lengths, col_lengths, max_row_length, max_col_length): 364 | '''Return a mask tensor representing the first N positions of each row and each column. 365 | 366 | Args: 367 | row_lengths: a tensor with shape [batch] 368 | col_lengths: a tensor with shape [batch] 369 | row_lengths and col_lengths are real lengths 370 | max_row_length and max_col_length are max lengths 371 | 372 | Returns: 373 | a mask tensor with shape [batch, max_row_length, max_col_length] 374 | 375 | Raises: 376 | ''' 377 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #return bool, [batch, max_row_len] 378 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #return bool, [batch, max_col_len] 379 | 380 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32) 381 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32) 382 | 383 | return tf.einsum('bik,bjk->bij', row_mask, col_mask) 384 | 385 | def weighted_sum(weight, values): 386 | '''Calcualte the weighted sum. 387 | 388 | Args: 389 | weight: a tensor with shape [batch, time, dimension] 390 | values: a tensor with shape [batch, dimension, values_dimension] 391 | 392 | Return: 393 | a tensor with shape [batch, time, values_dimension] 394 | 395 | Raises: 396 | ''' 397 | return tf.einsum('bij,bjk->bik', weight, values) 398 | 399 | 400 | 401 | 402 | -------------------------------------------------------------------------------- /IART/models/iadam_attention.py: -------------------------------------------------------------------------------- 1 | ''' 2 | IADAM-Attention-V4-2 model which is the IART model in the paper. 3 | Developed based on DAM model 4 | 5 | @author: Liu Yang (yangliuyx@gmail.com / lyang@cs.umass.edu) 6 | @homepage: https://sites.google.com/site/lyangwww/ 7 | ''' 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | import cPickle as pickle 12 | 13 | import utils.layers as layers 14 | import utils.operations as op 15 | 16 | class Net(object): 17 | '''Add positional encoding(initializer lambda is 0), 18 | cross-attention, cnn integrated and grad clip by value. 19 | 20 | Attributes: 21 | conf: a configuration paramaters dict 22 | word_embedding_init: a 2-d array with shape [vocab_size+1, emb_size] 23 | there is one dimension in vocab_size which is corresponding to _eos_. 
24 | in our preprocessing, _eos_ is always the last dimension 25 | +1 to add one more embedding vector for padding and masking 26 | We add an "all 0" vector in the 0-th row of word_embedding_init in order 27 | to denote the padding word 28 | when call tf.nn.embedding_lookup(), if word_id = 0, then this is a paded 29 | word; if word_id > 0 (from 1 to vocab_size), then this is a real word 30 | ''' 31 | 32 | def __init__(self, conf): 33 | self._graph = tf.Graph() 34 | self._conf = conf 35 | 36 | if self._conf['word_emb_init'] is not None: 37 | print('loading word emb init') 38 | self._word_embedding_init = pickle.load( 39 | open(self._conf['word_emb_init'], 'rb')) 40 | else: 41 | self._word_embedding_init = None 42 | 43 | def build_graph(self): 44 | with self._graph.as_default(): 45 | if self._conf['rand_seed'] is not None: 46 | rand_seed = self._conf['rand_seed'] 47 | tf.set_random_seed(rand_seed) 48 | print('set tf random seed: %s' % self._conf['rand_seed']) 49 | 50 | # word embedding 51 | if self._word_embedding_init is not None: 52 | word_embedding_initializer = tf.constant_initializer( 53 | self._word_embedding_init) 54 | else: 55 | word_embedding_initializer = tf.random_normal_initializer( 56 | stddev=0.1) 57 | 58 | self._word_embedding = tf.get_variable( 59 | name='word_embedding', 60 | shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']], 61 | dtype=tf.float32, 62 | initializer=word_embedding_initializer) 63 | 64 | # define placehloders 65 | self.turns = tf.placeholder( 66 | tf.int32, 67 | shape=[self._conf["batch_size"], self._conf["max_turn_num"], 68 | self._conf["max_turn_len"]]) 69 | 70 | self.tt_turns_len = tf.placeholder( # turn_num 71 | tf.int32, 72 | shape=[self._conf["batch_size"]]) 73 | 74 | self.every_turn_len = tf.placeholder( 75 | tf.int32, 76 | shape=[self._conf["batch_size"], self._conf["max_turn_num"]]) 77 | 78 | self.turns_intent = tf.placeholder( 79 | tf.float32, 80 | shape=[self._conf["batch_size"], self._conf["max_turn_num"], 81 | self._conf["intent_size"]]) 82 | 83 | self.response = tf.placeholder( 84 | tf.int32, 85 | shape=[self._conf["batch_size"], self._conf["max_turn_len"]]) 86 | 87 | self.response_len = tf.placeholder( 88 | tf.int32, 89 | shape=[self._conf["batch_size"]]) 90 | 91 | self.response_intent = tf.placeholder( 92 | tf.float32, 93 | shape=[self._conf["batch_size"], self._conf["intent_size"]]) 94 | 95 | self.label = tf.placeholder( 96 | tf.float32, 97 | shape=[self._conf["batch_size"]]) 98 | 99 | # define operations 100 | # response part 101 | Hr = tf.nn.embedding_lookup(self._word_embedding, self.response) 102 | # [batch_size, max_turn_len, embed_size] 103 | 104 | # print('[after embedding_lookup] Hr shape: %s' % Hr.shape) 105 | 106 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 107 | with tf.variable_scope('positional'): 108 | Hr = op.positional_encoding_vector(Hr, max_timescale=10) 109 | Hr_stack = [Hr] # 1st element of Hr_stack is the orginal embedding 110 | # lyang comments: self attention 111 | for index in range(self._conf['stack_num']): 112 | # print('[self attention for response] stack index: %d ' % index) 113 | with tf.variable_scope('self_stack_' + str(index)): 114 | # [batch, max_turn_len, emb_size] 115 | Hr = layers.block( # attentive module 116 | Hr, Hr, Hr, 117 | Q_lengths=self.response_len, 118 | K_lengths=self.response_len) 119 | # print('[after layers.block] Hr shape: %s' % Hr.shape) 120 | # Hr is still [batch_size, max_turn_len, embed_size] 121 | Hr_stack.append(Hr) 122 | 123 | # print('[after self 
attention of response] len(Hr_stack)', 124 | # len(Hr_stack)) # 1+stack_num 125 | # context part 126 | # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] 127 | list_turn_t = tf.unstack(self.turns, axis=1) 128 | list_turn_length = tf.unstack(self.every_turn_len, axis=1) 129 | list_turn_intent = tf.unstack(self.turns_intent, axis=1) 130 | 131 | sim_turns = [] 132 | attention_turns = [] # intent based attention on each turn 133 | # for every turn_t calculate matching vector 134 | turn_index = 0 135 | for turn_t, t_turn_length, t_intent in zip(list_turn_t, list_turn_length, list_turn_intent): 136 | print('current turn_index : ', turn_index) 137 | turn_index += 1 138 | Hu = tf.nn.embedding_lookup(self._word_embedding, 139 | turn_t) # [batch, max_turn_len, emb_size] 140 | # print('[after embedding_lookup] Hu shape: %s' % Hu.shape) 141 | 142 | if self._conf['is_positional'] and self._conf['stack_num'] > 0: 143 | with tf.variable_scope('positional', reuse=True): 144 | Hu = op.positional_encoding_vector(Hu, 145 | max_timescale=10) 146 | Hu_stack = [Hu] # 1st element of Hu_stack is the orginal embedding 147 | 148 | # lyang comments: self attention 149 | for index in range(self._conf['stack_num']): 150 | # print('[self attention for context turn] stack index: %d ' % index) 151 | with tf.variable_scope('self_stack_' + str(index), 152 | reuse=True): 153 | # [batch, max_turn_len, emb_size] 154 | Hu = layers.block( # attentive module 155 | Hu, Hu, Hu, 156 | Q_lengths=t_turn_length, K_lengths=t_turn_length) 157 | # print('[after layers.block] Hu shape: %s' % Hu.shape) 158 | Hu_stack.append(Hu) 159 | # print('[after self attention of context turn] len(Hu_stack)', 160 | # len(Hu_stack)) # 1+stack_num 161 | 162 | # lyang comments: cross attention 163 | # print('[cross attention ...]') 164 | r_a_t_stack = [] 165 | t_a_r_stack = [] 166 | # cross attention 167 | for index in range(self._conf['stack_num'] + 1): 168 | # print('[cross attention] stack index = ', index) 169 | with tf.variable_scope('t_attend_r_' + str(index)): 170 | try: 171 | # [batch, max_turn_len, emb_size] 172 | t_a_r = layers.block( # attentive module 173 | Hu_stack[index], Hr_stack[index], 174 | Hr_stack[index], 175 | Q_lengths=t_turn_length, 176 | K_lengths=self.response_len) 177 | except ValueError: 178 | tf.get_variable_scope().reuse_variables() 179 | t_a_r = layers.block( 180 | # [batch, max_turn_len, emb_size] 181 | Hu_stack[index], Hr_stack[index], 182 | Hr_stack[index], 183 | Q_lengths=t_turn_length, 184 | K_lengths=self.response_len) 185 | # print('[cross attention t_attend_r_] stack index: %d, t_a_r.shape: %s' % ( 186 | # index, t_a_r.shape)) 187 | 188 | with tf.variable_scope('r_attend_t_' + str(index)): 189 | try: 190 | # [batch, max_turn_len, emb_size] 191 | r_a_t = layers.block( # attentive module 192 | Hr_stack[index], Hu_stack[index], 193 | Hu_stack[index], 194 | Q_lengths=self.response_len, 195 | K_lengths=t_turn_length) 196 | except ValueError: 197 | tf.get_variable_scope().reuse_variables() 198 | r_a_t = layers.block( 199 | Hr_stack[index], Hu_stack[index], 200 | Hu_stack[index], 201 | Q_lengths=self.response_len, 202 | K_lengths=t_turn_length) 203 | # print('[cross attention r_a_t_] stack index: %d, r_a_t.shape: %s' % ( 204 | # index, r_a_t.shape)) 205 | 206 | t_a_r_stack.append(t_a_r) 207 | r_a_t_stack.append(r_a_t) 208 | # print('[cross attention] len(t_a_r_stack):', len(t_a_r_stack)) 209 | # print('[cross attention] len(r_a_t_stack):', len(r_a_t_stack)) 210 | 211 | # 
print('[before extend] len(t_a_r_stack):', len(t_a_r_stack)) 212 | # print('[before extend] len(r_a_t_stack):', len(r_a_t_stack)) 213 | # lyang comments: 3D aggregation 214 | t_a_r_stack.extend( 215 | Hu_stack) # half from self-attention; half from cross-attention 216 | r_a_t_stack.extend( 217 | Hr_stack) # half from self-attention; half from cross-attention 218 | # after extend, len(t_a_r_stack)) = 2*(stack_num+1) 219 | 220 | # print('[after extend] len(t_a_r_stack):', len(t_a_r_stack)) 221 | # print('[after extend] len(r_a_t_stack):', len(r_a_t_stack)) 222 | 223 | t_a_r = tf.stack(t_a_r_stack, axis=-1) 224 | r_a_t = tf.stack(r_a_t_stack, axis=-1) 225 | 226 | # print('after stack along the last dimension: ') 227 | # print('t_a_r shape: %s' % t_a_r.shape) 228 | # print('r_a_t shape: %s' % r_a_t.shape) 229 | # after stack, t_a_r and r_a_t are (batch, max_turn_len, embed_size, 2*(stack_num+1)) 230 | 231 | with tf.variable_scope('intent_based_attention', 232 | reuse=tf.AUTO_REUSE): # share parameter across different turns 233 | # there are 3 different ways to implement intent based attention 234 | # implement these three different variations and compare the 235 | # effectiveness as model abalation analysis 236 | # let I_u_t and I_r_k are intent vector in [12,1] 237 | # 1. dot: w * [I_u_t, I_r_k], where w is [24,1] 238 | # 2. biliear: I_u_t' * w * I_r_k, where w is [12,12] 239 | # 3. outprod: I_u_t * I_r_k' -> [12,12] out product -> 240 | # flaten to [144,1] outprod -> w*outprod 241 | # where w is [1,144] 242 | attention_logits = layers.attention_intent(t_intent, 243 | self.response_intent, 244 | self._conf['intent_attention_type']) 245 | # print('[intent_based_attention] attention_logits.shape: %s' % attention_logits.shape) 246 | attention_turns.append(attention_logits) 247 | 248 | # calculate similarity matrix 249 | with tf.variable_scope('similarity'): 250 | # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] 251 | # divide sqrt(200) to prevent gradient explosion 252 | # A_biks * B_bjks -> C_bijs 253 | sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt( 254 | 200.0) 255 | # (batch, max_turn_len, embed_size, 2*(stack_num+1)) * 256 | # (batch, max_turn_len, embed_size, 2*(stack_num+1)) -> 257 | # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] 258 | # where k is corresponding to the dimension of embed_size, 259 | # which can be eliminated by dot product with einsum 260 | # print('[similarity] after einsum dot prod sim shape: %s' % sim.shape) 261 | # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] 262 | # ! 
Here we multipy sim by intent based attention weights before 263 | # append sim into sim_turns in order to generate the weighted 264 | # stack in the next step 265 | 266 | sim_turns.append(sim) 267 | # print('[similarity] after append, len(sim_turns):', len(sim_turns)) 268 | 269 | attention_logits = tf.stack(attention_turns, axis=1) # [batch, max_turn_num] 270 | print('[attention_logits] after stack attention_logits.shape: %s' % attention_logits.shape) 271 | # add mask in attention following the way in BERT 272 | # real turn_num is in self.tt_turns_len [batch] 273 | # return a mask tensor with shape [batch, conf['max_turn_num']] 274 | attention_mask = tf.sequence_mask(self.tt_turns_len, self._conf['max_turn_num'], 275 | dtype=tf.float32) 276 | print('[attention_mask] attention_mask.shape: %s' % attention_mask.shape) 277 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 278 | # masked positions, this operation will create a tensor which is 0.0 for 279 | # positions we want to attend and -10000.0 for masked positions. 280 | adder = (1.0 - attention_mask) * -10000.0 281 | 282 | # Since we are adding it to the raw scores before the softmax, this is 283 | # effectively the same as removing these entirely. 284 | attention_logits += adder 285 | attention = tf.nn.softmax(attention_logits) # by default softmax along dim=-1 [batch, max_turn_num] 286 | print('[attention] attention.shape: %s' % attention_mask.shape) 287 | self.attention = attention # will print it for visualization 288 | 289 | # cnn and aggregation 290 | # lyang comments aggregation by 3D CNN layer 291 | # [3d cnn aggregation] sim shape: (32, 9, 180, 180, 10) 292 | # conv_0 shape: (32, 9, 180, 180, 16) 293 | # pooling_0 shape: (32, 3, 60, 60, 16) 294 | # conv_1 shape: (32, 3, 60, 60, 16) 295 | # pooling_1 shape: (32, 1, 20, 20, 16) 296 | # [3d cnn aggregation] final_info: (32, 6400) # [batch * feature_size] 297 | # [batch, max_turn_num, max_turn_len, max_turn_len, 2*(stack_num+1)] 298 | # (32, 9, 180, 180, 10) 299 | sim = tf.stack(sim_turns, axis=1) 300 | # multipy sim by attention score 301 | sim = tf.einsum('bijks,bi->bijks', sim, attention) 302 | print('[3d cnn aggregation] sim shape: %s' % sim.shape) 303 | with tf.variable_scope('cnn_aggregation'): 304 | final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'], 305 | self._conf['cnn_3d_oc1']) 306 | # for udc 307 | # final_info = layers.CNN_3d(sim, 32, 16) 308 | # for douban 309 | # final_info = layers.CNN_3d(sim, 16, 16) 310 | 311 | print('[3d cnn aggregation] final_info: %s' % final_info.shape) 312 | # loss and train 313 | with tf.variable_scope('loss'): 314 | self.loss, self.logits = layers.loss(final_info, self.label) 315 | 316 | self.global_step = tf.Variable(0, trainable=False) 317 | initial_learning_rate = self._conf['learning_rate'] 318 | self.learning_rate = tf.train.exponential_decay( 319 | initial_learning_rate, 320 | global_step=self.global_step, 321 | decay_steps=400, 322 | decay_rate=0.9, 323 | staircase=True) 324 | 325 | Optimizer = tf.train.AdamOptimizer(self.learning_rate) 326 | self.optimizer = Optimizer.minimize( 327 | self.loss, 328 | global_step=self.global_step) 329 | 330 | self.init = tf.global_variables_initializer() 331 | self.saver = tf.train.Saver( 332 | max_to_keep=self._conf["max_to_keep"]) 333 | self.all_variables = tf.global_variables() 334 | self.all_operations = self._graph.get_operations() 335 | self.grads_and_vars = Optimizer.compute_gradients(self.loss) 336 | 337 | for grad, var in self.grads_and_vars: 338 | if grad is 
None: 339 | print var 340 | 341 | self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var) for 342 | grad, var in self.grads_and_vars] 343 | self.g_updates = Optimizer.apply_gradients( 344 | self.capped_gvs, 345 | global_step=self.global_step) 346 | 347 | return self._graph 348 | 349 | 350 | -------------------------------------------------------------------------------- /IART/utils/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import utils.operations as op 3 | def similarity(x, y, x_lengths, y_lengths): 4 | '''calculate similarity with two 3d tensor. 5 | 6 | Args: 7 | x: a tensor with shape [batch, time_x, dimension] 8 | y: a tensor with shape [batch, time_y, dimension] 9 | 10 | Returns: 11 | a tensor with shape [batch, time_x, time_y] 12 | 13 | Raises: 14 | ValueError: if 15 | the dimenisons of x and y are not equal. 16 | ''' 17 | with tf.variable_scope('x_attend_y'): 18 | try: 19 | x_a_y = block( 20 | x, y, y, 21 | Q_lengths=x_lengths, K_lengths=y_lengths) 22 | except ValueError: 23 | tf.get_variable_scope().reuse_variables() 24 | x_a_y = block( 25 | x, y, y, 26 | Q_lengths=x_lengths, K_lengths=y_lengths) 27 | 28 | with tf.variable_scope('y_attend_x'): 29 | try: 30 | y_a_x = block( 31 | y, x, x, 32 | Q_lengths=y_lengths, K_lengths=x_lengths) 33 | except ValueError: 34 | tf.get_variable_scope().reuse_variables() 35 | y_a_x = block( 36 | y, x, x, 37 | Q_lengths=y_lengths, K_lengths=x_lengths) 38 | 39 | return tf.matmul(x + x_a_y, y + y_a_x, transpose_b=True) 40 | 41 | 42 | def dynamic_L(x): 43 | '''Attention machanism to combine the infomation, 44 | from https://arxiv.org/pdf/1612.01627.pdf. 45 | 46 | Args: 47 | x: a tensor with shape [batch, time, dimension] 48 | 49 | Returns: 50 | a tensor with shape [batch, dimension] 51 | 52 | Raises: 53 | ''' 54 | key_0 = tf.get_variable( 55 | name='key', 56 | shape=[x.shape[-1]], 57 | dtype=tf.float32, 58 | initializer=tf.random_uniform_initializer( 59 | -tf.sqrt(6./tf.cast(x.shape[-1], tf.float32)), 60 | tf.sqrt(6./tf.cast(x.shape[-1], tf.float32)))) 61 | 62 | key = op.dense(x, add_bias=False) #[batch, time, dimension] 63 | weight = tf.reduce_sum(tf.multiply(key, key_0), axis=-1) #[batch, time] 64 | weight = tf.expand_dims(tf.nn.softmax(weight), -1) #[batch, time, 1] 65 | 66 | L = tf.reduce_sum(tf.multiply(x, weight), axis=1) #[batch, dimension] 67 | return L 68 | 69 | def mtl_loss(x, y, turns_intent_enc, response_intent_enc, 70 | turns_intent, response_intent, intent_loss_weight, num_classes=2, 71 | is_clip=True, clip_value=10): 72 | ''' 73 | A specialized loss IADAM-MTL-V5 which jointly learn the response ranking 74 | and user intent prediction 75 | For a q or d text, we predict the user intent distributions in q and d 76 | In the case of information-seeking conversation 77 | A q is a dialog context with multiple utterances 78 | A d is a candidate response 79 | Thus for d which is a candidate response text, 80 | y_pred is [batch_size, intent_size] 81 | y_true [batch_size, intent_size] 82 | for q which is a context utterance sequences 83 | y_pred is [batch_size, max_utterance_num, intent_size] 84 | y_true [batch_size, max_utterance_num, intent_size] 85 | Both y_pred and y_true come from a Dense layer with sigmoid as the activation 86 | funtion as in the multi-label classification setting for intent prediction 87 | Thus we need to use tf.nn.sigmoid_cross_entropy_with_logits in tensorflow 88 | 
https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits 89 | ''' 90 | # intent loss 91 | loss_turns_intent = tf.reduce_mean(tf.clip_by_value( 92 | tf.nn.sigmoid_cross_entropy_with_logits( 93 | labels=turns_intent, 94 | logits=turns_intent_enc), -clip_value, clip_value)) 95 | # tf.summary.scalar('loss_turns_intent', loss_turns_intent) 96 | loss_response_intent = tf.reduce_mean(tf.clip_by_value( 97 | tf.nn.sigmoid_cross_entropy_with_logits( 98 | labels=response_intent, 99 | logits=response_intent_enc), -clip_value, clip_value)) 100 | # tf.summary.scalar('loss_response_intent', loss_turns_intent) 101 | 102 | # ranking loss 103 | assert isinstance(num_classes, int) 104 | assert num_classes >= 2 105 | 106 | W = tf.get_variable( 107 | name='weights', 108 | shape=[x.shape[-1], num_classes - 1], 109 | initializer=tf.orthogonal_initializer()) 110 | bias = tf.get_variable( 111 | name='bias', 112 | shape=[num_classes - 1], 113 | initializer=tf.zeros_initializer()) 114 | 115 | logits = tf.reshape(tf.matmul(x, W) + bias, [-1]) 116 | loss = tf.nn.sigmoid_cross_entropy_with_logits( 117 | labels=tf.cast(y, tf.float32), 118 | logits=logits) 119 | loss = tf.reduce_mean(tf.clip_by_value(loss, -clip_value, clip_value)) 120 | # tf.summary.scalar('ranking loss', loss) 121 | # the final loss is a weighted sum of intent loss and ranking loss 122 | loss_final = (1.0 - intent_loss_weight) * loss + intent_loss_weight * ( 123 | loss_turns_intent + loss_response_intent) 124 | # tf.summary.scalar('loss_final', loss_final) 125 | # print('tracking loss in tensorboard...') 126 | return loss_final, logits 127 | 128 | def loss(x, y, num_classes=2, is_clip=True, clip_value=10): 129 | '''From info x calculate logits as return loss. 130 | 131 | Args: 132 | x: a tensor with shape [batch, dimension] 133 | num_classes: a number 134 | 135 | Returns: 136 | loss: a tensor with shape [1], which is the average loss of one batch 137 | logits: a tensor with shape [batch, 1] 138 | 139 | Raises: 140 | AssertionError: if 141 | num_classes is not a int greater equal than 2. 142 | TODO: 143 | num_classes > 2 may be not adapted. 
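    Note (added for clarity, not part of the original code): per the TensorFlow
    documentation, tf.nn.sigmoid_cross_entropy_with_logits computes, for a
    logit z and label y,

        max(z, 0) - z * y + log(1 + exp(-|z|))

    e.g. z = 0, y = 1 gives log(2) ~= 0.693; the per-example value is then
    clipped to [-clip_value, clip_value] and averaged over the batch below.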
144 | ''' 145 | assert isinstance(num_classes, int) 146 | assert num_classes >= 2 147 | 148 | W = tf.get_variable( 149 | name='weights', 150 | shape=[x.shape[-1], num_classes-1], 151 | initializer=tf.orthogonal_initializer()) 152 | bias = tf.get_variable( 153 | name='bias', 154 | shape=[num_classes-1], 155 | initializer=tf.zeros_initializer()) 156 | 157 | logits = tf.reshape(tf.matmul(x, W) + bias, [-1]) 158 | loss = tf.nn.sigmoid_cross_entropy_with_logits( 159 | labels=tf.cast(y, tf.float32), 160 | logits=logits) 161 | loss = tf.reduce_mean(tf.clip_by_value(loss, -clip_value, clip_value)) 162 | # tf.summary.scalar('loss', loss) 163 | # print('tracking loss in tensorboard...') 164 | return loss, logits 165 | 166 | def attention_intent(I_u, I_r, attention_type='dot'): 167 | ''' 168 | Intent based attention layer 169 | The inputs are two intent vectors for a (utterance, response) pair 170 | Returns the attention logits 171 | ''' 172 | assert attention_type in ('dot', 'bilinear', 'outprod') 173 | if attention_type == 'dot': 174 | logits = op.dot_sim_2d(I_u, I_r) 175 | elif attention_type == 'bilinear': 176 | logits = op.bilinear_sim_2d(I_u, I_r) 177 | elif attention_type == 'outprod': 178 | logits = op.outprod_sim_2d(I_u, I_r) 179 | return logits 180 | 181 | def attention( 182 | Q, K, V, 183 | Q_lengths, K_lengths, 184 | attention_type='dot', 185 | is_mask=True, mask_value=-2**32+1, 186 | drop_prob=None): 187 | '''Add attention layer. 188 | Args: 189 | Q: a tensor with shape [batch, Q_time, Q_dimension] 190 | K: a tensor with shape [batch, time, K_dimension] 191 | V: a tensor with shape [batch, time, V_dimension] 192 | 193 | Q_length: a tensor with shape [batch] 194 | K_length: a tensor with shape [batch] 195 | 196 | Returns: 197 | a tensor with shape [batch, Q_time, V_dimension] 198 | 199 | Raises: 200 | AssertionError: if 201 | Q_dimension not equal to K_dimension when attention type is dot. 202 | ''' 203 | assert attention_type in ('dot', 'bilinear') 204 | if attention_type == 'dot': 205 | assert Q.shape[-1] == K.shape[-1] 206 | 207 | Q_time = Q.shape[1] 208 | K_time = K.shape[1] 209 | 210 | if attention_type == 'dot': 211 | logits = op.dot_sim(Q, K) #[batch, Q_time, time] 212 | if attention_type == 'bilinear': 213 | logits = op.bilinear_sim(Q, K) 214 | 215 | if is_mask: 216 | mask = op.mask(Q_lengths, K_lengths, Q_time, K_time) #return [batch, Q_time, K_time] 217 | # mask is a tensor with the same shape with logits 218 | # where the real word location is labeled by 1 219 | # where the padded/masked word location is labeled by 0 220 | # mask * logits is element-wise product 221 | # + (1 - mask) is to add very small negative value on 222 | # masked positions (0). after softmax, this position becomes 0 223 | # similar tricks also used in BERT 224 | logits = mask * logits + (1 - mask) * mask_value 225 | 226 | attention = tf.nn.softmax(logits) 227 | 228 | if drop_prob is not None: 229 | print('use attention drop') 230 | attention = tf.nn.dropout(attention, drop_prob) 231 | 232 | return op.weighted_sum(attention, V) 233 | 234 | def FFN(x, out_dimension_0=None, out_dimension_1=None): 235 | '''Add two dense connected layer, max(0, x*W0+b0)*W1+b1. 
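    Example (an illustrative sketch added for clarity, not part of the original
    code; the shapes are hypothetical):

        with tf.variable_scope('FFN_demo'):
            # both inner and outer dimensions default to x.shape[-1],
            # so the output keeps the input shape
            y = FFN(x)     # x: [batch, time, 200] -> y: [batch, time, 200]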
236 | 237 | Args: 238 | x: a tensor with shape [batch, time, dimension] 239 | out_dimension: a number which is the output dimension 240 | 241 | Returns: 242 | a tensor with shape [batch, time, out_dimension] 243 | 244 | Raises: 245 | ''' 246 | with tf.variable_scope('FFN_1'): 247 | y = op.dense(x, out_dimension_0) 248 | y = tf.nn.relu(y) 249 | with tf.variable_scope('FFN_2'): 250 | z = op.dense(y, out_dimension_1) #, add_bias=False) #!!!! 251 | return z 252 | 253 | def block( 254 | Q, K, V, 255 | Q_lengths, K_lengths, 256 | attention_type='dot', 257 | is_layer_norm=True, 258 | is_mask=True, mask_value=-2**32+1, 259 | drop_prob=None): 260 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf. 261 | Args: 262 | Q: a tensor with shape [batch, Q_time, Q_dimension] 263 | K: a tensor with shape [batch, time, K_dimension] 264 | V: a tensor with shape [batch, time, V_dimension] 265 | 266 | Q_length: a tensor with shape [batch] 267 | K_length: a tensor with shape [batch] 268 | 269 | Returns: 270 | a tensor with shape [batch, time, dimension] 271 | 272 | Raises: 273 | ''' 274 | att = attention(Q, K, V, 275 | Q_lengths, K_lengths, 276 | attention_type='dot', 277 | is_mask=is_mask, mask_value=mask_value, 278 | drop_prob=drop_prob) 279 | if is_layer_norm: 280 | with tf.variable_scope('attention_layer_norm'): 281 | y = op.layer_norm_debug(Q + att) 282 | else: 283 | y = Q + att 284 | 285 | z = FFN(y) 286 | if is_layer_norm: 287 | with tf.variable_scope('FFN_layer_norm'): 288 | w = op.layer_norm_debug(y + z) 289 | else: 290 | w = y + z 291 | return w 292 | 293 | def intent_cnn(x, intent_size): 294 | ''' 295 | A CNN encoder to extract features for intent prediction 296 | Args: 297 | x: a tensor with shape [batch, in_height, in_width, in_channels] 298 | intent_size: size of intent types, which is also output shape 299 | Returns: 300 | a tensor with shape [batch, intent_size] 301 | ''' 302 | # CNN 303 | cnn = CNN(x, 3, 3, 3, add_relu=True) 304 | print('[In intent_cnn]: cnn %s ' % cnn.shape) 305 | 306 | # Dense Layer 0 307 | # dense_0 = tf.layers.dense(inputs=cnn, units=4096, 308 | # activation=tf.nn.relu) 309 | # print('[In intent_cnn]: dense_0 %s ' % dense_0.shape) 310 | # dropout_0 = tf.layers.dropout(inputs=dense_0, rate=0.4) 311 | # print('[In intent_cnn]: dropout_0 %s ' % dropout_0.shape) 312 | 313 | # Dense Layer 1 314 | dense_1 = tf.layers.dense(inputs=cnn, units=1024, 315 | activation=tf.nn.relu) 316 | print('[In intent_cnn]: dense_1 %s ' % dense_1.shape) 317 | dropout_1 = tf.layers.dropout(inputs=dense_1, rate=0.4) 318 | print('[In intent_cnn]: dropout_1 %s ' % dropout_1.shape) 319 | 320 | # Logits Layer 321 | logits = tf.layers.dense(inputs=dropout_1, units=intent_size, 322 | activation=tf.nn.sigmoid) 323 | print('[In intent_cnn]: logits %s ' % logits.shape) 324 | return logits 325 | 326 | def CNN(x, out_channels, filter_size, pooling_size, add_relu=True): 327 | '''Add a convlution layer with relu and max pooling layer. 
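    Example (an illustrative shape walk-through added for clarity, not part of
    the original code; the sizes are hypothetical):

        # x: [batch, 12, 12, 1], out_channels=3, filter_size=3, pooling_size=3
        # conv (SAME, stride 1):       [batch, 12, 12, 3]
        # max pool (SAME, stride 3):   [batch, 4, 4, 3]
        # flatten:                     [batch, 4 * 4 * 3] = [batch, 48]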
328 | 329 | Args: 330 | x: a tensor with shape [batch, in_height, in_width, in_channels] 331 | out_channels: a number 332 | filter_size: a number 333 | pooling_size: a number 334 | 335 | Returns: 336 | a flattened tensor with shape [batch, num_features] 337 | 338 | Raises: 339 | ''' 340 | #calculate the last dimension of return 341 | # num_features = ((tf.shape(x)[1]-filter_size+1)/pooling_size * 342 | # (tf.shape(x)[2]-filter_size+1)/pooling_size) * out_channels 343 | # print('[In Intent CNN Encoder]: num_features ', num_features) 344 | 345 | in_channels = x.shape[-1] 346 | weights = tf.get_variable( 347 | name='filter', 348 | shape=[filter_size, filter_size, in_channels, out_channels], 349 | dtype=tf.float32, 350 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 351 | print('[In Intent CNN Encoder]: filter weights.shape %s ' % weights.shape) 352 | bias = tf.get_variable( 353 | name='bias', 354 | shape=[out_channels], 355 | dtype=tf.float32, 356 | initializer=tf.zeros_initializer()) 357 | 358 | conv = tf.nn.conv2d(x, weights, strides=[1, 1, 1, 1], padding="SAME") 359 | conv = conv + bias 360 | print('[In Intent CNN Encoder]: conv.shape %s ' % conv.shape) 361 | if add_relu: 362 | conv = tf.nn.relu(conv) 363 | 364 | pooling = tf.nn.max_pool( 365 | conv, 366 | ksize=[1, pooling_size, pooling_size, 1], 367 | strides=[1, pooling_size, pooling_size, 1], 368 | padding="SAME") 369 | 370 | print('[In Intent CNN Encoder]: pooling.shape %s ' % pooling.shape) 371 | flat = tf.contrib.layers.flatten(pooling) 372 | print('[In Intent CNN Encoder]: flat.shape %s ' % flat.shape) 373 | return flat 374 | 375 | def CNN_3d(x, out_channels_0, out_channels_1, add_relu=True): 376 | '''Add a 3d convlution layer with relu and max pooling layer. 377 | 378 | Args: 379 | x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels] 380 | out_channels: a number 381 | filter_size: a number 382 | pooling_size: a number 383 | 384 | Returns: 385 | a flattened tensor with shape [batch, num_features] 386 | 387 | Raises: 388 | ''' 389 | in_channels = x.shape[-1] 390 | weights_0 = tf.get_variable( 391 | name='filter_0', 392 | shape=[3, 3, 3, in_channels, out_channels_0], 393 | dtype=tf.float32, 394 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 395 | bias_0 = tf.get_variable( 396 | name='bias_0', 397 | shape=[out_channels_0], 398 | dtype=tf.float32, 399 | initializer=tf.zeros_initializer()) 400 | 401 | conv_0 = tf.nn.conv3d(x, weights_0, strides=[1, 1, 1, 1, 1], padding="SAME") 402 | print('conv_0 shape: %s' %conv_0.shape) 403 | conv_0 = conv_0 + bias_0 404 | 405 | if add_relu: 406 | conv_0 = tf.nn.elu(conv_0) 407 | 408 | pooling_0 = tf.nn.max_pool3d( 409 | conv_0, 410 | ksize=[1, 3, 3, 3, 1], 411 | strides=[1, 3, 3, 3, 1], 412 | padding="SAME") 413 | print('pooling_0 shape: %s' %pooling_0.shape) 414 | 415 | #layer_1 416 | weights_1 = tf.get_variable( 417 | name='filter_1', 418 | shape=[3, 3, 3, out_channels_0, out_channels_1], 419 | dtype=tf.float32, 420 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 421 | bias_1 = tf.get_variable( 422 | name='bias_1', 423 | shape=[out_channels_1], 424 | dtype=tf.float32, 425 | initializer=tf.zeros_initializer()) 426 | 427 | conv_1 = tf.nn.conv3d(pooling_0, weights_1, strides=[1, 1, 1, 1, 1], padding="SAME") 428 | print('conv_1 shape: %s' %conv_1.shape) 429 | conv_1 = conv_1 + bias_1 430 | 431 | if add_relu: 432 | conv_1 = tf.nn.elu(conv_1) 433 | 434 | pooling_1 = tf.nn.max_pool3d( 435 | conv_1, 436 | ksize=[1, 3, 3, 3, 1], 437 | strides=[1, 3, 3, 3, 
1], 438 | padding="SAME") 439 | print('pooling_1 shape: %s' %pooling_1.shape) 440 | 441 | return tf.contrib.layers.flatten(pooling_1) 442 | 443 | def CNN_3d_2d(x, out_channels_0, out_channels_1, add_relu=True): 444 | '''Add a 3d convlution layer with relu and max pooling layer. 445 | 446 | Args: 447 | x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels] 448 | out_channels: a number 449 | filter_size: a number 450 | pooling_size: a number 451 | 452 | Returns: 453 | a flattened tensor with shape [batch, num_features] 454 | 455 | Raises: 456 | ''' 457 | in_channels = x.shape[-1] 458 | weights_0 = tf.get_variable( 459 | name='filter_0', 460 | shape=[1, 3, 3, in_channels, out_channels_0], 461 | dtype=tf.float32, 462 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 463 | bias_0 = tf.get_variable( 464 | name='bias_0', 465 | shape=[out_channels_0], 466 | dtype=tf.float32, 467 | initializer=tf.zeros_initializer()) 468 | 469 | conv_0 = tf.nn.conv3d(x, weights_0, strides=[1, 1, 1, 1, 1], padding="SAME") 470 | print('conv_0 shape: %s' %conv_0.shape) 471 | conv_0 = conv_0 + bias_0 472 | 473 | if add_relu: 474 | conv_0 = tf.nn.elu(conv_0) 475 | 476 | pooling_0 = tf.nn.max_pool3d( 477 | conv_0, 478 | ksize=[1, 1, 3, 3, 1], 479 | strides=[1, 1, 3, 3, 1], 480 | padding="SAME") 481 | print('pooling_0 shape: %s' %pooling_0.shape) 482 | 483 | #layer_1 484 | weights_1 = tf.get_variable( 485 | name='filter_1', 486 | shape=[1, 3, 3, out_channels_0, out_channels_1], 487 | dtype=tf.float32, 488 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 489 | bias_1 = tf.get_variable( 490 | name='bias_1', 491 | shape=[out_channels_1], 492 | dtype=tf.float32, 493 | initializer=tf.zeros_initializer()) 494 | 495 | conv_1 = tf.nn.conv3d(pooling_0, weights_1, strides=[1, 1, 1, 1, 1], padding="SAME") 496 | print('conv_1 shape: %s' %conv_1.shape) 497 | conv_1 = conv_1 + bias_1 498 | 499 | if add_relu: 500 | conv_1 = tf.nn.elu(conv_1) 501 | 502 | pooling_1 = tf.nn.max_pool3d( 503 | conv_1, 504 | ksize=[1, 1, 3, 3, 1], 505 | strides=[1, 1, 3, 3, 1], 506 | padding="SAME") 507 | print('pooling_1 shape: %s' %pooling_1.shape) 508 | 509 | return tf.contrib.layers.flatten(pooling_1) 510 | 511 | def CNN_3d_change(x, out_channels_0, out_channels_1, add_relu=True): 512 | '''Add a 3d convlution layer with relu and max pooling layer. 
513 | 514 | Args: 515 | x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels] 516 | out_channels: a number 517 | filter_size: a number 518 | pooling_size: a number 519 | 520 | Returns: 521 | a flattened tensor with shape [batch, num_features] 522 | 523 | Raises: 524 | ''' 525 | in_channels = x.shape[-1] 526 | weights_0 = tf.get_variable( 527 | name='filter_0', 528 | shape=[3, 3, 3, in_channels, out_channels_0], 529 | dtype=tf.float32, 530 | #initializer=tf.random_normal_initializer(0, 0.05)) 531 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 532 | bias_0 = tf.get_variable( 533 | name='bias_0', 534 | shape=[out_channels_0], 535 | dtype=tf.float32, 536 | initializer=tf.zeros_initializer()) 537 | #Todo 538 | g_0 = tf.get_variable(name='scale_0', 539 | shape = [out_channels_0], 540 | dtype=tf.float32, 541 | initializer=tf.ones_initializer()) 542 | weights_0 = tf.reshape(g_0, [1, 1, 1, out_channels_0]) * tf.nn.l2_normalize(weights_0, [0, 1, 2]) 543 | 544 | conv_0 = tf.nn.conv3d(x, weights_0, strides=[1, 1, 1, 1, 1], padding="VALID") 545 | print('conv_0 shape: %s' %conv_0.shape) 546 | conv_0 = conv_0 + bias_0 547 | ####### 548 | ''' 549 | with tf.variable_scope('layer_0'): 550 | conv_0 = op.layer_norm(conv_0, axis=[1, 2, 3, 4]) 551 | print('layer_norm in cnn') 552 | ''' 553 | if add_relu: 554 | conv_0 = tf.nn.elu(conv_0) 555 | 556 | pooling_0 = tf.nn.max_pool3d( 557 | conv_0, 558 | ksize=[1, 2, 3, 3, 1], 559 | strides=[1, 2, 3, 3, 1], 560 | padding="VALID") 561 | print('pooling_0 shape: %s' %pooling_0.shape) 562 | 563 | #layer_1 564 | weights_1 = tf.get_variable( 565 | name='filter_1', 566 | shape=[2, 2, 2, out_channels_0, out_channels_1], 567 | dtype=tf.float32, 568 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 569 | 570 | bias_1 = tf.get_variable( 571 | name='bias_1', 572 | shape=[out_channels_1], 573 | dtype=tf.float32, 574 | initializer=tf.zeros_initializer()) 575 | 576 | g_1 = tf.get_variable(name='scale_1', 577 | shape = [out_channels_1], 578 | dtype=tf.float32, 579 | initializer=tf.ones_initializer()) 580 | weights_1 = tf.reshape(g_1, [1, 1, 1, out_channels_1]) * tf.nn.l2_normalize(weights_1, [0, 1, 2]) 581 | 582 | conv_1 = tf.nn.conv3d(pooling_0, weights_1, strides=[1, 1, 1, 1, 1], padding="VALID") 583 | print('conv_1 shape: %s' %conv_1.shape) 584 | conv_1 = conv_1 + bias_1 585 | #with tf.variable_scope('layer_1'): 586 | # conv_1 = op.layer_norm(conv_1, axis=[1, 2, 3, 4]) 587 | 588 | if add_relu: 589 | conv_1 = tf.nn.elu(conv_1) 590 | 591 | pooling_1 = tf.nn.max_pool3d( 592 | conv_1, 593 | ksize=[1, 3, 3, 3, 1], 594 | strides=[1, 3, 3, 3, 1], 595 | padding="VALID") 596 | print('pooling_1 shape: %s' %pooling_1.shape) 597 | 598 | return tf.contrib.layers.flatten(pooling_1) 599 | 600 | def RNN_last_state(x, lengths, hidden_size): 601 | '''encode x with a gru cell and return the last state. 
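    Example (an illustrative sketch added for clarity, not part of the original
    code; the shapes are hypothetical):

        outputs, last_states = RNN_last_state(x, lengths, hidden_size=100)
        # x: [batch, time, dim], lengths: [batch]
        # outputs:     [batch, time, 100]  (per-step GRU outputs)
        # last_states: [batch, 100]        (final GRU state)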
602 | 603 | Args: 604 | x: a tensor with shape [batch, time, dimension] 605 | length: a tensor with shape [batch] 606 | 607 | Return: 608 | a tensor with shape [batch, hidden_size] 609 | 610 | Raises: 611 | ''' 612 | cell = tf.nn.rnn_cell.GRUCell(hidden_size) 613 | outputs, last_states = tf.nn.dynamic_rnn(cell, x, lengths, dtype=tf.float32) 614 | return outputs, last_states 615 | 616 | 617 | -------------------------------------------------------------------------------- /IART/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from __future__ import print_function 5 | from nltk.tokenize import word_tokenize 6 | import jieba 7 | import sys 8 | import numpy as np 9 | from nltk.corpus import stopwords as nltk_stopwords 10 | from nltk.stem import SnowballStemmer 11 | from tqdm import tqdm 12 | 13 | sys.path.append('../inputs') 14 | sys.path.append('../utils') 15 | from preparation import * 16 | # from rank_io import * 17 | 18 | class Preprocess(object): 19 | 20 | _valid_lang = ['en', 'cn'] 21 | _stemmer = SnowballStemmer('english') 22 | 23 | def __init__(self, 24 | word_seg_config = {}, 25 | doc_filter_config = {}, 26 | word_stem_config = {}, 27 | word_lower_config = {}, 28 | word_filter_config = {}, 29 | word_index_config = {} 30 | ): 31 | # set default configuration 32 | self._word_seg_config = { 'enable': True, 'lang': 'en' } 33 | self._doc_filter_config = { 'enable': True, 'min_len': 0, 'max_len': sys.maxint } 34 | self._word_stem_config = { 'enable': True } 35 | self._word_lower_config = { 'enable': True } 36 | self._word_filter_config = { 'enable': True, 'stop_words': nltk_stopwords.words('english'), 37 | 'min_freq': 0, 'max_freq': sys.maxint, 'words_useless': None } 38 | self._word_index_config = { 'word_dict': None } 39 | 40 | self._word_seg_config.update(word_seg_config) 41 | self._doc_filter_config.update(doc_filter_config) 42 | self._word_stem_config.update(word_stem_config) 43 | self._word_lower_config.update(word_lower_config) 44 | self._word_filter_config.update(word_filter_config) 45 | self._word_index_config.update(word_index_config) 46 | 47 | self._word_dict = self._word_index_config['word_dict'] 48 | self._words_stats = dict() 49 | 50 | def run(self, file_path): 51 | print('load...') 52 | dids, docs = Preprocess.load(file_path) 53 | 54 | if self._word_seg_config['enable']: 55 | print('word_seg...') 56 | docs = Preprocess.word_seg(docs, self._word_seg_config) 57 | 58 | if self._doc_filter_config['enable']: 59 | print('doc_filter...') 60 | dids, docs = Preprocess.doc_filter(dids, docs, self._doc_filter_config) 61 | 62 | if self._word_stem_config['enable']: 63 | print('word_stem...') 64 | docs = Preprocess.word_stem(docs) 65 | 66 | if self._word_lower_config['enable']: 67 | print('word_lower...') 68 | docs = Preprocess.word_lower(docs) 69 | 70 | self._words_stats = Preprocess.cal_words_stat(docs) 71 | 72 | if self._word_filter_config['enable']: 73 | print('word_filter...') 74 | docs, self._words_useless = Preprocess.word_filter(docs, self._word_filter_config, self._words_stats) 75 | 76 | print('word_index...') 77 | docs, self._word_dict = Preprocess.word_index(docs, self._word_index_config) 78 | 79 | return dids, docs 80 | 81 | def run_2d(self, file_path): 82 | print('load...') 83 | dids, docs = Preprocess.load_2d(file_path) 84 | # dids: a list of corpus ids 85 | # docs: a list of context/responses. 
The context is seperated by \t 86 | 87 | print('transfer to 2d docs...') 88 | # firstly transfer docs to a 2D list [corpus_text_size, utterance_list] 89 | # a corpus text could be a list of utterances (for context) or 1 utterance (for response) 90 | docs_2d = Preprocess.transfer_to_2ddocs(docs) 91 | 92 | if self._word_seg_config['enable']: 93 | print('word_seg...') 94 | docs_2d = Preprocess.word_seg_2d(docs_2d) 95 | 96 | if self._word_stem_config['enable']: 97 | print('word_stem...') 98 | docs_2d = Preprocess.word_stem_2d(docs_2d) 99 | 100 | if self._word_lower_config['enable']: 101 | print('word_lower...') 102 | docs_2d = Preprocess.word_lower_2d(docs_2d) 103 | 104 | # print ('after word_lower, docs_2d[0:100] = ', docs_2d[0:100]) 105 | print('cal_words_stat...') 106 | self._words_stats = Preprocess.cal_words_stat_2d(docs_2d) 107 | 108 | if self._word_filter_config['enable']: 109 | print('word_filter...') 110 | docs_2d, self._words_useless = Preprocess.word_filter_2d(docs_2d, self._word_filter_config, self._words_stats) 111 | 112 | print('word_index...') 113 | docs_2d, self._word_dict = Preprocess.word_index_2d(docs_2d, self._word_index_config) 114 | return dids, docs_2d 115 | 116 | def run_2d_smn(self, file_path): 117 | ''' 118 | Minimize the preprocess steps to be consistant with Yu Wu's SMN code 119 | Refer to the build_multiturn_data function in PreProcess.py of the 120 | Theano code of Yu Wu's SMN source code 121 | :param file_path: 122 | :return: 123 | ''' 124 | print('load...') 125 | dids, docs = Preprocess.load_2d(file_path) 126 | # removed _ as what Yu Wu did in SMN preprocess code 127 | # dids: a list of corpus ids 128 | # docs: a list of context/responses. The context is seperated by \t 129 | 130 | print('transfer to 2d docs...') 131 | # firstly transfer docs to a 2D list [corpus_text_size, utterance_list] 132 | # a corpus text could be a list of utterances (for context) or 1 utterance (for response) 133 | docs_2d = Preprocess.transfer_to_2ddocs(docs) 134 | 135 | print('word_seg... (necessary for ms_dialog data)') 136 | docs_2d = Preprocess.word_seg_2d(docs_2d) 137 | 138 | print('word_lower... 
(necessary for ms_dialog data)') 139 | docs_2d = Preprocess.word_lower_2d(docs_2d) 140 | 141 | print('following SMN, just split with split() and index...') 142 | print('build word dict...') 143 | words = set() 144 | for c_text in tqdm(docs_2d): 145 | for utt in c_text: 146 | # words.update(set(utt.split())) 147 | words.update(set(utt)) 148 | print('vocab size: ', len(words)) 149 | word_id = 1 150 | self._word_dict = {} 151 | for word in words: 152 | self._word_dict[word] = word_id 153 | word_id += 1 154 | 155 | print('map words to ids ...') 156 | docs_index = [] 157 | for doc in tqdm(docs_2d): 158 | # docs_index.append([[self._word_dict[w] for w in utt.split()] for utt in doc]) 159 | docs_index.append([[self._word_dict[w] for w in utt] for utt in doc]) 160 | return dids, docs_index 161 | 162 | @staticmethod 163 | def transfer_to_2ddocs(docs): 164 | ''' 165 | transfer a docs to a 2 dimensional docs [corpus_text_size, utterance_list] 166 | a corpus text could be a list of utterances (for context) or 1 utterance (for response) 167 | ''' 168 | docs_2d = [] 169 | for c_text in tqdm(docs): 170 | docs_2d.append(list(c_text.split('\t'))) 171 | return docs_2d 172 | 173 | @staticmethod 174 | def parse(line): 175 | subs = line.split(' ', 1) 176 | if 1 == len(subs): 177 | return subs[0], '' 178 | else: 179 | return subs[0], subs[1] 180 | 181 | @staticmethod 182 | def load(file_path): 183 | dids = list() 184 | docs = list() 185 | f = open(file_path, 'r') 186 | for line in tqdm(f): 187 | line = line.decode('utf8') 188 | line = line.strip() 189 | if '' != line: 190 | did, doc = Preprocess.parse(line) 191 | dids.append(did) 192 | docs.append(doc) 193 | f.close() 194 | return dids, docs 195 | 196 | @staticmethod 197 | def load_2d(file_path): 198 | dids = list() 199 | docs = list() 200 | f = open(file_path, 'r') 201 | for line in tqdm(f): 202 | line = line.decode('utf8') 203 | line = line.replace("_", "") # same with SMN code by Yu Wu 204 | line = line.strip() 205 | if '' != line: 206 | subs = line.split('\t') 207 | did, doc = subs[0], '\t'.join(subs[1:len(subs)]) 208 | dids.append(did) 209 | docs.append(doc) 210 | f.close() 211 | return dids, docs 212 | 213 | @staticmethod 214 | def word_seg_2d(docs): 215 | docs_seg = [] 216 | for doc in tqdm(docs): 217 | docs_seg.append([word_tokenize(utt) for utt in doc]) 218 | return docs_seg 219 | 220 | @staticmethod 221 | def word_seg_en(docs): 222 | docs = [word_tokenize(sent) for sent in tqdm(docs)] 223 | # show the progress of word segmentation with tqdm 224 | return docs 225 | 226 | @staticmethod 227 | def word_seg_cn(docs): 228 | docs = [list(jieba.cut(sent)) for sent in docs] 229 | return docs 230 | 231 | @staticmethod 232 | def word_seg(docs, config): 233 | assert config['lang'].lower() in Preprocess._valid_lang, 'Wrong language type: %s' % config['lang'] 234 | docs = getattr(Preprocess, '%s_%s' % (sys._getframe().f_code.co_name, config['lang']))(docs) 235 | return docs 236 | 237 | @staticmethod 238 | def cal_words_stat(docs): 239 | words_stats = {} 240 | docs_num = len(docs) 241 | for ws in docs: 242 | for w in ws: 243 | if w not in words_stats: 244 | words_stats[w] = {} 245 | words_stats[w]['cf'] = 0 246 | words_stats[w]['df'] = 0 247 | words_stats[w]['idf'] = 0 248 | words_stats[w]['cf'] += 1 249 | for w in set(ws): 250 | words_stats[w]['df'] += 1 251 | for w, winfo in words_stats.items(): 252 | words_stats[w]['idf'] = np.log( (1. + docs_num) / (1. 
+ winfo['df'])) 253 | return words_stats 254 | 255 | @staticmethod 256 | def cal_words_stat_2d(docs): 257 | words_stats = {} 258 | docs_num = len(docs) 259 | for ws in tqdm(docs): # for each corpus text 260 | for ww in ws: # for each utterance 261 | for w in ww: # for each word 262 | if w not in words_stats: 263 | words_stats[w] = {} 264 | words_stats[w]['cf'] = 0 265 | words_stats[w]['df'] = 0 266 | words_stats[w]['idf'] = 0 267 | words_stats[w]['cf'] += 1 268 | for w in set(ww): 269 | words_stats[w]['df'] += 1 270 | for w, winfo in words_stats.items(): 271 | words_stats[w]['idf'] = np.log((1. + docs_num) / (1. + winfo['df'])) 272 | return words_stats 273 | 274 | @staticmethod 275 | def word_filter(docs, config, words_stats): 276 | if config['words_useless'] is None: 277 | config['words_useless'] = set() 278 | # filter with stop_words 279 | config['words_useless'].update(config['stop_words']) 280 | # filter with min_freq and max_freq 281 | for w, winfo in words_stats.items(): 282 | # filter too frequent words or rare words 283 | if config['min_freq'] > winfo['df'] or config['max_freq'] < winfo['df']: 284 | config['words_useless'].add(w) 285 | # filter with useless words 286 | docs = [[w for w in ws if w not in config['words_useless']] for ws in tqdm(docs)] 287 | return docs, config['words_useless'] 288 | 289 | @staticmethod 290 | def word_filter_2d(docs, config, words_stats): 291 | if config['words_useless'] is None: 292 | config['words_useless'] = set() 293 | # filter with stop_words 294 | config['words_useless'].update(config['stop_words']) 295 | # filter with min_freq and max_freq 296 | for w, winfo in words_stats.items(): 297 | # filter too frequent words or rare words 298 | if config['min_freq'] > winfo['df'] or config['max_freq'] < winfo['df']: 299 | config['words_useless'].add(w) 300 | # filter with useless words 301 | print('filter useless words: ', len(config['words_useless'])) 302 | docs_filter_word = [] 303 | for doc in tqdm(docs): 304 | docs_filter_word.append([[w for w in ws if w not in config['words_useless']] for ws in doc]) 305 | return docs_filter_word, config['words_useless'] 306 | 307 | @staticmethod 308 | def doc_filter(dids, docs, config): 309 | new_docs = list() 310 | new_dids = list() 311 | for i in tqdm(range(len(docs))): 312 | if config['min_len'] <= len(docs[i]) <= config['max_len']: 313 | new_docs.append(docs[i]) 314 | new_dids.append(dids[i]) 315 | return new_dids, new_docs 316 | 317 | @staticmethod 318 | def word_stem(docs): 319 | docs = [[Preprocess._stemmer.stem(w) for w in ws] for ws in tqdm(docs)] 320 | return docs 321 | 322 | @staticmethod 323 | def word_stem_2d(docs): 324 | docs_stem = [] 325 | for doc in tqdm(docs): 326 | docs_stem.append([[Preprocess._stemmer.stem(w) for w in ws] for ws in doc]) 327 | return docs_stem 328 | 329 | @staticmethod 330 | def word_lower(docs): 331 | docs = [[w.lower() for w in ws] for ws in tqdm(docs)] 332 | return docs 333 | 334 | @staticmethod 335 | def word_lower_2d(docs): 336 | docs_lower = [] 337 | for doc in tqdm(docs): 338 | docs_lower.append([[w.lower() for w in ws] for ws in doc]) 339 | return docs_lower 340 | 341 | @staticmethod 342 | def build_word_dict(docs): 343 | word_dict = dict() 344 | for ws in docs: 345 | for w in ws: 346 | word_dict.setdefault(w, len(word_dict)) 347 | return word_dict 348 | 349 | @staticmethod 350 | def build_word_dict_2d(docs): 351 | word_dict = dict() 352 | for doc in docs: 353 | for ws in doc: 354 | for w in ws: 355 | word_dict.setdefault(w, len(word_dict)) 356 | return word_dict 357 | 
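    # Illustrative sketch (added for clarity, not in the original code) of how
    # the 2-d dictionary building above and the 2-d indexing defined below fit
    # together on a toy input; the words and ids are hypothetical:
    #
    #   docs = [[['hello', 'world'], ['hi']]]            # 1 text, 2 utterances
    #   word_dict = Preprocess.build_word_dict_2d(docs)  # hello->0, world->1, hi->2
    #   docs_index, _ = Preprocess.word_index_2d(docs, {'word_dict': word_dict})
    #   # docs_index == [[[0, 1], [2]]]; words missing from word_dict are dropped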
358 | @staticmethod 359 | def word_index(docs, config): 360 | if config['word_dict'] is None: 361 | config['word_dict'] = Preprocess.build_word_dict(docs) 362 | docs = [[config['word_dict'][w] for w in ws if w in config['word_dict']] for ws in tqdm(docs)] 363 | return docs, config['word_dict'] 364 | 365 | @staticmethod 366 | def word_index_2d(docs, config): 367 | if config['word_dict'] is None: 368 | config['word_dict'] = Preprocess.build_word_dict_2d(docs) 369 | docs_index = [] 370 | for doc in tqdm(docs): 371 | docs_index.append([[config['word_dict'][w] for w in ws if w in config['word_dict']] for ws in doc]) 372 | return docs_index, config['word_dict'] 373 | 374 | @staticmethod 375 | def save_lines(file_path, lines): 376 | f = open(file_path, 'w') 377 | for line in lines: 378 | line = line.encode('utf8') 379 | f.write(line + "\n") 380 | f.close() 381 | 382 | @staticmethod 383 | def load_lines(file_path): 384 | f = open(file_path, 'r') 385 | lines = f.readlines() 386 | f.close() 387 | return lines 388 | 389 | @staticmethod 390 | def save_dict(file_path, dic, sort=False): 391 | if sort: 392 | dic = sorted(dic.items(), key=lambda d:d[1], reverse=False) 393 | lines = ['%s %s' % (k, v) for k, v in dic] 394 | else: 395 | lines = ['%s %s' % (k, v) for k, v in dic.iteritems()] 396 | Preprocess.save_lines(file_path, lines) 397 | 398 | @staticmethod 399 | def load_dict(file_path): 400 | lines = Preprocess.load_lines(file_path) 401 | dic = dict() 402 | for line in lines: 403 | k, v = line.split() 404 | dic[k] = v 405 | return dic 406 | 407 | def save_words_useless(self, words_useless_fp): 408 | Preprocess.save_lines(words_useless_fp, self._words_useless) 409 | 410 | def load_words_useless(self, words_useless_fp): 411 | self._words_useless = set(Preprocess.load_lines(words_useless_fp)) 412 | 413 | def save_word_dict(self, word_dict_fp, sort=False): 414 | Preprocess.save_dict(word_dict_fp, self._word_dict, sort) 415 | 416 | def load_word_dict(self, word_dict_fp): 417 | self._word_dict = Preprocess.load_dict(word_dict_fp) 418 | 419 | def save_words_stats(self, words_stats_fp, sort=False): 420 | if sort: 421 | word_dic = sorted(self._word_dict.items(), key=lambda d:d[1], reverse=False) 422 | lines = ['%s %d %d %f' % (wid, self._words_stats[w]['cf'], self._words_stats[w]['df'], 423 | self._words_stats[w]['idf']) for w, wid in word_dic] 424 | else: 425 | lines = ['%s %d %d %f' % (wid, self._words_stats[w]['cf'], self._words_stats[w]['df'], 426 | self._words_stats[w]['idf']) for w, wid in self._word_dict.items()] 427 | Preprocess.save_lines(words_stats_fp, lines) 428 | 429 | def load_words_stats(self, words_stats_fp): 430 | lines = Preprocess.load_lines(words_stats_fp) 431 | for line in lines: 432 | wid, cf, df, idf = line.split() 433 | self._words_stats[wid] = {} 434 | self._words_stats[wid]['cf'] = int(cf) 435 | self._words_stats[wid]['df'] = int(df) 436 | self._words_stats[wid]['idf'] = float(idf) 437 | 438 | 439 | class NgramUtil(object): 440 | 441 | def __init__(self): 442 | pass 443 | 444 | @staticmethod 445 | def unigrams(words): 446 | """ 447 | Input: a list of words, e.g., ["I", "am", "Denny"] 448 | Output: a list of unigram 449 | """ 450 | assert type(words) == list 451 | return words 452 | 453 | @staticmethod 454 | def bigrams(words, join_string, skip=0): 455 | """ 456 | Input: a list of words, e.g., ["I", "am", "Denny"] 457 | Output: a list of bigram, e.g., ["I_am", "am_Denny"] 458 | """ 459 | assert type(words) == list 460 | L = len(words) 461 | if L > 1: 462 | lst = [] 463 | for i in 
range(L - 1): 464 | for k in range(1, skip + 2): 465 | if i + k < L: 466 | lst.append(join_string.join([words[i], words[i + k]])) 467 | else: 468 | # set it as unigram 469 | lst = NgramUtil.unigrams(words) 470 | return lst 471 | 472 | @staticmethod 473 | def trigrams(words, join_string, skip=0): 474 | """ 475 | Input: a list of words, e.g., ["I", "am", "Denny"] 476 | Output: a list of trigram, e.g., ["I_am_Denny"] 477 | """ 478 | assert type(words) == list 479 | L = len(words) 480 | if L > 2: 481 | lst = [] 482 | for i in range(L - 2): 483 | for k1 in range(1, skip + 2): 484 | for k2 in range(1, skip + 2): 485 | if i + k1 < L and i + k1 + k2 < L: 486 | lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]])) 487 | else: 488 | # set it as bigram 489 | lst = NgramUtil.bigrams(words, join_string, skip) 490 | return lst 491 | 492 | @staticmethod 493 | def fourgrams(words, join_string): 494 | """ 495 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 496 | Output: a list of trigram, e.g., ["I_am_Denny_boy"] 497 | """ 498 | assert type(words) == list 499 | L = len(words) 500 | if L > 3: 501 | lst = [] 502 | for i in xrange(L - 3): 503 | lst.append(join_string.join([words[i], words[i + 1], words[i + 2], words[i + 3]])) 504 | else: 505 | # set it as trigram 506 | lst = NgramUtil.trigrams(words, join_string) 507 | return lst 508 | 509 | @staticmethod 510 | def uniterms(words): 511 | return NgramUtil.unigrams(words) 512 | 513 | @staticmethod 514 | def biterms(words, join_string): 515 | """ 516 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 517 | Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"] 518 | """ 519 | assert type(words) == list 520 | L = len(words) 521 | if L > 1: 522 | lst = [] 523 | for i in range(L - 1): 524 | for j in range(i + 1, L): 525 | lst.append(join_string.join([words[i], words[j]])) 526 | else: 527 | # set it as uniterm 528 | lst = NgramUtil.uniterms(words) 529 | return lst 530 | 531 | @staticmethod 532 | def triterms(words, join_string): 533 | """ 534 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 535 | Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"] 536 | """ 537 | assert type(words) == list 538 | L = len(words) 539 | if L > 2: 540 | lst = [] 541 | for i in xrange(L - 2): 542 | for j in xrange(i + 1, L - 1): 543 | for k in xrange(j + 1, L): 544 | lst.append(join_string.join([words[i], words[j], words[k]])) 545 | else: 546 | # set it as biterm 547 | lst = NgramUtil.biterms(words, join_string) 548 | return lst 549 | 550 | @staticmethod 551 | def fourterms(words, join_string): 552 | """ 553 | Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"] 554 | Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"] 555 | """ 556 | assert type(words) == list 557 | L = len(words) 558 | if L > 3: 559 | lst = [] 560 | for i in xrange(L - 3): 561 | for j in xrange(i + 1, L - 2): 562 | for k in xrange(j + 1, L - 1): 563 | for l in xrange(k + 1, L): 564 | lst.append(join_string.join([words[i], words[j], words[k], words[l]])) 565 | else: 566 | # set it as triterm 567 | lst = NgramUtil.triterms(words, join_string) 568 | return lst 569 | 570 | @staticmethod 571 | def ngrams(words, ngram, join_string=" "): 572 | """ 573 | wrapper for ngram 574 | """ 575 | if ngram == 1: 576 | return NgramUtil.unigrams(words) 577 | elif ngram == 2: 578 | return NgramUtil.bigrams(words, join_string) 579 
| elif ngram == 3: 580 | return NgramUtil.trigrams(words, join_string) 581 | elif ngram == 4: 582 | return NgramUtil.fourgrams(words, join_string) 583 | elif ngram == 12: 584 | unigram = NgramUtil.unigrams(words) 585 | bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2] 586 | return unigram + bigram 587 | elif ngram == 123: 588 | unigram = NgramUtil.unigrams(words) 589 | bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2] 590 | trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3] 591 | return unigram + bigram + trigram 592 | 593 | @staticmethod 594 | def nterms(words, nterm, join_string=" "): 595 | """wrapper for nterm""" 596 | if nterm == 1: 597 | return NgramUtil.uniterms(words) 598 | elif nterm == 2: 599 | return NgramUtil.biterms(words, join_string) 600 | elif nterm == 3: 601 | return NgramUtil.triterms(words, join_string) 602 | elif nterm == 4: 603 | return NgramUtil.fourterms(words, join_string) 604 | 605 | def cal_hist(t1_rep, t2_rep, qnum, hist_size): 606 | #qnum = len(t1_rep) 607 | mhist = np.zeros((qnum, hist_size), dtype=np.float32) 608 | mm = t1_rep.dot(np.transpose(t2_rep)) 609 | for (i,j), v in np.ndenumerate(mm): 610 | if i >= qnum: 611 | break 612 | vid = int((v + 1.) / 2. * (hist_size - 1.)) 613 | mhist[i][vid] += 1. 614 | mhist += 1. 615 | mhist = np.log10(mhist) 616 | return mhist.flatten() 617 | 618 | def cal_binsum(t1_rep, t2_rep, qnum, bin_num): 619 | mbinsum = np.zeros((qnum, bin_num), dtype=np.float32) 620 | mm = t1_rep.dot(np.transpose(t2_rep)) 621 | for (i, j), v in np.ndenumerate(mm): 622 | if i >= qnum: 623 | break 624 | vid = int((v + 1.) / 2. * (bin_num - 1.)) 625 | mbinsum[i][vid] += v 626 | #mhist += 1. # smooth is not needed for computing bin sum 627 | #mhist = np.log10(mhist) # not needed for computing bin sum 628 | return mbinsum.flatten() 629 | 630 | def _test_ngram(): 631 | words = 'hello, world! hello, deep!' 632 | print(NgramUtil.ngrams(list(words), 3, '')) 633 | 634 | # def _test_hist(): 635 | # embedfile = '../../data/mq2007/embed_wiki-pdc_d50_norm' 636 | # queryfile = '../../data/mq2007/qid_query.txt' 637 | # docfile = '../../data/mq2007/docid_doc.txt' 638 | # relfile = '../../data/mq2007/relation.test.fold5.txt' 639 | # histfile = '../../data/mq2007/relation.test.fold5.hist-30.txt' 640 | # embed_dict = read_embedding(filename = embedfile) 641 | # print('after read embedding ...') 642 | # _PAD_ = 193367 643 | # embed_dict[_PAD_] = np.zeros((50, ), dtype=np.float32) 644 | # embed = np.float32(np.random.uniform(-0.2, 0.2, [193368, 50])) 645 | # embed = convert_embed_2_numpy(embed_dict, embed = embed) 646 | # 647 | # query, _ = read_data(queryfile) 648 | # print('after read query ....') 649 | # doc, _ = read_data(docfile) 650 | # print('after read doc ...') 651 | # rel = read_relation(relfile) 652 | # print('after read relation ... 
') 653 | # fout = open(histfile, 'w') 654 | # for label, d1, d2 in rel: 655 | # assert d1 in query 656 | # assert d2 in doc 657 | # qnum = len(query[d1]) 658 | # d1_embed = embed[query[d1]] 659 | # d2_embed = embed[doc[d2]] 660 | # curr_hist = cal_hist(d1_embed, d2_embed, qnum, 30) 661 | # curr_hist = curr_hist.tolist() 662 | # fout.write(' '.join(map(str, curr_hist))) 663 | # fout.write('\n') 664 | # print(qnum) 665 | # #print(curr_hist) 666 | # fout.close() 667 | 668 | 669 | 670 | if __name__ == '__main__': 671 | #_test_ngram() 672 | # test with sample data 673 | basedir = '../../data/example/ranking/' 674 | prepare = Preparation() 675 | sample_file = basedir + 'sample.txt' 676 | corpus, rels = prepare.run_with_one_corpus(sample_file) 677 | print ('total corpus size', len(corpus)) 678 | print ('total relations size', len(rels)) 679 | prepare.save_corpus(basedir + 'corpus.txt', corpus) 680 | prepare.save_relation(basedir + 'relation.txt', rels) 681 | print ('preparation finished ...') 682 | 683 | print ('begin preprocess...') 684 | # Prerpocess corpus file 685 | preprocessor = Preprocess(min_freq=1) 686 | dids, docs = preprocessor.run(basedir + 'corpus.txt') 687 | preprocessor.save_word_dict(basedir + 'word_dict.txt') 688 | preprocessor.save_words_stats(basedir + 'word_stats.txt') 689 | 690 | fout = open(basedir + 'corpus_preprocessed.txt', 'w') 691 | for inum, did in enumerate(dids): 692 | fout.write('%s\t%s\n' % (did, ' '.join(map(str, docs[inum])))) 693 | fout.close() 694 | print('preprocess finished ...') 695 | 696 | 697 | --------------------------------------------------------------------------------
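Note on cal_hist in IART/utils/preprocess.py (an illustrative, hedged walk-through added here for clarity; the numbers below are a toy case, not taken from the repository): each similarity value v in [-1, 1] is mapped to a bin id with vid = int((v + 1.) / 2. * (hist_size - 1.)), counts are accumulated per query term, smoothed by +1 and passed through log10; cal_binsum follows the same binning but accumulates v itself without smoothing.

    # toy check of the binning, assuming hist_size = 30
    hist_size = 30
    for v in (-1.0, 0.0, 0.5, 1.0):
        vid = int((v + 1.) / 2. * (hist_size - 1.))
        print('%s -> %s' % (v, vid))   # -1.0 -> 0, 0.0 -> 14, 0.5 -> 21, 1.0 -> 29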