├── Non-Pretraining-Based ├── C2P-X │ ├── __init__.py │ ├── scripts │ │ ├── test.sh │ │ └── train.sh │ ├── compute_metrics.py │ ├── metrics.py │ ├── test.py │ ├── model_BOW.py │ ├── data_helpers.py │ ├── transformer_block.py │ ├── model_Transformer.py │ ├── model_BiLSTM.py │ └── train.py └── U2P-X │ ├── __init__.py │ ├── scripts │ ├── test.sh │ └── train.sh │ ├── compute_metrics.py │ ├── metrics.py │ ├── test.py │ ├── model_BOW.py │ ├── transformer_block.py │ ├── model_Transformer.py │ ├── data_helpers.py │ ├── model_BiLSTM.py │ └── train.py ├── image ├── result.png └── task.png ├── Pretraining-Based ├── uncased_L-12_H-768_A-12 │ └── README.txt ├── C2P-BERT │ ├── scripts │ │ ├── test.sh │ │ └── train.sh │ ├── __init__.py │ ├── compute_metrics.py │ ├── metrics.py │ ├── optimization.py │ ├── test.py │ └── tokenization.py └── U2P-BERT │ ├── scripts │ ├── test.sh │ └── train.sh │ ├── __init__.py │ ├── compute_metrics.py │ ├── metrics.py │ ├── optimization.py │ └── test.py ├── data_PMPC └── README.txt └── README.md /Non-Pretraining-Based/C2P-X/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /image/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonForJoy/SPD/HEAD/image/result.png -------------------------------------------------------------------------------- /image/task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonForJoy/SPD/HEAD/image/task.png -------------------------------------------------------------------------------- /Pretraining-Based/uncased_L-12_H-768_A-12/README.txt: -------------------------------------------------------------------------------- 1 | ====== Download the BERT base model ====== 2 | 3 | link: https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 4 | Move to path: ./Pretraining-Based/uncased_L-12_H-768_A-12 5 | 6 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 3 | --test_dir ../data_tfrecord/processed_test_both_revised_cand_10.tfrecord \ 4 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 5 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 6 | --max_seq_length 200 \ 7 | --eval_batch_size 50 \ 8 | --restore_model_dir ../output/1631501715 > log_test_BERT_cand_10.txt 2>&1 & 9 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 3 | --test_dir ../data_tfrecord/processed_test_both_revised_cand_10.tfrecord \ 4 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 5 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 6 | --max_seq_length 1400 \ 7 | --eval_batch_size 10 \ 8 | --restore_model_dir ../output/1631263935 > log_test_BERT_cand_10.txt 2>&1 & 9 | 
-------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /data_PMPC/README.txt: -------------------------------------------------------------------------------- 1 | Download the PMPC dataset and move it to path: /data_PMPC 2 | 3 | If you think our work is helpful, or use the code or dataset, please cite the following paper 4 | 5 | @inproceedings{gu-etal-2021-detecting, 6 | title = "Detecting Speaker Personas from Conversational Texts", 7 | author = "Gu, Jia-Chen and 8 | Ling, Zhen-Hua and 9 | Wu, Yu and 10 | Liu, Quan and 11 | Chen, Zhigang and 12 | Zhu, Xiaodan", 13 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 14 | month = nov, 15 | year = "2021", 16 | publisher = "Association for Computational Linguistics", 17 | } -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 3 | --task_name PersonaMatch \ 4 | --train_dir ../data_tfrecord/processed_train_both_revised.tfrecord \ 5 | --valid_dir ../data_tfrecord/processed_valid_both_revised_cand_10.tfrecord \ 6 | --output_dir ../output \ 7 | --do_lower_case True \ 8 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 9 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 10 | --init_checkpoint ../../uncased_L-12_H-768_A-12/bert_model.ckpt \ 11 | --max_seq_length 200 \ 12 | --do_train True \ 13 | --do_eval True \ 14 | --train_batch_size 20 \ 15 | --eval_batch_size 20 \ 16 | --learning_rate 2e-5 \ 17 | --num_train_epochs 10 \ 18 | --warmup_proportion 0.1 > log_train_BERT_cand_10.txt 2>&1 & 19 | 
-------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 3 | --task_name PersonaMatch \ 4 | --train_dir ../data_tfrecord/processed_train_both_revised_cand_10.tfrecord \ 5 | --valid_dir ../data_tfrecord/processed_valid_both_revised_cand_10.tfrecord \ 6 | --output_dir ../output \ 7 | --do_lower_case True \ 8 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 9 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 10 | --init_checkpoint ../../uncased_L-12_H-768_A-12/bert_model.ckpt \ 11 | --max_seq_length 1400 \ 12 | --do_train True \ 13 | --do_eval True \ 14 | --train_batch_size 4 \ 15 | --eval_batch_size 4 \ 16 | --learning_rate 2e-5 \ 17 | --num_train_epochs 20 \ 18 | --warmup_proportion 0.1 > log_train_BERT_cand_10.txt 2>&1 & 19 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | latest_checkpoint=../output/1631263935/checkpoints 3 | echo $latest_checkpoint 4 | 5 | test_file=../../../data_PMPC/test_both_revised_cand_10.txt 6 | vocab_file=../../../data_PMPC/vocab.txt 7 | char_vocab_file=../../../data_PMPC/char_vocab.txt 8 | output_file=${latest_checkpoint}/output_test.txt 9 | 10 | max_context_len=150 11 | max_persona_len=50 12 | max_word_length=18 13 | batch_size=128 14 | 15 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 16 | --test_file $test_file \ 17 | --vocab_file $vocab_file \ 18 | --char_vocab_file $char_vocab_file \ 19 | --output_file $output_file \ 20 | --max_context_len $max_context_len \ 21 | --max_persona_len $max_persona_len \ 22 | --max_word_length $max_word_length \ 23 | --batch_size $batch_size \ 24 | --checkpoint_dir $latest_checkpoint > log_test_BOW_cand_10.txt 2>&1 & 25 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | latest_checkpoint=../output/1631263935/checkpoints 3 | echo $latest_checkpoint 4 | 5 | test_file=../../../data_PMPC/test_both_revised_cand_10.txt 6 | vocab_file=../../../data_PMPC/vocab.txt 7 | char_vocab_file=../../../data_PMPC/char_vocab.txt 8 | output_file=${latest_checkpoint}/output_test.txt 9 | 10 | max_utter_num=8 11 | max_utter_len=20 12 | max_profile_num=5 13 | max_profile_len=15 14 | max_word_length=18 15 | batch_size=128 16 | 17 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 18 | --test_file $test_file \ 19 | --vocab_file $vocab_file \ 20 | --char_vocab_file $char_vocab_file \ 21 | --output_file $output_file \ 22 | --max_utter_num $max_utter_num \ 23 | --max_utter_len $max_utter_len \ 24 | --max_profile_num $max_profile_num \ 25 | --max_profile_len $max_profile_len \ 26 | --max_word_length $max_word_length \ 27 | --batch_size $batch_size \ 28 | --checkpoint_dir $latest_checkpoint > log_test_BOW_can_10.txt 2>&1 & 29 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | train_file=../../../data_PMPC/train_both_revised.txt 3 | valid_file=../../../data_PMPC/valid_both_revised_cand_10.txt 4 | vocab_file=../../../data_PMPC/vocab.txt 5 | 
char_vocab_file=../../../data_PMPC/char_vocab.txt 6 | embedded_vector_file=../../../data_PMPC/filtered.glove.42B.300d.txt 7 | 8 | max_context_len=150 9 | max_persona_len=50 10 | max_word_length=18 11 | num_layer=1 12 | embedding_dim=300 13 | rnn_size=200 14 | 15 | batch_size=128 16 | lambda=0 17 | dropout_keep_prob=0.8 18 | num_epochs=1000 19 | evaluate_every=100 20 | 21 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 22 | --train_file $train_file \ 23 | --valid_file $valid_file \ 24 | --vocab_file $vocab_file \ 25 | --char_vocab_file $char_vocab_file \ 26 | --embedded_vector_file $embedded_vector_file \ 27 | --max_context_len $max_context_len \ 28 | --max_persona_len $max_persona_len \ 29 | --max_word_length $max_word_length \ 30 | --num_layer $num_layer \ 31 | --embedding_dim $embedding_dim \ 32 | --rnn_size $rnn_size \ 33 | --batch_size $batch_size \ 34 | --l2_reg_lambda $lambda \ 35 | --dropout_keep_prob $dropout_keep_prob \ 36 | --num_epochs $num_epochs \ 37 | --evaluate_every $evaluate_every > log_train_BOW_cand_10.txt 2>&1 & 38 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | train_file=../../../data_PMPC/train_both_revised.txt 3 | valid_file=../../../data_PMPC/valid_both_revised_cand_10.txt 4 | vocab_file=../../../data_PMPC/vocab.txt 5 | char_vocab_file=../../../data_PMPC/char_vocab.txt 6 | embedded_vector_file=../../../data_PMPC/filtered.glove.42B.300d.txt 7 | 8 | max_utter_num=8 9 | max_utter_len=20 10 | max_profile_num=5 11 | max_profile_len=15 12 | max_word_length=18 13 | num_layer=1 14 | embedding_dim=300 15 | rnn_size=200 16 | 17 | batch_size=128 18 | lambda=0 19 | dropout_keep_prob=0.8 20 | num_epochs=1000 21 | evaluate_every=100 22 | 23 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 24 | --train_file $train_file \ 25 | --valid_file $valid_file \ 26 | --vocab_file $vocab_file \ 27 | --char_vocab_file $char_vocab_file \ 28 | --embedded_vector_file $embedded_vector_file \ 29 | --max_utter_num $max_utter_num \ 30 | --max_utter_len $max_utter_len \ 31 | --max_profile_num $max_profile_num \ 32 | --max_profile_len $max_profile_len \ 33 | --max_word_length $max_word_length \ 34 | --num_layer $num_layer \ 35 | --embedding_dim $embedding_dim \ 36 | --rnn_size $rnn_size \ 37 | --batch_size $batch_size \ 38 | --l2_reg_lambda $lambda \ 39 | --dropout_keep_prob $dropout_keep_prob \ 40 | --num_epochs $num_epochs \ 41 | --evaluate_every $evaluate_every > log_train_BOW_cand_10.txt 2>&1 & 42 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output/1631259843/output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = 
metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format( 38 | top_1_precision, top_2_precision, top_5_precision)) 39 | 40 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output/1631512095/checkpoints/output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format( 38 | top_1_precision, top_2_precision, top_5_precision)) 39 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output/1631513113/checkpoints/output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | 
total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format( 38 | top_1_precision, top_2_precision, top_5_precision)) 39 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | top_10_precision = metrics.top_k_precision(results, k=10) 38 | print('Recall@1: {}\tRecall@2: {}\tRecall@5: {}\tRecall@10: {}\n'.format( 39 | top_1_precision, top_2_precision, top_5_precision, top_10_precision)) 40 | 41 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg = 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 
0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | else: 70 | raise BaseException 71 | 72 | if num_query > 0: 73 | return top_1_correct/num_query 74 | else: 75 | return 0.0 76 | 77 | 78 | def mean_reciprocal_rank(results): 79 | num_query = 0 80 | mrr = 0.0 81 | for k, v in results.items(): 82 | if not is_valid_query(v): 83 | continue 84 | 85 | num_query += 1 86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 87 | for i, rec in enumerate(sorted_v): 88 | aid, label, score = rec 89 | if label > 0: 90 | mrr += 1.0 / (i + 1) 91 | break 92 | 93 | if num_query == 0: 94 | return 0.0 95 | else: 96 | mrr = mrr / num_query 97 | return mrr 98 | 99 | 100 | def mean_average_precision(results): 101 | num_query = 0 102 | mvp = 0.0 103 | for k, v in results.items(): 104 | if not is_valid_query(v): 105 | continue 106 | 107 | num_query += 1 108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 109 | num_relevant_doc = 0.0 110 | avp = 0.0 111 | for i, rec in enumerate(sorted_v): 112 | aid, label, score = rec 113 | if label == 1: 114 | num_relevant_doc += 1 115 | precision = num_relevant_doc / (i + 1) 116 | avp += precision 117 | avp = avp / num_relevant_doc 118 | mvp += avp 119 | 120 | if num_query == 0: 121 | return 0.0 122 | else: 123 | mvp = mvp / num_query 124 | return mvp 125 | 126 | 127 | def classification_metrics(results): 128 | total_num = 0 129 | total_correct = 0 130 | true_positive = 0 131 | positive_correct = 0 132 | predicted_positive = 0 133 | 134 | loss = 0.0; 135 | for k, v in results.items(): 136 | for rec in v: 137 | total_num += 1 138 | aid, label, score = rec 139 | 140 | if score > 0.5: 141 | predicted_positive += 1 142 | 143 | if label > 0: 144 | true_positive += 1 145 | loss += -math.log(score + 1e-12) 146 | else: 147 | loss += -math.log(1.0 - score + 1e-12); 148 | 149 | if score > 0.5 and label > 0: 150 | total_correct += 1 151 | positive_correct += 1 152 | 153 | if score < 0.5 and label < 0.5: 154 | total_correct += 1 155 | 156 | accuracy = float(total_correct) / total_num 157 | precision = float(positive_correct) / (predicted_positive + 1e-12) 158 | recall = float(positive_correct) / true_positive 159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 160 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg = 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | 
top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | else: 70 | raise BaseException 71 | 72 | if num_query > 0: 73 | return top_1_correct/num_query 74 | else: 75 | return 0.0 76 | 77 | 78 | def mean_reciprocal_rank(results): 79 | num_query = 0 80 | mrr = 0.0 81 | for k, v in results.items(): 82 | if not is_valid_query(v): 83 | continue 84 | 85 | num_query += 1 86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 87 | for i, rec in enumerate(sorted_v): 88 | aid, label, score = rec 89 | if label > 0: 90 | mrr += 1.0 / (i + 1) 91 | break 92 | 93 | if num_query == 0: 94 | return 0.0 95 | else: 96 | mrr = mrr / num_query 97 | return mrr 98 | 99 | 100 | def mean_average_precision(results): 101 | num_query = 0 102 | mvp = 0.0 103 | for k, v in results.items(): 104 | if not is_valid_query(v): 105 | continue 106 | 107 | num_query += 1 108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 109 | num_relevant_doc = 0.0 110 | avp = 0.0 111 | for i, rec in enumerate(sorted_v): 112 | aid, label, score = rec 113 | if label == 1: 114 | num_relevant_doc += 1 115 | precision = num_relevant_doc / (i + 1) 116 | avp += precision 117 | avp = avp / num_relevant_doc 118 | mvp += avp 119 | 120 | if num_query == 0: 121 | return 0.0 122 | else: 123 | mvp = mvp / num_query 124 | return mvp 125 | 126 | 127 | def classification_metrics(results): 128 | total_num = 0 129 | total_correct = 0 130 | true_positive = 0 131 | positive_correct = 0 132 | predicted_positive = 0 133 | 134 | loss = 0.0; 135 | for k, v in results.items(): 136 | for rec in v: 137 | total_num += 1 138 | aid, label, score = rec 139 | 140 | if score > 0.5: 141 | predicted_positive += 1 142 | 143 | if label > 0: 144 | true_positive += 1 145 | loss += -math.log(score + 1e-12) 146 | else: 147 | loss += -math.log(1.0 - score + 1e-12); 148 | 149 | if score > 0.5 and label > 0: 150 | total_correct += 1 151 | positive_correct += 1 152 | 153 | if score < 0.5 and label < 0.5: 154 | total_correct += 1 155 | 156 | accuracy = float(total_correct) / total_num 157 | precision = float(positive_correct) / (predicted_positive + 1e-12) 158 | recall = float(positive_correct) / true_positive 159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 160 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg 
= 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | else: 70 | raise BaseException 71 | 72 | if num_query > 0: 73 | return top_1_correct/num_query 74 | else: 75 | return 0.0 76 | 77 | 78 | def mean_reciprocal_rank(results): 79 | num_query = 0 80 | mrr = 0.0 81 | for k, v in results.items(): 82 | if not is_valid_query(v): 83 | continue 84 | 85 | num_query += 1 86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 87 | for i, rec in enumerate(sorted_v): 88 | aid, label, score = rec 89 | if label > 0: 90 | mrr += 1.0 / (i + 1) 91 | break 92 | 93 | if num_query == 0: 94 | return 0.0 95 | else: 96 | mrr = mrr / num_query 97 | return mrr 98 | 99 | 100 | def mean_average_precision(results): 101 | num_query = 0 102 | mvp = 0.0 103 | for k, v in results.items(): 104 | if not is_valid_query(v): 105 | continue 106 | 107 | num_query += 1 108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 109 | num_relevant_doc = 0.0 110 | avp = 0.0 111 | for i, rec in enumerate(sorted_v): 112 | aid, label, score = rec 113 | if label == 1: 114 | num_relevant_doc += 1 115 | precision = num_relevant_doc / (i + 1) 116 | avp += precision 117 | avp = avp / num_relevant_doc 118 | mvp += avp 119 | 120 | if num_query == 0: 121 | return 0.0 122 | else: 123 | mvp = mvp / num_query 124 | return mvp 125 | 126 | 127 | def classification_metrics(results): 128 | total_num = 0 129 | total_correct = 0 130 | true_positive = 0 131 | positive_correct = 0 132 | predicted_positive = 0 133 | 134 | loss = 0.0; 135 | for k, v in results.items(): 136 | for rec in v: 137 | total_num += 1 138 | aid, label, score = rec 139 | 140 | if score > 0.5: 141 | predicted_positive += 1 142 | 143 | if label > 0: 144 | true_positive += 1 145 | loss += -math.log(score + 1e-12) 146 | else: 147 | loss += -math.log(1.0 - score + 1e-12); 148 | 149 | if score > 0.5 and label > 0: 150 | total_correct += 1 151 | positive_correct += 1 152 | 153 | if score < 0.5 and label < 0.5: 154 | total_correct += 1 155 | 156 | accuracy = float(total_correct) / total_num 157 | precision = float(positive_correct) / (predicted_positive + 1e-12) 158 | recall = float(positive_correct) / 
true_positive 159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 160 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg = 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | elif k == 10: 70 | for vv in sorted_v[0:10]: 71 | label = vv[1] 72 | if label > 0: 73 | top_1_correct += 1 74 | break 75 | else: 76 | raise BaseException 77 | 78 | if num_query > 0: 79 | return top_1_correct/num_query 80 | else: 81 | return 0.0 82 | 83 | 84 | def mean_reciprocal_rank(results): 85 | num_query = 0 86 | mrr = 0.0 87 | for k, v in results.items(): 88 | if not is_valid_query(v): 89 | continue 90 | 91 | num_query += 1 92 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 93 | for i, rec in enumerate(sorted_v): 94 | aid, label, score = rec 95 | if label > 0: 96 | mrr += 1.0 / (i + 1) 97 | break 98 | 99 | if num_query == 0: 100 | return 0.0 101 | else: 102 | mrr = mrr / num_query 103 | return mrr 104 | 105 | 106 | def mean_average_precision(results): 107 | num_query = 0 108 | mvp = 0.0 109 | for k, v in results.items(): 110 | if not is_valid_query(v): 111 | continue 112 | 113 | num_query += 1 114 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 115 | num_relevant_doc = 0.0 116 | avp = 0.0 117 | for i, rec in enumerate(sorted_v): 118 | aid, label, score = rec 119 | if label == 1: 120 | num_relevant_doc += 1 121 | precision = num_relevant_doc / (i + 1) 122 | avp += precision 123 | avp = avp / num_relevant_doc 124 | mvp += avp 125 | 126 | if num_query == 0: 127 | return 0.0 128 | else: 129 | mvp = mvp / num_query 130 | return mvp 131 | 132 | 133 | def classification_metrics(results): 134 | total_num = 0 135 | total_correct = 0 136 | true_positive = 0 137 | positive_correct = 0 138 | predicted_positive = 0 139 | 140 | loss = 0.0; 141 | for k, v in results.items(): 142 | for rec in v: 143 | total_num += 
1 144 | aid, label, score = rec 145 | 146 | if score > 0.5: 147 | predicted_positive += 1 148 | 149 | if label > 0: 150 | true_positive += 1 151 | loss += -math.log(score + 1e-12) 152 | else: 153 | loss += -math.log(1.0 - score + 1e-12); 154 | 155 | if score > 0.5 and label > 0: 156 | total_correct += 1 157 | positive_correct += 1 158 | 159 | if score < 0.5 and label < 0.5: 160 | total_correct += 1 161 | 162 | accuracy = float(total_correct) / total_num 163 | precision = float(positive_correct) / (predicted_positive + 1e-12) 164 | recall = float(positive_correct) / true_positive 165 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 166 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detecting Speaker Personas from Conversational Texts 2 | This repository contains the source code and the dataset for the _EMNLP 2021_ paper [Detecting Speaker Personas from Conversational Texts](https://aclanthology.org/2021.emnlp-main.86.pdf). Jia-Chen Gu, Zhen-Hua Ling, Yu Wu, Quan Liu, Zhigang Chen, Xiaodan Zhu.
3 | 4 | 5 | ## Introduction 6 | Personas are useful for dialogue response prediction. However, the personas used in current studies are pre-defined and hard to obtain before a conversation. To tackle this issue, we study a new task, named Speaker Persona Detection (SPD), which aims to detect speaker personas from plain conversational text. In this task, the best-matched persona is retrieved from a set of candidates given the conversational text. This is a many-to-many semantic matching task because both contexts and personas in SPD are composed of multiple sentences. The long-term dependencies and the dynamic redundancy among these sentences increase the difficulty of this task. We build a dataset for SPD, dubbed Persona Match on Persona-Chat (PMPC). Furthermore, we evaluate several baseline models and propose utterance-to-profile (U2P) matching networks for this task. The U2P models operate at a fine granularity, treating both contexts and personas as sets of multiple sequences. Each sequence pair is scored, and an interpretable overall score for a context-persona pair is obtained through aggregation. Evaluation results show that the U2P models outperform their baseline counterparts significantly. 7 | 8 | 
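To make the utterance-to-profile idea above concrete, here is a minimal NumPy sketch (not the repository's implementation): it assumes each utterance and each profile sentence has already been encoded into a vector, scores every utterance-profile pair, and aggregates the pair scores into one context-persona score. The dot-product scorer and the max-then-mean aggregation are illustrative placeholders; the actual models (```model_BOW```, ```model_BiLSTM```, ```model_Transformer```, C2P/U2P-BERT) define their own encoders and aggregation.
```
import numpy as np

def u2p_score(utterance_vecs, profile_vecs):
    """Score a (context, persona) pair from per-sentence vectors.

    utterance_vecs: [num_utterances, dim], one vector per context utterance.
    profile_vecs:   [num_profiles, dim], one vector per persona profile sentence.
    """
    # 1. Score every utterance-profile pair (here: a plain dot product).
    pair_scores = utterance_vecs @ profile_vecs.T   # [num_utterances, num_profiles]
    # 2. Aggregate: best-matching utterance for each profile sentence, then average.
    return float(pair_scores.max(axis=0).mean())

# Toy usage with random "encodings" (8 utterances, 5 profile sentences, 300-dim).
context = np.random.rand(8, 300)
persona = np.random.rand(5, 300)
print(u2p_score(context, persona))
```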

9 | 10 |
11 | 12 | 13 | ## Dependencies 14 | Python 3.6
15 | Tensorflow 1.13.1 16 | 17 | 18 | ## Download 19 | - Download the [BERT base model released by Google Research](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip), 20 | and move it to ```./Pretraining-Based/uncased_L-12_H-768_A-12```
21 | 22 | - Download the [PMPC dataset](https://drive.google.com/file/d/1sE_N7fi_WojeQBWZcTg4Mw6Pyod27S73/view?usp=sharing) used in our paper, 23 | and move it to ```./data_PMPC```
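The shell scripts below reference the downloaded files by relative path, so the layout matters. If in doubt, a quick check like the following (run from the repository root; this helper is not part of the repository) lists anything that is missing:
```
import os

# Files the training/testing scripts expect (see scripts/*.sh); adjust if you unpack elsewhere.
expected = [
    "Pretraining-Based/uncased_L-12_H-768_A-12/vocab.txt",
    "Pretraining-Based/uncased_L-12_H-768_A-12/bert_config.json",
    "data_PMPC/train_both_revised.txt",
    "data_PMPC/valid_both_revised_cand_10.txt",
    "data_PMPC/test_both_revised_cand_10.txt",
    "data_PMPC/vocab.txt",
    "data_PMPC/char_vocab.txt",
    "data_PMPC/filtered.glove.42B.300d.txt",
]

missing = [p for p in expected if not os.path.exists(p)]
print("All expected files found." if not missing else "Missing: " + ", ".join(missing))
```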
24 | 25 | 26 | ## Non-Pretraining-Based Models 27 | Train a new model. 28 | ``` 29 | cd Non-Pretraining-Based/C2P-X/scripts/ 30 | bash train.sh 31 | ``` 32 | The training process is recorded in the ```log_train_*.txt``` file.
33 | 34 | Test a trained model by modifying the variable ```latest_checkpoint``` in ```test.sh```. 35 | ``` 36 | cd Non-Pretraining-Based/C2P-X/scripts/ 37 | bash test.sh 38 | ``` 39 | The testing process is recorded in the ```log_test_*.txt``` file. An ```output_test.txt``` file, which records the score for each context-persona pair, will be saved under the ```latest_checkpoint``` path. Modify the variable ```test_out_filename``` in ```compute_metrics.py``` and then run the following command; the evaluation metrics will be printed. 40 | ``` 41 | python compute_metrics.py 42 | ``` 43 | 44 | You can choose a baseline model by commenting/uncommenting the corresponding model package (```model_BOW```, ```model_BiLSTM```, ```model_Transformer``` or ```model_ESIM```) in the first few lines of ```train.py```. The same process and commands apply to the Non-Pretraining-Based U2P-X models. 45 | 46 | 47 | ## Pretraining-Based Models 48 | Create the fine-tuning data. 49 | ``` 50 | cd Pretraining-Based/C2P-BERT/ 51 | python data_process_tfrecord.py 52 | ``` 53 | 54 | Run the fine-tuning process. 55 | ``` 56 | cd Pretraining-Based/C2P-BERT/scripts/ 57 | bash train.sh 58 | ``` 59 | 60 | Test a trained model by modifying the variable ```restore_model_dir``` in ```test.sh```. 61 | ``` 62 | cd Pretraining-Based/C2P-BERT/scripts/ 63 | bash test.sh 64 | ``` 65 | 66 | Modify the variable ```test_out_filename``` in ```compute_metrics.py``` and then run the following command; the evaluation metrics will be printed. 67 | ``` 68 | python compute_metrics.py 69 | ``` 70 | 71 | The same process and commands apply to U2P-BERT. 72 | 73 | **NOTE**: Since the dataset is small, each model was trained 10 times with identical architectures and different random initializations. Thus, we report (mean ± standard deviation) in our paper; a short aggregation sketch is given at the end of this README. 74 | 75 | 76 | ## Cite 77 | If you find our work helpful, or use the code or dataset, please cite the following paper: 78 | **"Detecting Speaker Personas from Conversational Texts"** 79 | Jia-Chen Gu, Zhen-Hua Ling, Yu Wu, Quan Liu, Zhigang Chen, Xiaodan Zhu. _EMNLP (2021)_ 80 | ``` 81 | @inproceedings{gu-etal-2021-detecting, 82 | title = "Detecting Speaker Personas from Conversational Texts", 83 | author = "Gu, Jia-Chen and 84 | Ling, Zhen-Hua and 85 | Wu, Yu and 86 | Liu, Quan and 87 | Chen, Zhigang and 88 | Zhu, Xiaodan", 89 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 90 | month = nov, 91 | year = "2021", 92 | address = "Online and Punta Cana, Dominican Republic", 93 | publisher = "Association for Computational Linguistics", 94 | url = "https://aclanthology.org/2021.emnlp-main.86", 95 | pages = "1126--1136", 96 | } 97 | ``` 98 | 99 | 100 | ## Update 101 | Please keep an eye on this repository if you are interested in our work. 102 | Feel free to contact us (gujc@mail.ustc.edu.cn) or open issues. 
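As noted above, reported numbers are the mean ± standard deviation over 10 runs. A minimal sketch of that aggregation (the scores list is a placeholder to be filled with your own per-run results; this helper is not part of the repository):
```
import numpy as np

# Fill in the metric (e.g., Recall_10@1) collected from each of the 10 runs; zeros are placeholders.
run_scores = [0.0] * 10

scores = np.array(run_scores)
# np.std defaults to the population standard deviation; pass ddof=1 for the sample version.
print("{:.4f} ± {:.4f}".format(scores.mean(), scores.std()))
```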
103 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | import metrics 8 | from collections import defaultdict 9 | import data_helpers 10 | 11 | # Files 12 | tf.flags.DEFINE_string("test_file", "", "path to test file") 13 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 14 | tf.flags.DEFINE_string("char_vocab_file", "", "vocabulary file") 15 | tf.flags.DEFINE_string("output_file", "", "prediction output file") 16 | 17 | # Model Hyperparameters 18 | tf.flags.DEFINE_integer("max_context_len", 150, "max context length") 19 | tf.flags.DEFINE_integer("max_persona_len", 50, "max persona length") 20 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 21 | 22 | # Test parameters 23 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 24 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 25 | 26 | # Misc Parameters 27 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 28 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 29 | 30 | FLAGS = tf.flags.FLAGS 31 | # FLAGS._parse_flags() 32 | # print("\nParameters:") 33 | # for attr, value in sorted(FLAGS.__flags.items()): 34 | # print("{}={}".format(attr.upper(), value)) 35 | print("") 36 | 37 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 38 | print('vocabulary size: {}'.format(len(vocab))) 39 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 40 | 41 | test_dataset = data_helpers.load_dataset(FLAGS.test_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len) 42 | print('test_pairs: {}'.format(len(test_dataset))) 43 | 44 | print("\nEvaluating...\n") 45 | 46 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 47 | print(checkpoint_file) 48 | 49 | graph = tf.Graph() 50 | with graph.as_default(): 51 | session_conf = tf.ConfigProto( 52 | allow_soft_placement=FLAGS.allow_soft_placement, 53 | log_device_placement=FLAGS.log_device_placement) 54 | sess = tf.Session(config=session_conf) 55 | with sess.as_default(): 56 | # Load the saved meta graph and restore variables 57 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 58 | saver.restore(sess, checkpoint_file) 59 | 60 | # Get the placeholders from the graph by name 61 | context = graph.get_operation_by_name("context").outputs[0] 62 | context_len = graph.get_operation_by_name("context_len").outputs[0] 63 | persona = graph.get_operation_by_name("persona").outputs[0] 64 | persona_len = graph.get_operation_by_name("persona_len").outputs[0] 65 | 66 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 67 | 68 | c_char_feature = graph.get_operation_by_name("context_char").outputs[0] 69 | c_char_len = graph.get_operation_by_name("context_char_len").outputs[0] 70 | p_char_feature = graph.get_operation_by_name("persona_char").outputs[0] 71 | p_char_len = graph.get_operation_by_name("persona_char_len").outputs[0] 72 | 73 | # Tensors we want to evaluate 74 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0] 75 | 76 | results = defaultdict(list) 77 | num_test = 0 78 | 79 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, FLAGS.max_context_len, 
FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=False) 80 | for test_batch in test_batches: 81 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = test_batch 82 | feed_dict = { 83 | context: x_context, 84 | context_len: x_context_len, 85 | persona: x_persona, 86 | persona_len: x_persona_len, 87 | dropout_keep_prob: 1.0, 88 | c_char_feature: x_context_char, 89 | c_char_len: x_context_char_len, 90 | p_char_feature: x_persona_char, 91 | p_char_len: x_persona_char_len 92 | } 93 | predicted_prob = sess.run(prob, feed_dict) 94 | num_test += len(predicted_prob) 95 | print('num_test_sample={}'.format(num_test)) 96 | for i, prob_score in enumerate(predicted_prob): 97 | us_id, ps_id, label = x_id_pairs[i] 98 | results[us_id].append((ps_id, label, prob_score)) 99 | 100 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 101 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 102 | 103 | mvp = metrics.mean_average_precision(results) 104 | mrr = metrics.mean_reciprocal_rank(results) 105 | top_1_precision = metrics.top_1_precision(results) 106 | total_valid_query = metrics.get_num_valid_query(results) 107 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 108 | 109 | out_path = FLAGS.output_file 110 | print("Saving evaluation to {}".format(out_path)) 111 | with open(out_path, 'w') as f: 112 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 113 | for us_id, v in results.items(): 114 | v.sort(key=operator.itemgetter(2), reverse=True) 115 | for i, rec in enumerate(v): 116 | ps_id, label, prob_score = rec 117 | rank = i+1 118 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, ps_id, prob_score, rank, label)) 119 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/model_BOW.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | 45 | class BOW(object): 46 | def __init__( 47 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 48 | 49 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context") 50 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len") 51 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona") 52 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len") 53 | 54 | self.target = tf.placeholder(tf.float32, [None], name="target") 55 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 56 | 57 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char") 58 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len") 59 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char") 60 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len") 61 | 62 | l2_loss = tf.constant(1.0) 63 | 64 | # =============================== Embedding layer =============================== 65 | with tf.name_scope("embedding"): 66 | W = get_embeddings(vocab) 67 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim] 68 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim] 69 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob) 70 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob) 71 | print("context_embedded: {}".format(context_embedded.get_shape())) 72 | print("persona_embedded: {}".format(persona_embedded.get_shape())) 73 | 74 | 75 | # =============================== Encoding layer =============================== 76 | with tf.variable_scope("encoding_layer") as vs: 77 | mask_c = tf.sequence_mask(self.context_len, max_context_len, dtype=tf.float32) # [batch_size, max_context_len] 78 | mask_c = tf.expand_dims(mask_c, -1) # [batch_size, max_context_len, 1] 79 | final_context = tf.reduce_max(context_embedded * mask_c, axis=1) 80 | 81 | mask_p = tf.sequence_mask(self.persona_len, max_persona_len, dtype=tf.float32) # [batch_size, max_persona_len] 82 | mask_p = tf.expand_dims(mask_p, -1) # [batch_size, max_persona_len, 1] 83 | final_persona = tf.reduce_max(persona_embedded * mask_p, axis=1) 84 | print("establish BOW encoder") 85 | 86 | 87 | # =============================== Matching layer =============================== 88 | with tf.variable_scope("matching_layer") as vs: 89 | output_dim = final_context.get_shape()[-1].value 90 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 91 | 92 | 
similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim] 93 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ] 94 | print("shape of similarity: {}".format(similarity.get_shape())) 95 | 96 | 97 | # =============================== Prediction layer =============================== 98 | with tf.variable_scope("prediction_layer") as vs: 99 | logits = similarity 100 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 101 | 102 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 103 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 104 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 105 | 106 | with tf.name_scope("accuracy"): 107 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 108 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 109 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | import metrics 8 | from collections import defaultdict 9 | import data_helpers 10 | 11 | # Files 12 | tf.flags.DEFINE_string("test_file", "", "path to test file") 13 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 14 | tf.flags.DEFINE_string("char_vocab_file", "", "vocabulary file") 15 | tf.flags.DEFINE_string("output_file", "", "prediction output file") 16 | 17 | # Model Hyperparameters 18 | tf.flags.DEFINE_integer("max_utter_num", 8, "max utterance number") 19 | tf.flags.DEFINE_integer("max_utter_len", 20, "max utterance length") 20 | tf.flags.DEFINE_integer("max_profile_num", 5, "max profile number") 21 | tf.flags.DEFINE_integer("max_profile_len", 15, "max profile length") 22 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 23 | 24 | # Test parameters 25 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 27 | 28 | # Misc Parameters 29 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 30 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 31 | 32 | FLAGS = tf.flags.FLAGS 33 | # FLAGS._parse_flags() 34 | # print("\nParameters:") 35 | # for attr, value in sorted(FLAGS.__flags.items()): 36 | # print("{}={}".format(attr.upper(), value)) 37 | print("") 38 | 39 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 40 | print('vocabulary size: {}'.format(len(vocab))) 41 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 42 | 43 | test_dataset = data_helpers.load_dataset(FLAGS.test_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len) 44 | print('test_pairs: {}'.format(len(test_dataset))) 45 | 46 | print("\nEvaluating...\n") 47 | 48 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 49 | print(checkpoint_file) 50 | 51 | graph = tf.Graph() 52 | with graph.as_default(): 53 | session_conf = tf.ConfigProto( 54 | allow_soft_placement=FLAGS.allow_soft_placement, 55 | log_device_placement=FLAGS.log_device_placement) 56 | sess = tf.Session(config=session_conf) 57 | with sess.as_default(): 58 | # Load the saved meta graph and 
restore variables 59 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 60 | saver.restore(sess, checkpoint_file) 61 | 62 | # Get the placeholders from the graph by name 63 | utterances = graph.get_operation_by_name("utterances").outputs[0] 64 | utterances_len = graph.get_operation_by_name("utterances_len").outputs[0] 65 | utterances_num = graph.get_operation_by_name("utterances_num").outputs[0] 66 | profiles = graph.get_operation_by_name("profiles").outputs[0] 67 | profiles_len = graph.get_operation_by_name("profiles_len").outputs[0] 68 | profiles_num = graph.get_operation_by_name("profiles_num").outputs[0] 69 | 70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 71 | 72 | u_char_feature = graph.get_operation_by_name("utterances_char").outputs[0] 73 | u_char_len = graph.get_operation_by_name("utterances_char_len").outputs[0] 74 | p_char_feature = graph.get_operation_by_name("profiles_char").outputs[0] 75 | p_char_len = graph.get_operation_by_name("profiles_char_len").outputs[0] 76 | 77 | # Tensors we want to evaluate 78 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0] 79 | 80 | results = defaultdict(list) 81 | num_test = 0 82 | 83 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=False) 84 | for test_batch in test_batches: 85 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, \ 86 | x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = test_batch 87 | feed_dict = { 88 | utterances: x_utterances, 89 | utterances_len: x_utterances_len, 90 | utterances_num: x_utterances_num, 91 | profiles: x_profiles, 92 | profiles_len: x_profiles_len, 93 | profiles_num: x_profiles_num, 94 | dropout_keep_prob: 1.0, 95 | u_char_feature: x_utterances_char, 96 | u_char_len: x_utterances_char_len, 97 | p_char_feature: x_profiles_char, 98 | p_char_len: x_profiles_char_len 99 | } 100 | predicted_prob = sess.run(prob, feed_dict) 101 | num_test += len(predicted_prob) 102 | print('num_test_sample={}'.format(num_test)) 103 | for i, prob_score in enumerate(predicted_prob): 104 | us_id, ps_id, label = x_ids[i] 105 | results[us_id].append((ps_id, label, prob_score)) 106 | 107 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 108 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 109 | 110 | mvp = metrics.mean_average_precision(results) 111 | mrr = metrics.mean_reciprocal_rank(results) 112 | top_1_precision = metrics.top_1_precision(results) 113 | total_valid_query = metrics.get_num_valid_query(results) 114 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 115 | 116 | out_path = FLAGS.output_file 117 | print("Saving evaluation to {}".format(out_path)) 118 | with open(out_path, 'w') as f: 119 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 120 | for us_id, v in results.items(): 121 | v.sort(key=operator.itemgetter(2), reverse=True) 122 | for i, rec in enumerate(v): 123 | ps_id, label, prob_score = rec 124 | rank = i+1 125 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, ps_id, prob_score, rank, label)) 126 | -------------------------------------------------------------------------------- 
/Non-Pretraining-Based/C2P-X/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def load_vocab(fname): 6 | ''' 7 | vocab = {"I": 0, ...} 8 | ''' 9 | vocab={} 10 | with open(fname, 'rt') as f: 11 | for i,line in enumerate(f): 12 | word = line.strip() 13 | vocab[word] = i 14 | return vocab 15 | 16 | def load_char_vocab(fname): 17 | ''' 18 | charVocab = {"U": 0, "!": 1, ...} 19 | ''' 20 | charVocab={} 21 | with open(fname, 'rt') as f: 22 | for line in f: 23 | fields = line.strip().split('\t') 24 | char_id = int(fields[0]) 25 | ch = fields[1] 26 | charVocab[ch] = char_id 27 | return charVocab 28 | 29 | def to_vec(tokens, vocab, maxlen): 30 | ''' 31 | length: length of the input sequence 32 | vec: map the token to the vocab_id, return a varied-length array [3, 6, 4, 3, ...] 33 | ''' 34 | n = len(tokens) 35 | length = 0 36 | vec=[] 37 | for i in range(n): 38 | length += 1 39 | if tokens[i] in vocab: 40 | vec.append(vocab[tokens[i]]) 41 | else: 42 | vec.append(vocab["_unk_"]) 43 | return length, np.array(vec) 44 | 45 | def load_dataset(fname, vocab, max_context_len, max_persona_len): 46 | 47 | dataset=[] 48 | with open(fname, 'rt') as f: 49 | for line in f: 50 | line = line.strip() 51 | fields = line.split('\t') 52 | 53 | # id 54 | c_id = fields[0] 55 | 56 | # context 57 | context = fields[1] + " _eos_" 58 | c_tokens = context.split(' ')[:max_context_len] # select the head max_context_len tokens in every context 59 | c_len, c_vec = to_vec(c_tokens, vocab, max_context_len) 60 | 61 | # matched persona 62 | if fields[2] != "NA": 63 | personas = fields[2].split("|") 64 | for index, persona in enumerate(personas): 65 | p_id = "1." + str(index) 66 | persona = persona + " _eos_" 67 | p_tokens = persona.split(' ')[:max_persona_len] # select the head max_persona_len tokens in every persona 68 | p_len, p_vec = to_vec(p_tokens, vocab, max_persona_len) 69 | dataset.append((c_id, c_tokens, c_vec, c_len, 1.0, p_id, p_tokens, p_vec, p_len)) 70 | 71 | # mismatched persona 72 | if fields[3] != "NA": 73 | personas = fields[3].split("|") 74 | for index, persona in enumerate(personas): 75 | p_id = "0." 
+ str(index) 76 | persona = persona + " _eos_" 77 | p_tokens = persona.split(' ')[:max_persona_len] # select the head max_persona_len tokens in every persona 78 | p_len, p_vec = to_vec(p_tokens, vocab, max_persona_len) 79 | dataset.append((c_id, c_tokens, c_vec, c_len, 0.0, p_id, p_tokens, p_vec, p_len)) 80 | 81 | return dataset 82 | 83 | 84 | def normalize_vec(vec, maxlen): 85 | ''' 86 | pad the original vec to the same maxlen 87 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0] 88 | ''' 89 | if len(vec) == maxlen: 90 | return vec 91 | 92 | new_vec = np.zeros(maxlen, dtype='int32') 93 | for i in range(len(vec)): 94 | new_vec[i] = vec[i] 95 | return new_vec 96 | 97 | 98 | def charVec(tokens, charVocab, maxlen, maxWordLength): 99 | ''' 100 | chars = np.array( (maxlen, maxWordLength) ) 0 if not found in charVocab or None 101 | word_lengths = np.array( maxlen ) 1 if None 102 | ''' 103 | n = len(tokens) 104 | if n > maxlen: 105 | n = maxlen 106 | 107 | chars = np.zeros((maxlen, maxWordLength), dtype=np.int32) 108 | word_lengths = np.ones(maxlen, dtype=np.int32) 109 | for i in range(n): 110 | token = tokens[i][:maxWordLength] 111 | word_lengths[i] = len(token) 112 | row = chars[i] 113 | for idx, ch in enumerate(token): 114 | if ch in charVocab: 115 | row[idx] = charVocab[ch] 116 | 117 | return chars, word_lengths 118 | 119 | 120 | def batch_iter(data, batch_size, num_epochs, max_context_len, max_persona_len, 121 | charVocab, max_word_length, shuffle=True): 122 | """ 123 | Generates a batch iterator for a dataset. 124 | """ 125 | data_size = len(data) 126 | num_batches_per_epoch = int(len(data)/batch_size) + 1 127 | for epoch in range(num_epochs): 128 | # Shuffle the data at each epoch 129 | if shuffle: 130 | random.shuffle(data) 131 | for batch_num in range(num_batches_per_epoch): 132 | start_index = batch_num * batch_size 133 | end_index = min((batch_num + 1) * batch_size, data_size) 134 | 135 | x_context = [] 136 | x_context_len = [] 137 | x_persona = [] 138 | x_persona_len = [] 139 | 140 | x_labels = [] 141 | x_id_pairs = [] 142 | 143 | x_context_char = [] 144 | x_context_char_len = [] 145 | x_persona_char = [] 146 | x_persona_char_len = [] 147 | 148 | for rowIdx in range(start_index, end_index): 149 | c_id, c_tokens, c_vec, c_len, label, p_id, p_tokens, p_vec, p_len = data[rowIdx] 150 | 151 | # normalize c_vec 152 | new_c_vec = normalize_vec(c_vec, max_context_len) 153 | x_context.append(new_c_vec) 154 | x_context_len.append(c_len) 155 | 156 | # normalize p_vec 157 | new_p_vec = normalize_vec(p_vec, max_persona_len) 158 | x_persona.append(new_p_vec) 159 | x_persona_len.append(p_len) 160 | 161 | x_labels.append(label) 162 | x_id_pairs.append((c_id, p_id, int(label))) 163 | 164 | # normalize us_CharVec 165 | cCharVec, cCharLen = charVec(c_tokens, charVocab, max_context_len, max_word_length) 166 | x_context_char.append(cCharVec) 167 | x_context_char_len.append(cCharLen) 168 | 169 | # normalize ps_CharVec 170 | pCharVec, pCharLen = charVec(p_tokens, charVocab, max_persona_len, max_word_length) 171 | x_persona_char.append(pCharVec) 172 | x_persona_char_len.append(pCharLen) 173 | 174 | yield np.array(x_context), np.array(x_context_len), np.array(x_persona), np.array(x_persona_len), \ 175 | np.array(x_labels), x_id_pairs, \ 176 | np.array(x_context_char), np.array(x_context_char_len), np.array(x_persona_char), np.array(x_persona_char_len) 177 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/optimization.py: 
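# Illustrative note (assumption, not taken from the repository's training code,
# which is not part of this excerpt): optimization.create_optimizer below
# expects a scalar loss tensor plus the total number of training steps and
# warm-up steps, and returns a train_op. A minimal, hypothetical wiring could
# look like:
#
#   num_train_steps = int(num_train_examples / train_batch_size * num_train_epochs)
#   num_warmup_steps = int(num_train_steps * warmup_proportion)
#   train_op = optimization.create_optimizer(
#       loss=total_loss,
#       init_lr=2e-5,
#       num_train_steps=num_train_steps,
#       num_warmup_steps=num_warmup_steps,
#       use_tpu=False)
#
# All names here (num_train_examples, total_loss, warmup_proportion, ...) are
# placeholders for whatever the actual training script defines.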
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
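  # If the step counter never advanced, the decay/warm-up schedule defined
  # above would stay stuck at step 0, so the counter is incremented explicitly
  # and grouped with the optimizer's update op.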
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 
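      # Net effect per parameter (note that m and v are not bias-corrected
      # here, unlike textbook Adam):
      #   param <- param - lr * ( m / (sqrt(v) + eps) + weight_decay * param )
      # where the decay term is skipped for parameters matched by
      # exclude_from_weight_decay (LayerNorm weights and biases).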
146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/model_BOW.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | 45 | class BOW(object): 46 | def __init__( 47 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 48 | 49 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances") 50 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len") 51 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num") 52 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles") 53 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len") 54 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num") 55 | 56 | self.target = tf.placeholder(tf.float32, [None], name="target") 57 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 58 | 59 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char") 60 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len") 61 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char") 62 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len") 63 | 64 | l2_loss = tf.constant(1.0) 65 | 66 | 67 | # =============================== Embedding layer =============================== 68 | with tf.name_scope("embedding"): 69 | W = get_embeddings(vocab) 70 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim] 71 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim] 72 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob) 73 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob) 74 | print("utterances_embedded: {}".format(utterances_embedded.get_shape())) 75 | print("profiles_embedded: {}".format(profiles_embedded.get_shape())) 76 | 77 | 78 | # =============================== Encoding layer =============================== 79 | with tf.variable_scope("encoding_layer") as vs: 80 | mask_u = tf.sequence_mask(self.utterances_len, max_utter_len, dtype=tf.float32) # [batch_size, max_utter_num, max_utter_len] 81 | mask_u = tf.expand_dims(mask_u, -1) # [batch_size, max_utter_num, max_utter_len, 1] 82 | final_utterances = tf.reduce_max(utterances_embedded * mask_u, axis=2) # [batch_size, max_utter_num, word_dim] 83 | 84 | mask_p = tf.sequence_mask(self.profiles_len, max_profile_len, dtype=tf.float32) # [batch_size, max_profile_num, max_profile_len] 85 | mask_p = tf.expand_dims(mask_p, -1) # 
[batch_size, max_profile_num, max_profile_len, 1] 86 | final_profiles = tf.reduce_max(profiles_embedded * mask_p, axis=2) # [batch_size, max_profile_num, word_dim] 87 | print("establish BOW encoder") 88 | 89 | 90 | # =============================== Matching layer =============================== 91 | with tf.variable_scope("matching_layer") as vs: 92 | concat_dim = final_utterances.get_shape()[-1].value 93 | 94 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 95 | similarity = tf.einsum('aij,jk->aik', 96 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim] 97 | similarity = tf.matmul(similarity, 98 | tf.transpose(final_profiles, perm=[0, 2, 1]), 99 | name="similarity") # [batch_size, max_utter_num, max_profile_num] 100 | 101 | print("shape of similarity: {}".format(similarity.get_shape())) 102 | print("establish matching between utterances and profiles") 103 | 104 | 105 | # =============================== Aggregation layer =============================== 106 | with tf.variable_scope("aggregation_layer") as vs: 107 | logits = tf.reduce_max(similarity, axis=2, name="logits_1") # [batch_size, max_utter_num] 108 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num] 109 | logits = logits * mask_u 110 | logits = tf.reduce_sum(logits, axis=1, name="logits_2") # [batch_size, ] 111 | print("establish reduce_max across profiles and masked_reduce_sum across utterances") 112 | print("logits: {}".format(logits.get_shape())) 113 | 114 | 115 | # =============================== Prediction layer =============================== 116 | with tf.variable_scope("prediction_layer") as vs: 117 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 118 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 119 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 120 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 121 | 122 | with tf.name_scope("accuracy"): 123 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 124 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 125 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/transformer_block.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # main function 4 | def block( 5 | Q, K, V, 6 | Q_lengths, K_lengths, 7 | attention_type='dot', 8 | is_layer_norm=True, 9 | is_mask=True, mask_value=-2**32+1, 10 | drop_prob=None): 11 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf. 
12 | Args: 13 | Q: a tensor with shape [batch, Q_time, Q_dimension] 14 | K: a tensor with shape [batch, time, K_dimension] 15 | V: a tensor with shape [batch, time, V_dimension] 16 | 17 | Q_length: a tensor with shape [batch] 18 | K_length: a tensor with shape [batch] 19 | 20 | Returns: 21 | a tensor with shape [batch, time, dimension] 22 | ''' 23 | att = attention(Q, K, V, 24 | Q_lengths, K_lengths, 25 | attention_type=attention_type, 26 | is_mask=is_mask, mask_value=mask_value, 27 | drop_prob=drop_prob) 28 | if is_layer_norm: 29 | with tf.variable_scope('attention_layer_norm'): 30 | y = layer_norm_debug(Q + att) 31 | else: 32 | y = Q + att 33 | 34 | z = FFN(y) 35 | if is_layer_norm: 36 | with tf.variable_scope('FFN_layer_norm'): 37 | w = layer_norm_debug(y + z) 38 | else: 39 | w = y + z 40 | return w 41 | 42 | def attention( 43 | Q, K, V, 44 | Q_lengths, K_lengths, 45 | attention_type='dot', 46 | is_mask=True, mask_value=-2**32+1, 47 | drop_prob=None): 48 | '''Add attention layer. 49 | Args: 50 | Q: a tensor with shape [batch, Q_time, Q_dimension] 51 | K: a tensor with shape [batch, time, K_dimension] 52 | V: a tensor with shape [batch, time, V_dimension] 53 | 54 | Q_length: a tensor with shape [batch] 55 | K_length: a tensor with shape [batch] 56 | 57 | Returns: 58 | a tensor with shape [batch, Q_time, V_dimension] 59 | 60 | Raises: 61 | AssertionError: if 62 | Q_dimension not equal to K_dimension when attention type is dot. 63 | ''' 64 | assert attention_type in ('dot', 'bilinear') 65 | if attention_type == 'dot': 66 | assert Q.shape[-1] == K.shape[-1] 67 | 68 | Q_time = Q.shape[1] 69 | K_time = K.shape[1] 70 | 71 | if attention_type == 'dot': 72 | logits = dot_sim(Q, K) #[batch, Q_time, time] 73 | if attention_type == 'bilinear': 74 | logits = bilinear_sim(Q, K) 75 | 76 | if is_mask: 77 | _mask = mask(Q_lengths, K_lengths, Q_time, K_time) #[batch, Q_time, K_time] 78 | logits = _mask * logits + (1 - _mask) * mask_value 79 | 80 | attention = tf.nn.softmax(logits) 81 | 82 | if drop_prob is not None: 83 | print('use attention drop') 84 | attention = tf.nn.dropout(attention, drop_prob) 85 | 86 | return weighted_sum(attention, V) 87 | 88 | def dot_sim(x, y, is_nor=True): 89 | '''calculate dot similarity with two tensor. 90 | 91 | Args: 92 | x: a tensor with shape [batch, time_x, dimension] 93 | y: a tensor with shape [batch, time_y, dimension] 94 | 95 | Returns: 96 | a tensor with shape [batch, time_x, time_y] 97 | Raises: 98 | AssertionError: if 99 | the shapes of x and y are not match. 100 | ''' 101 | assert x.shape[-1] == y.shape[-1] 102 | 103 | sim = tf.einsum('bik,bjk->bij', x, y) 104 | 105 | if is_nor: 106 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32)) 107 | scale = tf.maximum(1.0, scale) 108 | return sim / scale 109 | else: 110 | return result 111 | 112 | def bilinear_sim(x, y, is_nor=True): 113 | '''calculate bilinear similarity with two tensor. 114 | Args: 115 | x: a tensor with shape [batch, time_x, dimension_x] 116 | y: a tensor with shape [batch, time_y, dimension_y] 117 | 118 | Returns: 119 | a tensor with shape [batch, time_x, time_y] 120 | Raises: 121 | ValueError: if 122 | the shapes of x and y are not match; 123 | bilinear matrix reuse error. 
124 | ''' 125 | M = tf.get_variable( 126 | name="bilinear_matrix", 127 | shape=[x.shape[-1], y.shape[-1]], 128 | dtype=tf.float32, 129 | initializer=tf.orthogonal_initializer()) 130 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y) 131 | 132 | if is_nor: 133 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32)) 134 | scale = tf.maximum(1.0, scale) 135 | return sim / scale 136 | else: 137 | return sim 138 | 139 | def mask(row_lengths, col_lengths, max_row_length, max_col_length): 140 | '''Return a mask tensor representing the first N positions of each row and each column. 141 | 142 | Args: 143 | row_lengths: a tensor with shape [batch] 144 | col_lengths: a tensor with shape [batch] 145 | 146 | Returns: 147 | a mask tensor with shape [batch, max_row_length, max_col_length] 148 | 149 | Raises: 150 | ''' 151 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #bool, [batch, max_row_len] 152 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #bool, [batch, max_col_len] 153 | 154 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32) 155 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32) 156 | 157 | return tf.einsum('bik,bjk->bij', row_mask, col_mask) 158 | 159 | def weighted_sum(weight, values): 160 | '''Calcualte the weighted sum. 161 | 162 | Args: 163 | weight: a tensor with shape [batch, time, dimension] 164 | values: a tensor with shape [batch, dimension, values_dimension] 165 | 166 | Return: 167 | a tensor with shape [batch, time, values_dimension] 168 | 169 | Raises: 170 | ''' 171 | return tf.einsum('bij,bjk->bik', weight, values) 172 | 173 | def layer_norm_debug(x, axis = None, epsilon=1e-6): 174 | '''Add layer normalization. 175 | 176 | Args: 177 | x: a tensor 178 | axis: the dimensions to normalize 179 | 180 | Returns: 181 | a tensor the same shape as x. 182 | 183 | Raises: 184 | ''' 185 | if axis is None: 186 | axis = [-1] 187 | shape = [x.shape[i] for i in axis] 188 | 189 | scale = tf.get_variable( 190 | name='scale', 191 | shape=shape, 192 | dtype=tf.float32, 193 | initializer=tf.ones_initializer()) 194 | bias = tf.get_variable( 195 | name='bias', 196 | shape=shape, 197 | dtype=tf.float32, 198 | initializer=tf.zeros_initializer()) 199 | 200 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 201 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 202 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 203 | return scale * norm + bias 204 | 205 | def FFN(x, out_dimension_0=None, out_dimension_1=None): 206 | '''Add two dense connected layer, max(0, x*W0+b0)*W1+b1. 207 | 208 | Args: 209 | x: a tensor with shape [batch, time, dimension] 210 | out_dimension: a number which is the output dimension 211 | 212 | Returns: 213 | a tensor with shape [batch, time, out_dimension] 214 | 215 | Raises: 216 | ''' 217 | with tf.variable_scope('FFN_1'): 218 | y = dense(x, out_dimension_0) 219 | y = tf.nn.relu(y) 220 | with tf.variable_scope('FFN_2'): 221 | z = dense(y, out_dimension_1) #, add_bias=False) #!!!! 222 | return z 223 | 224 | def dense(x, out_dimension=None, add_bias=True): 225 | '''Add dense connected layer, Wx + b. 
226 | 227 | Args: 228 | x: a tensor with shape [batch, time, dimension] 229 | out_dimension: a number which is the output dimension 230 | 231 | Return: 232 | a tensor with shape [batch, time, out_dimension] 233 | 234 | Raises: 235 | ''' 236 | if out_dimension is None: 237 | out_dimension = x.shape[-1] 238 | 239 | W = tf.get_variable( 240 | name='weights', 241 | shape=[x.shape[-1], out_dimension], 242 | dtype=tf.float32, 243 | initializer=tf.orthogonal_initializer()) 244 | if add_bias: 245 | bias = tf.get_variable( 246 | name='bias', 247 | shape=[1], 248 | dtype=tf.float32, 249 | initializer=tf.zeros_initializer()) 250 | return tf.einsum('bik,kj->bij', x, W) + bias 251 | else: 252 | return tf.einsum('bik,kj->bij', x, W) 253 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/transformer_block.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # main function 4 | def block( 5 | Q, K, V, 6 | Q_lengths, K_lengths, 7 | attention_type='dot', 8 | is_layer_norm=True, 9 | is_mask=True, mask_value=-2**32+1, 10 | drop_prob=None): 11 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf. 12 | Args: 13 | Q: a tensor with shape [batch, Q_time, Q_dimension] 14 | K: a tensor with shape [batch, time, K_dimension] 15 | V: a tensor with shape [batch, time, V_dimension] 16 | 17 | Q_length: a tensor with shape [batch] 18 | K_length: a tensor with shape [batch] 19 | 20 | Returns: 21 | a tensor with shape [batch, time, dimension] 22 | ''' 23 | att = attention(Q, K, V, 24 | Q_lengths, K_lengths, 25 | attention_type=attention_type, 26 | is_mask=is_mask, mask_value=mask_value, 27 | drop_prob=drop_prob) 28 | if is_layer_norm: 29 | with tf.variable_scope('attention_layer_norm'): 30 | y = layer_norm_debug(Q + att) 31 | else: 32 | y = Q + att 33 | 34 | z = FFN(y) 35 | if is_layer_norm: 36 | with tf.variable_scope('FFN_layer_norm'): 37 | w = layer_norm_debug(y + z) 38 | else: 39 | w = y + z 40 | return w 41 | 42 | def attention( 43 | Q, K, V, 44 | Q_lengths, K_lengths, 45 | attention_type='dot', 46 | is_mask=True, mask_value=-2**32+1, 47 | drop_prob=None): 48 | '''Add attention layer. 49 | Args: 50 | Q: a tensor with shape [batch, Q_time, Q_dimension] 51 | K: a tensor with shape [batch, time, K_dimension] 52 | V: a tensor with shape [batch, time, V_dimension] 53 | 54 | Q_length: a tensor with shape [batch] 55 | K_length: a tensor with shape [batch] 56 | 57 | Returns: 58 | a tensor with shape [batch, Q_time, V_dimension] 59 | 60 | Raises: 61 | AssertionError: if 62 | Q_dimension not equal to K_dimension when attention type is dot. 63 | ''' 64 | assert attention_type in ('dot', 'bilinear') 65 | if attention_type == 'dot': 66 | assert Q.shape[-1] == K.shape[-1] 67 | 68 | Q_time = Q.shape[1] 69 | K_time = K.shape[1] 70 | 71 | if attention_type == 'dot': 72 | logits = dot_sim(Q, K) #[batch, Q_time, time] 73 | if attention_type == 'bilinear': 74 | logits = bilinear_sim(Q, K) 75 | 76 | if is_mask: 77 | _mask = mask(Q_lengths, K_lengths, Q_time, K_time) #[batch, Q_time, K_time] 78 | logits = _mask * logits + (1 - _mask) * mask_value 79 | 80 | attention = tf.nn.softmax(logits) 81 | 82 | if drop_prob is not None: 83 | print('use attention drop') 84 | attention = tf.nn.dropout(attention, drop_prob) 85 | 86 | return weighted_sum(attention, V) 87 | 88 | def dot_sim(x, y, is_nor=True): 89 | '''calculate dot similarity with two tensor. 
90 | 91 | Args: 92 | x: a tensor with shape [batch, time_x, dimension] 93 | y: a tensor with shape [batch, time_y, dimension] 94 | 95 | Returns: 96 | a tensor with shape [batch, time_x, time_y] 97 | Raises: 98 | AssertionError: if 99 | the shapes of x and y are not match. 100 | ''' 101 | assert x.shape[-1] == y.shape[-1] 102 | 103 | sim = tf.einsum('bik,bjk->bij', x, y) 104 | 105 | if is_nor: 106 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32)) 107 | scale = tf.maximum(1.0, scale) 108 | return sim / scale 109 | else: 110 | return result 111 | 112 | def bilinear_sim(x, y, is_nor=True): 113 | '''calculate bilinear similarity with two tensor. 114 | Args: 115 | x: a tensor with shape [batch, time_x, dimension_x] 116 | y: a tensor with shape [batch, time_y, dimension_y] 117 | 118 | Returns: 119 | a tensor with shape [batch, time_x, time_y] 120 | Raises: 121 | ValueError: if 122 | the shapes of x and y are not match; 123 | bilinear matrix reuse error. 124 | ''' 125 | M = tf.get_variable( 126 | name="bilinear_matrix", 127 | shape=[x.shape[-1], y.shape[-1]], 128 | dtype=tf.float32, 129 | initializer=tf.orthogonal_initializer()) 130 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y) 131 | 132 | if is_nor: 133 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32)) 134 | scale = tf.maximum(1.0, scale) 135 | return sim / scale 136 | else: 137 | return sim 138 | 139 | def mask(row_lengths, col_lengths, max_row_length, max_col_length): 140 | '''Return a mask tensor representing the first N positions of each row and each column. 141 | 142 | Args: 143 | row_lengths: a tensor with shape [batch] 144 | col_lengths: a tensor with shape [batch] 145 | 146 | Returns: 147 | a mask tensor with shape [batch, max_row_length, max_col_length] 148 | 149 | Raises: 150 | ''' 151 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #bool, [batch, max_row_len] 152 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #bool, [batch, max_col_len] 153 | 154 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32) 155 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32) 156 | 157 | return tf.einsum('bik,bjk->bij', row_mask, col_mask) 158 | 159 | def weighted_sum(weight, values): 160 | '''Calcualte the weighted sum. 161 | 162 | Args: 163 | weight: a tensor with shape [batch, time, dimension] 164 | values: a tensor with shape [batch, dimension, values_dimension] 165 | 166 | Return: 167 | a tensor with shape [batch, time, values_dimension] 168 | 169 | Raises: 170 | ''' 171 | return tf.einsum('bij,bjk->bik', weight, values) 172 | 173 | def layer_norm_debug(x, axis = None, epsilon=1e-6): 174 | '''Add layer normalization. 175 | 176 | Args: 177 | x: a tensor 178 | axis: the dimensions to normalize 179 | 180 | Returns: 181 | a tensor the same shape as x. 182 | 183 | Raises: 184 | ''' 185 | if axis is None: 186 | axis = [-1] 187 | shape = [x.shape[i] for i in axis] 188 | 189 | scale = tf.get_variable( 190 | name='scale', 191 | shape=shape, 192 | dtype=tf.float32, 193 | initializer=tf.ones_initializer()) 194 | bias = tf.get_variable( 195 | name='bias', 196 | shape=shape, 197 | dtype=tf.float32, 198 | initializer=tf.zeros_initializer()) 199 | 200 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 201 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 202 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 203 | return scale * norm + bias 204 | 205 | def FFN(x, out_dimension_0=None, out_dimension_1=None): 206 | '''Add two dense connected layer, max(0, x*W0+b0)*W1+b1. 
207 | 208 | Args: 209 | x: a tensor with shape [batch, time, dimension] 210 | out_dimension: a number which is the output dimension 211 | 212 | Returns: 213 | a tensor with shape [batch, time, out_dimension] 214 | 215 | Raises: 216 | ''' 217 | with tf.variable_scope('FFN_1'): 218 | y = dense(x, out_dimension_0) 219 | y = tf.nn.relu(y) 220 | with tf.variable_scope('FFN_2'): 221 | z = dense(y, out_dimension_1) #, add_bias=False) #!!!! 222 | return z 223 | 224 | def dense(x, out_dimension=None, add_bias=True): 225 | '''Add dense connected layer, Wx + b. 226 | 227 | Args: 228 | x: a tensor with shape [batch, time, dimension] 229 | out_dimension: a number which is the output dimension 230 | 231 | Return: 232 | a tensor with shape [batch, time, out_dimension] 233 | 234 | Raises: 235 | ''' 236 | if out_dimension is None: 237 | out_dimension = x.shape[-1] 238 | 239 | W = tf.get_variable( 240 | name='weights', 241 | shape=[x.shape[-1], out_dimension], 242 | dtype=tf.float32, 243 | initializer=tf.orthogonal_initializer()) 244 | if add_bias: 245 | bias = tf.get_variable( 246 | name='bias', 247 | shape=[1], 248 | dtype=tf.float32, 249 | initializer=tf.zeros_initializer()) 250 | return tf.einsum('bik,kj->bij', x, W) + bias 251 | else: 252 | return tf.einsum('bik,kj->bij', x, W) 253 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/model_Transformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import transformer_block 4 | 5 | FLAGS = tf.flags.FLAGS 6 | 7 | def get_embeddings(vocab): 8 | print("get_embedding") 9 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 10 | return tf.constant(initializer, name="word_embedding") 11 | 12 | def get_char_embedding(charVocab): 13 | print("get_char_embedding") 14 | char_size = len(charVocab) 15 | embeddings = np.zeros((char_size, char_size), dtype='float32') 16 | for i in range(1, char_size): 17 | embeddings[i, i] = 1.0 18 | 19 | return tf.constant(embeddings, name="word_char_embedding") 20 | 21 | def load_embed_vectors(fname, dim): 22 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 23 | vectors = {} 24 | for line in open(fname, 'rt'): 25 | items = line.strip().split(' ') 26 | if len(items[0]) <= 0: 27 | continue 28 | vec = [float(items[i]) for i in range(1, dim+1)] 29 | vectors[items[0]] = vec 30 | 31 | return vectors 32 | 33 | def load_word_embeddings(vocab, dim): 34 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 35 | vocab_size = len(vocab) 36 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 37 | for word, code in vocab.items(): 38 | if word in vectors: 39 | embeddings[code] = vectors[word] 40 | #else: 41 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 42 | 43 | return embeddings 44 | 45 | def cnn_layer(inputs, filter_sizes, num_filters, scope=None, scope_reuse=False): 46 | with tf.variable_scope(scope, reuse=scope_reuse): 47 | input_size = inputs.get_shape()[2].value 48 | 49 | outputs = [] 50 | for i, filter_size in enumerate(filter_sizes): 51 | with tf.variable_scope("conv_{}".format(i)): 52 | w = tf.get_variable("w", [filter_size, input_size, num_filters]) 53 | b = tf.get_variable("b", [num_filters]) 54 | conv = tf.nn.conv1d(inputs, w, stride=1, padding="VALID") # [num_words, num_chars - filter_size, num_filters] 55 | h = tf.nn.relu(tf.nn.bias_add(conv, b)) # [num_words, num_chars - filter_size, num_filters] 56 | pooled = tf.reduce_max(h, 1) # [num_words, num_filters] 57 | outputs.append(pooled) 58 | return tf.concat(outputs, 1) # [num_words, num_filters * len(filter_sizes)] 59 | 60 | 61 | class Transformer(object): 62 | def __init__( 63 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 64 | 65 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context") 66 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len") 67 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona") 68 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len") 69 | 70 | self.target = tf.placeholder(tf.float32, [None], name="target") 71 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 72 | 73 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char") 74 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len") 75 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char") 76 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len") 77 | 78 | l2_loss = tf.constant(1.0) 79 | 80 | # =============================== Embedding layer =============================== 81 | # 1. 
word embedding 82 | with tf.name_scope("embedding"): 83 | W = get_embeddings(vocab) 84 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim] 85 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim] 86 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob) 87 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob) 88 | print("context_embedded: {}".format(context_embedded.get_shape())) 89 | print("persona_embedded: {}".format(persona_embedded.get_shape())) 90 | 91 | 92 | # =============================== Encoding layer =============================== 93 | emb_dim = context_embedded.get_shape()[-1].value 94 | 95 | # with tf.variable_scope("encoding_layer") as vs: 96 | # # CNN encoder 97 | # final_context = cnn_layer(context_embedded, filter_sizes=[3, 4, 5], num_filters=100, scope="CNN_emb", scope_reuse=False) # [batch_size*max_utter_num, emb] 98 | # final_persona = cnn_layer(persona_embedded, filter_sizes=[3, 4, 5], num_filters=100, scope="CNN_emb", scope_reuse=True) # [batch_size*max_profile_num, emb] 99 | # print("establish CNN encoder") 100 | 101 | context_input = context_embedded 102 | for layer in range(num_layer): 103 | with tf.variable_scope("encoding_layer_{}".format(layer)): 104 | context_output = transformer_block.block(context_input, context_input, context_input, self.context_len, self.context_len) 105 | context_input = context_output 106 | 107 | persona_input = persona_embedded 108 | for layer in range(num_layer): 109 | with tf.variable_scope("encoding_layer_{}".format(layer), reuse=True): # [batch_size, max_context_len, word_dim] 110 | persona_output = transformer_block.block(persona_input, persona_input, persona_input, self.persona_len, self.persona_len) 111 | persona_input = persona_output 112 | print("context_output: {}".format(context_output.get_shape())) # [batch_size, max_persona_len, word_dim] 113 | print("persona_output: {}".format(persona_output.get_shape())) 114 | print("establish {}-layer Transformer encoder".format(num_layer)) 115 | 116 | 117 | # =============================== Matching layer =============================== 118 | with tf.variable_scope("matching_layer") as vs: 119 | mask_c = tf.sequence_mask(self.context_len, max_context_len, dtype=tf.float32) # [batch_size, max_context_len] 120 | context_output = context_output * tf.expand_dims(mask_c, 2) # [batch_size, max_context_len, dim] 121 | final_context = tf.reduce_sum(context_output, axis=1) # [batch_size, dim] 122 | 123 | mask_p = tf.sequence_mask(self.persona_len, max_persona_len, dtype=tf.float32) # [batch_size, max_persona_len] 124 | persona_output = persona_output * tf.expand_dims(mask_p, 2) # [batch_size, max_persona_len, dim] 125 | final_persona = tf.reduce_sum(persona_output, axis=1) # [batch_size, dim] 126 | 127 | output_dim = final_context.get_shape()[-1].value 128 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 129 | 130 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim] 131 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ] 132 | print("shape of similarity: {}".format(similarity.get_shape())) 133 | 134 | 135 | # =============================== Prediction layer =============================== 136 | with tf.variable_scope("prediction_layer") as vs: 137 | logits = similarity 138 | self.probs = 
tf.sigmoid(logits, name="prob") # [batch_size, ] 139 | 140 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 141 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 142 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 143 | 144 | with tf.name_scope("accuracy"): 145 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 146 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 147 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/model_BiLSTM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... } 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | 45 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False): 46 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 47 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 48 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob) 49 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 50 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob) 51 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, 52 | inputs=inputs, 53 | sequence_length=input_seq_len, 54 | dtype=tf.float32) 55 | return rnn_outputs, rnn_states 56 | 57 | def multi_lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, num_layer, scope, scope_reuse=False): 58 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 59 | multi_outputs = [] 60 | multi_states = [] 61 | cur_inputs = inputs 62 | for i_layer in range(num_layer): 63 | rnn_outputs, rnn_states = lstm_layer(cur_inputs, input_seq_len, rnn_size, dropout_keep_prob, scope+str(i_layer), scope_reuse) 64 | rnn_outputs = tf.concat(values=rnn_outputs, axis=2) 65 | multi_outputs.append(rnn_outputs) 66 | multi_states.append(rnn_states) 67 | cur_inputs = rnn_outputs 68 | 69 | # multi_layer_aggregation 70 | ml_weights = 
tf.nn.softmax(tf.get_variable("ml_scores", [num_layer, ], initializer=tf.constant_initializer(0.0))) 71 | 72 | multi_outputs = tf.stack(multi_outputs, axis=-1) # [batch_size, max_len, 2*rnn_size(400), num_layer] 73 | max_len = multi_outputs.get_shape()[1].value 74 | dim = multi_outputs.get_shape()[2].value 75 | flattened_multi_outputs = tf.reshape(multi_outputs, [-1, num_layer]) # [batch_size * max_len * 2*rnn_size(400), num_layer] 76 | aggregated_ml_outputs = tf.matmul(flattened_multi_outputs, tf.expand_dims(ml_weights, 1)) # [batch_size * max_len * 2*rnn_size(400), 1] 77 | aggregated_ml_outputs = tf.reshape(aggregated_ml_outputs, [-1, max_len, dim]) # [batch_size , max_len , 2*rnn_size(400)] 78 | 79 | return aggregated_ml_outputs 80 | 81 | 82 | class BiLSTM(object): 83 | def __init__( 84 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 85 | 86 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context") 87 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len") 88 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona") 89 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len") 90 | 91 | self.target = tf.placeholder(tf.float32, [None], name="target") 92 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 93 | 94 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char") 95 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len") 96 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char") 97 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len") 98 | 99 | l2_loss = tf.constant(1.0) 100 | 101 | # =============================== Embedding layer =============================== 102 | with tf.name_scope("embedding"): 103 | W = get_embeddings(vocab) 104 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim] 105 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim] 106 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob) 107 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob) 108 | print("context_embedded: {}".format(context_embedded.get_shape())) 109 | print("persona_embedded: {}".format(persona_embedded.get_shape())) 110 | 111 | 112 | # =============================== Encoding layer =============================== 113 | with tf.variable_scope("encoding_layer") as vs: 114 | rnn_scope_name = "bidirectional_rnn" 115 | # 1. single_lstm_layer 116 | c_rnn_output, c_rnn_states = lstm_layer(context_embedded, self.context_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False) 117 | context_output = tf.concat(axis=2, values=c_rnn_output) # [batch_size, max_context_len, rnn_size*2] 118 | p_rnn_output, p_rnn_states = lstm_layer(persona_embedded, self.persona_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) # [batch_size, max_profile_len, rnn_size(200)] 119 | persona_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size, max_persona_len, rnn_size*2] 120 | # 2. 
multi_lstm_layer 121 | # utterances_output = multi_lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=False) 122 | # response_output = multi_lstm_layer(flattened_responses_embedded, flattened_responses_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=True) 123 | # print("establish AHRE layers : {}".format(num_layer)) 124 | print("establish BiLSTM encoder") 125 | 126 | 127 | # =============================== Matching layer =============================== 128 | with tf.variable_scope("matching_layer") as vs: 129 | final_context = tf.concat(axis=1, values=[c_rnn_states[0].h, c_rnn_states[1].h]) # [batch_size, rnn_size*2] 130 | final_persona = tf.concat(axis=1, values=[p_rnn_states[0].h, p_rnn_states[1].h]) # [batch_size, rnn_size*2] 131 | 132 | output_dim = final_context.get_shape()[-1].value 133 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 134 | 135 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim] 136 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ] 137 | print("shape of similarity: {}".format(similarity.get_shape())) 138 | 139 | 140 | # =============================== Prediction layer =============================== 141 | with tf.variable_scope("prediction_layer") as vs: 142 | logits = similarity 143 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 144 | 145 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 146 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 147 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 148 | 149 | with tf.name_scope("accuracy"): 150 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 151 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 152 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/model_Transformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import transformer_block 4 | 5 | FLAGS = tf.flags.FLAGS 6 | 7 | def get_embeddings(vocab): 8 | print("get_embedding") 9 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 10 | return tf.constant(initializer, name="word_embedding") 11 | 12 | def get_char_embedding(charVocab): 13 | print("get_char_embedding") 14 | char_size = len(charVocab) 15 | embeddings = np.zeros((char_size, char_size), dtype='float32') 16 | for i in range(1, char_size): 17 | embeddings[i, i] = 1.0 18 | 19 | return tf.constant(embeddings, name="word_char_embedding") 20 | 21 | def load_embed_vectors(fname, dim): 22 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 23 | vectors = {} 24 | for line in open(fname, 'rt'): 25 | items = line.strip().split(' ') 26 | if len(items[0]) <= 0: 27 | continue 28 | vec = [float(items[i]) for i in range(1, dim+1)] 29 | vectors[items[0]] = vec 30 | 31 | return vectors 32 | 33 | def load_word_embeddings(vocab, dim): 34 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 35 | vocab_size = len(vocab) 36 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 37 | for word, code in vocab.items(): 38 | if word in vectors: 39 | embeddings[code] = vectors[word] 40 | #else: 41 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 42 | 43 | return embeddings 44 | 45 | 46 | class Transformer(object): 47 | def __init__( 48 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 49 | 50 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances") 51 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len") 52 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num") 53 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles") 54 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len") 55 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num") 56 | 57 | self.target = tf.placeholder(tf.float32, [None], name="target") 58 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 59 | 60 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char") 61 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len") 62 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char") 63 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len") 64 | 65 | l2_loss = tf.constant(1.0) 66 | 67 | 68 | # =============================== Embedding layer =============================== 69 | with tf.name_scope("embedding"): 70 | W = get_embeddings(vocab) 71 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim] 72 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim] 73 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob) 74 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob) 75 | print("utterances_embedded: {}".format(utterances_embedded.get_shape())) 76 | print("profiles_embedded: {}".format(profiles_embedded.get_shape())) 77 | 78 | 79 | # =============================== Encoding layer =============================== 80 | with tf.variable_scope("encoding_layer") as vs: 81 | rnn_scope_name = "bidirectional_rnn" 82 | emb_dim = utterances_embedded.get_shape()[-1].value 83 | flattened_utterances_embedded = tf.reshape(utterances_embedded, [-1, max_utter_len, emb_dim]) # [batch_size*max_utter_num, max_utter_len, emb] 84 | flattened_utterances_len = tf.reshape(self.utterances_len, [-1]) # [batch_size*max_utter_num, ] 85 | flattened_profiles_embedded = tf.reshape(profiles_embedded, [-1, max_profile_len, emb_dim]) # [batch_size*max_profile_num, max_profile_len, emb] 86 | 
flattened_profiles_len = tf.reshape(self.profiles_len, [-1]) # [batch_size*max_profile_num, ] 87 | 88 | utterances_input = flattened_utterances_embedded 89 | profiles_input = flattened_profiles_embedded 90 | for layer in range(num_layer): 91 | with tf.variable_scope("encoding_layer_{}".format(layer)): 92 | utterances_output = transformer_block.block(utterances_input, utterances_input, utterances_input, 93 | flattened_utterances_len, flattened_utterances_len) 94 | utterances_input = utterances_output 95 | 96 | for layer in range(num_layer): 97 | with tf.variable_scope("encoding_layer_{}".format(layer), reuse=True): 98 | profiles_output = transformer_block.block(profiles_input, profiles_input, profiles_input, 99 | flattened_profiles_len, flattened_profiles_len) 100 | profiles_input = profiles_output 101 | print("establish Transformer encoder") 102 | print("utterances_output: {}".format(utterances_output.get_shape())) 103 | print("profiles_output: {}".format(profiles_output.get_shape())) 104 | 105 | 106 | # =============================== Matching layer =============================== 107 | with tf.variable_scope("matching_layer") as vs: 108 | mask_u = tf.sequence_mask(flattened_utterances_len, max_utter_len, dtype=tf.float32) # [batch_size*max_utter_num, max_utter_len] 109 | utterances_output = utterances_output * tf.expand_dims(mask_u, 2) # [batch_size*max_utter_num, max_utter_len, dim] 110 | final_utterances = tf.reduce_sum(utterances_output, axis=1) # [batch_size*max_utter_num, dim] 111 | # final_utterances = tf.div(final_utterances, tf.expand_dims(tf.sqrt(tf.cast(flattened_utterances_len, tf.float32)), 1)) 112 | concat_dim = final_utterances.get_shape()[-1].value 113 | final_utterances = tf.reshape(final_utterances, [-1, max_utter_num, concat_dim]) # [batch_size, max_utter_num, dim] 114 | 115 | mask_p = tf.sequence_mask(flattened_profiles_len, max_profile_len, dtype=tf.float32) # [batch_size*max_profile_num, max_profile_len] 116 | profiles_output = profiles_output * tf.expand_dims(mask_p, 2) 117 | final_profiles = tf.reduce_sum(profiles_output, axis=1) 118 | # final_profiles = tf.div(final_profiles, tf.expand_dims(tf.sqrt(tf.cast(flattened_profiles_len, tf.float32)), 1)) 119 | final_profiles = tf.reshape(final_profiles, [-1, max_profile_num, concat_dim]) # [batch_size, max_profile_num, dim] 120 | 121 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 122 | similarity = tf.einsum('aij,jk->aik', 123 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim] 124 | similarity = tf.matmul(similarity, 125 | tf.transpose(final_profiles, perm=[0, 2, 1])) # [batch_size, max_utter_num, max_profile_num] 126 | 127 | print("shape of similarity: {}".format(similarity.get_shape())) 128 | 129 | 130 | # =============================== Aggregation layer =============================== 131 | with tf.variable_scope("aggregation_layer") as vs: 132 | logits = tf.reduce_max(similarity, axis=2) # [batch_size, max_utter_num] 133 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num] 134 | logits = logits * mask_u 135 | logits = tf.reduce_sum(logits, axis=1) # [batch_size, ] 136 | print("establish reduce_max across profiles and masked_reduce_sum across utterances") 137 | print("logits: {}".format(logits.get_shape())) 138 | 139 | 140 | # =============================== Prediction layer =============================== 141 | with tf.variable_scope("prediction_layer") as 
vs: 142 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 143 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 144 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 145 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 146 | 147 | with tf.name_scope("accuracy"): 148 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 149 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 150 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def load_vocab(fname): 6 | ''' 7 | vocab = {"I": 0, ...} 8 | ''' 9 | vocab={} 10 | with open(fname, 'rt') as f: 11 | for i,line in enumerate(f): 12 | word = line.strip() 13 | vocab[word] = i 14 | return vocab 15 | 16 | def load_char_vocab(fname): 17 | ''' 18 | charVocab = {"U": 0, "!": 1, ...} 19 | ''' 20 | charVocab={} 21 | with open(fname, 'rt') as f: 22 | for line in f: 23 | fields = line.strip().split('\t') 24 | char_id = int(fields[0]) 25 | ch = fields[1] 26 | charVocab[ch] = char_id 27 | return charVocab 28 | 29 | def to_vec(tokens, vocab, maxlen): 30 | ''' 31 | length: length of the input sequence 32 | vec: map the token to the vocab_id, return a varied-length array [3, 6, 4, 3, ...] 33 | ''' 34 | n = len(tokens) 35 | length = 0 36 | vec=[] 37 | for i in range(n): 38 | length += 1 39 | if tokens[i] in vocab: 40 | vec.append(vocab[tokens[i]]) 41 | else: 42 | # vec.append(vocab["fiance"]) # fix to fiance 43 | vec.append(vocab["_unk_"]) 44 | return length, np.array(vec) 45 | 46 | def load_dataset(fname, vocab, max_utter_num, max_utter_len, max_profile_num, max_profile_len): 47 | 48 | dataset=[] 49 | with open(fname, 'rt') as f: 50 | for line in f: 51 | # ( id, context utterances, persona candidates, label ) 52 | line = line.strip() 53 | fields = line.split('\t') 54 | 55 | # id 56 | us_id = fields[0] 57 | 58 | # context utterances 59 | context = fields[1] 60 | utterances = context.split(' _eos_ ') 61 | utterances = [utterance + " _eos_" for utterance in utterances] 62 | utterances = utterances[-max_utter_num:] # select the last max_utter_num utterances 63 | 64 | us_tokens = [] 65 | us_vec = [] 66 | us_len = [] 67 | for utterance in utterances: 68 | u_tokens = utterance.split(' ')[:max_utter_len] # select the head max_utter_len tokens in every utterance 69 | u_len, u_vec = to_vec(u_tokens, vocab, max_utter_len) 70 | us_tokens.append(u_tokens) 71 | us_vec.append(u_vec) 72 | us_len.append(u_len) 73 | us_num = len(utterances) 74 | 75 | # persona candidates 76 | if fields[2] != "NA": 77 | personas = fields[2].split("|") 78 | for index, persona in enumerate(personas): 79 | # ps_id = "match_" + str(index) 80 | ps_id = "1." 
+ str(index) 81 | profiles = persona.split(' _eos_ ') 82 | profiles = [profile + " _eos_" for profile in profiles] 83 | profiles = profiles[-max_profile_num:] # select the last max_utter_num utterances 84 | ps_tokens = [] 85 | ps_vec = [] 86 | ps_len = [] 87 | for profile in profiles: 88 | p_tokens = profile.split(' ')[:max_profile_len] # select the head max_profile_len tokens in every persona 89 | p_len, p_vec = to_vec(p_tokens, vocab, max_profile_len) 90 | ps_tokens.append(p_tokens) 91 | ps_vec.append(p_vec) 92 | ps_len.append(p_len) 93 | ps_num = len(profiles) 94 | dataset.append((us_id, us_tokens, us_vec, us_len, us_num, 1.0, ps_id, ps_tokens, ps_vec, ps_len, ps_num)) 95 | 96 | if fields[3] != "NA": 97 | personas = fields[3].split("|") 98 | for index, persona in enumerate(personas): 99 | # ps_id = "mismatch_" + str(index) 100 | ps_id = "0." + str(index) 101 | profiles = persona.split(' _eos_ ') 102 | profiles = [profile + " _eos_" for profile in profiles] 103 | profiles = profiles[-max_profile_num:] 104 | ps_tokens = [] 105 | ps_vec = [] 106 | ps_len = [] 107 | for profile in profiles: 108 | p_tokens = profile.split(' ')[:max_profile_len] 109 | p_len, p_vec = to_vec(p_tokens, vocab, max_profile_len) 110 | ps_tokens.append(p_tokens) 111 | ps_vec.append(p_vec) 112 | ps_len.append(p_len) 113 | ps_num = len(profiles) 114 | dataset.append((us_id, us_tokens, us_vec, us_len, us_num, 0.0, ps_id, ps_tokens, ps_vec, ps_len, ps_num)) 115 | 116 | return dataset 117 | 118 | 119 | def normalize_vec(vec, maxlen): 120 | ''' 121 | pad the original vec to the same maxlen 122 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0] 123 | ''' 124 | if len(vec) == maxlen: 125 | return vec 126 | 127 | new_vec = np.zeros(maxlen, dtype='int32') 128 | for i in range(len(vec)): 129 | new_vec[i] = vec[i] 130 | return new_vec 131 | 132 | 133 | def charVec(tokens, charVocab, maxlen, maxWordLength): 134 | ''' 135 | chars = np.array( (maxlen, maxWordLength) ) 0 if not found in charVocab or None 136 | word_lengths = np.array( maxlen ) 1 if None 137 | ''' 138 | n = len(tokens) 139 | if n > maxlen: 140 | n = maxlen 141 | 142 | chars = np.zeros((maxlen, maxWordLength), dtype=np.int32) 143 | word_lengths = np.ones(maxlen, dtype=np.int32) 144 | for i in range(n): 145 | token = tokens[i][:maxWordLength] 146 | word_lengths[i] = len(token) 147 | row = chars[i] 148 | for idx, ch in enumerate(token): 149 | if ch in charVocab: 150 | row[idx] = charVocab[ch] 151 | 152 | return chars, word_lengths 153 | 154 | 155 | def batch_iter(data, batch_size, num_epochs, max_utter_num, max_utter_len, max_profile_num, max_profile_len, 156 | charVocab, max_word_length, shuffle=True): 157 | """ 158 | Generates a batch iterator for a dataset. 
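    Each yielded batch is a tuple:
    (utterances, utterances_len, utterances_num,
     profiles, profiles_len, profiles_num,
     labels, id_pairs,
     utterances_char, utterances_char_len, profiles_char, profiles_char_len),
    where the sequence tensors are numpy arrays zero-padded to the configured
    maximum lengths and id_pairs is a list of (utterances_id, profiles_id, label) tuples.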
159 | """ 160 | data_size = len(data) 161 | num_batches_per_epoch = int(len(data)/batch_size) + 1 162 | for epoch in range(num_epochs): 163 | # Shuffle the data at each epoch 164 | if shuffle: 165 | random.shuffle(data) 166 | for batch_num in range(num_batches_per_epoch): 167 | start_index = batch_num * batch_size 168 | end_index = min((batch_num + 1) * batch_size, data_size) 169 | 170 | x_utterances = [] 171 | x_utterances_len = [] 172 | x_utterances_num = [] 173 | x_profiles = [] 174 | x_profiles_len = [] 175 | x_profiles_num = [] 176 | 177 | x_labels = [] 178 | x_id_pairs = [] 179 | 180 | x_utterances_char = [] 181 | x_utterances_char_len = [] 182 | x_profiles_char = [] 183 | x_profiles_char_len = [] 184 | 185 | for rowIdx in range(start_index, end_index): 186 | us_id, us_tokens, us_vec, us_len, us_num, label, ps_id, ps_tokens, ps_vec, ps_len, ps_num = data[rowIdx] 187 | 188 | # normalize us_vec and us_len 189 | new_utters_vec = np.zeros((max_utter_num, max_utter_len), dtype='int32') 190 | new_utters_len = np.zeros((max_utter_num, ), dtype='int32') 191 | for i in range(len(us_len)): 192 | new_utter_vec = normalize_vec(us_vec[i], max_utter_len) 193 | new_utters_vec[i] = new_utter_vec 194 | new_utters_len[i] = us_len[i] 195 | x_utterances.append(new_utters_vec) 196 | x_utterances_len.append(new_utters_len) 197 | x_utterances_num.append(us_num) 198 | 199 | # normalize ps_vec and ps_len 200 | new_profiles_vec = np.zeros((max_profile_num, max_profile_len), dtype='int32') 201 | new_profiles_len = np.zeros((max_profile_num, ), dtype='int32') 202 | for i in range(len(ps_len)): 203 | new_profile_vec = normalize_vec(ps_vec[i], max_profile_len) 204 | new_profiles_vec[i] = new_profile_vec 205 | new_profiles_len[i] = ps_len[i] 206 | x_profiles.append(new_profiles_vec) 207 | x_profiles_len.append(new_profiles_len) 208 | x_profiles_num.append(ps_num) 209 | 210 | x_labels.append(label) 211 | x_id_pairs.append((us_id, ps_id, int(label))) 212 | 213 | # normalize us_CharVec and us_CharLen 214 | uttersCharVec = np.zeros((max_utter_num, max_utter_len, max_word_length), dtype='int32') 215 | uttersCharLen = np.ones((max_utter_num, max_utter_len), dtype='int32') 216 | for i in range(len(us_len)): 217 | utterCharVec, utterCharLen = charVec(us_tokens[i], charVocab, max_utter_len, max_word_length) 218 | uttersCharVec[i] = utterCharVec 219 | uttersCharLen[i] = utterCharLen 220 | x_utterances_char.append(uttersCharVec) 221 | x_utterances_char_len.append(uttersCharLen) 222 | 223 | # normalize ps_CharVec and ps_CharLen 224 | psCharVec = np.zeros((max_profile_num, max_profile_len, max_word_length), dtype='int32') 225 | psCharLen = np.ones((max_profile_num, max_profile_len), dtype='int32') 226 | for i in range(len(ps_len)): 227 | pCharVec, pCharLen = charVec(ps_tokens[i], charVocab, max_profile_len, max_word_length) 228 | psCharVec[i] = pCharVec 229 | psCharLen[i] = pCharLen 230 | x_profiles_char.append(psCharVec) 231 | x_profiles_char_len.append(psCharLen) 232 | 233 | yield np.array(x_utterances), np.array(x_utterances_len), np.array(x_utterances_num), \ 234 | np.array(x_profiles), np.array(x_profiles_len), np.array(x_profiles_num), \ 235 | np.array(x_labels), x_id_pairs, \ 236 | np.array(x_utterances_char), np.array(x_utterances_char_len), np.array(x_profiles_char), np.array(x_profiles_char_len) 237 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/train.py: -------------------------------------------------------------------------------- 1 | import 
tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | from collections import defaultdict 8 | import metrics 9 | import data_helpers 10 | from model_BOW import BOW as MODEL 11 | # from model_BiLSTM import BiLSTM as MODEL 12 | # from model_Transformer import Transformer as MODEL 13 | # from model_ESIM import ESIM as MODEL 14 | 15 | 16 | # Files 17 | tf.flags.DEFINE_string("train_file", "", "path to train file") 18 | tf.flags.DEFINE_string("valid_file", "", "path to valid file") 19 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 20 | tf.flags.DEFINE_string("char_vocab_file", "", "path to char vocab file") 21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector") 22 | 23 | # Model Hyperparameters 24 | tf.flags.DEFINE_integer("max_context_len", 150, "max context length") 25 | tf.flags.DEFINE_integer("max_persona_len", 50, "max persona length") 26 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 27 | tf.flags.DEFINE_integer("num_layer", 1, "num of layers in sentence encoder") 28 | tf.flags.DEFINE_integer("embedding_dim", 200, "dimensionality of word embedding") 29 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units") 30 | 31 | # Training parameters 32 | tf.flags.DEFINE_integer("batch_size", 128, "batch size (default: 128)") 33 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0)") 34 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "dropout keep probability (default: 1.0)") 35 | tf.flags.DEFINE_integer("num_epochs", 1000000, "number of training epochs (default: 1000000)") 36 | tf.flags.DEFINE_integer("evaluate_every", 1000, "evaluate model on valid dataset after this many steps (default: 1000)") 37 | 38 | # Misc Parameters 39 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 40 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 41 | 42 | FLAGS = tf.flags.FLAGS 43 | # FLAGS._parse_flags() 44 | # print("\nParameters:") 45 | # for attr, value in sorted(FLAGS.__flags.items()): 46 | # print("{}={}".format(attr.upper(), value)) 47 | print("") 48 | 49 | # Load data 50 | print("Loading data...") 51 | 52 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 53 | print('vocabulary size: {}'.format(len(vocab))) 54 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 55 | 56 | train_dataset = data_helpers.load_dataset(FLAGS.train_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len) 57 | print('train_pairs: {}'.format(len(train_dataset))) 58 | valid_dataset = data_helpers.load_dataset(FLAGS.valid_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len) 59 | print('valid_pairs: {}'.format(len(valid_dataset))) 60 | 61 | with tf.Graph().as_default(): 62 | session_conf = tf.ConfigProto( 63 | allow_soft_placement=FLAGS.allow_soft_placement, 64 | log_device_placement=FLAGS.log_device_placement) 65 | sess = tf.Session(config=session_conf) 66 | with sess.as_default(): 67 | model = MODEL( 68 | max_context_len=FLAGS.max_context_len, 69 | max_persona_len=FLAGS.max_persona_len, 70 | num_layer=FLAGS.num_layer, 71 | vocab_size=len(vocab), 72 | embedding_size=FLAGS.embedding_dim, 73 | vocab=vocab, 74 | rnn_size=FLAGS.rnn_size, 75 | maxWordLength=FLAGS.max_word_length, 76 | charVocab=charVocab, 77 | l2_reg_lambda=FLAGS.l2_reg_lambda) 78 | # Define Training procedure 79 | global_step = tf.Variable(0, name="global_step", trainable=False) 80 | 
starter_learning_rate = 0.001 81 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 82 | 5000, 0.96, staircase=True) 83 | optimizer = tf.train.AdamOptimizer(learning_rate) 84 | grads_and_vars = optimizer.compute_gradients(model.mean_loss) 85 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 86 | 87 | # Keep track of gradient values and sparsity (optional) 88 | """ 89 | grad_summaries = [] 90 | for g, v in grads_and_vars: 91 | if g is not None: 92 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 93 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 94 | grad_summaries.append(grad_hist_summary) 95 | grad_summaries.append(sparsity_summary) 96 | grad_summaries_merged = tf.merge_summary(grad_summaries) 97 | """ 98 | 99 | # Output directory for models and summaries 100 | timestamp = str(int(time.time())) 101 | # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 102 | out_dir = os.path.abspath(os.path.join("../output", timestamp)) 103 | print("Writing to {}\n".format(out_dir)) 104 | 105 | # Summaries for loss and accuracy 106 | """ 107 | loss_summary = tf.scalar_summary("loss", model.mean_loss) 108 | acc_summary = tf.scalar_summary("accuracy", model.accuracy) 109 | 110 | # Train Summaries 111 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 112 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 113 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def) 114 | 115 | # Dev summaries 116 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 117 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 118 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def) 119 | """ 120 | 121 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 122 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 123 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 124 | if not os.path.exists(checkpoint_dir): 125 | os.makedirs(checkpoint_dir) 126 | saver = tf.train.Saver(tf.global_variables()) 127 | 128 | # Initialize all variables 129 | sess.run(tf.global_variables_initializer()) 130 | 131 | def train_step(x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, 132 | x_context_char, x_context_char_len, x_persona_char, x_persona_char_len): 133 | """ 134 | A single training step 135 | """ 136 | feed_dict = { 137 | model.context: x_context, 138 | model.context_len: x_context_len, 139 | model.persona: x_persona, 140 | model.persona_len: x_persona_len, 141 | model.target: x_labels, 142 | model.dropout_keep_prob: FLAGS.dropout_keep_prob, 143 | model.c_charVec: x_context_char, 144 | model.c_charLen: x_context_char_len, 145 | model.p_charVec: x_persona_char, 146 | model.p_charLen: x_persona_char_len 147 | } 148 | 149 | _, step, loss, accuracy, predicted_prob = sess.run( 150 | [train_op, global_step, model.mean_loss, model.accuracy, model.probs], 151 | feed_dict) 152 | 153 | if step%100 == 0: 154 | time_str = datetime.datetime.now().isoformat() 155 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 156 | #train_summary_writer.add_summary(summaries, step) 157 | 158 | 159 | def dev_step(): 160 | results = defaultdict(list) 161 | num_test = 0 162 | num_correct = 0.0 163 | valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=True) 164 | for valid_batch in valid_batches: 165 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = valid_batch 166 | feed_dict = { 167 | model.context: x_context, 168 | model.context_len: x_context_len, 169 | model.persona: x_persona, 170 | model.persona_len: x_persona_len, 171 | model.target: x_labels, 172 | model.dropout_keep_prob: 1.0, 173 | model.c_charVec: x_context_char, 174 | model.c_charLen: x_context_char_len, 175 | model.p_charVec: x_persona_char, 176 | model.p_charLen: x_persona_char_len 177 | } 178 | batch_accuracy, predicted_prob = sess.run([model.accuracy, model.probs], feed_dict) 179 | num_test += len(predicted_prob) 180 | if num_test % 1000 == 0: 181 | print(num_test) 182 | 183 | num_correct += len(predicted_prob) * batch_accuracy 184 | for i, prob_score in enumerate(predicted_prob): 185 | utterances_id, profiles_id, label = x_id_pairs[i] 186 | results[utterances_id].append((profiles_id, label, prob_score)) 187 | 188 | #calculate top-1 precision 189 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct/num_test)) 190 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 191 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 192 | 193 | mvp = metrics.mean_average_precision(results) 194 | mrr = metrics.mean_reciprocal_rank(results) 195 | top_1_precision = metrics.top_1_precision(results) 196 | total_valid_query = metrics.get_num_valid_query(results) 197 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 198 | 199 | return mrr 200 | 201 | best_mrr = 
0.0 202 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=True) 203 | for batch in batches: 204 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = batch 205 | train_step(x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len) 206 | current_step = tf.train.global_step(sess, global_step) 207 | if current_step % FLAGS.evaluate_every == 0: 208 | print("\nEvaluation:") 209 | valid_mrr = dev_step() 210 | if valid_mrr > best_mrr: 211 | best_mrr = valid_mrr 212 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 213 | print("Saved model checkpoint to {}\n".format(path)) 214 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/model_BiLSTM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False): 45 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 46 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 47 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob) 48 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 49 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob) 50 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, 51 | inputs=inputs, 52 | sequence_length=input_seq_len, 53 | dtype=tf.float32) 54 | return rnn_outputs, rnn_states 55 | 56 | def multi_lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, num_layer, scope, scope_reuse=False): 57 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 58 | multi_outputs = [] 59 | multi_states = [] 60 | cur_inputs = inputs 61 | for i_layer in range(num_layer): 62 | rnn_outputs, rnn_states = lstm_layer(cur_inputs, input_seq_len, rnn_size, dropout_keep_prob, scope+str(i_layer), scope_reuse) 63 | rnn_outputs = tf.concat(values=rnn_outputs, axis=2) 64 | multi_outputs.append(rnn_outputs) 65 | multi_states.append(rnn_states) 66 | cur_inputs = rnn_outputs 67 | 68 | # multi_layer_aggregation 69 | ml_weights = tf.nn.softmax(tf.get_variable("ml_scores", [num_layer, ], initializer=tf.constant_initializer(0.0))) 70 | 71 | multi_outputs = tf.stack(multi_outputs, axis=-1) # [batch_size, max_len, 2*rnn_size(400), num_layer] 72 | max_len = multi_outputs.get_shape()[1].value 73 | dim = multi_outputs.get_shape()[2].value 74 | flattened_multi_outputs = tf.reshape(multi_outputs, [-1, num_layer]) # [batch_size * max_len * 2*rnn_size(400), num_layer] 75 | aggregated_ml_outputs = tf.matmul(flattened_multi_outputs, tf.expand_dims(ml_weights, 1)) # [batch_size * max_len * 2*rnn_size(400), 1] 76 | aggregated_ml_outputs = tf.reshape(aggregated_ml_outputs, [-1, max_len, dim]) # [batch_size , max_len , 2*rnn_size(400)] 77 | 78 | return aggregated_ml_outputs 79 | 80 | 81 | class BiLSTM(object): 82 | def __init__( 83 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 84 | 85 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances") 86 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len") 87 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num") 88 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles") 89 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len") 90 | self.profiles_num = 
tf.placeholder(tf.int32, [None], name="profiles_num") 91 | 92 | self.target = tf.placeholder(tf.float32, [None], name="target") 93 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 94 | 95 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char") 96 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len") 97 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char") 98 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len") 99 | 100 | l2_loss = tf.constant(1.0) 101 | 102 | 103 | # =============================== Embedding layer =============================== 104 | with tf.name_scope("embedding"): 105 | W = get_embeddings(vocab) 106 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim] 107 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim] 108 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob) 109 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob) 110 | print("utterances_embedded: {}".format(utterances_embedded.get_shape())) 111 | print("profiles_embedded: {}".format(profiles_embedded.get_shape())) 112 | 113 | 114 | # =============================== Encoding layer =============================== 115 | with tf.variable_scope("encoding_layer") as vs: 116 | rnn_scope_name = "bidirectional_rnn" 117 | emb_dim = utterances_embedded.get_shape()[-1].value 118 | flattened_utterances_embedded = tf.reshape(utterances_embedded, [-1, max_utter_len, emb_dim]) # [batch_size*max_utter_num, max_utter_len, emb] 119 | flattened_utterances_len = tf.reshape(self.utterances_len, [-1]) # [batch_size*max_utter_num, ] 120 | flattened_profiles_embedded = tf.reshape(profiles_embedded, [-1, max_profile_len, emb_dim]) # [batch_size*max_profile_num, max_profile_len, emb] 121 | flattened_profiles_len = tf.reshape(self.profiles_len, [-1]) # [batch_size*max_profile_num, ] 122 | # 1. single_lstm_layer 123 | u_rnn_output, u_rnn_states = lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False) 124 | utterances_output = tf.concat(axis=2, values=u_rnn_output) # [batch_size*max_utter_num, max_utter_len, rnn_size*2] 125 | p_rnn_output, p_rnn_states = lstm_layer(flattened_profiles_embedded, flattened_profiles_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) # [batch_size, max_profile_len, rnn_size(200)] 126 | profiles_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size*max_profile_num, max_profile_len, 2*rnn_size(400)] 127 | # 2. 
multi_lstm_layer 128 | # utterances_output = multi_lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=False) 129 | # response_output = multi_lstm_layer(flattened_responses_embedded, flattened_responses_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=True) 130 | # print("establish AHRE layers : {}".format(num_layer)) 131 | print("establish BiLSTM encoder") 132 | 133 | 134 | # =============================== Matching layer =============================== 135 | with tf.variable_scope("matching_layer") as vs: 136 | final_utterances = tf.concat(axis=1, values=[u_rnn_states[0].h, u_rnn_states[1].h]) 137 | concat_dim = final_utterances.get_shape()[-1].value 138 | final_utterances = tf.reshape(final_utterances, [-1, max_utter_num, concat_dim]) # [batch_size, max_utter_num, dim] 139 | 140 | final_profiles = tf.concat(axis=1, values=[p_rnn_states[0].h, p_rnn_states[1].h]) 141 | final_profiles = tf.reshape(final_profiles, [-1, max_profile_num, concat_dim]) # [batch_size, max_profile_num, dim] 142 | 143 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 144 | similarity = tf.einsum('aij,jk->aik', 145 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim] 146 | similarity = tf.matmul(similarity, 147 | tf.transpose(final_profiles, perm=[0, 2, 1]), 148 | name="similarity") # [batch_size, max_utter_num, max_profile_num] 149 | 150 | print("shape of similarity: {}".format(similarity.get_shape())) 151 | print("establish matching between utterances and profiles") 152 | 153 | 154 | # =============================== Aggregation layer =============================== 155 | with tf.variable_scope("aggregation_layer") as vs: 156 | logits = tf.reduce_max(similarity, axis=2, name="logits_1") # [batch_size, max_utter_num] 157 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num] 158 | logits = logits * mask_u 159 | logits = tf.reduce_sum(logits, axis=1, name="logits_2") # [batch_size, ] 160 | print("establish reduce_max across profiles and masked_reduce_sum across utterances") 161 | print("logits: {}".format(logits.get_shape())) 162 | 163 | 164 | # =============================== Prediction layer =============================== 165 | with tf.variable_scope("prediction_layer") as vs: 166 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 167 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 168 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 169 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 170 | 171 | with tf.name_scope("accuracy"): 172 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 173 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 174 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import operator 23 | from time import time 24 | from collections import defaultdict 25 | import tensorflow as tf 26 | import optimization 27 | import tokenization 28 | import modeling as modeling 29 | import metrics 30 | 31 | 32 | flags = tf.flags 33 | FLAGS = flags.FLAGS 34 | 35 | ## Required parameters 36 | flags.DEFINE_string( 37 | "test_dir", 'valid.tfrecord', 38 | "The input test data dir. Should contain the .tsv files (or other data files) for the task.") 39 | 40 | flags.DEFINE_string( 41 | "restore_model_dir", 'output/', 42 | "The output directory where the model checkpoints have been written.") 43 | 44 | flags.DEFINE_string( 45 | "task_name", 'TestModel', 46 | "The name of the task.") 47 | 48 | flags.DEFINE_string( 49 | "bert_config_file", 'uncased_L-12_H-768_A-12/bert_config.json', 50 | "The config json file corresponding to the pre-trained BERT model. " 51 | "This specifies the model architecture.") 52 | 53 | flags.DEFINE_integer( 54 | "max_seq_length", 320, 55 | "The maximum total input sequence length after WordPiece tokenization. " 56 | "Sequences longer than this will be truncated, and sequences shorter " 57 | "than this will be padded.") 58 | 59 | flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.") 60 | 61 | flags.DEFINE_integer("eval_batch_size", 32, "Total batch size for predict.") 62 | 63 | 64 | def print_configuration_op(FLAGS): 65 | print('My Configurations:') 66 | for name, value in FLAGS.__flags.items(): 67 | value=value.value 68 | if type(value) == float: 69 | print(' %s:\t %f'%(name, value)) 70 | elif type(value) == int: 71 | print(' %s:\t %d'%(name, value)) 72 | elif type(value) == str: 73 | print(' %s:\t %s'%(name, value)) 74 | elif type(value) == bool: 75 | print(' %s:\t %s'%(name, value)) 76 | else: 77 | print('%s:\t %s' % (name, value)) 78 | print('End of configuration') 79 | 80 | 81 | def total_sample(file_name): 82 | sample_nums = 0 83 | for record in tf.python_io.tf_record_iterator(file_name): 84 | sample_nums += 1 85 | return sample_nums 86 | 87 | 88 | def parse_exmp(serial_exmp): 89 | input_data = tf.parse_single_example(serial_exmp, 90 | features={ 91 | "text_a_id": 92 | tf.FixedLenFeature([], tf.int64), 93 | "text_b_id": 94 | tf.FixedLenFeature([], tf.int64), 95 | "input_ids": 96 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 97 | "input_mask": 98 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 99 | "segment_ids": 100 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 101 | "label_ids": 102 | tf.FixedLenFeature([], tf.float32) 103 | } 104 | ) 105 | # So cast all int64 to int32. 
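    # (tf.train.Example only supports int64 integer features, so the parsed
    #  tensors are cast down to int32 in the loop below.)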
106 | for name in list(input_data.keys()): 107 | t = input_data[name] 108 | if t.dtype == tf.int64: 109 | t = tf.to_int32(t) 110 | input_data[name] = t 111 | 112 | text_a_id = input_data["text_a_id"] 113 | text_b_id = input_data['text_b_id'] 114 | input_ids = input_data["input_ids"] 115 | input_mask = input_data["input_mask"] 116 | segment_ids= input_data["segment_ids"] 117 | labels = input_data['label_ids'] 118 | return text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels 119 | 120 | 121 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, text_a_id, text_b_id, 122 | num_labels, use_one_hot_embeddings): 123 | """Creates a classification model.""" 124 | model = modeling.BertModel( 125 | config=bert_config, 126 | is_training=is_training, 127 | input_ids=input_ids, 128 | input_mask=input_mask, 129 | token_type_ids=segment_ids, 130 | use_one_hot_embeddings=use_one_hot_embeddings) 131 | 132 | # In the demo, we are doing a simple classification task on the entire 133 | # segment. 134 | # 135 | # If you want to use the token-level output, use model.get_sequence_output() 136 | # instead. 137 | target_loss_weight = [1.0, 1.0] 138 | target_loss_weight = tf.convert_to_tensor(target_loss_weight) 139 | 140 | flagx = tf.cast(tf.greater(labels, 0), dtype=tf.float32) 141 | flagy = tf.cast(tf.equal(labels, 0), dtype=tf.float32) 142 | 143 | all_target_loss = target_loss_weight[1] * flagx + target_loss_weight[0] * flagy 144 | 145 | output_layer = model.get_pooled_output() 146 | 147 | hidden_size = output_layer.shape[-1].value 148 | 149 | output_weights = tf.get_variable( 150 | "output_weights", [num_labels, hidden_size], 151 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 152 | 153 | output_bias = tf.get_variable( 154 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 155 | 156 | with tf.variable_scope("loss"): 157 | # if is_training: 158 | # # I.e., 0.1 dropout 159 | # output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 160 | output_layer = tf.layers.dropout(output_layer, rate=0.1, training=is_training) 161 | 162 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 163 | logits = tf.nn.bias_add(logits, output_bias) 164 | 165 | probabilities = tf.sigmoid(logits, name="prob") 166 | logits = tf.squeeze(logits,[1]) 167 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels) 168 | losses = tf.multiply(losses, all_target_loss) 169 | 170 | mean_loss = tf.reduce_mean(losses, name="mean_loss") + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 171 | 172 | with tf.name_scope("accuracy"): 173 | correct_prediction = tf.equal(tf.sign(probabilities - 0.5), tf.sign(labels - 0.5)) 174 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 175 | # 176 | # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 177 | # 178 | # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 179 | # loss = tf.reduce_mean(per_example_loss) 180 | 181 | return mean_loss, logits, probabilities, accuracy, model, output_layer 182 | 183 | 184 | best_score = 0.0 185 | def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer): 186 | results = defaultdict(list) 187 | num_test = 0 188 | num_correct = 0.0 189 | n_updates = 0 190 | mrr = 0 191 | t0 = time() 192 | try: 193 | while True: 194 | n_updates += 1 195 | 196 | batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False}) 197 | 
question_id, answer_id, label = pair_ 198 | 199 | num_test += len(predicted_prob) 200 | # if num_test % 1000 == 0: 201 | # print(num_test) 202 | 203 | num_correct += len(predicted_prob) * batch_accuracy 204 | for i, prob_score in enumerate(predicted_prob): 205 | results[question_id[i]].append((answer_id[i], label[i], prob_score[0])) 206 | 207 | if n_updates%100 == 0: 208 | tf.logging.info("n_update %d , %s: Mins Used: %.2f" % 209 | (n_updates, op_name, (time() - t0) / 60.0)) 210 | 211 | except tf.errors.OutOfRangeError: 212 | 213 | print("Inference Time: {} s".format(time() - t0)) 214 | 215 | # calculate top-1 precision 216 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test)) 217 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 218 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 219 | 220 | mvp = metrics.mean_average_precision(results) 221 | mrr = metrics.mean_reciprocal_rank(results) 222 | top_1_precision = metrics.top_1_precision(results) 223 | total_valid_query = metrics.get_num_valid_query(results) 224 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format( 225 | mvp, mrr, top_1_precision, total_valid_query)) 226 | 227 | out_path = os.path.join(dir_path, "output_test.txt") 228 | print("Saving evaluation to {}".format(out_path)) 229 | with open(out_path, 'w') as f: 230 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 231 | for us_id, v in results.items(): 232 | v.sort(key=operator.itemgetter(2), reverse=True) 233 | for i, rec in enumerate(v): 234 | r_id, label, prob_score = rec 235 | rank = i+1 236 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label)) 237 | return mrr 238 | 239 | 240 | def main(_): 241 | tf.logging.set_verbosity(tf.logging.INFO) 242 | 243 | print_configuration_op(FLAGS) 244 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 245 | 246 | test_data_size = total_sample(FLAGS.test_dir) 247 | tf.logging.info('test data size: {}'.format(test_data_size)) 248 | 249 | filenames = tf.placeholder(tf.string, shape=[None]) 250 | shuffle_size = tf.placeholder(tf.int64) 251 | dataset = tf.data.TFRecordDataset(filenames) 252 | dataset = dataset.map(parse_exmp) # Parse the record into tensors. 
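    # Evaluation pipeline: a single pass over the test TFRecord (repeat(1)),
    # no shuffling, batched by eval_batch_size; run_test() consumes the
    # initializable iterator until tf.errors.OutOfRangeError is raised.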
253 | dataset = dataset.repeat(1) 254 | # dataset = dataset.shuffle(shuffle_size) 255 | dataset = dataset.batch(FLAGS.eval_batch_size) 256 | iterator = dataset.make_initializable_iterator() 257 | text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels = iterator.get_next() # output dir 258 | pair_ids = [text_a_id, text_b_id, labels] 259 | 260 | training = tf.placeholder(tf.bool) 261 | mean_loss, logits, probabilities, accuracy, model, output_layer = create_model(bert_config, 262 | is_training = training, 263 | input_ids = input_ids, 264 | input_mask = input_mask, 265 | segment_ids = segment_ids, 266 | labels = labels, 267 | text_a_id = text_a_id, 268 | text_b_id = text_b_id, 269 | num_labels = 1, 270 | use_one_hot_embeddings = False) 271 | 272 | 273 | config = tf.ConfigProto(allow_soft_placement=True) 274 | config.gpu_options.allow_growth = True 275 | 276 | if FLAGS.do_eval: 277 | with tf.Session(config=config) as sess: 278 | tf.logging.info("*** Restore model ***") 279 | 280 | ckpt = tf.train.get_checkpoint_state(FLAGS.restore_model_dir) 281 | variables = tf.trainable_variables() 282 | saver = tf.train.Saver(variables) 283 | saver.restore(sess, ckpt.model_checkpoint_path) 284 | 285 | tf.logging.info('Test begin') 286 | sess.run(iterator.initializer, 287 | feed_dict={filenames: [FLAGS.test_dir], shuffle_size: 1}) 288 | run_test(FLAGS.restore_model_dir, "test", sess, training, accuracy, probabilities, pair_ids, output_layer) 289 | 290 | 291 | if __name__ == "__main__": 292 | 293 | tf.app.run() 294 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | from collections import defaultdict 8 | import metrics 9 | import data_helpers 10 | from model_BOW import BOW as MODEL 11 | # from model_BiLSTM import BiLSTM as MODEL 12 | # from model_Transformer import Transformer as MODEL 13 | # from model_ESIM import ESIM as MODEL 14 | 15 | 16 | # Files 17 | tf.flags.DEFINE_string("train_file", "", "path to train file") 18 | tf.flags.DEFINE_string("valid_file", "", "path to valid file") 19 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 20 | tf.flags.DEFINE_string("char_vocab_file", "", "path to char vocab file") 21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector") 22 | 23 | # Model Hyperparameters 24 | tf.flags.DEFINE_integer("max_utter_num", 8, "max utterance number") 25 | tf.flags.DEFINE_integer("max_utter_len", 20, "max utterance length") 26 | tf.flags.DEFINE_integer("max_profile_num", 5, "max profile number") 27 | tf.flags.DEFINE_integer("max_profile_len", 15, "max profile length") 28 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 29 | tf.flags.DEFINE_integer("num_layer", 1, "num of layers in sentence encoder") 30 | tf.flags.DEFINE_integer("embedding_dim", 200, "dimensionality of word embedding") 31 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units") 32 | 33 | # Training parameters 34 | tf.flags.DEFINE_integer("batch_size", 128, "batch size (default: 128)") 35 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0)") 36 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "dropout keep probability (default: 1.0)") 37 | tf.flags.DEFINE_integer("num_epochs", 1000000, "number of training epochs (default: 1000000)") 38 
| tf.flags.DEFINE_integer("evaluate_every", 1000, "evaluate model on valid dataset after this many steps (default: 1000)") 39 | 40 | # Misc Parameters 41 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 42 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 43 | 44 | FLAGS = tf.flags.FLAGS 45 | # FLAGS._parse_flags() 46 | # print("\nParameters:") 47 | # for attr, value in sorted(FLAGS.__flags.items()): 48 | # print("{}={}".format(attr.upper(), value)) 49 | print("") 50 | 51 | # Load data 52 | print("Loading data...") 53 | 54 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 55 | print('vocabulary size: {}'.format(len(vocab))) 56 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 57 | 58 | train_dataset = data_helpers.load_dataset(FLAGS.train_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len) 59 | print('train_pairs: {}'.format(len(train_dataset))) 60 | valid_dataset = data_helpers.load_dataset(FLAGS.valid_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len) 61 | print('valid_pairs: {}'.format(len(valid_dataset))) 62 | 63 | with tf.Graph().as_default(): 64 | session_conf = tf.ConfigProto( 65 | allow_soft_placement=FLAGS.allow_soft_placement, 66 | log_device_placement=FLAGS.log_device_placement) 67 | sess = tf.Session(config=session_conf) 68 | with sess.as_default(): 69 | model = MODEL( 70 | max_utter_num=FLAGS.max_utter_num, 71 | max_utter_len=FLAGS.max_utter_len, 72 | max_profile_num=FLAGS.max_profile_num, 73 | max_profile_len=FLAGS.max_profile_len, 74 | num_layer=FLAGS.num_layer, 75 | vocab_size=len(vocab), 76 | embedding_size=FLAGS.embedding_dim, 77 | vocab=vocab, 78 | rnn_size=FLAGS.rnn_size, 79 | maxWordLength=FLAGS.max_word_length, 80 | charVocab=charVocab, 81 | l2_reg_lambda=FLAGS.l2_reg_lambda) 82 | # Define Training procedure 83 | global_step = tf.Variable(0, name="global_step", trainable=False) 84 | starter_learning_rate = 0.001 85 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 86 | 5000, 0.96, staircase=True) 87 | optimizer = tf.train.AdamOptimizer(learning_rate) 88 | grads_and_vars = optimizer.compute_gradients(model.mean_loss) 89 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 90 | 91 | # Keep track of gradient values and sparsity (optional) 92 | """ 93 | grad_summaries = [] 94 | for g, v in grads_and_vars: 95 | if g is not None: 96 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 97 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 98 | grad_summaries.append(grad_hist_summary) 99 | grad_summaries.append(sparsity_summary) 100 | grad_summaries_merged = tf.merge_summary(grad_summaries) 101 | """ 102 | 103 | # Output directory for models and summaries 104 | timestamp = str(int(time.time())) 105 | # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 106 | out_dir = os.path.abspath(os.path.join("../output", timestamp)) 107 | print("Writing to {}\n".format(out_dir)) 108 | 109 | # Summaries for loss and accuracy 110 | """ 111 | loss_summary = tf.scalar_summary("loss", model.mean_loss) 112 | acc_summary = tf.scalar_summary("accuracy", model.accuracy) 113 | 114 | # Train Summaries 115 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 116 | train_summary_dir = os.path.join(out_dir, 
"summaries", "train") 117 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def) 118 | 119 | # Dev summaries 120 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 121 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 122 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def) 123 | """ 124 | 125 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 126 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 127 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 128 | if not os.path.exists(checkpoint_dir): 129 | os.makedirs(checkpoint_dir) 130 | saver = tf.train.Saver(tf.global_variables()) 131 | 132 | # Initialize all variables 133 | sess.run(tf.global_variables_initializer()) 134 | # ===================================================================================== 135 | # tvars = tf.trainable_variables() 136 | # para_total = 0 137 | # print 'All parameters:' 138 | # for i in xrange(len(tvars)): 139 | # print tvars[i].name 140 | # print tvars[i].get_shape() 141 | # if tvars[i].get_shape().ndims==1: 142 | # para_total += int(tvars[i].get_shape()[0]) 143 | # else: 144 | # para_total += int(tvars[i].get_shape()[0])*int(tvars[i].get_shape()[1]) 145 | # print 'Total Parameter Numbers: {}.'.format(para_total) 146 | # ===================================================================================== 147 | 148 | 149 | def train_step(x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, 150 | x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len): 151 | """ 152 | A single training step 153 | """ 154 | feed_dict = { 155 | model.utterances: x_utterances, 156 | model.utterances_len: x_utterances_len, 157 | model.utterances_num: x_utterances_num, 158 | model.profiles: x_profiles, 159 | model.profiles_len: x_profiles_len, 160 | model.profiles_num: x_profiles_num, 161 | model.target: x_labels, 162 | model.dropout_keep_prob: FLAGS.dropout_keep_prob, 163 | model.u_charVec: x_utterances_char, 164 | model.u_charLen: x_utterances_char_len, 165 | model.p_charVec: x_profiles_char, 166 | model.p_charLen: x_profiles_char_len 167 | } 168 | 169 | _, step, loss, accuracy, predicted_prob = sess.run( 170 | [train_op, global_step, model.mean_loss, model.accuracy, model.probs], 171 | feed_dict) 172 | 173 | if step%100 == 0: 174 | time_str = datetime.datetime.now().isoformat() 175 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 176 | #train_summary_writer.add_summary(summaries, step) 177 | 178 | 179 | def dev_step(): 180 | # t0 = time.time() 181 | results = defaultdict(list) 182 | num_test = 0 183 | num_correct = 0.0 184 | valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=True) 185 | for valid_batch in valid_batches: 186 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = valid_batch 187 | feed_dict = { 188 | model.utterances: x_utterances, 189 | model.utterances_len: x_utterances_len, 190 | model.utterances_num: x_utterances_num, 191 | model.profiles: x_profiles, 192 | model.profiles_len: x_profiles_len, 193 | model.profiles_num: x_profiles_num, 194 | 
model.target: x_labels, 195 | model.dropout_keep_prob: 1.0, 196 | model.u_charVec: x_utterances_char, 197 | model.u_charLen: x_utterances_char_len, 198 | model.p_charVec: x_profiles_char, 199 | model.p_charLen: x_profiles_char_len 200 | } 201 | batch_accuracy, predicted_prob = sess.run([model.accuracy, model.probs], feed_dict) 202 | num_test += len(predicted_prob) 203 | if num_test % 1000 == 0: 204 | print(num_test) 205 | 206 | num_correct += len(predicted_prob) * batch_accuracy 207 | for i, prob_score in enumerate(predicted_prob): 208 | utterances_id, profiles_id, label = x_ids[i] 209 | results[utterances_id].append((profiles_id, label, prob_score)) 210 | 211 | # print("Validation Time: {} s".format(time.time() - t0)) 212 | 213 | #calculate top-1 precision 214 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct/num_test)) 215 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 216 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 217 | 218 | mvp = metrics.mean_average_precision(results) 219 | mrr = metrics.mean_reciprocal_rank(results) 220 | top_1_precision = metrics.top_1_precision(results) 221 | total_valid_query = metrics.get_num_valid_query(results) 222 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 223 | 224 | return mrr 225 | 226 | best_mrr = 0.0 227 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=True) 228 | for batch in batches: 229 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = batch 230 | train_step(x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len) 231 | current_step = tf.train.global_step(sess, global_step) 232 | if current_step % FLAGS.evaluate_every == 0: 233 | print("\nEvaluation:") 234 | valid_mrr = dev_step() 235 | if valid_mrr > best_mrr: 236 | best_mrr = valid_mrr 237 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 238 | print("Saved model checkpoint to {}\n".format(path)) 239 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
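#
# Overview: this script evaluates a fine-tuned U2P-BERT checkpoint. It restores
# the model from --restore_model_dir, streams the TFRecord given by --test_dir
# through a single-pass tf.data pipeline, reports accuracy plus the ranking
# metrics implemented in metrics.py (MAP, MRR, top-1 precision), and writes the
# per-query ranked candidates to output_test.txt in the checkpoint directory.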
15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import operator 23 | from time import time 24 | from collections import defaultdict 25 | import tensorflow as tf 26 | import optimization 27 | import tokenization 28 | import modeling as modeling 29 | import metrics 30 | 31 | 32 | flags = tf.flags 33 | FLAGS = flags.FLAGS 34 | 35 | ## Required parameters 36 | flags.DEFINE_string( 37 | "test_dir", 'valid.tfrecord', 38 | "The input test data dir. Should contain the .tsv files (or other data files) for the task.") 39 | 40 | flags.DEFINE_string( 41 | "restore_model_dir", 'output/', 42 | "The output directory where the model checkpoints have been written.") 43 | 44 | flags.DEFINE_string( 45 | "task_name", 'TestModel', 46 | "The name of the task.") 47 | 48 | flags.DEFINE_string( 49 | "bert_config_file", 'uncased_L-12_H-768_A-12/bert_config.json', 50 | "The config json file corresponding to the pre-trained BERT model. " 51 | "This specifies the model architecture.") 52 | 53 | flags.DEFINE_integer( 54 | "max_seq_length", 320, 55 | "The maximum total input sequence length after WordPiece tokenization. " 56 | "Sequences longer than this will be truncated, and sequences shorter " 57 | "than this will be padded.") 58 | 59 | flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.") 60 | 61 | flags.DEFINE_integer("eval_batch_size", 32, "Total batch size for predict.") 62 | 63 | max_sentence_a_num=8 64 | max_sentence_a_len=20 65 | max_sentence_b_num=5 66 | max_sentence_b_len=15 67 | 68 | 69 | def print_configuration_op(FLAGS): 70 | print('My Configurations:') 71 | for name, value in FLAGS.__flags.items(): 72 | value=value.value 73 | if type(value) == float: 74 | print(' %s:\t %f'%(name, value)) 75 | elif type(value) == int: 76 | print(' %s:\t %d'%(name, value)) 77 | elif type(value) == str: 78 | print(' %s:\t %s'%(name, value)) 79 | elif type(value) == bool: 80 | print(' %s:\t %s'%(name, value)) 81 | else: 82 | print('%s:\t %s' % (name, value)) 83 | print('End of configuration') 84 | 85 | 86 | def total_sample(file_name): 87 | sample_nums = 0 88 | for record in tf.python_io.tf_record_iterator(file_name): 89 | sample_nums += 1 90 | return sample_nums 91 | 92 | 93 | def parse_exmp(serial_exmp): 94 | input_data = tf.parse_single_example(serial_exmp, 95 | features={ 96 | "text_a_id": 97 | tf.FixedLenFeature([], tf.int64), 98 | "text_b_id": 99 | tf.FixedLenFeature([], tf.int64), 100 | "input_ids": 101 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 102 | "input_mask": 103 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 104 | "segment_ids": 105 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 106 | "label_ids": 107 | tf.FixedLenFeature([], tf.float32) 108 | } 109 | ) 110 | # So cast all int64 to int32. 
111 | for name in list(input_data.keys()): 112 | t = input_data[name] 113 | if t.dtype == tf.int64: 114 | t = tf.to_int32(t) 115 | input_data[name] = t 116 | 117 | text_a_id = input_data["text_a_id"] 118 | text_b_id = input_data['text_b_id'] 119 | input_ids = input_data["input_ids"] 120 | input_mask = input_data["input_mask"] 121 | segment_ids= input_data["segment_ids"] 122 | labels = input_data['label_ids'] 123 | return text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels 124 | 125 | 126 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, text_a_id, text_b_id, 127 | num_labels, use_one_hot_embeddings): 128 | """Creates a classification model.""" 129 | 130 | # print(input_ids.get_shape()) # [batch_size, max_sentence_a_num * max_sentence_b_num * (max_sentence_a_len + max_sentence_b_len)] 131 | input_ids = tf.reshape(input_ids, [-1, (max_sentence_a_len + max_sentence_b_len)]) 132 | input_mask = tf.reshape(input_mask, [-1, (max_sentence_a_len + max_sentence_b_len)]) 133 | segment_ids = tf.reshape(segment_ids, [-1, (max_sentence_a_len + max_sentence_b_len)]) 134 | # print(input_ids.get_shape()) # [batch_size * max_sentence_a_num * max_sentence_b_num, (max_sentence_a_len + max_sentence_b_len)] 135 | 136 | model = modeling.BertModel( 137 | config=bert_config, 138 | is_training=is_training, 139 | input_ids=input_ids, 140 | input_mask=input_mask, 141 | token_type_ids=segment_ids, 142 | use_one_hot_embeddings=use_one_hot_embeddings) 143 | 144 | # In the demo, we are doing a simple classification task on the entire 145 | # segment. 146 | # 147 | # If you want to use the token-level output, use model.get_sequence_output() 148 | # instead. 149 | target_loss_weight = [1.0, 1.0] 150 | target_loss_weight = tf.convert_to_tensor(target_loss_weight) 151 | 152 | flagx = tf.cast(tf.greater(labels, 0), dtype=tf.float32) 153 | flagy = tf.cast(tf.equal(labels, 0), dtype=tf.float32) 154 | 155 | all_target_loss = target_loss_weight[1] * flagx + target_loss_weight[0] * flagy 156 | 157 | output_layer = model.get_pooled_output() 158 | 159 | hidden_size = output_layer.shape[-1].value 160 | 161 | output_weights = tf.get_variable( 162 | "output_weights", [num_labels, hidden_size], 163 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 164 | 165 | output_bias = tf.get_variable( 166 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 167 | 168 | with tf.variable_scope("loss"): 169 | # if is_training: 170 | # # I.e., 0.1 dropout 171 | # output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 172 | output_layer = tf.layers.dropout(output_layer, rate=0.1, training=is_training) 173 | 174 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 175 | logits = tf.nn.bias_add(logits, output_bias) 176 | 177 | logits = tf.reshape(logits, [-1, max_sentence_a_num, max_sentence_b_num]) 178 | logits = tf.reduce_max(logits, -1) 179 | logits = tf.reduce_sum(logits, -1) 180 | logits = tf.expand_dims(logits, -1) 181 | 182 | probabilities = tf.sigmoid(logits, name="prob") 183 | logits = tf.squeeze(logits,[1]) 184 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels) 185 | losses = tf.multiply(losses, all_target_loss) 186 | 187 | mean_loss = tf.reduce_mean(losses, name="mean_loss") + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 188 | 189 | with tf.name_scope("accuracy"): 190 | correct_prediction = tf.equal(tf.sign(probabilities - 0.5), tf.sign(labels - 0.5)) 191 | accuracy = 
tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 192 | # 193 | # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 194 | # 195 | # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 196 | # loss = tf.reduce_mean(per_example_loss) 197 | 198 | return mean_loss, logits, probabilities, accuracy, model, output_layer 199 | 200 | 201 | best_score = 0.0 202 | def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer): 203 | results = defaultdict(list) 204 | num_test = 0 205 | num_correct = 0.0 206 | n_updates = 0 207 | mrr = 0 208 | t0 = time() 209 | try: 210 | while True: 211 | n_updates += 1 212 | 213 | batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False}) 214 | question_id, answer_id, label = pair_ 215 | 216 | num_test += len(predicted_prob) 217 | # if num_test % 1000 == 0: 218 | # print(num_test) 219 | 220 | num_correct += len(predicted_prob) * batch_accuracy 221 | for i, prob_score in enumerate(predicted_prob): 222 | # question_id, answer_id, label = pair_id[i] 223 | results[question_id[i]].append((answer_id[i], label[i], prob_score[0])) 224 | 225 | if n_updates%100 == 0: 226 | tf.logging.info("n_update %d , %s: Mins Used: %.2f" % 227 | (n_updates, op_name, (time() - t0) / 60.0)) 228 | 229 | except tf.errors.OutOfRangeError: 230 | 231 | print("Inference Time: {} s".format(time() - t0)) 232 | 233 | # calculate top-1 precision 234 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test)) 235 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 236 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 237 | 238 | mvp = metrics.mean_average_precision(results) 239 | mrr = metrics.mean_reciprocal_rank(results) 240 | top_1_precision = metrics.top_1_precision(results) 241 | total_valid_query = metrics.get_num_valid_query(results) 242 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format( 243 | mvp, mrr, top_1_precision, total_valid_query)) 244 | 245 | out_path = os.path.join(dir_path, "output_test.txt") 246 | print("Saving evaluation to {}".format(out_path)) 247 | with open(out_path, 'w') as f: 248 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 249 | for us_id, v in results.items(): 250 | v.sort(key=operator.itemgetter(2), reverse=True) 251 | for i, rec in enumerate(v): 252 | r_id, label, prob_score = rec 253 | rank = i+1 254 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label)) 255 | return mrr 256 | 257 | 258 | def main(_): 259 | tf.logging.set_verbosity(tf.logging.INFO) 260 | 261 | print_configuration_op(FLAGS) 262 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 263 | 264 | test_data_size = total_sample(FLAGS.test_dir) 265 | tf.logging.info('test data size: {}'.format(test_data_size)) 266 | 267 | filenames = tf.placeholder(tf.string, shape=[None]) 268 | shuffle_size = tf.placeholder(tf.int64) 269 | dataset = tf.data.TFRecordDataset(filenames) 270 | dataset = dataset.map(parse_exmp) # Parse the record into tensors. 
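    # Evaluation input pipeline: a single pass over the test TFRecord
    # (repeat(1)), no shuffling, batched by --eval_batch_size; the initializable
    # iterator is fed the test file path once the session starts.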
271 | dataset = dataset.repeat(1) 272 | # dataset = dataset.shuffle(shuffle_size) 273 | dataset = dataset.batch(FLAGS.eval_batch_size) 274 | iterator = dataset.make_initializable_iterator() 275 | text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels = iterator.get_next() # output dir 276 | pair_ids = [text_a_id, text_b_id, labels] 277 | 278 | training = tf.placeholder(tf.bool) 279 | mean_loss, logits, probabilities, accuracy, model, output_layer = create_model(bert_config, 280 | is_training = training, 281 | input_ids = input_ids, 282 | input_mask = input_mask, 283 | segment_ids = segment_ids, 284 | labels = labels, 285 | text_a_id = text_a_id, 286 | text_b_id = text_b_id, 287 | num_labels = 1, 288 | use_one_hot_embeddings = False) 289 | 290 | 291 | config = tf.ConfigProto(allow_soft_placement=True) 292 | config.gpu_options.allow_growth = True 293 | 294 | if FLAGS.do_eval: 295 | with tf.Session(config=config) as sess: 296 | tf.logging.info("*** Restore model ***") 297 | 298 | ckpt = tf.train.get_checkpoint_state(FLAGS.restore_model_dir) 299 | variables = tf.trainable_variables() 300 | saver = tf.train.Saver(variables) 301 | saver.restore(sess, ckpt.model_checkpoint_path) 302 | 303 | tf.logging.info('Test begin') 304 | sess.run(iterator.initializer, 305 | feed_dict={filenames: [FLAGS.test_dir], shuffle_size: 1}) 306 | run_test(FLAGS.restore_model_dir, "test", sess, training, accuracy, probabilities, pair_ids, output_layer) 307 | 308 | 309 | if __name__ == "__main__": 310 | tf.app.run() 311 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 
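  # The heuristic: recover the released model name from the checkpoint path and
  # compare it against the known lowercased and cased model lists below.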
35 | 
36 |   if not init_checkpoint:
37 |     return
38 | 
39 |   m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
40 |   if m is None:
41 |     return
42 | 
43 |   model_name = m.group(1)
44 | 
45 |   lower_models = [
46 |       "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
47 |       "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
48 |   ]
49 | 
50 |   cased_models = [
51 |       "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
52 |       "multi_cased_L-12_H-768_A-12"
53 |   ]
54 | 
55 |   is_bad_config = False
56 |   if model_name in lower_models and not do_lower_case:
57 |     is_bad_config = True
58 |     actual_flag = "False"
59 |     case_name = "lowercased"
60 |     opposite_flag = "True"
61 | 
62 |   if model_name in cased_models and do_lower_case:
63 |     is_bad_config = True
64 |     actual_flag = "True"
65 |     case_name = "cased"
66 |     opposite_flag = "False"
67 | 
68 |   if is_bad_config:
69 |     raise ValueError(
70 |         "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
71 |         "However, `%s` seems to be a %s model, so you "
72 |         "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
73 |         "how the model was pre-trained. If this error is wrong, please "
74 |         "just comment out this check." % (actual_flag, init_checkpoint,
75 |                                           model_name, case_name, opposite_flag))
76 | 
77 | 
78 | def convert_to_unicode(text):
79 |   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
80 |   if six.PY3:
81 |     if isinstance(text, str):
82 |       return text
83 |     elif isinstance(text, bytes):
84 |       return text.decode("utf-8", "ignore")
85 |     else:
86 |       raise ValueError("Unsupported string type: %s" % (type(text)))
87 |   elif six.PY2:
88 |     if isinstance(text, str):
89 |       return text.decode("utf-8", "ignore")
90 |     elif isinstance(text, unicode):
91 |       return text
92 |     else:
93 |       raise ValueError("Unsupported string type: %s" % (type(text)))
94 |   else:
95 |     raise ValueError("Not running on Python 2 or Python 3?")
96 | 
97 | 
98 | def printable_text(text):
99 |   """Returns text encoded in a way suitable for print or `tf.logging`."""
100 | 
101 |   # These functions want `str` for both Python 2 and Python 3, but in one case
102 |   # it's a Unicode string and in the other it's a byte string.
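  # (On Python 3 `str` is Unicode text, while on Python 2 it is a byte string,
  # which is why the two branches below encode/decode differently.)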
103 |   if six.PY3:
104 |     if isinstance(text, str):
105 |       return text
106 |     elif isinstance(text, bytes):
107 |       return text.decode("utf-8", "ignore")
108 |     else:
109 |       raise ValueError("Unsupported string type: %s" % (type(text)))
110 |   elif six.PY2:
111 |     if isinstance(text, str):
112 |       return text
113 |     elif isinstance(text, unicode):
114 |       return text.encode("utf-8")
115 |     else:
116 |       raise ValueError("Unsupported string type: %s" % (type(text)))
117 |   else:
118 |     raise ValueError("Not running on Python 2 or Python 3?")
119 | 
120 | 
121 | def load_vocab(vocab_file):
122 |   """Loads a vocabulary file into a dictionary."""
123 |   vocab = collections.OrderedDict()
124 |   index = 0
125 |   with tf.gfile.GFile(vocab_file, "r") as reader:
126 |     while True:
127 |       token = convert_to_unicode(reader.readline())
128 |       if not token:
129 |         break
130 |       token = token.strip()
131 |       vocab[token] = index
132 |       index += 1
133 |   return vocab
134 | 
135 | 
136 | def convert_by_vocab(vocab, items):
137 |   """Converts a sequence of [tokens|ids] using the vocab."""
138 |   output = []
139 |   for item in items:
140 |     output.append(vocab[item])
141 |   return output
142 | 
143 | 
144 | def convert_tokens_to_ids(vocab, tokens):
145 |   return convert_by_vocab(vocab, tokens)
146 | 
147 | 
148 | def convert_ids_to_tokens(inv_vocab, ids):
149 |   return convert_by_vocab(inv_vocab, ids)
150 | 
151 | 
152 | def whitespace_tokenize(text):
153 |   """Runs basic whitespace cleaning and splitting on a piece of text."""
154 |   text = text.strip()
155 |   if not text:
156 |     return []
157 |   tokens = text.split()
158 |   return tokens
159 | 
160 | 
161 | class FullTokenizer(object):
162 |   """Runs end-to-end tokenization."""
163 | 
164 |   def __init__(self, vocab_file, do_lower_case=True):
165 |     self.vocab = load_vocab(vocab_file)
166 |     self.inv_vocab = {v: k for k, v in self.vocab.items()}
167 |     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
168 |     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
169 | 
170 |   def tokenize(self, text):
171 |     split_tokens = []
172 |     for token in self.basic_tokenizer.tokenize(text):
173 |       for sub_token in self.wordpiece_tokenizer.tokenize(token):
174 |         split_tokens.append(sub_token)
175 | 
176 |     return split_tokens
177 | 
178 |   def convert_tokens_to_ids(self, tokens):
179 |     return convert_by_vocab(self.vocab, tokens)
180 | 
181 |   def convert_ids_to_tokens(self, ids):
182 |     return convert_by_vocab(self.inv_vocab, ids)
183 | 
184 | 
185 | class BasicTokenizer(object):
186 |   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
187 | 
188 |   def __init__(self, do_lower_case=True):
189 |     """Constructs a BasicTokenizer.
190 | 
191 |     Args:
192 |       do_lower_case: Whether to lower case the input.
193 |     """
194 |     self.do_lower_case = do_lower_case
195 | 
196 |   def tokenize(self, text):
197 |     """Tokenizes a piece of text."""
198 |     text = convert_to_unicode(text)
199 |     text = self._clean_text(text)
200 | 
201 |     # This was added on November 1st, 2018 for the multilingual and Chinese
202 |     # models. This is also applied to the English models now, but it doesn't
203 |     # matter since the English models were not trained on any Chinese data
204 |     # and generally don't have any Chinese data in them (there are Chinese
205 |     # characters in the vocabulary because Wikipedia does have some Chinese
206 |     # words in the English Wikipedia).
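    # e.g. "ab你好" becomes "ab 你  好 ", so that after whitespace splitting each
    # CJK character ends up as its own token.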
207 |     text = self._tokenize_chinese_chars(text)
208 | 
209 |     orig_tokens = whitespace_tokenize(text)
210 |     split_tokens = []
211 |     for token in orig_tokens:
212 |       if self.do_lower_case:
213 |         token = token.lower()
214 |         token = self._run_strip_accents(token)
215 |       split_tokens.extend(self._run_split_on_punc(token))
216 | 
217 |     output_tokens = whitespace_tokenize(" ".join(split_tokens))
218 |     return output_tokens
219 | 
220 |   def _run_strip_accents(self, text):
221 |     """Strips accents from a piece of text."""
222 |     text = unicodedata.normalize("NFD", text)
223 |     output = []
224 |     for char in text:
225 |       cat = unicodedata.category(char)
226 |       if cat == "Mn":
227 |         continue
228 |       output.append(char)
229 |     return "".join(output)
230 | 
231 |   def _run_split_on_punc(self, text):
232 |     """Splits punctuation on a piece of text."""
233 |     chars = list(text)
234 |     i = 0
235 |     start_new_word = True
236 |     output = []
237 |     while i < len(chars):
238 |       char = chars[i]
239 |       if _is_punctuation(char):
240 |         output.append([char])
241 |         start_new_word = True
242 |       else:
243 |         if start_new_word:
244 |           output.append([])
245 |         start_new_word = False
246 |         output[-1].append(char)
247 |       i += 1
248 | 
249 |     return ["".join(x) for x in output]
250 | 
251 |   def _tokenize_chinese_chars(self, text):
252 |     """Adds whitespace around any CJK character."""
253 |     output = []
254 |     for char in text:
255 |       cp = ord(char)
256 |       if self._is_chinese_char(cp):
257 |         output.append(" ")
258 |         output.append(char)
259 |         output.append(" ")
260 |       else:
261 |         output.append(char)
262 |     return "".join(output)
263 | 
264 |   def _is_chinese_char(self, cp):
265 |     """Checks whether CP is the codepoint of a CJK character."""
266 |     # This defines a "chinese character" as anything in the CJK Unicode block:
267 |     #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
268 |     #
269 |     # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
270 |     # despite its name. The modern Korean Hangul alphabet is a different block,
271 |     # as is Japanese Hiragana and Katakana. Those alphabets are used to write
272 |     # space-separated words, so they are not treated specially and are handled
273 |     # like all of the other languages.
274 |     if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
275 |         (cp >= 0x3400 and cp <= 0x4DBF) or  #
276 |         (cp >= 0x20000 and cp <= 0x2A6DF) or  #
277 |         (cp >= 0x2A700 and cp <= 0x2B73F) or  #
278 |         (cp >= 0x2B740 and cp <= 0x2B81F) or  #
279 |         (cp >= 0x2B820 and cp <= 0x2CEAF) or
280 |         (cp >= 0xF900 and cp <= 0xFAFF) or  #
281 |         (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
282 |       return True
283 | 
284 |     return False
285 | 
286 |   def _clean_text(self, text):
287 |     """Performs invalid character removal and whitespace cleanup on text."""
288 |     output = []
289 |     for char in text:
290 |       cp = ord(char)
291 |       if cp == 0 or cp == 0xfffd or _is_control(char):
292 |         continue
293 |       if _is_whitespace(char):
294 |         output.append(" ")
295 |       else:
296 |         output.append(char)
297 |     return "".join(output)
298 | 
299 | 
300 | class WordpieceTokenizer(object):
301 |   """Runs WordPiece tokenization."""
302 | 
303 |   def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
304 |     self.vocab = vocab
305 |     self.unk_token = unk_token
306 |     self.max_input_chars_per_word = max_input_chars_per_word
307 | 
308 |   def tokenize(self, text):
309 |     """Tokenizes a piece of text into its word pieces.
310 | 
311 |     This uses a greedy longest-match-first algorithm to perform tokenization
312 |     using the given vocabulary.
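    Any token that cannot be fully covered by in-vocabulary pieces is mapped to
    `unk_token` instead.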
313 | 
314 |     For example:
315 |       input = "unaffable"
316 |       output = ["un", "##aff", "##able"]
317 | 
318 |     Args:
319 |       text: A single token or whitespace separated tokens. This should have
320 |         already been passed through `BasicTokenizer`.
321 | 
322 |     Returns:
323 |       A list of wordpiece tokens.
324 |     """
325 | 
326 |     text = convert_to_unicode(text)
327 | 
328 |     output_tokens = []
329 |     for token in whitespace_tokenize(text):
330 |       chars = list(token)
331 |       if len(chars) > self.max_input_chars_per_word:
332 |         output_tokens.append(self.unk_token)
333 |         continue
334 | 
335 |       is_bad = False
336 |       start = 0
337 |       sub_tokens = []
338 |       while start < len(chars):
339 |         end = len(chars)
340 |         cur_substr = None
341 |         while start < end:
342 |           substr = "".join(chars[start:end])
343 |           if start > 0:
344 |             substr = "##" + substr
345 |           if substr in self.vocab:
346 |             cur_substr = substr
347 |             break
348 |           end -= 1
349 |         if cur_substr is None:
350 |           is_bad = True
351 |           break
352 |         sub_tokens.append(cur_substr)
353 |         start = end
354 | 
355 |       if is_bad:
356 |         output_tokens.append(self.unk_token)
357 |       else:
358 |         output_tokens.extend(sub_tokens)
359 |     return output_tokens
360 | 
361 | 
362 | def _is_whitespace(char):
363 |   """Checks whether `char` is a whitespace character."""
364 |   # \t, \n, and \r are technically control characters but we treat them
365 |   # as whitespace since they are generally considered as such.
366 |   if char == " " or char == "\t" or char == "\n" or char == "\r":
367 |     return True
368 |   cat = unicodedata.category(char)
369 |   if cat == "Zs":
370 |     return True
371 |   return False
372 | 
373 | 
374 | def _is_control(char):
375 |   """Checks whether `char` is a control character."""
376 |   # These are technically control characters but we count them as whitespace
377 |   # characters.
378 |   if char == "\t" or char == "\n" or char == "\r":
379 |     return False
380 |   cat = unicodedata.category(char)
381 |   if cat in ("Cc", "Cf"):
382 |     return True
383 |   return False
384 | 
385 | 
386 | def _is_punctuation(char):
387 |   """Checks whether `char` is a punctuation character."""
388 |   cp = ord(char)
389 |   # We treat all non-letter/number ASCII as punctuation.
390 |   # Characters such as "^", "$", and "`" are not in the Unicode
391 |   # Punctuation class but we treat them as punctuation anyway, for
392 |   # consistency.
393 |   if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
394 |       (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
395 |     return True
396 |   cat = unicodedata.category(char)
397 |   if cat.startswith("P"):
398 |     return True
399 |   return False
400 | 
--------------------------------------------------------------------------------
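A minimal usage sketch of the tokenizer above (illustrative only; it assumes the
BERT vocabulary has been downloaded so that uncased_L-12_H-768_A-12/vocab.txt is
available, and that the snippet runs in the same directory as tokenization.py):

    import tokenization

    # FullTokenizer chains BasicTokenizer (cleaning, lower casing, punctuation
    # splitting) with WordpieceTokenizer (greedy longest-match-first matching
    # against vocab.txt).
    tokenizer = tokenization.FullTokenizer(
        vocab_file="uncased_L-12_H-768_A-12/vocab.txt", do_lower_case=True)

    tokens = tokenizer.tokenize("He enjoys painting and has two dogs.")
    ids = tokenizer.convert_tokens_to_ids(tokens)  # word pieces -> vocab indices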