├── Non-Pretraining-Based
│   ├── C2P-X
│   │   ├── __init__.py
│   │   ├── scripts
│   │   │   ├── test.sh
│   │   │   └── train.sh
│   │   ├── compute_metrics.py
│   │   ├── metrics.py
│   │   ├── test.py
│   │   ├── model_BOW.py
│   │   ├── data_helpers.py
│   │   ├── transformer_block.py
│   │   ├── model_Transformer.py
│   │   ├── model_BiLSTM.py
│   │   └── train.py
│   └── U2P-X
│       ├── __init__.py
│       ├── scripts
│       │   ├── test.sh
│       │   └── train.sh
│       ├── compute_metrics.py
│       ├── metrics.py
│       ├── test.py
│       ├── model_BOW.py
│       ├── transformer_block.py
│       ├── model_Transformer.py
│       ├── data_helpers.py
│       ├── model_BiLSTM.py
│       └── train.py
├── image
│   ├── result.png
│   └── task.png
├── Pretraining-Based
│   ├── uncased_L-12_H-768_A-12
│   │   └── README.txt
│   ├── C2P-BERT
│   │   ├── scripts
│   │   │   ├── test.sh
│   │   │   └── train.sh
│   │   ├── __init__.py
│   │   ├── compute_metrics.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── test.py
│   │   └── tokenization.py
│   └── U2P-BERT
│       ├── scripts
│       │   ├── test.sh
│       │   └── train.sh
│       ├── __init__.py
│       ├── compute_metrics.py
│       ├── metrics.py
│       ├── optimization.py
│       └── test.py
├── data_PMPC
│   └── README.txt
└── README.md
/Non-Pretraining-Based/C2P-X/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/U2P-X/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/image/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JasonForJoy/SPD/HEAD/image/result.png
--------------------------------------------------------------------------------
/image/task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JasonForJoy/SPD/HEAD/image/task.png
--------------------------------------------------------------------------------
/Pretraining-Based/uncased_L-12_H-768_A-12/README.txt:
--------------------------------------------------------------------------------
1 | ====== Download the BERT base model ======
2 |
3 | link: https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
4 | Unzip it and move the folder to the path: ./Pretraining-Based/uncased_L-12_H-768_A-12
5 |
6 |
--------------------------------------------------------------------------------
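
A quick way to fetch and unpack this checkpoint is sketched below. This snippet is not part of the repository; it is a minimal Python-stdlib sketch that assumes the official zip unpacks into a top-level uncased_L-12_H-768_A-12/ folder containing vocab.txt, bert_config.json and the bert_model.ckpt.* files that the train/test scripts expect.

```
# Sketch (not in the repo): download and unpack BERT base into the expected path.
import os
import urllib.request
import zipfile

url = "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip"
target_dir = "./Pretraining-Based"
os.makedirs(target_dir, exist_ok=True)

zip_path = os.path.join(target_dir, "uncased_L-12_H-768_A-12.zip")
urllib.request.urlretrieve(url, zip_path)

# Assumed: the archive contains an uncased_L-12_H-768_A-12/ folder,
# so extracting here yields ./Pretraining-Based/uncased_L-12_H-768_A-12/
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(target_dir)
```
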
/Pretraining-Based/C2P-BERT/scripts/test.sh:
--------------------------------------------------------------------------------
1 |
2 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \
3 | --test_dir ../data_tfrecord/processed_test_both_revised_cand_10.tfrecord \
4 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \
5 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \
6 | --max_seq_length 200 \
7 | --eval_batch_size 50 \
8 | --restore_model_dir ../output/1631501715 > log_test_BERT_cand_10.txt 2>&1 &
9 |
--------------------------------------------------------------------------------
/Pretraining-Based/U2P-BERT/scripts/test.sh:
--------------------------------------------------------------------------------
1 |
2 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \
3 | --test_dir ../data_tfrecord/processed_test_both_revised_cand_10.tfrecord \
4 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \
5 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \
6 | --max_seq_length 1400 \
7 | --eval_batch_size 10 \
8 | --restore_model_dir ../output/1631263935 > log_test_BERT_cand_10.txt 2>&1 &
9 |
--------------------------------------------------------------------------------
/Pretraining-Based/C2P-BERT/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/Pretraining-Based/U2P-BERT/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/data_PMPC/README.txt:
--------------------------------------------------------------------------------
1 | Download the PMPC dataset and move it to the path: ./data_PMPC
2 |
3 | If you find our work helpful, or use the code or dataset, please cite the following paper
4 |
5 | @inproceedings{gu-etal-2021-detecting,
6 | title = "Detecting Speaker Personas from Conversational Texts",
7 | author = "Gu, Jia-Chen and
8 | Ling, Zhen-Hua and
9 | Wu, Yu and
10 | Liu, Quan and
11 | Chen, Zhigang and
12 | Zhu, Xiaodan",
13 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
14 | month = nov,
15 | year = "2021",
16 | publisher = "Association for Computational Linguistics",
17 | }
--------------------------------------------------------------------------------
/Pretraining-Based/C2P-BERT/scripts/train.sh:
--------------------------------------------------------------------------------
1 |
2 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \
3 | --task_name PersonaMatch \
4 | --train_dir ../data_tfrecord/processed_train_both_revised.tfrecord \
5 | --valid_dir ../data_tfrecord/processed_valid_both_revised_cand_10.tfrecord \
6 | --output_dir ../output \
7 | --do_lower_case True \
8 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \
9 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \
10 | --init_checkpoint ../../uncased_L-12_H-768_A-12/bert_model.ckpt \
11 | --max_seq_length 200 \
12 | --do_train True \
13 | --do_eval True \
14 | --train_batch_size 20 \
15 | --eval_batch_size 20 \
16 | --learning_rate 2e-5 \
17 | --num_train_epochs 10 \
18 | --warmup_proportion 0.1 > log_train_BERT_cand_10.txt 2>&1 &
19 |
--------------------------------------------------------------------------------
/Pretraining-Based/U2P-BERT/scripts/train.sh:
--------------------------------------------------------------------------------
1 |
2 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \
3 | --task_name PersonaMatch \
4 | --train_dir ../data_tfrecord/processed_train_both_revised_cand_10.tfrecord \
5 | --valid_dir ../data_tfrecord/processed_valid_both_revised_cand_10.tfrecord \
6 | --output_dir ../output \
7 | --do_lower_case True \
8 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \
9 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \
10 | --init_checkpoint ../../uncased_L-12_H-768_A-12/bert_model.ckpt \
11 | --max_seq_length 1400 \
12 | --do_train True \
13 | --do_eval True \
14 | --train_batch_size 4 \
15 | --eval_batch_size 4 \
16 | --learning_rate 2e-5 \
17 | --num_train_epochs 20 \
18 | --warmup_proportion 0.1 > log_train_BERT_cand_10.txt 2>&1 &
19 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/C2P-X/scripts/test.sh:
--------------------------------------------------------------------------------
1 |
2 | latest_checkpoint=../output/1631263935/checkpoints
3 | echo $latest_checkpoint
4 |
5 | test_file=../../../data_PMPC/test_both_revised_cand_10.txt
6 | vocab_file=../../../data_PMPC/vocab.txt
7 | char_vocab_file=../../../data_PMPC/char_vocab.txt
8 | output_file=${latest_checkpoint}/output_test.txt
9 |
10 | max_context_len=150
11 | max_persona_len=50
12 | max_word_length=18
13 | batch_size=128
14 |
15 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \
16 | --test_file $test_file \
17 | --vocab_file $vocab_file \
18 | --char_vocab_file $char_vocab_file \
19 | --output_file $output_file \
20 | --max_context_len $max_context_len \
21 | --max_persona_len $max_persona_len \
22 | --max_word_length $max_word_length \
23 | --batch_size $batch_size \
24 | --checkpoint_dir $latest_checkpoint > log_test_BOW_cand_10.txt 2>&1 &
25 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/U2P-X/scripts/test.sh:
--------------------------------------------------------------------------------
1 |
2 | latest_checkpoint=../output/1631263935/checkpoints
3 | echo $latest_checkpoint
4 |
5 | test_file=../../../data_PMPC/test_both_revised_cand_10.txt
6 | vocab_file=../../../data_PMPC/vocab.txt
7 | char_vocab_file=../../../data_PMPC/char_vocab.txt
8 | output_file=${latest_checkpoint}/output_test.txt
9 |
10 | max_utter_num=8
11 | max_utter_len=20
12 | max_profile_num=5
13 | max_profile_len=15
14 | max_word_length=18
15 | batch_size=128
16 |
17 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \
18 | --test_file $test_file \
19 | --vocab_file $vocab_file \
20 | --char_vocab_file $char_vocab_file \
21 | --output_file $output_file \
22 | --max_utter_num $max_utter_num \
23 | --max_utter_len $max_utter_len \
24 | --max_profile_num $max_profile_num \
25 | --max_profile_len $max_profile_len \
26 | --max_word_length $max_word_length \
27 | --batch_size $batch_size \
28 | --checkpoint_dir $latest_checkpoint > log_test_BOW_cand_10.txt 2>&1 &
29 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/C2P-X/scripts/train.sh:
--------------------------------------------------------------------------------
1 |
2 | train_file=../../../data_PMPC/train_both_revised.txt
3 | valid_file=../../../data_PMPC/valid_both_revised_cand_10.txt
4 | vocab_file=../../../data_PMPC/vocab.txt
5 | char_vocab_file=../../../data_PMPC/char_vocab.txt
6 | embedded_vector_file=../../../data_PMPC/filtered.glove.42B.300d.txt
7 |
8 | max_context_len=150
9 | max_persona_len=50
10 | max_word_length=18
11 | num_layer=1
12 | embedding_dim=300
13 | rnn_size=200
14 |
15 | batch_size=128
16 | lambda=0
17 | dropout_keep_prob=0.8
18 | num_epochs=1000
19 | evaluate_every=100
20 |
21 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \
22 | --train_file $train_file \
23 | --valid_file $valid_file \
24 | --vocab_file $vocab_file \
25 | --char_vocab_file $char_vocab_file \
26 | --embedded_vector_file $embedded_vector_file \
27 | --max_context_len $max_context_len \
28 | --max_persona_len $max_persona_len \
29 | --max_word_length $max_word_length \
30 | --num_layer $num_layer \
31 | --embedding_dim $embedding_dim \
32 | --rnn_size $rnn_size \
33 | --batch_size $batch_size \
34 | --l2_reg_lambda $lambda \
35 | --dropout_keep_prob $dropout_keep_prob \
36 | --num_epochs $num_epochs \
37 | --evaluate_every $evaluate_every > log_train_BOW_cand_10.txt 2>&1 &
38 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/U2P-X/scripts/train.sh:
--------------------------------------------------------------------------------
1 |
2 | train_file=../../../data_PMPC/train_both_revised.txt
3 | valid_file=../../../data_PMPC/valid_both_revised_cand_10.txt
4 | vocab_file=../../../data_PMPC/vocab.txt
5 | char_vocab_file=../../../data_PMPC/char_vocab.txt
6 | embedded_vector_file=../../../data_PMPC/filtered.glove.42B.300d.txt
7 |
8 | max_utter_num=8
9 | max_utter_len=20
10 | max_profile_num=5
11 | max_profile_len=15
12 | max_word_length=18
13 | num_layer=1
14 | embedding_dim=300
15 | rnn_size=200
16 |
17 | batch_size=128
18 | lambda=0
19 | dropout_keep_prob=0.8
20 | num_epochs=1000
21 | evaluate_every=100
22 |
23 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \
24 | --train_file $train_file \
25 | --valid_file $valid_file \
26 | --vocab_file $vocab_file \
27 | --char_vocab_file $char_vocab_file \
28 | --embedded_vector_file $embedded_vector_file \
29 | --max_utter_num $max_utter_num \
30 | --max_utter_len $max_utter_len \
31 | --max_profile_num $max_profile_num \
32 | --max_profile_len $max_profile_len \
33 | --max_word_length $max_word_length \
34 | --num_layer $num_layer \
35 | --embedding_dim $embedding_dim \
36 | --rnn_size $rnn_size \
37 | --batch_size $batch_size \
38 | --l2_reg_lambda $lambda \
39 | --dropout_keep_prob $dropout_keep_prob \
40 | --num_epochs $num_epochs \
41 | --evaluate_every $evaluate_every > log_train_BOW_cand_10.txt 2>&1 &
42 |
--------------------------------------------------------------------------------
/Pretraining-Based/C2P-BERT/compute_metrics.py:
--------------------------------------------------------------------------------
1 | '''
2 | Load the output_test.txt file and compute the metrics
3 | '''
4 |
5 | import numpy as np
6 | import operator
7 | import random
8 | from collections import defaultdict
9 | import metrics
10 |
11 |
12 | test_out_filename = "output/1631259843/output_test.txt"
13 | print("*"*20 + test_out_filename + "*"*20 + "\n")
14 |
15 | with open(test_out_filename, 'r') as f:
16 |
17 | results = defaultdict(list)
18 | lines = f.readlines()
19 | for line in lines[1:]:
20 | line = line.strip().split('\t')
21 | us_id = line[0]
22 | r_id = line[1]
23 | prob_score = float(line[2])
24 | label = float(line[4])
25 | results[us_id].append((r_id, label, prob_score))
26 |
27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
29 | total_valid_query = metrics.get_num_valid_query(results)
30 | mvp = metrics.mean_average_precision(results)
31 | mrr = metrics.mean_reciprocal_rank(results)
32 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format(
33 | mvp, mrr, total_valid_query))
34 | top_1_precision = metrics.top_k_precision(results, k=1)
35 | top_2_precision = metrics.top_k_precision(results, k=2)
36 | top_5_precision = metrics.top_k_precision(results, k=5)
37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format(
38 | top_1_precision, top_2_precision, top_5_precision))
39 |
40 |
--------------------------------------------------------------------------------
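
The script above reads a tab-separated results file: column 0 is the context (query) id, column 1 the persona-candidate id, column 2 the predicted score, and column 4 the relevance label (column 3 is the rank). This matches the ```output_test.txt``` layout written by the accompanying ```test.py``` (shown later for the non-pretraining models); the values below are made up, purely to illustrate how the columns are parsed.

```
# Illustration only (made-up ids and scores): the five tab-separated columns
# that compute_metrics.py expects in test_out_filename.
header = "query_id\tdocument_id\tscore\trank\trelevance"
rows = [
    "c1\tp3\t0.91\t1\t1.0",   # query id, candidate id, predicted score, rank, label
    "c1\tp7\t0.42\t2\t0.0",
]
for line in rows:
    fields = line.split('\t')
    us_id, r_id = fields[0], fields[1]
    prob_score, label = float(fields[2]), float(fields[4])
    print(us_id, r_id, prob_score, label)
```
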
/Non-Pretraining-Based/C2P-X/compute_metrics.py:
--------------------------------------------------------------------------------
1 | '''
2 | Load the output_test.txt file and compute the metrics
3 | '''
4 |
5 | import numpy as np
6 | import operator
7 | import random
8 | from collections import defaultdict
9 | import metrics
10 |
11 |
12 | test_out_filename = "output/1631512095/checkpoints/output_test.txt"
13 | print("*"*20 + test_out_filename + "*"*20 + "\n")
14 |
15 | with open(test_out_filename, 'r') as f:
16 |
17 | results = defaultdict(list)
18 | lines = f.readlines()
19 | for line in lines[1:]:
20 | line = line.strip().split('\t')
21 | us_id = line[0]
22 | r_id = line[1]
23 | prob_score = float(line[2])
24 | label = float(line[4])
25 | results[us_id].append((r_id, label, prob_score))
26 |
27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
29 | total_valid_query = metrics.get_num_valid_query(results)
30 | mvp = metrics.mean_average_precision(results)
31 | mrr = metrics.mean_reciprocal_rank(results)
32 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format(
33 | mvp, mrr, total_valid_query))
34 | top_1_precision = metrics.top_k_precision(results, k=1)
35 | top_2_precision = metrics.top_k_precision(results, k=2)
36 | top_5_precision = metrics.top_k_precision(results, k=5)
37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format(
38 | top_1_precision, top_2_precision, top_5_precision))
39 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/U2P-X/compute_metrics.py:
--------------------------------------------------------------------------------
1 | '''
2 | Load the output_test.txt file and compute the metrics
3 | '''
4 |
5 | import numpy as np
6 | import operator
7 | import random
8 | from collections import defaultdict
9 | import metrics
10 |
11 |
12 | test_out_filename = "output/1631513113/checkpoints/output_test.txt"
13 | print("*"*20 + test_out_filename + "*"*20 + "\n")
14 |
15 | with open(test_out_filename, 'r') as f:
16 |
17 | results = defaultdict(list)
18 | lines = f.readlines()
19 | for line in lines[1:]:
20 | line = line.strip().split('\t')
21 | us_id = line[0]
22 | r_id = line[1]
23 | prob_score = float(line[2])
24 | label = float(line[4])
25 | results[us_id].append((r_id, label, prob_score))
26 |
27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
29 | total_valid_query = metrics.get_num_valid_query(results)
30 | mvp = metrics.mean_average_precision(results)
31 | mrr = metrics.mean_reciprocal_rank(results)
32 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format(
33 | mvp, mrr, total_valid_query))
34 | top_1_precision = metrics.top_k_precision(results, k=1)
35 | top_2_precision = metrics.top_k_precision(results, k=2)
36 | top_5_precision = metrics.top_k_precision(results, k=5)
37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format(
38 | top_1_precision, top_2_precision, top_5_precision))
39 |
--------------------------------------------------------------------------------
/Pretraining-Based/U2P-BERT/compute_metrics.py:
--------------------------------------------------------------------------------
1 | '''
2 | Load the output_test.txt file and compute the metrics
3 | '''
4 |
5 | import numpy as np
6 | import operator
7 | import random
8 | from collections import defaultdict
9 | import metrics
10 |
11 |
12 | test_out_filename = "output_test.txt"
13 | print("*"*20 + test_out_filename + "*"*20 + "\n")
14 |
15 | with open(test_out_filename, 'r') as f:
16 |
17 | results = defaultdict(list)
18 | lines = f.readlines()
19 | for line in lines[1:]:
20 | line = line.strip().split('\t')
21 | us_id = line[0]
22 | r_id = line[1]
23 | prob_score = float(line[2])
24 | label = float(line[4])
25 | results[us_id].append((r_id, label, prob_score))
26 |
27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
29 | total_valid_query = metrics.get_num_valid_query(results)
30 | mvp = metrics.mean_average_precision(results)
31 | mrr = metrics.mean_reciprocal_rank(results)
32 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format(
33 | mvp, mrr, total_valid_query))
34 | top_1_precision = metrics.top_k_precision(results, k=1)
35 | top_2_precision = metrics.top_k_precision(results, k=2)
36 | top_5_precision = metrics.top_k_precision(results, k=5)
37 | top_10_precision = metrics.top_k_precision(results, k=10)
38 | print('Recall@1: {}\tRecall@2: {}\tRecall@5: {}\tRecall@10: {}\n'.format(
39 | top_1_precision, top_2_precision, top_5_precision, top_10_precision))
40 |
41 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/C2P-X/metrics.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import math
3 |
4 |
5 | def is_valid_query(v):
6 | num_pos = 0
7 | num_neg = 0
8 | for aid, label, score in v:
9 | if label > 0:
10 | num_pos += 1
11 | else:
12 | num_neg += 1
13 | if num_pos > 0 and num_neg > 0:
14 | return True
15 | else:
16 | return False
17 |
18 |
19 | def get_num_valid_query(results):
20 | num_query = 0
21 | for k, v in results.items():
22 | if not is_valid_query(v):
23 | continue
24 | num_query += 1
25 | return num_query
26 |
27 |
28 | def top_1_precision(results):
29 | num_query = 0
30 | top_1_correct = 0.0
31 | for k, v in results.items():
32 | if not is_valid_query(v):
33 | continue
34 | num_query += 1
35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
36 | aid, label, score = sorted_v[0]
37 | if label > 0:
38 | top_1_correct += 1
39 |
40 | if num_query > 0:
41 | return top_1_correct / num_query
42 | else:
43 | return 0.0
44 |
45 |
46 | def top_k_precision(results, k=1):
47 | num_query = 0
48 | top_1_correct = 0.0
49 | for key, v in results.items():
50 | if not is_valid_query(v):
51 | continue
52 | num_query += 1
53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
54 | if k == 1:
55 | aid, label, score = sorted_v[0]
56 | if label > 0:
57 | top_1_correct += 1
58 | elif k == 2:
59 | aid1, label1, score1 = sorted_v[0]
60 | aid2, label2, score2 = sorted_v[1]
61 | if label1 > 0 or label2 > 0:
62 | top_1_correct += 1
63 | elif k == 5:
64 | for vv in sorted_v[0:5]:
65 | label = vv[1]
66 | if label > 0:
67 | top_1_correct += 1
68 | break
69 | else:
70 | raise BaseException
71 |
72 | if num_query > 0:
73 | return top_1_correct/num_query
74 | else:
75 | return 0.0
76 |
77 |
78 | def mean_reciprocal_rank(results):
79 | num_query = 0
80 | mrr = 0.0
81 | for k, v in results.items():
82 | if not is_valid_query(v):
83 | continue
84 |
85 | num_query += 1
86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
87 | for i, rec in enumerate(sorted_v):
88 | aid, label, score = rec
89 | if label > 0:
90 | mrr += 1.0 / (i + 1)
91 | break
92 |
93 | if num_query == 0:
94 | return 0.0
95 | else:
96 | mrr = mrr / num_query
97 | return mrr
98 |
99 |
100 | def mean_average_precision(results):
101 | num_query = 0
102 | mvp = 0.0
103 | for k, v in results.items():
104 | if not is_valid_query(v):
105 | continue
106 |
107 | num_query += 1
108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
109 | num_relevant_doc = 0.0
110 | avp = 0.0
111 | for i, rec in enumerate(sorted_v):
112 | aid, label, score = rec
113 | if label == 1:
114 | num_relevant_doc += 1
115 | precision = num_relevant_doc / (i + 1)
116 | avp += precision
117 | avp = avp / num_relevant_doc
118 | mvp += avp
119 |
120 | if num_query == 0:
121 | return 0.0
122 | else:
123 | mvp = mvp / num_query
124 | return mvp
125 |
126 |
127 | def classification_metrics(results):
128 | total_num = 0
129 | total_correct = 0
130 | true_positive = 0
131 | positive_correct = 0
132 | predicted_positive = 0
133 |
134 | loss = 0.0
135 | for k, v in results.items():
136 | for rec in v:
137 | total_num += 1
138 | aid, label, score = rec
139 |
140 | if score > 0.5:
141 | predicted_positive += 1
142 |
143 | if label > 0:
144 | true_positive += 1
145 | loss += -math.log(score + 1e-12)
146 | else:
147 | loss += -math.log(1.0 - score + 1e-12)
148 |
149 | if score > 0.5 and label > 0:
150 | total_correct += 1
151 | positive_correct += 1
152 |
153 | if score < 0.5 and label < 0.5:
154 | total_correct += 1
155 |
156 | accuracy = float(total_correct) / total_num
157 | precision = float(positive_correct) / (predicted_positive + 1e-12)
158 | recall = float(positive_correct) / true_positive
159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall)
160 | return accuracy, precision, recall, F1, loss / total_num
--------------------------------------------------------------------------------
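
A minimal usage sketch of the module above, with a toy, hand-made ```results``` dictionary (two contexts, three persona candidates each). The values in the comments follow directly from the definitions in ```metrics.py```; run it from a directory containing that file.

```
# Toy illustration (made-up scores). results maps a context id to a list of
# (candidate_id, label, score) tuples, as built by test.py / compute_metrics.py.
import metrics

results = {
    "c1": [("p1", 1, 0.9), ("p2", 0, 0.4), ("p3", 0, 0.1)],
    "c2": [("p1", 0, 0.7), ("p2", 1, 0.6), ("p3", 0, 0.2)],
}

print(metrics.get_num_valid_query(results))     # 2 (both queries have a positive and a negative)
print(metrics.mean_reciprocal_rank(results))    # (1/1 + 1/2) / 2 = 0.75
print(metrics.mean_average_precision(results))  # (1.0 + 0.5) / 2 = 0.75
print(metrics.top_k_precision(results, k=1))    # 0.5 (only c1 ranks its true persona first)
print(metrics.classification_metrics(results))  # (accuracy, precision, recall, F1, loss)
```
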
/Non-Pretraining-Based/U2P-X/metrics.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import math
3 |
4 |
5 | def is_valid_query(v):
6 | num_pos = 0
7 | num_neg = 0
8 | for aid, label, score in v:
9 | if label > 0:
10 | num_pos += 1
11 | else:
12 | num_neg += 1
13 | if num_pos > 0 and num_neg > 0:
14 | return True
15 | else:
16 | return False
17 |
18 |
19 | def get_num_valid_query(results):
20 | num_query = 0
21 | for k, v in results.items():
22 | if not is_valid_query(v):
23 | continue
24 | num_query += 1
25 | return num_query
26 |
27 |
28 | def top_1_precision(results):
29 | num_query = 0
30 | top_1_correct = 0.0
31 | for k, v in results.items():
32 | if not is_valid_query(v):
33 | continue
34 | num_query += 1
35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
36 | aid, label, score = sorted_v[0]
37 | if label > 0:
38 | top_1_correct += 1
39 |
40 | if num_query > 0:
41 | return top_1_correct / num_query
42 | else:
43 | return 0.0
44 |
45 |
46 | def top_k_precision(results, k=1):
47 | num_query = 0
48 | top_1_correct = 0.0
49 | for key, v in results.items():
50 | if not is_valid_query(v):
51 | continue
52 | num_query += 1
53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
54 | if k == 1:
55 | aid, label, score = sorted_v[0]
56 | if label > 0:
57 | top_1_correct += 1
58 | elif k == 2:
59 | aid1, label1, score1 = sorted_v[0]
60 | aid2, label2, score2 = sorted_v[1]
61 | if label1 > 0 or label2 > 0:
62 | top_1_correct += 1
63 | elif k == 5:
64 | for vv in sorted_v[0:5]:
65 | label = vv[1]
66 | if label > 0:
67 | top_1_correct += 1
68 | break
69 | else:
70 | raise BaseException
71 |
72 | if num_query > 0:
73 | return top_1_correct/num_query
74 | else:
75 | return 0.0
76 |
77 |
78 | def mean_reciprocal_rank(results):
79 | num_query = 0
80 | mrr = 0.0
81 | for k, v in results.items():
82 | if not is_valid_query(v):
83 | continue
84 |
85 | num_query += 1
86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
87 | for i, rec in enumerate(sorted_v):
88 | aid, label, score = rec
89 | if label > 0:
90 | mrr += 1.0 / (i + 1)
91 | break
92 |
93 | if num_query == 0:
94 | return 0.0
95 | else:
96 | mrr = mrr / num_query
97 | return mrr
98 |
99 |
100 | def mean_average_precision(results):
101 | num_query = 0
102 | mvp = 0.0
103 | for k, v in results.items():
104 | if not is_valid_query(v):
105 | continue
106 |
107 | num_query += 1
108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
109 | num_relevant_doc = 0.0
110 | avp = 0.0
111 | for i, rec in enumerate(sorted_v):
112 | aid, label, score = rec
113 | if label == 1:
114 | num_relevant_doc += 1
115 | precision = num_relevant_doc / (i + 1)
116 | avp += precision
117 | avp = avp / num_relevant_doc
118 | mvp += avp
119 |
120 | if num_query == 0:
121 | return 0.0
122 | else:
123 | mvp = mvp / num_query
124 | return mvp
125 |
126 |
127 | def classification_metrics(results):
128 | total_num = 0
129 | total_correct = 0
130 | true_positive = 0
131 | positive_correct = 0
132 | predicted_positive = 0
133 |
134 | loss = 0.0
135 | for k, v in results.items():
136 | for rec in v:
137 | total_num += 1
138 | aid, label, score = rec
139 |
140 | if score > 0.5:
141 | predicted_positive += 1
142 |
143 | if label > 0:
144 | true_positive += 1
145 | loss += -math.log(score + 1e-12)
146 | else:
147 | loss += -math.log(1.0 - score + 1e-12)
148 |
149 | if score > 0.5 and label > 0:
150 | total_correct += 1
151 | positive_correct += 1
152 |
153 | if score < 0.5 and label < 0.5:
154 | total_correct += 1
155 |
156 | accuracy = float(total_correct) / total_num
157 | precision = float(positive_correct) / (predicted_positive + 1e-12)
158 | recall = float(positive_correct) / true_positive
159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall)
160 | return accuracy, precision, recall, F1, loss / total_num
--------------------------------------------------------------------------------
/Pretraining-Based/C2P-BERT/metrics.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import math
3 |
4 |
5 | def is_valid_query(v):
6 | num_pos = 0
7 | num_neg = 0
8 | for aid, label, score in v:
9 | if label > 0:
10 | num_pos += 1
11 | else:
12 | num_neg += 1
13 | if num_pos > 0 and num_neg > 0:
14 | return True
15 | else:
16 | return False
17 |
18 |
19 | def get_num_valid_query(results):
20 | num_query = 0
21 | for k, v in results.items():
22 | if not is_valid_query(v):
23 | continue
24 | num_query += 1
25 | return num_query
26 |
27 |
28 | def top_1_precision(results):
29 | num_query = 0
30 | top_1_correct = 0.0
31 | for k, v in results.items():
32 | if not is_valid_query(v):
33 | continue
34 | num_query += 1
35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
36 | aid, label, score = sorted_v[0]
37 | if label > 0:
38 | top_1_correct += 1
39 |
40 | if num_query > 0:
41 | return top_1_correct / num_query
42 | else:
43 | return 0.0
44 |
45 |
46 | def top_k_precision(results, k=1):
47 | num_query = 0
48 | top_1_correct = 0.0
49 | for key, v in results.items():
50 | if not is_valid_query(v):
51 | continue
52 | num_query += 1
53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
54 | if k == 1:
55 | aid, label, score = sorted_v[0]
56 | if label > 0:
57 | top_1_correct += 1
58 | elif k == 2:
59 | aid1, label1, score1 = sorted_v[0]
60 | aid2, label2, score2 = sorted_v[1]
61 | if label1 > 0 or label2 > 0:
62 | top_1_correct += 1
63 | elif k == 5:
64 | for vv in sorted_v[0:5]:
65 | label = vv[1]
66 | if label > 0:
67 | top_1_correct += 1
68 | break
69 | else:
70 | raise BaseException
71 |
72 | if num_query > 0:
73 | return top_1_correct/num_query
74 | else:
75 | return 0.0
76 |
77 |
78 | def mean_reciprocal_rank(results):
79 | num_query = 0
80 | mrr = 0.0
81 | for k, v in results.items():
82 | if not is_valid_query(v):
83 | continue
84 |
85 | num_query += 1
86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
87 | for i, rec in enumerate(sorted_v):
88 | aid, label, score = rec
89 | if label > 0:
90 | mrr += 1.0 / (i + 1)
91 | break
92 |
93 | if num_query == 0:
94 | return 0.0
95 | else:
96 | mrr = mrr / num_query
97 | return mrr
98 |
99 |
100 | def mean_average_precision(results):
101 | num_query = 0
102 | mvp = 0.0
103 | for k, v in results.items():
104 | if not is_valid_query(v):
105 | continue
106 |
107 | num_query += 1
108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
109 | num_relevant_doc = 0.0
110 | avp = 0.0
111 | for i, rec in enumerate(sorted_v):
112 | aid, label, score = rec
113 | if label == 1:
114 | num_relevant_doc += 1
115 | precision = num_relevant_doc / (i + 1)
116 | avp += precision
117 | avp = avp / num_relevant_doc
118 | mvp += avp
119 |
120 | if num_query == 0:
121 | return 0.0
122 | else:
123 | mvp = mvp / num_query
124 | return mvp
125 |
126 |
127 | def classification_metrics(results):
128 | total_num = 0
129 | total_correct = 0
130 | true_positive = 0
131 | positive_correct = 0
132 | predicted_positive = 0
133 |
134 | loss = 0.0
135 | for k, v in results.items():
136 | for rec in v:
137 | total_num += 1
138 | aid, label, score = rec
139 |
140 | if score > 0.5:
141 | predicted_positive += 1
142 |
143 | if label > 0:
144 | true_positive += 1
145 | loss += -math.log(score + 1e-12)
146 | else:
147 | loss += -math.log(1.0 - score + 1e-12)
148 |
149 | if score > 0.5 and label > 0:
150 | total_correct += 1
151 | positive_correct += 1
152 |
153 | if score < 0.5 and label < 0.5:
154 | total_correct += 1
155 |
156 | accuracy = float(total_correct) / total_num
157 | precision = float(positive_correct) / (predicted_positive + 1e-12)
158 | recall = float(positive_correct) / true_positive
159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall)
160 | return accuracy, precision, recall, F1, loss / total_num
--------------------------------------------------------------------------------
/Pretraining-Based/U2P-BERT/metrics.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import math
3 |
4 |
5 | def is_valid_query(v):
6 | num_pos = 0
7 | num_neg = 0
8 | for aid, label, score in v:
9 | if label > 0:
10 | num_pos += 1
11 | else:
12 | num_neg += 1
13 | if num_pos > 0 and num_neg > 0:
14 | return True
15 | else:
16 | return False
17 |
18 |
19 | def get_num_valid_query(results):
20 | num_query = 0
21 | for k, v in results.items():
22 | if not is_valid_query(v):
23 | continue
24 | num_query += 1
25 | return num_query
26 |
27 |
28 | def top_1_precision(results):
29 | num_query = 0
30 | top_1_correct = 0.0
31 | for k, v in results.items():
32 | if not is_valid_query(v):
33 | continue
34 | num_query += 1
35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
36 | aid, label, score = sorted_v[0]
37 | if label > 0:
38 | top_1_correct += 1
39 |
40 | if num_query > 0:
41 | return top_1_correct / num_query
42 | else:
43 | return 0.0
44 |
45 |
46 | def top_k_precision(results, k=1):
47 | num_query = 0
48 | top_1_correct = 0.0
49 | for key, v in results.items():
50 | if not is_valid_query(v):
51 | continue
52 | num_query += 1
53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
54 | if k == 1:
55 | aid, label, score = sorted_v[0]
56 | if label > 0:
57 | top_1_correct += 1
58 | elif k == 2:
59 | aid1, label1, score1 = sorted_v[0]
60 | aid2, label2, score2 = sorted_v[1]
61 | if label1 > 0 or label2 > 0:
62 | top_1_correct += 1
63 | elif k == 5:
64 | for vv in sorted_v[0:5]:
65 | label = vv[1]
66 | if label > 0:
67 | top_1_correct += 1
68 | break
69 | elif k == 10:
70 | for vv in sorted_v[0:10]:
71 | label = vv[1]
72 | if label > 0:
73 | top_1_correct += 1
74 | break
75 | else:
76 | raise BaseException
77 |
78 | if num_query > 0:
79 | return top_1_correct/num_query
80 | else:
81 | return 0.0
82 |
83 |
84 | def mean_reciprocal_rank(results):
85 | num_query = 0
86 | mrr = 0.0
87 | for k, v in results.items():
88 | if not is_valid_query(v):
89 | continue
90 |
91 | num_query += 1
92 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
93 | for i, rec in enumerate(sorted_v):
94 | aid, label, score = rec
95 | if label > 0:
96 | mrr += 1.0 / (i + 1)
97 | break
98 |
99 | if num_query == 0:
100 | return 0.0
101 | else:
102 | mrr = mrr / num_query
103 | return mrr
104 |
105 |
106 | def mean_average_precision(results):
107 | num_query = 0
108 | mvp = 0.0
109 | for k, v in results.items():
110 | if not is_valid_query(v):
111 | continue
112 |
113 | num_query += 1
114 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True)
115 | num_relevant_doc = 0.0
116 | avp = 0.0
117 | for i, rec in enumerate(sorted_v):
118 | aid, label, score = rec
119 | if label == 1:
120 | num_relevant_doc += 1
121 | precision = num_relevant_doc / (i + 1)
122 | avp += precision
123 | avp = avp / num_relevant_doc
124 | mvp += avp
125 |
126 | if num_query == 0:
127 | return 0.0
128 | else:
129 | mvp = mvp / num_query
130 | return mvp
131 |
132 |
133 | def classification_metrics(results):
134 | total_num = 0
135 | total_correct = 0
136 | true_positive = 0
137 | positive_correct = 0
138 | predicted_positive = 0
139 |
140 | loss = 0.0
141 | for k, v in results.items():
142 | for rec in v:
143 | total_num += 1
144 | aid, label, score = rec
145 |
146 | if score > 0.5:
147 | predicted_positive += 1
148 |
149 | if label > 0:
150 | true_positive += 1
151 | loss += -math.log(score + 1e-12)
152 | else:
153 | loss += -math.log(1.0 - score + 1e-12)
154 |
155 | if score > 0.5 and label > 0:
156 | total_correct += 1
157 | positive_correct += 1
158 |
159 | if score < 0.5 and label < 0.5:
160 | total_correct += 1
161 |
162 | accuracy = float(total_correct) / total_num
163 | precision = float(positive_correct) / (predicted_positive + 1e-12)
164 | recall = float(positive_correct) / true_positive
165 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall)
166 | return accuracy, precision, recall, F1, loss / total_num
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Detecting Speaker Personas from Conversational Texts
2 | This repository contains the source code and dataset for the _EMNLP 2021_ paper [Detecting Speaker Personas from Conversational Texts](https://aclanthology.org/2021.emnlp-main.86.pdf) by Jia-Chen Gu, Zhen-Hua Ling, Yu Wu, Quan Liu, Zhigang Chen and Xiaodan Zhu.
3 |
4 |
5 | ## Introduction
6 | Personas are useful for dialogue response prediction. However, the personas used in current studies are pre-defined and hard to obtain before a conversation. To tackle this issue, we study a new task, named Speaker Persona Detection (SPD), which aims to detect speaker personas based on plain conversational text. In this task, the best-matched persona is retrieved from a set of candidates given the conversational text. This is a many-to-many semantic matching task because both contexts and personas in SPD are composed of multiple sentences. The long-term dependency and the dynamic redundancy among these sentences increase the difficulty of this task. We build a dataset for SPD, dubbed Persona Match on Persona-Chat (PMPC). Furthermore, we evaluate several baseline models and propose utterance-to-profile (U2P) matching networks for this task. The U2P models operate at a fine granularity, treating both contexts and personas as sets of multiple sequences. Each sequence pair is scored, and an interpretable overall score for a context-persona pair is obtained through aggregation. Evaluation results show that the U2P models significantly outperform their baseline counterparts.
7 |
8 | 
9 |
10 | 
11 |
12 |
13 | ## Dependencies
14 | Python 3.6
15 | TensorFlow 1.13.1
16 |
17 |
18 | ## Download
19 | - Download the [BERT base model released by Google Research](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip),
20 | and move it to the path: ```./Pretraining-Based/uncased_L-12_H-768_A-12```
21 |
22 | - Download the [PMPC dataset](https://drive.google.com/file/d/1sE_N7fi_WojeQBWZcTg4Mw6Pyod27S73/view?usp=sharing) used in our paper,
23 | and move it to the path: ```./data_PMPC```
24 |
25 |
26 | ## Non-Pretraining-Based Models
27 | Train a new model.
28 | ```
29 | cd Non-Pretraining-Based/C2P-X/scripts/
30 | bash train.sh
31 | ```
32 | The training process is recorded in the ```log_train_*.txt``` file.
33 |
34 | Test a trained model by modifying the variable ```latest_checkpoint``` in ```test.sh```.
35 | ```
36 | cd Non-Pretraining-Based/C2P-X/scripts/
37 | bash test.sh
38 | ```
39 | The testing process is recorded in the ```log_test_*.txt``` file. An ```output_test.txt``` file, which records a score for each context-persona pair, will be saved under the path given by ```latest_checkpoint```. Modify the variable ```test_out_filename``` in ```compute_metrics.py``` and then run the following command; various metrics will be printed.
40 | ```
41 | python compute_metrics.py
42 | ```
43 |
44 | You can choose a baseline model by commenting/uncommenting one of the model modules (```model_BOW```, ```model_BiLSTM```, ```model_Transformer``` or ```model_ESIM```) imported in the first few lines of ```train.py```. The same process and commands apply to the Non-Pretraining-Based U2P-X models.
45 |
46 |
47 | ## Pretraining-Based Models
48 | Create the fine-tuning data.
49 | ```
50 | cd Pretraining-Based/C2P-BERT/
51 | python data_process_tfrecord.py
52 | ```
53 |
54 | Run the fine-tuning process.
55 | ```
56 | cd Pretraining-Based/C2P-BERT/scripts/
57 | bash train.sh
58 | ```
59 |
60 | Test a trained model by modifying the variable ```restore_model_dir``` in ```test.sh```.
61 | ```
62 | cd Pretraining-Based/C2P-BERT/scripts/
63 | bash test.sh
64 | ```
65 |
66 | Modify the variable ```test_out_filename``` in ```compute_metrics.py``` and then run the following command; various metrics will be printed.
67 | ```
68 | python compute_metrics.py
69 | ```
70 |
71 | The same process and commands apply to U2P-BERT.
72 |
73 | **NOTE**: Since the dataset is small, each model was trained 10 times with identical architectures and different random initializations. Thus, we report mean ± standard deviation in our paper.
74 |
75 |
76 | ## Cite
77 | If you find our work helpful, or use the code or dataset, please cite the following paper:
78 | **"Detecting Speaker Personas from Conversational Texts"**
79 | Jia-Chen Gu, Zhen-Hua Ling, Yu Wu, Quan Liu, Zhigang Chen, Xiaodan Zhu. _EMNLP (2021)_
80 | ```
81 | @inproceedings{gu-etal-2021-detecting,
82 | title = "Detecting Speaker Personas from Conversational Texts",
83 | author = "Gu, Jia-Chen and
84 | Ling, Zhen-Hua and
85 | Wu, Yu and
86 | Liu, Quan and
87 | Chen, Zhigang and
88 | Zhu, Xiaodan",
89 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
90 | month = nov,
91 | year = "2021",
92 | address = "Online and Punta Cana, Dominican Republic",
93 | publisher = "Association for Computational Linguistics",
94 | url = "https://aclanthology.org/2021.emnlp-main.86",
95 | pages = "1126--1136",
96 | }
97 | ```
98 |
99 |
100 | ## Update
101 | Please keep an eye on this repository if you are interested in our work.
102 | Feel free to contact us (gujc@mail.ustc.edu.cn) or open an issue.
103 |
--------------------------------------------------------------------------------
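
The README's introduction describes the U2P idea: score every utterance-profile sentence pair, then aggregate the pair scores into one context-persona score. The sketch below only illustrates that aggregation step, assuming a simple max-then-mean pooling; the actual scoring and aggregation are defined by the U2P model code and the paper, not by this snippet.

```
# Illustration only: aggregating utterance-profile pair scores into a single
# context-persona score. Max over profile sentences, then mean over utterances,
# is just one plausible pooling choice, not necessarily the paper's.
import numpy as np

def aggregate_u2p(pair_scores):
    # pair_scores: [num_utterances, num_profile_sentences]
    best_per_utterance = pair_scores.max(axis=1)  # best-matching profile sentence per utterance
    return float(best_per_utterance.mean())       # overall matching score for the pair

pair_scores = np.array([[0.9, 0.2],
                        [0.1, 0.7],
                        [0.3, 0.4]])  # 3 utterances vs. 2 profile sentences (made-up scores)
print(aggregate_u2p(pair_scores))     # (0.9 + 0.7 + 0.4) / 3 = 0.666...
```
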
/Non-Pretraining-Based/C2P-X/test.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import os
4 | import time
5 | import datetime
6 | import operator
7 | import metrics
8 | from collections import defaultdict
9 | import data_helpers
10 |
11 | # Files
12 | tf.flags.DEFINE_string("test_file", "", "path to test file")
13 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file")
14 | tf.flags.DEFINE_string("char_vocab_file", "", "vocabulary file")
15 | tf.flags.DEFINE_string("output_file", "", "prediction output file")
16 |
17 | # Model Hyperparameters
18 | tf.flags.DEFINE_integer("max_context_len", 150, "max context length")
19 | tf.flags.DEFINE_integer("max_persona_len", 50, "max persona length")
20 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length")
21 |
22 | # Test parameters
23 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)")
24 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
25 |
26 | # Misc Parameters
27 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
28 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
29 |
30 | FLAGS = tf.flags.FLAGS
31 | # FLAGS._parse_flags()
32 | # print("\nParameters:")
33 | # for attr, value in sorted(FLAGS.__flags.items()):
34 | # print("{}={}".format(attr.upper(), value))
35 | print("")
36 |
37 | vocab = data_helpers.load_vocab(FLAGS.vocab_file)
38 | print('vocabulary size: {}'.format(len(vocab)))
39 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file)
40 |
41 | test_dataset = data_helpers.load_dataset(FLAGS.test_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len)
42 | print('test_pairs: {}'.format(len(test_dataset)))
43 |
44 | print("\nEvaluating...\n")
45 |
46 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
47 | print(checkpoint_file)
48 |
49 | graph = tf.Graph()
50 | with graph.as_default():
51 | session_conf = tf.ConfigProto(
52 | allow_soft_placement=FLAGS.allow_soft_placement,
53 | log_device_placement=FLAGS.log_device_placement)
54 | sess = tf.Session(config=session_conf)
55 | with sess.as_default():
56 | # Load the saved meta graph and restore variables
57 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
58 | saver.restore(sess, checkpoint_file)
59 |
60 | # Get the placeholders from the graph by name
61 | context = graph.get_operation_by_name("context").outputs[0]
62 | context_len = graph.get_operation_by_name("context_len").outputs[0]
63 | persona = graph.get_operation_by_name("persona").outputs[0]
64 | persona_len = graph.get_operation_by_name("persona_len").outputs[0]
65 |
66 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
67 |
68 | c_char_feature = graph.get_operation_by_name("context_char").outputs[0]
69 | c_char_len = graph.get_operation_by_name("context_char_len").outputs[0]
70 | p_char_feature = graph.get_operation_by_name("persona_char").outputs[0]
71 | p_char_len = graph.get_operation_by_name("persona_char_len").outputs[0]
72 |
73 | # Tensors we want to evaluate
74 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0]
75 |
76 | results = defaultdict(list)
77 | num_test = 0
78 |
79 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=False)
80 | for test_batch in test_batches:
81 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = test_batch
82 | feed_dict = {
83 | context: x_context,
84 | context_len: x_context_len,
85 | persona: x_persona,
86 | persona_len: x_persona_len,
87 | dropout_keep_prob: 1.0,
88 | c_char_feature: x_context_char,
89 | c_char_len: x_context_char_len,
90 | p_char_feature: x_persona_char,
91 | p_char_len: x_persona_char_len
92 | }
93 | predicted_prob = sess.run(prob, feed_dict)
94 | num_test += len(predicted_prob)
95 | print('num_test_sample={}'.format(num_test))
96 | for i, prob_score in enumerate(predicted_prob):
97 | us_id, ps_id, label = x_id_pairs[i]
98 | results[us_id].append((ps_id, label, prob_score))
99 |
100 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
101 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
102 |
103 | mvp = metrics.mean_average_precision(results)
104 | mrr = metrics.mean_reciprocal_rank(results)
105 | top_1_precision = metrics.top_1_precision(results)
106 | total_valid_query = metrics.get_num_valid_query(results)
107 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query))
108 |
109 | out_path = FLAGS.output_file
110 | print("Saving evaluation to {}".format(out_path))
111 | with open(out_path, 'w') as f:
112 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
113 | for us_id, v in results.items():
114 | v.sort(key=operator.itemgetter(2), reverse=True)
115 | for i, rec in enumerate(v):
116 | ps_id, label, prob_score = rec
117 | rank = i+1
118 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, ps_id, prob_score, rank, label))
119 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/C2P-X/model_BOW.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 | FLAGS = tf.flags.FLAGS
5 |
6 | def get_embeddings(vocab):
7 | print("get_embedding")
8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim)
9 | return tf.constant(initializer, name="word_embedding")
10 |
11 | def get_char_embedding(charVocab):
12 | print("get_char_embedding")
13 | char_size = len(charVocab)
14 | embeddings = np.zeros((char_size, char_size), dtype='float32')
15 | for i in range(1, char_size):
16 | embeddings[i, i] = 1.0
17 |
18 | return tf.constant(embeddings, name="word_char_embedding")
19 |
20 | def load_embed_vectors(fname, dim):
21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... }
22 | vectors = {}
23 | for line in open(fname, 'rt'):
24 | items = line.strip().split(' ')
25 | if len(items[0]) <= 0:
26 | continue
27 | vec = [float(items[i]) for i in range(1, dim+1)]
28 | vectors[items[0]] = vec
29 |
30 | return vectors
31 |
32 | def load_word_embeddings(vocab, dim):
33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim)
34 | vocab_size = len(vocab)
35 | embeddings = np.zeros((vocab_size, dim), dtype='float32')
36 | for word, code in vocab.items():
37 | if word in vectors:
38 | embeddings[code] = vectors[word]
39 | #else:
40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim)
41 |
42 | return embeddings
43 |
44 |
45 | class BOW(object):
46 | def __init__(
47 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0):
48 |
49 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context")
50 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len")
51 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona")
52 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len")
53 |
54 | self.target = tf.placeholder(tf.float32, [None], name="target")
55 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
56 |
57 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char")
58 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len")
59 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char")
60 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len")
61 |
62 | l2_loss = tf.constant(1.0)
63 |
64 | # =============================== Embedding layer ===============================
65 | with tf.name_scope("embedding"):
66 | W = get_embeddings(vocab)
67 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim]
68 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim]
69 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob)
70 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob)
71 | print("context_embedded: {}".format(context_embedded.get_shape()))
72 | print("persona_embedded: {}".format(persona_embedded.get_shape()))
73 |
74 |
75 | # =============================== Encoding layer ===============================
76 | with tf.variable_scope("encoding_layer") as vs:
77 | mask_c = tf.sequence_mask(self.context_len, max_context_len, dtype=tf.float32) # [batch_size, max_context_len]
78 | mask_c = tf.expand_dims(mask_c, -1) # [batch_size, max_context_len, 1]
79 | final_context = tf.reduce_max(context_embedded * mask_c, axis=1)
80 |
81 | mask_p = tf.sequence_mask(self.persona_len, max_persona_len, dtype=tf.float32) # [batch_size, max_persona_len]
82 | mask_p = tf.expand_dims(mask_p, -1) # [batch_size, max_persona_len, 1]
83 | final_persona = tf.reduce_max(persona_embedded * mask_p, axis=1)
84 | print("establish BOW encoder")
85 |
86 |
87 | # =============================== Matching layer ===============================
88 | with tf.variable_scope("matching_layer") as vs:
89 | output_dim = final_context.get_shape()[-1].value
90 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32)
91 |
92 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim]
93 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ]
94 | print("shape of similarity: {}".format(similarity.get_shape()))
95 |
96 |
97 | # =============================== Prediction layer ===============================
98 | with tf.variable_scope("prediction_layer") as vs:
99 | logits = similarity
100 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ]
101 |
102 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target)
103 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum(
104 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
105 |
106 | with tf.name_scope("accuracy"):
107 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ]
108 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
109 |
--------------------------------------------------------------------------------
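
For readability, here is a NumPy re-statement of the core computation in ```model_BOW.py``` above (length-masked max pooling over word embeddings, a bilinear similarity through the trainable ```A_matrix```, and a sigmoid). It is a sketch with random, illustrative inputs, not a drop-in replacement for the TensorFlow graph.

```
# Sketch of the C2P-BOW forward pass in plain NumPy (shapes and inputs are illustrative).
import numpy as np

def masked_max_pool(embedded, lengths):
    # embedded: [batch, max_len, dim]; lengths: [batch] true sequence lengths
    mask = (np.arange(embedded.shape[1])[None, :] < lengths[:, None]).astype(embedded.dtype)
    return (embedded * mask[:, :, None]).max(axis=1)            # [batch, dim]

def c2p_bow_prob(context_emb, context_len, persona_emb, persona_len, A):
    final_context = masked_max_pool(context_emb, context_len)   # as in the encoding layer
    final_persona = masked_max_pool(persona_emb, persona_len)
    logits = np.einsum('bd,de,be->b', final_context, A, final_persona)  # bilinear matching
    return 1.0 / (1.0 + np.exp(-logits))                         # sigmoid -> matching prob

batch, dim = 2, 300
rng = np.random.default_rng(0)
context_emb = rng.normal(size=(batch, 150, dim)).astype("float32")
persona_emb = rng.normal(size=(batch, 50, dim)).astype("float32")
A = np.eye(dim, dtype="float32")                                 # stands in for A_matrix_v
print(c2p_bow_prob(context_emb, np.array([120, 90]), persona_emb, np.array([30, 45]), A))
```
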
/Non-Pretraining-Based/U2P-X/test.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import os
4 | import time
5 | import datetime
6 | import operator
7 | import metrics
8 | from collections import defaultdict
9 | import data_helpers
10 |
11 | # Files
12 | tf.flags.DEFINE_string("test_file", "", "path to test file")
13 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file")
14 | tf.flags.DEFINE_string("char_vocab_file", "", "vocabulary file")
15 | tf.flags.DEFINE_string("output_file", "", "prediction output file")
16 |
17 | # Model Hyperparameters
18 | tf.flags.DEFINE_integer("max_utter_num", 8, "max utterance number")
19 | tf.flags.DEFINE_integer("max_utter_len", 20, "max utterance length")
20 | tf.flags.DEFINE_integer("max_profile_num", 5, "max profile number")
21 | tf.flags.DEFINE_integer("max_profile_len", 15, "max profile length")
22 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length")
23 |
24 | # Test parameters
25 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)")
26 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
27 |
28 | # Misc Parameters
29 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
30 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
31 |
32 | FLAGS = tf.flags.FLAGS
33 | # FLAGS._parse_flags()
34 | # print("\nParameters:")
35 | # for attr, value in sorted(FLAGS.__flags.items()):
36 | # print("{}={}".format(attr.upper(), value))
37 | print("")
38 |
39 | vocab = data_helpers.load_vocab(FLAGS.vocab_file)
40 | print('vocabulary size: {}'.format(len(vocab)))
41 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file)
42 |
43 | test_dataset = data_helpers.load_dataset(FLAGS.test_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len)
44 | print('test_pairs: {}'.format(len(test_dataset)))
45 |
46 | print("\nEvaluating...\n")
47 |
48 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
49 | print(checkpoint_file)
50 |
51 | graph = tf.Graph()
52 | with graph.as_default():
53 | session_conf = tf.ConfigProto(
54 | allow_soft_placement=FLAGS.allow_soft_placement,
55 | log_device_placement=FLAGS.log_device_placement)
56 | sess = tf.Session(config=session_conf)
57 | with sess.as_default():
58 | # Load the saved meta graph and restore variables
59 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
60 | saver.restore(sess, checkpoint_file)
61 |
62 | # Get the placeholders from the graph by name
63 | utterances = graph.get_operation_by_name("utterances").outputs[0]
64 | utterances_len = graph.get_operation_by_name("utterances_len").outputs[0]
65 | utterances_num = graph.get_operation_by_name("utterances_num").outputs[0]
66 | profiles = graph.get_operation_by_name("profiles").outputs[0]
67 | profiles_len = graph.get_operation_by_name("profiles_len").outputs[0]
68 | profiles_num = graph.get_operation_by_name("profiles_num").outputs[0]
69 |
70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
71 |
72 | u_char_feature = graph.get_operation_by_name("utterances_char").outputs[0]
73 | u_char_len = graph.get_operation_by_name("utterances_char_len").outputs[0]
74 | p_char_feature = graph.get_operation_by_name("profiles_char").outputs[0]
75 | p_char_len = graph.get_operation_by_name("profiles_char_len").outputs[0]
76 |
77 | # Tensors we want to evaluate
78 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0]
79 |
80 | results = defaultdict(list)
81 | num_test = 0
82 |
83 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=False)
84 | for test_batch in test_batches:
85 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, \
86 | x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = test_batch
87 | feed_dict = {
88 | utterances: x_utterances,
89 | utterances_len: x_utterances_len,
90 | utterances_num: x_utterances_num,
91 | profiles: x_profiles,
92 | profiles_len: x_profiles_len,
93 | profiles_num: x_profiles_num,
94 | dropout_keep_prob: 1.0,
95 | u_char_feature: x_utterances_char,
96 | u_char_len: x_utterances_char_len,
97 | p_char_feature: x_profiles_char,
98 | p_char_len: x_profiles_char_len
99 | }
100 | predicted_prob = sess.run(prob, feed_dict)
101 | num_test += len(predicted_prob)
102 | print('num_test_sample={}'.format(num_test))
103 | for i, prob_score in enumerate(predicted_prob):
104 | us_id, ps_id, label = x_ids[i]
105 | results[us_id].append((ps_id, label, prob_score))
106 |
107 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
108 | print('Accuracy: {}\tPrecision: {}\tRecall: {}\tF1: {}\tLoss: {}'.format(accu, precision, recall, f1, loss))
109 |
110 | mvp = metrics.mean_average_precision(results)
111 | mrr = metrics.mean_reciprocal_rank(results)
112 | top_1_precision = metrics.top_1_precision(results)
113 | total_valid_query = metrics.get_num_valid_query(results)
114 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query))
115 |
116 | out_path = FLAGS.output_file
117 | print("Saving evaluation to {}".format(out_path))
118 | with open(out_path, 'w') as f:
119 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
120 | for us_id, v in results.items():
121 | v.sort(key=operator.itemgetter(2), reverse=True)
122 | for i, rec in enumerate(v):
123 | ps_id, label, prob_score = rec
124 | rank = i+1
125 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, ps_id, prob_score, rank, label))
126 |
--------------------------------------------------------------------------------
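Note: test.py above scores every candidate persona for a query and writes one TSV row per (query, persona) pair, ranked by predicted probability. Below is a minimal, self-contained sketch (not the repository's compute_metrics.py) of how that TSV could be re-read to recompute MRR offline; the file name eval_result.txt is a placeholder for whatever is passed via --output_file.

# Hedged sketch: re-read the TSV written by test.py and recompute MRR.
# "eval_result.txt" is a hypothetical path standing in for --output_file.
from collections import defaultdict

def mrr_from_tsv(path):
    ranked = defaultdict(list)               # query_id -> [(rank, relevance), ...]
    with open(path) as f:
        next(f)                              # skip the header row
        for line in f:
            query_id, _persona_id, _score, rank, relevance = line.rstrip('\n').split('\t')
            ranked[query_id].append((int(rank), int(relevance)))
    reciprocal_ranks = []
    for pairs in ranked.values():
        pairs.sort()                         # ascending by rank
        rr = next((1.0 / r for r, rel in pairs if rel == 1), 0.0)
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0

print(mrr_from_tsv("eval_result.txt"))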
/Non-Pretraining-Based/C2P-X/data_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 |
4 |
5 | def load_vocab(fname):
6 | '''
7 | vocab = {"I": 0, ...}
8 | '''
9 | vocab={}
10 | with open(fname, 'rt') as f:
11 | for i,line in enumerate(f):
12 | word = line.strip()
13 | vocab[word] = i
14 | return vocab
15 |
16 | def load_char_vocab(fname):
17 | '''
18 | charVocab = {"U": 0, "!": 1, ...}
19 | '''
20 | charVocab={}
21 | with open(fname, 'rt') as f:
22 | for line in f:
23 | fields = line.strip().split('\t')
24 | char_id = int(fields[0])
25 | ch = fields[1]
26 | charVocab[ch] = char_id
27 | return charVocab
28 |
29 | def to_vec(tokens, vocab, maxlen):
30 | '''
31 | length: length of the input sequence
32 | vec: map the token to the vocab_id, return a varied-length array [3, 6, 4, 3, ...]
33 | '''
34 | n = len(tokens)
35 | length = 0
36 | vec=[]
37 | for i in range(n):
38 | length += 1
39 | if tokens[i] in vocab:
40 | vec.append(vocab[tokens[i]])
41 | else:
42 | vec.append(vocab["_unk_"])
43 | return length, np.array(vec)
44 |
45 | def load_dataset(fname, vocab, max_context_len, max_persona_len):
46 |
47 | dataset=[]
48 | with open(fname, 'rt') as f:
49 | for line in f:
50 | line = line.strip()
51 | fields = line.split('\t')
52 |
53 | # id
54 | c_id = fields[0]
55 |
56 | # context
57 | context = fields[1] + " _eos_"
58 | c_tokens = context.split(' ')[:max_context_len] # select the head max_context_len tokens in every context
59 | c_len, c_vec = to_vec(c_tokens, vocab, max_context_len)
60 |
61 | # matched persona
62 | if fields[2] != "NA":
63 | personas = fields[2].split("|")
64 | for index, persona in enumerate(personas):
65 | p_id = "1." + str(index)
66 | persona = persona + " _eos_"
67 | p_tokens = persona.split(' ')[:max_persona_len] # select the head max_persona_len tokens in every persona
68 | p_len, p_vec = to_vec(p_tokens, vocab, max_persona_len)
69 | dataset.append((c_id, c_tokens, c_vec, c_len, 1.0, p_id, p_tokens, p_vec, p_len))
70 |
71 | # mismatched persona
72 | if fields[3] != "NA":
73 | personas = fields[3].split("|")
74 | for index, persona in enumerate(personas):
75 | ps_id = "0." + str(index)
76 | persona = persona + " _eos_"
77 | p_tokens = persona.split(' ')[:max_persona_len] # select the head max_persona_len tokens in every persona
78 | p_len, p_vec = to_vec(p_tokens, vocab, max_persona_len)
79 | dataset.append((c_id, c_tokens, c_vec, c_len, 0.0, p_id, p_tokens, p_vec, p_len))
80 |
81 | return dataset
82 |
83 |
84 | def normalize_vec(vec, maxlen):
85 | '''
86 | pad the original vec to the same maxlen
87 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0]
88 | '''
89 | if len(vec) == maxlen:
90 | return vec
91 |
92 | new_vec = np.zeros(maxlen, dtype='int32')
93 | for i in range(len(vec)):
94 | new_vec[i] = vec[i]
95 | return new_vec
96 |
97 |
98 | def charVec(tokens, charVocab, maxlen, maxWordLength):
99 | '''
100 | chars = np.array( (maxlen, maxWordLength) ) 0 if not found in charVocab or None
101 | word_lengths = np.array( maxlen ) 1 if None
102 | '''
103 | n = len(tokens)
104 | if n > maxlen:
105 | n = maxlen
106 |
107 | chars = np.zeros((maxlen, maxWordLength), dtype=np.int32)
108 | word_lengths = np.ones(maxlen, dtype=np.int32)
109 | for i in range(n):
110 | token = tokens[i][:maxWordLength]
111 | word_lengths[i] = len(token)
112 | row = chars[i]
113 | for idx, ch in enumerate(token):
114 | if ch in charVocab:
115 | row[idx] = charVocab[ch]
116 |
117 | return chars, word_lengths
118 |
119 |
120 | def batch_iter(data, batch_size, num_epochs, max_context_len, max_persona_len,
121 | charVocab, max_word_length, shuffle=True):
122 | """
123 | Generates a batch iterator for a dataset.
124 | """
125 | data_size = len(data)
126 | num_batches_per_epoch = (data_size + batch_size - 1) // batch_size  # ceiling division over data_size
127 | for epoch in range(num_epochs):
128 | # Shuffle the data at each epoch
129 | if shuffle:
130 | random.shuffle(data)
131 | for batch_num in range(num_batches_per_epoch):
132 | start_index = batch_num * batch_size
133 | end_index = min((batch_num + 1) * batch_size, data_size)
134 |
135 | x_context = []
136 | x_context_len = []
137 | x_persona = []
138 | x_persona_len = []
139 |
140 | x_labels = []
141 | x_id_pairs = []
142 |
143 | x_context_char = []
144 | x_context_char_len = []
145 | x_persona_char = []
146 | x_persona_char_len = []
147 |
148 | for rowIdx in range(start_index, end_index):
149 | c_id, c_tokens, c_vec, c_len, label, p_id, p_tokens, p_vec, p_len = data[rowIdx]
150 |
151 | # normalize c_vec
152 | new_c_vec = normalize_vec(c_vec, max_context_len)
153 | x_context.append(new_c_vec)
154 | x_context_len.append(c_len)
155 |
156 | # normalize p_vec
157 | new_p_vec = normalize_vec(p_vec, max_persona_len)
158 | x_persona.append(new_p_vec)
159 | x_persona_len.append(p_len)
160 |
161 | x_labels.append(label)
162 | x_id_pairs.append((c_id, p_id, int(label)))
163 |
164 | # normalize us_CharVec
165 | cCharVec, cCharLen = charVec(c_tokens, charVocab, max_context_len, max_word_length)
166 | x_context_char.append(cCharVec)
167 | x_context_char_len.append(cCharLen)
168 |
169 | # normalize ps_CharVec
170 | pCharVec, pCharLen = charVec(p_tokens, charVocab, max_persona_len, max_word_length)
171 | x_persona_char.append(pCharVec)
172 | x_persona_char_len.append(pCharLen)
173 |
174 | yield np.array(x_context), np.array(x_context_len), np.array(x_persona), np.array(x_persona_len), \
175 | np.array(x_labels), x_id_pairs, \
176 | np.array(x_context_char), np.array(x_context_char_len), np.array(x_persona_char), np.array(x_persona_char_len)
177 |
--------------------------------------------------------------------------------
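A minimal usage sketch of the helpers above, assuming it runs inside the C2P-X directory; the file names (vocab.txt, char_vocab.txt, train.txt) and length limits are illustrative placeholders, not the values hard-coded in the train/test scripts.

# Hedged usage sketch for data_helpers.py; paths and limits below are placeholders.
import data_helpers

vocab = data_helpers.load_vocab("vocab.txt")                 # {"word": id, ...}
char_vocab = data_helpers.load_char_vocab("char_vocab.txt")  # {"c": id, ...}

# Each example is one (context, persona) pair with a 1.0/0.0 label.
dataset = data_helpers.load_dataset("train.txt", vocab,
                                    max_context_len=320, max_persona_len=15)

batches = data_helpers.batch_iter(dataset, batch_size=128, num_epochs=1,
                                  max_context_len=320, max_persona_len=15,
                                  charVocab=char_vocab, max_word_length=18,
                                  shuffle=True)
for batch in batches:
    (x_context, x_context_len, x_persona, x_persona_len,
     x_labels, x_id_pairs,
     x_context_char, x_context_char_len,
     x_persona_char, x_persona_char_len) = batch
    # these arrays feed the model placeholders of the same names
    break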
/Pretraining-Based/C2P-BERT/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
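create_optimizer above builds a linear warmup to init_lr over num_warmup_steps, followed by linear (polynomial, power=1.0) decay to zero over num_train_steps. The following pure-Python sketch reproduces that schedule for sanity-checking the learning rate at a given step; all hyperparameter values are illustrative.

# Hedged sketch of the warmup + linear-decay schedule, without TensorFlow.
def learning_rate_at(step, init_lr=2e-5, num_train_steps=10000, num_warmup_steps=1000):
    # linear decay to zero over num_train_steps (polynomial_decay with power=1.0)
    decayed = init_lr * max(0.0, 1.0 - min(step, num_train_steps) / num_train_steps)
    if num_warmup_steps and step < num_warmup_steps:
        # during warmup: global_step / num_warmup_steps * init_lr
        return init_lr * step / num_warmup_steps
    return decayed

for s in (0, 500, 1000, 5000, 10000):
    print(s, learning_rate_at(s))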
/Pretraining-Based/U2P-BERT/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
/Non-Pretraining-Based/U2P-X/model_BOW.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 | FLAGS = tf.flags.FLAGS
5 |
6 | def get_embeddings(vocab):
7 | print("get_embedding")
8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim)
9 | return tf.constant(initializer, name="word_embedding")
10 |
11 | def get_char_embedding(charVocab):
12 | print("get_char_embedding")
13 | char_size = len(charVocab)
14 | embeddings = np.zeros((char_size, char_size), dtype='float32')
15 | for i in range(1, char_size):
16 | embeddings[i, i] = 1.0
17 |
18 | return tf.constant(embeddings, name="word_char_embedding")
19 |
20 | def load_embed_vectors(fname, dim):
21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... }
22 | vectors = {}
23 | for line in open(fname, 'rt'):
24 | items = line.strip().split(' ')
25 | if len(items[0]) <= 0:
26 | continue
27 | vec = [float(items[i]) for i in range(1, dim+1)]
28 | vectors[items[0]] = vec
29 |
30 | return vectors
31 |
32 | def load_word_embeddings(vocab, dim):
33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim)
34 | vocab_size = len(vocab)
35 | embeddings = np.zeros((vocab_size, dim), dtype='float32')
36 | for word, code in vocab.items():
37 | if word in vectors:
38 | embeddings[code] = vectors[word]
39 | #else:
40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim)
41 |
42 | return embeddings
43 |
44 |
45 | class BOW(object):
46 | def __init__(
47 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0):
48 |
49 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances")
50 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len")
51 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num")
52 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles")
53 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len")
54 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num")
55 |
56 | self.target = tf.placeholder(tf.float32, [None], name="target")
57 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
58 |
59 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char")
60 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len")
61 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char")
62 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len")
63 |
64 | l2_loss = tf.constant(1.0)
65 |
66 |
67 | # =============================== Embedding layer ===============================
68 | with tf.name_scope("embedding"):
69 | W = get_embeddings(vocab)
70 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim]
71 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim]
72 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob)
73 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob)
74 | print("utterances_embedded: {}".format(utterances_embedded.get_shape()))
75 | print("profiles_embedded: {}".format(profiles_embedded.get_shape()))
76 |
77 |
78 | # =============================== Encoding layer ===============================
79 | with tf.variable_scope("encoding_layer") as vs:
80 | mask_u = tf.sequence_mask(self.utterances_len, max_utter_len, dtype=tf.float32) # [batch_size, max_utter_num, max_utter_len]
81 | mask_u = tf.expand_dims(mask_u, -1) # [batch_size, max_utter_num, max_utter_len, 1]
82 | final_utterances = tf.reduce_max(utterances_embedded * mask_u, axis=2) # [batch_size, max_utter_num, word_dim]
83 |
84 | mask_p = tf.sequence_mask(self.profiles_len, max_profile_len, dtype=tf.float32) # [batch_size, max_profile_num, max_profile_len]
85 | mask_p = tf.expand_dims(mask_p, -1) # [batch_size, max_profile_num, max_profile_len, 1]
86 | final_profiles = tf.reduce_max(profiles_embedded * mask_p, axis=2) # [batch_size, max_profile_num, word_dim]
87 | print("establish BOW encoder")
88 |
89 |
90 | # =============================== Matching layer ===============================
91 | with tf.variable_scope("matching_layer") as vs:
92 | concat_dim = final_utterances.get_shape()[-1].value
93 |
94 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32)
95 | similarity = tf.einsum('aij,jk->aik',
96 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim]
97 | similarity = tf.matmul(similarity,
98 | tf.transpose(final_profiles, perm=[0, 2, 1]),
99 | name="similarity") # [batch_size, max_utter_num, max_profile_num]
100 |
101 | print("shape of similarity: {}".format(similarity.get_shape()))
102 | print("establish matching between utterances and profiles")
103 |
104 |
105 | # =============================== Aggregation layer ===============================
106 | with tf.variable_scope("aggregation_layer") as vs:
107 | logits = tf.reduce_max(similarity, axis=2, name="logits_1") # [batch_size, max_utter_num]
108 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num]
109 | logits = logits * mask_u
110 | logits = tf.reduce_sum(logits, axis=1, name="logits_2") # [batch_size, ]
111 | print("establish reduce_max across profiles and masked_reduce_sum across utterances")
112 | print("logits: {}".format(logits.get_shape()))
113 |
114 |
115 | # =============================== Prediction layer ===============================
116 | with tf.variable_scope("prediction_layer") as vs:
117 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ]
118 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target)
119 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum(
120 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
121 |
122 | with tf.name_scope("accuracy"):
123 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ]
124 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
125 |
--------------------------------------------------------------------------------
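The U2P BOW model above max-pools word embeddings per utterance and per profile sentence, scores every utterance-profile pair with a bilinear form, keeps the best-matching profile per utterance, and sums over the valid utterances. A NumPy sketch of that matching/aggregation algebra, with random values purely for illustration:

# Hedged NumPy sketch of the BOW matching and aggregation layers; shapes are tiny.
import numpy as np

batch, n_utter, n_prof, dim = 2, 3, 4, 5
final_utterances = np.random.rand(batch, n_utter, dim)   # max-pooled utterance vectors
final_profiles = np.random.rand(batch, n_prof, dim)      # max-pooled profile-sentence vectors
A_matrix = np.random.rand(dim, dim)                      # stands in for the learned A_matrix_v
utterances_num = np.array([3, 2])                        # valid utterances per example

# similarity[b, i, j] = u_i^T A p_j, as in the matching layer
similarity = np.einsum('bid,de,bje->bij', final_utterances, A_matrix, final_profiles)

logits = similarity.max(axis=2)                          # best-matching profile per utterance
mask_u = (np.arange(n_utter)[None, :] < utterances_num[:, None]).astype(float)
logits = (logits * mask_u).sum(axis=1)                   # masked sum over utterances -> [batch]
print(logits.shape)                                      # (2,)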
/Non-Pretraining-Based/C2P-X/transformer_block.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | # main function
4 | def block(
5 | Q, K, V,
6 | Q_lengths, K_lengths,
7 | attention_type='dot',
8 | is_layer_norm=True,
9 | is_mask=True, mask_value=-2**32+1,
10 | drop_prob=None):
11 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf.
12 | Args:
13 | Q: a tensor with shape [batch, Q_time, Q_dimension]
14 | K: a tensor with shape [batch, time, K_dimension]
15 | V: a tensor with shape [batch, time, V_dimension]
16 |
17 | Q_length: a tensor with shape [batch]
18 | K_length: a tensor with shape [batch]
19 |
20 | Returns:
21 | a tensor with shape [batch, time, dimension]
22 | '''
23 | att = attention(Q, K, V,
24 | Q_lengths, K_lengths,
25 | attention_type=attention_type,
26 | is_mask=is_mask, mask_value=mask_value,
27 | drop_prob=drop_prob)
28 | if is_layer_norm:
29 | with tf.variable_scope('attention_layer_norm'):
30 | y = layer_norm_debug(Q + att)
31 | else:
32 | y = Q + att
33 |
34 | z = FFN(y)
35 | if is_layer_norm:
36 | with tf.variable_scope('FFN_layer_norm'):
37 | w = layer_norm_debug(y + z)
38 | else:
39 | w = y + z
40 | return w
41 |
42 | def attention(
43 | Q, K, V,
44 | Q_lengths, K_lengths,
45 | attention_type='dot',
46 | is_mask=True, mask_value=-2**32+1,
47 | drop_prob=None):
48 | '''Add attention layer.
49 | Args:
50 | Q: a tensor with shape [batch, Q_time, Q_dimension]
51 | K: a tensor with shape [batch, time, K_dimension]
52 | V: a tensor with shape [batch, time, V_dimension]
53 |
54 | Q_length: a tensor with shape [batch]
55 | K_length: a tensor with shape [batch]
56 |
57 | Returns:
58 | a tensor with shape [batch, Q_time, V_dimension]
59 |
60 | Raises:
61 | AssertionError: if
62 | Q_dimension not equal to K_dimension when attention type is dot.
63 | '''
64 | assert attention_type in ('dot', 'bilinear')
65 | if attention_type == 'dot':
66 | assert Q.shape[-1] == K.shape[-1]
67 |
68 | Q_time = Q.shape[1]
69 | K_time = K.shape[1]
70 |
71 | if attention_type == 'dot':
72 | logits = dot_sim(Q, K) #[batch, Q_time, time]
73 | if attention_type == 'bilinear':
74 | logits = bilinear_sim(Q, K)
75 |
76 | if is_mask:
77 | _mask = mask(Q_lengths, K_lengths, Q_time, K_time) #[batch, Q_time, K_time]
78 | logits = _mask * logits + (1 - _mask) * mask_value
79 |
80 | attention = tf.nn.softmax(logits)
81 |
82 | if drop_prob is not None:
83 | print('use attention drop')
84 | attention = tf.nn.dropout(attention, drop_prob)
85 |
86 | return weighted_sum(attention, V)
87 |
88 | def dot_sim(x, y, is_nor=True):
89 | '''calculate dot similarity between two tensors.
90 |
91 | Args:
92 | x: a tensor with shape [batch, time_x, dimension]
93 | y: a tensor with shape [batch, time_y, dimension]
94 |
95 | Returns:
96 | a tensor with shape [batch, time_x, time_y]
97 | Raises:
98 | AssertionError: if
99 | the shapes of x and y do not match.
100 | '''
101 | assert x.shape[-1] == y.shape[-1]
102 |
103 | sim = tf.einsum('bik,bjk->bij', x, y)
104 |
105 | if is_nor:
106 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32))
107 | scale = tf.maximum(1.0, scale)
108 | return sim / scale
109 | else:
110 | return sim
111 |
112 | def bilinear_sim(x, y, is_nor=True):
113 | '''calculate bilinear similarity between two tensors.
114 | Args:
115 | x: a tensor with shape [batch, time_x, dimension_x]
116 | y: a tensor with shape [batch, time_y, dimension_y]
117 |
118 | Returns:
119 | a tensor with shape [batch, time_x, time_y]
120 | Raises:
121 | ValueError: if
122 | the shapes of x and y do not match;
123 | bilinear matrix reuse error.
124 | '''
125 | M = tf.get_variable(
126 | name="bilinear_matrix",
127 | shape=[x.shape[-1], y.shape[-1]],
128 | dtype=tf.float32,
129 | initializer=tf.orthogonal_initializer())
130 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y)
131 |
132 | if is_nor:
133 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32))
134 | scale = tf.maximum(1.0, scale)
135 | return sim / scale
136 | else:
137 | return sim
138 |
139 | def mask(row_lengths, col_lengths, max_row_length, max_col_length):
140 | '''Return a mask tensor representing the first N positions of each row and each column.
141 |
142 | Args:
143 | row_lengths: a tensor with shape [batch]
144 | col_lengths: a tensor with shape [batch]
145 |
146 | Returns:
147 | a mask tensor with shape [batch, max_row_length, max_col_length]
148 |
149 | Raises:
150 | '''
151 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #bool, [batch, max_row_len]
152 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #bool, [batch, max_col_len]
153 |
154 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32)
155 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32)
156 |
157 | return tf.einsum('bik,bjk->bij', row_mask, col_mask)
158 |
159 | def weighted_sum(weight, values):
160 | '''Calculate the weighted sum.
161 |
162 | Args:
163 | weight: a tensor with shape [batch, time, dimension]
164 | values: a tensor with shape [batch, dimension, values_dimension]
165 |
166 | Return:
167 | a tensor with shape [batch, time, values_dimension]
168 |
169 | Raises:
170 | '''
171 | return tf.einsum('bij,bjk->bik', weight, values)
172 |
173 | def layer_norm_debug(x, axis = None, epsilon=1e-6):
174 | '''Add layer normalization.
175 |
176 | Args:
177 | x: a tensor
178 | axis: the dimensions to normalize
179 |
180 | Returns:
181 | a tensor the same shape as x.
182 |
183 | Raises:
184 | '''
185 | if axis is None:
186 | axis = [-1]
187 | shape = [x.shape[i] for i in axis]
188 |
189 | scale = tf.get_variable(
190 | name='scale',
191 | shape=shape,
192 | dtype=tf.float32,
193 | initializer=tf.ones_initializer())
194 | bias = tf.get_variable(
195 | name='bias',
196 | shape=shape,
197 | dtype=tf.float32,
198 | initializer=tf.zeros_initializer())
199 |
200 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True)
201 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True)
202 | norm = (x-mean) * tf.rsqrt(variance + epsilon)
203 | return scale * norm + bias
204 |
205 | def FFN(x, out_dimension_0=None, out_dimension_1=None):
206 | '''Add two densely connected layers, max(0, x*W0+b0)*W1+b1.
207 |
208 | Args:
209 | x: a tensor with shape [batch, time, dimension]
210 | out_dimension: a number which is the output dimension
211 |
212 | Returns:
213 | a tensor with shape [batch, time, out_dimension]
214 |
215 | Raises:
216 | '''
217 | with tf.variable_scope('FFN_1'):
218 | y = dense(x, out_dimension_0)
219 | y = tf.nn.relu(y)
220 | with tf.variable_scope('FFN_2'):
221 | z = dense(y, out_dimension_1) #, add_bias=False) #!!!!
222 | return z
223 |
224 | def dense(x, out_dimension=None, add_bias=True):
225 | '''Add a densely connected layer, Wx + b.
226 |
227 | Args:
228 | x: a tensor with shape [batch, time, dimension]
229 | out_dimension: a number which is the output dimension
230 |
231 | Return:
232 | a tensor with shape [batch, time, out_dimension]
233 |
234 | Raises:
235 | '''
236 | if out_dimension is None:
237 | out_dimension = x.shape[-1]
238 |
239 | W = tf.get_variable(
240 | name='weights',
241 | shape=[x.shape[-1], out_dimension],
242 | dtype=tf.float32,
243 | initializer=tf.orthogonal_initializer())
244 | if add_bias:
245 | bias = tf.get_variable(
246 | name='bias',
247 | shape=[1],
248 | dtype=tf.float32,
249 | initializer=tf.zeros_initializer())
250 | return tf.einsum('bik,kj->bij', x, W) + bias
251 | else:
252 | return tf.einsum('bik,kj->bij', x, W)
253 |
--------------------------------------------------------------------------------
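attention() above is masked scaled dot-product attention: dot_sim() scales the Q-K dot products, mask() zeroes out padded query/key positions before the softmax, and weighted_sum() mixes V. A NumPy sketch of the same computation, with illustrative shapes and random inputs:

# Hedged NumPy sketch of attention()/dot_sim()/mask()/weighted_sum().
import numpy as np

def np_attention(Q, K, V, Q_lengths, K_lengths, mask_value=-2**32 + 1):
    q_time, k_time, dim = Q.shape[1], K.shape[1], Q.shape[2]
    logits = np.einsum('bik,bjk->bij', Q, K) / max(1.0, np.sqrt(dim))      # dot_sim with scaling
    row = (np.arange(q_time)[None, :] < Q_lengths[:, None]).astype(float)
    col = (np.arange(k_time)[None, :] < K_lengths[:, None]).astype(float)
    m = np.einsum('bi,bj->bij', row, col)                                  # mask()
    logits = m * logits + (1 - m) * mask_value
    weights = np.exp(logits - logits.max(-1, keepdims=True))
    weights /= weights.sum(-1, keepdims=True)                              # softmax over key positions
    return np.einsum('bij,bjk->bik', weights, V)                           # weighted_sum()

Q = np.random.rand(2, 4, 8)
K = V = np.random.rand(2, 6, 8)
print(np_attention(Q, K, V, np.array([4, 2]), np.array([6, 3])).shape)     # (2, 4, 8)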
/Non-Pretraining-Based/U2P-X/transformer_block.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | # main function
4 | def block(
5 | Q, K, V,
6 | Q_lengths, K_lengths,
7 | attention_type='dot',
8 | is_layer_norm=True,
9 | is_mask=True, mask_value=-2**32+1,
10 | drop_prob=None):
11 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf.
12 | Args:
13 | Q: a tensor with shape [batch, Q_time, Q_dimension]
14 | K: a tensor with shape [batch, time, K_dimension]
15 | V: a tensor with shape [batch, time, V_dimension]
16 |
17 | Q_length: a tensor with shape [batch]
18 | K_length: a tensor with shape [batch]
19 |
20 | Returns:
21 | a tensor with shape [batch, time, dimension]
22 | '''
23 | att = attention(Q, K, V,
24 | Q_lengths, K_lengths,
25 | attention_type=attention_type,
26 | is_mask=is_mask, mask_value=mask_value,
27 | drop_prob=drop_prob)
28 | if is_layer_norm:
29 | with tf.variable_scope('attention_layer_norm'):
30 | y = layer_norm_debug(Q + att)
31 | else:
32 | y = Q + att
33 |
34 | z = FFN(y)
35 | if is_layer_norm:
36 | with tf.variable_scope('FFN_layer_norm'):
37 | w = layer_norm_debug(y + z)
38 | else:
39 | w = y + z
40 | return w
41 |
42 | def attention(
43 | Q, K, V,
44 | Q_lengths, K_lengths,
45 | attention_type='dot',
46 | is_mask=True, mask_value=-2**32+1,
47 | drop_prob=None):
48 | '''Add attention layer.
49 | Args:
50 | Q: a tensor with shape [batch, Q_time, Q_dimension]
51 | K: a tensor with shape [batch, time, K_dimension]
52 | V: a tensor with shape [batch, time, V_dimension]
53 |
54 | Q_length: a tensor with shape [batch]
55 | K_length: a tensor with shape [batch]
56 |
57 | Returns:
58 | a tensor with shape [batch, Q_time, V_dimension]
59 |
60 | Raises:
61 | AssertionError: if
62 | Q_dimension not equal to K_dimension when attention type is dot.
63 | '''
64 | assert attention_type in ('dot', 'bilinear')
65 | if attention_type == 'dot':
66 | assert Q.shape[-1] == K.shape[-1]
67 |
68 | Q_time = Q.shape[1]
69 | K_time = K.shape[1]
70 |
71 | if attention_type == 'dot':
72 | logits = dot_sim(Q, K) #[batch, Q_time, time]
73 | if attention_type == 'bilinear':
74 | logits = bilinear_sim(Q, K)
75 |
76 | if is_mask:
77 | _mask = mask(Q_lengths, K_lengths, Q_time, K_time) #[batch, Q_time, K_time]
78 | logits = _mask * logits + (1 - _mask) * mask_value
79 |
80 | attention = tf.nn.softmax(logits)
81 |
82 | if drop_prob is not None:
83 | print('use attention drop')
84 | attention = tf.nn.dropout(attention, drop_prob)
85 |
86 | return weighted_sum(attention, V)
87 |
88 | def dot_sim(x, y, is_nor=True):
89 | '''calculate dot similarity between two tensors.
90 |
91 | Args:
92 | x: a tensor with shape [batch, time_x, dimension]
93 | y: a tensor with shape [batch, time_y, dimension]
94 |
95 | Returns:
96 | a tensor with shape [batch, time_x, time_y]
97 | Raises:
98 | AssertionError: if
99 | the shapes of x and y do not match.
100 | '''
101 | assert x.shape[-1] == y.shape[-1]
102 |
103 | sim = tf.einsum('bik,bjk->bij', x, y)
104 |
105 | if is_nor:
106 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32))
107 | scale = tf.maximum(1.0, scale)
108 | return sim / scale
109 | else:
110 | return sim
111 |
112 | def bilinear_sim(x, y, is_nor=True):
113 | '''calculate bilinear similarity between two tensors.
114 | Args:
115 | x: a tensor with shape [batch, time_x, dimension_x]
116 | y: a tensor with shape [batch, time_y, dimension_y]
117 |
118 | Returns:
119 | a tensor with shape [batch, time_x, time_y]
120 | Raises:
121 | ValueError: if
122 | the shapes of x and y do not match;
123 | bilinear matrix reuse error.
124 | '''
125 | M = tf.get_variable(
126 | name="bilinear_matrix",
127 | shape=[x.shape[-1], y.shape[-1]],
128 | dtype=tf.float32,
129 | initializer=tf.orthogonal_initializer())
130 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y)
131 |
132 | if is_nor:
133 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32))
134 | scale = tf.maximum(1.0, scale)
135 | return sim / scale
136 | else:
137 | return sim
138 |
139 | def mask(row_lengths, col_lengths, max_row_length, max_col_length):
140 | '''Return a mask tensor representing the first N positions of each row and each column.
141 |
142 | Args:
143 | row_lengths: a tensor with shape [batch]
144 | col_lengths: a tensor with shape [batch]
145 |
146 | Returns:
147 | a mask tensor with shape [batch, max_row_length, max_col_length]
148 |
149 | Raises:
150 | '''
151 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #bool, [batch, max_row_len]
152 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #bool, [batch, max_col_len]
153 |
154 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32)
155 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32)
156 |
157 | return tf.einsum('bik,bjk->bij', row_mask, col_mask)
158 |
159 | def weighted_sum(weight, values):
160 | '''Calculate the weighted sum.
161 |
162 | Args:
163 | weight: a tensor with shape [batch, time, dimension]
164 | values: a tensor with shape [batch, dimension, values_dimension]
165 |
166 | Return:
167 | a tensor with shape [batch, time, values_dimension]
168 |
169 | Raises:
170 | '''
171 | return tf.einsum('bij,bjk->bik', weight, values)
172 |
173 | def layer_norm_debug(x, axis = None, epsilon=1e-6):
174 | '''Add layer normalization.
175 |
176 | Args:
177 | x: a tensor
178 | axis: the dimensions to normalize
179 |
180 | Returns:
181 | a tensor the same shape as x.
182 |
183 | Raises:
184 | '''
185 | if axis is None:
186 | axis = [-1]
187 | shape = [x.shape[i] for i in axis]
188 |
189 | scale = tf.get_variable(
190 | name='scale',
191 | shape=shape,
192 | dtype=tf.float32,
193 | initializer=tf.ones_initializer())
194 | bias = tf.get_variable(
195 | name='bias',
196 | shape=shape,
197 | dtype=tf.float32,
198 | initializer=tf.zeros_initializer())
199 |
200 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True)
201 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True)
202 | norm = (x-mean) * tf.rsqrt(variance + epsilon)
203 | return scale * norm + bias
204 |
205 | def FFN(x, out_dimension_0=None, out_dimension_1=None):
206 | '''Add two densely connected layers, max(0, x*W0+b0)*W1+b1.
207 |
208 | Args:
209 | x: a tensor with shape [batch, time, dimension]
210 | out_dimension: a number which is the output dimension
211 |
212 | Returns:
213 | a tensor with shape [batch, time, out_dimension]
214 |
215 | Raises:
216 | '''
217 | with tf.variable_scope('FFN_1'):
218 | y = dense(x, out_dimension_0)
219 | y = tf.nn.relu(y)
220 | with tf.variable_scope('FFN_2'):
221 | z = dense(y, out_dimension_1) #, add_bias=False) #!!!!
222 | return z
223 |
224 | def dense(x, out_dimension=None, add_bias=True):
225 | '''Add a densely connected layer, Wx + b.
226 |
227 | Args:
228 | x: a tensor with shape [batch, time, dimension]
229 | out_dimension: a number which is the output dimension
230 |
231 | Return:
232 | a tensor with shape [batch, time, out_dimension]
233 |
234 | Raises:
235 | '''
236 | if out_dimension is None:
237 | out_dimension = x.shape[-1]
238 |
239 | W = tf.get_variable(
240 | name='weights',
241 | shape=[x.shape[-1], out_dimension],
242 | dtype=tf.float32,
243 | initializer=tf.orthogonal_initializer())
244 | if add_bias:
245 | bias = tf.get_variable(
246 | name='bias',
247 | shape=[1],
248 | dtype=tf.float32,
249 | initializer=tf.zeros_initializer())
250 | return tf.einsum('bik,kj->bij', x, W) + bias
251 | else:
252 | return tf.einsum('bik,kj->bij', x, W)
253 |
--------------------------------------------------------------------------------
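block() composes the attention output with a residual connection, layer normalization, and a two-layer ReLU feed-forward network, then repeats the residual + normalization around the FFN. A NumPy sketch of that composition, shown with layer norm stripped of its learnable scale/bias for brevity:

# Hedged NumPy sketch of block()'s residual + layer-norm + FFN structure.
import numpy as np

def layer_norm(x, eps=1e-6):
    mean = x.mean(-1, keepdims=True)
    var = ((x - mean) ** 2).mean(-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)              # scale=1, bias=0 here for brevity

def ffn(x, W0, b0, W1, b1):
    return np.maximum(0.0, x @ W0 + b0) @ W1 + b1       # max(0, x*W0 + b0)*W1 + b1

def block_like(Q, att, W0, b0, W1, b1):
    y = layer_norm(Q + att)                             # attention sub-layer + residual + norm
    return layer_norm(y + ffn(y, W0, b0, W1, b1))       # FFN sub-layer + residual + norm

d = 8
Q, att = np.random.rand(2, 4, d), np.random.rand(2, 4, d)
W0, W1 = np.random.rand(d, d), np.random.rand(d, d)
print(block_like(Q, att, W0, np.zeros(d), W1, np.zeros(d)).shape)   # (2, 4, 8)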
/Non-Pretraining-Based/C2P-X/model_Transformer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import transformer_block
4 |
5 | FLAGS = tf.flags.FLAGS
6 |
7 | def get_embeddings(vocab):
8 | print("get_embedding")
9 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim)
10 | return tf.constant(initializer, name="word_embedding")
11 |
12 | def get_char_embedding(charVocab):
13 | print("get_char_embedding")
14 | char_size = len(charVocab)
15 | embeddings = np.zeros((char_size, char_size), dtype='float32')
16 | for i in range(1, char_size):
17 | embeddings[i, i] = 1.0
18 |
19 | return tf.constant(embeddings, name="word_char_embedding")
20 |
21 | def load_embed_vectors(fname, dim):
22 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... }
23 | vectors = {}
24 | for line in open(fname, 'rt'):
25 | items = line.strip().split(' ')
26 | if len(items[0]) <= 0:
27 | continue
28 | vec = [float(items[i]) for i in range(1, dim+1)]
29 | vectors[items[0]] = vec
30 |
31 | return vectors
32 |
33 | def load_word_embeddings(vocab, dim):
34 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim)
35 | vocab_size = len(vocab)
36 | embeddings = np.zeros((vocab_size, dim), dtype='float32')
37 | for word, code in vocab.items():
38 | if word in vectors:
39 | embeddings[code] = vectors[word]
40 | #else:
41 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim)
42 |
43 | return embeddings
44 |
45 | def cnn_layer(inputs, filter_sizes, num_filters, scope=None, scope_reuse=False):
46 | with tf.variable_scope(scope, reuse=scope_reuse):
47 | input_size = inputs.get_shape()[2].value
48 |
49 | outputs = []
50 | for i, filter_size in enumerate(filter_sizes):
51 | with tf.variable_scope("conv_{}".format(i)):
52 | w = tf.get_variable("w", [filter_size, input_size, num_filters])
53 | b = tf.get_variable("b", [num_filters])
54 | conv = tf.nn.conv1d(inputs, w, stride=1, padding="VALID") # [num_words, num_chars - filter_size, num_filters]
55 | h = tf.nn.relu(tf.nn.bias_add(conv, b)) # [num_words, num_chars - filter_size, num_filters]
56 | pooled = tf.reduce_max(h, 1) # [num_words, num_filters]
57 | outputs.append(pooled)
58 | return tf.concat(outputs, 1) # [num_words, num_filters * len(filter_sizes)]
59 |
60 |
61 | class Transformer(object):
62 | def __init__(
63 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0):
64 |
65 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context")
66 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len")
67 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona")
68 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len")
69 |
70 | self.target = tf.placeholder(tf.float32, [None], name="target")
71 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
72 |
73 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char")
74 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len")
75 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char")
76 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len")
77 |
78 | l2_loss = tf.constant(1.0)
79 |
80 | # =============================== Embedding layer ===============================
81 | # 1. word embedding
82 | with tf.name_scope("embedding"):
83 | W = get_embeddings(vocab)
84 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim]
85 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim]
86 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob)
87 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob)
88 | print("context_embedded: {}".format(context_embedded.get_shape()))
89 | print("persona_embedded: {}".format(persona_embedded.get_shape()))
90 |
91 |
92 | # =============================== Encoding layer ===============================
93 | emb_dim = context_embedded.get_shape()[-1].value
94 |
95 | # with tf.variable_scope("encoding_layer") as vs:
96 | # # CNN encoder
97 | # final_context = cnn_layer(context_embedded, filter_sizes=[3, 4, 5], num_filters=100, scope="CNN_emb", scope_reuse=False) # [batch_size*max_utter_num, emb]
98 | # final_persona = cnn_layer(persona_embedded, filter_sizes=[3, 4, 5], num_filters=100, scope="CNN_emb", scope_reuse=True) # [batch_size*max_profile_num, emb]
99 | # print("establish CNN encoder")
100 |
101 | context_input = context_embedded
102 | for layer in range(num_layer):
103 | with tf.variable_scope("encoding_layer_{}".format(layer)):
104 | context_output = transformer_block.block(context_input, context_input, context_input, self.context_len, self.context_len)
105 | context_input = context_output
106 |
107 | persona_input = persona_embedded
108 | for layer in range(num_layer):
109 | with tf.variable_scope("encoding_layer_{}".format(layer), reuse=True): # [batch_size, max_context_len, word_dim]
110 | persona_output = transformer_block.block(persona_input, persona_input, persona_input, self.persona_len, self.persona_len)
111 | persona_input = persona_output
112 | print("context_output: {}".format(context_output.get_shape())) # [batch_size, max_persona_len, word_dim]
113 | print("persona_output: {}".format(persona_output.get_shape()))
114 | print("establish {}-layer Transformer encoder".format(num_layer))
115 |
116 |
117 | # =============================== Matching layer ===============================
118 | with tf.variable_scope("matching_layer") as vs:
119 | mask_c = tf.sequence_mask(self.context_len, max_context_len, dtype=tf.float32) # [batch_size, max_context_len]
120 | context_output = context_output * tf.expand_dims(mask_c, 2) # [batch_size, max_context_len, dim]
121 | final_context = tf.reduce_sum(context_output, axis=1) # [batch_size, dim]
122 |
123 | mask_p = tf.sequence_mask(self.persona_len, max_persona_len, dtype=tf.float32) # [batch_size, max_persona_len]
124 | persona_output = persona_output * tf.expand_dims(mask_p, 2) # [batch_size, max_persona_len, dim]
125 | final_persona = tf.reduce_sum(persona_output, axis=1) # [batch_size, dim]
126 |
127 | output_dim = final_context.get_shape()[-1].value
128 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32)
129 |
130 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim]
131 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ]
132 | print("shape of similarity: {}".format(similarity.get_shape()))
133 |
134 |
135 | # =============================== Prediction layer ===============================
136 | with tf.variable_scope("prediction_layer") as vs:
137 | logits = similarity
138 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ]
139 |
140 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target)
141 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum(
142 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
143 |
144 | with tf.name_scope("accuracy"):
145 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ]
146 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
147 |
--------------------------------------------------------------------------------
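In the C2P Transformer above, the persona encoder reuses the context encoder's weights, both towers are sum-pooled over their valid positions, and a single bilinear score feeds a sigmoid. A NumPy sketch of the matching and prediction algebra, with random values purely for illustration:

# Hedged NumPy sketch of the C2P matching and prediction layers.
import numpy as np

batch, c_len, p_len, dim = 2, 6, 4, 8
context_output = np.random.rand(batch, c_len, dim)      # encoder output for the context
persona_output = np.random.rand(batch, p_len, dim)      # encoder output for the persona
context_len = np.array([6, 3])
persona_len = np.array([4, 2])
A_matrix = np.random.rand(dim, dim)                     # stands in for the learned A_matrix_v

mask_c = (np.arange(c_len)[None, :] < context_len[:, None]).astype(float)
mask_p = (np.arange(p_len)[None, :] < persona_len[:, None]).astype(float)
final_context = (context_output * mask_c[:, :, None]).sum(axis=1)   # [batch, dim]
final_persona = (persona_output * mask_p[:, :, None]).sum(axis=1)   # [batch, dim]

logits = np.einsum('bd,de,be->b', final_context, A_matrix, final_persona)  # bilinear c^T A p
probs = 1.0 / (1.0 + np.exp(-logits))                   # sigmoid, as in the prediction layer
print(probs)                                            # one matching probability per example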
/Non-Pretraining-Based/C2P-X/model_BiLSTM.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 | FLAGS = tf.flags.FLAGS
5 |
6 | def get_embeddings(vocab):
7 | print("get_embedding")
8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim)
9 | return tf.constant(initializer, name="word_embedding")
10 |
11 | def get_char_embedding(charVocab):
12 | print("get_char_embedding")
13 | char_size = len(charVocab)
14 | embeddings = np.zeros((char_size, char_size), dtype='float32')
15 | for i in range(1, char_size):
16 | embeddings[i, i] = 1.0
17 |
18 | return tf.constant(embeddings, name="word_char_embedding")
19 |
20 | def load_embed_vectors(fname, dim):
21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... }
22 | vectors = {}
23 | for line in open(fname, 'rt'):
24 | items = line.strip().split(' ')
25 | if len(items[0]) <= 0:
26 | continue
27 | vec = [float(items[i]) for i in range(1, dim+1)]
28 | vectors[items[0]] = vec
29 |
30 | return vectors
31 |
32 | def load_word_embeddings(vocab, dim):
33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim)
34 | vocab_size = len(vocab)
35 | embeddings = np.zeros((vocab_size, dim), dtype='float32')
36 | for word, code in vocab.items():
37 | if word in vectors:
38 | embeddings[code] = vectors[word]
39 | #else:
40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim)
41 |
42 | return embeddings
43 |
44 |
45 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False):
46 | with tf.variable_scope(scope, reuse=scope_reuse) as vs:
47 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse)
48 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob)
49 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse)
50 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob)
51 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell,
52 | inputs=inputs,
53 | sequence_length=input_seq_len,
54 | dtype=tf.float32)
55 | return rnn_outputs, rnn_states
56 |
57 | def multi_lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, num_layer, scope, scope_reuse=False):
58 | with tf.variable_scope(scope, reuse=scope_reuse) as vs:
59 | multi_outputs = []
60 | multi_states = []
61 | cur_inputs = inputs
62 | for i_layer in range(num_layer):
63 | rnn_outputs, rnn_states = lstm_layer(cur_inputs, input_seq_len, rnn_size, dropout_keep_prob, scope+str(i_layer), scope_reuse)
64 | rnn_outputs = tf.concat(values=rnn_outputs, axis=2)
65 | multi_outputs.append(rnn_outputs)
66 | multi_states.append(rnn_states)
67 | cur_inputs = rnn_outputs
68 |
69 | # multi_layer_aggregation
70 | ml_weights = tf.nn.softmax(tf.get_variable("ml_scores", [num_layer, ], initializer=tf.constant_initializer(0.0)))
71 |
72 | multi_outputs = tf.stack(multi_outputs, axis=-1) # [batch_size, max_len, 2*rnn_size(400), num_layer]
73 | max_len = multi_outputs.get_shape()[1].value
74 | dim = multi_outputs.get_shape()[2].value
75 | flattened_multi_outputs = tf.reshape(multi_outputs, [-1, num_layer]) # [batch_size * max_len * 2*rnn_size(400), num_layer]
76 | aggregated_ml_outputs = tf.matmul(flattened_multi_outputs, tf.expand_dims(ml_weights, 1)) # [batch_size * max_len * 2*rnn_size(400), 1]
77 | aggregated_ml_outputs = tf.reshape(aggregated_ml_outputs, [-1, max_len, dim]) # [batch_size , max_len , 2*rnn_size(400)]
78 |
79 | return aggregated_ml_outputs
80 |
81 |
82 | class BiLSTM(object):
83 | def __init__(
84 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0):
85 |
86 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context")
87 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len")
88 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona")
89 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len")
90 |
91 | self.target = tf.placeholder(tf.float32, [None], name="target")
92 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
93 |
94 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char")
95 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len")
96 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char")
97 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len")
98 |
99 | l2_loss = tf.constant(1.0)
100 |
101 | # =============================== Embedding layer ===============================
102 | with tf.name_scope("embedding"):
103 | W = get_embeddings(vocab)
104 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim]
105 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim]
106 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob)
107 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob)
108 | print("context_embedded: {}".format(context_embedded.get_shape()))
109 | print("persona_embedded: {}".format(persona_embedded.get_shape()))
110 |
111 |
112 | # =============================== Encoding layer ===============================
113 | with tf.variable_scope("encoding_layer") as vs:
114 | rnn_scope_name = "bidirectional_rnn"
115 | # 1. single_lstm_layer
116 | c_rnn_output, c_rnn_states = lstm_layer(context_embedded, self.context_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False)
117 | context_output = tf.concat(axis=2, values=c_rnn_output) # [batch_size, max_context_len, rnn_size*2]
118 | p_rnn_output, p_rnn_states = lstm_layer(persona_embedded, self.persona_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) # [batch_size, max_persona_len, rnn_size] per direction
119 | persona_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size, max_persona_len, rnn_size*2]
120 | # 2. multi_lstm_layer
121 | # utterances_output = multi_lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=False)
122 | # response_output = multi_lstm_layer(flattened_responses_embedded, flattened_responses_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=True)
123 | # print("establish AHRE layers : {}".format(num_layer))
124 | print("establish BiLSTM encoder")
125 |
126 |
127 | # =============================== Matching layer ===============================
128 | with tf.variable_scope("matching_layer") as vs:
129 | final_context = tf.concat(axis=1, values=[c_rnn_states[0].h, c_rnn_states[1].h]) # [batch_size, rnn_size*2]
130 | final_persona = tf.concat(axis=1, values=[p_rnn_states[0].h, p_rnn_states[1].h]) # [batch_size, rnn_size*2]
131 |
132 | output_dim = final_context.get_shape()[-1].value
133 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32)
134 |
135 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim]
136 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ]
137 | print("shape of similarity: {}".format(similarity.get_shape()))
138 |
139 |
140 | # =============================== Prediction layer ===============================
141 | with tf.variable_scope("prediction_layer") as vs:
142 | logits = similarity
143 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ]
144 |
145 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target)
146 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum(
147 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
148 |
149 | with tf.name_scope("accuracy"):
150 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ]
151 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
152 |
--------------------------------------------------------------------------------
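
A minimal NumPy sketch of the bilinear matching computed in the matching and prediction layers above (score = context^T A persona, followed by a sigmoid); the batch size, dimension, and random arrays are illustrative assumptions, not values from the repository.

# Illustrative sketch of the C2P bilinear matching score (not part of the repository code).
import numpy as np

batch_size, dim = 4, 400                                    # dim corresponds to 2 * rnn_size above
final_context = np.random.randn(batch_size, dim).astype(np.float32)
final_persona = np.random.randn(batch_size, dim).astype(np.float32)
A_matrix = np.random.randn(dim, dim).astype(np.float32)     # stands in for the learned A_matrix

logits = np.sum((final_context @ A_matrix) * final_persona, axis=-1)   # [batch_size]
probs = 1.0 / (1.0 + np.exp(-logits))                                  # sigmoid, as in the prediction layer
print(probs.shape)                                                     # (4,)
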
/Non-Pretraining-Based/U2P-X/model_Transformer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import transformer_block
4 |
5 | FLAGS = tf.flags.FLAGS
6 |
7 | def get_embeddings(vocab):
8 | print("get_embedding")
9 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim)
10 | return tf.constant(initializer, name="word_embedding")
11 |
12 | def get_char_embedding(charVocab):
13 | print("get_char_embedding")
14 | char_size = len(charVocab)
15 | embeddings = np.zeros((char_size, char_size), dtype='float32')
16 | for i in range(1, char_size):
17 | embeddings[i, i] = 1.0
18 |
19 | return tf.constant(embeddings, name="word_char_embedding")
20 |
21 | def load_embed_vectors(fname, dim):
22 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... }
23 | vectors = {}
24 | for line in open(fname, 'rt'):
25 | items = line.strip().split(' ')
26 | if len(items[0]) <= 0:
27 | continue
28 | vec = [float(items[i]) for i in range(1, dim+1)]
29 | vectors[items[0]] = vec
30 |
31 | return vectors
32 |
33 | def load_word_embeddings(vocab, dim):
34 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim)
35 | vocab_size = len(vocab)
36 | embeddings = np.zeros((vocab_size, dim), dtype='float32')
37 | for word, code in vocab.items():
38 | if word in vectors:
39 | embeddings[code] = vectors[word]
40 | #else:
41 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim)
42 |
43 | return embeddings
44 |
45 |
46 | class Transformer(object):
47 | def __init__(
48 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0):
49 |
50 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances")
51 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len")
52 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num")
53 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles")
54 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len")
55 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num")
56 |
57 | self.target = tf.placeholder(tf.float32, [None], name="target")
58 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
59 |
60 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char")
61 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len")
62 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char")
63 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len")
64 |
65 | l2_loss = tf.constant(1.0)
66 |
67 |
68 | # =============================== Embedding layer ===============================
69 | with tf.name_scope("embedding"):
70 | W = get_embeddings(vocab)
71 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim]
72 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim]
73 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob)
74 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob)
75 | print("utterances_embedded: {}".format(utterances_embedded.get_shape()))
76 | print("profiles_embedded: {}".format(profiles_embedded.get_shape()))
77 |
78 |
79 | # =============================== Encoding layer ===============================
80 | with tf.variable_scope("encoding_layer") as vs:
81 | rnn_scope_name = "bidirectional_rnn"
82 | emb_dim = utterances_embedded.get_shape()[-1].value
83 | flattened_utterances_embedded = tf.reshape(utterances_embedded, [-1, max_utter_len, emb_dim]) # [batch_size*max_utter_num, max_utter_len, emb]
84 | flattened_utterances_len = tf.reshape(self.utterances_len, [-1]) # [batch_size*max_utter_num, ]
85 | flattened_profiles_embedded = tf.reshape(profiles_embedded, [-1, max_profile_len, emb_dim]) # [batch_size*max_profile_num, max_profile_len, emb]
86 | flattened_profiles_len = tf.reshape(self.profiles_len, [-1]) # [batch_size*max_profile_num, ]
87 |
88 | utterances_input = flattened_utterances_embedded
89 | profiles_input = flattened_profiles_embedded
90 | for layer in range(num_layer):
91 | with tf.variable_scope("encoding_layer_{}".format(layer)):
92 | utterances_output = transformer_block.block(utterances_input, utterances_input, utterances_input,
93 | flattened_utterances_len, flattened_utterances_len)
94 | utterances_input = utterances_output
95 |
96 | for layer in range(num_layer):
97 | with tf.variable_scope("encoding_layer_{}".format(layer), reuse=True):
98 | profiles_output = transformer_block.block(profiles_input, profiles_input, profiles_input,
99 | flattened_profiles_len, flattened_profiles_len)
100 | profiles_input = profiles_output
101 | print("establish Transformer encoder")
102 | print("utterances_output: {}".format(utterances_output.get_shape()))
103 | print("profiles_output: {}".format(profiles_output.get_shape()))
104 |
105 |
106 | # =============================== Matching layer ===============================
107 | with tf.variable_scope("matching_layer") as vs:
108 | mask_u = tf.sequence_mask(flattened_utterances_len, max_utter_len, dtype=tf.float32) # [batch_size*max_utter_num, max_utter_len]
109 | utterances_output = utterances_output * tf.expand_dims(mask_u, 2) # [batch_size*max_utter_num, max_utter_len, dim]
110 | final_utterances = tf.reduce_sum(utterances_output, axis=1) # [batch_size*max_utter_num, dim]
111 | # final_utterances = tf.div(final_utterances, tf.expand_dims(tf.sqrt(tf.cast(flattened_utterances_len, tf.float32)), 1))
112 | concat_dim = final_utterances.get_shape()[-1].value
113 | final_utterances = tf.reshape(final_utterances, [-1, max_utter_num, concat_dim]) # [batch_size, max_utter_num, dim]
114 |
115 | mask_p = tf.sequence_mask(flattened_profiles_len, max_profile_len, dtype=tf.float32) # [batch_size*max_profile_num, max_profile_len]
116 | profiles_output = profiles_output * tf.expand_dims(mask_p, 2)
117 | final_profiles = tf.reduce_sum(profiles_output, axis=1)
118 | # final_profiles = tf.div(final_profiles, tf.expand_dims(tf.sqrt(tf.cast(flattened_profiles_len, tf.float32)), 1))
119 | final_profiles = tf.reshape(final_profiles, [-1, max_profile_num, concat_dim]) # [batch_size, max_profile_num, dim]
120 |
121 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32)
122 | similarity = tf.einsum('aij,jk->aik',
123 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim]
124 | similarity = tf.matmul(similarity,
125 | tf.transpose(final_profiles, perm=[0, 2, 1])) # [batch_size, max_utter_num, max_profile_num]
126 |
127 | print("shape of similarity: {}".format(similarity.get_shape()))
128 |
129 |
130 | # =============================== Aggregation layer ===============================
131 | with tf.variable_scope("aggregation_layer") as vs:
132 | logits = tf.reduce_max(similarity, axis=2) # [batch_size, max_utter_num]
133 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num]
134 | logits = logits * mask_u
135 | logits = tf.reduce_sum(logits, axis=1) # [batch_size, ]
136 | print("establish reduce_max across profiles and masked_reduce_sum across utterances")
137 | print("logits: {}".format(logits.get_shape()))
138 |
139 |
140 | # =============================== Prediction layer ===============================
141 | with tf.variable_scope("prediction_layer") as vs:
142 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ]
143 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target)
144 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum(
145 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
146 |
147 | with tf.name_scope("accuracy"):
148 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ]
149 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
150 |
--------------------------------------------------------------------------------
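
As a complement to the aggregation layer above, here is a minimal NumPy sketch of the U2P aggregation: take the maximum similarity over profile sentences for each utterance, then a masked sum over utterances. Shapes and values are illustrative assumptions, not taken from the repository.

# Illustrative sketch of the U2P aggregation (not part of the repository code).
import numpy as np

batch_size, max_utter_num, max_profile_num = 2, 8, 5
similarity = np.random.randn(batch_size, max_utter_num, max_profile_num)
utterances_num = np.array([8, 3])                           # actual number of utterances per example

logits = similarity.max(axis=2)                             # [batch_size, max_utter_num]
mask_u = (np.arange(max_utter_num)[None, :] < utterances_num[:, None]).astype(np.float32)
logits = (logits * mask_u).sum(axis=1)                      # [batch_size]
print(logits.shape)                                         # (2,)
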
/Non-Pretraining-Based/U2P-X/data_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 |
4 |
5 | def load_vocab(fname):
6 | '''
7 | vocab = {"I": 0, ...}
8 | '''
9 | vocab={}
10 | with open(fname, 'rt') as f:
11 | for i,line in enumerate(f):
12 | word = line.strip()
13 | vocab[word] = i
14 | return vocab
15 |
16 | def load_char_vocab(fname):
17 | '''
18 | charVocab = {"U": 0, "!": 1, ...}
19 | '''
20 | charVocab={}
21 | with open(fname, 'rt') as f:
22 | for line in f:
23 | fields = line.strip().split('\t')
24 | char_id = int(fields[0])
25 | ch = fields[1]
26 | charVocab[ch] = char_id
27 | return charVocab
28 |
29 | def to_vec(tokens, vocab, maxlen):
30 | '''
31 | length: number of tokens in the input sequence
32 | vec: tokens mapped to their vocab ids, returned as a variable-length array, e.g. [3, 6, 4, 3, ...]
33 | '''
34 | n = len(tokens)
35 | length = 0
36 | vec=[]
37 | for i in range(n):
38 | length += 1
39 | if tokens[i] in vocab:
40 | vec.append(vocab[tokens[i]])
41 | else:
42 | # vec.append(vocab["fiance"]) # fix to fiance
43 | vec.append(vocab["_unk_"])
44 | return length, np.array(vec)
45 |
46 | def load_dataset(fname, vocab, max_utter_num, max_utter_len, max_profile_num, max_profile_len):
47 |
48 | dataset=[]
49 | with open(fname, 'rt') as f:
50 | for line in f:
51 | # each line: ( id, context utterances, matched persona candidates, mismatched persona candidates )
52 | line = line.strip()
53 | fields = line.split('\t')
54 |
55 | # id
56 | us_id = fields[0]
57 |
58 | # context utterances
59 | context = fields[1]
60 | utterances = context.split(' _eos_ ')
61 | utterances = [utterance + " _eos_" for utterance in utterances]
62 | utterances = utterances[-max_utter_num:] # select the last max_utter_num utterances
63 |
64 | us_tokens = []
65 | us_vec = []
66 | us_len = []
67 | for utterance in utterances:
68 | u_tokens = utterance.split(' ')[:max_utter_len] # select the head max_utter_len tokens in every utterance
69 | u_len, u_vec = to_vec(u_tokens, vocab, max_utter_len)
70 | us_tokens.append(u_tokens)
71 | us_vec.append(u_vec)
72 | us_len.append(u_len)
73 | us_num = len(utterances)
74 |
75 | # persona candidates
76 | if fields[2] != "NA":
77 | personas = fields[2].split("|")
78 | for index, persona in enumerate(personas):
79 | # ps_id = "match_" + str(index)
80 | ps_id = "1." + str(index)
81 | profiles = persona.split(' _eos_ ')
82 | profiles = [profile + " _eos_" for profile in profiles]
83 | profiles = profiles[-max_profile_num:] # select the last max_profile_num profiles
84 | ps_tokens = []
85 | ps_vec = []
86 | ps_len = []
87 | for profile in profiles:
88 | p_tokens = profile.split(' ')[:max_profile_len] # select the head max_profile_len tokens in every persona
89 | p_len, p_vec = to_vec(p_tokens, vocab, max_profile_len)
90 | ps_tokens.append(p_tokens)
91 | ps_vec.append(p_vec)
92 | ps_len.append(p_len)
93 | ps_num = len(profiles)
94 | dataset.append((us_id, us_tokens, us_vec, us_len, us_num, 1.0, ps_id, ps_tokens, ps_vec, ps_len, ps_num))
95 |
96 | if fields[3] != "NA":
97 | personas = fields[3].split("|")
98 | for index, persona in enumerate(personas):
99 | # ps_id = "mismatch_" + str(index)
100 | ps_id = "0." + str(index)
101 | profiles = persona.split(' _eos_ ')
102 | profiles = [profile + " _eos_" for profile in profiles]
103 | profiles = profiles[-max_profile_num:]
104 | ps_tokens = []
105 | ps_vec = []
106 | ps_len = []
107 | for profile in profiles:
108 | p_tokens = profile.split(' ')[:max_profile_len]
109 | p_len, p_vec = to_vec(p_tokens, vocab, max_profile_len)
110 | ps_tokens.append(p_tokens)
111 | ps_vec.append(p_vec)
112 | ps_len.append(p_len)
113 | ps_num = len(profiles)
114 | dataset.append((us_id, us_tokens, us_vec, us_len, us_num, 0.0, ps_id, ps_tokens, ps_vec, ps_len, ps_num))
115 |
116 | return dataset
117 |
118 |
119 | def normalize_vec(vec, maxlen):
120 | '''
121 | pad the original vec to the same maxlen
122 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0]
123 | '''
124 | if len(vec) == maxlen:
125 | return vec
126 |
127 | new_vec = np.zeros(maxlen, dtype='int32')
128 | for i in range(len(vec)):
129 | new_vec[i] = vec[i]
130 | return new_vec
131 |
132 |
133 | def charVec(tokens, charVocab, maxlen, maxWordLength):
134 | '''
135 | chars: int array of shape (maxlen, maxWordLength); an entry is 0 if the character is not in charVocab or the token is absent
136 | word_lengths: int array of shape (maxlen,); an entry is 1 if the token is absent
137 | '''
138 | n = len(tokens)
139 | if n > maxlen:
140 | n = maxlen
141 |
142 | chars = np.zeros((maxlen, maxWordLength), dtype=np.int32)
143 | word_lengths = np.ones(maxlen, dtype=np.int32)
144 | for i in range(n):
145 | token = tokens[i][:maxWordLength]
146 | word_lengths[i] = len(token)
147 | row = chars[i]
148 | for idx, ch in enumerate(token):
149 | if ch in charVocab:
150 | row[idx] = charVocab[ch]
151 |
152 | return chars, word_lengths
153 |
154 |
155 | def batch_iter(data, batch_size, num_epochs, max_utter_num, max_utter_len, max_profile_num, max_profile_len,
156 | charVocab, max_word_length, shuffle=True):
157 | """
158 | Generates a batch iterator for a dataset.
159 | """
160 | data_size = len(data)
161 | num_batches_per_epoch = int((len(data) - 1)/batch_size) + 1  # avoid an empty trailing batch when data_size is a multiple of batch_size
162 | for epoch in range(num_epochs):
163 | # Shuffle the data at each epoch
164 | if shuffle:
165 | random.shuffle(data)
166 | for batch_num in range(num_batches_per_epoch):
167 | start_index = batch_num * batch_size
168 | end_index = min((batch_num + 1) * batch_size, data_size)
169 |
170 | x_utterances = []
171 | x_utterances_len = []
172 | x_utterances_num = []
173 | x_profiles = []
174 | x_profiles_len = []
175 | x_profiles_num = []
176 |
177 | x_labels = []
178 | x_id_pairs = []
179 |
180 | x_utterances_char = []
181 | x_utterances_char_len = []
182 | x_profiles_char = []
183 | x_profiles_char_len = []
184 |
185 | for rowIdx in range(start_index, end_index):
186 | us_id, us_tokens, us_vec, us_len, us_num, label, ps_id, ps_tokens, ps_vec, ps_len, ps_num = data[rowIdx]
187 |
188 | # normalize us_vec and us_len
189 | new_utters_vec = np.zeros((max_utter_num, max_utter_len), dtype='int32')
190 | new_utters_len = np.zeros((max_utter_num, ), dtype='int32')
191 | for i in range(len(us_len)):
192 | new_utter_vec = normalize_vec(us_vec[i], max_utter_len)
193 | new_utters_vec[i] = new_utter_vec
194 | new_utters_len[i] = us_len[i]
195 | x_utterances.append(new_utters_vec)
196 | x_utterances_len.append(new_utters_len)
197 | x_utterances_num.append(us_num)
198 |
199 | # normalize ps_vec and ps_len
200 | new_profiles_vec = np.zeros((max_profile_num, max_profile_len), dtype='int32')
201 | new_profiles_len = np.zeros((max_profile_num, ), dtype='int32')
202 | for i in range(len(ps_len)):
203 | new_profile_vec = normalize_vec(ps_vec[i], max_profile_len)
204 | new_profiles_vec[i] = new_profile_vec
205 | new_profiles_len[i] = ps_len[i]
206 | x_profiles.append(new_profiles_vec)
207 | x_profiles_len.append(new_profiles_len)
208 | x_profiles_num.append(ps_num)
209 |
210 | x_labels.append(label)
211 | x_id_pairs.append((us_id, ps_id, int(label)))
212 |
213 | # normalize us_CharVec and us_CharLen
214 | uttersCharVec = np.zeros((max_utter_num, max_utter_len, max_word_length), dtype='int32')
215 | uttersCharLen = np.ones((max_utter_num, max_utter_len), dtype='int32')
216 | for i in range(len(us_len)):
217 | utterCharVec, utterCharLen = charVec(us_tokens[i], charVocab, max_utter_len, max_word_length)
218 | uttersCharVec[i] = utterCharVec
219 | uttersCharLen[i] = utterCharLen
220 | x_utterances_char.append(uttersCharVec)
221 | x_utterances_char_len.append(uttersCharLen)
222 |
223 | # normalize ps_CharVec and ps_CharLen
224 | psCharVec = np.zeros((max_profile_num, max_profile_len, max_word_length), dtype='int32')
225 | psCharLen = np.ones((max_profile_num, max_profile_len), dtype='int32')
226 | for i in range(len(ps_len)):
227 | pCharVec, pCharLen = charVec(ps_tokens[i], charVocab, max_profile_len, max_word_length)
228 | psCharVec[i] = pCharVec
229 | psCharLen[i] = pCharLen
230 | x_profiles_char.append(psCharVec)
231 | x_profiles_char_len.append(psCharLen)
232 |
233 | yield np.array(x_utterances), np.array(x_utterances_len), np.array(x_utterances_num), \
234 | np.array(x_profiles), np.array(x_profiles_len), np.array(x_profiles_num), \
235 | np.array(x_labels), x_id_pairs, \
236 | np.array(x_utterances_char), np.array(x_utterances_char_len), np.array(x_profiles_char), np.array(x_profiles_char_len)
237 |
--------------------------------------------------------------------------------
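
A minimal usage sketch for the helpers above, assuming the default flag values from U2P-X/train.py; the file names "vocab.txt", "char_vocab.txt", and "train.txt" are placeholders, not files shipped with the repository.

# Illustrative usage of data_helpers (placeholder paths).
import data_helpers

vocab = data_helpers.load_vocab("vocab.txt")                  # placeholder path
charVocab = data_helpers.load_char_vocab("char_vocab.txt")    # placeholder path
dataset = data_helpers.load_dataset("train.txt", vocab,
                                    max_utter_num=8, max_utter_len=20,
                                    max_profile_num=5, max_profile_len=15)
batches = data_helpers.batch_iter(dataset, batch_size=128, num_epochs=1,
                                  max_utter_num=8, max_utter_len=20,
                                  max_profile_num=5, max_profile_len=15,
                                  charVocab=charVocab, max_word_length=18, shuffle=True)
for batch in batches:
    (x_utterances, x_utterances_len, x_utterances_num,
     x_profiles, x_profiles_len, x_profiles_num,
     x_labels, x_id_pairs,
     x_utterances_char, x_utterances_char_len,
     x_profiles_char, x_profiles_char_len) = batch
    break   # one batch is enough for the sketch
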
/Non-Pretraining-Based/C2P-X/train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import os
4 | import time
5 | import datetime
6 | import operator
7 | from collections import defaultdict
8 | import metrics
9 | import data_helpers
10 | from model_BOW import BOW as MODEL
11 | # from model_BiLSTM import BiLSTM as MODEL
12 | # from model_Transformer import Transformer as MODEL
13 | # from model_ESIM import ESIM as MODEL
14 |
15 |
16 | # Files
17 | tf.flags.DEFINE_string("train_file", "", "path to train file")
18 | tf.flags.DEFINE_string("valid_file", "", "path to valid file")
19 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file")
20 | tf.flags.DEFINE_string("char_vocab_file", "", "path to char vocab file")
21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector")
22 |
23 | # Model Hyperparameters
24 | tf.flags.DEFINE_integer("max_context_len", 150, "max context length")
25 | tf.flags.DEFINE_integer("max_persona_len", 50, "max persona length")
26 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length")
27 | tf.flags.DEFINE_integer("num_layer", 1, "num of layers in sentence encoder")
28 | tf.flags.DEFINE_integer("embedding_dim", 200, "dimensionality of word embedding")
29 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units")
30 |
31 | # Training parameters
32 | tf.flags.DEFINE_integer("batch_size", 128, "batch size (default: 128)")
33 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0)")
34 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "dropout keep probability (default: 1.0)")
35 | tf.flags.DEFINE_integer("num_epochs", 1000000, "number of training epochs (default: 1000000)")
36 | tf.flags.DEFINE_integer("evaluate_every", 1000, "evaluate model on valid dataset after this many steps (default: 1000)")
37 |
38 | # Misc Parameters
39 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
40 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
41 |
42 | FLAGS = tf.flags.FLAGS
43 | # FLAGS._parse_flags()
44 | # print("\nParameters:")
45 | # for attr, value in sorted(FLAGS.__flags.items()):
46 | # print("{}={}".format(attr.upper(), value))
47 | print("")
48 |
49 | # Load data
50 | print("Loading data...")
51 |
52 | vocab = data_helpers.load_vocab(FLAGS.vocab_file)
53 | print('vocabulary size: {}'.format(len(vocab)))
54 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file)
55 |
56 | train_dataset = data_helpers.load_dataset(FLAGS.train_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len)
57 | print('train_pairs: {}'.format(len(train_dataset)))
58 | valid_dataset = data_helpers.load_dataset(FLAGS.valid_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len)
59 | print('valid_pairs: {}'.format(len(valid_dataset)))
60 |
61 | with tf.Graph().as_default():
62 | session_conf = tf.ConfigProto(
63 | allow_soft_placement=FLAGS.allow_soft_placement,
64 | log_device_placement=FLAGS.log_device_placement)
65 | sess = tf.Session(config=session_conf)
66 | with sess.as_default():
67 | model = MODEL(
68 | max_context_len=FLAGS.max_context_len,
69 | max_persona_len=FLAGS.max_persona_len,
70 | num_layer=FLAGS.num_layer,
71 | vocab_size=len(vocab),
72 | embedding_size=FLAGS.embedding_dim,
73 | vocab=vocab,
74 | rnn_size=FLAGS.rnn_size,
75 | maxWordLength=FLAGS.max_word_length,
76 | charVocab=charVocab,
77 | l2_reg_lambda=FLAGS.l2_reg_lambda)
78 | # Define Training procedure
79 | global_step = tf.Variable(0, name="global_step", trainable=False)
80 | starter_learning_rate = 0.001
81 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
82 | 5000, 0.96, staircase=True)
83 | optimizer = tf.train.AdamOptimizer(learning_rate)
84 | grads_and_vars = optimizer.compute_gradients(model.mean_loss)
85 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
86 |
87 | # Keep track of gradient values and sparsity (optional)
88 | """
89 | grad_summaries = []
90 | for g, v in grads_and_vars:
91 | if g is not None:
92 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
93 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
94 | grad_summaries.append(grad_hist_summary)
95 | grad_summaries.append(sparsity_summary)
96 | grad_summaries_merged = tf.merge_summary(grad_summaries)
97 | """
98 |
99 | # Output directory for models and summaries
100 | timestamp = str(int(time.time()))
101 | # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
102 | out_dir = os.path.abspath(os.path.join("../output", timestamp))
103 | print("Writing to {}\n".format(out_dir))
104 |
105 | # Summaries for loss and accuracy
106 | """
107 | loss_summary = tf.scalar_summary("loss", model.mean_loss)
108 | acc_summary = tf.scalar_summary("accuracy", model.accuracy)
109 |
110 | # Train Summaries
111 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
112 | train_summary_dir = os.path.join(out_dir, "summaries", "train")
113 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
114 |
115 | # Dev summaries
116 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
117 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
118 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def)
119 | """
120 |
121 | # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it.
122 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
123 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
124 | if not os.path.exists(checkpoint_dir):
125 | os.makedirs(checkpoint_dir)
126 | saver = tf.train.Saver(tf.global_variables())
127 |
128 | # Initialize all variables
129 | sess.run(tf.global_variables_initializer())
130 |
131 | def train_step(x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs,
132 | x_context_char, x_context_char_len, x_persona_char, x_persona_char_len):
133 | """
134 | A single training step
135 | """
136 | feed_dict = {
137 | model.context: x_context,
138 | model.context_len: x_context_len,
139 | model.persona: x_persona,
140 | model.persona_len: x_persona_len,
141 | model.target: x_labels,
142 | model.dropout_keep_prob: FLAGS.dropout_keep_prob,
143 | model.c_charVec: x_context_char,
144 | model.c_charLen: x_context_char_len,
145 | model.p_charVec: x_persona_char,
146 | model.p_charLen: x_persona_char_len
147 | }
148 |
149 | _, step, loss, accuracy, predicted_prob = sess.run(
150 | [train_op, global_step, model.mean_loss, model.accuracy, model.probs],
151 | feed_dict)
152 |
153 | if step%100 == 0:
154 | time_str = datetime.datetime.now().isoformat()
155 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
156 | #train_summary_writer.add_summary(summaries, step)
157 |
158 |
159 | def dev_step():
160 | results = defaultdict(list)
161 | num_test = 0
162 | num_correct = 0.0
163 | valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=True)
164 | for valid_batch in valid_batches:
165 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = valid_batch
166 | feed_dict = {
167 | model.context: x_context,
168 | model.context_len: x_context_len,
169 | model.persona: x_persona,
170 | model.persona_len: x_persona_len,
171 | model.target: x_labels,
172 | model.dropout_keep_prob: 1.0,
173 | model.c_charVec: x_context_char,
174 | model.c_charLen: x_context_char_len,
175 | model.p_charVec: x_persona_char,
176 | model.p_charLen: x_persona_char_len
177 | }
178 | batch_accuracy, predicted_prob = sess.run([model.accuracy, model.probs], feed_dict)
179 | num_test += len(predicted_prob)
180 | if num_test % 1000 == 0:
181 | print(num_test)
182 |
183 | num_correct += len(predicted_prob) * batch_accuracy
184 | for i, prob_score in enumerate(predicted_prob):
185 | utterances_id, profiles_id, label = x_id_pairs[i]
186 | results[utterances_id].append((profiles_id, label, prob_score))
187 |
188 | #calculate top-1 precision
189 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct/num_test))
190 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
191 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
192 |
193 | mvp = metrics.mean_average_precision(results)
194 | mrr = metrics.mean_reciprocal_rank(results)
195 | top_1_precision = metrics.top_1_precision(results)
196 | total_valid_query = metrics.get_num_valid_query(results)
197 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query))
198 |
199 | return mrr
200 |
201 | best_mrr = 0.0
202 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=True)
203 | for batch in batches:
204 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = batch
205 | train_step(x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len)
206 | current_step = tf.train.global_step(sess, global_step)
207 | if current_step % FLAGS.evaluate_every == 0:
208 | print("\nEvaluation:")
209 | valid_mrr = dev_step()
210 | if valid_mrr > best_mrr:
211 | best_mrr = valid_mrr
212 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
213 | print("Saved model checkpoint to {}\n".format(path))
214 |
--------------------------------------------------------------------------------
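
The learning-rate schedule defined in train.py above is tf.train.exponential_decay with staircase=True (start at 0.001, multiply by 0.96 every 5000 steps). A small pure-Python sketch of the same schedule, for illustration only:

# Illustrative sketch of the staircase learning-rate schedule used in train.py.
def staircase_lr(step, starter_lr=0.001, decay_steps=5000, decay_rate=0.96):
    return starter_lr * (decay_rate ** (step // decay_steps))

for step in (0, 5000, 10000, 50000):
    print(step, staircase_lr(step))   # 0.001, 0.00096, ~0.000922, ~0.000665
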
/Non-Pretraining-Based/U2P-X/model_BiLSTM.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 | FLAGS = tf.flags.FLAGS
5 |
6 | def get_embeddings(vocab):
7 | print("get_embedding")
8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim)
9 | return tf.constant(initializer, name="word_embedding")
10 |
11 | def get_char_embedding(charVocab):
12 | print("get_char_embedding")
13 | char_size = len(charVocab)
14 | embeddings = np.zeros((char_size, char_size), dtype='float32')
15 | for i in range(1, char_size):
16 | embeddings[i, i] = 1.0
17 |
18 | return tf.constant(embeddings, name="word_char_embedding")
19 |
20 | def load_embed_vectors(fname, dim):
21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... }
22 | vectors = {}
23 | for line in open(fname, 'rt'):
24 | items = line.strip().split(' ')
25 | if len(items[0]) <= 0:
26 | continue
27 | vec = [float(items[i]) for i in range(1, dim+1)]
28 | vectors[items[0]] = vec
29 |
30 | return vectors
31 |
32 | def load_word_embeddings(vocab, dim):
33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim)
34 | vocab_size = len(vocab)
35 | embeddings = np.zeros((vocab_size, dim), dtype='float32')
36 | for word, code in vocab.items():
37 | if word in vectors:
38 | embeddings[code] = vectors[word]
39 | #else:
40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim)
41 |
42 | return embeddings
43 |
44 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False):
45 | with tf.variable_scope(scope, reuse=scope_reuse) as vs:
46 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse)
47 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob)
48 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse)
49 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob)
50 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell,
51 | inputs=inputs,
52 | sequence_length=input_seq_len,
53 | dtype=tf.float32)
54 | return rnn_outputs, rnn_states
55 |
56 | def multi_lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, num_layer, scope, scope_reuse=False):
57 | with tf.variable_scope(scope, reuse=scope_reuse) as vs:
58 | multi_outputs = []
59 | multi_states = []
60 | cur_inputs = inputs
61 | for i_layer in range(num_layer):
62 | rnn_outputs, rnn_states = lstm_layer(cur_inputs, input_seq_len, rnn_size, dropout_keep_prob, scope+str(i_layer), scope_reuse)
63 | rnn_outputs = tf.concat(values=rnn_outputs, axis=2)
64 | multi_outputs.append(rnn_outputs)
65 | multi_states.append(rnn_states)
66 | cur_inputs = rnn_outputs
67 |
68 | # multi_layer_aggregation
69 | ml_weights = tf.nn.softmax(tf.get_variable("ml_scores", [num_layer, ], initializer=tf.constant_initializer(0.0)))
70 |
71 | multi_outputs = tf.stack(multi_outputs, axis=-1) # [batch_size, max_len, 2*rnn_size(400), num_layer]
72 | max_len = multi_outputs.get_shape()[1].value
73 | dim = multi_outputs.get_shape()[2].value
74 | flattened_multi_outputs = tf.reshape(multi_outputs, [-1, num_layer]) # [batch_size * max_len * 2*rnn_size(400), num_layer]
75 | aggregated_ml_outputs = tf.matmul(flattened_multi_outputs, tf.expand_dims(ml_weights, 1)) # [batch_size * max_len * 2*rnn_size(400), 1]
76 | aggregated_ml_outputs = tf.reshape(aggregated_ml_outputs, [-1, max_len, dim]) # [batch_size , max_len , 2*rnn_size(400)]
77 |
78 | return aggregated_ml_outputs
79 |
80 |
81 | class BiLSTM(object):
82 | def __init__(
83 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0):
84 |
85 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances")
86 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len")
87 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num")
88 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles")
89 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len")
90 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num")
91 |
92 | self.target = tf.placeholder(tf.float32, [None], name="target")
93 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
94 |
95 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char")
96 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len")
97 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char")
98 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len")
99 |
100 | l2_loss = tf.constant(1.0)
101 |
102 |
103 | # =============================== Embedding layer ===============================
104 | with tf.name_scope("embedding"):
105 | W = get_embeddings(vocab)
106 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim]
107 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim]
108 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob)
109 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob)
110 | print("utterances_embedded: {}".format(utterances_embedded.get_shape()))
111 | print("profiles_embedded: {}".format(profiles_embedded.get_shape()))
112 |
113 |
114 | # =============================== Encoding layer ===============================
115 | with tf.variable_scope("encoding_layer") as vs:
116 | rnn_scope_name = "bidirectional_rnn"
117 | emb_dim = utterances_embedded.get_shape()[-1].value
118 | flattened_utterances_embedded = tf.reshape(utterances_embedded, [-1, max_utter_len, emb_dim]) # [batch_size*max_utter_num, max_utter_len, emb]
119 | flattened_utterances_len = tf.reshape(self.utterances_len, [-1]) # [batch_size*max_utter_num, ]
120 | flattened_profiles_embedded = tf.reshape(profiles_embedded, [-1, max_profile_len, emb_dim]) # [batch_size*max_profile_num, max_profile_len, emb]
121 | flattened_profiles_len = tf.reshape(self.profiles_len, [-1]) # [batch_size*max_profile_num, ]
122 | # 1. single_lstm_layer
123 | u_rnn_output, u_rnn_states = lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False)
124 | utterances_output = tf.concat(axis=2, values=u_rnn_output) # [batch_size*max_utter_num, max_utter_len, rnn_size*2]
125 | p_rnn_output, p_rnn_states = lstm_layer(flattened_profiles_embedded, flattened_profiles_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) # [batch_size*max_profile_num, max_profile_len, rnn_size(200)]
126 | profiles_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size*max_profile_num, max_profile_len, 2*rnn_size(400)]
127 | # 2. multi_lstm_layer
128 | # utterances_output = multi_lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=False)
129 | # response_output = multi_lstm_layer(flattened_responses_embedded, flattened_responses_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=True)
130 | # print("establish AHRE layers : {}".format(num_layer))
131 | print("establish BiLSTM encoder")
132 |
133 |
134 | # =============================== Matching layer ===============================
135 | with tf.variable_scope("matching_layer") as vs:
136 | final_utterances = tf.concat(axis=1, values=[u_rnn_states[0].h, u_rnn_states[1].h])
137 | concat_dim = final_utterances.get_shape()[-1].value
138 | final_utterances = tf.reshape(final_utterances, [-1, max_utter_num, concat_dim]) # [batch_size, max_utter_num, dim]
139 |
140 | final_profiles = tf.concat(axis=1, values=[p_rnn_states[0].h, p_rnn_states[1].h])
141 | final_profiles = tf.reshape(final_profiles, [-1, max_profile_num, concat_dim]) # [batch_size, max_profile_num, dim]
142 |
143 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32)
144 | similarity = tf.einsum('aij,jk->aik',
145 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim]
146 | similarity = tf.matmul(similarity,
147 | tf.transpose(final_profiles, perm=[0, 2, 1]),
148 | name="similarity") # [batch_size, max_utter_num, max_profile_num]
149 |
150 | print("shape of similarity: {}".format(similarity.get_shape()))
151 | print("establish matching between utterances and profiles")
152 |
153 |
154 | # =============================== Aggregation layer ===============================
155 | with tf.variable_scope("aggregation_layer") as vs:
156 | logits = tf.reduce_max(similarity, axis=2, name="logits_1") # [batch_size, max_utter_num]
157 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num]
158 | logits = logits * mask_u
159 | logits = tf.reduce_sum(logits, axis=1, name="logits_2") # [batch_size, ]
160 | print("establish reduce_max across profiles and masked_reduce_sum across utterances")
161 | print("logits: {}".format(logits.get_shape()))
162 |
163 |
164 | # =============================== Prediction layer ===============================
165 | with tf.variable_scope("prediction_layer") as vs:
166 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ]
167 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target)
168 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum(
169 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
170 |
171 | with tf.name_scope("accuracy"):
172 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ]
173 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
174 |
--------------------------------------------------------------------------------
/Pretraining-Based/C2P-BERT/test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BERT finetuning runner."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import os
22 | import operator
23 | from time import time
24 | from collections import defaultdict
25 | import tensorflow as tf
26 | import optimization
27 | import tokenization
28 | import modeling
29 | import metrics
30 |
31 |
32 | flags = tf.flags
33 | FLAGS = flags.FLAGS
34 |
35 | ## Required parameters
36 | flags.DEFINE_string(
37 | "test_dir", 'valid.tfrecord',
38 | "The input test data dir. Should contain the .tsv files (or other data files) for the task.")
39 |
40 | flags.DEFINE_string(
41 | "restore_model_dir", 'output/',
42 | "The output directory where the model checkpoints have been written.")
43 |
44 | flags.DEFINE_string(
45 | "task_name", 'TestModel',
46 | "The name of the task.")
47 |
48 | flags.DEFINE_string(
49 | "bert_config_file", 'uncased_L-12_H-768_A-12/bert_config.json',
50 | "The config json file corresponding to the pre-trained BERT model. "
51 | "This specifies the model architecture.")
52 |
53 | flags.DEFINE_integer(
54 | "max_seq_length", 320,
55 | "The maximum total input sequence length after WordPiece tokenization. "
56 | "Sequences longer than this will be truncated, and sequences shorter "
57 | "than this will be padded.")
58 |
59 | flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.")
60 |
61 | flags.DEFINE_integer("eval_batch_size", 32, "Total batch size for predict.")
62 |
63 |
64 | def print_configuration_op(FLAGS):
65 | print('My Configurations:')
66 | for name, value in FLAGS.__flags.items():
67 | value=value.value
68 | if type(value) == float:
69 | print(' %s:\t %f'%(name, value))
70 | elif type(value) == int:
71 | print(' %s:\t %d'%(name, value))
72 | elif type(value) == str:
73 | print(' %s:\t %s'%(name, value))
74 | elif type(value) == bool:
75 | print(' %s:\t %s'%(name, value))
76 | else:
77 | print('%s:\t %s' % (name, value))
78 | print('End of configuration')
79 |
80 |
81 | def total_sample(file_name):
82 | sample_nums = 0
83 | for record in tf.python_io.tf_record_iterator(file_name):
84 | sample_nums += 1
85 | return sample_nums
86 |
87 |
88 | def parse_exmp(serial_exmp):
89 | input_data = tf.parse_single_example(serial_exmp,
90 | features={
91 | "text_a_id":
92 | tf.FixedLenFeature([], tf.int64),
93 | "text_b_id":
94 | tf.FixedLenFeature([], tf.int64),
95 | "input_ids":
96 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
97 | "input_mask":
98 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
99 | "segment_ids":
100 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
101 | "label_ids":
102 | tf.FixedLenFeature([], tf.float32)
103 | }
104 | )
105 | # tf.Example only supports tf.int64, so cast all int64 features to int32.
106 | for name in list(input_data.keys()):
107 | t = input_data[name]
108 | if t.dtype == tf.int64:
109 | t = tf.to_int32(t)
110 | input_data[name] = t
111 |
112 | text_a_id = input_data["text_a_id"]
113 | text_b_id = input_data['text_b_id']
114 | input_ids = input_data["input_ids"]
115 | input_mask = input_data["input_mask"]
116 | segment_ids= input_data["segment_ids"]
117 | labels = input_data['label_ids']
118 | return text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels
119 |
120 |
121 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, text_a_id, text_b_id,
122 | num_labels, use_one_hot_embeddings):
123 | """Creates a classification model."""
124 | model = modeling.BertModel(
125 | config=bert_config,
126 | is_training=is_training,
127 | input_ids=input_ids,
128 | input_mask=input_mask,
129 | token_type_ids=segment_ids,
130 | use_one_hot_embeddings=use_one_hot_embeddings)
131 |
132 | # In the demo, we are doing a simple classification task on the entire
133 | # segment.
134 | #
135 | # If you want to use the token-level output, use model.get_sequence_output()
136 | # instead.
137 | target_loss_weight = [1.0, 1.0]
138 | target_loss_weight = tf.convert_to_tensor(target_loss_weight)
139 |
140 | flagx = tf.cast(tf.greater(labels, 0), dtype=tf.float32)
141 | flagy = tf.cast(tf.equal(labels, 0), dtype=tf.float32)
142 |
143 | all_target_loss = target_loss_weight[1] * flagx + target_loss_weight[0] * flagy
144 |
145 | output_layer = model.get_pooled_output()
146 |
147 | hidden_size = output_layer.shape[-1].value
148 |
149 | output_weights = tf.get_variable(
150 | "output_weights", [num_labels, hidden_size],
151 | initializer=tf.truncated_normal_initializer(stddev=0.02))
152 |
153 | output_bias = tf.get_variable(
154 | "output_bias", [num_labels], initializer=tf.zeros_initializer())
155 |
156 | with tf.variable_scope("loss"):
157 | # if is_training:
158 | # # I.e., 0.1 dropout
159 | # output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
160 | output_layer = tf.layers.dropout(output_layer, rate=0.1, training=is_training)
161 |
162 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
163 | logits = tf.nn.bias_add(logits, output_bias)
164 |
165 | probabilities = tf.sigmoid(logits, name="prob")
166 | logits = tf.squeeze(logits,[1])
167 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
168 | losses = tf.multiply(losses, all_target_loss)
169 |
170 | mean_loss = tf.reduce_mean(losses, name="mean_loss") + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
171 |
172 | with tf.name_scope("accuracy"):
173 | correct_prediction = tf.equal(tf.sign(probabilities - 0.5), tf.sign(labels - 0.5))
174 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
175 | #
176 | # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
177 | #
178 | # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
179 | # loss = tf.reduce_mean(per_example_loss)
180 |
181 | return mean_loss, logits, probabilities, accuracy, model, output_layer
182 |
183 |
184 | best_score = 0.0
185 | def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer):
186 | results = defaultdict(list)
187 | num_test = 0
188 | num_correct = 0.0
189 | n_updates = 0
190 | mrr = 0
191 | t0 = time()
192 | try:
193 | while True:
194 | n_updates += 1
195 |
196 | batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False})
197 | question_id, answer_id, label = pair_
198 |
199 | num_test += len(predicted_prob)
200 | # if num_test % 1000 == 0:
201 | # print(num_test)
202 |
203 | num_correct += len(predicted_prob) * batch_accuracy
204 | for i, prob_score in enumerate(predicted_prob):
205 | results[question_id[i]].append((answer_id[i], label[i], prob_score[0]))
206 |
207 | if n_updates%100 == 0:
208 | tf.logging.info("n_update %d , %s: Mins Used: %.2f" %
209 | (n_updates, op_name, (time() - t0) / 60.0))
210 |
211 | except tf.errors.OutOfRangeError:
212 |
213 | print("Inference Time: {} s".format(time() - t0))
214 |
215 | # calculate top-1 precision
216 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
217 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
218 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
219 |
220 | mvp = metrics.mean_average_precision(results)
221 | mrr = metrics.mean_reciprocal_rank(results)
222 | top_1_precision = metrics.top_1_precision(results)
223 | total_valid_query = metrics.get_num_valid_query(results)
224 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
225 | mvp, mrr, top_1_precision, total_valid_query))
226 |
227 | out_path = os.path.join(dir_path, "output_test.txt")
228 | print("Saving evaluation to {}".format(out_path))
229 | with open(out_path, 'w') as f:
230 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
231 | for us_id, v in results.items():
232 | v.sort(key=operator.itemgetter(2), reverse=True)
233 | for i, rec in enumerate(v):
234 | r_id, label, prob_score = rec
235 | rank = i+1
236 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))
237 | return mrr
238 |
239 |
240 | def main(_):
241 | tf.logging.set_verbosity(tf.logging.INFO)
242 |
243 | print_configuration_op(FLAGS)
244 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
245 |
246 | test_data_size = total_sample(FLAGS.test_dir)
247 | tf.logging.info('test data size: {}'.format(test_data_size))
248 |
249 | filenames = tf.placeholder(tf.string, shape=[None])
250 | shuffle_size = tf.placeholder(tf.int64)
251 | dataset = tf.data.TFRecordDataset(filenames)
252 | dataset = dataset.map(parse_exmp) # Parse the record into tensors.
253 | dataset = dataset.repeat(1)
254 | # dataset = dataset.shuffle(shuffle_size)
255 | dataset = dataset.batch(FLAGS.eval_batch_size)
256 | iterator = dataset.make_initializable_iterator()
257 | text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels = iterator.get_next()
258 | pair_ids = [text_a_id, text_b_id, labels]
259 |
260 | training = tf.placeholder(tf.bool)
261 | mean_loss, logits, probabilities, accuracy, model, output_layer = create_model(bert_config,
262 | is_training = training,
263 | input_ids = input_ids,
264 | input_mask = input_mask,
265 | segment_ids = segment_ids,
266 | labels = labels,
267 | text_a_id = text_a_id,
268 | text_b_id = text_b_id,
269 | num_labels = 1,
270 | use_one_hot_embeddings = False)
271 |
272 |
273 | config = tf.ConfigProto(allow_soft_placement=True)
274 | config.gpu_options.allow_growth = True
275 |
276 | if FLAGS.do_eval:
277 | with tf.Session(config=config) as sess:
278 | tf.logging.info("*** Restore model ***")
279 |
280 | ckpt = tf.train.get_checkpoint_state(FLAGS.restore_model_dir)
281 | variables = tf.trainable_variables()
282 | saver = tf.train.Saver(variables)
283 | saver.restore(sess, ckpt.model_checkpoint_path)
284 |
285 | tf.logging.info('Test begin')
286 | sess.run(iterator.initializer,
287 | feed_dict={filenames: [FLAGS.test_dir], shuffle_size: 1})
288 | run_test(FLAGS.restore_model_dir, "test", sess, training, accuracy, probabilities, pair_ids, output_layer)
289 |
290 |
291 | if __name__ == "__main__":
292 |
293 | tf.app.run()
294 |
--------------------------------------------------------------------------------
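
run_test above accumulates a results dict of the form {query_id: [(candidate_id, label, score), ...]} and passes it to the functions in metrics.py. As a rough illustration of what a ranking metric over that structure looks like, here is a sketch of MRR; this is an assumption about the behaviour of metrics.mean_reciprocal_rank, not its actual implementation.

# Illustrative MRR over a run_test-style results dict (not the repository's metrics.py).
def mean_reciprocal_rank_sketch(results):
    rr_sum, num_queries = 0.0, 0
    for candidates in results.values():
        ranked = sorted(candidates, key=lambda x: x[2], reverse=True)   # sort by score, descending
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if label > 0:                                               # first relevant candidate
                rr_sum += 1.0 / rank
                break
        num_queries += 1
    return rr_sum / max(num_queries, 1)
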
/Non-Pretraining-Based/U2P-X/train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import os
4 | import time
5 | import datetime
6 | import operator
7 | from collections import defaultdict
8 | import metrics
9 | import data_helpers
10 | from model_BOW import BOW as MODEL
11 | # from model_BiLSTM import BiLSTM as MODEL
12 | # from model_Transformer import Transformer as MODEL
13 | # from model_ESIM import ESIM as MODEL
14 |
15 |
16 | # Files
17 | tf.flags.DEFINE_string("train_file", "", "path to train file")
18 | tf.flags.DEFINE_string("valid_file", "", "path to valid file")
19 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file")
20 | tf.flags.DEFINE_string("char_vocab_file", "", "path to char vocab file")
21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector")
22 |
23 | # Model Hyperparameters
24 | tf.flags.DEFINE_integer("max_utter_num", 8, "max utterance number")
25 | tf.flags.DEFINE_integer("max_utter_len", 20, "max utterance length")
26 | tf.flags.DEFINE_integer("max_profile_num", 5, "max profile number")
27 | tf.flags.DEFINE_integer("max_profile_len", 15, "max profile length")
28 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length")
29 | tf.flags.DEFINE_integer("num_layer", 1, "num of layers in sentence encoder")
30 | tf.flags.DEFINE_integer("embedding_dim", 200, "dimensionality of word embedding")
31 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units")
32 |
33 | # Training parameters
34 | tf.flags.DEFINE_integer("batch_size", 128, "batch size (default: 128)")
35 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0)")
36 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "dropout keep probability (default: 1.0)")
37 | tf.flags.DEFINE_integer("num_epochs", 1000000, "number of training epochs (default: 1000000)")
38 | tf.flags.DEFINE_integer("evaluate_every", 1000, "evaluate model on valid dataset after this many steps (default: 1000)")
39 |
40 | # Misc Parameters
41 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
42 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
43 |
44 | FLAGS = tf.flags.FLAGS
45 | # FLAGS._parse_flags()
46 | # print("\nParameters:")
47 | # for attr, value in sorted(FLAGS.__flags.items()):
48 | # print("{}={}".format(attr.upper(), value))
49 | print("")
50 |
51 | # Load data
52 | print("Loading data...")
53 |
54 | vocab = data_helpers.load_vocab(FLAGS.vocab_file)
55 | print('vocabulary size: {}'.format(len(vocab)))
56 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file)
57 |
58 | train_dataset = data_helpers.load_dataset(FLAGS.train_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len)
59 | print('train_pairs: {}'.format(len(train_dataset)))
60 | valid_dataset = data_helpers.load_dataset(FLAGS.valid_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len)
61 | print('valid_pairs: {}'.format(len(valid_dataset)))
62 |
63 | with tf.Graph().as_default():
64 | session_conf = tf.ConfigProto(
65 | allow_soft_placement=FLAGS.allow_soft_placement,
66 | log_device_placement=FLAGS.log_device_placement)
67 | sess = tf.Session(config=session_conf)
68 | with sess.as_default():
69 | model = MODEL(
70 | max_utter_num=FLAGS.max_utter_num,
71 | max_utter_len=FLAGS.max_utter_len,
72 | max_profile_num=FLAGS.max_profile_num,
73 | max_profile_len=FLAGS.max_profile_len,
74 | num_layer=FLAGS.num_layer,
75 | vocab_size=len(vocab),
76 | embedding_size=FLAGS.embedding_dim,
77 | vocab=vocab,
78 | rnn_size=FLAGS.rnn_size,
79 | maxWordLength=FLAGS.max_word_length,
80 | charVocab=charVocab,
81 | l2_reg_lambda=FLAGS.l2_reg_lambda)
82 | # Define Training procedure
83 | global_step = tf.Variable(0, name="global_step", trainable=False)
84 | starter_learning_rate = 0.001
85 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
86 | 5000, 0.96, staircase=True)
87 | optimizer = tf.train.AdamOptimizer(learning_rate)
88 | grads_and_vars = optimizer.compute_gradients(model.mean_loss)
89 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
90 |
91 | # Keep track of gradient values and sparsity (optional)
92 | """
93 | grad_summaries = []
94 | for g, v in grads_and_vars:
95 | if g is not None:
96 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
97 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
98 | grad_summaries.append(grad_hist_summary)
99 | grad_summaries.append(sparsity_summary)
100 | grad_summaries_merged = tf.merge_summary(grad_summaries)
101 | """
102 |
103 | # Output directory for models and summaries
104 | timestamp = str(int(time.time()))
105 | # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
106 | out_dir = os.path.abspath(os.path.join("../output", timestamp))
107 | print("Writing to {}\n".format(out_dir))
108 |
109 | # Summaries for loss and accuracy
110 | """
111 | loss_summary = tf.scalar_summary("loss", model.mean_loss)
112 | acc_summary = tf.scalar_summary("accuracy", model.accuracy)
113 |
114 | # Train Summaries
115 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
116 | train_summary_dir = os.path.join(out_dir, "summaries", "train")
117 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
118 |
119 | # Dev summaries
120 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
121 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
122 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def)
123 | """
124 |
125 | # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it.
126 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
127 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
128 | if not os.path.exists(checkpoint_dir):
129 | os.makedirs(checkpoint_dir)
130 | saver = tf.train.Saver(tf.global_variables())
131 |
132 | # Initialize all variables
133 | sess.run(tf.global_variables_initializer())
134 | # =====================================================================================
135 | # tvars = tf.trainable_variables()
136 | # para_total = 0
137 | # print 'All parameters:'
138 | # for i in xrange(len(tvars)):
139 | # print tvars[i].name
140 | # print tvars[i].get_shape()
141 | # if tvars[i].get_shape().ndims==1:
142 | # para_total += int(tvars[i].get_shape()[0])
143 | # else:
144 | # para_total += int(tvars[i].get_shape()[0])*int(tvars[i].get_shape()[1])
145 | # print 'Total Parameter Numbers: {}.'.format(para_total)
146 | # =====================================================================================
147 |
148 |
149 | def train_step(x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num,
150 | x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len):
151 | """
152 | A single training step
153 | """
154 | feed_dict = {
155 | model.utterances: x_utterances,
156 | model.utterances_len: x_utterances_len,
157 | model.utterances_num: x_utterances_num,
158 | model.profiles: x_profiles,
159 | model.profiles_len: x_profiles_len,
160 | model.profiles_num: x_profiles_num,
161 | model.target: x_labels,
162 | model.dropout_keep_prob: FLAGS.dropout_keep_prob,
163 | model.u_charVec: x_utterances_char,
164 | model.u_charLen: x_utterances_char_len,
165 | model.p_charVec: x_profiles_char,
166 | model.p_charLen: x_profiles_char_len
167 | }
168 |
169 | _, step, loss, accuracy, predicted_prob = sess.run(
170 | [train_op, global_step, model.mean_loss, model.accuracy, model.probs],
171 | feed_dict)
172 |
173 | if step%100 == 0:
174 | time_str = datetime.datetime.now().isoformat()
175 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
176 | #train_summary_writer.add_summary(summaries, step)
177 |
178 |
179 | def dev_step():
180 | # t0 = time.time()
181 | results = defaultdict(list)
182 | num_test = 0
183 | num_correct = 0.0
184 | valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=True)
185 | for valid_batch in valid_batches:
186 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = valid_batch
187 | feed_dict = {
188 | model.utterances: x_utterances,
189 | model.utterances_len: x_utterances_len,
190 | model.utterances_num: x_utterances_num,
191 | model.profiles: x_profiles,
192 | model.profiles_len: x_profiles_len,
193 | model.profiles_num: x_profiles_num,
194 | model.target: x_labels,
195 | model.dropout_keep_prob: 1.0,
196 | model.u_charVec: x_utterances_char,
197 | model.u_charLen: x_utterances_char_len,
198 | model.p_charVec: x_profiles_char,
199 | model.p_charLen: x_profiles_char_len
200 | }
201 | batch_accuracy, predicted_prob = sess.run([model.accuracy, model.probs], feed_dict)
202 | num_test += len(predicted_prob)
203 | if num_test % 1000 == 0:
204 | print(num_test)
205 |
206 | num_correct += len(predicted_prob) * batch_accuracy
207 | for i, prob_score in enumerate(predicted_prob):
208 | utterances_id, profiles_id, label = x_ids[i]
209 | results[utterances_id].append((profiles_id, label, prob_score))
210 |
211 | # print("Validation Time: {} s".format(time.time() - t0))
212 |
213 | # Calculate overall accuracy, classification metrics, and ranking metrics (MAP, MRR, top-1 precision)
214 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct/num_test))
215 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
216 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
217 |
218 | mvp = metrics.mean_average_precision(results)
219 | mrr = metrics.mean_reciprocal_rank(results)
220 | top_1_precision = metrics.top_1_precision(results)
221 | total_valid_query = metrics.get_num_valid_query(results)
222 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query))
223 |
224 | return mrr
225 |
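   | # Training loop: run dev_step() every FLAGS.evaluate_every steps and save a checkpoint
   | # only when the validation MRR improves on the best value seen so far.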
226 | best_mrr = 0.0
227 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=True)
228 | for batch in batches:
229 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = batch
230 | train_step(x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len)
231 | current_step = tf.train.global_step(sess, global_step)
232 | if current_step % FLAGS.evaluate_every == 0:
233 | print("\nEvaluation:")
234 | valid_mrr = dev_step()
235 | if valid_mrr > best_mrr:
236 | best_mrr = valid_mrr
237 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
238 | print("Saved model checkpoint to {}\n".format(path))
239 |
--------------------------------------------------------------------------------
/Pretraining-Based/U2P-BERT/test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BERT finetuning runner."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import os
22 | import operator
23 | from time import time
24 | from collections import defaultdict
25 | import tensorflow as tf
26 | import optimization
27 | import tokenization
28 | import modeling
29 | import metrics
30 |
31 |
32 | flags = tf.flags
33 | FLAGS = flags.FLAGS
34 |
35 | ## Required parameters
36 | flags.DEFINE_string(
37 | "test_dir", 'valid.tfrecord',
38 | "The input test data file in TFRecord format.")
39 |
40 | flags.DEFINE_string(
41 | "restore_model_dir", 'output/',
42 | "The output directory where the model checkpoints have been written.")
43 |
44 | flags.DEFINE_string(
45 | "task_name", 'TestModel',
46 | "The name of the task.")
47 |
48 | flags.DEFINE_string(
49 | "bert_config_file", 'uncased_L-12_H-768_A-12/bert_config.json',
50 | "The config json file corresponding to the pre-trained BERT model. "
51 | "This specifies the model architecture.")
52 |
53 | flags.DEFINE_integer(
54 | "max_seq_length", 320,
55 | "The maximum total input sequence length after WordPiece tokenization. "
56 | "Sequences longer than this will be truncated, and sequences shorter "
57 | "than this will be padded.")
58 |
59 | flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.")
60 |
61 | flags.DEFINE_integer("eval_batch_size", 32, "Total batch size for evaluation.")
62 |
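   | # Presumably sentence_a corresponds to context utterances and sentence_b to profile sentences;
   | # the flattened input length is then max_sentence_a_num * max_sentence_b_num *
   | # (max_sentence_a_len + max_sentence_b_len) = 8 * 5 * (20 + 15) = 1400, which should match
   | # FLAGS.max_seq_length (cf. scripts/test.sh).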
63 | max_sentence_a_num=8
64 | max_sentence_a_len=20
65 | max_sentence_b_num=5
66 | max_sentence_b_len=15
67 |
68 |
69 | def print_configuration_op(FLAGS):
70 | print('My Configurations:')
71 | for name, value in FLAGS.__flags.items():
72 | value=value.value
73 | if type(value) == float:
74 | print(' %s:\t %f'%(name, value))
75 | elif type(value) == int:
76 | print(' %s:\t %d'%(name, value))
77 | elif type(value) == str:
78 | print(' %s:\t %s'%(name, value))
79 | elif type(value) == bool:
80 | print(' %s:\t %s'%(name, value))
81 | else:
82 | print('%s:\t %s' % (name, value))
83 | print('End of configuration')
84 |
85 |
86 | def total_sample(file_name):
87 | sample_nums = 0
88 | for record in tf.python_io.tf_record_iterator(file_name):
89 | sample_nums += 1
90 | return sample_nums
91 |
92 |
93 | def parse_exmp(serial_exmp):
94 | input_data = tf.parse_single_example(serial_exmp,
95 | features={
96 | "text_a_id":
97 | tf.FixedLenFeature([], tf.int64),
98 | "text_b_id":
99 | tf.FixedLenFeature([], tf.int64),
100 | "input_ids":
101 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
102 | "input_mask":
103 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
104 | "segment_ids":
105 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
106 | "label_ids":
107 | tf.FixedLenFeature([], tf.float32)
108 | }
109 | )
110 | # tf.parse_single_example returns int64 features, so cast all int64 to int32.
111 | for name in list(input_data.keys()):
112 | t = input_data[name]
113 | if t.dtype == tf.int64:
114 | t = tf.to_int32(t)
115 | input_data[name] = t
116 |
117 | text_a_id = input_data["text_a_id"]
118 | text_b_id = input_data['text_b_id']
119 | input_ids = input_data["input_ids"]
120 | input_mask = input_data["input_mask"]
121 | segment_ids= input_data["segment_ids"]
122 | labels = input_data['label_ids']
123 | return text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels
124 |
125 |
126 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, text_a_id, text_b_id,
127 | num_labels, use_one_hot_embeddings):
128 | """Creates a classification model."""
129 |
130 | # print(input_ids.get_shape()) # [batch_size, max_sentence_a_num * max_sentence_b_num * (max_sentence_a_len + max_sentence_b_len)]
131 | input_ids = tf.reshape(input_ids, [-1, (max_sentence_a_len + max_sentence_b_len)])
132 | input_mask = tf.reshape(input_mask, [-1, (max_sentence_a_len + max_sentence_b_len)])
133 | segment_ids = tf.reshape(segment_ids, [-1, (max_sentence_a_len + max_sentence_b_len)])
134 | # print(input_ids.get_shape()) # [batch_size * max_sentence_a_num * max_sentence_b_num, (max_sentence_a_len + max_sentence_b_len)]
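   | # Each of the max_sentence_a_num * max_sentence_b_num short (sentence_a, sentence_b) pairs is
   | # therefore encoded by BERT independently as a sequence of length
   | # max_sentence_a_len + max_sentence_b_len; the per-pair scores are aggregated further below.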
135 |
136 | model = modeling.BertModel(
137 | config=bert_config,
138 | is_training=is_training,
139 | input_ids=input_ids,
140 | input_mask=input_mask,
141 | token_type_ids=segment_ids,
142 | use_one_hot_embeddings=use_one_hot_embeddings)
143 |
144 | # In the demo, we are doing a simple classification task on the entire
145 | # segment.
146 | #
147 | # If you want to use the token-level output, use model.get_sequence_output()
148 | # instead.
149 | target_loss_weight = [1.0, 1.0]
150 | target_loss_weight = tf.convert_to_tensor(target_loss_weight)
151 |
152 | flagx = tf.cast(tf.greater(labels, 0), dtype=tf.float32)
153 | flagy = tf.cast(tf.equal(labels, 0), dtype=tf.float32)
154 |
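   | # Re-weight positive (label > 0) versus negative (label == 0) examples; with
   | # target_loss_weight = [1.0, 1.0] both classes are weighted equally, so this is a no-op here.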
155 | all_target_loss = target_loss_weight[1] * flagx + target_loss_weight[0] * flagy
156 |
157 | output_layer = model.get_pooled_output()
158 |
159 | hidden_size = output_layer.shape[-1].value
160 |
161 | output_weights = tf.get_variable(
162 | "output_weights", [num_labels, hidden_size],
163 | initializer=tf.truncated_normal_initializer(stddev=0.02))
164 |
165 | output_bias = tf.get_variable(
166 | "output_bias", [num_labels], initializer=tf.zeros_initializer())
167 |
168 | with tf.variable_scope("loss"):
169 | # if is_training:
170 | # # I.e., 0.1 dropout
171 | # output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
172 | output_layer = tf.layers.dropout(output_layer, rate=0.1, training=is_training)
173 |
174 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
175 | logits = tf.nn.bias_add(logits, output_bias)
176 |
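   | # Aggregate the per-pair logits: reshape back to [batch_size, max_sentence_a_num, max_sentence_b_num],
   | # take the max over the sentence_b axis for each sentence_a, then sum over sentence_a to obtain a
   | # single matching logit per example.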
177 | logits = tf.reshape(logits, [-1, max_sentence_a_num, max_sentence_b_num])
178 | logits = tf.reduce_max(logits, -1)
179 | logits = tf.reduce_sum(logits, -1)
180 | logits = tf.expand_dims(logits, -1)
181 |
182 | probabilities = tf.sigmoid(logits, name="prob")
183 | logits = tf.squeeze(logits,[1])
184 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
185 | losses = tf.multiply(losses, all_target_loss)
186 |
187 | mean_loss = tf.reduce_mean(losses, name="mean_loss") + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
188 |
189 | with tf.name_scope("accuracy"):
190 | correct_prediction = tf.equal(tf.sign(probabilities - 0.5), tf.sign(labels - 0.5))
191 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy")
192 | #
193 | # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
194 | #
195 | # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
196 | # loss = tf.reduce_mean(per_example_loss)
197 |
198 | return mean_loss, logits, probabilities, accuracy, model, output_layer
199 |
200 |
201 | best_score = 0.0
202 | def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer):
203 | results = defaultdict(list)
204 | num_test = 0
205 | num_correct = 0.0
206 | n_updates = 0
207 | mrr = 0
208 | t0 = time()
209 | try:
210 | while True:
211 | n_updates += 1
212 |
213 | batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False})
214 | question_id, answer_id, label = pair_
215 |
216 | num_test += len(predicted_prob)
217 | # if num_test % 1000 == 0:
218 | # print(num_test)
219 |
220 | num_correct += len(predicted_prob) * batch_accuracy
221 | for i, prob_score in enumerate(predicted_prob):
222 | # question_id, answer_id, label = pair_id[i]
223 | results[question_id[i]].append((answer_id[i], label[i], prob_score[0]))
224 |
225 | if n_updates%100 == 0:
226 | tf.logging.info("n_update %d , %s: Mins Used: %.2f" %
227 | (n_updates, op_name, (time() - t0) / 60.0))
228 |
229 | except tf.errors.OutOfRangeError:
230 |
231 | print("Inference Time: {} s".format(time() - t0))
232 |
233 | # Calculate overall accuracy, classification metrics, and ranking metrics (MAP, MRR, top-1 precision)
234 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
235 | accu, precision, recall, f1, loss = metrics.classification_metrics(results)
236 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
237 |
238 | mvp = metrics.mean_average_precision(results)
239 | mrr = metrics.mean_reciprocal_rank(results)
240 | top_1_precision = metrics.top_1_precision(results)
241 | total_valid_query = metrics.get_num_valid_query(results)
242 | print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
243 | mvp, mrr, top_1_precision, total_valid_query))
244 |
245 | out_path = os.path.join(dir_path, "output_test.txt")
246 | print("Saving evaluation to {}".format(out_path))
247 | with open(out_path, 'w') as f:
248 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
249 | for us_id, v in results.items():
250 | v.sort(key=operator.itemgetter(2), reverse=True)
251 | for i, rec in enumerate(v):
252 | r_id, label, prob_score = rec
253 | rank = i+1
254 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))
255 | return mrr
256 |
257 |
258 | def main(_):
259 | tf.logging.set_verbosity(tf.logging.INFO)
260 |
261 | print_configuration_op(FLAGS)
262 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
263 |
264 | test_data_size = total_sample(FLAGS.test_dir)
265 | tf.logging.info('test data size: {}'.format(test_data_size))
266 |
267 | filenames = tf.placeholder(tf.string, shape=[None])
268 | shuffle_size = tf.placeholder(tf.int64)
269 | dataset = tf.data.TFRecordDataset(filenames)
270 | dataset = dataset.map(parse_exmp) # Parse the record into tensors.
271 | dataset = dataset.repeat(1)
272 | # dataset = dataset.shuffle(shuffle_size)
273 | dataset = dataset.batch(FLAGS.eval_batch_size)
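   | # Evaluation input pipeline: a single pass over the TFRecord file, parsed by parse_exmp and
   | # batched; shuffling is disabled (the shuffle_size placeholder is fed but not used).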
274 | iterator = dataset.make_initializable_iterator()
275 | text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels = iterator.get_next()
276 | pair_ids = [text_a_id, text_b_id, labels]
277 |
278 | training = tf.placeholder(tf.bool)
279 | mean_loss, logits, probabilities, accuracy, model, output_layer = create_model(bert_config,
280 | is_training = training,
281 | input_ids = input_ids,
282 | input_mask = input_mask,
283 | segment_ids = segment_ids,
284 | labels = labels,
285 | text_a_id = text_a_id,
286 | text_b_id = text_b_id,
287 | num_labels = 1,
288 | use_one_hot_embeddings = False)
289 |
290 |
291 | config = tf.ConfigProto(allow_soft_placement=True)
292 | config.gpu_options.allow_growth = True
293 |
294 | if FLAGS.do_eval:
295 | with tf.Session(config=config) as sess:
296 | tf.logging.info("*** Restore model ***")
297 |
298 | ckpt = tf.train.get_checkpoint_state(FLAGS.restore_model_dir)
299 | variables = tf.trainable_variables()
300 | saver = tf.train.Saver(variables)
301 | saver.restore(sess, ckpt.model_checkpoint_path)
302 |
303 | tf.logging.info('Test begin')
304 | sess.run(iterator.initializer,
305 | feed_dict={filenames: [FLAGS.test_dir], shuffle_size: 1})
306 | run_test(FLAGS.restore_model_dir, "test", sess, training, accuracy, probabilities, pair_ids, output_layer)
307 |
308 |
309 | if __name__ == "__main__":
310 | tf.app.run()
311 |
--------------------------------------------------------------------------------
/Pretraining-Based/C2P-BERT/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import re
23 | import unicodedata
24 | import six
25 | import tensorflow as tf
26 |
27 |
28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
29 | """Checks whether the casing config is consistent with the checkpoint name."""
30 |
31 | # The casing has to be passed in by the user and there is no explicit check
32 | # as to whether it matches the checkpoint. The casing information probably
33 | # should have been stored in the bert_config.json file, but it's not, so
34 | # we have to heuristically detect it to validate.
35 |
36 | if not init_checkpoint:
37 | return
38 |
39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
40 | if m is None:
41 | return
42 |
43 | model_name = m.group(1)
44 |
45 | lower_models = [
46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
48 | ]
49 |
50 | cased_models = [
51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
52 | "multi_cased_L-12_H-768_A-12"
53 | ]
54 |
55 | is_bad_config = False
56 | if model_name in lower_models and not do_lower_case:
57 | is_bad_config = True
58 | actual_flag = "False"
59 | case_name = "lowercased"
60 | opposite_flag = "True"
61 |
62 | if model_name in cased_models and do_lower_case:
63 | is_bad_config = True
64 | actual_flag = "True"
65 | case_name = "cased"
66 | opposite_flag = "False"
67 |
68 | if is_bad_config:
69 | raise ValueError(
70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
71 | "However, `%s` seems to be a %s model, so you "
72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
73 | "how the model was pre-trained. If this error is wrong, please "
74 | "just comment out this check." % (actual_flag, init_checkpoint,
75 | model_name, case_name, opposite_flag))
76 |
77 |
78 | def convert_to_unicode(text):
79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
80 | if six.PY3:
81 | if isinstance(text, str):
82 | return text
83 | elif isinstance(text, bytes):
84 | return text.decode("utf-8", "ignore")
85 | else:
86 | raise ValueError("Unsupported string type: %s" % (type(text)))
87 | elif six.PY2:
88 | if isinstance(text, str):
89 | return text.decode("utf-8", "ignore")
90 | elif isinstance(text, unicode):
91 | return text
92 | else:
93 | raise ValueError("Unsupported string type: %s" % (type(text)))
94 | else:
95 | raise ValueError("Not running on Python 2 or Python 3?")
96 |
97 |
98 | def printable_text(text):
99 | """Returns text encoded in a way suitable for print or `tf.logging`."""
100 |
101 | # These functions want `str` for both Python2 and Python3, but in one case
102 | # it's a Unicode string and in the other it's a byte string.
103 | if six.PY3:
104 | if isinstance(text, str):
105 | return text
106 | elif isinstance(text, bytes):
107 | return text.decode("utf-8", "ignore")
108 | else:
109 | raise ValueError("Unsupported string type: %s" % (type(text)))
110 | elif six.PY2:
111 | if isinstance(text, str):
112 | return text
113 | elif isinstance(text, unicode):
114 | return text.encode("utf-8")
115 | else:
116 | raise ValueError("Unsupported string type: %s" % (type(text)))
117 | else:
118 | raise ValueError("Not running on Python 2 or Python 3?")
119 |
120 |
121 | def load_vocab(vocab_file):
122 | """Loads a vocabulary file into a dictionary."""
123 | vocab = collections.OrderedDict()
124 | index = 0
125 | with tf.gfile.GFile(vocab_file, "r") as reader:
126 | while True:
127 | token = convert_to_unicode(reader.readline())
128 | if not token:
129 | break
130 | token = token.strip()
131 | vocab[token] = index
132 | index += 1
133 | return vocab
134 |
135 |
136 | def convert_by_vocab(vocab, items):
137 | """Converts a sequence of [tokens|ids] using the vocab."""
138 | output = []
139 | for item in items:
140 | output.append(vocab[item])
141 | return output
142 |
143 |
144 | def convert_tokens_to_ids(vocab, tokens):
145 | return convert_by_vocab(vocab, tokens)
146 |
147 |
148 | def convert_ids_to_tokens(inv_vocab, ids):
149 | return convert_by_vocab(inv_vocab, ids)
150 |
151 |
152 | def whitespace_tokenize(text):
153 | """Runs basic whitespace cleaning and splitting on a piece of text."""
154 | text = text.strip()
155 | if not text:
156 | return []
157 | tokens = text.split()
158 | return tokens
159 |
160 |
161 | class FullTokenizer(object):
162 | """Runs end-to-end tokenization."""
163 |
164 | def __init__(self, vocab_file, do_lower_case=True):
165 | self.vocab = load_vocab(vocab_file)
166 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
167 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
168 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
169 |
170 | def tokenize(self, text):
171 | split_tokens = []
172 | for token in self.basic_tokenizer.tokenize(text):
173 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
174 | split_tokens.append(sub_token)
175 |
176 | return split_tokens
177 |
178 | def convert_tokens_to_ids(self, tokens):
179 | return convert_by_vocab(self.vocab, tokens)
180 |
181 | def convert_ids_to_tokens(self, ids):
182 | return convert_by_vocab(self.inv_vocab, ids)
183 |
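   | # Illustrative usage (output depends on the vocabulary file; the "unaffable" example follows the
   | # WordpieceTokenizer docstring below):
   | #   tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
   | #   tokenizer.tokenize("unaffable")  ->  ["un", "##aff", "##able"]
   | #   tokenizer.convert_tokens_to_ids(["un", "##aff", "##able"])  ->  the corresponding vocabulary ids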
184 |
185 | class BasicTokenizer(object):
186 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
187 |
188 | def __init__(self, do_lower_case=True):
189 | """Constructs a BasicTokenizer.
190 |
191 | Args:
192 | do_lower_case: Whether to lower case the input.
193 | """
194 | self.do_lower_case = do_lower_case
195 |
196 | def tokenize(self, text):
197 | """Tokenizes a piece of text."""
198 | text = convert_to_unicode(text)
199 | text = self._clean_text(text)
200 |
201 | # This was added on November 1st, 2018 for the multilingual and Chinese
202 | # models. This is also applied to the English models now, but it doesn't
203 | # matter since the English models were not trained on any Chinese data
204 | # and generally don't have any Chinese data in them (there are Chinese
205 | # characters in the vocabulary because Wikipedia does have some Chinese
206 | # words in the English Wikipedia).
207 | text = self._tokenize_chinese_chars(text)
208 |
209 | orig_tokens = whitespace_tokenize(text)
210 | split_tokens = []
211 | for token in orig_tokens:
212 | if self.do_lower_case:
213 | token = token.lower()
214 | token = self._run_strip_accents(token)
215 | split_tokens.extend(self._run_split_on_punc(token))
216 |
217 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
218 | return output_tokens
219 |
220 | def _run_strip_accents(self, text):
221 | """Strips accents from a piece of text."""
222 | text = unicodedata.normalize("NFD", text)
223 | output = []
224 | for char in text:
225 | cat = unicodedata.category(char)
226 | if cat == "Mn":
227 | continue
228 | output.append(char)
229 | return "".join(output)
230 |
231 | def _run_split_on_punc(self, text):
232 | """Splits punctuation on a piece of text."""
233 | chars = list(text)
234 | i = 0
235 | start_new_word = True
236 | output = []
237 | while i < len(chars):
238 | char = chars[i]
239 | if _is_punctuation(char):
240 | output.append([char])
241 | start_new_word = True
242 | else:
243 | if start_new_word:
244 | output.append([])
245 | start_new_word = False
246 | output[-1].append(char)
247 | i += 1
248 |
249 | return ["".join(x) for x in output]
250 |
251 | def _tokenize_chinese_chars(self, text):
252 | """Adds whitespace around any CJK character."""
253 | output = []
254 | for char in text:
255 | cp = ord(char)
256 | if self._is_chinese_char(cp):
257 | output.append(" ")
258 | output.append(char)
259 | output.append(" ")
260 | else:
261 | output.append(char)
262 | return "".join(output)
263 |
264 | def _is_chinese_char(self, cp):
265 | """Checks whether CP is the codepoint of a CJK character."""
266 | # This defines a "chinese character" as anything in the CJK Unicode block:
267 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
268 | #
269 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
270 | # despite its name. The modern Korean Hangul alphabet is a different block,
271 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
272 | # space-separated words, so they are not treated specially and handled
273 | # like all of the other languages.
274 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
275 | (cp >= 0x3400 and cp <= 0x4DBF) or #
276 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
277 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
278 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
279 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
280 | (cp >= 0xF900 and cp <= 0xFAFF) or #
281 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
282 | return True
283 |
284 | return False
285 |
286 | def _clean_text(self, text):
287 | """Performs invalid character removal and whitespace cleanup on text."""
288 | output = []
289 | for char in text:
290 | cp = ord(char)
291 | if cp == 0 or cp == 0xfffd or _is_control(char):
292 | continue
293 | if _is_whitespace(char):
294 | output.append(" ")
295 | else:
296 | output.append(char)
297 | return "".join(output)
298 |
299 |
300 | class WordpieceTokenizer(object):
301 | """Runs WordPiece tokenization."""
302 |
303 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
304 | self.vocab = vocab
305 | self.unk_token = unk_token
306 | self.max_input_chars_per_word = max_input_chars_per_word
307 |
308 | def tokenize(self, text):
309 | """Tokenizes a piece of text into its word pieces.
310 |
311 | This uses a greedy longest-match-first algorithm to perform tokenization
312 | using the given vocabulary.
313 |
314 | For example:
315 | input = "unaffable"
316 | output = ["un", "##aff", "##able"]
317 |
318 | Args:
319 | text: A single token or whitespace separated tokens. This should have
320 | already been passed through `BasicTokenizer`.
321 |
322 | Returns:
323 | A list of wordpiece tokens.
324 | """
325 |
326 | text = convert_to_unicode(text)
327 |
328 | output_tokens = []
329 | for token in whitespace_tokenize(text):
330 | chars = list(token)
331 | if len(chars) > self.max_input_chars_per_word:
332 | output_tokens.append(self.unk_token)
333 | continue
334 |
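   | # Greedy longest-match-first: at each position try the longest remaining substring (prefixed
   | # with "##" when it is not word-initial) and shrink it until a vocabulary entry is found; if no
   | # prefix matches at some position, the whole token falls back to [UNK].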
335 | is_bad = False
336 | start = 0
337 | sub_tokens = []
338 | while start < len(chars):
339 | end = len(chars)
340 | cur_substr = None
341 | while start < end:
342 | substr = "".join(chars[start:end])
343 | if start > 0:
344 | substr = "##" + substr
345 | if substr in self.vocab:
346 | cur_substr = substr
347 | break
348 | end -= 1
349 | if cur_substr is None:
350 | is_bad = True
351 | break
352 | sub_tokens.append(cur_substr)
353 | start = end
354 |
355 | if is_bad:
356 | output_tokens.append(self.unk_token)
357 | else:
358 | output_tokens.extend(sub_tokens)
359 | return output_tokens
360 |
361 |
362 | def _is_whitespace(char):
363 | """Checks whether `chars` is a whitespace character."""
364 | # \t, \n, and \r are technically control characters but we treat them
365 | # as whitespace since they are generally considered as such.
366 | if char == " " or char == "\t" or char == "\n" or char == "\r":
367 | return True
368 | cat = unicodedata.category(char)
369 | if cat == "Zs":
370 | return True
371 | return False
372 |
373 |
374 | def _is_control(char):
375 | """Checks whether `chars` is a control character."""
376 | # These are technically control characters but we count them as whitespace
377 | # characters.
378 | if char == "\t" or char == "\n" or char == "\r":
379 | return False
380 | cat = unicodedata.category(char)
381 | if cat in ("Cc", "Cf"):
382 | return True
383 | return False
384 |
385 |
386 | def _is_punctuation(char):
387 | """Checks whether `chars` is a punctuation character."""
388 | cp = ord(char)
389 | # We treat all non-letter/number ASCII as punctuation.
390 | # Characters such as "^", "$", and "`" are not in the Unicode
391 | # Punctuation class but we treat them as punctuation anyways, for
392 | # consistency.
393 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
394 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
395 | return True
396 | cat = unicodedata.category(char)
397 | if cat.startswith("P"):
398 | return True
399 | return False
400 |
--------------------------------------------------------------------------------