├── Non-Pretraining-Based ├── C2P-X │ ├── __init__.py │ ├── scripts │ │ ├── test.sh │ │ └── train.sh │ ├── compute_metrics.py │ ├── metrics.py │ ├── test.py │ ├── model_BOW.py │ ├── data_helpers.py │ ├── transformer_block.py │ ├── model_Transformer.py │ ├── model_BiLSTM.py │ └── train.py └── U2P-X │ ├── __init__.py │ ├── scripts │ ├── test.sh │ └── train.sh │ ├── compute_metrics.py │ ├── metrics.py │ ├── test.py │ ├── model_BOW.py │ ├── transformer_block.py │ ├── model_Transformer.py │ ├── data_helpers.py │ ├── model_BiLSTM.py │ └── train.py ├── image ├── result.png └── task.png ├── Pretraining-Based ├── uncased_L-12_H-768_A-12 │ └── README.txt ├── C2P-BERT │ ├── scripts │ │ ├── test.sh │ │ └── train.sh │ ├── __init__.py │ ├── compute_metrics.py │ ├── metrics.py │ ├── optimization.py │ ├── test.py │ └── tokenization.py └── U2P-BERT │ ├── scripts │ ├── test.sh │ └── train.sh │ ├── __init__.py │ ├── compute_metrics.py │ ├── metrics.py │ ├── optimization.py │ └── test.py ├── data_PMPC └── README.txt └── README.md /Non-Pretraining-Based/C2P-X/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /image/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonForJoy/SPD/HEAD/image/result.png -------------------------------------------------------------------------------- /image/task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonForJoy/SPD/HEAD/image/task.png -------------------------------------------------------------------------------- /Pretraining-Based/uncased_L-12_H-768_A-12/README.txt: -------------------------------------------------------------------------------- 1 | ====== Download the BERT base model ====== 2 | 3 | link: https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 4 | Move to path: ./Pretraining-Based/uncased_L-12_H-768_A-12 5 | 6 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 3 | --test_dir ../data_tfrecord/processed_test_both_revised_cand_10.tfrecord \ 4 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 5 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 6 | --max_seq_length 200 \ 7 | --eval_batch_size 50 \ 8 | --restore_model_dir ../output/1631501715 > log_test_BERT_cand_10.txt 2>&1 & 9 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 3 | --test_dir ../data_tfrecord/processed_test_both_revised_cand_10.tfrecord \ 4 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 5 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 6 | --max_seq_length 1400 \ 7 | --eval_batch_size 10 \ 8 | --restore_model_dir ../output/1631263935 > log_test_BERT_cand_10.txt 2>&1 & 9 | 
-------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /data_PMPC/README.txt: -------------------------------------------------------------------------------- 1 | Download the PMPC dataset and move it to path: /data_PMPC 2 | 3 | If you think our work is helpful, or use the code or dataset, please cite the following paper 4 | 5 | @inproceedings{gu-etal-2021-detecting, 6 | title = "Detecting Speaker Personas from Conversational Texts", 7 | author = "Gu, Jia-Chen and 8 | Ling, Zhen-Hua and 9 | Wu, Yu and 10 | Liu, Quan and 11 | Chen, Zhigang and 12 | Zhu, Xiaodan", 13 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 14 | month = nov, 15 | year = "2021", 16 | publisher = "Association for Computational Linguistics", 17 | } -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 3 | --task_name PersonaMatch \ 4 | --train_dir ../data_tfrecord/processed_train_both_revised.tfrecord \ 5 | --valid_dir ../data_tfrecord/processed_valid_both_revised_cand_10.tfrecord \ 6 | --output_dir ../output \ 7 | --do_lower_case True \ 8 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 9 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 10 | --init_checkpoint ../../uncased_L-12_H-768_A-12/bert_model.ckpt \ 11 | --max_seq_length 200 \ 12 | --do_train True \ 13 | --do_eval True \ 14 | --train_batch_size 20 \ 15 | --eval_batch_size 20 \ 16 | --learning_rate 2e-5 \ 17 | --num_train_epochs 10 \ 18 | --warmup_proportion 0.1 > log_train_BERT_cand_10.txt 2>&1 & 19 | 
-------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 3 | --task_name PersonaMatch \ 4 | --train_dir ../data_tfrecord/processed_train_both_revised_cand_10.tfrecord \ 5 | --valid_dir ../data_tfrecord/processed_valid_both_revised_cand_10.tfrecord \ 6 | --output_dir ../output \ 7 | --do_lower_case True \ 8 | --vocab_file ../../uncased_L-12_H-768_A-12/vocab.txt \ 9 | --bert_config_file ../../uncased_L-12_H-768_A-12/bert_config.json \ 10 | --init_checkpoint ../../uncased_L-12_H-768_A-12/bert_model.ckpt \ 11 | --max_seq_length 1400 \ 12 | --do_train True \ 13 | --do_eval True \ 14 | --train_batch_size 4 \ 15 | --eval_batch_size 4 \ 16 | --learning_rate 2e-5 \ 17 | --num_train_epochs 20 \ 18 | --warmup_proportion 0.1 > log_train_BERT_cand_10.txt 2>&1 & 19 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | latest_checkpoint=../output/1631263935/checkpoints 3 | echo $latest_checkpoint 4 | 5 | test_file=../../../data_PMPC/test_both_revised_cand_10.txt 6 | vocab_file=../../../data_PMPC/vocab.txt 7 | char_vocab_file=../../../data_PMPC/char_vocab.txt 8 | output_file=${latest_checkpoint}/output_test.txt 9 | 10 | max_context_len=150 11 | max_persona_len=50 12 | max_word_length=18 13 | batch_size=128 14 | 15 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 16 | --test_file $test_file \ 17 | --vocab_file $vocab_file \ 18 | --char_vocab_file $char_vocab_file \ 19 | --output_file $output_file \ 20 | --max_context_len $max_context_len \ 21 | --max_persona_len $max_persona_len \ 22 | --max_word_length $max_word_length \ 23 | --batch_size $batch_size \ 24 | --checkpoint_dir $latest_checkpoint > log_test_BOW_cand_10.txt 2>&1 & 25 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | latest_checkpoint=../output/1631263935/checkpoints 3 | echo $latest_checkpoint 4 | 5 | test_file=../../../data_PMPC/test_both_revised_cand_10.txt 6 | vocab_file=../../../data_PMPC/vocab.txt 7 | char_vocab_file=../../../data_PMPC/char_vocab.txt 8 | output_file=${latest_checkpoint}/output_test.txt 9 | 10 | max_utter_num=8 11 | max_utter_len=20 12 | max_profile_num=5 13 | max_profile_len=15 14 | max_word_length=18 15 | batch_size=128 16 | 17 | CUDA_VISIBLE_DEVICES=0 python -u ../test.py \ 18 | --test_file $test_file \ 19 | --vocab_file $vocab_file \ 20 | --char_vocab_file $char_vocab_file \ 21 | --output_file $output_file \ 22 | --max_utter_num $max_utter_num \ 23 | --max_utter_len $max_utter_len \ 24 | --max_profile_num $max_profile_num \ 25 | --max_profile_len $max_profile_len \ 26 | --max_word_length $max_word_length \ 27 | --batch_size $batch_size \ 28 | --checkpoint_dir $latest_checkpoint > log_test_BOW_can_10.txt 2>&1 & 29 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | train_file=../../../data_PMPC/train_both_revised.txt 3 | valid_file=../../../data_PMPC/valid_both_revised_cand_10.txt 4 | vocab_file=../../../data_PMPC/vocab.txt 5 | 
char_vocab_file=../../../data_PMPC/char_vocab.txt 6 | embedded_vector_file=../../../data_PMPC/filtered.glove.42B.300d.txt 7 | 8 | max_context_len=150 9 | max_persona_len=50 10 | max_word_length=18 11 | num_layer=1 12 | embedding_dim=300 13 | rnn_size=200 14 | 15 | batch_size=128 16 | lambda=0 17 | dropout_keep_prob=0.8 18 | num_epochs=1000 19 | evaluate_every=100 20 | 21 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 22 | --train_file $train_file \ 23 | --valid_file $valid_file \ 24 | --vocab_file $vocab_file \ 25 | --char_vocab_file $char_vocab_file \ 26 | --embedded_vector_file $embedded_vector_file \ 27 | --max_context_len $max_context_len \ 28 | --max_persona_len $max_persona_len \ 29 | --max_word_length $max_word_length \ 30 | --num_layer $num_layer \ 31 | --embedding_dim $embedding_dim \ 32 | --rnn_size $rnn_size \ 33 | --batch_size $batch_size \ 34 | --l2_reg_lambda $lambda \ 35 | --dropout_keep_prob $dropout_keep_prob \ 36 | --num_epochs $num_epochs \ 37 | --evaluate_every $evaluate_every > log_train_BOW_cand_10.txt 2>&1 & 38 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/scripts/train.sh: -------------------------------------------------------------------------------- 1 | 2 | train_file=../../../data_PMPC/train_both_revised.txt 3 | valid_file=../../../data_PMPC/valid_both_revised_cand_10.txt 4 | vocab_file=../../../data_PMPC/vocab.txt 5 | char_vocab_file=../../../data_PMPC/char_vocab.txt 6 | embedded_vector_file=../../../data_PMPC/filtered.glove.42B.300d.txt 7 | 8 | max_utter_num=8 9 | max_utter_len=20 10 | max_profile_num=5 11 | max_profile_len=15 12 | max_word_length=18 13 | num_layer=1 14 | embedding_dim=300 15 | rnn_size=200 16 | 17 | batch_size=128 18 | lambda=0 19 | dropout_keep_prob=0.8 20 | num_epochs=1000 21 | evaluate_every=100 22 | 23 | CUDA_VISIBLE_DEVICES=0 python -u ../train.py \ 24 | --train_file $train_file \ 25 | --valid_file $valid_file \ 26 | --vocab_file $vocab_file \ 27 | --char_vocab_file $char_vocab_file \ 28 | --embedded_vector_file $embedded_vector_file \ 29 | --max_utter_num $max_utter_num \ 30 | --max_utter_len $max_utter_len \ 31 | --max_profile_num $max_profile_num \ 32 | --max_profile_len $max_profile_len \ 33 | --max_word_length $max_word_length \ 34 | --num_layer $num_layer \ 35 | --embedding_dim $embedding_dim \ 36 | --rnn_size $rnn_size \ 37 | --batch_size $batch_size \ 38 | --l2_reg_lambda $lambda \ 39 | --dropout_keep_prob $dropout_keep_prob \ 40 | --num_epochs $num_epochs \ 41 | --evaluate_every $evaluate_every > log_train_BOW_cand_10.txt 2>&1 & 42 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output/1631259843/output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = 
metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format( 38 | top_1_precision, top_2_precision, top_5_precision)) 39 | 40 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output/1631512095/checkpoints/output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format( 38 | top_1_precision, top_2_precision, top_5_precision)) 39 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output/1631513113/checkpoints/output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | 
total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | print('Recall_10@1: {}\tRecall_10@2: {}\tRecall_10@5: {}\n'.format( 38 | top_1_precision, top_2_precision, top_5_precision)) 39 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/compute_metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load the output.txt file and compute the matrics 3 | ''' 4 | 5 | import numpy as np 6 | import operator 7 | import random 8 | from collections import defaultdict 9 | import metrics 10 | 11 | 12 | test_out_filename = "output_test.txt" 13 | print("*"*20 + test_out_filename + "*"*20 + "\n") 14 | 15 | with open(test_out_filename, 'r') as f: 16 | 17 | results = defaultdict(list) 18 | lines = f.readlines() 19 | for line in lines[1:]: 20 | line = line.strip().split('\t') 21 | us_id = line[0] 22 | r_id = line[1] 23 | prob_score = float(line[2]) 24 | label = float(line[4]) 25 | results[us_id].append((r_id, label, prob_score)) 26 | 27 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 28 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 29 | total_valid_query = metrics.get_num_valid_query(results) 30 | mvp = metrics.mean_average_precision(results) 31 | mrr = metrics.mean_reciprocal_rank(results) 32 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tNum_query: {}'.format( 33 | mvp, mrr, total_valid_query)) 34 | top_1_precision = metrics.top_k_precision(results, k=1) 35 | top_2_precision = metrics.top_k_precision(results, k=2) 36 | top_5_precision = metrics.top_k_precision(results, k=5) 37 | top_10_precision = metrics.top_k_precision(results, k=10) 38 | print('Recall@1: {}\tRecall@2: {}\tRecall@5: {}\tRecall@10: {}\n'.format( 39 | top_1_precision, top_2_precision, top_5_precision, top_10_precision)) 40 | 41 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg = 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 
0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | else: 70 | raise BaseException 71 | 72 | if num_query > 0: 73 | return top_1_correct/num_query 74 | else: 75 | return 0.0 76 | 77 | 78 | def mean_reciprocal_rank(results): 79 | num_query = 0 80 | mrr = 0.0 81 | for k, v in results.items(): 82 | if not is_valid_query(v): 83 | continue 84 | 85 | num_query += 1 86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 87 | for i, rec in enumerate(sorted_v): 88 | aid, label, score = rec 89 | if label > 0: 90 | mrr += 1.0 / (i + 1) 91 | break 92 | 93 | if num_query == 0: 94 | return 0.0 95 | else: 96 | mrr = mrr / num_query 97 | return mrr 98 | 99 | 100 | def mean_average_precision(results): 101 | num_query = 0 102 | mvp = 0.0 103 | for k, v in results.items(): 104 | if not is_valid_query(v): 105 | continue 106 | 107 | num_query += 1 108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 109 | num_relevant_doc = 0.0 110 | avp = 0.0 111 | for i, rec in enumerate(sorted_v): 112 | aid, label, score = rec 113 | if label == 1: 114 | num_relevant_doc += 1 115 | precision = num_relevant_doc / (i + 1) 116 | avp += precision 117 | avp = avp / num_relevant_doc 118 | mvp += avp 119 | 120 | if num_query == 0: 121 | return 0.0 122 | else: 123 | mvp = mvp / num_query 124 | return mvp 125 | 126 | 127 | def classification_metrics(results): 128 | total_num = 0 129 | total_correct = 0 130 | true_positive = 0 131 | positive_correct = 0 132 | predicted_positive = 0 133 | 134 | loss = 0.0; 135 | for k, v in results.items(): 136 | for rec in v: 137 | total_num += 1 138 | aid, label, score = rec 139 | 140 | if score > 0.5: 141 | predicted_positive += 1 142 | 143 | if label > 0: 144 | true_positive += 1 145 | loss += -math.log(score + 1e-12) 146 | else: 147 | loss += -math.log(1.0 - score + 1e-12); 148 | 149 | if score > 0.5 and label > 0: 150 | total_correct += 1 151 | positive_correct += 1 152 | 153 | if score < 0.5 and label < 0.5: 154 | total_correct += 1 155 | 156 | accuracy = float(total_correct) / total_num 157 | precision = float(positive_correct) / (predicted_positive + 1e-12) 158 | recall = float(positive_correct) / true_positive 159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 160 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg = 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | 
top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | else: 70 | raise BaseException 71 | 72 | if num_query > 0: 73 | return top_1_correct/num_query 74 | else: 75 | return 0.0 76 | 77 | 78 | def mean_reciprocal_rank(results): 79 | num_query = 0 80 | mrr = 0.0 81 | for k, v in results.items(): 82 | if not is_valid_query(v): 83 | continue 84 | 85 | num_query += 1 86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 87 | for i, rec in enumerate(sorted_v): 88 | aid, label, score = rec 89 | if label > 0: 90 | mrr += 1.0 / (i + 1) 91 | break 92 | 93 | if num_query == 0: 94 | return 0.0 95 | else: 96 | mrr = mrr / num_query 97 | return mrr 98 | 99 | 100 | def mean_average_precision(results): 101 | num_query = 0 102 | mvp = 0.0 103 | for k, v in results.items(): 104 | if not is_valid_query(v): 105 | continue 106 | 107 | num_query += 1 108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 109 | num_relevant_doc = 0.0 110 | avp = 0.0 111 | for i, rec in enumerate(sorted_v): 112 | aid, label, score = rec 113 | if label == 1: 114 | num_relevant_doc += 1 115 | precision = num_relevant_doc / (i + 1) 116 | avp += precision 117 | avp = avp / num_relevant_doc 118 | mvp += avp 119 | 120 | if num_query == 0: 121 | return 0.0 122 | else: 123 | mvp = mvp / num_query 124 | return mvp 125 | 126 | 127 | def classification_metrics(results): 128 | total_num = 0 129 | total_correct = 0 130 | true_positive = 0 131 | positive_correct = 0 132 | predicted_positive = 0 133 | 134 | loss = 0.0; 135 | for k, v in results.items(): 136 | for rec in v: 137 | total_num += 1 138 | aid, label, score = rec 139 | 140 | if score > 0.5: 141 | predicted_positive += 1 142 | 143 | if label > 0: 144 | true_positive += 1 145 | loss += -math.log(score + 1e-12) 146 | else: 147 | loss += -math.log(1.0 - score + 1e-12); 148 | 149 | if score > 0.5 and label > 0: 150 | total_correct += 1 151 | positive_correct += 1 152 | 153 | if score < 0.5 and label < 0.5: 154 | total_correct += 1 155 | 156 | accuracy = float(total_correct) / total_num 157 | precision = float(positive_correct) / (predicted_positive + 1e-12) 158 | recall = float(positive_correct) / true_positive 159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 160 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg 
= 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | else: 70 | raise BaseException 71 | 72 | if num_query > 0: 73 | return top_1_correct/num_query 74 | else: 75 | return 0.0 76 | 77 | 78 | def mean_reciprocal_rank(results): 79 | num_query = 0 80 | mrr = 0.0 81 | for k, v in results.items(): 82 | if not is_valid_query(v): 83 | continue 84 | 85 | num_query += 1 86 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 87 | for i, rec in enumerate(sorted_v): 88 | aid, label, score = rec 89 | if label > 0: 90 | mrr += 1.0 / (i + 1) 91 | break 92 | 93 | if num_query == 0: 94 | return 0.0 95 | else: 96 | mrr = mrr / num_query 97 | return mrr 98 | 99 | 100 | def mean_average_precision(results): 101 | num_query = 0 102 | mvp = 0.0 103 | for k, v in results.items(): 104 | if not is_valid_query(v): 105 | continue 106 | 107 | num_query += 1 108 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 109 | num_relevant_doc = 0.0 110 | avp = 0.0 111 | for i, rec in enumerate(sorted_v): 112 | aid, label, score = rec 113 | if label == 1: 114 | num_relevant_doc += 1 115 | precision = num_relevant_doc / (i + 1) 116 | avp += precision 117 | avp = avp / num_relevant_doc 118 | mvp += avp 119 | 120 | if num_query == 0: 121 | return 0.0 122 | else: 123 | mvp = mvp / num_query 124 | return mvp 125 | 126 | 127 | def classification_metrics(results): 128 | total_num = 0 129 | total_correct = 0 130 | true_positive = 0 131 | positive_correct = 0 132 | predicted_positive = 0 133 | 134 | loss = 0.0; 135 | for k, v in results.items(): 136 | for rec in v: 137 | total_num += 1 138 | aid, label, score = rec 139 | 140 | if score > 0.5: 141 | predicted_positive += 1 142 | 143 | if label > 0: 144 | true_positive += 1 145 | loss += -math.log(score + 1e-12) 146 | else: 147 | loss += -math.log(1.0 - score + 1e-12); 148 | 149 | if score > 0.5 and label > 0: 150 | total_correct += 1 151 | positive_correct += 1 152 | 153 | if score < 0.5 and label < 0.5: 154 | total_correct += 1 155 | 156 | accuracy = float(total_correct) / total_num 157 | precision = float(positive_correct) / (predicted_positive + 1e-12) 158 | recall = float(positive_correct) / 
true_positive 159 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 160 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/metrics.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | 4 | 5 | def is_valid_query(v): 6 | num_pos = 0 7 | num_neg = 0 8 | for aid, label, score in v: 9 | if label > 0: 10 | num_pos += 1 11 | else: 12 | num_neg += 1 13 | if num_pos > 0 and num_neg > 0: 14 | return True 15 | else: 16 | return False 17 | 18 | 19 | def get_num_valid_query(results): 20 | num_query = 0 21 | for k, v in results.items(): 22 | if not is_valid_query(v): 23 | continue 24 | num_query += 1 25 | return num_query 26 | 27 | 28 | def top_1_precision(results): 29 | num_query = 0 30 | top_1_correct = 0.0 31 | for k, v in results.items(): 32 | if not is_valid_query(v): 33 | continue 34 | num_query += 1 35 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 36 | aid, label, score = sorted_v[0] 37 | if label > 0: 38 | top_1_correct += 1 39 | 40 | if num_query > 0: 41 | return top_1_correct / num_query 42 | else: 43 | return 0.0 44 | 45 | 46 | def top_k_precision(results, k=1): 47 | num_query = 0 48 | top_1_correct = 0.0 49 | for key, v in results.items(): 50 | if not is_valid_query(v): 51 | continue 52 | num_query += 1 53 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 54 | if k == 1: 55 | aid, label, score = sorted_v[0] 56 | if label > 0: 57 | top_1_correct += 1 58 | elif k == 2: 59 | aid1, label1, score1 = sorted_v[0] 60 | aid2, label2, score2 = sorted_v[1] 61 | if label1 > 0 or label2 > 0: 62 | top_1_correct += 1 63 | elif k == 5: 64 | for vv in sorted_v[0:5]: 65 | label = vv[1] 66 | if label > 0: 67 | top_1_correct += 1 68 | break 69 | elif k == 10: 70 | for vv in sorted_v[0:10]: 71 | label = vv[1] 72 | if label > 0: 73 | top_1_correct += 1 74 | break 75 | else: 76 | raise BaseException 77 | 78 | if num_query > 0: 79 | return top_1_correct/num_query 80 | else: 81 | return 0.0 82 | 83 | 84 | def mean_reciprocal_rank(results): 85 | num_query = 0 86 | mrr = 0.0 87 | for k, v in results.items(): 88 | if not is_valid_query(v): 89 | continue 90 | 91 | num_query += 1 92 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 93 | for i, rec in enumerate(sorted_v): 94 | aid, label, score = rec 95 | if label > 0: 96 | mrr += 1.0 / (i + 1) 97 | break 98 | 99 | if num_query == 0: 100 | return 0.0 101 | else: 102 | mrr = mrr / num_query 103 | return mrr 104 | 105 | 106 | def mean_average_precision(results): 107 | num_query = 0 108 | mvp = 0.0 109 | for k, v in results.items(): 110 | if not is_valid_query(v): 111 | continue 112 | 113 | num_query += 1 114 | sorted_v = sorted(v, key=operator.itemgetter(2), reverse=True) 115 | num_relevant_doc = 0.0 116 | avp = 0.0 117 | for i, rec in enumerate(sorted_v): 118 | aid, label, score = rec 119 | if label == 1: 120 | num_relevant_doc += 1 121 | precision = num_relevant_doc / (i + 1) 122 | avp += precision 123 | avp = avp / num_relevant_doc 124 | mvp += avp 125 | 126 | if num_query == 0: 127 | return 0.0 128 | else: 129 | mvp = mvp / num_query 130 | return mvp 131 | 132 | 133 | def classification_metrics(results): 134 | total_num = 0 135 | total_correct = 0 136 | true_positive = 0 137 | positive_correct = 0 138 | predicted_positive = 0 139 | 140 | loss = 0.0; 141 | for k, v in results.items(): 142 | for rec in v: 143 | total_num += 
1 144 | aid, label, score = rec 145 | 146 | if score > 0.5: 147 | predicted_positive += 1 148 | 149 | if label > 0: 150 | true_positive += 1 151 | loss += -math.log(score + 1e-12) 152 | else: 153 | loss += -math.log(1.0 - score + 1e-12); 154 | 155 | if score > 0.5 and label > 0: 156 | total_correct += 1 157 | positive_correct += 1 158 | 159 | if score < 0.5 and label < 0.5: 160 | total_correct += 1 161 | 162 | accuracy = float(total_correct) / total_num 163 | precision = float(positive_correct) / (predicted_positive + 1e-12) 164 | recall = float(positive_correct) / true_positive 165 | F1 = 2.0 * precision * recall / (1e-12 + precision + recall) 166 | return accuracy, precision, recall, F1, loss / total_num; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detecting Speaker Personas from Conversational Texts 2 | This repository contains the source code and the dataset for the _EMNLP 2021_ paper [Detecting Speaker Personas from Conversational Texts](https://aclanthology.org/2021.emnlp-main.86.pdf). Jia-Chen Gu, Zhen-Hua Ling, Yu Wu, Quan Liu, Zhigang Chen, Xiaodan Zhu.
3 | 4 | 5 | ## Introduction 6 | Personas are useful for dialogue response prediction. However, the personas used in current studies are pre-defined and hard to obtain before a conversation. To tackle this issue, we study a new task, named Speaker Persona Detection (SPD), which aims to detect speaker personas from plain conversational text. In this task, the best-matched persona is retrieved from a set of candidates given the conversational text. This is a many-to-many semantic matching task because both contexts and personas in SPD are composed of multiple sentences. The long-term dependencies and the dynamic redundancy among these sentences increase the difficulty of this task. We build a dataset for SPD, dubbed Persona Match on Persona-Chat (PMPC). Furthermore, we evaluate several baseline models and propose utterance-to-profile (U2P) matching networks for this task. The U2P models operate at a fine granularity, treating both contexts and personas as sets of multiple sequences. Each sequence pair is scored, and an interpretable overall score for a context-persona pair is obtained through aggregation. Evaluation results show that the U2P models outperform their baseline counterparts significantly. 7 | 8 | 
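To make the utterance-to-profile idea above concrete, here is a minimal NumPy sketch (not the repository's implementation): it assumes each utterance and each profile sentence has already been encoded into a vector, scores every utterance-profile pair, and aggregates the pair scores into one context-persona score. The dot-product scorer and the max-then-mean aggregation are illustrative placeholders; the actual models (```model_BOW```, ```model_BiLSTM```, ```model_Transformer```, C2P/U2P-BERT) define their own encoders and aggregation.
```
import numpy as np

def u2p_score(utterance_vecs, profile_vecs):
    """Score a (context, persona) pair from per-sentence vectors.

    utterance_vecs: [num_utterances, dim], one vector per context utterance.
    profile_vecs:   [num_profiles, dim], one vector per persona profile sentence.
    """
    # 1. Score every utterance-profile pair (here: a plain dot product).
    pair_scores = utterance_vecs @ profile_vecs.T   # [num_utterances, num_profiles]
    # 2. Aggregate: best-matching utterance for each profile sentence, then average.
    return float(pair_scores.max(axis=0).mean())

# Toy usage with random "encodings" (8 utterances, 5 profile sentences, 300-dim).
context = np.random.rand(8, 300)
persona = np.random.rand(5, 300)
print(u2p_score(context, persona))
```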

9 | 10 |
11 | 12 | 13 | ## Dependencies 14 | Python 3.6
15 | Tensorflow 1.13.1 16 | 17 | 18 | ## Download 19 | - Download the [BERT base model released by Google Research](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip), 20 | and move it to ```./Pretraining-Based/uncased_L-12_H-768_A-12```
21 | 22 | - Download the [PMPC dataset](https://drive.google.com/file/d/1sE_N7fi_WojeQBWZcTg4Mw6Pyod27S73/view?usp=sharing) used in our paper, 23 | and move it to ```./data_PMPC```
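The shell scripts below reference the downloaded files by relative path, so the layout matters. If in doubt, a quick check like the following (run from the repository root; this helper is not part of the repository) lists anything that is missing:
```
import os

# Files the training/testing scripts expect (see scripts/*.sh); adjust if you unpack elsewhere.
expected = [
    "Pretraining-Based/uncased_L-12_H-768_A-12/vocab.txt",
    "Pretraining-Based/uncased_L-12_H-768_A-12/bert_config.json",
    "data_PMPC/train_both_revised.txt",
    "data_PMPC/valid_both_revised_cand_10.txt",
    "data_PMPC/test_both_revised_cand_10.txt",
    "data_PMPC/vocab.txt",
    "data_PMPC/char_vocab.txt",
    "data_PMPC/filtered.glove.42B.300d.txt",
]

missing = [p for p in expected if not os.path.exists(p)]
print("All expected files found." if not missing else "Missing: " + ", ".join(missing))
```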
24 | 25 | 26 | ## Non-Pretraining-Based Models 27 | Train a new model. 28 | ``` 29 | cd Non-Pretraining-Based/C2P-X/scripts/ 30 | bash train.sh 31 | ``` 32 | The training process is recorded in the ```log_train_*.txt``` file.
33 | 34 | Test a trained model by modifying the variable ```latest_checkpoint``` in ```test.sh```. 35 | ``` 36 | cd Non-Pretraining-Based/C2P-X/scripts/ 37 | bash test.sh 38 | ``` 39 | The testing process is recorded in the ```log_test_*.txt``` file. An ```output_test.txt``` file, which records the score for each context-persona pair, will be saved under the ```latest_checkpoint``` path. Modify the variable ```test_out_filename``` in ```compute_metrics.py``` and then run the following command; the evaluation metrics will be printed. 40 | ``` 41 | python compute_metrics.py 42 | ``` 43 | 44 | You can choose a baseline model by commenting/uncommenting the corresponding model package (```model_BOW```, ```model_BiLSTM```, ```model_Transformer``` or ```model_ESIM```) in the first few lines of ```train.py```. The same process and commands apply to the Non-Pretraining-Based U2P-X models. 45 | 46 | 47 | ## Pretraining-Based Models 48 | Create the fine-tuning data. 49 | ``` 50 | cd Pretraining-Based/C2P-BERT/ 51 | python data_process_tfrecord.py 52 | ``` 53 | 54 | Run the fine-tuning process. 55 | ``` 56 | cd Pretraining-Based/C2P-BERT/scripts/ 57 | bash train.sh 58 | ``` 59 | 60 | Test a trained model by modifying the variable ```restore_model_dir``` in ```test.sh```. 61 | ``` 62 | cd Pretraining-Based/C2P-BERT/scripts/ 63 | bash test.sh 64 | ``` 65 | 66 | Modify the variable ```test_out_filename``` in ```compute_metrics.py``` and then run the following command; the evaluation metrics will be printed. 67 | ``` 68 | python compute_metrics.py 69 | ``` 70 | 71 | The same process and commands apply to U2P-BERT. 72 | 73 | **NOTE**: Since the dataset is small, each model was trained 10 times with identical architectures and different random initializations. Thus, we report (mean ± standard deviation) in our paper; a short aggregation sketch is given at the end of this README. 74 | 75 | 76 | ## Cite 77 | If you find our work helpful, or use the code or dataset, please cite the following paper: 78 | **"Detecting Speaker Personas from Conversational Texts"** 79 | Jia-Chen Gu, Zhen-Hua Ling, Yu Wu, Quan Liu, Zhigang Chen, Xiaodan Zhu. _EMNLP (2021)_ 80 | ``` 81 | @inproceedings{gu-etal-2021-detecting, 82 | title = "Detecting Speaker Personas from Conversational Texts", 83 | author = "Gu, Jia-Chen and 84 | Ling, Zhen-Hua and 85 | Wu, Yu and 86 | Liu, Quan and 87 | Chen, Zhigang and 88 | Zhu, Xiaodan", 89 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 90 | month = nov, 91 | year = "2021", 92 | address = "Online and Punta Cana, Dominican Republic", 93 | publisher = "Association for Computational Linguistics", 94 | url = "https://aclanthology.org/2021.emnlp-main.86", 95 | pages = "1126--1136", 96 | } 97 | ``` 98 | 99 | 100 | ## Update 101 | Please keep an eye on this repository if you are interested in our work. 102 | Feel free to contact us (gujc@mail.ustc.edu.cn) or open issues. 
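As noted above, reported numbers are the mean ± standard deviation over 10 runs. A minimal sketch of that aggregation (the scores list is a placeholder to be filled with your own per-run results; this helper is not part of the repository):
```
import numpy as np

# Fill in the metric (e.g., Recall_10@1) collected from each of the 10 runs; zeros are placeholders.
run_scores = [0.0] * 10

scores = np.array(run_scores)
# np.std defaults to the population standard deviation; pass ddof=1 for the sample version.
print("{:.4f} ± {:.4f}".format(scores.mean(), scores.std()))
```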
103 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | import metrics 8 | from collections import defaultdict 9 | import data_helpers 10 | 11 | # Files 12 | tf.flags.DEFINE_string("test_file", "", "path to test file") 13 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 14 | tf.flags.DEFINE_string("char_vocab_file", "", "vocabulary file") 15 | tf.flags.DEFINE_string("output_file", "", "prediction output file") 16 | 17 | # Model Hyperparameters 18 | tf.flags.DEFINE_integer("max_context_len", 150, "max context length") 19 | tf.flags.DEFINE_integer("max_persona_len", 50, "max persona length") 20 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 21 | 22 | # Test parameters 23 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 24 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 25 | 26 | # Misc Parameters 27 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 28 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 29 | 30 | FLAGS = tf.flags.FLAGS 31 | # FLAGS._parse_flags() 32 | # print("\nParameters:") 33 | # for attr, value in sorted(FLAGS.__flags.items()): 34 | # print("{}={}".format(attr.upper(), value)) 35 | print("") 36 | 37 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 38 | print('vocabulary size: {}'.format(len(vocab))) 39 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 40 | 41 | test_dataset = data_helpers.load_dataset(FLAGS.test_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len) 42 | print('test_pairs: {}'.format(len(test_dataset))) 43 | 44 | print("\nEvaluating...\n") 45 | 46 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 47 | print(checkpoint_file) 48 | 49 | graph = tf.Graph() 50 | with graph.as_default(): 51 | session_conf = tf.ConfigProto( 52 | allow_soft_placement=FLAGS.allow_soft_placement, 53 | log_device_placement=FLAGS.log_device_placement) 54 | sess = tf.Session(config=session_conf) 55 | with sess.as_default(): 56 | # Load the saved meta graph and restore variables 57 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 58 | saver.restore(sess, checkpoint_file) 59 | 60 | # Get the placeholders from the graph by name 61 | context = graph.get_operation_by_name("context").outputs[0] 62 | context_len = graph.get_operation_by_name("context_len").outputs[0] 63 | persona = graph.get_operation_by_name("persona").outputs[0] 64 | persona_len = graph.get_operation_by_name("persona_len").outputs[0] 65 | 66 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 67 | 68 | c_char_feature = graph.get_operation_by_name("context_char").outputs[0] 69 | c_char_len = graph.get_operation_by_name("context_char_len").outputs[0] 70 | p_char_feature = graph.get_operation_by_name("persona_char").outputs[0] 71 | p_char_len = graph.get_operation_by_name("persona_char_len").outputs[0] 72 | 73 | # Tensors we want to evaluate 74 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0] 75 | 76 | results = defaultdict(list) 77 | num_test = 0 78 | 79 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, FLAGS.max_context_len, 
FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=False) 80 | for test_batch in test_batches: 81 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = test_batch 82 | feed_dict = { 83 | context: x_context, 84 | context_len: x_context_len, 85 | persona: x_persona, 86 | persona_len: x_persona_len, 87 | dropout_keep_prob: 1.0, 88 | c_char_feature: x_context_char, 89 | c_char_len: x_context_char_len, 90 | p_char_feature: x_persona_char, 91 | p_char_len: x_persona_char_len 92 | } 93 | predicted_prob = sess.run(prob, feed_dict) 94 | num_test += len(predicted_prob) 95 | print('num_test_sample={}'.format(num_test)) 96 | for i, prob_score in enumerate(predicted_prob): 97 | us_id, ps_id, label = x_id_pairs[i] 98 | results[us_id].append((ps_id, label, prob_score)) 99 | 100 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 101 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 102 | 103 | mvp = metrics.mean_average_precision(results) 104 | mrr = metrics.mean_reciprocal_rank(results) 105 | top_1_precision = metrics.top_1_precision(results) 106 | total_valid_query = metrics.get_num_valid_query(results) 107 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 108 | 109 | out_path = FLAGS.output_file 110 | print("Saving evaluation to {}".format(out_path)) 111 | with open(out_path, 'w') as f: 112 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 113 | for us_id, v in results.items(): 114 | v.sort(key=operator.itemgetter(2), reverse=True) 115 | for i, rec in enumerate(v): 116 | ps_id, label, prob_score = rec 117 | rank = i+1 118 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, ps_id, prob_score, rank, label)) 119 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/model_BOW.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | 45 | class BOW(object): 46 | def __init__( 47 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 48 | 49 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context") 50 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len") 51 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona") 52 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len") 53 | 54 | self.target = tf.placeholder(tf.float32, [None], name="target") 55 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 56 | 57 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char") 58 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len") 59 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char") 60 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len") 61 | 62 | l2_loss = tf.constant(1.0) 63 | 64 | # =============================== Embedding layer =============================== 65 | with tf.name_scope("embedding"): 66 | W = get_embeddings(vocab) 67 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim] 68 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim] 69 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob) 70 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob) 71 | print("context_embedded: {}".format(context_embedded.get_shape())) 72 | print("persona_embedded: {}".format(persona_embedded.get_shape())) 73 | 74 | 75 | # =============================== Encoding layer =============================== 76 | with tf.variable_scope("encoding_layer") as vs: 77 | mask_c = tf.sequence_mask(self.context_len, max_context_len, dtype=tf.float32) # [batch_size, max_context_len] 78 | mask_c = tf.expand_dims(mask_c, -1) # [batch_size, max_context_len, 1] 79 | final_context = tf.reduce_max(context_embedded * mask_c, axis=1) 80 | 81 | mask_p = tf.sequence_mask(self.persona_len, max_persona_len, dtype=tf.float32) # [batch_size, max_persona_len] 82 | mask_p = tf.expand_dims(mask_p, -1) # [batch_size, max_persona_len, 1] 83 | final_persona = tf.reduce_max(persona_embedded * mask_p, axis=1) 84 | print("establish BOW encoder") 85 | 86 | 87 | # =============================== Matching layer =============================== 88 | with tf.variable_scope("matching_layer") as vs: 89 | output_dim = final_context.get_shape()[-1].value 90 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 91 | 92 | 
similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim] 93 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ] 94 | print("shape of similarity: {}".format(similarity.get_shape())) 95 | 96 | 97 | # =============================== Prediction layer =============================== 98 | with tf.variable_scope("prediction_layer") as vs: 99 | logits = similarity 100 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 101 | 102 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 103 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 104 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 105 | 106 | with tf.name_scope("accuracy"): 107 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 108 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 109 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | import metrics 8 | from collections import defaultdict 9 | import data_helpers 10 | 11 | # Files 12 | tf.flags.DEFINE_string("test_file", "", "path to test file") 13 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 14 | tf.flags.DEFINE_string("char_vocab_file", "", "vocabulary file") 15 | tf.flags.DEFINE_string("output_file", "", "prediction output file") 16 | 17 | # Model Hyperparameters 18 | tf.flags.DEFINE_integer("max_utter_num", 8, "max utterance number") 19 | tf.flags.DEFINE_integer("max_utter_len", 20, "max utterance length") 20 | tf.flags.DEFINE_integer("max_profile_num", 5, "max profile number") 21 | tf.flags.DEFINE_integer("max_profile_len", 15, "max profile length") 22 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 23 | 24 | # Test parameters 25 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 27 | 28 | # Misc Parameters 29 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 30 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 31 | 32 | FLAGS = tf.flags.FLAGS 33 | # FLAGS._parse_flags() 34 | # print("\nParameters:") 35 | # for attr, value in sorted(FLAGS.__flags.items()): 36 | # print("{}={}".format(attr.upper(), value)) 37 | print("") 38 | 39 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 40 | print('vocabulary size: {}'.format(len(vocab))) 41 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 42 | 43 | test_dataset = data_helpers.load_dataset(FLAGS.test_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len) 44 | print('test_pairs: {}'.format(len(test_dataset))) 45 | 46 | print("\nEvaluating...\n") 47 | 48 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 49 | print(checkpoint_file) 50 | 51 | graph = tf.Graph() 52 | with graph.as_default(): 53 | session_conf = tf.ConfigProto( 54 | allow_soft_placement=FLAGS.allow_soft_placement, 55 | log_device_placement=FLAGS.log_device_placement) 56 | sess = tf.Session(config=session_conf) 57 | with sess.as_default(): 58 | # Load the saved meta graph and 
restore variables 59 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 60 | saver.restore(sess, checkpoint_file) 61 | 62 | # Get the placeholders from the graph by name 63 | utterances = graph.get_operation_by_name("utterances").outputs[0] 64 | utterances_len = graph.get_operation_by_name("utterances_len").outputs[0] 65 | utterances_num = graph.get_operation_by_name("utterances_num").outputs[0] 66 | profiles = graph.get_operation_by_name("profiles").outputs[0] 67 | profiles_len = graph.get_operation_by_name("profiles_len").outputs[0] 68 | profiles_num = graph.get_operation_by_name("profiles_num").outputs[0] 69 | 70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 71 | 72 | u_char_feature = graph.get_operation_by_name("utterances_char").outputs[0] 73 | u_char_len = graph.get_operation_by_name("utterances_char_len").outputs[0] 74 | p_char_feature = graph.get_operation_by_name("profiles_char").outputs[0] 75 | p_char_len = graph.get_operation_by_name("profiles_char_len").outputs[0] 76 | 77 | # Tensors we want to evaluate 78 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0] 79 | 80 | results = defaultdict(list) 81 | num_test = 0 82 | 83 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=False) 84 | for test_batch in test_batches: 85 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, \ 86 | x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = test_batch 87 | feed_dict = { 88 | utterances: x_utterances, 89 | utterances_len: x_utterances_len, 90 | utterances_num: x_utterances_num, 91 | profiles: x_profiles, 92 | profiles_len: x_profiles_len, 93 | profiles_num: x_profiles_num, 94 | dropout_keep_prob: 1.0, 95 | u_char_feature: x_utterances_char, 96 | u_char_len: x_utterances_char_len, 97 | p_char_feature: x_profiles_char, 98 | p_char_len: x_profiles_char_len 99 | } 100 | predicted_prob = sess.run(prob, feed_dict) 101 | num_test += len(predicted_prob) 102 | print('num_test_sample={}'.format(num_test)) 103 | for i, prob_score in enumerate(predicted_prob): 104 | us_id, ps_id, label = x_ids[i] 105 | results[us_id].append((ps_id, label, prob_score)) 106 | 107 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 108 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 109 | 110 | mvp = metrics.mean_average_precision(results) 111 | mrr = metrics.mean_reciprocal_rank(results) 112 | top_1_precision = metrics.top_1_precision(results) 113 | total_valid_query = metrics.get_num_valid_query(results) 114 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 115 | 116 | out_path = FLAGS.output_file 117 | print("Saving evaluation to {}".format(out_path)) 118 | with open(out_path, 'w') as f: 119 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 120 | for us_id, v in results.items(): 121 | v.sort(key=operator.itemgetter(2), reverse=True) 122 | for i, rec in enumerate(v): 123 | ps_id, label, prob_score = rec 124 | rank = i+1 125 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, ps_id, prob_score, rank, label)) 126 | -------------------------------------------------------------------------------- 
/Non-Pretraining-Based/C2P-X/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def load_vocab(fname): 6 | ''' 7 | vocab = {"I": 0, ...} 8 | ''' 9 | vocab={} 10 | with open(fname, 'rt') as f: 11 | for i,line in enumerate(f): 12 | word = line.strip() 13 | vocab[word] = i 14 | return vocab 15 | 16 | def load_char_vocab(fname): 17 | ''' 18 | charVocab = {"U": 0, "!": 1, ...} 19 | ''' 20 | charVocab={} 21 | with open(fname, 'rt') as f: 22 | for line in f: 23 | fields = line.strip().split('\t') 24 | char_id = int(fields[0]) 25 | ch = fields[1] 26 | charVocab[ch] = char_id 27 | return charVocab 28 | 29 | def to_vec(tokens, vocab, maxlen): 30 | ''' 31 | length: length of the input sequence 32 | vec: map the token to the vocab_id, return a varied-length array [3, 6, 4, 3, ...] 33 | ''' 34 | n = len(tokens) 35 | length = 0 36 | vec=[] 37 | for i in range(n): 38 | length += 1 39 | if tokens[i] in vocab: 40 | vec.append(vocab[tokens[i]]) 41 | else: 42 | vec.append(vocab["_unk_"]) 43 | return length, np.array(vec) 44 | 45 | def load_dataset(fname, vocab, max_context_len, max_persona_len): 46 | 47 | dataset=[] 48 | with open(fname, 'rt') as f: 49 | for line in f: 50 | line = line.strip() 51 | fields = line.split('\t') 52 | 53 | # id 54 | c_id = fields[0] 55 | 56 | # context 57 | context = fields[1] + " _eos_" 58 | c_tokens = context.split(' ')[:max_context_len] # select the head max_context_len tokens in every context 59 | c_len, c_vec = to_vec(c_tokens, vocab, max_context_len) 60 | 61 | # matched persona 62 | if fields[2] != "NA": 63 | personas = fields[2].split("|") 64 | for index, persona in enumerate(personas): 65 | p_id = "1." + str(index) 66 | persona = persona + " _eos_" 67 | p_tokens = persona.split(' ')[:max_persona_len] # select the head max_persona_len tokens in every persona 68 | p_len, p_vec = to_vec(p_tokens, vocab, max_persona_len) 69 | dataset.append((c_id, c_tokens, c_vec, c_len, 1.0, p_id, p_tokens, p_vec, p_len)) 70 | 71 | # mismatched persona 72 | if fields[3] != "NA": 73 | personas = fields[3].split("|") 74 | for index, persona in enumerate(personas): 75 | p_id = "0." 
+ str(index) 76 | persona = persona + " _eos_" 77 | p_tokens = persona.split(' ')[:max_persona_len] # select the head max_persona_len tokens in every persona 78 | p_len, p_vec = to_vec(p_tokens, vocab, max_persona_len) 79 | dataset.append((c_id, c_tokens, c_vec, c_len, 0.0, p_id, p_tokens, p_vec, p_len)) 80 | 81 | return dataset 82 | 83 | 84 | def normalize_vec(vec, maxlen): 85 | ''' 86 | pad the original vec to the same maxlen 87 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0] 88 | ''' 89 | if len(vec) == maxlen: 90 | return vec 91 | 92 | new_vec = np.zeros(maxlen, dtype='int32') 93 | for i in range(len(vec)): 94 | new_vec[i] = vec[i] 95 | return new_vec 96 | 97 | 98 | def charVec(tokens, charVocab, maxlen, maxWordLength): 99 | ''' 100 | chars = np.array( (maxlen, maxWordLength) ) 0 if not found in charVocab or None 101 | word_lengths = np.array( maxlen ) 1 if None 102 | ''' 103 | n = len(tokens) 104 | if n > maxlen: 105 | n = maxlen 106 | 107 | chars = np.zeros((maxlen, maxWordLength), dtype=np.int32) 108 | word_lengths = np.ones(maxlen, dtype=np.int32) 109 | for i in range(n): 110 | token = tokens[i][:maxWordLength] 111 | word_lengths[i] = len(token) 112 | row = chars[i] 113 | for idx, ch in enumerate(token): 114 | if ch in charVocab: 115 | row[idx] = charVocab[ch] 116 | 117 | return chars, word_lengths 118 | 119 | 120 | def batch_iter(data, batch_size, num_epochs, max_context_len, max_persona_len, 121 | charVocab, max_word_length, shuffle=True): 122 | """ 123 | Generates a batch iterator for a dataset. 124 | """ 125 | data_size = len(data) 126 | num_batches_per_epoch = int(len(data)/batch_size) + 1 127 | for epoch in range(num_epochs): 128 | # Shuffle the data at each epoch 129 | if shuffle: 130 | random.shuffle(data) 131 | for batch_num in range(num_batches_per_epoch): 132 | start_index = batch_num * batch_size 133 | end_index = min((batch_num + 1) * batch_size, data_size) 134 | 135 | x_context = [] 136 | x_context_len = [] 137 | x_persona = [] 138 | x_persona_len = [] 139 | 140 | x_labels = [] 141 | x_id_pairs = [] 142 | 143 | x_context_char = [] 144 | x_context_char_len = [] 145 | x_persona_char = [] 146 | x_persona_char_len = [] 147 | 148 | for rowIdx in range(start_index, end_index): 149 | c_id, c_tokens, c_vec, c_len, label, p_id, p_tokens, p_vec, p_len = data[rowIdx] 150 | 151 | # normalize c_vec 152 | new_c_vec = normalize_vec(c_vec, max_context_len) 153 | x_context.append(new_c_vec) 154 | x_context_len.append(c_len) 155 | 156 | # normalize p_vec 157 | new_p_vec = normalize_vec(p_vec, max_persona_len) 158 | x_persona.append(new_p_vec) 159 | x_persona_len.append(p_len) 160 | 161 | x_labels.append(label) 162 | x_id_pairs.append((c_id, p_id, int(label))) 163 | 164 | # normalize us_CharVec 165 | cCharVec, cCharLen = charVec(c_tokens, charVocab, max_context_len, max_word_length) 166 | x_context_char.append(cCharVec) 167 | x_context_char_len.append(cCharLen) 168 | 169 | # normalize ps_CharVec 170 | pCharVec, pCharLen = charVec(p_tokens, charVocab, max_persona_len, max_word_length) 171 | x_persona_char.append(pCharVec) 172 | x_persona_char_len.append(pCharLen) 173 | 174 | yield np.array(x_context), np.array(x_context_len), np.array(x_persona), np.array(x_persona_len), \ 175 | np.array(x_labels), x_id_pairs, \ 176 | np.array(x_context_char), np.array(x_context_char_len), np.array(x_persona_char), np.array(x_persona_char_len) 177 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/optimization.py: 
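# Illustrative note (assumption, not taken from the repository's training code,
# which is not part of this excerpt): optimization.create_optimizer below
# expects a scalar loss tensor plus the total number of training steps and
# warm-up steps, and returns a train_op. A minimal, hypothetical wiring could
# look like:
#
#   num_train_steps = int(num_train_examples / train_batch_size * num_train_epochs)
#   num_warmup_steps = int(num_train_steps * warmup_proportion)
#   train_op = optimization.create_optimizer(
#       loss=total_loss,
#       init_lr=2e-5,
#       num_train_steps=num_train_steps,
#       num_warmup_steps=num_warmup_steps,
#       use_tpu=False)
#
# All names here (num_train_examples, total_loss, warmup_proportion, ...) are
# placeholders for whatever the actual training script defines.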
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
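  # If the step counter never advanced, the decay/warm-up schedule defined
  # above would stay stuck at step 0, so the counter is incremented explicitly
  # and grouped with the optimizer's update op.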
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 
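      # Net effect per parameter (note that m and v are not bias-corrected
      # here, unlike textbook Adam):
      #   param <- param - lr * ( m / (sqrt(v) + eps) + weight_decay * param )
      # where the decay term is skipped for parameters matched by
      # exclude_from_weight_decay (LayerNorm weights and biases).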
146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/model_BOW.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | 45 | class BOW(object): 46 | def __init__( 47 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 48 | 49 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances") 50 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len") 51 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num") 52 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles") 53 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len") 54 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num") 55 | 56 | self.target = tf.placeholder(tf.float32, [None], name="target") 57 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 58 | 59 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char") 60 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len") 61 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char") 62 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len") 63 | 64 | l2_loss = tf.constant(1.0) 65 | 66 | 67 | # =============================== Embedding layer =============================== 68 | with tf.name_scope("embedding"): 69 | W = get_embeddings(vocab) 70 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim] 71 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim] 72 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob) 73 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob) 74 | print("utterances_embedded: {}".format(utterances_embedded.get_shape())) 75 | print("profiles_embedded: {}".format(profiles_embedded.get_shape())) 76 | 77 | 78 | # =============================== Encoding layer =============================== 79 | with tf.variable_scope("encoding_layer") as vs: 80 | mask_u = tf.sequence_mask(self.utterances_len, max_utter_len, dtype=tf.float32) # [batch_size, max_utter_num, max_utter_len] 81 | mask_u = tf.expand_dims(mask_u, -1) # [batch_size, max_utter_num, max_utter_len, 1] 82 | final_utterances = tf.reduce_max(utterances_embedded * mask_u, axis=2) # [batch_size, max_utter_num, word_dim] 83 | 84 | mask_p = tf.sequence_mask(self.profiles_len, max_profile_len, dtype=tf.float32) # [batch_size, max_profile_num, max_profile_len] 85 | mask_p = tf.expand_dims(mask_p, -1) # 
[batch_size, max_profile_num, max_profile_len, 1] 86 | final_profiles = tf.reduce_max(profiles_embedded * mask_p, axis=2) # [batch_size, max_profile_num, word_dim] 87 | print("establish BOW encoder") 88 | 89 | 90 | # =============================== Matching layer =============================== 91 | with tf.variable_scope("matching_layer") as vs: 92 | concat_dim = final_utterances.get_shape()[-1].value 93 | 94 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 95 | similarity = tf.einsum('aij,jk->aik', 96 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim] 97 | similarity = tf.matmul(similarity, 98 | tf.transpose(final_profiles, perm=[0, 2, 1]), 99 | name="similarity") # [batch_size, max_utter_num, max_profile_num] 100 | 101 | print("shape of similarity: {}".format(similarity.get_shape())) 102 | print("establish matching between utterances and profiles") 103 | 104 | 105 | # =============================== Aggregation layer =============================== 106 | with tf.variable_scope("aggregation_layer") as vs: 107 | logits = tf.reduce_max(similarity, axis=2, name="logits_1") # [batch_size, max_utter_num] 108 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num] 109 | logits = logits * mask_u 110 | logits = tf.reduce_sum(logits, axis=1, name="logits_2") # [batch_size, ] 111 | print("establish reduce_max across profiles and masked_reduce_sum across utterances") 112 | print("logits: {}".format(logits.get_shape())) 113 | 114 | 115 | # =============================== Prediction layer =============================== 116 | with tf.variable_scope("prediction_layer") as vs: 117 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 118 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 119 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 120 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 121 | 122 | with tf.name_scope("accuracy"): 123 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 124 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 125 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/transformer_block.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # main function 4 | def block( 5 | Q, K, V, 6 | Q_lengths, K_lengths, 7 | attention_type='dot', 8 | is_layer_norm=True, 9 | is_mask=True, mask_value=-2**32+1, 10 | drop_prob=None): 11 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf. 
12 | Args: 13 | Q: a tensor with shape [batch, Q_time, Q_dimension] 14 | K: a tensor with shape [batch, time, K_dimension] 15 | V: a tensor with shape [batch, time, V_dimension] 16 | 17 | Q_length: a tensor with shape [batch] 18 | K_length: a tensor with shape [batch] 19 | 20 | Returns: 21 | a tensor with shape [batch, time, dimension] 22 | ''' 23 | att = attention(Q, K, V, 24 | Q_lengths, K_lengths, 25 | attention_type=attention_type, 26 | is_mask=is_mask, mask_value=mask_value, 27 | drop_prob=drop_prob) 28 | if is_layer_norm: 29 | with tf.variable_scope('attention_layer_norm'): 30 | y = layer_norm_debug(Q + att) 31 | else: 32 | y = Q + att 33 | 34 | z = FFN(y) 35 | if is_layer_norm: 36 | with tf.variable_scope('FFN_layer_norm'): 37 | w = layer_norm_debug(y + z) 38 | else: 39 | w = y + z 40 | return w 41 | 42 | def attention( 43 | Q, K, V, 44 | Q_lengths, K_lengths, 45 | attention_type='dot', 46 | is_mask=True, mask_value=-2**32+1, 47 | drop_prob=None): 48 | '''Add attention layer. 49 | Args: 50 | Q: a tensor with shape [batch, Q_time, Q_dimension] 51 | K: a tensor with shape [batch, time, K_dimension] 52 | V: a tensor with shape [batch, time, V_dimension] 53 | 54 | Q_length: a tensor with shape [batch] 55 | K_length: a tensor with shape [batch] 56 | 57 | Returns: 58 | a tensor with shape [batch, Q_time, V_dimension] 59 | 60 | Raises: 61 | AssertionError: if 62 | Q_dimension not equal to K_dimension when attention type is dot. 63 | ''' 64 | assert attention_type in ('dot', 'bilinear') 65 | if attention_type == 'dot': 66 | assert Q.shape[-1] == K.shape[-1] 67 | 68 | Q_time = Q.shape[1] 69 | K_time = K.shape[1] 70 | 71 | if attention_type == 'dot': 72 | logits = dot_sim(Q, K) #[batch, Q_time, time] 73 | if attention_type == 'bilinear': 74 | logits = bilinear_sim(Q, K) 75 | 76 | if is_mask: 77 | _mask = mask(Q_lengths, K_lengths, Q_time, K_time) #[batch, Q_time, K_time] 78 | logits = _mask * logits + (1 - _mask) * mask_value 79 | 80 | attention = tf.nn.softmax(logits) 81 | 82 | if drop_prob is not None: 83 | print('use attention drop') 84 | attention = tf.nn.dropout(attention, drop_prob) 85 | 86 | return weighted_sum(attention, V) 87 | 88 | def dot_sim(x, y, is_nor=True): 89 | '''calculate dot similarity with two tensor. 90 | 91 | Args: 92 | x: a tensor with shape [batch, time_x, dimension] 93 | y: a tensor with shape [batch, time_y, dimension] 94 | 95 | Returns: 96 | a tensor with shape [batch, time_x, time_y] 97 | Raises: 98 | AssertionError: if 99 | the shapes of x and y are not match. 100 | ''' 101 | assert x.shape[-1] == y.shape[-1] 102 | 103 | sim = tf.einsum('bik,bjk->bij', x, y) 104 | 105 | if is_nor: 106 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32)) 107 | scale = tf.maximum(1.0, scale) 108 | return sim / scale 109 | else: 110 | return result 111 | 112 | def bilinear_sim(x, y, is_nor=True): 113 | '''calculate bilinear similarity with two tensor. 114 | Args: 115 | x: a tensor with shape [batch, time_x, dimension_x] 116 | y: a tensor with shape [batch, time_y, dimension_y] 117 | 118 | Returns: 119 | a tensor with shape [batch, time_x, time_y] 120 | Raises: 121 | ValueError: if 122 | the shapes of x and y are not match; 123 | bilinear matrix reuse error. 
124 | ''' 125 | M = tf.get_variable( 126 | name="bilinear_matrix", 127 | shape=[x.shape[-1], y.shape[-1]], 128 | dtype=tf.float32, 129 | initializer=tf.orthogonal_initializer()) 130 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y) 131 | 132 | if is_nor: 133 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32)) 134 | scale = tf.maximum(1.0, scale) 135 | return sim / scale 136 | else: 137 | return sim 138 | 139 | def mask(row_lengths, col_lengths, max_row_length, max_col_length): 140 | '''Return a mask tensor representing the first N positions of each row and each column. 141 | 142 | Args: 143 | row_lengths: a tensor with shape [batch] 144 | col_lengths: a tensor with shape [batch] 145 | 146 | Returns: 147 | a mask tensor with shape [batch, max_row_length, max_col_length] 148 | 149 | Raises: 150 | ''' 151 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #bool, [batch, max_row_len] 152 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #bool, [batch, max_col_len] 153 | 154 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32) 155 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32) 156 | 157 | return tf.einsum('bik,bjk->bij', row_mask, col_mask) 158 | 159 | def weighted_sum(weight, values): 160 | '''Calcualte the weighted sum. 161 | 162 | Args: 163 | weight: a tensor with shape [batch, time, dimension] 164 | values: a tensor with shape [batch, dimension, values_dimension] 165 | 166 | Return: 167 | a tensor with shape [batch, time, values_dimension] 168 | 169 | Raises: 170 | ''' 171 | return tf.einsum('bij,bjk->bik', weight, values) 172 | 173 | def layer_norm_debug(x, axis = None, epsilon=1e-6): 174 | '''Add layer normalization. 175 | 176 | Args: 177 | x: a tensor 178 | axis: the dimensions to normalize 179 | 180 | Returns: 181 | a tensor the same shape as x. 182 | 183 | Raises: 184 | ''' 185 | if axis is None: 186 | axis = [-1] 187 | shape = [x.shape[i] for i in axis] 188 | 189 | scale = tf.get_variable( 190 | name='scale', 191 | shape=shape, 192 | dtype=tf.float32, 193 | initializer=tf.ones_initializer()) 194 | bias = tf.get_variable( 195 | name='bias', 196 | shape=shape, 197 | dtype=tf.float32, 198 | initializer=tf.zeros_initializer()) 199 | 200 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 201 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 202 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 203 | return scale * norm + bias 204 | 205 | def FFN(x, out_dimension_0=None, out_dimension_1=None): 206 | '''Add two dense connected layer, max(0, x*W0+b0)*W1+b1. 207 | 208 | Args: 209 | x: a tensor with shape [batch, time, dimension] 210 | out_dimension: a number which is the output dimension 211 | 212 | Returns: 213 | a tensor with shape [batch, time, out_dimension] 214 | 215 | Raises: 216 | ''' 217 | with tf.variable_scope('FFN_1'): 218 | y = dense(x, out_dimension_0) 219 | y = tf.nn.relu(y) 220 | with tf.variable_scope('FFN_2'): 221 | z = dense(y, out_dimension_1) #, add_bias=False) #!!!! 222 | return z 223 | 224 | def dense(x, out_dimension=None, add_bias=True): 225 | '''Add dense connected layer, Wx + b. 
226 | 227 | Args: 228 | x: a tensor with shape [batch, time, dimension] 229 | out_dimension: a number which is the output dimension 230 | 231 | Return: 232 | a tensor with shape [batch, time, out_dimension] 233 | 234 | Raises: 235 | ''' 236 | if out_dimension is None: 237 | out_dimension = x.shape[-1] 238 | 239 | W = tf.get_variable( 240 | name='weights', 241 | shape=[x.shape[-1], out_dimension], 242 | dtype=tf.float32, 243 | initializer=tf.orthogonal_initializer()) 244 | if add_bias: 245 | bias = tf.get_variable( 246 | name='bias', 247 | shape=[1], 248 | dtype=tf.float32, 249 | initializer=tf.zeros_initializer()) 250 | return tf.einsum('bik,kj->bij', x, W) + bias 251 | else: 252 | return tf.einsum('bik,kj->bij', x, W) 253 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/transformer_block.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # main function 4 | def block( 5 | Q, K, V, 6 | Q_lengths, K_lengths, 7 | attention_type='dot', 8 | is_layer_norm=True, 9 | is_mask=True, mask_value=-2**32+1, 10 | drop_prob=None): 11 | '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf. 12 | Args: 13 | Q: a tensor with shape [batch, Q_time, Q_dimension] 14 | K: a tensor with shape [batch, time, K_dimension] 15 | V: a tensor with shape [batch, time, V_dimension] 16 | 17 | Q_length: a tensor with shape [batch] 18 | K_length: a tensor with shape [batch] 19 | 20 | Returns: 21 | a tensor with shape [batch, time, dimension] 22 | ''' 23 | att = attention(Q, K, V, 24 | Q_lengths, K_lengths, 25 | attention_type=attention_type, 26 | is_mask=is_mask, mask_value=mask_value, 27 | drop_prob=drop_prob) 28 | if is_layer_norm: 29 | with tf.variable_scope('attention_layer_norm'): 30 | y = layer_norm_debug(Q + att) 31 | else: 32 | y = Q + att 33 | 34 | z = FFN(y) 35 | if is_layer_norm: 36 | with tf.variable_scope('FFN_layer_norm'): 37 | w = layer_norm_debug(y + z) 38 | else: 39 | w = y + z 40 | return w 41 | 42 | def attention( 43 | Q, K, V, 44 | Q_lengths, K_lengths, 45 | attention_type='dot', 46 | is_mask=True, mask_value=-2**32+1, 47 | drop_prob=None): 48 | '''Add attention layer. 49 | Args: 50 | Q: a tensor with shape [batch, Q_time, Q_dimension] 51 | K: a tensor with shape [batch, time, K_dimension] 52 | V: a tensor with shape [batch, time, V_dimension] 53 | 54 | Q_length: a tensor with shape [batch] 55 | K_length: a tensor with shape [batch] 56 | 57 | Returns: 58 | a tensor with shape [batch, Q_time, V_dimension] 59 | 60 | Raises: 61 | AssertionError: if 62 | Q_dimension not equal to K_dimension when attention type is dot. 63 | ''' 64 | assert attention_type in ('dot', 'bilinear') 65 | if attention_type == 'dot': 66 | assert Q.shape[-1] == K.shape[-1] 67 | 68 | Q_time = Q.shape[1] 69 | K_time = K.shape[1] 70 | 71 | if attention_type == 'dot': 72 | logits = dot_sim(Q, K) #[batch, Q_time, time] 73 | if attention_type == 'bilinear': 74 | logits = bilinear_sim(Q, K) 75 | 76 | if is_mask: 77 | _mask = mask(Q_lengths, K_lengths, Q_time, K_time) #[batch, Q_time, K_time] 78 | logits = _mask * logits + (1 - _mask) * mask_value 79 | 80 | attention = tf.nn.softmax(logits) 81 | 82 | if drop_prob is not None: 83 | print('use attention drop') 84 | attention = tf.nn.dropout(attention, drop_prob) 85 | 86 | return weighted_sum(attention, V) 87 | 88 | def dot_sim(x, y, is_nor=True): 89 | '''calculate dot similarity with two tensor. 
90 | 91 | Args: 92 | x: a tensor with shape [batch, time_x, dimension] 93 | y: a tensor with shape [batch, time_y, dimension] 94 | 95 | Returns: 96 | a tensor with shape [batch, time_x, time_y] 97 | Raises: 98 | AssertionError: if 99 | the shapes of x and y are not match. 100 | ''' 101 | assert x.shape[-1] == y.shape[-1] 102 | 103 | sim = tf.einsum('bik,bjk->bij', x, y) 104 | 105 | if is_nor: 106 | scale = tf.sqrt(tf.cast(x.shape[-1], tf.float32)) 107 | scale = tf.maximum(1.0, scale) 108 | return sim / scale 109 | else: 110 | return result 111 | 112 | def bilinear_sim(x, y, is_nor=True): 113 | '''calculate bilinear similarity with two tensor. 114 | Args: 115 | x: a tensor with shape [batch, time_x, dimension_x] 116 | y: a tensor with shape [batch, time_y, dimension_y] 117 | 118 | Returns: 119 | a tensor with shape [batch, time_x, time_y] 120 | Raises: 121 | ValueError: if 122 | the shapes of x and y are not match; 123 | bilinear matrix reuse error. 124 | ''' 125 | M = tf.get_variable( 126 | name="bilinear_matrix", 127 | shape=[x.shape[-1], y.shape[-1]], 128 | dtype=tf.float32, 129 | initializer=tf.orthogonal_initializer()) 130 | sim = tf.einsum('bik,kl,bjl->bij', x, M, y) 131 | 132 | if is_nor: 133 | scale = tf.sqrt(tf.cast(x.shape[-1] * y.shape[-1], tf.float32)) 134 | scale = tf.maximum(1.0, scale) 135 | return sim / scale 136 | else: 137 | return sim 138 | 139 | def mask(row_lengths, col_lengths, max_row_length, max_col_length): 140 | '''Return a mask tensor representing the first N positions of each row and each column. 141 | 142 | Args: 143 | row_lengths: a tensor with shape [batch] 144 | col_lengths: a tensor with shape [batch] 145 | 146 | Returns: 147 | a mask tensor with shape [batch, max_row_length, max_col_length] 148 | 149 | Raises: 150 | ''' 151 | row_mask = tf.sequence_mask(row_lengths, max_row_length) #bool, [batch, max_row_len] 152 | col_mask = tf.sequence_mask(col_lengths, max_col_length) #bool, [batch, max_col_len] 153 | 154 | row_mask = tf.cast(tf.expand_dims(row_mask, -1), tf.float32) 155 | col_mask = tf.cast(tf.expand_dims(col_mask, -1), tf.float32) 156 | 157 | return tf.einsum('bik,bjk->bij', row_mask, col_mask) 158 | 159 | def weighted_sum(weight, values): 160 | '''Calcualte the weighted sum. 161 | 162 | Args: 163 | weight: a tensor with shape [batch, time, dimension] 164 | values: a tensor with shape [batch, dimension, values_dimension] 165 | 166 | Return: 167 | a tensor with shape [batch, time, values_dimension] 168 | 169 | Raises: 170 | ''' 171 | return tf.einsum('bij,bjk->bik', weight, values) 172 | 173 | def layer_norm_debug(x, axis = None, epsilon=1e-6): 174 | '''Add layer normalization. 175 | 176 | Args: 177 | x: a tensor 178 | axis: the dimensions to normalize 179 | 180 | Returns: 181 | a tensor the same shape as x. 182 | 183 | Raises: 184 | ''' 185 | if axis is None: 186 | axis = [-1] 187 | shape = [x.shape[i] for i in axis] 188 | 189 | scale = tf.get_variable( 190 | name='scale', 191 | shape=shape, 192 | dtype=tf.float32, 193 | initializer=tf.ones_initializer()) 194 | bias = tf.get_variable( 195 | name='bias', 196 | shape=shape, 197 | dtype=tf.float32, 198 | initializer=tf.zeros_initializer()) 199 | 200 | mean = tf.reduce_mean(x, axis=axis, keep_dims=True) 201 | variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True) 202 | norm = (x-mean) * tf.rsqrt(variance + epsilon) 203 | return scale * norm + bias 204 | 205 | def FFN(x, out_dimension_0=None, out_dimension_1=None): 206 | '''Add two dense connected layer, max(0, x*W0+b0)*W1+b1. 
207 | 208 | Args: 209 | x: a tensor with shape [batch, time, dimension] 210 | out_dimension: a number which is the output dimension 211 | 212 | Returns: 213 | a tensor with shape [batch, time, out_dimension] 214 | 215 | Raises: 216 | ''' 217 | with tf.variable_scope('FFN_1'): 218 | y = dense(x, out_dimension_0) 219 | y = tf.nn.relu(y) 220 | with tf.variable_scope('FFN_2'): 221 | z = dense(y, out_dimension_1) #, add_bias=False) #!!!! 222 | return z 223 | 224 | def dense(x, out_dimension=None, add_bias=True): 225 | '''Add dense connected layer, Wx + b. 226 | 227 | Args: 228 | x: a tensor with shape [batch, time, dimension] 229 | out_dimension: a number which is the output dimension 230 | 231 | Return: 232 | a tensor with shape [batch, time, out_dimension] 233 | 234 | Raises: 235 | ''' 236 | if out_dimension is None: 237 | out_dimension = x.shape[-1] 238 | 239 | W = tf.get_variable( 240 | name='weights', 241 | shape=[x.shape[-1], out_dimension], 242 | dtype=tf.float32, 243 | initializer=tf.orthogonal_initializer()) 244 | if add_bias: 245 | bias = tf.get_variable( 246 | name='bias', 247 | shape=[1], 248 | dtype=tf.float32, 249 | initializer=tf.zeros_initializer()) 250 | return tf.einsum('bik,kj->bij', x, W) + bias 251 | else: 252 | return tf.einsum('bik,kj->bij', x, W) 253 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/model_Transformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import transformer_block 4 | 5 | FLAGS = tf.flags.FLAGS 6 | 7 | def get_embeddings(vocab): 8 | print("get_embedding") 9 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 10 | return tf.constant(initializer, name="word_embedding") 11 | 12 | def get_char_embedding(charVocab): 13 | print("get_char_embedding") 14 | char_size = len(charVocab) 15 | embeddings = np.zeros((char_size, char_size), dtype='float32') 16 | for i in range(1, char_size): 17 | embeddings[i, i] = 1.0 18 | 19 | return tf.constant(embeddings, name="word_char_embedding") 20 | 21 | def load_embed_vectors(fname, dim): 22 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 23 | vectors = {} 24 | for line in open(fname, 'rt'): 25 | items = line.strip().split(' ') 26 | if len(items[0]) <= 0: 27 | continue 28 | vec = [float(items[i]) for i in range(1, dim+1)] 29 | vectors[items[0]] = vec 30 | 31 | return vectors 32 | 33 | def load_word_embeddings(vocab, dim): 34 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 35 | vocab_size = len(vocab) 36 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 37 | for word, code in vocab.items(): 38 | if word in vectors: 39 | embeddings[code] = vectors[word] 40 | #else: 41 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 42 | 43 | return embeddings 44 | 45 | def cnn_layer(inputs, filter_sizes, num_filters, scope=None, scope_reuse=False): 46 | with tf.variable_scope(scope, reuse=scope_reuse): 47 | input_size = inputs.get_shape()[2].value 48 | 49 | outputs = [] 50 | for i, filter_size in enumerate(filter_sizes): 51 | with tf.variable_scope("conv_{}".format(i)): 52 | w = tf.get_variable("w", [filter_size, input_size, num_filters]) 53 | b = tf.get_variable("b", [num_filters]) 54 | conv = tf.nn.conv1d(inputs, w, stride=1, padding="VALID") # [num_words, num_chars - filter_size, num_filters] 55 | h = tf.nn.relu(tf.nn.bias_add(conv, b)) # [num_words, num_chars - filter_size, num_filters] 56 | pooled = tf.reduce_max(h, 1) # [num_words, num_filters] 57 | outputs.append(pooled) 58 | return tf.concat(outputs, 1) # [num_words, num_filters * len(filter_sizes)] 59 | 60 | 61 | class Transformer(object): 62 | def __init__( 63 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 64 | 65 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context") 66 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len") 67 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona") 68 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len") 69 | 70 | self.target = tf.placeholder(tf.float32, [None], name="target") 71 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 72 | 73 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char") 74 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len") 75 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char") 76 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len") 77 | 78 | l2_loss = tf.constant(1.0) 79 | 80 | # =============================== Embedding layer =============================== 81 | # 1. 
word embedding 82 | with tf.name_scope("embedding"): 83 | W = get_embeddings(vocab) 84 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim] 85 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim] 86 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob) 87 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob) 88 | print("context_embedded: {}".format(context_embedded.get_shape())) 89 | print("persona_embedded: {}".format(persona_embedded.get_shape())) 90 | 91 | 92 | # =============================== Encoding layer =============================== 93 | emb_dim = context_embedded.get_shape()[-1].value 94 | 95 | # with tf.variable_scope("encoding_layer") as vs: 96 | # # CNN encoder 97 | # final_context = cnn_layer(context_embedded, filter_sizes=[3, 4, 5], num_filters=100, scope="CNN_emb", scope_reuse=False) # [batch_size*max_utter_num, emb] 98 | # final_persona = cnn_layer(persona_embedded, filter_sizes=[3, 4, 5], num_filters=100, scope="CNN_emb", scope_reuse=True) # [batch_size*max_profile_num, emb] 99 | # print("establish CNN encoder") 100 | 101 | context_input = context_embedded 102 | for layer in range(num_layer): 103 | with tf.variable_scope("encoding_layer_{}".format(layer)): 104 | context_output = transformer_block.block(context_input, context_input, context_input, self.context_len, self.context_len) 105 | context_input = context_output 106 | 107 | persona_input = persona_embedded 108 | for layer in range(num_layer): 109 | with tf.variable_scope("encoding_layer_{}".format(layer), reuse=True): # [batch_size, max_context_len, word_dim] 110 | persona_output = transformer_block.block(persona_input, persona_input, persona_input, self.persona_len, self.persona_len) 111 | persona_input = persona_output 112 | print("context_output: {}".format(context_output.get_shape())) # [batch_size, max_persona_len, word_dim] 113 | print("persona_output: {}".format(persona_output.get_shape())) 114 | print("establish {}-layer Transformer encoder".format(num_layer)) 115 | 116 | 117 | # =============================== Matching layer =============================== 118 | with tf.variable_scope("matching_layer") as vs: 119 | mask_c = tf.sequence_mask(self.context_len, max_context_len, dtype=tf.float32) # [batch_size, max_context_len] 120 | context_output = context_output * tf.expand_dims(mask_c, 2) # [batch_size, max_context_len, dim] 121 | final_context = tf.reduce_sum(context_output, axis=1) # [batch_size, dim] 122 | 123 | mask_p = tf.sequence_mask(self.persona_len, max_persona_len, dtype=tf.float32) # [batch_size, max_persona_len] 124 | persona_output = persona_output * tf.expand_dims(mask_p, 2) # [batch_size, max_persona_len, dim] 125 | final_persona = tf.reduce_sum(persona_output, axis=1) # [batch_size, dim] 126 | 127 | output_dim = final_context.get_shape()[-1].value 128 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 129 | 130 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim] 131 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ] 132 | print("shape of similarity: {}".format(similarity.get_shape())) 133 | 134 | 135 | # =============================== Prediction layer =============================== 136 | with tf.variable_scope("prediction_layer") as vs: 137 | logits = similarity 138 | self.probs = 
tf.sigmoid(logits, name="prob") # [batch_size, ] 139 | 140 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 141 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 142 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 143 | 144 | with tf.name_scope("accuracy"): 145 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 146 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 147 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/model_BiLSTM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... } 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | 45 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False): 46 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 47 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 48 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob) 49 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 50 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob) 51 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, 52 | inputs=inputs, 53 | sequence_length=input_seq_len, 54 | dtype=tf.float32) 55 | return rnn_outputs, rnn_states 56 | 57 | def multi_lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, num_layer, scope, scope_reuse=False): 58 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 59 | multi_outputs = [] 60 | multi_states = [] 61 | cur_inputs = inputs 62 | for i_layer in range(num_layer): 63 | rnn_outputs, rnn_states = lstm_layer(cur_inputs, input_seq_len, rnn_size, dropout_keep_prob, scope+str(i_layer), scope_reuse) 64 | rnn_outputs = tf.concat(values=rnn_outputs, axis=2) 65 | multi_outputs.append(rnn_outputs) 66 | multi_states.append(rnn_states) 67 | cur_inputs = rnn_outputs 68 | 69 | # multi_layer_aggregation 70 | ml_weights = 
tf.nn.softmax(tf.get_variable("ml_scores", [num_layer, ], initializer=tf.constant_initializer(0.0))) 71 | 72 | multi_outputs = tf.stack(multi_outputs, axis=-1) # [batch_size, max_len, 2*rnn_size(400), num_layer] 73 | max_len = multi_outputs.get_shape()[1].value 74 | dim = multi_outputs.get_shape()[2].value 75 | flattened_multi_outputs = tf.reshape(multi_outputs, [-1, num_layer]) # [batch_size * max_len * 2*rnn_size(400), num_layer] 76 | aggregated_ml_outputs = tf.matmul(flattened_multi_outputs, tf.expand_dims(ml_weights, 1)) # [batch_size * max_len * 2*rnn_size(400), 1] 77 | aggregated_ml_outputs = tf.reshape(aggregated_ml_outputs, [-1, max_len, dim]) # [batch_size , max_len , 2*rnn_size(400)] 78 | 79 | return aggregated_ml_outputs 80 | 81 | 82 | class BiLSTM(object): 83 | def __init__( 84 | self, max_context_len, max_persona_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 85 | 86 | self.context = tf.placeholder(tf.int32, [None, max_context_len], name="context") 87 | self.context_len = tf.placeholder(tf.int32, [None], name="context_len") 88 | self.persona = tf.placeholder(tf.int32, [None, max_persona_len], name="persona") 89 | self.persona_len = tf.placeholder(tf.int32, [None], name="persona_len") 90 | 91 | self.target = tf.placeholder(tf.float32, [None], name="target") 92 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 93 | 94 | self.c_charVec = tf.placeholder(tf.int32, [None, max_context_len, maxWordLength], name="context_char") 95 | self.c_charLen = tf.placeholder(tf.int32, [None, max_context_len], name="context_char_len") 96 | self.p_charVec = tf.placeholder(tf.int32, [None, max_persona_len, maxWordLength], name="persona_char") 97 | self.p_charLen = tf.placeholder(tf.int32, [None, max_persona_len], name="persona_char_len") 98 | 99 | l2_loss = tf.constant(1.0) 100 | 101 | # =============================== Embedding layer =============================== 102 | with tf.name_scope("embedding"): 103 | W = get_embeddings(vocab) 104 | context_embedded = tf.nn.embedding_lookup(W, self.context) # [batch_size, max_context_len, word_dim] 105 | persona_embedded = tf.nn.embedding_lookup(W, self.persona) # [batch_size, max_persona_len, word_dim] 106 | context_embedded = tf.nn.dropout(context_embedded, keep_prob=self.dropout_keep_prob) 107 | persona_embedded = tf.nn.dropout(persona_embedded, keep_prob=self.dropout_keep_prob) 108 | print("context_embedded: {}".format(context_embedded.get_shape())) 109 | print("persona_embedded: {}".format(persona_embedded.get_shape())) 110 | 111 | 112 | # =============================== Encoding layer =============================== 113 | with tf.variable_scope("encoding_layer") as vs: 114 | rnn_scope_name = "bidirectional_rnn" 115 | # 1. single_lstm_layer 116 | c_rnn_output, c_rnn_states = lstm_layer(context_embedded, self.context_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False) 117 | context_output = tf.concat(axis=2, values=c_rnn_output) # [batch_size, max_context_len, rnn_size*2] 118 | p_rnn_output, p_rnn_states = lstm_layer(persona_embedded, self.persona_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) # [batch_size, max_profile_len, rnn_size(200)] 119 | persona_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size, max_persona_len, rnn_size*2] 120 | # 2. 
multi_lstm_layer 121 | # utterances_output = multi_lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=False) 122 | # response_output = multi_lstm_layer(flattened_responses_embedded, flattened_responses_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=True) 123 | # print("establish AHRE layers : {}".format(num_layer)) 124 | print("establish BiLSTM encoder") 125 | 126 | 127 | # =============================== Matching layer =============================== 128 | with tf.variable_scope("matching_layer") as vs: 129 | final_context = tf.concat(axis=1, values=[c_rnn_states[0].h, c_rnn_states[1].h]) # [batch_size, rnn_size*2] 130 | final_persona = tf.concat(axis=1, values=[p_rnn_states[0].h, p_rnn_states[1].h]) # [batch_size, rnn_size*2] 131 | 132 | output_dim = final_context.get_shape()[-1].value 133 | A_matrix = tf.get_variable('A_matrix_v', shape=[output_dim, output_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 134 | 135 | similarity = tf.matmul(final_context, A_matrix) # [batch_size, dim] 136 | similarity = tf.reduce_sum(similarity * final_persona, axis=-1) # [batch_size, ] 137 | print("shape of similarity: {}".format(similarity.get_shape())) 138 | 139 | 140 | # =============================== Prediction layer =============================== 141 | with tf.variable_scope("prediction_layer") as vs: 142 | logits = similarity 143 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 144 | 145 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 146 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 147 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 148 | 149 | with tf.name_scope("accuracy"): 150 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 151 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 152 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/model_Transformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import transformer_block 4 | 5 | FLAGS = tf.flags.FLAGS 6 | 7 | def get_embeddings(vocab): 8 | print("get_embedding") 9 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 10 | return tf.constant(initializer, name="word_embedding") 11 | 12 | def get_char_embedding(charVocab): 13 | print("get_char_embedding") 14 | char_size = len(charVocab) 15 | embeddings = np.zeros((char_size, char_size), dtype='float32') 16 | for i in range(1, char_size): 17 | embeddings[i, i] = 1.0 18 | 19 | return tf.constant(embeddings, name="word_char_embedding") 20 | 21 | def load_embed_vectors(fname, dim): 22 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 23 | vectors = {} 24 | for line in open(fname, 'rt'): 25 | items = line.strip().split(' ') 26 | if len(items[0]) <= 0: 27 | continue 28 | vec = [float(items[i]) for i in range(1, dim+1)] 29 | vectors[items[0]] = vec 30 | 31 | return vectors 32 | 33 | def load_word_embeddings(vocab, dim): 34 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 35 | vocab_size = len(vocab) 36 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 37 | for word, code in vocab.items(): 38 | if word in vectors: 39 | embeddings[code] = vectors[word] 40 | #else: 41 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 42 | 43 | return embeddings 44 | 45 | 46 | class Transformer(object): 47 | def __init__( 48 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 49 | 50 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances") 51 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len") 52 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num") 53 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles") 54 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len") 55 | self.profiles_num = tf.placeholder(tf.int32, [None], name="profiles_num") 56 | 57 | self.target = tf.placeholder(tf.float32, [None], name="target") 58 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 59 | 60 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char") 61 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len") 62 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char") 63 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len") 64 | 65 | l2_loss = tf.constant(1.0) 66 | 67 | 68 | # =============================== Embedding layer =============================== 69 | with tf.name_scope("embedding"): 70 | W = get_embeddings(vocab) 71 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim] 72 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim] 73 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob) 74 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob) 75 | print("utterances_embedded: {}".format(utterances_embedded.get_shape())) 76 | print("profiles_embedded: {}".format(profiles_embedded.get_shape())) 77 | 78 | 79 | # =============================== Encoding layer =============================== 80 | with tf.variable_scope("encoding_layer") as vs: 81 | rnn_scope_name = "bidirectional_rnn" 82 | emb_dim = utterances_embedded.get_shape()[-1].value 83 | flattened_utterances_embedded = tf.reshape(utterances_embedded, [-1, max_utter_len, emb_dim]) # [batch_size*max_utter_num, max_utter_len, emb] 84 | flattened_utterances_len = tf.reshape(self.utterances_len, [-1]) # [batch_size*max_utter_num, ] 85 | flattened_profiles_embedded = tf.reshape(profiles_embedded, [-1, max_profile_len, emb_dim]) # [batch_size*max_profile_num, max_profile_len, emb] 86 | 
flattened_profiles_len = tf.reshape(self.profiles_len, [-1]) # [batch_size*max_profile_num, ] 87 | 88 | utterances_input = flattened_utterances_embedded 89 | profiles_input = flattened_profiles_embedded 90 | for layer in range(num_layer): 91 | with tf.variable_scope("encoding_layer_{}".format(layer)): 92 | utterances_output = transformer_block.block(utterances_input, utterances_input, utterances_input, 93 | flattened_utterances_len, flattened_utterances_len) 94 | utterances_input = utterances_output 95 | 96 | for layer in range(num_layer): 97 | with tf.variable_scope("encoding_layer_{}".format(layer), reuse=True): 98 | profiles_output = transformer_block.block(profiles_input, profiles_input, profiles_input, 99 | flattened_profiles_len, flattened_profiles_len) 100 | profiles_input = profiles_output 101 | print("establish Transformer encoder") 102 | print("utterances_output: {}".format(utterances_output.get_shape())) 103 | print("profiles_output: {}".format(profiles_output.get_shape())) 104 | 105 | 106 | # =============================== Matching layer =============================== 107 | with tf.variable_scope("matching_layer") as vs: 108 | mask_u = tf.sequence_mask(flattened_utterances_len, max_utter_len, dtype=tf.float32) # [batch_size*max_utter_num, max_utter_len] 109 | utterances_output = utterances_output * tf.expand_dims(mask_u, 2) # [batch_size*max_utter_num, max_utter_len, dim] 110 | final_utterances = tf.reduce_sum(utterances_output, axis=1) # [batch_size*max_utter_num, dim] 111 | # final_utterances = tf.div(final_utterances, tf.expand_dims(tf.sqrt(tf.cast(flattened_utterances_len, tf.float32)), 1)) 112 | concat_dim = final_utterances.get_shape()[-1].value 113 | final_utterances = tf.reshape(final_utterances, [-1, max_utter_num, concat_dim]) # [batch_size, max_utter_num, dim] 114 | 115 | mask_p = tf.sequence_mask(flattened_profiles_len, max_profile_len, dtype=tf.float32) # [batch_size*max_profile_num, max_profile_len] 116 | profiles_output = profiles_output * tf.expand_dims(mask_p, 2) 117 | final_profiles = tf.reduce_sum(profiles_output, axis=1) 118 | # final_profiles = tf.div(final_profiles, tf.expand_dims(tf.sqrt(tf.cast(flattened_profiles_len, tf.float32)), 1)) 119 | final_profiles = tf.reshape(final_profiles, [-1, max_profile_num, concat_dim]) # [batch_size, max_profile_num, dim] 120 | 121 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 122 | similarity = tf.einsum('aij,jk->aik', 123 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim] 124 | similarity = tf.matmul(similarity, 125 | tf.transpose(final_profiles, perm=[0, 2, 1])) # [batch_size, max_utter_num, max_profile_num] 126 | 127 | print("shape of similarity: {}".format(similarity.get_shape())) 128 | 129 | 130 | # =============================== Aggregation layer =============================== 131 | with tf.variable_scope("aggregation_layer") as vs: 132 | logits = tf.reduce_max(similarity, axis=2) # [batch_size, max_utter_num] 133 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num] 134 | logits = logits * mask_u 135 | logits = tf.reduce_sum(logits, axis=1) # [batch_size, ] 136 | print("establish reduce_max across profiles and masked_reduce_sum across utterances") 137 | print("logits: {}".format(logits.get_shape())) 138 | 139 | 140 | # =============================== Prediction layer =============================== 141 | with tf.variable_scope("prediction_layer") as 
vs: 142 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 143 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 144 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 145 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 146 | 147 | with tf.name_scope("accuracy"): 148 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 149 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 150 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def load_vocab(fname): 6 | ''' 7 | vocab = {"I": 0, ...} 8 | ''' 9 | vocab={} 10 | with open(fname, 'rt') as f: 11 | for i,line in enumerate(f): 12 | word = line.strip() 13 | vocab[word] = i 14 | return vocab 15 | 16 | def load_char_vocab(fname): 17 | ''' 18 | charVocab = {"U": 0, "!": 1, ...} 19 | ''' 20 | charVocab={} 21 | with open(fname, 'rt') as f: 22 | for line in f: 23 | fields = line.strip().split('\t') 24 | char_id = int(fields[0]) 25 | ch = fields[1] 26 | charVocab[ch] = char_id 27 | return charVocab 28 | 29 | def to_vec(tokens, vocab, maxlen): 30 | ''' 31 | length: length of the input sequence 32 | vec: map the token to the vocab_id, return a varied-length array [3, 6, 4, 3, ...] 33 | ''' 34 | n = len(tokens) 35 | length = 0 36 | vec=[] 37 | for i in range(n): 38 | length += 1 39 | if tokens[i] in vocab: 40 | vec.append(vocab[tokens[i]]) 41 | else: 42 | # vec.append(vocab["fiance"]) # fix to fiance 43 | vec.append(vocab["_unk_"]) 44 | return length, np.array(vec) 45 | 46 | def load_dataset(fname, vocab, max_utter_num, max_utter_len, max_profile_num, max_profile_len): 47 | 48 | dataset=[] 49 | with open(fname, 'rt') as f: 50 | for line in f: 51 | # ( id, context utterances, persona candidates, label ) 52 | line = line.strip() 53 | fields = line.split('\t') 54 | 55 | # id 56 | us_id = fields[0] 57 | 58 | # context utterances 59 | context = fields[1] 60 | utterances = context.split(' _eos_ ') 61 | utterances = [utterance + " _eos_" for utterance in utterances] 62 | utterances = utterances[-max_utter_num:] # select the last max_utter_num utterances 63 | 64 | us_tokens = [] 65 | us_vec = [] 66 | us_len = [] 67 | for utterance in utterances: 68 | u_tokens = utterance.split(' ')[:max_utter_len] # select the head max_utter_len tokens in every utterance 69 | u_len, u_vec = to_vec(u_tokens, vocab, max_utter_len) 70 | us_tokens.append(u_tokens) 71 | us_vec.append(u_vec) 72 | us_len.append(u_len) 73 | us_num = len(utterances) 74 | 75 | # persona candidates 76 | if fields[2] != "NA": 77 | personas = fields[2].split("|") 78 | for index, persona in enumerate(personas): 79 | # ps_id = "match_" + str(index) 80 | ps_id = "1." 
+ str(index) 81 | profiles = persona.split(' _eos_ ') 82 | profiles = [profile + " _eos_" for profile in profiles] 83 | profiles = profiles[-max_profile_num:] # select the last max_utter_num utterances 84 | ps_tokens = [] 85 | ps_vec = [] 86 | ps_len = [] 87 | for profile in profiles: 88 | p_tokens = profile.split(' ')[:max_profile_len] # select the head max_profile_len tokens in every persona 89 | p_len, p_vec = to_vec(p_tokens, vocab, max_profile_len) 90 | ps_tokens.append(p_tokens) 91 | ps_vec.append(p_vec) 92 | ps_len.append(p_len) 93 | ps_num = len(profiles) 94 | dataset.append((us_id, us_tokens, us_vec, us_len, us_num, 1.0, ps_id, ps_tokens, ps_vec, ps_len, ps_num)) 95 | 96 | if fields[3] != "NA": 97 | personas = fields[3].split("|") 98 | for index, persona in enumerate(personas): 99 | # ps_id = "mismatch_" + str(index) 100 | ps_id = "0." + str(index) 101 | profiles = persona.split(' _eos_ ') 102 | profiles = [profile + " _eos_" for profile in profiles] 103 | profiles = profiles[-max_profile_num:] 104 | ps_tokens = [] 105 | ps_vec = [] 106 | ps_len = [] 107 | for profile in profiles: 108 | p_tokens = profile.split(' ')[:max_profile_len] 109 | p_len, p_vec = to_vec(p_tokens, vocab, max_profile_len) 110 | ps_tokens.append(p_tokens) 111 | ps_vec.append(p_vec) 112 | ps_len.append(p_len) 113 | ps_num = len(profiles) 114 | dataset.append((us_id, us_tokens, us_vec, us_len, us_num, 0.0, ps_id, ps_tokens, ps_vec, ps_len, ps_num)) 115 | 116 | return dataset 117 | 118 | 119 | def normalize_vec(vec, maxlen): 120 | ''' 121 | pad the original vec to the same maxlen 122 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0] 123 | ''' 124 | if len(vec) == maxlen: 125 | return vec 126 | 127 | new_vec = np.zeros(maxlen, dtype='int32') 128 | for i in range(len(vec)): 129 | new_vec[i] = vec[i] 130 | return new_vec 131 | 132 | 133 | def charVec(tokens, charVocab, maxlen, maxWordLength): 134 | ''' 135 | chars = np.array( (maxlen, maxWordLength) ) 0 if not found in charVocab or None 136 | word_lengths = np.array( maxlen ) 1 if None 137 | ''' 138 | n = len(tokens) 139 | if n > maxlen: 140 | n = maxlen 141 | 142 | chars = np.zeros((maxlen, maxWordLength), dtype=np.int32) 143 | word_lengths = np.ones(maxlen, dtype=np.int32) 144 | for i in range(n): 145 | token = tokens[i][:maxWordLength] 146 | word_lengths[i] = len(token) 147 | row = chars[i] 148 | for idx, ch in enumerate(token): 149 | if ch in charVocab: 150 | row[idx] = charVocab[ch] 151 | 152 | return chars, word_lengths 153 | 154 | 155 | def batch_iter(data, batch_size, num_epochs, max_utter_num, max_utter_len, max_profile_num, max_profile_len, 156 | charVocab, max_word_length, shuffle=True): 157 | """ 158 | Generates a batch iterator for a dataset. 
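    Each yielded batch is a tuple:
    (utterances, utterances_len, utterances_num,
     profiles, profiles_len, profiles_num,
     labels, id_pairs,
     utterances_char, utterances_char_len, profiles_char, profiles_char_len),
    where the sequence tensors are numpy arrays zero-padded to the configured
    maximum lengths and id_pairs is a list of (utterances_id, profiles_id, label) tuples.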
159 | """ 160 | data_size = len(data) 161 | num_batches_per_epoch = int(len(data)/batch_size) + 1 162 | for epoch in range(num_epochs): 163 | # Shuffle the data at each epoch 164 | if shuffle: 165 | random.shuffle(data) 166 | for batch_num in range(num_batches_per_epoch): 167 | start_index = batch_num * batch_size 168 | end_index = min((batch_num + 1) * batch_size, data_size) 169 | 170 | x_utterances = [] 171 | x_utterances_len = [] 172 | x_utterances_num = [] 173 | x_profiles = [] 174 | x_profiles_len = [] 175 | x_profiles_num = [] 176 | 177 | x_labels = [] 178 | x_id_pairs = [] 179 | 180 | x_utterances_char = [] 181 | x_utterances_char_len = [] 182 | x_profiles_char = [] 183 | x_profiles_char_len = [] 184 | 185 | for rowIdx in range(start_index, end_index): 186 | us_id, us_tokens, us_vec, us_len, us_num, label, ps_id, ps_tokens, ps_vec, ps_len, ps_num = data[rowIdx] 187 | 188 | # normalize us_vec and us_len 189 | new_utters_vec = np.zeros((max_utter_num, max_utter_len), dtype='int32') 190 | new_utters_len = np.zeros((max_utter_num, ), dtype='int32') 191 | for i in range(len(us_len)): 192 | new_utter_vec = normalize_vec(us_vec[i], max_utter_len) 193 | new_utters_vec[i] = new_utter_vec 194 | new_utters_len[i] = us_len[i] 195 | x_utterances.append(new_utters_vec) 196 | x_utterances_len.append(new_utters_len) 197 | x_utterances_num.append(us_num) 198 | 199 | # normalize ps_vec and ps_len 200 | new_profiles_vec = np.zeros((max_profile_num, max_profile_len), dtype='int32') 201 | new_profiles_len = np.zeros((max_profile_num, ), dtype='int32') 202 | for i in range(len(ps_len)): 203 | new_profile_vec = normalize_vec(ps_vec[i], max_profile_len) 204 | new_profiles_vec[i] = new_profile_vec 205 | new_profiles_len[i] = ps_len[i] 206 | x_profiles.append(new_profiles_vec) 207 | x_profiles_len.append(new_profiles_len) 208 | x_profiles_num.append(ps_num) 209 | 210 | x_labels.append(label) 211 | x_id_pairs.append((us_id, ps_id, int(label))) 212 | 213 | # normalize us_CharVec and us_CharLen 214 | uttersCharVec = np.zeros((max_utter_num, max_utter_len, max_word_length), dtype='int32') 215 | uttersCharLen = np.ones((max_utter_num, max_utter_len), dtype='int32') 216 | for i in range(len(us_len)): 217 | utterCharVec, utterCharLen = charVec(us_tokens[i], charVocab, max_utter_len, max_word_length) 218 | uttersCharVec[i] = utterCharVec 219 | uttersCharLen[i] = utterCharLen 220 | x_utterances_char.append(uttersCharVec) 221 | x_utterances_char_len.append(uttersCharLen) 222 | 223 | # normalize ps_CharVec and ps_CharLen 224 | psCharVec = np.zeros((max_profile_num, max_profile_len, max_word_length), dtype='int32') 225 | psCharLen = np.ones((max_profile_num, max_profile_len), dtype='int32') 226 | for i in range(len(ps_len)): 227 | pCharVec, pCharLen = charVec(ps_tokens[i], charVocab, max_profile_len, max_word_length) 228 | psCharVec[i] = pCharVec 229 | psCharLen[i] = pCharLen 230 | x_profiles_char.append(psCharVec) 231 | x_profiles_char_len.append(psCharLen) 232 | 233 | yield np.array(x_utterances), np.array(x_utterances_len), np.array(x_utterances_num), \ 234 | np.array(x_profiles), np.array(x_profiles_len), np.array(x_profiles_num), \ 235 | np.array(x_labels), x_id_pairs, \ 236 | np.array(x_utterances_char), np.array(x_utterances_char_len), np.array(x_profiles_char), np.array(x_profiles_char_len) 237 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/C2P-X/train.py: -------------------------------------------------------------------------------- 1 | import 
tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | from collections import defaultdict 8 | import metrics 9 | import data_helpers 10 | from model_BOW import BOW as MODEL 11 | # from model_BiLSTM import BiLSTM as MODEL 12 | # from model_Transformer import Transformer as MODEL 13 | # from model_ESIM import ESIM as MODEL 14 | 15 | 16 | # Files 17 | tf.flags.DEFINE_string("train_file", "", "path to train file") 18 | tf.flags.DEFINE_string("valid_file", "", "path to valid file") 19 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 20 | tf.flags.DEFINE_string("char_vocab_file", "", "path to char vocab file") 21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector") 22 | 23 | # Model Hyperparameters 24 | tf.flags.DEFINE_integer("max_context_len", 150, "max context length") 25 | tf.flags.DEFINE_integer("max_persona_len", 50, "max persona length") 26 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 27 | tf.flags.DEFINE_integer("num_layer", 1, "num of layers in sentence encoder") 28 | tf.flags.DEFINE_integer("embedding_dim", 200, "dimensionality of word embedding") 29 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units") 30 | 31 | # Training parameters 32 | tf.flags.DEFINE_integer("batch_size", 128, "batch size (default: 128)") 33 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0)") 34 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "dropout keep probability (default: 1.0)") 35 | tf.flags.DEFINE_integer("num_epochs", 1000000, "number of training epochs (default: 1000000)") 36 | tf.flags.DEFINE_integer("evaluate_every", 1000, "evaluate model on valid dataset after this many steps (default: 1000)") 37 | 38 | # Misc Parameters 39 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 40 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 41 | 42 | FLAGS = tf.flags.FLAGS 43 | # FLAGS._parse_flags() 44 | # print("\nParameters:") 45 | # for attr, value in sorted(FLAGS.__flags.items()): 46 | # print("{}={}".format(attr.upper(), value)) 47 | print("") 48 | 49 | # Load data 50 | print("Loading data...") 51 | 52 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 53 | print('vocabulary size: {}'.format(len(vocab))) 54 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 55 | 56 | train_dataset = data_helpers.load_dataset(FLAGS.train_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len) 57 | print('train_pairs: {}'.format(len(train_dataset))) 58 | valid_dataset = data_helpers.load_dataset(FLAGS.valid_file, vocab, FLAGS.max_context_len, FLAGS.max_persona_len) 59 | print('valid_pairs: {}'.format(len(valid_dataset))) 60 | 61 | with tf.Graph().as_default(): 62 | session_conf = tf.ConfigProto( 63 | allow_soft_placement=FLAGS.allow_soft_placement, 64 | log_device_placement=FLAGS.log_device_placement) 65 | sess = tf.Session(config=session_conf) 66 | with sess.as_default(): 67 | model = MODEL( 68 | max_context_len=FLAGS.max_context_len, 69 | max_persona_len=FLAGS.max_persona_len, 70 | num_layer=FLAGS.num_layer, 71 | vocab_size=len(vocab), 72 | embedding_size=FLAGS.embedding_dim, 73 | vocab=vocab, 74 | rnn_size=FLAGS.rnn_size, 75 | maxWordLength=FLAGS.max_word_length, 76 | charVocab=charVocab, 77 | l2_reg_lambda=FLAGS.l2_reg_lambda) 78 | # Define Training procedure 79 | global_step = tf.Variable(0, name="global_step", trainable=False) 80 | 
starter_learning_rate = 0.001 81 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 82 | 5000, 0.96, staircase=True) 83 | optimizer = tf.train.AdamOptimizer(learning_rate) 84 | grads_and_vars = optimizer.compute_gradients(model.mean_loss) 85 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 86 | 87 | # Keep track of gradient values and sparsity (optional) 88 | """ 89 | grad_summaries = [] 90 | for g, v in grads_and_vars: 91 | if g is not None: 92 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 93 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 94 | grad_summaries.append(grad_hist_summary) 95 | grad_summaries.append(sparsity_summary) 96 | grad_summaries_merged = tf.merge_summary(grad_summaries) 97 | """ 98 | 99 | # Output directory for models and summaries 100 | timestamp = str(int(time.time())) 101 | # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 102 | out_dir = os.path.abspath(os.path.join("../output", timestamp)) 103 | print("Writing to {}\n".format(out_dir)) 104 | 105 | # Summaries for loss and accuracy 106 | """ 107 | loss_summary = tf.scalar_summary("loss", model.mean_loss) 108 | acc_summary = tf.scalar_summary("accuracy", model.accuracy) 109 | 110 | # Train Summaries 111 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 112 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 113 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def) 114 | 115 | # Dev summaries 116 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 117 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 118 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def) 119 | """ 120 | 121 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 122 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 123 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 124 | if not os.path.exists(checkpoint_dir): 125 | os.makedirs(checkpoint_dir) 126 | saver = tf.train.Saver(tf.global_variables()) 127 | 128 | # Initialize all variables 129 | sess.run(tf.global_variables_initializer()) 130 | 131 | def train_step(x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, 132 | x_context_char, x_context_char_len, x_persona_char, x_persona_char_len): 133 | """ 134 | A single training step 135 | """ 136 | feed_dict = { 137 | model.context: x_context, 138 | model.context_len: x_context_len, 139 | model.persona: x_persona, 140 | model.persona_len: x_persona_len, 141 | model.target: x_labels, 142 | model.dropout_keep_prob: FLAGS.dropout_keep_prob, 143 | model.c_charVec: x_context_char, 144 | model.c_charLen: x_context_char_len, 145 | model.p_charVec: x_persona_char, 146 | model.p_charLen: x_persona_char_len 147 | } 148 | 149 | _, step, loss, accuracy, predicted_prob = sess.run( 150 | [train_op, global_step, model.mean_loss, model.accuracy, model.probs], 151 | feed_dict) 152 | 153 | if step%100 == 0: 154 | time_str = datetime.datetime.now().isoformat() 155 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 156 | #train_summary_writer.add_summary(summaries, step) 157 | 158 | 159 | def dev_step(): 160 | results = defaultdict(list) 161 | num_test = 0 162 | num_correct = 0.0 163 | valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=True) 164 | for valid_batch in valid_batches: 165 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = valid_batch 166 | feed_dict = { 167 | model.context: x_context, 168 | model.context_len: x_context_len, 169 | model.persona: x_persona, 170 | model.persona_len: x_persona_len, 171 | model.target: x_labels, 172 | model.dropout_keep_prob: 1.0, 173 | model.c_charVec: x_context_char, 174 | model.c_charLen: x_context_char_len, 175 | model.p_charVec: x_persona_char, 176 | model.p_charLen: x_persona_char_len 177 | } 178 | batch_accuracy, predicted_prob = sess.run([model.accuracy, model.probs], feed_dict) 179 | num_test += len(predicted_prob) 180 | if num_test % 1000 == 0: 181 | print(num_test) 182 | 183 | num_correct += len(predicted_prob) * batch_accuracy 184 | for i, prob_score in enumerate(predicted_prob): 185 | utterances_id, profiles_id, label = x_id_pairs[i] 186 | results[utterances_id].append((profiles_id, label, prob_score)) 187 | 188 | #calculate top-1 precision 189 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct/num_test)) 190 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 191 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 192 | 193 | mvp = metrics.mean_average_precision(results) 194 | mrr = metrics.mean_reciprocal_rank(results) 195 | top_1_precision = metrics.top_1_precision(results) 196 | total_valid_query = metrics.get_num_valid_query(results) 197 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 198 | 199 | return mrr 200 | 201 | best_mrr = 
0.0 202 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, FLAGS.max_context_len, FLAGS.max_persona_len, charVocab, FLAGS.max_word_length, shuffle=True) 203 | for batch in batches: 204 | x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len = batch 205 | train_step(x_context, x_context_len, x_persona, x_persona_len, x_labels, x_id_pairs, x_context_char, x_context_char_len, x_persona_char, x_persona_char_len) 206 | current_step = tf.train.global_step(sess, global_step) 207 | if current_step % FLAGS.evaluate_every == 0: 208 | print("\nEvaluation:") 209 | valid_mrr = dev_step() 210 | if valid_mrr > best_mrr: 211 | best_mrr = valid_mrr 212 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 213 | print("Saved model checkpoint to {}\n".format(path)) 214 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/model_BiLSTM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | 11 | def get_char_embedding(charVocab): 12 | print("get_char_embedding") 13 | char_size = len(charVocab) 14 | embeddings = np.zeros((char_size, char_size), dtype='float32') 15 | for i in range(1, char_size): 16 | embeddings[i, i] = 1.0 17 | 18 | return tf.constant(embeddings, name="word_char_embedding") 19 | 20 | def load_embed_vectors(fname, dim): 21 | # vectors = { 'the': [0.2911, 0.3288, 0.2002,...], ... 
} 22 | vectors = {} 23 | for line in open(fname, 'rt'): 24 | items = line.strip().split(' ') 25 | if len(items[0]) <= 0: 26 | continue 27 | vec = [float(items[i]) for i in range(1, dim+1)] 28 | vectors[items[0]] = vec 29 | 30 | return vectors 31 | 32 | def load_word_embeddings(vocab, dim): 33 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 34 | vocab_size = len(vocab) 35 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 36 | for word, code in vocab.items(): 37 | if word in vectors: 38 | embeddings[code] = vectors[word] 39 | #else: 40 | # embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 41 | 42 | return embeddings 43 | 44 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False): 45 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 46 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 47 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob) 48 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 49 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob) 50 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, 51 | inputs=inputs, 52 | sequence_length=input_seq_len, 53 | dtype=tf.float32) 54 | return rnn_outputs, rnn_states 55 | 56 | def multi_lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, num_layer, scope, scope_reuse=False): 57 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 58 | multi_outputs = [] 59 | multi_states = [] 60 | cur_inputs = inputs 61 | for i_layer in range(num_layer): 62 | rnn_outputs, rnn_states = lstm_layer(cur_inputs, input_seq_len, rnn_size, dropout_keep_prob, scope+str(i_layer), scope_reuse) 63 | rnn_outputs = tf.concat(values=rnn_outputs, axis=2) 64 | multi_outputs.append(rnn_outputs) 65 | multi_states.append(rnn_states) 66 | cur_inputs = rnn_outputs 67 | 68 | # multi_layer_aggregation 69 | ml_weights = tf.nn.softmax(tf.get_variable("ml_scores", [num_layer, ], initializer=tf.constant_initializer(0.0))) 70 | 71 | multi_outputs = tf.stack(multi_outputs, axis=-1) # [batch_size, max_len, 2*rnn_size(400), num_layer] 72 | max_len = multi_outputs.get_shape()[1].value 73 | dim = multi_outputs.get_shape()[2].value 74 | flattened_multi_outputs = tf.reshape(multi_outputs, [-1, num_layer]) # [batch_size * max_len * 2*rnn_size(400), num_layer] 75 | aggregated_ml_outputs = tf.matmul(flattened_multi_outputs, tf.expand_dims(ml_weights, 1)) # [batch_size * max_len * 2*rnn_size(400), 1] 76 | aggregated_ml_outputs = tf.reshape(aggregated_ml_outputs, [-1, max_len, dim]) # [batch_size , max_len , 2*rnn_size(400)] 77 | 78 | return aggregated_ml_outputs 79 | 80 | 81 | class BiLSTM(object): 82 | def __init__( 83 | self, max_utter_num, max_utter_len, max_profile_num, max_profile_len, num_layer, vocab_size, embedding_size, vocab, rnn_size, maxWordLength, charVocab, l2_reg_lambda=0.0): 84 | 85 | self.utterances = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances") 86 | self.utterances_len = tf.placeholder(tf.int32, [None, max_utter_num], name="utterances_len") 87 | self.utterances_num = tf.placeholder(tf.int32, [None], name="utterances_num") 88 | self.profiles = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles") 89 | self.profiles_len = tf.placeholder(tf.int32, [None, max_profile_num], name="profiles_len") 90 | self.profiles_num = 
tf.placeholder(tf.int32, [None], name="profiles_num") 91 | 92 | self.target = tf.placeholder(tf.float32, [None], name="target") 93 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 94 | 95 | self.u_charVec = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len, maxWordLength], name="utterances_char") 96 | self.u_charLen = tf.placeholder(tf.int32, [None, max_utter_num, max_utter_len], name="utterances_char_len") 97 | self.p_charVec = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len, maxWordLength], name="profiles_char") 98 | self.p_charLen = tf.placeholder(tf.int32, [None, max_profile_num, max_profile_len], name="profiles_char_len") 99 | 100 | l2_loss = tf.constant(1.0) 101 | 102 | 103 | # =============================== Embedding layer =============================== 104 | with tf.name_scope("embedding"): 105 | W = get_embeddings(vocab) 106 | utterances_embedded = tf.nn.embedding_lookup(W, self.utterances) # [batch_size, max_utter_num, max_utter_len, word_dim] 107 | profiles_embedded = tf.nn.embedding_lookup(W, self.profiles) # [batch_size, max_profile_num, max_profile_len, word_dim] 108 | utterances_embedded = tf.nn.dropout(utterances_embedded, keep_prob=self.dropout_keep_prob) 109 | profiles_embedded = tf.nn.dropout(profiles_embedded, keep_prob=self.dropout_keep_prob) 110 | print("utterances_embedded: {}".format(utterances_embedded.get_shape())) 111 | print("profiles_embedded: {}".format(profiles_embedded.get_shape())) 112 | 113 | 114 | # =============================== Encoding layer =============================== 115 | with tf.variable_scope("encoding_layer") as vs: 116 | rnn_scope_name = "bidirectional_rnn" 117 | emb_dim = utterances_embedded.get_shape()[-1].value 118 | flattened_utterances_embedded = tf.reshape(utterances_embedded, [-1, max_utter_len, emb_dim]) # [batch_size*max_utter_num, max_utter_len, emb] 119 | flattened_utterances_len = tf.reshape(self.utterances_len, [-1]) # [batch_size*max_utter_num, ] 120 | flattened_profiles_embedded = tf.reshape(profiles_embedded, [-1, max_profile_len, emb_dim]) # [batch_size*max_profile_num, max_profile_len, emb] 121 | flattened_profiles_len = tf.reshape(self.profiles_len, [-1]) # [batch_size*max_profile_num, ] 122 | # 1. single_lstm_layer 123 | u_rnn_output, u_rnn_states = lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False) 124 | utterances_output = tf.concat(axis=2, values=u_rnn_output) # [batch_size*max_utter_num, max_utter_len, rnn_size*2] 125 | p_rnn_output, p_rnn_states = lstm_layer(flattened_profiles_embedded, flattened_profiles_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) # [batch_size, max_profile_len, rnn_size(200)] 126 | profiles_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size*max_profile_num, max_profile_len, 2*rnn_size(400)] 127 | # 2. 
multi_lstm_layer 128 | # utterances_output = multi_lstm_layer(flattened_utterances_embedded, flattened_utterances_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=False) 129 | # response_output = multi_lstm_layer(flattened_responses_embedded, flattened_responses_len, rnn_size, self.dropout_keep_prob, num_layer, rnn_scope_name, scope_reuse=True) 130 | # print("establish AHRE layers : {}".format(num_layer)) 131 | print("establish BiLSTM encoder") 132 | 133 | 134 | # =============================== Matching layer =============================== 135 | with tf.variable_scope("matching_layer") as vs: 136 | final_utterances = tf.concat(axis=1, values=[u_rnn_states[0].h, u_rnn_states[1].h]) 137 | concat_dim = final_utterances.get_shape()[-1].value 138 | final_utterances = tf.reshape(final_utterances, [-1, max_utter_num, concat_dim]) # [batch_size, max_utter_num, dim] 139 | 140 | final_profiles = tf.concat(axis=1, values=[p_rnn_states[0].h, p_rnn_states[1].h]) 141 | final_profiles = tf.reshape(final_profiles, [-1, max_profile_num, concat_dim]) # [batch_size, max_profile_num, dim] 142 | 143 | A_matrix = tf.get_variable('A_matrix_v', shape=[concat_dim, concat_dim], initializer=tf.orthogonal_initializer(), dtype=tf.float32) 144 | similarity = tf.einsum('aij,jk->aik', 145 | final_utterances, A_matrix) # [batch_size, max_utter_num, dim] 146 | similarity = tf.matmul(similarity, 147 | tf.transpose(final_profiles, perm=[0, 2, 1]), 148 | name="similarity") # [batch_size, max_utter_num, max_profile_num] 149 | 150 | print("shape of similarity: {}".format(similarity.get_shape())) 151 | print("establish matching between utterances and profiles") 152 | 153 | 154 | # =============================== Aggregation layer =============================== 155 | with tf.variable_scope("aggregation_layer") as vs: 156 | logits = tf.reduce_max(similarity, axis=2, name="logits_1") # [batch_size, max_utter_num] 157 | mask_u = tf.sequence_mask(self.utterances_num, max_utter_num, dtype=tf.float32) # [batch_size, max_utter_num] 158 | logits = logits * mask_u 159 | logits = tf.reduce_sum(logits, axis=1, name="logits_2") # [batch_size, ] 160 | print("establish reduce_max across profiles and masked_reduce_sum across utterances") 161 | print("logits: {}".format(logits.get_shape())) 162 | 163 | 164 | # =============================== Prediction layer =============================== 165 | with tf.variable_scope("prediction_layer") as vs: 166 | self.probs = tf.sigmoid(logits, name="prob") # [batch_size, ] 167 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.target) 168 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 169 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 170 | 171 | with tf.name_scope("accuracy"): 172 | correct_prediction = tf.equal(tf.sign(self.probs - 0.5), tf.sign(self.target - 0.5)) # [batch_size, ] 173 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 174 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import operator 23 | from time import time 24 | from collections import defaultdict 25 | import tensorflow as tf 26 | import optimization 27 | import tokenization 28 | import modeling as modeling 29 | import metrics 30 | 31 | 32 | flags = tf.flags 33 | FLAGS = flags.FLAGS 34 | 35 | ## Required parameters 36 | flags.DEFINE_string( 37 | "test_dir", 'valid.tfrecord', 38 | "The input test data dir. Should contain the .tsv files (or other data files) for the task.") 39 | 40 | flags.DEFINE_string( 41 | "restore_model_dir", 'output/', 42 | "The output directory where the model checkpoints have been written.") 43 | 44 | flags.DEFINE_string( 45 | "task_name", 'TestModel', 46 | "The name of the task.") 47 | 48 | flags.DEFINE_string( 49 | "bert_config_file", 'uncased_L-12_H-768_A-12/bert_config.json', 50 | "The config json file corresponding to the pre-trained BERT model. " 51 | "This specifies the model architecture.") 52 | 53 | flags.DEFINE_integer( 54 | "max_seq_length", 320, 55 | "The maximum total input sequence length after WordPiece tokenization. " 56 | "Sequences longer than this will be truncated, and sequences shorter " 57 | "than this will be padded.") 58 | 59 | flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.") 60 | 61 | flags.DEFINE_integer("eval_batch_size", 32, "Total batch size for predict.") 62 | 63 | 64 | def print_configuration_op(FLAGS): 65 | print('My Configurations:') 66 | for name, value in FLAGS.__flags.items(): 67 | value=value.value 68 | if type(value) == float: 69 | print(' %s:\t %f'%(name, value)) 70 | elif type(value) == int: 71 | print(' %s:\t %d'%(name, value)) 72 | elif type(value) == str: 73 | print(' %s:\t %s'%(name, value)) 74 | elif type(value) == bool: 75 | print(' %s:\t %s'%(name, value)) 76 | else: 77 | print('%s:\t %s' % (name, value)) 78 | print('End of configuration') 79 | 80 | 81 | def total_sample(file_name): 82 | sample_nums = 0 83 | for record in tf.python_io.tf_record_iterator(file_name): 84 | sample_nums += 1 85 | return sample_nums 86 | 87 | 88 | def parse_exmp(serial_exmp): 89 | input_data = tf.parse_single_example(serial_exmp, 90 | features={ 91 | "text_a_id": 92 | tf.FixedLenFeature([], tf.int64), 93 | "text_b_id": 94 | tf.FixedLenFeature([], tf.int64), 95 | "input_ids": 96 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 97 | "input_mask": 98 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 99 | "segment_ids": 100 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 101 | "label_ids": 102 | tf.FixedLenFeature([], tf.float32) 103 | } 104 | ) 105 | # So cast all int64 to int32. 
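    # (tf.train.Example only supports int64 integer features, so the parsed
    #  tensors are cast down to int32 in the loop below.)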
106 | for name in list(input_data.keys()): 107 | t = input_data[name] 108 | if t.dtype == tf.int64: 109 | t = tf.to_int32(t) 110 | input_data[name] = t 111 | 112 | text_a_id = input_data["text_a_id"] 113 | text_b_id = input_data['text_b_id'] 114 | input_ids = input_data["input_ids"] 115 | input_mask = input_data["input_mask"] 116 | segment_ids= input_data["segment_ids"] 117 | labels = input_data['label_ids'] 118 | return text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels 119 | 120 | 121 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, text_a_id, text_b_id, 122 | num_labels, use_one_hot_embeddings): 123 | """Creates a classification model.""" 124 | model = modeling.BertModel( 125 | config=bert_config, 126 | is_training=is_training, 127 | input_ids=input_ids, 128 | input_mask=input_mask, 129 | token_type_ids=segment_ids, 130 | use_one_hot_embeddings=use_one_hot_embeddings) 131 | 132 | # In the demo, we are doing a simple classification task on the entire 133 | # segment. 134 | # 135 | # If you want to use the token-level output, use model.get_sequence_output() 136 | # instead. 137 | target_loss_weight = [1.0, 1.0] 138 | target_loss_weight = tf.convert_to_tensor(target_loss_weight) 139 | 140 | flagx = tf.cast(tf.greater(labels, 0), dtype=tf.float32) 141 | flagy = tf.cast(tf.equal(labels, 0), dtype=tf.float32) 142 | 143 | all_target_loss = target_loss_weight[1] * flagx + target_loss_weight[0] * flagy 144 | 145 | output_layer = model.get_pooled_output() 146 | 147 | hidden_size = output_layer.shape[-1].value 148 | 149 | output_weights = tf.get_variable( 150 | "output_weights", [num_labels, hidden_size], 151 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 152 | 153 | output_bias = tf.get_variable( 154 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 155 | 156 | with tf.variable_scope("loss"): 157 | # if is_training: 158 | # # I.e., 0.1 dropout 159 | # output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 160 | output_layer = tf.layers.dropout(output_layer, rate=0.1, training=is_training) 161 | 162 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 163 | logits = tf.nn.bias_add(logits, output_bias) 164 | 165 | probabilities = tf.sigmoid(logits, name="prob") 166 | logits = tf.squeeze(logits,[1]) 167 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels) 168 | losses = tf.multiply(losses, all_target_loss) 169 | 170 | mean_loss = tf.reduce_mean(losses, name="mean_loss") + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 171 | 172 | with tf.name_scope("accuracy"): 173 | correct_prediction = tf.equal(tf.sign(probabilities - 0.5), tf.sign(labels - 0.5)) 174 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 175 | # 176 | # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 177 | # 178 | # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 179 | # loss = tf.reduce_mean(per_example_loss) 180 | 181 | return mean_loss, logits, probabilities, accuracy, model, output_layer 182 | 183 | 184 | best_score = 0.0 185 | def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer): 186 | results = defaultdict(list) 187 | num_test = 0 188 | num_correct = 0.0 189 | n_updates = 0 190 | mrr = 0 191 | t0 = time() 192 | try: 193 | while True: 194 | n_updates += 1 195 | 196 | batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False}) 197 | 
question_id, answer_id, label = pair_ 198 | 199 | num_test += len(predicted_prob) 200 | # if num_test % 1000 == 0: 201 | # print(num_test) 202 | 203 | num_correct += len(predicted_prob) * batch_accuracy 204 | for i, prob_score in enumerate(predicted_prob): 205 | results[question_id[i]].append((answer_id[i], label[i], prob_score[0])) 206 | 207 | if n_updates%100 == 0: 208 | tf.logging.info("n_update %d , %s: Mins Used: %.2f" % 209 | (n_updates, op_name, (time() - t0) / 60.0)) 210 | 211 | except tf.errors.OutOfRangeError: 212 | 213 | print("Inference Time: {} s".format(time() - t0)) 214 | 215 | # calculate top-1 precision 216 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test)) 217 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 218 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 219 | 220 | mvp = metrics.mean_average_precision(results) 221 | mrr = metrics.mean_reciprocal_rank(results) 222 | top_1_precision = metrics.top_1_precision(results) 223 | total_valid_query = metrics.get_num_valid_query(results) 224 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format( 225 | mvp, mrr, top_1_precision, total_valid_query)) 226 | 227 | out_path = os.path.join(dir_path, "output_test.txt") 228 | print("Saving evaluation to {}".format(out_path)) 229 | with open(out_path, 'w') as f: 230 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 231 | for us_id, v in results.items(): 232 | v.sort(key=operator.itemgetter(2), reverse=True) 233 | for i, rec in enumerate(v): 234 | r_id, label, prob_score = rec 235 | rank = i+1 236 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label)) 237 | return mrr 238 | 239 | 240 | def main(_): 241 | tf.logging.set_verbosity(tf.logging.INFO) 242 | 243 | print_configuration_op(FLAGS) 244 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 245 | 246 | test_data_size = total_sample(FLAGS.test_dir) 247 | tf.logging.info('test data size: {}'.format(test_data_size)) 248 | 249 | filenames = tf.placeholder(tf.string, shape=[None]) 250 | shuffle_size = tf.placeholder(tf.int64) 251 | dataset = tf.data.TFRecordDataset(filenames) 252 | dataset = dataset.map(parse_exmp) # Parse the record into tensors. 
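    # Evaluation pipeline: a single pass over the test TFRecord (repeat(1)),
    # no shuffling, batched by eval_batch_size; run_test() consumes the
    # initializable iterator until tf.errors.OutOfRangeError is raised.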
253 | dataset = dataset.repeat(1) 254 | # dataset = dataset.shuffle(shuffle_size) 255 | dataset = dataset.batch(FLAGS.eval_batch_size) 256 | iterator = dataset.make_initializable_iterator() 257 | text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels = iterator.get_next() # output dir 258 | pair_ids = [text_a_id, text_b_id, labels] 259 | 260 | training = tf.placeholder(tf.bool) 261 | mean_loss, logits, probabilities, accuracy, model, output_layer = create_model(bert_config, 262 | is_training = training, 263 | input_ids = input_ids, 264 | input_mask = input_mask, 265 | segment_ids = segment_ids, 266 | labels = labels, 267 | text_a_id = text_a_id, 268 | text_b_id = text_b_id, 269 | num_labels = 1, 270 | use_one_hot_embeddings = False) 271 | 272 | 273 | config = tf.ConfigProto(allow_soft_placement=True) 274 | config.gpu_options.allow_growth = True 275 | 276 | if FLAGS.do_eval: 277 | with tf.Session(config=config) as sess: 278 | tf.logging.info("*** Restore model ***") 279 | 280 | ckpt = tf.train.get_checkpoint_state(FLAGS.restore_model_dir) 281 | variables = tf.trainable_variables() 282 | saver = tf.train.Saver(variables) 283 | saver.restore(sess, ckpt.model_checkpoint_path) 284 | 285 | tf.logging.info('Test begin') 286 | sess.run(iterator.initializer, 287 | feed_dict={filenames: [FLAGS.test_dir], shuffle_size: 1}) 288 | run_test(FLAGS.restore_model_dir, "test", sess, training, accuracy, probabilities, pair_ids, output_layer) 289 | 290 | 291 | if __name__ == "__main__": 292 | 293 | tf.app.run() 294 | -------------------------------------------------------------------------------- /Non-Pretraining-Based/U2P-X/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | import operator 7 | from collections import defaultdict 8 | import metrics 9 | import data_helpers 10 | from model_BOW import BOW as MODEL 11 | # from model_BiLSTM import BiLSTM as MODEL 12 | # from model_Transformer import Transformer as MODEL 13 | # from model_ESIM import ESIM as MODEL 14 | 15 | 16 | # Files 17 | tf.flags.DEFINE_string("train_file", "", "path to train file") 18 | tf.flags.DEFINE_string("valid_file", "", "path to valid file") 19 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file") 20 | tf.flags.DEFINE_string("char_vocab_file", "", "path to char vocab file") 21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector") 22 | 23 | # Model Hyperparameters 24 | tf.flags.DEFINE_integer("max_utter_num", 8, "max utterance number") 25 | tf.flags.DEFINE_integer("max_utter_len", 20, "max utterance length") 26 | tf.flags.DEFINE_integer("max_profile_num", 5, "max profile number") 27 | tf.flags.DEFINE_integer("max_profile_len", 15, "max profile length") 28 | tf.flags.DEFINE_integer("max_word_length", 18, "max word length") 29 | tf.flags.DEFINE_integer("num_layer", 1, "num of layers in sentence encoder") 30 | tf.flags.DEFINE_integer("embedding_dim", 200, "dimensionality of word embedding") 31 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units") 32 | 33 | # Training parameters 34 | tf.flags.DEFINE_integer("batch_size", 128, "batch size (default: 128)") 35 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0)") 36 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "dropout keep probability (default: 1.0)") 37 | tf.flags.DEFINE_integer("num_epochs", 1000000, "number of training epochs (default: 1000000)") 38 
| tf.flags.DEFINE_integer("evaluate_every", 1000, "evaluate model on valid dataset after this many steps (default: 1000)") 39 | 40 | # Misc Parameters 41 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 42 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 43 | 44 | FLAGS = tf.flags.FLAGS 45 | # FLAGS._parse_flags() 46 | # print("\nParameters:") 47 | # for attr, value in sorted(FLAGS.__flags.items()): 48 | # print("{}={}".format(attr.upper(), value)) 49 | print("") 50 | 51 | # Load data 52 | print("Loading data...") 53 | 54 | vocab = data_helpers.load_vocab(FLAGS.vocab_file) 55 | print('vocabulary size: {}'.format(len(vocab))) 56 | charVocab = data_helpers.load_char_vocab(FLAGS.char_vocab_file) 57 | 58 | train_dataset = data_helpers.load_dataset(FLAGS.train_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len) 59 | print('train_pairs: {}'.format(len(train_dataset))) 60 | valid_dataset = data_helpers.load_dataset(FLAGS.valid_file, vocab, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len) 61 | print('valid_pairs: {}'.format(len(valid_dataset))) 62 | 63 | with tf.Graph().as_default(): 64 | session_conf = tf.ConfigProto( 65 | allow_soft_placement=FLAGS.allow_soft_placement, 66 | log_device_placement=FLAGS.log_device_placement) 67 | sess = tf.Session(config=session_conf) 68 | with sess.as_default(): 69 | model = MODEL( 70 | max_utter_num=FLAGS.max_utter_num, 71 | max_utter_len=FLAGS.max_utter_len, 72 | max_profile_num=FLAGS.max_profile_num, 73 | max_profile_len=FLAGS.max_profile_len, 74 | num_layer=FLAGS.num_layer, 75 | vocab_size=len(vocab), 76 | embedding_size=FLAGS.embedding_dim, 77 | vocab=vocab, 78 | rnn_size=FLAGS.rnn_size, 79 | maxWordLength=FLAGS.max_word_length, 80 | charVocab=charVocab, 81 | l2_reg_lambda=FLAGS.l2_reg_lambda) 82 | # Define Training procedure 83 | global_step = tf.Variable(0, name="global_step", trainable=False) 84 | starter_learning_rate = 0.001 85 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 86 | 5000, 0.96, staircase=True) 87 | optimizer = tf.train.AdamOptimizer(learning_rate) 88 | grads_and_vars = optimizer.compute_gradients(model.mean_loss) 89 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 90 | 91 | # Keep track of gradient values and sparsity (optional) 92 | """ 93 | grad_summaries = [] 94 | for g, v in grads_and_vars: 95 | if g is not None: 96 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 97 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 98 | grad_summaries.append(grad_hist_summary) 99 | grad_summaries.append(sparsity_summary) 100 | grad_summaries_merged = tf.merge_summary(grad_summaries) 101 | """ 102 | 103 | # Output directory for models and summaries 104 | timestamp = str(int(time.time())) 105 | # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 106 | out_dir = os.path.abspath(os.path.join("../output", timestamp)) 107 | print("Writing to {}\n".format(out_dir)) 108 | 109 | # Summaries for loss and accuracy 110 | """ 111 | loss_summary = tf.scalar_summary("loss", model.mean_loss) 112 | acc_summary = tf.scalar_summary("accuracy", model.accuracy) 113 | 114 | # Train Summaries 115 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 116 | train_summary_dir = os.path.join(out_dir, 
"summaries", "train") 117 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def) 118 | 119 | # Dev summaries 120 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 121 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 122 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def) 123 | """ 124 | 125 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 126 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 127 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 128 | if not os.path.exists(checkpoint_dir): 129 | os.makedirs(checkpoint_dir) 130 | saver = tf.train.Saver(tf.global_variables()) 131 | 132 | # Initialize all variables 133 | sess.run(tf.global_variables_initializer()) 134 | # ===================================================================================== 135 | # tvars = tf.trainable_variables() 136 | # para_total = 0 137 | # print 'All parameters:' 138 | # for i in xrange(len(tvars)): 139 | # print tvars[i].name 140 | # print tvars[i].get_shape() 141 | # if tvars[i].get_shape().ndims==1: 142 | # para_total += int(tvars[i].get_shape()[0]) 143 | # else: 144 | # para_total += int(tvars[i].get_shape()[0])*int(tvars[i].get_shape()[1]) 145 | # print 'Total Parameter Numbers: {}.'.format(para_total) 146 | # ===================================================================================== 147 | 148 | 149 | def train_step(x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, 150 | x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len): 151 | """ 152 | A single training step 153 | """ 154 | feed_dict = { 155 | model.utterances: x_utterances, 156 | model.utterances_len: x_utterances_len, 157 | model.utterances_num: x_utterances_num, 158 | model.profiles: x_profiles, 159 | model.profiles_len: x_profiles_len, 160 | model.profiles_num: x_profiles_num, 161 | model.target: x_labels, 162 | model.dropout_keep_prob: FLAGS.dropout_keep_prob, 163 | model.u_charVec: x_utterances_char, 164 | model.u_charLen: x_utterances_char_len, 165 | model.p_charVec: x_profiles_char, 166 | model.p_charLen: x_profiles_char_len 167 | } 168 | 169 | _, step, loss, accuracy, predicted_prob = sess.run( 170 | [train_op, global_step, model.mean_loss, model.accuracy, model.probs], 171 | feed_dict) 172 | 173 | if step%100 == 0: 174 | time_str = datetime.datetime.now().isoformat() 175 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 176 | #train_summary_writer.add_summary(summaries, step) 177 | 178 | 179 | def dev_step(): 180 | # t0 = time.time() 181 | results = defaultdict(list) 182 | num_test = 0 183 | num_correct = 0.0 184 | valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=True) 185 | for valid_batch in valid_batches: 186 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = valid_batch 187 | feed_dict = { 188 | model.utterances: x_utterances, 189 | model.utterances_len: x_utterances_len, 190 | model.utterances_num: x_utterances_num, 191 | model.profiles: x_profiles, 192 | model.profiles_len: x_profiles_len, 193 | model.profiles_num: x_profiles_num, 194 | 
model.target: x_labels, 195 | model.dropout_keep_prob: 1.0, 196 | model.u_charVec: x_utterances_char, 197 | model.u_charLen: x_utterances_char_len, 198 | model.p_charVec: x_profiles_char, 199 | model.p_charLen: x_profiles_char_len 200 | } 201 | batch_accuracy, predicted_prob = sess.run([model.accuracy, model.probs], feed_dict) 202 | num_test += len(predicted_prob) 203 | if num_test % 1000 == 0: 204 | print(num_test) 205 | 206 | num_correct += len(predicted_prob) * batch_accuracy 207 | for i, prob_score in enumerate(predicted_prob): 208 | utterances_id, profiles_id, label = x_ids[i] 209 | results[utterances_id].append((profiles_id, label, prob_score)) 210 | 211 | # print("Validation Time: {} s".format(time.time() - t0)) 212 | 213 | #calculate top-1 precision 214 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct/num_test)) 215 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 216 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 217 | 218 | mvp = metrics.mean_average_precision(results) 219 | mrr = metrics.mean_reciprocal_rank(results) 220 | top_1_precision = metrics.top_1_precision(results) 221 | total_valid_query = metrics.get_num_valid_query(results) 222 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(mvp, mrr, top_1_precision, total_valid_query)) 223 | 224 | return mrr 225 | 226 | best_mrr = 0.0 227 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, FLAGS.max_utter_num, FLAGS.max_utter_len, FLAGS.max_profile_num, FLAGS.max_profile_len, charVocab, FLAGS.max_word_length, shuffle=True) 228 | for batch in batches: 229 | x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len = batch 230 | train_step(x_utterances, x_utterances_len, x_utterances_num, x_profiles, x_profiles_len, x_profiles_num, x_labels, x_ids, x_utterances_char, x_utterances_char_len, x_profiles_char, x_profiles_char_len) 231 | current_step = tf.train.global_step(sess, global_step) 232 | if current_step % FLAGS.evaluate_every == 0: 233 | print("\nEvaluation:") 234 | valid_mrr = dev_step() 235 | if valid_mrr > best_mrr: 236 | best_mrr = valid_mrr 237 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 238 | print("Saved model checkpoint to {}\n".format(path)) 239 | -------------------------------------------------------------------------------- /Pretraining-Based/U2P-BERT/test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
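#
# Overview: this script evaluates a fine-tuned U2P-BERT checkpoint. It restores
# the model from --restore_model_dir, streams the TFRecord given by --test_dir
# through a single-pass tf.data pipeline, reports accuracy plus the ranking
# metrics implemented in metrics.py (MAP, MRR, top-1 precision), and writes the
# per-query ranked candidates to output_test.txt in the checkpoint directory.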
15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import operator 23 | from time import time 24 | from collections import defaultdict 25 | import tensorflow as tf 26 | import optimization 27 | import tokenization 28 | import modeling as modeling 29 | import metrics 30 | 31 | 32 | flags = tf.flags 33 | FLAGS = flags.FLAGS 34 | 35 | ## Required parameters 36 | flags.DEFINE_string( 37 | "test_dir", 'valid.tfrecord', 38 | "The input test data dir. Should contain the .tsv files (or other data files) for the task.") 39 | 40 | flags.DEFINE_string( 41 | "restore_model_dir", 'output/', 42 | "The output directory where the model checkpoints have been written.") 43 | 44 | flags.DEFINE_string( 45 | "task_name", 'TestModel', 46 | "The name of the task.") 47 | 48 | flags.DEFINE_string( 49 | "bert_config_file", 'uncased_L-12_H-768_A-12/bert_config.json', 50 | "The config json file corresponding to the pre-trained BERT model. " 51 | "This specifies the model architecture.") 52 | 53 | flags.DEFINE_integer( 54 | "max_seq_length", 320, 55 | "The maximum total input sequence length after WordPiece tokenization. " 56 | "Sequences longer than this will be truncated, and sequences shorter " 57 | "than this will be padded.") 58 | 59 | flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.") 60 | 61 | flags.DEFINE_integer("eval_batch_size", 32, "Total batch size for predict.") 62 | 63 | max_sentence_a_num=8 64 | max_sentence_a_len=20 65 | max_sentence_b_num=5 66 | max_sentence_b_len=15 67 | 68 | 69 | def print_configuration_op(FLAGS): 70 | print('My Configurations:') 71 | for name, value in FLAGS.__flags.items(): 72 | value=value.value 73 | if type(value) == float: 74 | print(' %s:\t %f'%(name, value)) 75 | elif type(value) == int: 76 | print(' %s:\t %d'%(name, value)) 77 | elif type(value) == str: 78 | print(' %s:\t %s'%(name, value)) 79 | elif type(value) == bool: 80 | print(' %s:\t %s'%(name, value)) 81 | else: 82 | print('%s:\t %s' % (name, value)) 83 | print('End of configuration') 84 | 85 | 86 | def total_sample(file_name): 87 | sample_nums = 0 88 | for record in tf.python_io.tf_record_iterator(file_name): 89 | sample_nums += 1 90 | return sample_nums 91 | 92 | 93 | def parse_exmp(serial_exmp): 94 | input_data = tf.parse_single_example(serial_exmp, 95 | features={ 96 | "text_a_id": 97 | tf.FixedLenFeature([], tf.int64), 98 | "text_b_id": 99 | tf.FixedLenFeature([], tf.int64), 100 | "input_ids": 101 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 102 | "input_mask": 103 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 104 | "segment_ids": 105 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 106 | "label_ids": 107 | tf.FixedLenFeature([], tf.float32) 108 | } 109 | ) 110 | # So cast all int64 to int32. 
111 | for name in list(input_data.keys()): 112 | t = input_data[name] 113 | if t.dtype == tf.int64: 114 | t = tf.to_int32(t) 115 | input_data[name] = t 116 | 117 | text_a_id = input_data["text_a_id"] 118 | text_b_id = input_data['text_b_id'] 119 | input_ids = input_data["input_ids"] 120 | input_mask = input_data["input_mask"] 121 | segment_ids= input_data["segment_ids"] 122 | labels = input_data['label_ids'] 123 | return text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels 124 | 125 | 126 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, text_a_id, text_b_id, 127 | num_labels, use_one_hot_embeddings): 128 | """Creates a classification model.""" 129 | 130 | # print(input_ids.get_shape()) # [batch_size, max_sentence_a_num * max_sentence_b_num * (max_sentence_a_len + max_sentence_b_len)] 131 | input_ids = tf.reshape(input_ids, [-1, (max_sentence_a_len + max_sentence_b_len)]) 132 | input_mask = tf.reshape(input_mask, [-1, (max_sentence_a_len + max_sentence_b_len)]) 133 | segment_ids = tf.reshape(segment_ids, [-1, (max_sentence_a_len + max_sentence_b_len)]) 134 | # print(input_ids.get_shape()) # [batch_size * max_sentence_a_num * max_sentence_b_num, (max_sentence_a_len + max_sentence_b_len)] 135 | 136 | model = modeling.BertModel( 137 | config=bert_config, 138 | is_training=is_training, 139 | input_ids=input_ids, 140 | input_mask=input_mask, 141 | token_type_ids=segment_ids, 142 | use_one_hot_embeddings=use_one_hot_embeddings) 143 | 144 | # In the demo, we are doing a simple classification task on the entire 145 | # segment. 146 | # 147 | # If you want to use the token-level output, use model.get_sequence_output() 148 | # instead. 149 | target_loss_weight = [1.0, 1.0] 150 | target_loss_weight = tf.convert_to_tensor(target_loss_weight) 151 | 152 | flagx = tf.cast(tf.greater(labels, 0), dtype=tf.float32) 153 | flagy = tf.cast(tf.equal(labels, 0), dtype=tf.float32) 154 | 155 | all_target_loss = target_loss_weight[1] * flagx + target_loss_weight[0] * flagy 156 | 157 | output_layer = model.get_pooled_output() 158 | 159 | hidden_size = output_layer.shape[-1].value 160 | 161 | output_weights = tf.get_variable( 162 | "output_weights", [num_labels, hidden_size], 163 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 164 | 165 | output_bias = tf.get_variable( 166 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 167 | 168 | with tf.variable_scope("loss"): 169 | # if is_training: 170 | # # I.e., 0.1 dropout 171 | # output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 172 | output_layer = tf.layers.dropout(output_layer, rate=0.1, training=is_training) 173 | 174 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 175 | logits = tf.nn.bias_add(logits, output_bias) 176 | 177 | logits = tf.reshape(logits, [-1, max_sentence_a_num, max_sentence_b_num]) 178 | logits = tf.reduce_max(logits, -1) 179 | logits = tf.reduce_sum(logits, -1) 180 | logits = tf.expand_dims(logits, -1) 181 | 182 | probabilities = tf.sigmoid(logits, name="prob") 183 | logits = tf.squeeze(logits,[1]) 184 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels) 185 | losses = tf.multiply(losses, all_target_loss) 186 | 187 | mean_loss = tf.reduce_mean(losses, name="mean_loss") + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 188 | 189 | with tf.name_scope("accuracy"): 190 | correct_prediction = tf.equal(tf.sign(probabilities - 0.5), tf.sign(labels - 0.5)) 191 | accuracy = 
tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 192 | # 193 | # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 194 | # 195 | # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 196 | # loss = tf.reduce_mean(per_example_loss) 197 | 198 | return mean_loss, logits, probabilities, accuracy, model, output_layer 199 | 200 | 201 | best_score = 0.0 202 | def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer): 203 | results = defaultdict(list) 204 | num_test = 0 205 | num_correct = 0.0 206 | n_updates = 0 207 | mrr = 0 208 | t0 = time() 209 | try: 210 | while True: 211 | n_updates += 1 212 | 213 | batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False}) 214 | question_id, answer_id, label = pair_ 215 | 216 | num_test += len(predicted_prob) 217 | # if num_test % 1000 == 0: 218 | # print(num_test) 219 | 220 | num_correct += len(predicted_prob) * batch_accuracy 221 | for i, prob_score in enumerate(predicted_prob): 222 | # question_id, answer_id, label = pair_id[i] 223 | results[question_id[i]].append((answer_id[i], label[i], prob_score[0])) 224 | 225 | if n_updates%100 == 0: 226 | tf.logging.info("n_update %d , %s: Mins Used: %.2f" % 227 | (n_updates, op_name, (time() - t0) / 60.0)) 228 | 229 | except tf.errors.OutOfRangeError: 230 | 231 | print("Inference Time: {} s".format(time() - t0)) 232 | 233 | # calculate top-1 precision 234 | print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test)) 235 | accu, precision, recall, f1, loss = metrics.classification_metrics(results) 236 | print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss)) 237 | 238 | mvp = metrics.mean_average_precision(results) 239 | mrr = metrics.mean_reciprocal_rank(results) 240 | top_1_precision = metrics.top_1_precision(results) 241 | total_valid_query = metrics.get_num_valid_query(results) 242 | print('MAP (mean average precision: {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format( 243 | mvp, mrr, top_1_precision, total_valid_query)) 244 | 245 | out_path = os.path.join(dir_path, "output_test.txt") 246 | print("Saving evaluation to {}".format(out_path)) 247 | with open(out_path, 'w') as f: 248 | f.write("query_id\tdocument_id\tscore\trank\trelevance\n") 249 | for us_id, v in results.items(): 250 | v.sort(key=operator.itemgetter(2), reverse=True) 251 | for i, rec in enumerate(v): 252 | r_id, label, prob_score = rec 253 | rank = i+1 254 | f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label)) 255 | return mrr 256 | 257 | 258 | def main(_): 259 | tf.logging.set_verbosity(tf.logging.INFO) 260 | 261 | print_configuration_op(FLAGS) 262 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 263 | 264 | test_data_size = total_sample(FLAGS.test_dir) 265 | tf.logging.info('test data size: {}'.format(test_data_size)) 266 | 267 | filenames = tf.placeholder(tf.string, shape=[None]) 268 | shuffle_size = tf.placeholder(tf.int64) 269 | dataset = tf.data.TFRecordDataset(filenames) 270 | dataset = dataset.map(parse_exmp) # Parse the record into tensors. 
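    # Evaluation input pipeline: a single pass over the test TFRecord
    # (repeat(1)), no shuffling, batched by --eval_batch_size; the initializable
    # iterator is fed the test file path once the session starts.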
271 | dataset = dataset.repeat(1) 272 | # dataset = dataset.shuffle(shuffle_size) 273 | dataset = dataset.batch(FLAGS.eval_batch_size) 274 | iterator = dataset.make_initializable_iterator() 275 | text_a_id, text_b_id, input_ids, input_mask, segment_ids, labels = iterator.get_next() # output dir 276 | pair_ids = [text_a_id, text_b_id, labels] 277 | 278 | training = tf.placeholder(tf.bool) 279 | mean_loss, logits, probabilities, accuracy, model, output_layer = create_model(bert_config, 280 | is_training = training, 281 | input_ids = input_ids, 282 | input_mask = input_mask, 283 | segment_ids = segment_ids, 284 | labels = labels, 285 | text_a_id = text_a_id, 286 | text_b_id = text_b_id, 287 | num_labels = 1, 288 | use_one_hot_embeddings = False) 289 | 290 | 291 | config = tf.ConfigProto(allow_soft_placement=True) 292 | config.gpu_options.allow_growth = True 293 | 294 | if FLAGS.do_eval: 295 | with tf.Session(config=config) as sess: 296 | tf.logging.info("*** Restore model ***") 297 | 298 | ckpt = tf.train.get_checkpoint_state(FLAGS.restore_model_dir) 299 | variables = tf.trainable_variables() 300 | saver = tf.train.Saver(variables) 301 | saver.restore(sess, ckpt.model_checkpoint_path) 302 | 303 | tf.logging.info('Test begin') 304 | sess.run(iterator.initializer, 305 | feed_dict={filenames: [FLAGS.test_dir], shuffle_size: 1}) 306 | run_test(FLAGS.restore_model_dir, "test", sess, training, accuracy, probabilities, pair_ids, output_layer) 307 | 308 | 309 | if __name__ == "__main__": 310 | tf.app.run() 311 | -------------------------------------------------------------------------------- /Pretraining-Based/C2P-BERT/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 
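  # The heuristic: recover the released model name from the checkpoint path and
  # compare it against the known lowercased and cased model lists below.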
35 | 
36 |   if not init_checkpoint:
37 |     return
38 | 
39 |   m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
40 |   if m is None:
41 |     return
42 | 
43 |   model_name = m.group(1)
44 | 
45 |   lower_models = [
46 |       "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
47 |       "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
48 |   ]
49 | 
50 |   cased_models = [
51 |       "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
52 |       "multi_cased_L-12_H-768_A-12"
53 |   ]
54 | 
55 |   is_bad_config = False
56 |   if model_name in lower_models and not do_lower_case:
57 |     is_bad_config = True
58 |     actual_flag = "False"
59 |     case_name = "lowercased"
60 |     opposite_flag = "True"
61 | 
62 |   if model_name in cased_models and do_lower_case:
63 |     is_bad_config = True
64 |     actual_flag = "True"
65 |     case_name = "cased"
66 |     opposite_flag = "False"
67 | 
68 |   if is_bad_config:
69 |     raise ValueError(
70 |         "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
71 |         "However, `%s` seems to be a %s model, so you "
72 |         "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
73 |         "how the model was pre-trained. If this error is wrong, please "
74 |         "just comment out this check." % (actual_flag, init_checkpoint,
75 |                                           model_name, case_name, opposite_flag))
76 | 
77 | 
78 | def convert_to_unicode(text):
79 |   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
80 |   if six.PY3:
81 |     if isinstance(text, str):
82 |       return text
83 |     elif isinstance(text, bytes):
84 |       return text.decode("utf-8", "ignore")
85 |     else:
86 |       raise ValueError("Unsupported string type: %s" % (type(text)))
87 |   elif six.PY2:
88 |     if isinstance(text, str):
89 |       return text.decode("utf-8", "ignore")
90 |     elif isinstance(text, unicode):
91 |       return text
92 |     else:
93 |       raise ValueError("Unsupported string type: %s" % (type(text)))
94 |   else:
95 |     raise ValueError("Not running on Python 2 or Python 3?")
96 | 
97 | 
98 | def printable_text(text):
99 |   """Returns text encoded in a way suitable for print or `tf.logging`."""
100 | 
101 |   # These functions want `str` for both Python 2 and Python 3, but in one case
102 |   # it's a Unicode string and in the other it's a byte string.
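  # (On Python 3 `str` is Unicode text, while on Python 2 it is a byte string,
  # which is why the two branches below encode/decode differently.)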
103 |   if six.PY3:
104 |     if isinstance(text, str):
105 |       return text
106 |     elif isinstance(text, bytes):
107 |       return text.decode("utf-8", "ignore")
108 |     else:
109 |       raise ValueError("Unsupported string type: %s" % (type(text)))
110 |   elif six.PY2:
111 |     if isinstance(text, str):
112 |       return text
113 |     elif isinstance(text, unicode):
114 |       return text.encode("utf-8")
115 |     else:
116 |       raise ValueError("Unsupported string type: %s" % (type(text)))
117 |   else:
118 |     raise ValueError("Not running on Python 2 or Python 3?")
119 | 
120 | 
121 | def load_vocab(vocab_file):
122 |   """Loads a vocabulary file into a dictionary."""
123 |   vocab = collections.OrderedDict()
124 |   index = 0
125 |   with tf.gfile.GFile(vocab_file, "r") as reader:
126 |     while True:
127 |       token = convert_to_unicode(reader.readline())
128 |       if not token:
129 |         break
130 |       token = token.strip()
131 |       vocab[token] = index
132 |       index += 1
133 |   return vocab
134 | 
135 | 
136 | def convert_by_vocab(vocab, items):
137 |   """Converts a sequence of [tokens|ids] using the vocab."""
138 |   output = []
139 |   for item in items:
140 |     output.append(vocab[item])
141 |   return output
142 | 
143 | 
144 | def convert_tokens_to_ids(vocab, tokens):
145 |   return convert_by_vocab(vocab, tokens)
146 | 
147 | 
148 | def convert_ids_to_tokens(inv_vocab, ids):
149 |   return convert_by_vocab(inv_vocab, ids)
150 | 
151 | 
152 | def whitespace_tokenize(text):
153 |   """Runs basic whitespace cleaning and splitting on a piece of text."""
154 |   text = text.strip()
155 |   if not text:
156 |     return []
157 |   tokens = text.split()
158 |   return tokens
159 | 
160 | 
161 | class FullTokenizer(object):
162 |   """Runs end-to-end tokenization."""
163 | 
164 |   def __init__(self, vocab_file, do_lower_case=True):
165 |     self.vocab = load_vocab(vocab_file)
166 |     self.inv_vocab = {v: k for k, v in self.vocab.items()}
167 |     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
168 |     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
169 | 
170 |   def tokenize(self, text):
171 |     split_tokens = []
172 |     for token in self.basic_tokenizer.tokenize(text):
173 |       for sub_token in self.wordpiece_tokenizer.tokenize(token):
174 |         split_tokens.append(sub_token)
175 | 
176 |     return split_tokens
177 | 
178 |   def convert_tokens_to_ids(self, tokens):
179 |     return convert_by_vocab(self.vocab, tokens)
180 | 
181 |   def convert_ids_to_tokens(self, ids):
182 |     return convert_by_vocab(self.inv_vocab, ids)
183 | 
184 | 
185 | class BasicTokenizer(object):
186 |   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
187 | 
188 |   def __init__(self, do_lower_case=True):
189 |     """Constructs a BasicTokenizer.
190 | 
191 |     Args:
192 |       do_lower_case: Whether to lower case the input.
193 |     """
194 |     self.do_lower_case = do_lower_case
195 | 
196 |   def tokenize(self, text):
197 |     """Tokenizes a piece of text."""
198 |     text = convert_to_unicode(text)
199 |     text = self._clean_text(text)
200 | 
201 |     # This was added on November 1st, 2018 for the multilingual and Chinese
202 |     # models. This is also applied to the English models now, but it doesn't
203 |     # matter since the English models were not trained on any Chinese data
204 |     # and generally don't have any Chinese data in them (there are Chinese
205 |     # characters in the vocabulary because Wikipedia does have some Chinese
206 |     # words in the English Wikipedia).
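    # e.g. "ab你好" becomes "ab 你  好 ", so that after whitespace splitting each
    # CJK character ends up as its own token.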
207 |     text = self._tokenize_chinese_chars(text)
208 | 
209 |     orig_tokens = whitespace_tokenize(text)
210 |     split_tokens = []
211 |     for token in orig_tokens:
212 |       if self.do_lower_case:
213 |         token = token.lower()
214 |         token = self._run_strip_accents(token)
215 |       split_tokens.extend(self._run_split_on_punc(token))
216 | 
217 |     output_tokens = whitespace_tokenize(" ".join(split_tokens))
218 |     return output_tokens
219 | 
220 |   def _run_strip_accents(self, text):
221 |     """Strips accents from a piece of text."""
222 |     text = unicodedata.normalize("NFD", text)
223 |     output = []
224 |     for char in text:
225 |       cat = unicodedata.category(char)
226 |       if cat == "Mn":
227 |         continue
228 |       output.append(char)
229 |     return "".join(output)
230 | 
231 |   def _run_split_on_punc(self, text):
232 |     """Splits punctuation on a piece of text."""
233 |     chars = list(text)
234 |     i = 0
235 |     start_new_word = True
236 |     output = []
237 |     while i < len(chars):
238 |       char = chars[i]
239 |       if _is_punctuation(char):
240 |         output.append([char])
241 |         start_new_word = True
242 |       else:
243 |         if start_new_word:
244 |           output.append([])
245 |         start_new_word = False
246 |         output[-1].append(char)
247 |       i += 1
248 | 
249 |     return ["".join(x) for x in output]
250 | 
251 |   def _tokenize_chinese_chars(self, text):
252 |     """Adds whitespace around any CJK character."""
253 |     output = []
254 |     for char in text:
255 |       cp = ord(char)
256 |       if self._is_chinese_char(cp):
257 |         output.append(" ")
258 |         output.append(char)
259 |         output.append(" ")
260 |       else:
261 |         output.append(char)
262 |     return "".join(output)
263 | 
264 |   def _is_chinese_char(self, cp):
265 |     """Checks whether CP is the codepoint of a CJK character."""
266 |     # This defines a "chinese character" as anything in the CJK Unicode block:
267 |     #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
268 |     #
269 |     # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
270 |     # despite its name. The modern Korean Hangul alphabet is a different block,
271 |     # as is Japanese Hiragana and Katakana. Those alphabets are used to write
272 |     # space-separated words, so they are not treated specially and are handled
273 |     # like all of the other languages.
274 |     if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
275 |         (cp >= 0x3400 and cp <= 0x4DBF) or  #
276 |         (cp >= 0x20000 and cp <= 0x2A6DF) or  #
277 |         (cp >= 0x2A700 and cp <= 0x2B73F) or  #
278 |         (cp >= 0x2B740 and cp <= 0x2B81F) or  #
279 |         (cp >= 0x2B820 and cp <= 0x2CEAF) or
280 |         (cp >= 0xF900 and cp <= 0xFAFF) or  #
281 |         (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
282 |       return True
283 | 
284 |     return False
285 | 
286 |   def _clean_text(self, text):
287 |     """Performs invalid character removal and whitespace cleanup on text."""
288 |     output = []
289 |     for char in text:
290 |       cp = ord(char)
291 |       if cp == 0 or cp == 0xfffd or _is_control(char):
292 |         continue
293 |       if _is_whitespace(char):
294 |         output.append(" ")
295 |       else:
296 |         output.append(char)
297 |     return "".join(output)
298 | 
299 | 
300 | class WordpieceTokenizer(object):
301 |   """Runs WordPiece tokenization."""
302 | 
303 |   def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
304 |     self.vocab = vocab
305 |     self.unk_token = unk_token
306 |     self.max_input_chars_per_word = max_input_chars_per_word
307 | 
308 |   def tokenize(self, text):
309 |     """Tokenizes a piece of text into its word pieces.
310 | 
311 |     This uses a greedy longest-match-first algorithm to perform tokenization
312 |     using the given vocabulary.
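    Any token that cannot be fully covered by in-vocabulary pieces is mapped to
    `unk_token` instead.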
313 | 
314 |     For example:
315 |       input = "unaffable"
316 |       output = ["un", "##aff", "##able"]
317 | 
318 |     Args:
319 |       text: A single token or whitespace separated tokens. This should have
320 |         already been passed through `BasicTokenizer`.
321 | 
322 |     Returns:
323 |       A list of wordpiece tokens.
324 |     """
325 | 
326 |     text = convert_to_unicode(text)
327 | 
328 |     output_tokens = []
329 |     for token in whitespace_tokenize(text):
330 |       chars = list(token)
331 |       if len(chars) > self.max_input_chars_per_word:
332 |         output_tokens.append(self.unk_token)
333 |         continue
334 | 
335 |       is_bad = False
336 |       start = 0
337 |       sub_tokens = []
338 |       while start < len(chars):
339 |         end = len(chars)
340 |         cur_substr = None
341 |         while start < end:
342 |           substr = "".join(chars[start:end])
343 |           if start > 0:
344 |             substr = "##" + substr
345 |           if substr in self.vocab:
346 |             cur_substr = substr
347 |             break
348 |           end -= 1
349 |         if cur_substr is None:
350 |           is_bad = True
351 |           break
352 |         sub_tokens.append(cur_substr)
353 |         start = end
354 | 
355 |       if is_bad:
356 |         output_tokens.append(self.unk_token)
357 |       else:
358 |         output_tokens.extend(sub_tokens)
359 |     return output_tokens
360 | 
361 | 
362 | def _is_whitespace(char):
363 |   """Checks whether `char` is a whitespace character."""
364 |   # \t, \n, and \r are technically control characters but we treat them
365 |   # as whitespace since they are generally considered as such.
366 |   if char == " " or char == "\t" or char == "\n" or char == "\r":
367 |     return True
368 |   cat = unicodedata.category(char)
369 |   if cat == "Zs":
370 |     return True
371 |   return False
372 | 
373 | 
374 | def _is_control(char):
375 |   """Checks whether `char` is a control character."""
376 |   # These are technically control characters but we count them as whitespace
377 |   # characters.
378 |   if char == "\t" or char == "\n" or char == "\r":
379 |     return False
380 |   cat = unicodedata.category(char)
381 |   if cat in ("Cc", "Cf"):
382 |     return True
383 |   return False
384 | 
385 | 
386 | def _is_punctuation(char):
387 |   """Checks whether `char` is a punctuation character."""
388 |   cp = ord(char)
389 |   # We treat all non-letter/number ASCII as punctuation.
390 |   # Characters such as "^", "$", and "`" are not in the Unicode
391 |   # Punctuation class but we treat them as punctuation anyway, for
392 |   # consistency.
393 |   if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
394 |       (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
395 |     return True
396 |   cat = unicodedata.category(char)
397 |   if cat.startswith("P"):
398 |     return True
399 |   return False
400 | 
--------------------------------------------------------------------------------
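A minimal usage sketch of the tokenizer above (illustrative only; it assumes the
BERT vocabulary has been downloaded so that uncased_L-12_H-768_A-12/vocab.txt is
available, and that the snippet runs in the same directory as tokenization.py):

    import tokenization

    # FullTokenizer chains BasicTokenizer (cleaning, lower casing, punctuation
    # splitting) with WordpieceTokenizer (greedy longest-match-first matching
    # against vocab.txt).
    tokenizer = tokenization.FullTokenizer(
        vocab_file="uncased_L-12_H-768_A-12/vocab.txt", do_lower_case=True)

    tokens = tokenizer.tokenize("He enjoys painting and has two dogs.")
    ids = tokenizer.convert_tokens_to_ids(tokens)  # word pieces -> vocab indices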