├── README.md
├── clean.sh
├── logits_example_aligner.py
├── main_iter.sh
├── prepare_dir.sh
├── run.sh
├── single_domain
│   ├── __pycache__
│   │   ├── batching.cpython-36.pyc
│   │   └── tokenization.cpython-36.pyc
│   ├── base_squad.py
│   ├── batching.py
│   ├── batching.pyc
│   ├── convert_params.py
│   ├── diversity_sample.py
│   ├── gen_logits.py
│   ├── gen_logits_batching.py
│   ├── inference
│   │   ├── CMakeLists.txt
│   │   ├── gen_demo_data.py
│   │   └── inference.cc
│   ├── model
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── bert.py
│   │   ├── bert.pyc
│   │   ├── classifier.py
│   │   ├── transformer_encoder.py
│   │   └── transformer_encoder.pyc
│   ├── multi_iter.sh
│   ├── optimization.py
│   ├── optimization.pyc
│   ├── predict_classifier.py
│   ├── reader
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── base_squad.py
│   │   ├── cls.py
│   │   ├── gen_logits_squad.py
│   │   ├── gen_logits_squad.pyc
│   │   ├── old_squad.pyc
│   │   ├── pretraining.py
│   │   ├── squad.py
│   │   └── squad.pyc
│   ├── run_classifier.py
│   ├── run_squad.py
│   ├── test_squad.py
│   ├── tokenization.py
│   ├── tokenization.pyc
│   ├── train.py
│   └── utils
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── args.py
│       ├── args.pyc
│       ├── fp16.py
│       ├── fp16.pyc
│       ├── init.py
│       └── init.pyc
├── small_clean.sh
├── small_prepare_dir.sh
├── small_run.sh
├── split_data.py
└── upsample.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # An Iterative Multi-Source Mutual Knowledge Transfer Framework for Machine Reading Comprehension
2 | * This project is built on top of https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleNLP/pretrain_language_models/BERT
3 | * The commands listed below do not include all optional arguments.
4 | If you want to change any of the optional arguments, such as the model output directories, please change the settings in ```main_iter.sh``` and ```single_domain/multi_iter.sh```.
5 | 
6 | ## Data Format
7 | The ```data``` directory should contain the train, dev and test sets of five domains: **SQuAD**, **NewsQA**, **HotpotQA**, **NaturalQuestions** and **TriviaQA**. ```DOMAINNAME.raw.json``` are the training sets, ```DOMAINNAME_dev.raw.json``` and ```DOMAINNAME_test.raw.json``` are the validation and test sets. These JSON files need to be converted to SQuAD format (see the minimal example below).
8 | 
9 | ## Running Command
10 | * This project needs 5 GPUs (one GPU per domain); please change the GPU configuration in ```main_iter.sh```.
11 | 
12 | Before you start training, you need to download the pretrained BERT-base checkpoint.
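For reference, the SQuAD format mentioned above looks like this (a minimal, illustrative example; all field values are placeholders):
```
{"data": [{"title": "...", "paragraphs": [{"context": "...", "qas": [{"id": "...", "question": "...", "answers": [{"text": "...", "answer_start": 0}]}]}]}], "version": "1.1"}
```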
13 | ```
14 | wget https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz
15 | tar -zvxf uncased_L-12_H-768_A-12.tar.gz
16 | ```
17 | After unzipping the checkpoint file, you can start training by running:
18 | ```
19 | bash run.sh
20 | ```
21 | ## Dependencies
22 | * Python 2.7
23 | * PaddlePaddle>=1.4.0
24 | 
--------------------------------------------------------------------------------
/clean.sh:
--------------------------------------------------------------------------------
1 | rm -r align*
2 | rm -r NewsQA
3 | rm -r SQuAD
4 | rm -r HotpotQA
5 | rm -r NaturalQuestions
6 | rm -r TriviaQA
7 | rm total_logits/*
8 | rm id_map
9 | rm base_train.log
10 | rm test.log
11 | rm ./data/All_domain.raw.json_*
12 | rm ./data/*4cb*
--------------------------------------------------------------------------------
/logits_example_aligner.py:
--------------------------------------------------------------------------------
1 | import argparse  # option parsing
2 | import json
3 | 
4 | parser = argparse.ArgumentParser(description='usage')  # add description
5 | parser.add_argument('--domains', nargs='+', type=str, help='names of the domain directories that contain a logits_file')
6 | 
7 | args = parser.parse_args()
8 | 
9 | # domain_nums = args.domains.__len__()
10 | logits_file_name = ["./" + x + '/logits_file' for x in args.domains]
11 | qas2startlogits = {}
12 | qas2endlogits = {}
13 | qas2name = {}
14 | 
15 | for idx, file_name in enumerate(logits_file_name):
16 |     logits_file = open(file_name, 'r')
17 |     cnt = 0
18 |     for item in logits_file.read().split('\n\n'):
19 |         if item == "":
20 |             continue
21 |         lines = item.split('\n')
22 |         qas2name[lines[0].split('_')[0]] = args.domains[idx]
23 |         if lines[0].split('_')[0] not in qas2startlogits:
24 |             qas2startlogits[lines[0].split('_')[0]] = {}
25 |             qas2endlogits[lines[0].split('_')[0]] = {}
26 | 
27 |             pa_idx = int(lines[0].split('_')[1])
28 |             qas2startlogits[lines[0].split('_')[0]][pa_idx] = [float(x) for x in lines[1].split(' ')]
29 |             qas2endlogits[lines[0].split('_')[0]][pa_idx] = [float(x) for x in lines[2].split(' ')]
30 |         else:
31 |             pa_idx = int(lines[0].split('_')[1])
32 |             qas2startlogits[lines[0].split('_')[0]][pa_idx] = [float(x) for x in lines[1].split(' ')]
33 |             qas2endlogits[lines[0].split('_')[0]][pa_idx] = [float(x) for x in lines[2].split(' ')]
34 |     logits_file.close()
35 | 
36 | qas_list = []
37 | domain_data_name = './data/All_domain.raw.json'
38 | file = open(domain_data_name, 'r')
39 | filecontext = file.read()
40 | file.close()
41 | json_to_dict = json.loads(filecontext)
42 | articleList = json_to_dict["data"]
43 | 
44 | for article in articleList:
45 |     for paragraph in article['paragraphs']:
46 |         context = paragraph['context']
47 |         for qas in paragraph['qas']:
48 |             qas_list.append(qas['id'])
49 | 
50 | qas_num = qas_list.__len__()
51 | epoch_idx = 0
52 | file_idx = 0
53 | 
54 | out_file = open('./total_logits/logits_0', 'w')
55 | id_map_file = open('id_map', 'w')
56 | for idx, id in enumerate(qas_list):
57 |     epoch_idx += 1
58 |     if epoch_idx >= int(qas_num / 100):
59 |         out_file.close()
60 |         epoch_idx = 0
61 |         file_idx += 1
62 |         out_file = open('./total_logits/logits_' + str(file_idx), 'w')
63 |     logits_num = qas2startlogits[id].__len__()
64 |     for pa_idx in range(1, logits_num + 1):
65 |         out_file.write(id + '_' + str(pa_idx))
66 |         out_file.write('\n')
67 |         out_file.write(' '.join([str(x) for x in qas2startlogits[id][pa_idx]]))
68 |         out_file.write('\n')
69 |         out_file.write(' '.join([str(x) for x in qas2endlogits[id][pa_idx]]))
70 |         out_file.write('\n')
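# NOTE (inferred from the writes above and below): for every question this
# emits one three-line block per paragraph window ("<qas_id>_<pa_idx>", the
# start logits, the end logits), then the question's source domain and a
# blank separator line; id_map records which shard file under ./total_logits
# holds each "<qas_id>_<pa_idx>" entry.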
71 |     out_file.write(qas2name[id])
72 |     out_file.write('\n\n')
73 |     id_map_file.write(id + '_' + str(pa_idx) + '\t' + '../total_logits/logits_' + str(file_idx))
74 |     id_map_file.write('\n')
--------------------------------------------------------------------------------
/main_iter.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | export FLAGS_enable_parallel_graph=1
3 | export FLAGS_sync_nccl_allreduce=1
4 | BERT_BASE_PATH=../uncased_L-12_H-768_A-12
5 | CHECKPOINT_PATH=./squad/checkpoints/
6 | DATA_PATH=../data
7 | 
8 | GPU=(1 2 3 4 5)
9 | 
10 | base_train() {
11 |     python -u base_squad.py --use_cuda true\
12 |         --batch_size 12 \
13 |         --init_pretraining_params ${BERT_BASE_PATH}/params \
14 |         --in_tokens false\
15 |         --checkpoints ${CHECKPOINT_PATH} \
16 |         --vocab_path ${BERT_BASE_PATH}/vocab.txt \
17 |         --do_train True \
18 |         --do_predict True \
19 |         --save_steps 2000 \
20 |         --warmup_proportion 0.1 \
21 |         --weight_decay 0.01 \
22 |         --epoch 2 \
23 |         --max_seq_len 512 \
24 |         --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
25 |         --predict_file ${DATA_PATH}/$1_dev.raw.json \
26 |         --do_lower_case True \
27 |         --doc_stride 128 \
28 |         --train_file ${DATA_PATH}/$1.raw.json \
29 |         --learning_rate 3e-5\
30 |         --lr_scheduler linear_warmup_decay \
31 |         --skip_steps 10
32 |     mkdir finish_base_train
33 | }
34 | 
35 | python ./upsample.py --path ./data --domains SQuAD NewsQA TriviaQA HotpotQA NaturalQuestions
36 | #python ./upsample.py --path ./data --domains SQuAD_small NewsQA_small TriviaQA_small HotpotQA_small NaturalQuestions_small
37 | ALL_DOMAIN=$@
38 | 
39 | domain_num=0
40 | for dm in $ALL_DOMAIN
41 | do
42 |     now_path=./$dm
43 |     let domain_num=domain_num+1
44 | done
45 | 
46 | idx=0
47 | for dm in $ALL_DOMAIN
48 | do
49 |     DOMAIN_RAW_FILE[$idx]=./data/${dm}.4cb.raw.json
50 |     DOMAIN_DEV_FILE[$idx]=./data/${dm}_dev.raw.json
51 |     let idx=idx+1
52 | done
53 | 
54 | RAW_FILE=`echo ${DOMAIN_RAW_FILE[*]}`
55 | DEV_FILE=`echo ${DOMAIN_DEV_FILE[*]}`
56 | 
57 | echo python ./data/combine_mrc_data.py --filepaths $RAW_FILE
58 | python ./data/combine_mrc_data.py --filepaths $RAW_FILE
59 | mv combined_dataset ./data/All_domain.raw.json
60 | python ./data/combine_mrc_data.py --filepaths $DEV_FILE
61 | mv combined_dataset ./data/All_domain_dev.raw.json
62 | python ./data/get4weight_set.py --filepaths $DEV_FILE
63 | 
64 | python split_data.py --path ./data --domains All_domain
65 | 
66 | PWD=`pwd`
67 | saved_path=$PWD
68 | idx=0
69 | 
70 | for dm in $ALL_DOMAIN
71 | do
72 |     cd $saved_path/$dm
73 |     export CUDA_VISIBLE_DEVICES=${GPU[$idx]}
74 |     let equal_num=domain_num-1
75 |     base_train $dm > base_train.log &
76 |     let idx=idx+1
77 | done
78 | 
79 | idx=0
80 | for dm in $ALL_DOMAIN
81 | do
82 |     echo $saved_path
83 |     cd $saved_path/$dm
84 |     while [ ! -d "./finish_base_train" ]
85 |     do
86 |         sleep 30
87 |     done
88 |     sh ./multi_iter.sh ${GPU[$idx]} $dm $idx $ALL_DOMAIN > ./iter.log &
89 |     let idx=idx+1
90 | done
91 | for dm in $ALL_DOMAIN
92 | do
93 |     cd $saved_path/$dm
94 |     while [ ! -d "./train_finished" ]
95 |     do
96 |         echo sleep!
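# NOTE: the workers synchronize with this driver through marker directories:
# base_train() runs `mkdir finish_base_train` when the warm-up pass is done,
# and multi_iter.sh is expected to create `train_finished` when the iterative
# transfer loop completes, so these loops just poll for those directories.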
97 |         sleep 30
98 |     done
99 | done
100 | 
101 | 
--------------------------------------------------------------------------------
/prepare_dir.sh:
--------------------------------------------------------------------------------
1 | rm -r SQuAD
2 | rm -r NewsQA
3 | rm -r TriviaQA
4 | rm -r HotpotQA
5 | rm -r NaturalQuestions
6 | cp -r single_domain SQuAD
7 | cp -r checkpoints/SQuAD/step_best/ SQuAD/squad/checkpoints/
8 | cp -r single_domain NewsQA
9 | cp -r checkpoints/NewsQA/step_best/ NewsQA/squad/checkpoints/
10 | cp -r single_domain TriviaQA
11 | cp -r checkpoints/TriviaQA/step_best/ TriviaQA/squad/checkpoints/
12 | cp -r single_domain HotpotQA
13 | cp -r checkpoints/HotpotQA/step_best/ HotpotQA/squad/checkpoints/
14 | cp -r single_domain NaturalQuestions
15 | cp -r checkpoints/NaturalQuestions/step_best/ NaturalQuestions/squad/checkpoints/
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | sh prepare_dir.sh
2 | nohup sh main_iter.sh SQuAD NewsQA HotpotQA NaturalQuestions TriviaQA > test.log &
--------------------------------------------------------------------------------
/single_domain/__pycache__/batching.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/__pycache__/batching.cpython-36.pyc
--------------------------------------------------------------------------------
/single_domain/__pycache__/tokenization.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/__pycache__/tokenization.cpython-36.pyc
--------------------------------------------------------------------------------
/single_domain/base_squad.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import collections 7 | import multiprocessing 8 | import os 9 | import time 10 | import numpy as np 11 | import paddle 12 | import paddle.fluid as fluid 13 | 14 | from reader.base_squad import DataProcessor, write_predictions 15 | from model.bert import BertConfig, BertModel 16 | from utils.args import ArgumentGroup, print_arguments 17 | from optimization import optimization 18 | from utils.init import init_pretraining_params, init_checkpoint 19 | 20 | # yapf: disable 21 | parser = argparse.ArgumentParser(__doc__) 22 | model_g = ArgumentGroup(parser, "model", "model configuration and paths.") 23 | model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.") 24 | model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") 25 | model_g.add_arg("init_pretraining_params", str, None, 26 | "Init pre-training params which performs fine-tuning from. If the "
If the " 27 | "arg 'init_checkpoint' has been set, this argument wouldn't be valid.") 28 | model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.") 29 | 30 | train_g = ArgumentGroup(parser, "training", "training options.") 31 | train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") 32 | train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.") 33 | train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", 34 | "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay']) 35 | train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.") 36 | train_g.add_arg("warmup_proportion", float, 0.1, 37 | "Proportion of training steps to perform linear learning rate warmup for.") 38 | train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.") 39 | train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.") 40 | train_g.add_arg("loss_scaling", float, 1.0, 41 | "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.") 42 | 43 | log_g = ArgumentGroup(parser, "logging", "logging related.") 44 | log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.") 45 | log_g.add_arg("verbose", bool, False, "Whether to output verbose log.") 46 | 47 | data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options") 48 | data_g.add_arg("train_file", str, None, "SQuAD json for training. E.g., train-v1.1.json.") 49 | data_g.add_arg("predict_file", str, None, "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.") 50 | data_g.add_arg("adv_text_path", str, None, "Adversarial training dataset") 51 | data_g.add_arg("vocab_path", str, None, "Vocabulary path.") 52 | data_g.add_arg("version_2_with_negative", bool, False, 53 | "If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.") 54 | data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.") 55 | data_g.add_arg("max_query_length", int, 64, "Max query length.") 56 | data_g.add_arg("max_answer_length", int, 30, "Max answer length.") 57 | data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.") 58 | data_g.add_arg("in_tokens", bool, False, 59 | "If set, the batch size will be the maximum number of tokens in one batch. " 60 | "Otherwise, it will be the maximum number of examples in one batch.") 61 | data_g.add_arg("do_lower_case", bool, True, 62 | "Whether to lower case the input text. 
63 | data_g.add_arg("doc_stride", int, 128, 64 | "When splitting up a long document into chunks, how much stride to take between chunks.") 65 | data_g.add_arg("n_best_size", int, 20, 66 | "The total number of n-best predictions to generate in the nbest_predictions.json output file.") 67 | data_g.add_arg("null_score_diff_threshold", float, 0.0, 68 | "If null_score - best_non_null is greater than the threshold, predict null.") 69 | data_g.add_arg("random_seed", int, 0, "Random seed.") 70 | 71 | run_type_g = ArgumentGroup(parser, "run_type", "running type options.") 72 | run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") 73 | run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (experimental).") 74 | run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "The iteration interval to clean up temporary variables.") 75 | run_type_g.add_arg("do_train", bool, True, "Whether to perform training.") 76 | run_type_g.add_arg("do_predict", bool, True, "Whether to perform prediction.") 77 | 78 | args = parser.parse_args() 79 | # yapf: enable. 80 | 81 | def create_model(pyreader_name, bert_config, is_training=False): 82 | if is_training: 83 | pyreader = fluid.layers.py_reader( 84 | capacity=50, 85 | shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], 86 | [-1, args.max_seq_len, 1], 87 | [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], 88 | dtypes=[ 89 | 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'], 90 | lod_levels=[0, 0, 0, 0, 0, 0], 91 | name=pyreader_name, 92 | use_double_buffer=True) 93 | (src_ids, pos_ids, sent_ids, input_mask, start_positions, 94 | end_positions) = fluid.layers.read_file(pyreader) 95 | else: 96 | pyreader = fluid.layers.py_reader( 97 | capacity=50, 98 | shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], 99 | [-1, args.max_seq_len, 1], 100 | [-1, args.max_seq_len, 1], [-1, 1]], 101 | dtypes=['int64', 'int64', 'int64', 'float32', 'int64'], 102 | lod_levels=[0, 0, 0, 0, 0], 103 | name=pyreader_name, 104 | use_double_buffer=True) 105 | (src_ids, pos_ids, sent_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader) 106 | 107 | bert = BertModel( 108 | src_ids=src_ids, 109 | position_ids=pos_ids, 110 | sentence_ids=sent_ids, 111 | input_mask=input_mask, 112 | config=bert_config, 113 | use_fp16=args.use_fp16) 114 | 115 | enc_out = bert.get_sequence_output() 116 | 117 | logits = fluid.layers.fc( 118 | input=enc_out, 119 | size=2, 120 | num_flatten_dims=2, 121 | param_attr=fluid.ParamAttr( 122 | name="cls_squad_out_w", 123 | initializer=fluid.initializer.TruncatedNormal(scale=0.02)), 124 | bias_attr=fluid.ParamAttr( 125 | name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.))) 126 | 127 | logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1]) 128 | start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0) 129 | 130 | batch_ones = fluid.layers.fill_constant_batch_size_like( 131 | input=start_logits, dtype='int64', shape=[1], value=1) 132 | num_seqs = fluid.layers.reduce_sum(input=batch_ones) 133 | 134 | if is_training: 135 | 136 | def compute_loss(logits, positions): 137 | loss = fluid.layers.softmax_with_cross_entropy( 138 | logits=logits, label=positions) 139 | loss = fluid.layers.mean(x=loss) 140 | return loss 141 | 142 | start_loss = compute_loss(start_logits, start_positions) 143 | end_loss = compute_loss(end_logits, end_positions) 144 | total_loss = (start_loss + end_loss) / 2.0 145 | if args.use_fp16
and args.loss_scaling > 1.0: 146 | total_loss = total_loss * args.loss_scaling 147 | 148 | return pyreader, total_loss, num_seqs 149 | else: 150 | return pyreader, unique_id, start_logits, end_logits, num_seqs 151 | 152 | 153 | RawResult = collections.namedtuple("RawResult", 154 | ["unique_id", "start_logits", "end_logits"]) 155 | 156 | 157 | def predict(test_exe, test_program, test_pyreader, fetch_list, processor, test_path=None): 158 | if not os.path.exists(args.checkpoints): 159 | os.makedirs(args.checkpoints) 160 | output_prediction_file = os.path.join(args.checkpoints, "predictions.json") 161 | output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json") 162 | output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json") 163 | 164 | test_pyreader.start() 165 | all_results = [] 166 | time_begin = time.time() 167 | while True: 168 | try: 169 | np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run( 170 | fetch_list=fetch_list, program=test_program) 171 | for idx in range(np_unique_ids.shape[0]): 172 | if len(all_results) % 1000 == 0: 173 | print("Processing example: %d" % len(all_results)) 174 | unique_id = int(np_unique_ids[idx]) 175 | start_logits = [float(x) for x in np_start_logits[idx].flat] 176 | end_logits = [float(x) for x in np_end_logits[idx].flat] 177 | all_results.append( 178 | RawResult( 179 | unique_id=unique_id, 180 | start_logits=start_logits, 181 | end_logits=end_logits)) 182 | except fluid.core.EOFException: 183 | test_pyreader.reset() 184 | break 185 | time_end = time.time() 186 | 187 | features = processor.get_features( 188 | processor.predict_examples, is_training=False) 189 | if test_path is None: 190 | adv_f1 = write_predictions(processor.predict_examples, features, all_results, 191 | args.n_best_size, args.max_answer_length, 192 | args.do_lower_case, output_prediction_file, 193 | output_nbest_file, output_null_log_odds_file, 194 | args.version_2_with_negative, 195 | args.null_score_diff_threshold, args.verbose, args.predict_file) 196 | else: 197 | adv_f1 = write_predictions(processor.predict_examples, features, all_results, 198 | args.n_best_size, args.max_answer_length, 199 | args.do_lower_case, output_prediction_file, 200 | output_nbest_file, output_null_log_odds_file, 201 | args.version_2_with_negative, 202 | args.null_score_diff_threshold, args.verbose, test_path) 203 | 204 | return adv_f1 205 | 206 | 207 | def train(args): 208 | bert_config = BertConfig(args.bert_config_path) 209 | bert_config.print_config() 210 | 211 | if not (args.do_train or args.do_predict): 212 | raise ValueError("For args `do_train` and `do_predict`, at " 213 | "least one of them must be True.") 214 | 215 | if args.use_cuda: 216 | place = fluid.CUDAPlace(0) 217 | dev_count = fluid.core.get_cuda_device_count() 218 | else: 219 | place = fluid.CPUPlace() 220 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) 221 | exe = fluid.Executor(place) 222 | 223 | processor = DataProcessor( 224 | vocab_path=args.vocab_path, 225 | do_lower_case=args.do_lower_case, 226 | max_seq_length=args.max_seq_len, 227 | in_tokens=args.in_tokens, 228 | doc_stride=args.doc_stride, 229 | max_query_length=args.max_query_length, 230 | adv_text_path=args.adv_text_path) 231 | 232 | startup_prog = fluid.Program() 233 | if args.random_seed is not None: 234 | startup_prog.random_seed = args.random_seed 235 | 236 | if args.do_train: 237 | train_data_generator = processor.data_generator( 238 | data_path=args.train_file, 239 | 
batch_size=args.batch_size, 240 | phase='train', 241 | shuffle=True, 242 | dev_count=dev_count, 243 | version_2_with_negative=args.version_2_with_negative, 244 | epoch=args.epoch) 245 | 246 | num_train_examples = processor.get_num_examples(phase='train') 247 | if args.in_tokens: 248 | max_train_steps = args.epoch * num_train_examples // ( 249 | args.batch_size // args.max_seq_len) // dev_count 250 | else: 251 | max_train_steps = args.epoch * num_train_examples // ( 252 | args.batch_size) // dev_count 253 | warmup_steps = int(max_train_steps * args.warmup_proportion) 254 | print("Device count: %d" % dev_count) 255 | print("Num train examples: %d" % num_train_examples) 256 | print("Max train steps: %d" % max_train_steps) 257 | print("Num warmup steps: %d" % warmup_steps) 258 | 259 | train_program = fluid.Program() 260 | with fluid.program_guard(train_program, startup_prog): 261 | with fluid.unique_name.guard(): 262 | train_pyreader, loss, num_seqs = create_model( 263 | pyreader_name='train_reader', 264 | bert_config=bert_config, 265 | is_training=True) 266 | 267 | scheduled_lr = optimization( 268 | loss=loss, 269 | warmup_steps=warmup_steps, 270 | num_train_steps=max_train_steps, 271 | learning_rate=args.learning_rate, 272 | train_program=train_program, 273 | startup_prog=startup_prog, 274 | weight_decay=args.weight_decay, 275 | scheduler=args.lr_scheduler, 276 | use_fp16=args.use_fp16, 277 | loss_scaling=args.loss_scaling) 278 | 279 | fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name]) 280 | 281 | if args.verbose: 282 | if args.in_tokens: 283 | lower_mem, upper_mem, unit = fluid.contrib.memory_usage( 284 | program=train_program, 285 | batch_size=args.batch_size // args.max_seq_len) 286 | else: 287 | lower_mem, upper_mem, unit = fluid.contrib.memory_usage( 288 | program=train_program, batch_size=args.batch_size) 289 | print("Theoretical memory usage in training: %.3f - %.3f %s" % 290 | (lower_mem, upper_mem, unit)) 291 | 292 | if args.do_predict: 293 | test_prog = fluid.Program() 294 | with fluid.program_guard(test_prog, startup_prog): 295 | with fluid.unique_name.guard(): 296 | test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model( 297 | pyreader_name='test_reader', 298 | bert_config=bert_config, 299 | is_training=False) 300 | 301 | fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name, 302 | start_logits.name, end_logits.name, num_seqs.name]) 303 | 304 | test_prog = test_prog.clone(for_test=True) 305 | 306 | exe.run(startup_prog) 307 | 308 | if args.do_train: 309 | if args.init_checkpoint and args.init_pretraining_params: 310 | print( 311 | "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " 312 | "both are set! 
Only arg 'init_checkpoint' is made valid.") 313 | if args.init_checkpoint: 314 | init_checkpoint( 315 | exe, 316 | args.init_checkpoint, 317 | main_program=startup_prog, 318 | use_fp16=args.use_fp16) 319 | elif args.init_pretraining_params: 320 | init_pretraining_params( 321 | exe, 322 | args.init_pretraining_params, 323 | main_program=startup_prog, 324 | use_fp16=args.use_fp16) 325 | elif args.do_predict: 326 | if not args.init_checkpoint: 327 | raise ValueError("args 'init_checkpoint' should be set if" 328 | "only doing prediction!") 329 | init_checkpoint( 330 | exe, 331 | args.init_checkpoint, 332 | main_program=startup_prog, 333 | use_fp16=args.use_fp16) 334 | 335 | if args.do_train: 336 | exec_strategy = fluid.ExecutionStrategy() 337 | exec_strategy.use_experimental_executor = args.use_fast_executor 338 | exec_strategy.num_threads = dev_count 339 | exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope 340 | 341 | train_exe = fluid.ParallelExecutor( 342 | use_cuda=args.use_cuda, 343 | loss_name=loss.name, 344 | exec_strategy=exec_strategy, 345 | main_program=train_program) 346 | 347 | train_pyreader.decorate_tensor_provider(train_data_generator) 348 | 349 | train_pyreader.start() 350 | steps = 0 351 | total_cost, total_num_seqs = [], [] 352 | time_begin = time.time() 353 | 354 | best_f1 = -1 355 | while steps < max_train_steps: 356 | try: 357 | steps += 1 358 | if steps % args.skip_steps == 0: 359 | if warmup_steps <= 0: 360 | fetch_list = [loss.name, num_seqs.name] 361 | else: 362 | fetch_list = [ 363 | loss.name, scheduled_lr.name, num_seqs.name 364 | ] 365 | else: 366 | fetch_list = [] 367 | 368 | outputs = train_exe.run(fetch_list=fetch_list) 369 | 370 | if steps % args.skip_steps == 0: 371 | if warmup_steps <= 0: 372 | np_loss, np_num_seqs = outputs 373 | else: 374 | np_loss, np_lr, np_num_seqs = outputs 375 | total_cost.extend(np_loss * np_num_seqs) 376 | total_num_seqs.extend(np_num_seqs) 377 | 378 | if args.verbose: 379 | verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( 380 | ) 381 | verbose += "learning rate: %f" % ( 382 | np_lr[0] 383 | if warmup_steps > 0 else args.learning_rate) 384 | print(verbose) 385 | 386 | time_end = time.time() 387 | used_time = time_end - time_begin 388 | current_example, epoch = processor.get_train_progress() 389 | 390 | print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " 391 | "speed: %f steps/s" % 392 | (epoch, current_example, num_train_examples, steps, 393 | np.sum(total_cost) / np.sum(total_num_seqs), 394 | args.skip_steps / used_time)) 395 | total_cost, total_num_seqs = [], [] 396 | time_begin = time.time() 397 | 398 | if (steps % args.save_steps == 0 or steps == max_train_steps) and steps > int(max_train_steps/3.0): 399 | #if (steps % args.save_steps == 0 or steps == max_train_steps): 400 | if args.do_predict: 401 | test_pyreader.decorate_tensor_provider( 402 | processor.data_generator( 403 | data_path=args.predict_file, 404 | batch_size=args.batch_size, 405 | phase='predict', 406 | shuffle=False, 407 | dev_count=1, 408 | epoch=1)) 409 | adv_f1 = predict(exe, test_prog, test_pyreader, [ 410 | unique_ids.name, start_logits.name, end_logits.name, num_seqs.name 411 | ], processor) 412 | # print(adv_f1) 413 | # continue 414 | 415 | # if steps != max_train_steps: 416 | if adv_f1 > best_f1: 417 | best_f1 = adv_f1 418 | save_path = os.path.join(args.checkpoints, 419 | "step_best") 420 | print("best adv model saved") 421 | # else: 422 | # save_path = os.path.join(args.checkpoints, 423 | # 
"step_last") 424 | fluid.io.save_persistables(exe, save_path, train_program) 425 | test_pyreader.decorate_tensor_provider( 426 | processor.data_generator( 427 | data_path=args.predict_file.replace("dev", "test"), 428 | batch_size=args.batch_size, 429 | phase='predict', 430 | shuffle=False, 431 | dev_count=1, 432 | epoch=1)) 433 | test_f1 = predict(exe, test_prog, test_pyreader, [ 434 | unique_ids.name, start_logits.name, end_logits.name, num_seqs.name 435 | ], processor, args.predict_file.replace("dev", "test")) 436 | print("This is the test score.") 437 | 438 | except fluid.core.EOFException: 439 | save_path = os.path.join(args.checkpoints, 440 | "step_" + str(steps) + "_final") 441 | fluid.io.save_persistables(exe, save_path, train_program) 442 | train_pyreader.reset() 443 | break 444 | 445 | if args.do_predict and not args.do_train: 446 | test_pyreader.decorate_tensor_provider( 447 | processor.data_generator( 448 | data_path=args.predict_file, 449 | batch_size=args.batch_size, 450 | phase='predict', 451 | shuffle=False, 452 | dev_count=1, 453 | epoch=1)) 454 | 455 | predict(exe, test_prog, test_pyreader, [ 456 | unique_ids.name, start_logits.name, end_logits.name, num_seqs.name 457 | ], processor) 458 | 459 | 460 | if __name__ == '__main__': 461 | print_arguments(args) 462 | train(args) 463 | -------------------------------------------------------------------------------- /single_domain/batching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Mask, padding and batching.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import numpy as np 21 | 22 | 23 | def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): 24 | """ 25 | Add mask for batch_tokens, return out, mask_label, mask_pos; 26 | Note: mask_pos responding the batch_tokens after padded; 27 | """ 28 | max_len = max([len(sent) for sent in batch_tokens]) 29 | mask_label = [] 30 | mask_pos = [] 31 | prob_mask = np.random.rand(total_token_num) 32 | # Note: the first token is [CLS], so [low=1] 33 | replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) 34 | pre_sent_len = 0 35 | prob_index = 0 36 | for sent_index, sent in enumerate(batch_tokens): 37 | mask_flag = False 38 | prob_index += pre_sent_len 39 | for token_index, token in enumerate(sent): 40 | prob = prob_mask[prob_index + token_index] 41 | if prob > 0.15: 42 | continue 43 | elif 0.03 < prob <= 0.15: 44 | # mask 45 | if token != SEP and token != CLS: 46 | mask_label.append(sent[token_index]) 47 | sent[token_index] = MASK 48 | mask_flag = True 49 | mask_pos.append(sent_index * max_len + token_index) 50 | elif 0.015 < prob <= 0.03: 51 | # random replace 52 | if token != SEP and token != CLS: 53 | mask_label.append(sent[token_index]) 54 | sent[token_index] = replace_ids[prob_index + token_index] 55 | mask_flag = True 56 | mask_pos.append(sent_index * max_len + token_index) 57 | else: 58 | # keep the original token 59 | if token != SEP and token != CLS: 60 | mask_label.append(sent[token_index]) 61 | mask_pos.append(sent_index * max_len + token_index) 62 | pre_sent_len = len(sent) 63 | 64 | # ensure at least mask one word in a sentence 65 | while not mask_flag: 66 | token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) 67 | if sent[token_index] != SEP and sent[token_index] != CLS: 68 | mask_label.append(sent[token_index]) 69 | sent[token_index] = MASK 70 | mask_flag = True 71 | mask_pos.append(sent_index * max_len + token_index) 72 | mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) 73 | mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) 74 | return batch_tokens, mask_label, mask_pos 75 | 76 | 77 | def prepare_batch_data(insts, 78 | total_token_num, 79 | voc_size=0, 80 | pad_id=None, 81 | cls_id=None, 82 | sep_id=None, 83 | mask_id=None, 84 | return_input_mask=True, 85 | return_max_len=True, 86 | return_num_token=False): 87 | """ 88 | 1. generate Tensor of data 89 | 2. generate Tensor of position 90 | 3. 
generate self attention mask, [shape: batch_size * max_len * max_len] 91 | """ 92 | 93 | batch_src_ids = [inst[0] for inst in insts] 94 | batch_sent_ids = [inst[1] for inst in insts] 95 | batch_pos_ids = [inst[2] for inst in insts] 96 | labels_list = [] 97 | # compatible with squad, whose example includes start/end positions, 98 | # or unique id 99 | 100 | for i in range(3, len(insts[0]), 1): 101 | labels = [inst[i] for inst in insts] 102 | labels = np.array(labels).astype("int64").reshape([-1, 1]) 103 | labels_list.append(labels) 104 | 105 | # First step: do mask without padding 106 | if mask_id >= 0: 107 | out, mask_label, mask_pos = mask( 108 | batch_src_ids, 109 | total_token_num, 110 | vocab_size=voc_size, 111 | CLS=cls_id, 112 | SEP=sep_id, 113 | MASK=mask_id) 114 | else: 115 | out = batch_src_ids 116 | # Second step: padding 117 | src_id, self_input_mask = pad_batch_data( 118 | out, pad_idx=pad_id, return_input_mask=True) 119 | pos_id = pad_batch_data( 120 | batch_pos_ids, 121 | pad_idx=pad_id, 122 | return_pos=False, 123 | return_input_mask=False) 124 | sent_id = pad_batch_data( 125 | batch_sent_ids, 126 | pad_idx=pad_id, 127 | return_pos=False, 128 | return_input_mask=False) 129 | 130 | if mask_id >= 0: 131 | return_list = [ 132 | src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos 133 | ] + labels_list 134 | else: 135 | return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list 136 | 137 | return return_list if len(return_list) > 1 else return_list[0] 138 | 139 | 140 | def pad_batch_data(insts, 141 | pad_idx=0, 142 | return_pos=False, 143 | return_input_mask=False, 144 | return_max_len=False, 145 | return_num_token=False): 146 | """ 147 | Pad the instances to the max sequence length in batch, and generate the 148 | corresponding position data and input mask. 149 | """ 150 | return_list = [] 151 | max_len = max(len(inst) for inst in insts) 152 | # Any token included in dict can be used to pad, since the paddings' loss 153 | # will be masked out by weights and make no effect on parameter gradients. 154 | 155 | inst_data = np.array([ 156 | list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts 157 | ]) 158 | return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] 159 | 160 | # position data 161 | if return_pos: 162 | inst_pos = np.array([ 163 | list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) 164 | for inst in insts 165 | ]) 166 | 167 | return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] 168 | 169 | if return_input_mask: 170 | # This is used to avoid attention on paddings. 
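# e.g. insts = [[5, 6, 7], [8, 9]] with pad_idx=0 pads to [[5, 6, 7], [8, 9, 0]],
# and the mask rows built below are [1, 1, 1] and [1, 1, 0] (float32, shape
# [batch_size, max_len, 1] after expand_dims).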
171 | input_mask_data = np.array([[1] * len(inst) + [0] * 172 | (max_len - len(inst)) for inst in insts]) 173 | input_mask_data = np.expand_dims(input_mask_data, axis=-1) 174 | return_list += [input_mask_data.astype("float32")] 175 | 176 | if return_max_len: 177 | return_list += [max_len] 178 | 179 | if return_num_token: 180 | num_token = 0 181 | for inst in insts: 182 | num_token += len(inst) 183 | return_list += [num_token] 184 | 185 | return return_list if len(return_list) > 1 else return_list[0] 186 | 187 | 188 | if __name__ == "__main__": 189 | pass 190 | -------------------------------------------------------------------------------- /single_domain/batching.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/batching.pyc -------------------------------------------------------------------------------- /single_domain/convert_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Convert Google official BERT models to Fluid parameters.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import numpy as np 21 | import argparse 22 | import collections 23 | from utils.args import print_arguments 24 | import tensorflow as tf 25 | import paddle.fluid as fluid 26 | from tensorflow.python import pywrap_tensorflow 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser(__doc__) 31 | parser.add_argument( 32 | "--init_tf_checkpoint", 33 | type=str, 34 | required=True, 35 | help="Initial TF checkpoint (a pre-trained BERT model).") 36 | 37 | parser.add_argument( 38 | "--fluid_params_dir", 39 | type=str, 40 | required=True, 41 | help="The directory to store converted Fluid parameters.") 42 | args = parser.parse_args() 43 | return args 44 | 45 | 46 | def parse(init_checkpoint): 47 | tf_fluid_param_name_map = collections.OrderedDict() 48 | tf_param_name_shape_map = collections.OrderedDict() 49 | 50 | init_vars = tf.train.list_variables(init_checkpoint) 51 | for (var_name, var_shape) in init_vars: 52 | fluid_param_name = '' 53 | if var_name.startswith('bert/'): 54 | key = var_name[5:] 55 | if (key.startswith('embeddings/')): 56 | if (key.endswith('LayerNorm/gamma')): 57 | fluid_param_name = 'pre_encoder_layer_norm_scale' 58 | elif (key.endswith('LayerNorm/beta')): 59 | fluid_param_name = 'pre_encoder_layer_norm_bias' 60 | elif (key.endswith('position_embeddings')): 61 | fluid_param_name = 'pos_embedding' 62 | elif (key.endswith('word_embeddings')): 63 | fluid_param_name = 'word_embedding' 64 | elif (key.endswith('token_type_embeddings')): 65 | fluid_param_name = 'sent_embedding' 66 | else: 67 | print("ignored param: %s" % var_name) 68 | elif (key.startswith('encoder/')): 
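# e.g. the TF variable "bert/encoder/layer_0/attention/self/query/kernel"
# is renamed to the Fluid parameter "encoder_layer_0_multi_head_att_query_fc.w_0"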
69 | key = key[8:] 70 | layer_num = int(key[key.find('_') + 1:key.find('/')]) 71 | suffix = "encoder_layer_" + str(layer_num) 72 | if key.endswith('attention/output/LayerNorm/beta'): 73 | fluid_param_name = suffix + '_post_att_layer_norm_bias' 74 | elif key.endswith('attention/output/LayerNorm/gamma'): 75 | fluid_param_name = suffix + '_post_att_layer_norm_scale' 76 | elif key.endswith('attention/output/dense/bias'): 77 | fluid_param_name = suffix + '_multi_head_att_output_fc.b_0' 78 | elif key.endswith('attention/output/dense/kernel'): 79 | fluid_param_name = suffix + '_multi_head_att_output_fc.w_0' 80 | elif key.endswith('attention/self/key/bias'): 81 | fluid_param_name = suffix + '_multi_head_att_key_fc.b_0' 82 | elif key.endswith('attention/self/key/kernel'): 83 | fluid_param_name = suffix + '_multi_head_att_key_fc.w_0' 84 | elif key.endswith('attention/self/query/bias'): 85 | fluid_param_name = suffix + '_multi_head_att_query_fc.b_0' 86 | elif key.endswith('attention/self/query/kernel'): 87 | fluid_param_name = suffix + '_multi_head_att_query_fc.w_0' 88 | elif key.endswith('attention/self/value/bias'): 89 | fluid_param_name = suffix + '_multi_head_att_value_fc.b_0' 90 | elif key.endswith('attention/self/value/kernel'): 91 | fluid_param_name = suffix + '_multi_head_att_value_fc.w_0' 92 | elif key.endswith('intermediate/dense/bias'): 93 | fluid_param_name = suffix + '_ffn_fc_0.b_0' 94 | elif key.endswith('intermediate/dense/kernel'): 95 | fluid_param_name = suffix + '_ffn_fc_0.w_0' 96 | elif key.endswith('output/LayerNorm/beta'): 97 | fluid_param_name = suffix + '_post_ffn_layer_norm_bias' 98 | elif key.endswith('output/LayerNorm/gamma'): 99 | fluid_param_name = suffix + '_post_ffn_layer_norm_scale' 100 | elif key.endswith('output/dense/bias'): 101 | fluid_param_name = suffix + '_ffn_fc_1.b_0' 102 | elif key.endswith('output/dense/kernel'): 103 | fluid_param_name = suffix + '_ffn_fc_1.w_0' 104 | else: 105 | print("ignored param: %s" % var_name) 106 | elif (key.startswith('pooler/')): 107 | if key.endswith('dense/bias'): 108 | fluid_param_name = 'pooled_fc.b_0' 109 | elif key.endswith('dense/kernel'): 110 | fluid_param_name = 'pooled_fc.w_0' 111 | else: 112 | print("ignored param: %s" % var_name) 113 | else: 114 | print("ignored param: %s" % var_name) 115 | 116 | elif var_name.startswith('cls/'): 117 | if var_name == 'cls/predictions/output_bias': 118 | fluid_param_name = 'mask_lm_out_fc.b_0' 119 | elif var_name == 'cls/predictions/transform/LayerNorm/beta': 120 | fluid_param_name = 'mask_lm_trans_layer_norm_bias' 121 | elif var_name == 'cls/predictions/transform/LayerNorm/gamma': 122 | fluid_param_name = 'mask_lm_trans_layer_norm_scale' 123 | elif var_name == 'cls/predictions/transform/dense/bias': 124 | fluid_param_name = 'mask_lm_trans_fc.b_0' 125 | elif var_name == 'cls/predictions/transform/dense/kernel': 126 | fluid_param_name = 'mask_lm_trans_fc.w_0' 127 | elif var_name == 'cls/seq_relationship/output_bias': 128 | fluid_param_name = 'next_sent_fc.b_0' 129 | elif var_name == 'cls/seq_relationship/output_weights': 130 | fluid_param_name = 'next_sent_fc.w_0' 131 | elif var_name == 'cls/squad/output_weights': 132 | fluid_param_name = 'cls_squad_out_w' 133 | elif var_name == 'cls/squad/output_bias': 134 | fluid_param_name = 'cls_squad_out_b' 135 | else: 136 | print("ignored param: %s" % var_name) 137 | else: 138 | print("ignored param: %s" % var_name) 139 | 140 | if fluid_param_name != '': 141 | tf_fluid_param_name_map[var_name] = fluid_param_name 142 | 
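# the TF-side shape is recorded as well, so convert() below can create the
# Fluid parameter with matching dimensions before copying the values over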
tf_param_name_shape_map[var_name] = var_shape 143 | fluid_param_name = '' 144 | 145 | return tf_fluid_param_name_map, tf_param_name_shape_map 146 | 147 | 148 | def convert(args): 149 | tf_fluid_param_name_map, tf_param_name_shape_map = parse( 150 | args.init_tf_checkpoint) 151 | program = fluid.Program() 152 | global_block = program.global_block() 153 | for param in tf_fluid_param_name_map: 154 | global_block.create_parameter( 155 | name=tf_fluid_param_name_map[param], 156 | shape=tf_param_name_shape_map[param], 157 | dtype='float32', 158 | initializer=fluid.initializer.Constant(value=0.0)) 159 | 160 | place = fluid.core.CPUPlace() 161 | exe = fluid.Executor(place) 162 | exe.run(program) 163 | 164 | print('---------------------- Converted Parameters -----------------------') 165 | print('###### [TF param name] --> [Fluid param name] [param shape] ######') 166 | print('-------------------------------------------------------------------') 167 | 168 | reader = pywrap_tensorflow.NewCheckpointReader(args.init_tf_checkpoint) 169 | for param in tf_fluid_param_name_map: 170 | value = reader.get_tensor(param) 171 | if param == 'cls/seq_relationship/output_weights': 172 | value = np.transpose(value) 173 | if param == 'cls/squad/output_weights': 174 | value = np.transpose(value) 175 | fluid.global_scope().find_var(tf_fluid_param_name_map[ 176 | param]).get_tensor().set(value, place) 177 | print(param, ' --> ', tf_fluid_param_name_map[param], ' ', value.shape) 178 | 179 | fluid.io.save_params(exe, args.fluid_params_dir, main_program=program) 180 | 181 | 182 | if __name__ == '__main__': 183 | args = parse_args() 184 | print_arguments(args) 185 | convert(args) 186 | -------------------------------------------------------------------------------- /single_domain/diversity_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import tokenization 4 | from utils.args import ArgumentGroup 5 | 6 | parser = argparse.ArgumentParser(__doc__) 7 | g = ArgumentGroup(parser, "model", "model configuration and paths.") 8 | g.add_arg("sample_strategy", str, 'max_sample', "Sample strategy.") 9 | g.add_arg("weights_path", str, None, "Path to the weights file.") 10 | g.add_arg("adv_text_path", str, None, "Path to save the generated adversarial sentences.") 11 | g.add_arg("bert_vocab_file", str, None, "Path to the bert vocab file.") 12 | args = parser.parse_args() 13 | 14 | adv_seq_len = -1 15 | do_lower_case = True 16 | tokenizer = tokenization.FullTokenizer( 17 | vocab_file=args.bert_vocab_file, do_lower_case=do_lower_case) 18 | if args.sample_strategy == 'max_sample': 19 | with open(args.weights_path, 'r') as fin, open(args.adv_text_path, 'w') as fout: 20 | items = fin.read().split('\n\n') 21 | for idx, item in enumerate(items): 22 | if item == "": 23 | continue 24 | lines = item.split('\n') 25 | if idx == 0: 26 | adv_seq_len = int(lines[0]) 27 | del lines[0] 28 | assert lines.__len__() == adv_seq_len + 2 29 | qas_id = lines[0] 30 | # seq_weights = np.array([float(item) for item in [x for x in [l.split(' ') for l in lines[1: 1 + adv_seq_len]]]]) 31 | seq_weights = np.array([[float(x) for x in l.split(' ')] for l in lines[1: 1 + adv_seq_len]]) 32 | vocab_id = np.array([int(x) for x in lines[-1].split(' ')]) 33 | 34 | ids = vocab_id[np.array([np.argmax(seq_weights[i]) for i in range(adv_seq_len)])] 35 | tokens = tokenizer.convert_ids_to_tokens(ids) 36 | 37 | fout.write(qas_id) 38 | fout.write('\t') 39 | fout.write(" ".join(tokens)) 40 | 
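# NOTE: with the 'max_sample' strategy, each position takes the candidate
# vocab id with the largest weight (an argmax over that position's weight
# row), so the output is one line per question: "<qas_id>\t<sampled tokens>".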
fout.write('\n') 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /single_domain/gen_logits_batching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Mask, padding and batching.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import numpy as np 21 | 22 | 23 | def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): 24 | """ 25 | Add mask for batch_tokens, return out, mask_label, mask_pos; 26 | Note: mask_pos responding the batch_tokens after padded; 27 | """ 28 | max_len = max([len(sent) for sent in batch_tokens]) 29 | mask_label = [] 30 | mask_pos = [] 31 | prob_mask = np.random.rand(total_token_num) 32 | # Note: the first token is [CLS], so [low=1] 33 | replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) 34 | pre_sent_len = 0 35 | prob_index = 0 36 | for sent_index, sent in enumerate(batch_tokens): 37 | mask_flag = False 38 | prob_index += pre_sent_len 39 | for token_index, token in enumerate(sent): 40 | prob = prob_mask[prob_index + token_index] 41 | if prob > 0.15: 42 | continue 43 | elif 0.03 < prob <= 0.15: 44 | # mask 45 | if token != SEP and token != CLS: 46 | mask_label.append(sent[token_index]) 47 | sent[token_index] = MASK 48 | mask_flag = True 49 | mask_pos.append(sent_index * max_len + token_index) 50 | elif 0.015 < prob <= 0.03: 51 | # random replace 52 | if token != SEP and token != CLS: 53 | mask_label.append(sent[token_index]) 54 | sent[token_index] = replace_ids[prob_index + token_index] 55 | mask_flag = True 56 | mask_pos.append(sent_index * max_len + token_index) 57 | else: 58 | # keep the original token 59 | if token != SEP and token != CLS: 60 | mask_label.append(sent[token_index]) 61 | mask_pos.append(sent_index * max_len + token_index) 62 | pre_sent_len = len(sent) 63 | 64 | # ensure at least mask one word in a sentence 65 | while not mask_flag: 66 | token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) 67 | if sent[token_index] != SEP and sent[token_index] != CLS: 68 | mask_label.append(sent[token_index]) 69 | sent[token_index] = MASK 70 | mask_flag = True 71 | mask_pos.append(sent_index * max_len + token_index) 72 | mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) 73 | mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) 74 | return batch_tokens, mask_label, mask_pos 75 | 76 | 77 | def prepare_batch_data(insts, 78 | total_token_num, 79 | voc_size=0, 80 | pad_id=None, 81 | cls_id=None, 82 | sep_id=None, 83 | mask_id=None, 84 | return_input_mask=True, 85 | return_max_len=True, 86 | return_num_token=False): 87 | """ 88 | 1. generate Tensor of data 89 | 2. generate Tensor of position 90 | 3. 
generate self attention mask, [shape: batch_size * max_len * max_len] 91 | """ 92 | 93 | batch_src_ids = [inst[0] for inst in insts] 94 | batch_sent_ids = [inst[1] for inst in insts] 95 | batch_pos_ids = [inst[2] for inst in insts] 96 | labels_list = [] 97 | # compatible with squad, whose example includes start/end positions, 98 | # or unique id 99 | 100 | for i in range(4, len(insts[0]), 1): 101 | labels = [inst[i] for inst in insts] 102 | labels = np.array(labels).astype("int64").reshape([-1, 1]) 103 | labels_list.append(labels) 104 | 105 | # First step: do mask without padding 106 | if mask_id >= 0: 107 | out, mask_label, mask_pos = mask( 108 | batch_src_ids, 109 | total_token_num, 110 | vocab_size=voc_size, 111 | CLS=cls_id, 112 | SEP=sep_id, 113 | MASK=mask_id) 114 | else: 115 | out = batch_src_ids 116 | # Second step: padding 117 | src_id, self_input_mask = pad_batch_data( 118 | out, pad_idx=pad_id, return_input_mask=True) 119 | pos_id = pad_batch_data( 120 | batch_pos_ids, 121 | pad_idx=pad_id, 122 | return_pos=False, 123 | return_input_mask=False) 124 | sent_id = pad_batch_data( 125 | batch_sent_ids, 126 | pad_idx=pad_id, 127 | return_pos=False, 128 | return_input_mask=False) 129 | 130 | if mask_id >= 0: 131 | return_list = [ 132 | src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos 133 | ] + labels_list 134 | else: 135 | return_list = [src_id, pos_id, sent_id, self_input_mask, [inst[3] for inst in insts]] + labels_list 136 | 137 | return return_list if len(return_list) > 1 else return_list[0] 138 | 139 | 140 | def pad_batch_data(insts, 141 | pad_idx=0, 142 | return_pos=False, 143 | return_input_mask=False, 144 | return_max_len=False, 145 | return_num_token=False): 146 | """ 147 | Pad the instances to the max sequence length in batch, and generate the 148 | corresponding position data and input mask. 149 | """ 150 | return_list = [] 151 | max_len = max(len(inst) for inst in insts) 152 | # Any token included in dict can be used to pad, since the paddings' loss 153 | # will be masked out by weights and make no effect on parameter gradients. 154 | 155 | inst_data = np.array([ 156 | list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts 157 | ]) 158 | return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] 159 | 160 | # position data 161 | if return_pos: 162 | inst_pos = np.array([ 163 | list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) 164 | for inst in insts 165 | ]) 166 | 167 | return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] 168 | 169 | if return_input_mask: 170 | # This is used to avoid attention on paddings. 
171 | input_mask_data = np.array([[1] * len(inst) + [0] * 172 | (max_len - len(inst)) for inst in insts]) 173 | input_mask_data = np.expand_dims(input_mask_data, axis=-1) 174 | return_list += [input_mask_data.astype("float32")] 175 | 176 | if return_max_len: 177 | return_list += [max_len] 178 | 179 | if return_num_token: 180 | num_token = 0 181 | for inst in insts: 182 | num_token += len(inst) 183 | return_list += [num_token] 184 | 185 | return return_list if len(return_list) > 1 else return_list[0] 186 | 187 | 188 | if __name__ == "__main__": 189 | pass 190 | -------------------------------------------------------------------------------- /single_domain/inference/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.2) 2 | PROJECT(inference_demo) 3 | SET(CMAKE_C_COMPILER gcc) 4 | SET(CMAKE_CXX_COMPILER g++) 5 | ADD_COMPILE_OPTIONS(-std=c++11 -D_GLIBCXX_USE_CXX11_ABI=0) 6 | 7 | SET(FLUID_INFER_LIB fluid_inference) 8 | SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include) 9 | SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib) 10 | 11 | SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include) 12 | SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib) 13 | 14 | SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include) 15 | SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib) 16 | SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib) 17 | SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib) 18 | 19 | INCLUDE_DIRECTORIES(${FLUID_INC_PATH}) 20 | INCLUDE_DIRECTORIES(${GLOG_INC_PATH}) 21 | INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH}) 22 | 23 | LINK_DIRECTORIES(${FLUID_LIB_PATH}) 24 | LINK_DIRECTORIES(${GLOG_LIB_PATH}) 25 | LINK_DIRECTORIES(${GFLAGS_LIB_PATH}) 26 | LINK_DIRECTORIES(${MKLML_LIB_PATH}) 27 | LINK_DIRECTORIES(${MKLDNN_LIB_PATH}) 28 | 29 | ADD_EXECUTABLE(inference inference.cc) 30 | TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread) 31 | 32 | -------------------------------------------------------------------------------- /single_domain/inference/gen_demo_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | sys.path.append("..") 4 | from reader import cls 5 | 6 | 7 | def main(): 8 | args = parse_args() 9 | task_name = args.task_name.lower() 10 | processors = { 11 | 'xnli': cls.XnliProcessor, 12 | 'cola': cls.ColaProcessor, 13 | 'mrpc': cls.MrpcProcessor, 14 | 'mnli': cls.MnliProcessor, 15 | } 16 | 17 | processor = processors[task_name](data_dir=args.data_path, 18 | vocab_path=args.vocab_path, 19 | max_seq_len=args.max_seq_len, 20 | do_lower_case=args.do_lower_case, 21 | in_tokens=args.in_tokens, 22 | random_seed=args.random_seed) 23 | example = processor.get_test_examples(args.data_path)[0] 24 | gen = processor.data_generator( 25 | args.batch_size, phase='test', epoch=1, shuffle=False)() 26 | 27 | for i, data in enumerate(gen): 28 | data = data[:4] 29 | sample = [] 30 | for field in data: 31 | shape_str = ' '.join(map(str, field.shape)) 32 | data_str = ' '.join(map(str, field.reshape(-1).tolist())) 33 | sample.append(shape_str + ':' + data_str) 34 | print(';'.join(sample)) 35 | 36 | 37 | def str2bool(v): 38 | # because argparse does not support to parse "true, False" as python 39 | # boolean directly 40 | return v.lower() in ("true", "t", "1") 41 | 42 | 43 | def parse_args(): 44 | parser = argparse.ArgumentParser(prog="bert data 
prepare") 45 | parser.add_argument( 46 | "--task_name", 47 | type=str, 48 | default='xnli', 49 | choices=["xnli", "mnli", "cola", "mrpc"], 50 | help="task name, used to specify data preprocessor") 51 | parser.add_argument( 52 | "--batch_size", 53 | type=int, 54 | default=4096, 55 | help="batch size, see also --in_tokens") 56 | parser.add_argument( 57 | "--in_tokens", 58 | action='store_true', 59 | help="if set, batch_size means token number in a batch, otherwise " 60 | "it means example number in a batch") 61 | parser.add_argument( 62 | '--do_lower_case', 63 | type=str2bool, 64 | default=True, 65 | choices=[True, False], 66 | help="Whether to lower case the input text. Should be True for uncased " 67 | "models and False for cased models.") 68 | parser.add_argument("--vocab_path", type=str, help="path of vocabulary") 69 | parser.add_argument("--data_path", type=str, help="path of data to process") 70 | parser.add_argument( 71 | "--max_seq_len", type=int, default=128, help="max sequence length") 72 | parser.add_argument( 73 | "--random_seed", type=int, default=0, help="random seed") 74 | return parser.parse_args() 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /single_domain/inference/inference.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 
15 | #include <gflags/gflags.h>
16 | #include <glog/logging.h>
17 | #include <paddle_inference_api.h>
18 | #include <algorithm>
19 | #include <chrono>
20 | #include <fstream>
21 | #include <iostream>
22 | #include <numeric>
23 | #include <sstream>
24 | #include <vector>
25 | 
26 | DEFINE_string(model_dir, "", "Inference model directory.");
27 | DEFINE_string(data, "", "Input data path.");
28 | DEFINE_int32(repeat, 1, "Repeat times.");
29 | DEFINE_int32(num_labels, 3, "Number of labels.");
30 | DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
31 | DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
32 | 
33 | template <typename T>
34 | void GetValueFromStream(std::stringstream *ss, T *t) {
35 |   (*ss) >> (*t);
36 | }
37 | 
38 | template <>
39 | void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
40 |   *t = ss->str();
41 | }
42 | 
43 | // Split string to vector
44 | template <typename T>
45 | void Split(const std::string &line, char sep, std::vector<T> *v) {
46 |   std::stringstream ss;
47 |   T t;
48 |   for (auto c : line) {
49 |     if (c != sep) {
50 |       ss << c;
51 |     } else {
52 |       GetValueFromStream(&ss, &t);
53 |       v->push_back(std::move(t));
54 |       ss.str({});
55 |       ss.clear();
56 |     }
57 |   }
58 | 
59 |   if (!ss.str().empty()) {
60 |     GetValueFromStream(&ss, &t);
61 |     v->push_back(std::move(t));
62 |     ss.str({});
63 |     ss.clear();
64 |   }
65 | }
66 | 
67 | template <typename T>
68 | constexpr paddle::PaddleDType GetPaddleDType();
69 | 
70 | template <>
71 | constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
72 |   return paddle::PaddleDType::INT64;
73 | }
74 | 
75 | template <>
76 | constexpr paddle::PaddleDType GetPaddleDType<float>() {
77 |   return paddle::PaddleDType::FLOAT32;
78 | }
79 | 
80 | 
81 | // Parse tensor from string
82 | template <typename T>
83 | bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
84 |   std::vector<std::string> data;
85 |   Split(field, ':', &data);
86 |   if (data.size() < 2) {
87 |     LOG(ERROR) << "parse tensor error!";
88 |     return false;
89 |   }
90 | 
91 |   std::string shape_str = data[0];
92 | 
93 |   std::vector<int> shape;
94 |   Split(shape_str, ' ', &shape);
95 | 
96 |   std::string mat_str = data[1];
97 | 
98 |   std::vector<T> mat;
99 |   Split(mat_str, ' ', &mat);
100 | 
101 |   tensor->shape = shape;
102 |   auto size =
103 |       std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
104 |       sizeof(T);
105 |   tensor->data.Resize(size);
106 |   std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
107 |   tensor->dtype = GetPaddleDType<T>();
108 | 
109 |   return true;
110 | }
111 | 
112 | // Parse input tensors from string
113 | bool ParseLine(const std::string &line,
114 |                std::vector<paddle::PaddleTensor> *tensors) {
115 |   std::vector<std::string> fields;
116 |   Split(line, ';', &fields);
117 | 
118 |   if (fields.size() < 4) return false;
119 | 
120 |   tensors->clear();
121 |   tensors->reserve(4);
122 | 
123 |   int i = 0;
124 |   // src_id
125 |   paddle::PaddleTensor src_id;
126 |   ParseTensor<int64_t>(fields[i++], &src_id);
127 |   tensors->push_back(src_id);
128 | 
129 |   // pos_id
130 |   paddle::PaddleTensor pos_id;
131 |   ParseTensor<int64_t>(fields[i++], &pos_id);
132 |   tensors->push_back(pos_id);
133 | 
134 |   // segment_id
135 |   paddle::PaddleTensor segment_id;
136 |   ParseTensor<int64_t>(fields[i++], &segment_id);
137 |   tensors->push_back(segment_id);
138 | 
139 |   // input mask
140 |   paddle::PaddleTensor input_mask;
141 |   ParseTensor<float>(fields[i++], &input_mask);
142 |   tensors->push_back(input_mask);
143 | 
144 |   return true;
145 | }
146 | 
147 | template <typename T>
148 | void PrintTensor(const paddle::PaddleTensor &t) {
149 |   std::stringstream ss;
150 |   ss.str({});
151 |   ss.clear();
152 |   ss << "Tensor: shape[";
153 |   for (auto i : t.shape) {
154 |     ss << i << " ";
155 |   }
156 |   ss << "], data[";
157 |   T *data = static_cast<T *>(t.data.data());
158 |   for (int i = 0; i < t.data.length() / sizeof(T); i++) {
159 |     ss << data[i] << " ";
160 |   }
161 | 
162 |   ss << "]";
163 |   LOG(INFO) << ss.str();
164 | }
165 | 
166 | void PrintInputs(const std::vector<paddle::PaddleTensor> &inputs) {
167 |   for (const auto &t : inputs) {
168 |     if (t.dtype == paddle::PaddleDType::INT64) {
169 |       PrintTensor<int64_t>(t);
170 |     } else {
171 |       PrintTensor<float>(t);
172 |     }
173 |   }
174 | }
175 | 
176 | // Print outputs to log
177 | void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs, int &cnt) {
178 |   for (size_t i = 0; i < outputs.front().data.length() / sizeof(float);
179 |        i += FLAGS_num_labels) {
180 |     std::cout << cnt << "\t";
181 |     for (size_t j = 0; j < FLAGS_num_labels; ++j) {
182 |       std::cout << static_cast<float *>(outputs.front().data.data())[i + j] << "\t";
183 |     }
184 |     std::cout << std::endl;
185 |     cnt += 1;
186 |   }
187 | }
188 | 
189 | bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
190 |   if (FLAGS_data.empty()) {
191 |     LOG(ERROR) << "please set input data path";
192 |     return false;
193 |   }
194 | 
195 |   std::ifstream fin(FLAGS_data);
196 |   std::string line;
197 | 
198 |   int lineno = 0;
199 |   while (std::getline(fin, line)) {
200 |     std::vector<paddle::PaddleTensor> feed_data;
201 |     if (!ParseLine(line, &feed_data)) {
202 |       LOG(ERROR) << "Parse line[" << lineno << "] error!";
203 |     } else {
204 |       inputs->push_back(std::move(feed_data));
205 |     }
206 |   }
207 | 
208 |   return true;
209 | }
210 | 
211 | int main(int argc, char *argv[]) {
212 |   google::InitGoogleLogging(*argv);
213 |   gflags::ParseCommandLineFlags(&argc, &argv, true);
214 | 
215 |   if (FLAGS_model_dir.empty()) {
216 |     LOG(ERROR) << "please set model dir";
217 |     return -1;
218 |   }
219 | 
220 |   paddle::NativeConfig config;
221 |   config.model_dir = FLAGS_model_dir;
222 |   if (FLAGS_use_gpu) {
223 |     config.use_gpu = true;
224 |     config.fraction_of_gpu_memory = 0.15;
225 |     config.device = 0;
226 |   }
227 | 
228 |   auto predictor = CreatePaddlePredictor<paddle::NativeConfig>(config);
229 | 
230 |   std::vector<std::vector<paddle::PaddleTensor>> inputs;
231 |   if (!LoadInputData(&inputs)) {
232 |     LOG(ERROR) << "load input data error!";
233 |     return -1;
234 |   }
235 | 
236 |   std::vector<paddle::PaddleTensor> fetch;
237 |   int total_time{0};
238 |   int num_samples{0};
239 |   int out_cnt = 0;
240 |   for (int i = 0; i < FLAGS_repeat; i++) {
241 |     for (auto feed : inputs) {
242 |       fetch.clear();
243 |       auto start = std::chrono::system_clock::now();
244 |       predictor->Run(feed, &fetch);
245 |       if (FLAGS_output_prediction && i == 0) {
246 |         PrintOutputs(fetch, out_cnt);
247 |       }
248 |       auto end = std::chrono::system_clock::now();
249 |       if (!fetch.empty()) {
250 |         total_time +=
251 |             std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
252 |                 .count();
253 |         num_samples += fetch.front().data.length() / FLAGS_num_labels / sizeof(float);
254 |       }
255 |     }
256 |   }
257 | 
258 | 
259 |   auto per_sample_ms =
260 |       static_cast<float>(total_time) / num_samples;
261 |   LOG(INFO) << "Run on " << num_samples
262 |             << " samples over " << FLAGS_repeat << " times, average latency: " << per_sample_ms
263 |             << "ms per sample.";
264 | 
265 |   return 0;
266 | }
267 | 
--------------------------------------------------------------------------------
/single_domain/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/model/__init__.py
--------------------------------------------------------------------------------
/single_domain/model/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/model/__init__.pyc
--------------------------------------------------------------------------------
/single_domain/model/bert.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """BERT model."""
15 | 
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 | 
20 | import six
21 | import json
22 | import numpy as np
23 | import paddle.fluid as fluid
24 | from model.transformer_encoder import encoder, pre_process_layer
25 | 
26 | 
27 | class BertConfig(object):
28 |     def __init__(self, config_path):
29 |         self._config_dict = self._parse(config_path)
30 | 
31 |     def _parse(self, config_path):
32 |         try:
33 |             with open(config_path) as json_file:
34 |                 config_dict = json.load(json_file)
35 |         except Exception:
36 |             raise IOError("Error in parsing bert model config file '%s'" %
37 |                           config_path)
38 |         else:
39 |             return config_dict
40 | 
41 |     def __getitem__(self, key):
42 |         return self._config_dict[key]
43 | 
44 |     def print_config(self):
45 |         for arg, value in sorted(six.iteritems(self._config_dict)):
46 |             print('%s: %s' % (arg, value))
47 |         print('------------------------------------------------')
48 | 
49 | 
50 | class BertModel(object):
51 |     def __init__(self,
52 |                  src_ids,
53 |                  position_ids,
54 |                  sentence_ids,
55 |                  input_mask,
56 |                  config,
57 |                  weight_sharing=True,
58 |                  use_fp16=False):
59 | 
60 |         self._emb_size = config['hidden_size']
61 |         self._n_layer = config['num_hidden_layers']
62 |         self._n_head = config['num_attention_heads']
63 |         self._voc_size = config['vocab_size']
64 |         self._max_position_seq_len = config['max_position_embeddings']
65 |         self._sent_types = config['type_vocab_size']
66 |         self._hidden_act = config['hidden_act']
67 |         self._prepostprocess_dropout = config['hidden_dropout_prob']
68 |         self._attention_dropout = config['attention_probs_dropout_prob']
69 |         self._weight_sharing = weight_sharing
70 | 
71 |         self._word_emb_name = "word_embedding"
72 |         self._pos_emb_name = "pos_embedding"
73 |         self._sent_emb_name = "sent_embedding"
74 |         self._dtype = "float16" if use_fp16 else "float32"
75 | 
76 |         # Initialize all weights with the truncated normal initializer; all biases
77 |         # will be initialized by constant zero by default.
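        # (editorial note, not part of the original source) For the
        # uncased_L-12_H-768_A-12 checkpoint this project downloads,
        # bert_config.json sets hidden_size=768, num_hidden_layers=12,
        # num_attention_heads=12 and initializer_range=0.02, so the truncated
        # normal initializer below is created with scale 0.02.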
78 | self._param_initializer = fluid.initializer.TruncatedNormal( 79 | scale=config['initializer_range']) 80 | 81 | self._build_model(src_ids, position_ids, sentence_ids, input_mask) 82 | 83 | def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): 84 | # padding id in vocabulary must be set to 0 85 | emb_out = fluid.layers.embedding( 86 | input=src_ids, 87 | size=[self._voc_size, self._emb_size], 88 | dtype=self._dtype, 89 | param_attr=fluid.ParamAttr( 90 | name=self._word_emb_name, initializer=self._param_initializer), 91 | is_sparse=False) 92 | position_emb_out = fluid.layers.embedding( 93 | input=position_ids, 94 | size=[self._max_position_seq_len, self._emb_size], 95 | dtype=self._dtype, 96 | param_attr=fluid.ParamAttr( 97 | name=self._pos_emb_name, initializer=self._param_initializer)) 98 | 99 | sent_emb_out = fluid.layers.embedding( 100 | sentence_ids, 101 | size=[self._sent_types, self._emb_size], 102 | dtype=self._dtype, 103 | param_attr=fluid.ParamAttr( 104 | name=self._sent_emb_name, initializer=self._param_initializer)) 105 | 106 | emb_out = emb_out + position_emb_out 107 | emb_out = emb_out + sent_emb_out 108 | 109 | emb_out = pre_process_layer( 110 | emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') 111 | 112 | if self._dtype == "float16": 113 | input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) 114 | 115 | self_attn_mask = fluid.layers.matmul( 116 | x=input_mask, y=input_mask, transpose_y=True) 117 | self_attn_mask = fluid.layers.scale( 118 | x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) 119 | n_head_self_attn_mask = fluid.layers.stack( 120 | x=[self_attn_mask] * self._n_head, axis=1) 121 | n_head_self_attn_mask.stop_gradient = True 122 | 123 | self._enc_out = encoder( 124 | enc_input=emb_out, 125 | attn_bias=n_head_self_attn_mask, 126 | n_layer=self._n_layer, 127 | n_head=self._n_head, 128 | d_key=self._emb_size // self._n_head, 129 | d_value=self._emb_size // self._n_head, 130 | d_model=self._emb_size, 131 | d_inner_hid=self._emb_size * 4, 132 | prepostprocess_dropout=self._prepostprocess_dropout, 133 | attention_dropout=self._attention_dropout, 134 | relu_dropout=0, 135 | hidden_act=self._hidden_act, 136 | preprocess_cmd="", 137 | postprocess_cmd="dan", 138 | param_initializer=self._param_initializer, 139 | name='encoder') 140 | 141 | def get_sequence_output(self): 142 | return self._enc_out 143 | 144 | def get_pooled_output(self): 145 | """Get the first feature of each sequence for classification""" 146 | 147 | next_sent_feat = fluid.layers.slice( 148 | input=self._enc_out, axes=[1], starts=[0], ends=[1]) 149 | next_sent_feat = fluid.layers.fc( 150 | input=next_sent_feat, 151 | size=self._emb_size, 152 | act="tanh", 153 | param_attr=fluid.ParamAttr( 154 | name="pooled_fc.w_0", initializer=self._param_initializer), 155 | bias_attr="pooled_fc.b_0") 156 | return next_sent_feat 157 | 158 | def get_pretraining_output(self, mask_label, mask_pos, labels): 159 | """Get the loss & accuracy for pretraining""" 160 | 161 | mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') 162 | 163 | # extract the first token feature in each sentence 164 | next_sent_feat = self.get_pooled_output() 165 | reshaped_emb_out = fluid.layers.reshape( 166 | x=self._enc_out, shape=[-1, self._emb_size]) 167 | # extract masked tokens' feature 168 | mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) 169 | 170 | # transform: fc 171 | mask_trans_feat = fluid.layers.fc( 172 | input=mask_feat, 173 | size=self._emb_size, 
174 | act=self._hidden_act, 175 | param_attr=fluid.ParamAttr( 176 | name='mask_lm_trans_fc.w_0', 177 | initializer=self._param_initializer), 178 | bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) 179 | # transform: layer norm 180 | mask_trans_feat = pre_process_layer( 181 | mask_trans_feat, 'n', name='mask_lm_trans') 182 | 183 | mask_lm_out_bias_attr = fluid.ParamAttr( 184 | name="mask_lm_out_fc.b_0", 185 | initializer=fluid.initializer.Constant(value=0.0)) 186 | if self._weight_sharing: 187 | fc_out = fluid.layers.matmul( 188 | x=mask_trans_feat, 189 | y=fluid.default_main_program().global_block().var( 190 | self._word_emb_name), 191 | transpose_y=True) 192 | fc_out += fluid.layers.create_parameter( 193 | shape=[self._voc_size], 194 | dtype=self._dtype, 195 | attr=mask_lm_out_bias_attr, 196 | is_bias=True) 197 | 198 | else: 199 | fc_out = fluid.layers.fc(input=mask_trans_feat, 200 | size=self._voc_size, 201 | param_attr=fluid.ParamAttr( 202 | name="mask_lm_out_fc.w_0", 203 | initializer=self._param_initializer), 204 | bias_attr=mask_lm_out_bias_attr) 205 | 206 | mask_lm_loss = fluid.layers.softmax_with_cross_entropy( 207 | logits=fc_out, label=mask_label) 208 | mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) 209 | 210 | next_sent_fc_out = fluid.layers.fc( 211 | input=next_sent_feat, 212 | size=2, 213 | param_attr=fluid.ParamAttr( 214 | name="next_sent_fc.w_0", initializer=self._param_initializer), 215 | bias_attr="next_sent_fc.b_0") 216 | 217 | next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( 218 | logits=next_sent_fc_out, label=labels, return_softmax=True) 219 | 220 | next_sent_acc = fluid.layers.accuracy( 221 | input=next_sent_softmax, label=labels) 222 | 223 | mean_next_sent_loss = fluid.layers.mean(next_sent_loss) 224 | 225 | loss = mean_next_sent_loss + mean_mask_lm_loss 226 | return next_sent_acc, mean_mask_lm_loss, loss 227 | -------------------------------------------------------------------------------- /single_domain/model/bert.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/model/bert.pyc -------------------------------------------------------------------------------- /single_domain/model/classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Model for classifier.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import paddle.fluid as fluid 21 | 22 | from model.bert import BertModel 23 | 24 | 25 | def create_model(args, 26 | pyreader_name, 27 | bert_config, 28 | num_labels, 29 | is_prediction=False): 30 | pyreader = fluid.layers.py_reader( 31 | capacity=50, 32 | shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], 33 | [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], 34 | dtypes=['int64', 'int64', 'int64', 'float32', 'int64'], 35 | lod_levels=[0, 0, 0, 0, 0], 36 | name=pyreader_name, 37 | use_double_buffer=True) 38 | 39 | (src_ids, pos_ids, sent_ids, input_mask, 40 | labels) = fluid.layers.read_file(pyreader) 41 | 42 | bert = BertModel( 43 | src_ids=src_ids, 44 | position_ids=pos_ids, 45 | sentence_ids=sent_ids, 46 | input_mask=input_mask, 47 | config=bert_config, 48 | use_fp16=args.use_fp16) 49 | 50 | cls_feats = bert.get_pooled_output() 51 | cls_feats = fluid.layers.dropout( 52 | x=cls_feats, 53 | dropout_prob=0.1, 54 | dropout_implementation="upscale_in_train") 55 | logits = fluid.layers.fc( 56 | input=cls_feats, 57 | size=num_labels, 58 | param_attr=fluid.ParamAttr( 59 | name="cls_out_w", 60 | initializer=fluid.initializer.TruncatedNormal(scale=0.02)), 61 | bias_attr=fluid.ParamAttr( 62 | name="cls_out_b", initializer=fluid.initializer.Constant(0.))) 63 | 64 | if is_prediction: 65 | probs = fluid.layers.softmax(logits) 66 | feed_targets_name = [ 67 | src_ids.name, pos_ids.name, sent_ids.name, input_mask.name 68 | ] 69 | return pyreader, probs, feed_targets_name 70 | 71 | ce_loss, probs = fluid.layers.softmax_with_cross_entropy( 72 | logits=logits, label=labels, return_softmax=True) 73 | loss = fluid.layers.mean(x=ce_loss) 74 | 75 | if args.use_fp16 and args.loss_scaling > 1.0: 76 | loss *= args.loss_scaling 77 | 78 | num_seqs = fluid.layers.create_tensor(dtype='int64') 79 | accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) 80 | 81 | return pyreader, loss, probs, accuracy, num_seqs 82 | -------------------------------------------------------------------------------- /single_domain/model/transformer_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Transformer encoder.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | from functools import partial 21 | import numpy as np 22 | 23 | import paddle.fluid as fluid 24 | import paddle.fluid.layers as layers 25 | 26 | 27 | def multi_head_attention(queries, 28 | keys, 29 | values, 30 | attn_bias, 31 | d_key, 32 | d_value, 33 | d_model, 34 | n_head=1, 35 | dropout_rate=0., 36 | cache=None, 37 | param_initializer=None, 38 | name='multi_head_att'): 39 | """ 40 | Multi-Head Attention. Note that attn_bias is added to the logit before 41 | computing softmax activiation to mask certain selected positions so that 42 | they will not considered in attention weights. 43 | """ 44 | keys = queries if keys is None else keys 45 | values = keys if values is None else values 46 | 47 | if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): 48 | raise ValueError( 49 | "Inputs: quries, keys and values should all be 3-D tensors.") 50 | 51 | def __compute_qkv(queries, keys, values, n_head, d_key, d_value): 52 | """ 53 | Add linear projection to queries, keys, and values. 54 | """ 55 | q = layers.fc(input=queries, 56 | size=d_key * n_head, 57 | num_flatten_dims=2, 58 | param_attr=fluid.ParamAttr( 59 | name=name + '_query_fc.w_0', 60 | initializer=param_initializer), 61 | bias_attr=name + '_query_fc.b_0') 62 | k = layers.fc(input=keys, 63 | size=d_key * n_head, 64 | num_flatten_dims=2, 65 | param_attr=fluid.ParamAttr( 66 | name=name + '_key_fc.w_0', 67 | initializer=param_initializer), 68 | bias_attr=name + '_key_fc.b_0') 69 | v = layers.fc(input=values, 70 | size=d_value * n_head, 71 | num_flatten_dims=2, 72 | param_attr=fluid.ParamAttr( 73 | name=name + '_value_fc.w_0', 74 | initializer=param_initializer), 75 | bias_attr=name + '_value_fc.b_0') 76 | return q, k, v 77 | 78 | def __split_heads(x, n_head): 79 | """ 80 | Reshape the last dimension of inpunt tensor x so that it becomes two 81 | dimensions and then transpose. Specifically, input a tensor with shape 82 | [bs, max_sequence_length, n_head * hidden_dim] then output a tensor 83 | with shape [bs, n_head, max_sequence_length, hidden_dim]. 84 | """ 85 | hidden_size = x.shape[-1] 86 | # The value 0 in shape attr means copying the corresponding dimension 87 | # size of the input as the output dimension size. 88 | reshaped = layers.reshape( 89 | x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) 90 | 91 | # permuate the dimensions into: 92 | # [batch_size, n_head, max_sequence_len, hidden_size_per_head] 93 | return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) 94 | 95 | def __combine_heads(x): 96 | """ 97 | Transpose and then reshape the last two dimensions of inpunt tensor x 98 | so that it becomes one dimension, which is reverse to __split_heads. 99 | """ 100 | if len(x.shape) == 3: return x 101 | if len(x.shape) != 4: 102 | raise ValueError("Input(x) should be a 4-D Tensor.") 103 | 104 | trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) 105 | # The value 0 in shape attr means copying the corresponding dimension 106 | # size of the input as the output dimension size. 
107 |         return layers.reshape(
108 |             x=trans_x,
109 |             shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
110 |             inplace=True)
111 | 
112 |     def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
113 |         """
114 |         Scaled Dot-Product Attention
115 |         """
116 |         scaled_q = layers.scale(x=q, scale=d_key**-0.5)
117 |         product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
118 |         if attn_bias:
119 |             product += attn_bias
120 |         weights = layers.softmax(product)
121 |         if dropout_rate:
122 |             weights = layers.dropout(
123 |                 weights,
124 |                 dropout_prob=dropout_rate,
125 |                 dropout_implementation="upscale_in_train",
126 |                 is_test=False)
127 |         out = layers.matmul(weights, v)
128 |         return out
129 | 
130 |     q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
131 | 
132 |     if cache is not None:  # use cache and concat time steps
133 |         # Since the inplace reshape in __split_heads changes the shape of k and
134 |         # v, which is the cache input for next time step, reshape the cache
135 |         # input from the previous time step first.
136 |         k = cache["k"] = layers.concat(
137 |             [layers.reshape(
138 |                 cache["k"], shape=[0, 0, d_model]), k], axis=1)
139 |         v = cache["v"] = layers.concat(
140 |             [layers.reshape(
141 |                 cache["v"], shape=[0, 0, d_model]), v], axis=1)
142 | 
143 |     q = __split_heads(q, n_head)
144 |     k = __split_heads(k, n_head)
145 |     v = __split_heads(v, n_head)
146 | 
147 |     ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
148 |                                                   dropout_rate)
149 | 
150 |     out = __combine_heads(ctx_multiheads)
151 | 
152 |     # Project back to the model size.
153 |     proj_out = layers.fc(input=out,
154 |                          size=d_model,
155 |                          num_flatten_dims=2,
156 |                          param_attr=fluid.ParamAttr(
157 |                              name=name + '_output_fc.w_0',
158 |                              initializer=param_initializer),
159 |                          bias_attr=name + '_output_fc.b_0')
160 |     return proj_out
161 | 
162 | 
163 | def positionwise_feed_forward(x,
164 |                               d_inner_hid,
165 |                               d_hid,
166 |                               dropout_rate,
167 |                               hidden_act,
168 |                               param_initializer=None,
169 |                               name='ffn'):
170 |     """
171 |     Position-wise Feed-Forward Networks.
172 |     This module consists of two linear transformations with a ReLU activation
173 |     in between, which is applied to each position separately and identically.
174 |     """
175 |     hidden = layers.fc(input=x,
176 |                        size=d_inner_hid,
177 |                        num_flatten_dims=2,
178 |                        act=hidden_act,
179 |                        param_attr=fluid.ParamAttr(
180 |                            name=name + '_fc_0.w_0',
181 |                            initializer=param_initializer),
182 |                        bias_attr=name + '_fc_0.b_0')
183 |     if dropout_rate:
184 |         hidden = layers.dropout(
185 |             hidden,
186 |             dropout_prob=dropout_rate,
187 |             dropout_implementation="upscale_in_train",
188 |             is_test=False)
189 |     out = layers.fc(input=hidden,
190 |                     size=d_hid,
191 |                     num_flatten_dims=2,
192 |                     param_attr=fluid.ParamAttr(
193 |                         name=name + '_fc_1.w_0', initializer=param_initializer),
194 |                     bias_attr=name + '_fc_1.b_0')
195 |     return out
196 | 
197 | 
198 | def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
199 |                            name=''):
200 |     """
201 |     Add residual connection, layer normalization and dropout to the out tensor
202 |     optionally according to the value of process_cmd.
203 |     This will be used before or after multi-head attention and position-wise
204 |     feed-forward networks. 
205 | """ 206 | for cmd in process_cmd: 207 | if cmd == "a": # add residual connection 208 | out = out + prev_out if prev_out else out 209 | elif cmd == "n": # add layer normalization 210 | out_dtype = out.dtype 211 | if out_dtype == fluid.core.VarDesc.VarType.FP16: 212 | out = layers.cast(x=out, dtype="float32") 213 | out = layers.layer_norm( 214 | out, 215 | begin_norm_axis=len(out.shape) - 1, 216 | param_attr=fluid.ParamAttr( 217 | name=name + '_layer_norm_scale', 218 | initializer=fluid.initializer.Constant(1.)), 219 | bias_attr=fluid.ParamAttr( 220 | name=name + '_layer_norm_bias', 221 | initializer=fluid.initializer.Constant(0.))) 222 | if out_dtype == fluid.core.VarDesc.VarType.FP16: 223 | out = layers.cast(x=out, dtype="float16") 224 | elif cmd == "d": # add dropout 225 | if dropout_rate: 226 | out = layers.dropout( 227 | out, 228 | dropout_prob=dropout_rate, 229 | dropout_implementation="upscale_in_train", 230 | is_test=False) 231 | return out 232 | 233 | 234 | pre_process_layer = partial(pre_post_process_layer, None) 235 | post_process_layer = pre_post_process_layer 236 | 237 | 238 | def encoder_layer(enc_input, 239 | attn_bias, 240 | n_head, 241 | d_key, 242 | d_value, 243 | d_model, 244 | d_inner_hid, 245 | prepostprocess_dropout, 246 | attention_dropout, 247 | relu_dropout, 248 | hidden_act, 249 | preprocess_cmd="n", 250 | postprocess_cmd="da", 251 | param_initializer=None, 252 | name=''): 253 | """The encoder layers that can be stacked to form a deep encoder. 254 | This module consits of a multi-head (self) attention followed by 255 | position-wise feed-forward networks and both the two components companied 256 | with the post_process_layer to add residual connection, layer normalization 257 | and droput. 258 | """ 259 | attn_output = multi_head_attention( 260 | pre_process_layer( 261 | enc_input, 262 | preprocess_cmd, 263 | prepostprocess_dropout, 264 | name=name + '_pre_att'), 265 | None, 266 | None, 267 | attn_bias, 268 | d_key, 269 | d_value, 270 | d_model, 271 | n_head, 272 | attention_dropout, 273 | param_initializer=param_initializer, 274 | name=name + '_multi_head_att') 275 | attn_output = post_process_layer( 276 | enc_input, 277 | attn_output, 278 | postprocess_cmd, 279 | prepostprocess_dropout, 280 | name=name + '_post_att') 281 | ffd_output = positionwise_feed_forward( 282 | pre_process_layer( 283 | attn_output, 284 | preprocess_cmd, 285 | prepostprocess_dropout, 286 | name=name + '_pre_ffn'), 287 | d_inner_hid, 288 | d_model, 289 | relu_dropout, 290 | hidden_act, 291 | param_initializer=param_initializer, 292 | name=name + '_ffn') 293 | return post_process_layer( 294 | attn_output, 295 | ffd_output, 296 | postprocess_cmd, 297 | prepostprocess_dropout, 298 | name=name + '_post_ffn') 299 | 300 | 301 | def encoder(enc_input, 302 | attn_bias, 303 | n_layer, 304 | n_head, 305 | d_key, 306 | d_value, 307 | d_model, 308 | d_inner_hid, 309 | prepostprocess_dropout, 310 | attention_dropout, 311 | relu_dropout, 312 | hidden_act, 313 | preprocess_cmd="n", 314 | postprocess_cmd="da", 315 | param_initializer=None, 316 | name=''): 317 | """ 318 | The encoder is composed of a stack of identical layers returned by calling 319 | encoder_layer. 
320 | """ 321 | for i in range(n_layer): 322 | enc_output = encoder_layer( 323 | enc_input, 324 | attn_bias, 325 | n_head, 326 | d_key, 327 | d_value, 328 | d_model, 329 | d_inner_hid, 330 | prepostprocess_dropout, 331 | attention_dropout, 332 | relu_dropout, 333 | hidden_act, 334 | preprocess_cmd, 335 | postprocess_cmd, 336 | param_initializer=param_initializer, 337 | name=name + '_layer_' + str(i)) 338 | enc_input = enc_output 339 | enc_output = pre_process_layer( 340 | enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") 341 | 342 | return enc_output 343 | -------------------------------------------------------------------------------- /single_domain/model/transformer_encoder.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/model/transformer_encoder.pyc -------------------------------------------------------------------------------- /single_domain/multi_iter.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export FLAGS_enable_parallel_graph=1 3 | export FLAGS_sync_nccl_allreduce=1 4 | export CUDA_VISIBLE_DEVICES=$1 5 | 6 | BERT_BASE_PATH=../uncased_L-12_H-768_A-12 7 | CHECKPOINT_PATH=./squad/checkpoints/ 8 | DATA_PATH=../data 9 | 10 | finetune() { 11 | python -u run_squad.py --use_cuda true\ 12 | --batch_size 12 \ 13 | --init_pretraining_params ${BERT_BASE_PATH}/params \ 14 | --in_tokens false\ 15 | --init_checkpoint ./squad/checkpoints/pretrain_best \ 16 | --checkpoints ${CHECKPOINT_PATH} \ 17 | --vocab_path ${BERT_BASE_PATH}/vocab.txt \ 18 | --do_train True \ 19 | --do_predict True \ 20 | --save_steps 4000 \ 21 | --warmup_proportion 0.1 \ 22 | --weight_decay 0.01 \ 23 | --epoch 2 \ 24 | --max_seq_len 512 \ 25 | --bert_config_path ${BERT_BASE_PATH}/bert_config.json \ 26 | --predict_file ${DATA_PATH}/$1_dev.raw.json \ 27 | --do_lower_case True \ 28 | --doc_stride 128 \ 29 | --train_file ${DATA_PATH}/$1.raw.json \ 30 | --learning_rate 1.5e-5\ 31 | --lr_scheduler linear_warmup_decay \ 32 | --skip_steps 10 \ 33 | --now_best_score $2 34 | } 35 | 36 | pretrain() { 37 | python -u run_squad.py --use_cuda true\ 38 | --batch_size 12 \ 39 | --init_pretraining_params ${BERT_BASE_PATH}/params \ 40 | --in_tokens false\ 41 | --checkpoints ${CHECKPOINT_PATH} \ 42 | --vocab_path ${BERT_BASE_PATH}/vocab.txt \ 43 | --do_train True \ 44 | --do_predict True \ 45 | --save_steps 500000 \ 46 | --warmup_proportion 0.1 \ 47 | --weight_decay 0.01 \ 48 | --epoch 1 \ 49 | --max_seq_len 512 \ 50 | --bert_config_path ${BERT_BASE_PATH}/bert_config.json \ 51 | --predict_file ${DATA_PATH}/All_domain_dev.raw.json \ 52 | --do_lower_case True \ 53 | --doc_stride 128 \ 54 | --train_file ${DATA_PATH}/All_domain.raw.json \ 55 | --learning_rate 1.5e-5\ 56 | --lr_scheduler linear_warmup_decay \ 57 | --skip_steps 10 \ 58 | --all_domain_and_weight $@ 59 | } 60 | 61 | gen_logits() { 62 | python -u gen_logits.py --use_cuda true\ 63 | --batch_size 40 \ 64 | --init_pretraining_params ${BERT_BASE_PATH}/params \ 65 | --in_tokens false\ 66 | --init_checkpoint ./squad/checkpoints/step_best \ 67 | --checkpoints ${CHECKPOINT_PATH} \ 68 | --vocab_path ${BERT_BASE_PATH}/vocab.txt \ 69 | --do_train False \ 70 | --do_predict True \ 71 | --save_steps 1000 \ 72 | --warmup_proportion 0.1 \ 73 | --weight_decay 0.01 \ 74 | --epoch 1 \ 75 | --max_seq_len 512 \ 76 | --bert_config_path ${BERT_BASE_PATH}/bert_config.json \ 77 | 
--predict_file ${DATA_PATH}/$1.raw.json \ 78 | --do_lower_case True \ 79 | --doc_stride 128 \ 80 | --train_file ${DATA_PATH}/$1.raw.json \ 81 | --learning_rate 1.5e-5\ 82 | --lr_scheduler linear_warmup_decay \ 83 | --skip_steps 10 84 | } 85 | 86 | compute_weight() { 87 | python -u run_squad.py --use_cuda true\ 88 | --batch_size 12 \ 89 | --init_pretraining_params ${BERT_BASE_PATH}/params \ 90 | --in_tokens false\ 91 | --init_checkpoint ./squad/checkpoints/step_best \ 92 | --checkpoints ${CHECKPOINT_PATH} \ 93 | --vocab_path ${BERT_BASE_PATH}/vocab.txt \ 94 | --do_train False \ 95 | --do_predict True \ 96 | --save_steps 50000 \ 97 | --warmup_proportion 0.1 \ 98 | --weight_decay 0.01 \ 99 | --epoch 2 \ 100 | --max_seq_len 512 \ 101 | --bert_config_path ${BERT_BASE_PATH}/bert_config.json \ 102 | --predict_file ${DATA_PATH}/$1_dev.4weight.json \ 103 | --do_lower_case True \ 104 | --doc_stride 128 \ 105 | --train_file ${DATA_PATH}/$1.raw.json \ 106 | --learning_rate 1.5e-5\ 107 | --lr_scheduler linear_warmup_decay \ 108 | --skip_steps 10 109 | } 110 | 111 | 112 | NOW_DOMAIN=$2 113 | NEED_ALIGN=$3 114 | shift 3 115 | ALL_DOMAIN=$@ 116 | BEST_SCORE=0 117 | 118 | echo ${NOW_DOMAIN} 119 | echo ${NEED_ALIGN} 120 | echo ${ALL_DOMAIN} 121 | 122 | if [[ ${NOW_DOMAIN} == SQuAD* ]]; then 123 | WEIGHTS='0 0.46 0.26 0.01 0.27' 124 | fi 125 | if [[ ${NOW_DOMAIN} == NewsQA* ]]; then 126 | WEIGHTS='0.5 0 0.24 0.01 0.25' 127 | fi 128 | if [[ ${NOW_DOMAIN} == HotpotQA* ]]; then 129 | WEIGHTS='0.34 0.29 0 0.03 0.35' 130 | fi 131 | if [[ ${NOW_DOMAIN} == NaturalQuestions* ]]; then 132 | WEIGHTS='0.22 0.12 0.35 0 0.31' 133 | fi 134 | if [[ ${NOW_DOMAIN} == TriviaQA* ]]; then 135 | WEIGHTS='0.34 0.30 0.34 0.02 0' 136 | fi 137 | 138 | for idx in $(seq 1 10) 139 | do 140 | 141 | echo Now in iteration ${idx} 142 | echo gen_logits $NOW_DOMAIN 143 | gen_logits $NOW_DOMAIN 144 | echo gen_finished! 145 | mkdir gen_finished${idx} 146 | for dm in $ALL_DOMAIN 147 | do 148 | while [ ! -d "../$dm/gen_finished$idx" ] 149 | do 150 | sleep 30 151 | done 152 | done 153 | if [ $NEED_ALIGN -eq 0 ]; then 154 | cd .. 155 | echo python logits_example_aligner.py --domains $ALL_DOMAIN 156 | python logits_example_aligner.py --domains $ALL_DOMAIN 157 | mkdir align_finish_$idx 158 | cd - 159 | fi 160 | while [ ! -d "../align_finish_$idx" ] 161 | do 162 | sleep 30 163 | done 164 | echo pretrain $ALL_DOMAIN $WEIGHTS 165 | pretrain $ALL_DOMAIN $WEIGHTS > log/pretrain_${idx}.log 166 | echo finetune 167 | finetune $NOW_DOMAIN $BEST_SCORE > log/finetune_${idx}.log 168 | SCORE_NOW=`tail -n 1 log/finetune_${idx}.log` 169 | if [ `echo "${SCORE_NOW} > ${BEST_SCORE}" | bc` -eq 1 ]; then 170 | BEST_SCORE=${SCORE_NOW} 171 | fi 172 | echo $NOW_DOMAIN best score $BEST_SCORE 173 | done 174 | mkdir train_finished 175 | -------------------------------------------------------------------------------- /single_domain/optimization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Optimization and learning rate scheduling."""
15 | 
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 | 
20 | import numpy as np
21 | import paddle.fluid as fluid
22 | from utils.fp16 import create_master_params_grads, master_param_to_train_param
23 | 
24 | 
25 | def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
26 |     """ Applies linear warmup of learning rate from 0, then linear decay to 0."""
27 |     with fluid.default_main_program()._lr_schedule_guard():
28 |         lr = fluid.layers.tensor.create_global_var(
29 |             shape=[1],
30 |             value=0.0,
31 |             dtype='float32',
32 |             persistable=True,
33 |             name="scheduled_learning_rate")
34 | 
35 |         global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
36 | 
37 |         with fluid.layers.control_flow.Switch() as switch:
38 |             with switch.case(global_step < warmup_steps):
39 |                 warmup_lr = learning_rate * (global_step / warmup_steps)
40 |                 fluid.layers.tensor.assign(warmup_lr, lr)
41 |             with switch.default():
42 |                 decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
43 |                     learning_rate=learning_rate,
44 |                     decay_steps=num_train_steps,
45 |                     end_learning_rate=0,
46 |                     power=1.0,
47 |                     cycle=False)
48 |                 fluid.layers.tensor.assign(decayed_lr, lr)
49 | 
50 |         return lr
51 | 
52 | 
53 | def optimization(loss,
54 |                  warmup_steps,
55 |                  num_train_steps,
56 |                  learning_rate,
57 |                  train_program,
58 |                  startup_prog,
59 |                  weight_decay,
60 |                  scheduler='linear_warmup_decay',
61 |                  use_fp16=False,
62 |                  loss_scaling=1.0):
63 |     if warmup_steps > 0:
64 |         if scheduler == 'noam_decay':
65 |             scheduled_lr = fluid.layers.learning_rate_scheduler\
66 |                 .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
67 |                             warmup_steps)
68 |         elif scheduler == 'linear_warmup_decay':
69 |             scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
70 |                                                num_train_steps)
71 |         else:
72 |             raise ValueError("Unknown learning rate scheduler, should be "
73 |                              "'noam_decay' or 'linear_warmup_decay'")
74 |         optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
75 |     else:
76 |         optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
77 |         scheduled_lr = learning_rate
78 | 
79 |     clip_norm_thres = 1.0
80 |     # When using mixed precision training, scale the gradient clip threshold
81 |     # by loss_scaling
82 |     if use_fp16 and loss_scaling > 1.0:
83 |         clip_norm_thres *= loss_scaling
84 |     fluid.clip.set_gradient_clip(
85 |         clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
86 | 
87 |     def exclude_from_weight_decay(name):
88 |         if name.find("layer_norm") > -1:
89 |             return True
90 |         bias_suffix = ["_bias", "_b", ".b_0"]
91 |         for suffix in bias_suffix:
92 |             if name.endswith(suffix):
93 |                 return True
94 |         return False
95 | 
96 |     param_list = dict()
97 | 
98 |     if use_fp16:
99 |         param_grads = optimizer.backward(loss)
100 |         master_param_grads = create_master_params_grads(
101 |             param_grads, train_program, startup_prog, loss_scaling)
102 | 
103 |         for param, _ in master_param_grads:
104 |             param_list[param.name] = param * 1.0
105 |             param_list[param.name].stop_gradient = True
106 | 
107 |         optimizer.apply_gradients(master_param_grads)
108 | 
109 |         if weight_decay > 0:
110 |             for param, grad in master_param_grads:
111 |                 if exclude_from_weight_decay(param.name.rstrip(".master")):
112 |                     continue
113 |                 with param.block.program._optimized_guard(
114 |                     [param, grad]), fluid.framework.name_scope("weight_decay"): 
115 | updated_param = param - param_list[ 116 | param.name] * weight_decay * scheduled_lr 117 | fluid.layers.assign(output=param, input=updated_param) 118 | 119 | master_param_to_train_param(master_param_grads, param_grads, 120 | train_program) 121 | 122 | else: 123 | for param in train_program.global_block().all_parameters(): 124 | param_list[param.name] = param * 1.0 125 | param_list[param.name].stop_gradient = True 126 | 127 | _, param_grads = optimizer.minimize(loss) 128 | 129 | if weight_decay > 0: 130 | for param, grad in param_grads: 131 | if exclude_from_weight_decay(param.name): 132 | continue 133 | with param.block.program._optimized_guard( 134 | [param, grad]), fluid.framework.name_scope("weight_decay"): 135 | updated_param = param - param_list[ 136 | param.name] * weight_decay * scheduled_lr 137 | fluid.layers.assign(output=param, input=updated_param) 138 | 139 | return scheduled_lr 140 | -------------------------------------------------------------------------------- /single_domain/optimization.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/optimization.pyc -------------------------------------------------------------------------------- /single_domain/predict_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Load classifier's checkpoint to do prediction or save inference model.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import os 21 | import time 22 | import argparse 23 | import numpy as np 24 | import multiprocessing 25 | import paddle.fluid as fluid 26 | 27 | import reader.cls as reader 28 | from model.bert import BertConfig 29 | from model.classifier import create_model 30 | 31 | from utils.args import ArgumentGroup, print_arguments 32 | from utils.init import init_pretraining_params 33 | 34 | # yapf: disable 35 | parser = argparse.ArgumentParser(__doc__) 36 | model_g = ArgumentGroup(parser, "model", "options to init, resume and save model.") 37 | model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.") 38 | model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") 39 | model_g.add_arg("save_inference_model_path", str, None, "If set, save the inference model to this path.") 40 | model_g.add_arg("use_fp16", bool, False, "Whether to resume parameters from fp16 checkpoint.") 41 | 42 | data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.") 43 | data_g.add_arg("data_dir", str, None, "Directory to test data.") 44 | data_g.add_arg("vocab_path", str, None, "Vocabulary path.") 45 | data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest seqence.") 46 | data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.") 47 | data_g.add_arg("in_tokens", bool, False, 48 | "If set, the batch size will be the maximum number of tokens in one batch. " 49 | "Otherwise, it will be the maximum number of examples in one batch.") 50 | data_g.add_arg("do_lower_case", bool, True, 51 | "Whether to lower case the input text. Should be True for uncased models and False for cased models.") 52 | 53 | run_type_g = ArgumentGroup(parser, "run_type", "running type options.") 54 | run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") 55 | run_type_g.add_arg("task_name", str, None, 56 | "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.") 57 | run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.") 58 | 59 | args = parser.parse_args() 60 | # yapf: enable. 
61 | 
62 | def main(args):
63 |     bert_config = BertConfig(args.bert_config_path)
64 |     bert_config.print_config()
65 | 
66 |     task_name = args.task_name.lower()
67 |     processors = {
68 |         'xnli': reader.XnliProcessor,
69 |         'cola': reader.ColaProcessor,
70 |         'mrpc': reader.MrpcProcessor,
71 |         'mnli': reader.MnliProcessor,
72 |     }
73 | 
74 |     processor = processors[task_name](data_dir=args.data_dir,
75 |                                       vocab_path=args.vocab_path,
76 |                                       max_seq_len=args.max_seq_len,
77 |                                       do_lower_case=args.do_lower_case,
78 |                                       in_tokens=False)
79 |     num_labels = len(processor.get_labels())
80 | 
81 |     predict_prog = fluid.Program()
82 |     predict_startup = fluid.Program()
83 |     with fluid.program_guard(predict_prog, predict_startup):
84 |         with fluid.unique_name.guard():
85 |             predict_pyreader, probs, feed_target_names = create_model(
86 |                 args,
87 |                 pyreader_name='predict_reader',
88 |                 bert_config=bert_config,
89 |                 num_labels=num_labels,
90 |                 is_prediction=True)
91 | 
92 |     predict_prog = predict_prog.clone(for_test=True)
93 | 
94 |     if args.use_cuda:
95 |         place = fluid.CUDAPlace(0)
96 |         dev_count = fluid.core.get_cuda_device_count()
97 |     else:
98 |         place = fluid.CPUPlace()
99 |         dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
100 | 
101 |     place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
102 |     exe = fluid.Executor(place)
103 |     exe.run(predict_startup)
104 | 
105 |     if args.init_checkpoint:
106 |         init_pretraining_params(exe, args.init_checkpoint, predict_prog)
107 |     else:
108 |         raise ValueError("args 'init_checkpoint' should be set for prediction!")
109 | 
110 |     # By design, ParallelExecutor may drop small batches (usually the last
111 |     # one), so it can leave some examples unpredicted. If every example must
112 |     # be predicted, use Executor instead (see the sketch after this file).
113 |     predict_exe = fluid.ParallelExecutor(
114 |         use_cuda=args.use_cuda, main_program=predict_prog)
115 | 
116 |     predict_pyreader.decorate_tensor_provider(
117 |         processor.data_generator(
118 |             batch_size=args.batch_size, phase='test', epoch=1, shuffle=False))
119 | 
120 |     predict_pyreader.start()
121 |     all_results = []
122 |     time_begin = time.time()
123 |     while True:
124 |         try:
125 |             results = predict_exe.run(fetch_list=[probs.name])
126 |             all_results.extend(results[0])
127 |         except fluid.core.EOFException:
128 |             predict_pyreader.reset()
129 |             break
130 |     time_end = time.time()
131 | 
132 |     np.set_printoptions(precision=4, suppress=True)
133 |     print("-------------- prediction results --------------")
134 |     print("example_id\t" + ' '.join(processor.get_labels()))
135 |     for index, result in enumerate(all_results):
136 |         print(str(index) + '\t{}'.format(result))
137 | 
138 |     if args.save_inference_model_path:
139 |         _, ckpt_dir = os.path.split(args.init_checkpoint.rstrip('/'))
140 |         dir_name = ckpt_dir + '_inference_model'
141 |         model_path = os.path.join(args.save_inference_model_path, dir_name)
142 |         print("save inference model to %s" % model_path)
143 |         fluid.io.save_inference_model(
144 |             model_path,
145 |             feed_target_names, [probs],
146 |             exe,
147 |             main_program=predict_prog)
148 | 
149 | 
150 | if __name__ == '__main__':
151 |     print_arguments(args)
152 |     main(args)
153 | 
--------------------------------------------------------------------------------
/single_domain/reader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/reader/__init__.py 
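Editor's note: referring back to the ParallelExecutor caveat in predict_classifier.py above, this is a minimal sketch (not part of the original repository) of a plain-Executor prediction loop that also scores the final small batch. It reuses the place, predict_prog, probs and predict_pyreader names from that file.
```
# Sketch only: assumes the same predict_prog/probs/predict_pyreader as above.
exe = fluid.Executor(place)

predict_pyreader.start()
all_results = []
while True:
    try:
        # Executor runs the program batch by batch and never drops a batch.
        results = exe.run(program=predict_prog, fetch_list=[probs.name])
        all_results.extend(results[0])
    except fluid.core.EOFException:
        predict_pyreader.reset()
        break
```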
-------------------------------------------------------------------------------- /single_domain/reader/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/reader/__init__.pyc -------------------------------------------------------------------------------- /single_domain/reader/cls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import types 17 | import csv 18 | import numpy as np 19 | import tokenization 20 | from batching import prepare_batch_data 21 | 22 | 23 | class DataProcessor(object): 24 | """Base class for data converters for sequence classification data sets.""" 25 | 26 | def __init__(self, 27 | data_dir, 28 | vocab_path, 29 | max_seq_len, 30 | do_lower_case, 31 | in_tokens, 32 | random_seed=None): 33 | self.data_dir = data_dir 34 | self.max_seq_len = max_seq_len 35 | self.tokenizer = tokenization.FullTokenizer( 36 | vocab_file=vocab_path, do_lower_case=do_lower_case) 37 | self.vocab = self.tokenizer.vocab 38 | self.in_tokens = in_tokens 39 | 40 | np.random.seed(random_seed) 41 | 42 | self.current_train_example = -1 43 | self.num_examples = {'train': -1, 'dev': -1, 'test': -1} 44 | self.current_train_epoch = -1 45 | 46 | def get_train_examples(self, data_dir): 47 | """Gets a collection of `InputExample`s for the train set.""" 48 | raise NotImplementedError() 49 | 50 | def get_dev_examples(self, data_dir): 51 | """Gets a collection of `InputExample`s for the dev set.""" 52 | raise NotImplementedError() 53 | 54 | def get_test_examples(self, data_dir): 55 | """Gets a collection of `InputExample`s for prediction.""" 56 | raise NotImplementedError() 57 | 58 | def get_labels(self): 59 | """Gets the list of labels for this data set.""" 60 | raise NotImplementedError() 61 | 62 | def convert_example(self, index, example, labels, max_seq_len, tokenizer): 63 | """Converts a single `InputExample` into a single `InputFeatures`.""" 64 | feature = convert_single_example(index, example, labels, max_seq_len, 65 | tokenizer) 66 | return feature 67 | 68 | def generate_instance(self, feature): 69 | """ 70 | generate instance with given feature 71 | 72 | Args: 73 | feature: InputFeatures(object). A single set of features of data. 
74 | """ 75 | input_pos = list(range(len(feature.input_ids))) 76 | return [ 77 | feature.input_ids, feature.segment_ids, input_pos, feature.label_id 78 | ] 79 | 80 | def generate_batch_data(self, 81 | batch_data, 82 | total_token_num, 83 | voc_size=-1, 84 | mask_id=-1, 85 | return_input_mask=True, 86 | return_max_len=False, 87 | return_num_token=False): 88 | return prepare_batch_data( 89 | batch_data, 90 | total_token_num, 91 | voc_size=-1, 92 | pad_id=self.vocab["[PAD]"], 93 | cls_id=self.vocab["[CLS]"], 94 | sep_id=self.vocab["[SEP]"], 95 | mask_id=-1, 96 | return_input_mask=True, 97 | return_max_len=False, 98 | return_num_token=False) 99 | 100 | @classmethod 101 | def _read_tsv(cls, input_file, quotechar=None): 102 | """Reads a tab separated value file.""" 103 | with open(input_file, "r") as f: 104 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 105 | lines = [] 106 | for line in reader: 107 | lines.append(line) 108 | return lines 109 | 110 | def get_num_examples(self, phase): 111 | """Get number of examples for train, dev or test.""" 112 | if phase not in ['train', 'dev', 'test']: 113 | raise ValueError( 114 | "Unknown phase, which should be in ['train', 'dev', 'test'].") 115 | return self.num_examples[phase] 116 | 117 | def get_train_progress(self): 118 | """Gets progress for training phase.""" 119 | return self.current_train_example, self.current_train_epoch 120 | 121 | def data_generator(self, 122 | batch_size, 123 | phase='train', 124 | epoch=1, 125 | dev_count=1, 126 | shuffle=True): 127 | """ 128 | Generate data for train, dev or test. 129 | 130 | Args: 131 | batch_size: int. The batch size of generated data. 132 | phase: string. The phase for which to generate data. 133 | epoch: int. Total epoches to generate data. 134 | shuffle: bool. Whether to shuffle examples. 
135 | """ 136 | if phase == 'train': 137 | examples = self.get_train_examples(self.data_dir) 138 | self.num_examples['train'] = len(examples) 139 | elif phase == 'dev': 140 | examples = self.get_dev_examples(self.data_dir) 141 | self.num_examples['dev'] = len(examples) 142 | elif phase == 'test': 143 | examples = self.get_test_examples(self.data_dir) 144 | self.num_examples['test'] = len(examples) 145 | else: 146 | raise ValueError( 147 | "Unknown phase, which should be in ['train', 'dev', 'test'].") 148 | 149 | def instance_reader(): 150 | for epoch_index in range(epoch): 151 | if shuffle: 152 | np.random.shuffle(examples) 153 | if phase == 'train': 154 | self.current_train_epoch = epoch_index 155 | for (index, example) in enumerate(examples): 156 | if phase == 'train': 157 | self.current_train_example = index + 1 158 | feature = self.convert_example( 159 | index, example, 160 | self.get_labels(), self.max_seq_len, self.tokenizer) 161 | 162 | instance = self.generate_instance(feature) 163 | yield instance 164 | 165 | def batch_reader(reader, batch_size, in_tokens): 166 | batch, total_token_num, max_len = [], 0, 0 167 | for instance in reader(): 168 | token_ids, sent_ids, pos_ids, label = instance[:4] 169 | max_len = max(max_len, len(token_ids)) 170 | if in_tokens: 171 | to_append = (len(batch) + 1) * max_len <= batch_size 172 | else: 173 | to_append = len(batch) < batch_size 174 | if to_append: 175 | batch.append(instance) 176 | total_token_num += len(token_ids) 177 | else: 178 | yield batch, total_token_num 179 | batch, total_token_num, max_len = [instance], len( 180 | token_ids), len(token_ids) 181 | 182 | if len(batch) > 0: 183 | yield batch, total_token_num 184 | 185 | def wrapper(): 186 | all_dev_batches = [] 187 | for batch_data, total_token_num in batch_reader( 188 | instance_reader, batch_size, self.in_tokens): 189 | batch_data = self.generate_batch_data( 190 | batch_data, 191 | total_token_num, 192 | voc_size=-1, 193 | mask_id=-1, 194 | return_input_mask=True, 195 | return_max_len=False, 196 | return_num_token=False) 197 | if len(all_dev_batches) < dev_count: 198 | all_dev_batches.append(batch_data) 199 | 200 | if len(all_dev_batches) == dev_count: 201 | for batch in all_dev_batches: 202 | yield batch 203 | all_dev_batches = [] 204 | 205 | return wrapper 206 | 207 | 208 | class InputExample(object): 209 | """A single training/test example for simple sequence classification.""" 210 | 211 | def __init__(self, guid, text_a, text_b=None, label=None): 212 | """Constructs a InputExample. 213 | 214 | Args: 215 | guid: Unique id for the example. 216 | text_a: string. The untokenized text of the first sequence. For single 217 | sequence tasks, only this sequence must be specified. 218 | text_b: (Optional) string. The untokenized text of the second sequence. 219 | Only must be specified for sequence pair tasks. 220 | label: (Optional) string. The label of the example. This should be 221 | specified for train and dev examples, but not for test examples. 222 | """ 223 | self.guid = guid 224 | self.text_a = text_a 225 | self.text_b = text_b 226 | self.label = label 227 | 228 | 229 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 230 | """Truncates a sequence pair in place to the maximum length.""" 231 | 232 | # This is a simple heuristic which will always truncate the longer sequence 233 | # one token at a time. 
This makes more sense than truncating an equal percent 234 | # of tokens from each, since if one sequence is very short then each token 235 | # that's truncated likely contains more information than a longer sequence. 236 | while True: 237 | total_length = len(tokens_a) + len(tokens_b) 238 | if total_length <= max_length: 239 | break 240 | if len(tokens_a) > len(tokens_b): 241 | tokens_a.pop() 242 | else: 243 | tokens_b.pop() 244 | 245 | 246 | class InputFeatures(object): 247 | """A single set of features of data.""" 248 | 249 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 250 | self.input_ids = input_ids 251 | self.input_mask = input_mask 252 | self.segment_ids = segment_ids 253 | self.label_id = label_id 254 | 255 | 256 | class XnliProcessor(DataProcessor): 257 | """Processor for the XNLI data set.""" 258 | 259 | def get_train_examples(self, data_dir): 260 | """See base class.""" 261 | self.language = "zh" 262 | lines = self._read_tsv( 263 | os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % 264 | self.language)) 265 | examples = [] 266 | for (i, line) in enumerate(lines): 267 | if i == 0: 268 | continue 269 | guid = "train-%d" % (i) 270 | text_a = tokenization.convert_to_unicode(line[0]) 271 | text_b = tokenization.convert_to_unicode(line[1]) 272 | label = tokenization.convert_to_unicode(line[2]) 273 | if label == tokenization.convert_to_unicode("contradictory"): 274 | label = tokenization.convert_to_unicode("contradiction") 275 | examples.append( 276 | InputExample( 277 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 278 | return examples 279 | 280 | def get_dev_examples(self, data_dir): 281 | """See base class.""" 282 | self.language = "zh" 283 | lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) 284 | examples = [] 285 | for (i, line) in enumerate(lines): 286 | if i == 0: 287 | continue 288 | guid = "dev-%d" % (i) 289 | language = tokenization.convert_to_unicode(line[0]) 290 | if language != tokenization.convert_to_unicode(self.language): 291 | continue 292 | text_a = tokenization.convert_to_unicode(line[6]) 293 | text_b = tokenization.convert_to_unicode(line[7]) 294 | label = tokenization.convert_to_unicode(line[1]) 295 | examples.append( 296 | InputExample( 297 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 298 | return examples 299 | 300 | def get_test_examples(self, data_dir): 301 | """See base class.""" 302 | self.language = "zh" 303 | lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv")) 304 | examples = [] 305 | for (i, line) in enumerate(lines): 306 | if i == 0: 307 | continue 308 | guid = "test-%d" % (i) 309 | language = tokenization.convert_to_unicode(line[0]) 310 | if language != tokenization.convert_to_unicode(self.language): 311 | continue 312 | text_a = tokenization.convert_to_unicode(line[6]) 313 | text_b = tokenization.convert_to_unicode(line[7]) 314 | label = tokenization.convert_to_unicode(line[1]) 315 | examples.append( 316 | InputExample( 317 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 318 | return examples 319 | 320 | def get_labels(self): 321 | """See base class.""" 322 | return ["contradiction", "entailment", "neutral"] 323 | 324 | 325 | class MnliProcessor(DataProcessor): 326 | """Processor for the MultiNLI data set (GLUE version).""" 327 | 328 | def get_train_examples(self, data_dir): 329 | """See base class.""" 330 | return self._create_examples( 331 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 332 | 333 | def get_dev_examples(self, data_dir): 334 | """See 
base class.""" 335 | return self._create_examples( 336 | self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), 337 | "dev_matched") 338 | 339 | def get_test_examples(self, data_dir): 340 | """See base class.""" 341 | return self._create_examples( 342 | self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") 343 | 344 | def get_labels(self): 345 | """See base class.""" 346 | return ["contradiction", "entailment", "neutral"] 347 | 348 | def _create_examples(self, lines, set_type): 349 | """Creates examples for the training and dev sets.""" 350 | examples = [] 351 | for (i, line) in enumerate(lines): 352 | if i == 0: 353 | continue 354 | guid = "%s-%s" % (set_type, 355 | tokenization.convert_to_unicode(line[0])) 356 | text_a = tokenization.convert_to_unicode(line[8]) 357 | text_b = tokenization.convert_to_unicode(line[9]) 358 | if set_type == "test": 359 | label = "contradiction" 360 | else: 361 | label = tokenization.convert_to_unicode(line[-1]) 362 | examples.append( 363 | InputExample( 364 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 365 | return examples 366 | 367 | 368 | class MrpcProcessor(DataProcessor): 369 | """Processor for the MRPC data set (GLUE version).""" 370 | 371 | def get_train_examples(self, data_dir): 372 | """See base class.""" 373 | return self._create_examples( 374 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 375 | 376 | def get_dev_examples(self, data_dir): 377 | """See base class.""" 378 | return self._create_examples( 379 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 380 | 381 | def get_test_examples(self, data_dir): 382 | """See base class.""" 383 | return self._create_examples( 384 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 385 | 386 | def get_labels(self): 387 | """See base class.""" 388 | return ["0", "1"] 389 | 390 | def _create_examples(self, lines, set_type): 391 | """Creates examples for the training and dev sets.""" 392 | examples = [] 393 | for (i, line) in enumerate(lines): 394 | if i == 0: 395 | continue 396 | guid = "%s-%s" % (set_type, i) 397 | text_a = tokenization.convert_to_unicode(line[3]) 398 | text_b = tokenization.convert_to_unicode(line[4]) 399 | if set_type == "test": 400 | label = "0" 401 | else: 402 | label = tokenization.convert_to_unicode(line[0]) 403 | examples.append( 404 | InputExample( 405 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 406 | return examples 407 | 408 | 409 | class ColaProcessor(DataProcessor): 410 | """Processor for the CoLA data set (GLUE version).""" 411 | 412 | def get_train_examples(self, data_dir): 413 | """See base class.""" 414 | return self._create_examples( 415 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 416 | 417 | def get_dev_examples(self, data_dir): 418 | """See base class.""" 419 | return self._create_examples( 420 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 421 | 422 | def get_test_examples(self, data_dir): 423 | """See base class.""" 424 | return self._create_examples( 425 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 426 | 427 | def get_labels(self): 428 | """See base class.""" 429 | return ["0", "1"] 430 | 431 | def _create_examples(self, lines, set_type): 432 | """Creates examples for the training and dev sets.""" 433 | examples = [] 434 | for (i, line) in enumerate(lines): 435 | # Only the test set has a header 436 | if set_type == "test" and i == 0: 437 | continue 438 | guid = "%s-%s" % (set_type, i) 439 | if set_type == "test": 440 | text_a = 
tokenization.convert_to_unicode(line[1])
441 | label = "0"
442 | else:
443 | text_a = tokenization.convert_to_unicode(line[3])
444 | label = tokenization.convert_to_unicode(line[1])
445 | examples.append(
446 | InputExample(
447 | guid=guid, text_a=text_a, text_b=None, label=label))
448 | return examples
449 | 
450 | 
451 | def convert_single_example_to_unicode(guid, single_example):
452 | text_a = tokenization.convert_to_unicode(single_example[0])
453 | text_b = tokenization.convert_to_unicode(single_example[1])
454 | label = tokenization.convert_to_unicode(single_example[2])
455 | return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
456 | 
457 | 
458 | def convert_single_example(ex_index, example, label_list, max_seq_length,
459 | tokenizer):
460 | """Converts a single `InputExample` into a single `InputFeatures`."""
461 | label_map = {}
462 | for (i, label) in enumerate(label_list):
463 | label_map[label] = i
464 | 
465 | tokens_a = tokenizer.tokenize(example.text_a)
466 | tokens_b = None
467 | if example.text_b:
468 | tokens_b = tokenizer.tokenize(example.text_b)
469 | 
470 | if tokens_b:
471 | # Modifies `tokens_a` and `tokens_b` in place so that the total
472 | # length is less than the specified length.
473 | # Account for [CLS], [SEP], [SEP] with "- 3"
474 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
475 | else:
476 | # Account for [CLS] and [SEP] with "- 2"
477 | if len(tokens_a) > max_seq_length - 2:
478 | tokens_a = tokens_a[0:(max_seq_length - 2)]
479 | 
480 | # The convention in BERT is:
481 | # (a) For sequence pairs:
482 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
483 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
484 | # (b) For single sequences:
485 | # tokens: [CLS] the dog is hairy . [SEP]
486 | # type_ids: 0 0 0 0 0 0 0
487 | #
488 | # Where "type_ids" are used to indicate whether this is the first
489 | # sequence or the second sequence. The embedding vectors for `type=0` and
490 | # `type=1` were learned during pre-training and are added to the wordpiece
491 | # embedding vector (and position vector). This is not *strictly* necessary
492 | # since the [SEP] token unambiguously separates the sequences, but it makes
493 | # it easier for the model to learn the concept of sequences.
494 | #
495 | # For classification tasks, the first vector (corresponding to [CLS]) is
496 | # used as the "sentence vector". Note that this only makes sense because
497 | # the entire model is fine-tuned.
498 | tokens = []
499 | segment_ids = []
500 | tokens.append("[CLS]")
501 | segment_ids.append(0)
502 | for token in tokens_a:
503 | tokens.append(token)
504 | segment_ids.append(0)
505 | tokens.append("[SEP]")
506 | segment_ids.append(0)
507 | 
508 | if tokens_b:
509 | for token in tokens_b:
510 | tokens.append(token)
511 | segment_ids.append(1)
512 | tokens.append("[SEP]")
513 | segment_ids.append(1)
514 | 
515 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
516 | 
517 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
518 | # tokens are attended to.
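# Zeros appear in the mask only after padding, which happens downstream when
# batches are assembled (e.g. via the padding helpers in batching.py); at
# this point a 3-token example simply gets input_mask [1, 1, 1], and padding
# it to a batch length of 6 later would yield [1, 1, 1, 0, 0, 0].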
519 | input_mask = [1] * len(input_ids) 520 | 521 | label_id = label_map[example.label] 522 | 523 | feature = InputFeatures( 524 | input_ids=input_ids, 525 | input_mask=input_mask, 526 | segment_ids=segment_ids, 527 | label_id=label_id) 528 | return feature 529 | 530 | 531 | def convert_examples_to_features(examples, label_list, max_seq_length, 532 | tokenizer): 533 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 534 | 535 | features = [] 536 | for (ex_index, example) in enumerate(examples): 537 | if ex_index % 10000 == 0: 538 | print("Writing example %d of %d" % (ex_index, len(examples))) 539 | 540 | feature = convert_single_example(ex_index, example, label_list, 541 | max_seq_length, tokenizer) 542 | 543 | features.append(feature) 544 | return features 545 | 546 | 547 | if __name__ == '__main__': 548 | pass 549 | -------------------------------------------------------------------------------- /single_domain/reader/gen_logits_squad.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/reader/gen_logits_squad.pyc -------------------------------------------------------------------------------- /single_domain/reader/old_squad.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/reader/old_squad.pyc -------------------------------------------------------------------------------- /single_domain/reader/pretraining.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 
15 | from __future__ import print_function
16 | from __future__ import division
17 | 
18 | import os
19 | import numpy as np
20 | import types
21 | import gzip
22 | import logging
23 | import re
24 | import six
25 | import collections
26 | import tokenization
27 | 
28 | import paddle
29 | import paddle.fluid as fluid
30 | 
31 | from batching import prepare_batch_data
32 | 
33 | 
34 | class DataReader(object):
35 | def __init__(self,
36 | data_dir,
37 | vocab_path,
38 | batch_size=4096,
39 | in_tokens=True,
40 | max_seq_len=512,
41 | shuffle_files=True,
42 | epoch=100,
43 | voc_size=0,
44 | is_test=False,
45 | generate_neg_sample=False):
46 | 
47 | self.vocab = self.load_vocab(vocab_path)
48 | self.data_dir = data_dir
49 | self.batch_size = batch_size
50 | self.in_tokens = in_tokens
51 | self.shuffle_files = shuffle_files
52 | self.epoch = epoch
53 | self.current_epoch = 0
54 | self.current_file_index = 0
55 | self.total_file = 0
56 | self.current_file = None
57 | self.voc_size = voc_size
58 | self.max_seq_len = max_seq_len
59 | self.pad_id = self.vocab["[PAD]"]
60 | self.cls_id = self.vocab["[CLS]"]
61 | self.sep_id = self.vocab["[SEP]"]
62 | self.mask_id = self.vocab["[MASK]"]
63 | self.is_test = is_test
64 | self.generate_neg_sample = generate_neg_sample
65 | if self.in_tokens:
66 | assert self.batch_size >= self.max_seq_len, "The number of " \
67 | "tokens in batch should not be smaller than max seq length."
68 | 
69 | if self.is_test:
70 | self.epoch = 1
71 | self.shuffle_files = False
72 | 
73 | def get_progress(self):
74 | """return current progress of training data
75 | """
76 | return self.current_epoch, self.current_file_index, self.total_file, self.current_file
77 | 
78 | def parse_line(self, line, max_seq_len=512):
79 | """ parse one line to token_ids, sentence_ids, pos_ids, label
80 | """
81 | line = line.strip().decode().split(";")
82 | assert len(line) == 4, "One sample must have 4 fields!"
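# A sketch of the assumed on-disk format: four ";"-separated fields, the
# first three being space-separated integers (the ids below are made up for
# illustration), the last the next-sentence label, e.g.
#   "101 2054 102;0 0 0;0 1 2;1"
# -> token_ids=[101, 2054, 102], sent_ids=[0, 0, 0], pos_ids=[0, 1, 2], label=1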
83 | (token_ids, sent_ids, pos_ids, label) = line
84 | token_ids = [int(token) for token in token_ids.split(" ")]
85 | sent_ids = [int(token) for token in sent_ids.split(" ")]
86 | pos_ids = [int(token) for token in pos_ids.split(" ")]
87 | assert len(token_ids) == len(sent_ids) == len(
88 | pos_ids
89 | ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
90 | label = int(label)
91 | if len(token_ids) > max_seq_len:
92 | return None
93 | return [token_ids, sent_ids, pos_ids, label]
94 | 
95 | def read_file(self, file):
96 | assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
97 | file_path = self.data_dir + "/" + file
98 | with gzip.open(file_path, "rb") as f:
99 | for line in f:
100 | parsed_line = self.parse_line(
101 | line, max_seq_len=self.max_seq_len)
102 | if parsed_line is None:
103 | continue
104 | yield parsed_line
105 | 
106 | def convert_to_unicode(self, text):
107 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
108 | if six.PY3:
109 | if isinstance(text, str):
110 | return text
111 | elif isinstance(text, bytes):
112 | return text.decode("utf-8", "ignore")
113 | else:
114 | raise ValueError("Unsupported string type: %s" % (type(text)))
115 | elif six.PY2:
116 | if isinstance(text, str):
117 | return text.decode("utf-8", "ignore")
118 | elif isinstance(text, unicode):
119 | return text
120 | else:
121 | raise ValueError("Unsupported string type: %s" % (type(text)))
122 | else:
123 | raise ValueError("Not running on Python2 or Python 3?")
124 | 
125 | def load_vocab(self, vocab_file):
126 | """Loads a vocabulary file into a dictionary."""
127 | vocab = collections.OrderedDict()
128 | fin = open(vocab_file)
129 | for num, line in enumerate(fin):
130 | items = self.convert_to_unicode(line.strip()).split("\t")
131 | if len(items) > 2:
132 | break
133 | token = items[0]
134 | index = items[1] if len(items) == 2 else num
135 | token = token.strip()
136 | vocab[token] = int(index)
137 | return vocab
138 | 
139 | def random_pair_neg_samples(self, pos_samples):
140 | """ randomly generate negative samples using pos_samples
141 | 
142 | Args:
143 | pos_samples: list of positive samples
144 | 
145 | Returns:
146 | neg_samples: list of negative samples
147 | """
148 | np.random.shuffle(pos_samples)
149 | num_sample = len(pos_samples)
150 | neg_samples = []
151 | miss_num = 0
152 | 
153 | for i in range(num_sample):
154 | pair_index = (i + 1) % num_sample
155 | origin_src_ids = pos_samples[i][0]
156 | origin_sep_index = origin_src_ids.index(2)
157 | pair_src_ids = pos_samples[pair_index][0]
158 | pair_sep_index = pair_src_ids.index(2)
159 | 
160 | src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
161 | pair_sep_index + 1:]
162 | if len(src_ids) >= self.max_seq_len:
163 | miss_num += 1
164 | continue
165 | sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
166 | 1
167 | ] * len(pair_src_ids[pair_sep_index + 1:])
168 | pos_ids = list(range(len(src_ids)))
169 | neg_sample = [src_ids, sent_ids, pos_ids, 0]
170 | assert len(src_ids) == len(sent_ids) == len(
171 | pos_ids
172 | ), "[ERROR]len(src_id) == len(sent_id) == len(pos_id) must be True"
173 | neg_samples.append(neg_sample)
174 | return neg_samples, miss_num
175 | 
176 | def mixin_negtive_samples(self, pos_sample_generator, buffer=1000):
177 | """ 1. generate negative samples by randomly grouping sentence_1 and sentence_2 of positive samples
178 | 2. combine negative samples and positive samples
179 | 
180 | Args:
181 | pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
182 | 
183 | Returns:
184 | sample: one sample from shuffled positive samples and negative samples
185 | """
186 | pos_samples = []
187 | num_total_miss = 0
188 | pos_sample_num = 0
189 | try:
190 | while True:
191 | while len(pos_samples) < buffer:
192 | pos_sample = next(pos_sample_generator)
193 | label = pos_sample[3]
194 | assert label == 1, "positive sample's label must be 1"
195 | pos_samples.append(pos_sample)
196 | pos_sample_num += 1
197 | 
198 | neg_samples, miss_num = self.random_pair_neg_samples(
199 | pos_samples)
200 | num_total_miss += miss_num
201 | samples = pos_samples + neg_samples
202 | pos_samples = []
203 | np.random.shuffle(samples)
204 | for sample in samples:
205 | yield sample
206 | except StopIteration:
207 | print("stopiteration: reach end of file")
208 | if len(pos_samples) == 1:
209 | yield pos_samples[0]
210 | elif len(pos_samples) == 0:
211 | yield None
212 | else:
213 | neg_samples, miss_num = self.random_pair_neg_samples(
214 | pos_samples)
215 | num_total_miss += miss_num
216 | samples = pos_samples + neg_samples
217 | pos_samples = []
218 | np.random.shuffle(samples)
219 | for sample in samples:
220 | yield sample
221 | print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
222 | (num_total_miss, pos_sample_num * 2,
223 | num_total_miss / (pos_sample_num * 2)))
224 | 
225 | def data_generator(self):
226 | """
227 | data_generator
228 | """
229 | files = os.listdir(self.data_dir)
230 | self.total_file = len(files)
231 | assert self.total_file > 0, "[Error] data_dir is empty"
232 | 
233 | def wrapper():
234 | def reader():
235 | for epoch in range(self.epoch):
236 | self.current_epoch = epoch + 1
237 | if self.shuffle_files:
238 | np.random.shuffle(files)
239 | for index, file in enumerate(files):
240 | self.current_file_index = index + 1
241 | self.current_file = file
242 | sample_generator = self.read_file(file)
243 | if not self.is_test and self.generate_neg_sample:
244 | sample_generator = self.mixin_negtive_samples(
245 | sample_generator)
246 | for sample in sample_generator:
247 | if sample is None:
248 | continue
249 | yield sample
250 | 
251 | def batch_reader(reader, batch_size, in_tokens):
252 | batch, total_token_num, max_len = [], 0, 0
253 | for parsed_line in reader():
254 | token_ids, sent_ids, pos_ids, label = parsed_line
255 | max_len = max(max_len, len(token_ids))
256 | if in_tokens:
257 | to_append = (len(batch) + 1) * max_len <= batch_size
258 | else:
259 | to_append = len(batch) < batch_size
260 | if to_append:
261 | batch.append(parsed_line)
262 | total_token_num += len(token_ids)
263 | else:
264 | yield batch, total_token_num
265 | batch, total_token_num, max_len = [parsed_line], len(
266 | token_ids), len(token_ids)
267 | 
268 | if len(batch) > 0:
269 | yield batch, total_token_num
270 | 
271 | for batch_data, total_token_num in batch_reader(
272 | reader, self.batch_size, self.in_tokens):
273 | yield prepare_batch_data(
274 | batch_data,
275 | total_token_num,
276 | voc_size=self.voc_size,
277 | pad_id=self.pad_id,
278 | cls_id=self.cls_id,
279 | sep_id=self.sep_id,
280 | mask_id=self.mask_id,
281 | return_input_mask=True,
282 | return_max_len=False,
283 | return_num_token=False)
284 | 
285 | return wrapper
286 | 
287 | 
288 | if __name__ == "__main__":
289 | pass
290 | 
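# Hypothetical usage sketch (paths and vocab size are placeholders, not
# taken from the original file): data_generator() returns a callable whose
# invocation yields ready-made batches from prepare_batch_data, e.g.
#
#   reader = DataReader(data_dir="./data/train/",
#                       vocab_path="./config/vocab.txt",
#                       batch_size=4096, in_tokens=True,
#                       voc_size=30522, epoch=1)
#   for batch in reader.data_generator()():
#       pass  # feed `batch` into the train/test py_reader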
-------------------------------------------------------------------------------- /single_domain/reader/squad.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/reader/squad.pyc -------------------------------------------------------------------------------- /single_domain/run_classifier.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Finetuning on classification tasks."""
15 | 
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 | 
20 | import os
21 | import time
22 | import argparse
23 | import numpy as np
24 | import multiprocessing
25 | 
26 | import paddle
27 | import paddle.fluid as fluid
28 | 
29 | import reader.cls as reader
30 | from model.bert import BertConfig
31 | from model.classifier import create_model
32 | from optimization import optimization
33 | from utils.args import ArgumentGroup, print_arguments
34 | from utils.init import init_pretraining_params, init_checkpoint
35 | 
36 | # yapf: disable
37 | parser = argparse.ArgumentParser(__doc__)
38 | model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
39 | model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
40 | model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
41 | model_g.add_arg("init_pretraining_params", str, None,
42 | "Pre-trained params to initialize the model for fine-tuning. If the "
43 | "arg 'init_checkpoint' has been set, this argument is ignored.")
44 | model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
45 | 
46 | train_g = ArgumentGroup(parser, "training", "training options.")
47 | train_g.add_arg("epoch", int, 3, "Number of epochs for fine-tuning.")
48 | train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
49 | train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
50 | "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
51 | train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
52 | train_g.add_arg("warmup_proportion", float, 0.1,
53 | "Proportion of training steps to perform linear learning rate warmup for.")
54 | train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
55 | train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
56 | train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
57 | train_g.add_arg("loss_scaling", float, 1.0,
58 | "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
59 | 
60 | log_g = ArgumentGroup(parser, "logging", "logging related.")
61 | log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
62 | log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
63 | 
64 | data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
65 | data_g.add_arg("data_dir", str, None, "Path to training data.")
66 | data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
67 | data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest sequence.")
68 | data_g.add_arg("batch_size", int, 32, "Total number of examples in one batch for training. See also --in_tokens.")
69 | data_g.add_arg("in_tokens", bool, False,
70 | "If set, the batch size will be the maximum number of tokens in one batch. "
71 | "Otherwise, it will be the maximum number of examples in one batch.")
72 | data_g.add_arg("do_lower_case", bool, True,
73 | "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
74 | data_g.add_arg("random_seed", int, 0, "Random seed.")
75 | 
76 | run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
77 | run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
78 | run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (experimental).")
79 | run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "The iteration interval to clean up temporary variables.")
80 | run_type_g.add_arg("task_name", str, None,
81 | "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
82 | run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
83 | run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
84 | run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
85 | 
86 | args = parser.parse_args()
87 | # yapf: enable.
88 | 89 | 90 | def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase): 91 | test_pyreader.start() 92 | total_cost, total_acc, total_num_seqs = [], [], [] 93 | time_begin = time.time() 94 | while True: 95 | try: 96 | np_loss, np_acc, np_num_seqs = exe.run(program=test_program, 97 | fetch_list=fetch_list) 98 | total_cost.extend(np_loss * np_num_seqs) 99 | total_acc.extend(np_acc * np_num_seqs) 100 | total_num_seqs.extend(np_num_seqs) 101 | except fluid.core.EOFException: 102 | test_pyreader.reset() 103 | break 104 | time_end = time.time() 105 | print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" % 106 | (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs), 107 | np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin)) 108 | 109 | 110 | def main(args): 111 | bert_config = BertConfig(args.bert_config_path) 112 | bert_config.print_config() 113 | 114 | if args.use_cuda: 115 | place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) 116 | dev_count = fluid.core.get_cuda_device_count() 117 | else: 118 | place = fluid.CPUPlace() 119 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) 120 | exe = fluid.Executor(place) 121 | 122 | task_name = args.task_name.lower() 123 | processors = { 124 | 'xnli': reader.XnliProcessor, 125 | 'cola': reader.ColaProcessor, 126 | 'mrpc': reader.MrpcProcessor, 127 | 'mnli': reader.MnliProcessor, 128 | } 129 | 130 | processor = processors[task_name](data_dir=args.data_dir, 131 | vocab_path=args.vocab_path, 132 | max_seq_len=args.max_seq_len, 133 | do_lower_case=args.do_lower_case, 134 | in_tokens=args.in_tokens, 135 | random_seed=args.random_seed) 136 | num_labels = len(processor.get_labels()) 137 | 138 | if not (args.do_train or args.do_val or args.do_test): 139 | raise ValueError("For args `do_train`, `do_val` and `do_test`, at " 140 | "least one of them must be True.") 141 | 142 | startup_prog = fluid.Program() 143 | if args.random_seed is not None: 144 | startup_prog.random_seed = args.random_seed 145 | 146 | if args.do_train: 147 | train_data_generator = processor.data_generator( 148 | batch_size=args.batch_size, 149 | phase='train', 150 | epoch=args.epoch, 151 | dev_count=dev_count, 152 | shuffle=True) 153 | 154 | num_train_examples = processor.get_num_examples(phase='train') 155 | 156 | if args.in_tokens: 157 | max_train_steps = args.epoch * num_train_examples // ( 158 | args.batch_size // args.max_seq_len) // dev_count 159 | else: 160 | max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count 161 | 162 | warmup_steps = int(max_train_steps * args.warmup_proportion) 163 | print("Device count: %d" % dev_count) 164 | print("Num train examples: %d" % num_train_examples) 165 | print("Max train steps: %d" % max_train_steps) 166 | print("Num warmup steps: %d" % warmup_steps) 167 | 168 | train_program = fluid.Program() 169 | 170 | with fluid.program_guard(train_program, startup_prog): 171 | with fluid.unique_name.guard(): 172 | train_pyreader, loss, probs, accuracy, num_seqs = create_model( 173 | args, 174 | pyreader_name='train_reader', 175 | bert_config=bert_config, 176 | num_labels=num_labels) 177 | scheduled_lr = optimization( 178 | loss=loss, 179 | warmup_steps=warmup_steps, 180 | num_train_steps=max_train_steps, 181 | learning_rate=args.learning_rate, 182 | train_program=train_program, 183 | startup_prog=startup_prog, 184 | weight_decay=args.weight_decay, 185 | scheduler=args.lr_scheduler, 186 | use_fp16=args.use_fp16, 187 | loss_scaling=args.loss_scaling) 188 
| 
189 | fluid.memory_optimize(
190 | input_program=train_program,
191 | skip_opt_set=[
192 | loss.name, probs.name, accuracy.name, num_seqs.name
193 | ])
194 | 
195 | if args.verbose:
196 | if args.in_tokens:
197 | lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
198 | program=train_program,
199 | batch_size=args.batch_size // args.max_seq_len)
200 | else:
201 | lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
202 | program=train_program, batch_size=args.batch_size)
203 | print("Theoretical memory usage in training: %.3f - %.3f %s" %
204 | (lower_mem, upper_mem, unit))
205 | 
206 | if args.do_val or args.do_test:
207 | test_prog = fluid.Program()
208 | with fluid.program_guard(test_prog, startup_prog):
209 | with fluid.unique_name.guard():
210 | test_pyreader, loss, probs, accuracy, num_seqs = create_model(
211 | args,
212 | pyreader_name='test_reader',
213 | bert_config=bert_config,
214 | num_labels=num_labels)
215 | 
216 | test_prog = test_prog.clone(for_test=True)
217 | 
218 | exe.run(startup_prog)
219 | 
220 | if args.do_train:
221 | if args.init_checkpoint and args.init_pretraining_params:
222 | print(
223 | "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
224 | "both are set! Only arg 'init_checkpoint' takes effect.")
225 | if args.init_checkpoint:
226 | init_checkpoint(
227 | exe,
228 | args.init_checkpoint,
229 | main_program=startup_prog,
230 | use_fp16=args.use_fp16)
231 | elif args.init_pretraining_params:
232 | init_pretraining_params(
233 | exe,
234 | args.init_pretraining_params,
235 | main_program=startup_prog,
236 | use_fp16=args.use_fp16)
237 | elif args.do_val or args.do_test:
238 | if not args.init_checkpoint:
239 | raise ValueError("args 'init_checkpoint' should be set if "
240 | "only doing validation or testing!")
241 | init_checkpoint(
242 | exe,
243 | args.init_checkpoint,
244 | main_program=startup_prog,
245 | use_fp16=args.use_fp16)
246 | 
247 | if args.do_train:
248 | exec_strategy = fluid.ExecutionStrategy()
249 | exec_strategy.use_experimental_executor = args.use_fast_executor
250 | exec_strategy.num_threads = dev_count
251 | exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
252 | 
253 | train_exe = fluid.ParallelExecutor(
254 | use_cuda=args.use_cuda,
255 | loss_name=loss.name,
256 | exec_strategy=exec_strategy,
257 | main_program=train_program)
258 | 
259 | train_pyreader.decorate_tensor_provider(train_data_generator)
260 | else:
261 | train_exe = None
262 | 
263 | if args.do_val or args.do_test:
264 | test_exe = fluid.ParallelExecutor(
265 | use_cuda=args.use_cuda,
266 | main_program=test_prog,
267 | share_vars_from=train_exe)
268 | 
269 | if args.do_train:
270 | train_pyreader.start()
271 | steps = 0
272 | total_cost, total_acc, total_num_seqs = [], [], []
273 | time_begin = time.time()
274 | while True:
275 | try:
276 | steps += 1
277 | if steps % args.skip_steps == 0:
278 | if warmup_steps <= 0:
279 | fetch_list = [loss.name, accuracy.name, num_seqs.name]
280 | else:
281 | fetch_list = [
282 | loss.name, accuracy.name, scheduled_lr.name,
283 | num_seqs.name
284 | ]
285 | else:
286 | fetch_list = []
287 | 
288 | outputs = train_exe.run(fetch_list=fetch_list)
289 | 
290 | if steps % args.skip_steps == 0:
291 | if warmup_steps <= 0:
292 | np_loss, np_acc, np_num_seqs = outputs
293 | else:
294 | np_loss, np_acc, np_lr, np_num_seqs = outputs
295 | 
296 | total_cost.extend(np_loss * np_num_seqs)
297 | total_acc.extend(np_acc * np_num_seqs)
298 | total_num_seqs.extend(np_num_seqs)
299 | 
300 | if args.verbose:
301 | 
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( 302 | ) 303 | verbose += "learning rate: %f" % ( 304 | np_lr[0] 305 | if warmup_steps > 0 else args.learning_rate) 306 | print(verbose) 307 | 308 | current_example, current_epoch = processor.get_train_progress( 309 | ) 310 | time_end = time.time() 311 | used_time = time_end - time_begin 312 | print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " 313 | "ave acc: %f, speed: %f steps/s" % 314 | (current_epoch, current_example, num_train_examples, 315 | steps, np.sum(total_cost) / np.sum(total_num_seqs), 316 | np.sum(total_acc) / np.sum(total_num_seqs), 317 | args.skip_steps / used_time)) 318 | total_cost, total_acc, total_num_seqs = [], [], [] 319 | time_begin = time.time() 320 | 321 | if steps % args.save_steps == 0: 322 | save_path = os.path.join(args.checkpoints, 323 | "step_" + str(steps)) 324 | fluid.io.save_persistables(exe, save_path, train_program) 325 | 326 | if steps % args.validation_steps == 0: 327 | # evaluate dev set 328 | if args.do_val: 329 | test_pyreader.decorate_tensor_provider( 330 | processor.data_generator( 331 | batch_size=args.batch_size, 332 | phase='dev', 333 | epoch=1, 334 | dev_count=1, 335 | shuffle=False)) 336 | evaluate(exe, test_prog, test_pyreader, 337 | [loss.name, accuracy.name, num_seqs.name], 338 | "dev") 339 | # evaluate test set 340 | if args.do_test: 341 | test_pyreader.decorate_tensor_provider( 342 | processor.data_generator( 343 | batch_size=args.batch_size, 344 | phase='test', 345 | epoch=1, 346 | dev_count=1, 347 | shuffle=False)) 348 | evaluate(exe, test_prog, test_pyreader, 349 | [loss.name, accuracy.name, num_seqs.name], 350 | "test") 351 | except fluid.core.EOFException: 352 | save_path = os.path.join(args.checkpoints, "step_" + str(steps)) 353 | fluid.io.save_persistables(exe, save_path, train_program) 354 | train_pyreader.reset() 355 | break 356 | 357 | # final eval on dev set 358 | if args.do_val: 359 | test_pyreader.decorate_tensor_provider( 360 | processor.data_generator( 361 | batch_size=args.batch_size, phase='dev', epoch=1, dev_count=1, 362 | shuffle=False)) 363 | print("Final validation result:") 364 | evaluate(exe, test_prog, test_pyreader, 365 | [loss.name, accuracy.name, num_seqs.name], "dev") 366 | 367 | # final eval on test set 368 | if args.do_test: 369 | test_pyreader.decorate_tensor_provider( 370 | processor.data_generator( 371 | batch_size=args.batch_size, 372 | phase='test', 373 | epoch=1, 374 | dev_count=1, 375 | shuffle=False)) 376 | print("Final test result:") 377 | evaluate(exe, test_prog, test_pyreader, 378 | [loss.name, accuracy.name, num_seqs.name], "test") 379 | 380 | 381 | if __name__ == '__main__': 382 | print_arguments(args) 383 | main(args) 384 | -------------------------------------------------------------------------------- /single_domain/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import unicodedata
23 | import six
24 | 
25 | 
26 | def convert_to_unicode(text):
27 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
28 | if six.PY3:
29 | if isinstance(text, str):
30 | return text
31 | elif isinstance(text, bytes):
32 | return text.decode("utf-8", "ignore")
33 | else:
34 | raise ValueError("Unsupported string type: %s" % (type(text)))
35 | elif six.PY2:
36 | if isinstance(text, str):
37 | return text.decode("utf-8", "ignore")
38 | elif isinstance(text, unicode):
39 | return text
40 | else:
41 | raise ValueError("Unsupported string type: %s" % (type(text)))
42 | else:
43 | raise ValueError("Not running on Python2 or Python 3?")
44 | 
45 | 
46 | def printable_text(text):
47 | """Returns text encoded in a way suitable for print or `tf.logging`."""
48 | 
49 | # These functions want `str` for both Python2 and Python3, but in one case
50 | # it's a Unicode string and in the other it's a byte string.
51 | if six.PY3:
52 | if isinstance(text, str):
53 | return text
54 | elif isinstance(text, bytes):
55 | return text.decode("utf-8", "ignore")
56 | else:
57 | raise ValueError("Unsupported string type: %s" % (type(text)))
58 | elif six.PY2:
59 | if isinstance(text, str):
60 | return text
61 | elif isinstance(text, unicode):
62 | return text.encode("utf-8")
63 | else:
64 | raise ValueError("Unsupported string type: %s" % (type(text)))
65 | else:
66 | raise ValueError("Not running on Python2 or Python 3?")
67 | 
68 | 
69 | def load_vocab(vocab_file):
70 | """Loads a vocabulary file into a dictionary."""
71 | vocab = collections.OrderedDict()
72 | fin = open(vocab_file)
73 | for num, line in enumerate(fin):
74 | items = convert_to_unicode(line.strip()).split("\t")
75 | if len(items) > 2:
76 | break
77 | token = items[0]
78 | index = items[1] if len(items) == 2 else num
79 | token = token.strip()
80 | vocab[token] = int(index)
81 | return vocab
82 | 
83 | 
84 | def convert_by_vocab(vocab, items):
85 | """Converts a sequence of [tokens|ids] using the vocab."""
86 | output = []
87 | for item in items:
88 | output.append(vocab[item])
89 | return output
90 | 
91 | 
92 | def convert_tokens_to_ids(vocab, tokens):
93 | return convert_by_vocab(vocab, tokens)
94 | 
95 | 
96 | def convert_ids_to_tokens(inv_vocab, ids):
97 | return convert_by_vocab(inv_vocab, ids)
98 | 
99 | 
100 | def whitespace_tokenize(text):
101 | """Runs basic whitespace cleaning and splitting on a piece of text."""
102 | text = text.strip()
103 | if not text:
104 | return []
105 | tokens = text.split()
106 | return tokens
107 | 
108 | 
109 | class FullTokenizer(object):
110 | """Runs end-to-end tokenization."""
111 | 
112 | def __init__(self, vocab_file, do_lower_case=True):
113 | self.vocab = load_vocab(vocab_file)
114 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
115 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
116 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
117 | 
118 | def tokenize(self, text):
119 | split_tokens = []
120 | for token in self.basic_tokenizer.tokenize(text):
121 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
122 | split_tokens.append(sub_token)
123 | 
124 | return split_tokens
125 | 
126 | def convert_tokens_to_ids(self, tokens):
127 | return convert_by_vocab(self.vocab, tokens)
128 | 
129 | def convert_ids_to_tokens(self, ids):
130 | return convert_by_vocab(self.inv_vocab, ids)
131 | 
132 | 
133 | class CharTokenizer(object):
134 | """Runs end-to-end tokenization."""
135 | 
136 | def __init__(self, vocab_file, do_lower_case=True):
137 | self.vocab = load_vocab(vocab_file)
138 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
139 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
140 | 
141 | def tokenize(self, text):
142 | split_tokens = []
143 | for token in text.lower().split(" "):
144 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
145 | split_tokens.append(sub_token)
146 | 
147 | return split_tokens
148 | 
149 | def convert_tokens_to_ids(self, tokens):
150 | return convert_by_vocab(self.vocab, tokens)
151 | 
152 | def convert_ids_to_tokens(self, ids):
153 | return convert_by_vocab(self.inv_vocab, ids)
154 | 
155 | 
156 | class BasicTokenizer(object):
157 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
158 | 
159 | def __init__(self, do_lower_case=True):
160 | """Constructs a BasicTokenizer.
161 | 
162 | Args:
163 | do_lower_case: Whether to lower case the input.
164 | """
165 | self.do_lower_case = do_lower_case
166 | 
167 | def tokenize(self, text):
168 | """Tokenizes a piece of text."""
169 | text = convert_to_unicode(text)
170 | text = self._clean_text(text)
171 | 
172 | # This was added on November 1st, 2018 for the multilingual and Chinese
173 | # models. This is also applied to the English models now, but it doesn't
174 | # matter since the English models were not trained on any Chinese data
175 | # and generally don't have any Chinese data in them (there are Chinese
176 | # characters in the vocabulary because Wikipedia does have some Chinese
177 | # words in the English Wikipedia.).
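# _tokenize_chinese_chars pads each CJK codepoint with spaces, so a mixed
# string such as u"ab中c" becomes u"ab 中 c" and every CJK character
# survives the whitespace split below as its own token.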
178 | text = self._tokenize_chinese_chars(text)
179 | 
180 | orig_tokens = whitespace_tokenize(text)
181 | split_tokens = []
182 | for token in orig_tokens:
183 | if self.do_lower_case:
184 | token = token.lower()
185 | token = self._run_strip_accents(token)
186 | split_tokens.extend(self._run_split_on_punc(token))
187 | 
188 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
189 | return output_tokens
190 | 
191 | def _run_strip_accents(self, text):
192 | """Strips accents from a piece of text."""
193 | text = unicodedata.normalize("NFD", text)
194 | output = []
195 | for char in text:
196 | cat = unicodedata.category(char)
197 | if cat == "Mn":
198 | continue
199 | output.append(char)
200 | return "".join(output)
201 | 
202 | def _run_split_on_punc(self, text):
203 | """Splits punctuation on a piece of text."""
204 | chars = list(text)
205 | i = 0
206 | start_new_word = True
207 | output = []
208 | while i < len(chars):
209 | char = chars[i]
210 | if _is_punctuation(char):
211 | output.append([char])
212 | start_new_word = True
213 | else:
214 | if start_new_word:
215 | output.append([])
216 | start_new_word = False
217 | output[-1].append(char)
218 | i += 1
219 | 
220 | return ["".join(x) for x in output]
221 | 
222 | def _tokenize_chinese_chars(self, text):
223 | """Adds whitespace around any CJK character."""
224 | output = []
225 | for char in text:
226 | cp = ord(char)
227 | if self._is_chinese_char(cp):
228 | output.append(" ")
229 | output.append(char)
230 | output.append(" ")
231 | else:
232 | output.append(char)
233 | return "".join(output)
234 | 
235 | def _is_chinese_char(self, cp):
236 | """Checks whether CP is the codepoint of a CJK character."""
237 | # This defines a "chinese character" as anything in the CJK Unicode block:
238 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
239 | #
240 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
241 | # despite its name. The modern Korean Hangul alphabet is a different block,
242 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
243 | # space-separated words, so they are not treated specially and handled
244 | # like all of the other languages.
245 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
246 | (cp >= 0x3400 and cp <= 0x4DBF) or  #
247 | (cp >= 0x20000 and cp <= 0x2A6DF) or  #
248 | (cp >= 0x2A700 and cp <= 0x2B73F) or  #
249 | (cp >= 0x2B740 and cp <= 0x2B81F) or  #
250 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
251 | (cp >= 0xF900 and cp <= 0xFAFF) or  #
252 | (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
253 | return True
254 | 
255 | return False
256 | 
257 | def _clean_text(self, text):
258 | """Performs invalid character removal and whitespace cleanup on text."""
259 | output = []
260 | for char in text:
261 | cp = ord(char)
262 | if cp == 0 or cp == 0xfffd or _is_control(char):
263 | continue
264 | if _is_whitespace(char):
265 | output.append(" ")
266 | else:
267 | output.append(char)
268 | return "".join(output)
269 | 
270 | 
271 | class WordpieceTokenizer(object):
272 | """Runs WordPiece tokenization."""
273 | 
274 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
275 | self.vocab = vocab
276 | self.unk_token = unk_token
277 | self.max_input_chars_per_word = max_input_chars_per_word
278 | 
279 | def tokenize(self, text):
280 | """Tokenizes a piece of text into its word pieces.
281 | 
282 | This uses a greedy longest-match-first algorithm to perform tokenization
283 | using the given vocabulary.
284 | 
285 | For example:
286 | input = "unaffable"
287 | output = ["un", "##aff", "##able"]
288 | 
289 | Args:
290 | text: A single token or whitespace separated tokens. This should have
291 | already been passed through `BasicTokenizer`.
292 | 
293 | Returns:
294 | A list of wordpiece tokens.
295 | """
296 | 
297 | text = convert_to_unicode(text)
298 | 
299 | output_tokens = []
300 | for token in whitespace_tokenize(text):
301 | chars = list(token)
302 | if len(chars) > self.max_input_chars_per_word:
303 | output_tokens.append(self.unk_token)
304 | continue
305 | 
306 | is_bad = False
307 | start = 0
308 | sub_tokens = []
309 | while start < len(chars):
310 | end = len(chars)
311 | cur_substr = None
312 | while start < end:
313 | substr = "".join(chars[start:end])
314 | if start > 0:
315 | substr = "##" + substr
316 | if substr in self.vocab:
317 | cur_substr = substr
318 | break
319 | end -= 1
320 | if cur_substr is None:
321 | is_bad = True
322 | break
323 | sub_tokens.append(cur_substr)
324 | start = end
325 | 
326 | if is_bad:
327 | output_tokens.append(self.unk_token)
328 | else:
329 | output_tokens.extend(sub_tokens)
330 | return output_tokens
331 | 
332 | 
333 | def _is_whitespace(char):
334 | """Checks whether `chars` is a whitespace character."""
335 | # \t, \n, and \r are technically control characters but we treat them
336 | # as whitespace since they are generally considered as such.
337 | if char == " " or char == "\t" or char == "\n" or char == "\r":
338 | return True
339 | cat = unicodedata.category(char)
340 | if cat == "Zs":
341 | return True
342 | return False
343 | 
344 | 
345 | def _is_control(char):
346 | """Checks whether `chars` is a control character."""
347 | # These are technically control characters but we count them as whitespace
348 | # characters.
349 | if char == "\t" or char == "\n" or char == "\r":
350 | return False
351 | cat = unicodedata.category(char)
352 | if cat.startswith("C"):
353 | return True
354 | return False
355 | 
356 | 
357 | def _is_punctuation(char):
358 | """Checks whether `chars` is a punctuation character."""
359 | cp = ord(char)
360 | # We treat all non-letter/number ASCII as punctuation.
361 | # Characters such as "^", "$", and "`" are not in the Unicode
362 | # Punctuation class but we treat them as punctuation anyways, for
363 | # consistency.
364 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
365 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
366 | return True
367 | cat = unicodedata.category(char)
368 | if cat.startswith("P"):
369 | return True
370 | return False
371 | 
-------------------------------------------------------------------------------- /single_domain/tokenization.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/tokenization.pyc -------------------------------------------------------------------------------- /single_domain/train.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """BERT pretraining."""
15 | 
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 | 
20 | import os
21 | import time
22 | import argparse
23 | import numpy as np
24 | import multiprocessing
25 | 
26 | import paddle
27 | import paddle.fluid as fluid
28 | 
29 | from reader.pretraining import DataReader
30 | from model.bert import BertModel, BertConfig
31 | from optimization import optimization
32 | from utils.args import ArgumentGroup, print_arguments
33 | from utils.init import init_checkpoint, init_pretraining_params
34 | 
35 | # yapf: disable
36 | parser = argparse.ArgumentParser(__doc__)
37 | model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
38 | model_g.add_arg("bert_config_path", str, "./config/bert_config.json", "Path to the json file for bert model config.")
39 | model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
40 | model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
41 | model_g.add_arg("weight_sharing", bool, True, "If set, share weights between word embedding and masked lm.")
42 | model_g.add_arg("generate_neg_sample", bool, True, "If set, randomly generate negative samples from positive samples.")
43 | 
44 | train_g = ArgumentGroup(parser, "training", "training options.")
45 | train_g.add_arg("epoch", int, 100, "Number of epochs for training.")
46 | train_g.add_arg("learning_rate", float, 0.0001, "Learning rate used to train with warmup.")
47 | train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
48 | "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
49 | train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
50 | train_g.add_arg("num_train_steps", int, 1000000, "Total steps to perform pretraining.")
51 | train_g.add_arg("warmup_steps", int, 4000, "Total steps to perform warmup when pretraining.")
52 | train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
53 | train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
54 | train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
55 | train_g.add_arg("loss_scaling", float, 1.0,
56 | "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
57 | 
58 | log_g = ArgumentGroup(parser, "logging", "logging related.")
59 | log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
60 | log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
61 | 
62 | data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
63 | data_g.add_arg("data_dir", str, "./data/train/", "Path to training data.")
64 | data_g.add_arg("validation_set_dir", str, "./data/validation/", "Path to validation data.")
65 | data_g.add_arg("test_set_dir", str, None, "Path to test data.")
66 | data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.")
67 | data_g.add_arg("max_seq_len", int, 512, "Number of tokens of the longest sequence allowed.")
68 | data_g.add_arg("batch_size", int, 8192,
69 | "The total number of examples in one batch for training, see also --in_tokens.")
70 | data_g.add_arg("in_tokens", bool, True,
71 | "If set, the batch size will be the maximum number of tokens in one batch. "
72 | "Otherwise, it will be the maximum number of examples in one batch.")
73 | 
74 | run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
75 | run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
76 | run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
77 | run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (experimental).")
78 | run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "The iteration interval to clean up temporary variables.")
79 | run_type_g.add_arg("do_test", bool, False, "Whether to perform evaluation on test data set.")
80 | 
81 | args = parser.parse_args()
82 | # yapf: enable.
83 | 
84 | 
85 | def create_model(pyreader_name, bert_config):
86 | pyreader = fluid.layers.py_reader(
87 | capacity=70,
88 | shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
89 | [-1, args.max_seq_len, 1],
90 | [-1, args.max_seq_len, 1], [-1, 1], [-1, 1],
91 | [-1, 1]],
92 | dtypes=[
93 | 'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'
94 | ],
95 | lod_levels=[0, 0, 0, 0, 0, 0, 0],
96 | name=pyreader_name,
97 | use_double_buffer=True)
98 | 
99 | (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels) = fluid.layers.read_file(pyreader)
100 | 
101 | bert = BertModel(
102 | src_ids=src_ids,
103 | position_ids=pos_ids,
104 | sentence_ids=sent_ids,
105 | input_mask=input_mask,
106 | config=bert_config,
107 | weight_sharing=args.weight_sharing,
108 | use_fp16=args.use_fp16)
109 | 
110 | next_sent_acc, mask_lm_loss, total_loss = bert.get_pretraining_output(
111 | mask_label, mask_pos, labels)
112 | 
113 | if args.use_fp16 and args.loss_scaling > 1.0:
114 | total_loss *= args.loss_scaling
115 | 
116 | return pyreader, next_sent_acc, mask_lm_loss, total_loss
117 | 
118 | 
119 | def predict_wrapper(args,
120 | exe,
121 | bert_config,
122 | test_prog=None,
123 | pyreader=None,
124 | fetch_list=None):
125 | # Context to do validation.
126 | data_path = args.test_set_dir if args.do_test else args.validation_set_dir
127 | data_reader = DataReader(
128 | data_path,
129 | vocab_path=args.vocab_path,
130 | batch_size=args.batch_size,
131 | in_tokens=args.in_tokens,
132 | voc_size=bert_config['vocab_size'],
133 | shuffle_files=False,
134 | epoch=1,
135 | max_seq_len=args.max_seq_len,
136 | is_test=True)
137 | 
138 | if args.do_test:
139 | assert args.init_checkpoint is not None, "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' \
140 | to specify your pretrained model checkpoints"
141 | 
142 | init_pretraining_params(exe, args.init_checkpoint, test_prog)
143 | 
144 | def predict(exe=exe, pyreader=pyreader):
145 | 
146 | pyreader.decorate_tensor_provider(data_reader.data_generator())
147 | pyreader.start()
148 | 
149 | cost = 0
150 | lm_cost = 0
151 | acc = 0
152 | steps = 0
153 | time_begin = time.time()
154 | while True:
155 | try:
156 | each_next_acc, each_mask_lm_cost, each_total_cost = exe.run(
157 | fetch_list=fetch_list, program=test_prog)
158 | acc += each_next_acc
159 | lm_cost += each_mask_lm_cost
160 | cost += each_total_cost
161 | steps += 1
162 | if args.do_test and steps % args.skip_steps == 0:
163 | print("[test_set] steps: %d" % steps)
164 | 
165 | except fluid.core.EOFException:
166 | pyreader.reset()
167 | break
168 | 
169 | used_time = time.time() - time_begin
170 | return cost, lm_cost, acc, steps, (args.skip_steps / used_time)
171 | 
172 | return predict
173 | 
174 | 
175 | def test(args):
176 | bert_config = BertConfig(args.bert_config_path)
177 | bert_config.print_config()
178 | 
179 | test_prog = fluid.Program()
180 | test_startup = fluid.Program()
181 | with fluid.program_guard(test_prog, test_startup):
182 | with fluid.unique_name.guard():
183 | test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
184 | pyreader_name='test_reader', bert_config=bert_config)
185 | 
186 | test_prog = test_prog.clone(for_test=True)
187 | 
188 | place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
189 | exe = fluid.Executor(place)
190 | exe.run(test_startup)
191 | 
192 | predict = predict_wrapper(
193 | args,
194 | exe,
195 | bert_config,
196 | test_prog=test_prog,
197 | pyreader=test_pyreader,
198 | fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])
199 | 
200 | print("test begin")
201 | loss, lm_loss, acc, steps, speed = predict()
202 | print(
203 | "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
204 | % (np.mean(np.array(loss) / steps),
205 | np.exp(np.mean(np.array(lm_loss) / steps)),
206 | np.mean(np.array(acc) / steps), speed))
207 | 
208 | 
209 | def train(args):
210 | print("pretraining start")
211 | bert_config = BertConfig(args.bert_config_path)
212 | bert_config.print_config()
213 | 
214 | train_program = fluid.Program()
215 | startup_prog = fluid.Program()
216 | with fluid.program_guard(train_program, startup_prog):
217 | with fluid.unique_name.guard():
218 | train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
219 | pyreader_name='train_reader', bert_config=bert_config)
220 | scheduled_lr = optimization(
221 | loss=total_loss,
222 | warmup_steps=args.warmup_steps,
223 | num_train_steps=args.num_train_steps,
224 | learning_rate=args.learning_rate,
225 | train_program=train_program,
226 | startup_prog=startup_prog,
227 | weight_decay=args.weight_decay,
228 | scheduler=args.lr_scheduler,
229 | use_fp16=args.use_fp16,
230 | loss_scaling=args.loss_scaling)
231 | 
232 | fluid.memory_optimize(
233 | input_program=train_program,
234 | skip_opt_set=[
235 | next_sent_acc.name, mask_lm_loss.name, total_loss.name
236 | ])
237 | 
238 | test_prog = fluid.Program()
239 | with fluid.program_guard(test_prog, startup_prog):
240 | with fluid.unique_name.guard():
241 | test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
242 | pyreader_name='test_reader', bert_config=bert_config)
243 | 
244 | test_prog = test_prog.clone(for_test=True)
245 | 
246 | if args.use_cuda:
247 | place = fluid.CUDAPlace(0)
248 | dev_count = fluid.core.get_cuda_device_count()
249 | else:
250 | place = fluid.CPUPlace()
251 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
252 | 
253 | print("Device count %d" % dev_count)
254 | if args.verbose:
255 | if args.in_tokens:
256 | lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
257 | program=train_program,
258 | batch_size=args.batch_size // args.max_seq_len)
259 | else:
260 | lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
261 | program=train_program, batch_size=args.batch_size)
262 | print("Theoretical memory usage in training: %.3f - %.3f %s" %
263 | (lower_mem, upper_mem, unit))
264 | 
265 | nccl2_num_trainers = 1
266 | nccl2_trainer_id = 0
267 | print("args.is_distributed:", args.is_distributed)
268 | if args.is_distributed:
269 | worker_endpoints_env = os.getenv("worker_endpoints")
270 | worker_endpoints = worker_endpoints_env.split(",")
271 | trainers_num = len(worker_endpoints)
272 | current_endpoint = os.getenv("current_endpoint")
273 | trainer_id = worker_endpoints.index(current_endpoint)
274 | if trainer_id == 0:
275 | print("trainer_id == 0, sleep 60s")
276 | time.sleep(60)
277 | print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
278 | trainer_id:{}"
279 | .format(worker_endpoints, trainers_num,
280 | current_endpoint, trainer_id))
281 | 
282 | # prepare nccl2 env.
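# The NCCL2-mode transpiler below rewrites train_program for synchronous
# data-parallel training: every endpoint in worker_endpoints runs this same
# script, and gradients are all-reduced across workers via NCCL.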
283 |         config = fluid.DistributeTranspilerConfig()
284 |         config.mode = "nccl2"
285 |         t = fluid.DistributeTranspiler(config=config)
286 |         t.transpile(
287 |             trainer_id,
288 |             trainers=worker_endpoints_env,
289 |             current_endpoint=current_endpoint,
290 |             program=train_program,
291 |             startup_program=startup_prog)
292 |         nccl2_num_trainers = trainers_num
293 |         nccl2_trainer_id = trainer_id
294 | 
295 |     exe = fluid.Executor(place)
296 |     exe.run(startup_prog)
297 | 
298 |     if args.init_checkpoint and args.init_checkpoint != "":
299 |         init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16)
300 | 
301 |     data_reader = DataReader(
302 |         data_dir=args.data_dir,
303 |         batch_size=args.batch_size,
304 |         in_tokens=args.in_tokens,
305 |         vocab_path=args.vocab_path,
306 |         voc_size=bert_config['vocab_size'],
307 |         epoch=args.epoch,
308 |         max_seq_len=args.max_seq_len,
309 |         generate_neg_sample=args.generate_neg_sample)
310 | 
311 |     exec_strategy = fluid.ExecutionStrategy()
312 |     exec_strategy.use_experimental_executor = args.use_fast_executor
313 |     exec_strategy.num_threads = dev_count
314 |     exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
315 | 
316 |     train_exe = fluid.ParallelExecutor(
317 |         use_cuda=args.use_cuda,
318 |         loss_name=total_loss.name,
319 |         exec_strategy=exec_strategy,
320 |         main_program=train_program,
321 |         num_trainers=nccl2_num_trainers,
322 |         trainer_id=nccl2_trainer_id)
323 | 
324 |     if args.validation_set_dir and args.validation_set_dir != "":
325 |         predict = predict_wrapper(
326 |             args,
327 |             exe,
328 |             bert_config,
329 |             test_prog=test_prog,
330 |             pyreader=test_pyreader,
331 |             fetch_list=[
332 |                 next_sent_acc.name, mask_lm_loss.name, total_loss.name
333 |             ])
334 | 
335 |     train_pyreader.decorate_tensor_provider(data_reader.data_generator())
336 |     train_pyreader.start()
337 |     steps = 0
338 |     cost = []
339 |     lm_cost = []
340 |     acc = []
341 |     time_begin = time.time()
342 |     while steps < args.num_train_steps:
343 |         try:
344 |             steps += nccl2_num_trainers
345 |             skip_steps = args.skip_steps * nccl2_num_trainers
346 | 
347 |             if nccl2_trainer_id != 0:
348 |                 train_exe.run(fetch_list=[])
349 |                 continue
350 | 
351 |             if steps % skip_steps != 0:
352 |                 train_exe.run(fetch_list=[])
353 |             else:
354 |                 each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
355 |                     fetch_list=[
356 |                         next_sent_acc.name, mask_lm_loss.name, total_loss.name,
357 |                         scheduled_lr.name
358 |                     ])
359 |                 acc.extend(each_next_acc)
360 |                 lm_cost.extend(each_mask_lm_cost)
361 |                 cost.extend(each_total_cost)
362 | 
363 |                 print("feed_queue size", train_pyreader.queue.size())
364 |                 time_end = time.time()
365 |                 used_time = time_end - time_begin
366 |                 epoch, current_file_index, total_file, current_file = data_reader.get_progress(
367 |                 )
368 |                 print("current learning_rate:%f" % np_lr[0])
369 |                 print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
370 |                       "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s"
371 |                       % (epoch, current_file_index, total_file, steps,
372 |                          np.mean(np.array(cost)),
373 |                          np.mean(np.exp(np.array(lm_cost))),
374 |                          np.mean(np.array(acc)), skip_steps / used_time,
375 |                          current_file))
376 |                 cost = []
377 |                 lm_cost = []
378 |                 acc = []
379 |                 time_begin = time.time()
380 | 
381 |             if steps % args.save_steps == 0:
382 |                 save_path = os.path.join(args.checkpoints, "step_" + str(steps))
383 |                 fluid.io.save_persistables(exe, save_path, train_program)
384 | 
385 |             if args.validation_set_dir and steps % args.validation_steps == 0:
386 |                 vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
387 |                 )
388 |                 print("[validation_set] epoch: %d, step: %d, "
389 |                       "loss: %f, global ppl: %f, batch-averaged ppl: %f, "
390 |                       "next_sent_acc: %f, speed: %f steps/s" %
391 |                       (epoch, steps,
392 |                        np.mean(np.array(vali_cost) / vali_steps),
393 |                        np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
394 |                        np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
395 |                        np.mean(np.array(vali_acc) / vali_steps), vali_speed))
396 | 
397 |         except fluid.core.EOFException:
398 |             train_pyreader.reset()
399 |             break
400 | 
401 | 
402 | if __name__ == '__main__':
403 |     print_arguments(args)
404 |     if args.do_test:
405 |         test(args)
406 |     else:
407 |         train(args)
408 | 
--------------------------------------------------------------------------------
/single_domain/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/utils/__init__.py
--------------------------------------------------------------------------------
/single_domain/utils/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/utils/__init__.pyc
--------------------------------------------------------------------------------
/single_domain/utils/args.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
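# (editor's note, not part of the original file) Minimal usage sketch of the
# helpers below, mirroring how the run_* scripts build their CLIs; the
# argument names here are illustrative only:
#
#   parser = argparse.ArgumentParser()
#   model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
#   model_g.add_arg("bert_config_path", str, None, "Path to the BERT config json.")
#   model_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision.")
#   args = parser.parse_args()
#   print_arguments(args)
#
# Boolean options are routed through str2bool because argparse would otherwise
# treat any non-empty string, including "False", as truthy.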
14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | 23 | 24 | def str2bool(v): 25 | # because argparse does not support to parse "true, False" as python 26 | # boolean directly 27 | return v.lower() in ("true", "t", "1") 28 | 29 | 30 | class ArgumentGroup(object): 31 | def __init__(self, parser, title, des): 32 | self._group = parser.add_argument_group(title=title, description=des) 33 | 34 | def add_arg(self, name, type, default, help, **kwargs): 35 | type = str2bool if type == bool else type 36 | self._group.add_argument( 37 | "--" + name, 38 | default=default, 39 | type=type, 40 | help=help + ' Default: %(default)s.', 41 | **kwargs) 42 | 43 | 44 | def print_arguments(args): 45 | print('----------- Configuration Arguments -----------') 46 | for arg, value in sorted(six.iteritems(vars(args))): 47 | print('%s: %s' % (arg, value)) 48 | print('------------------------------------------------') 49 | -------------------------------------------------------------------------------- /single_domain/utils/args.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/utils/args.pyc -------------------------------------------------------------------------------- /single_domain/utils/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
46 |     new_p = fluid.framework.Parameter(
47 |         block=block,
48 |         shape=v.shape,
49 |         dtype=fluid.core.VarDesc.VarType.FP32,
50 |         type=v.type,
51 |         lod_level=v.lod_level,
52 |         stop_gradient=p.stop_gradient,
53 |         trainable=p.trainable,
54 |         optimize_attr=p.optimize_attr,
55 |         regularizer=p.regularizer,
56 |         gradient_clip_attr=p.gradient_clip_attr,
57 |         error_clip=p.error_clip,
58 |         name=v.name + ".master")
59 |     return new_p
60 | 
61 | 
62 | def create_master_params_grads(params_grads, main_prog, startup_prog,
63 |                                loss_scaling):
64 |     master_params_grads = []
65 |     tmp_role = main_prog._current_role
66 |     OpRole = fluid.core.op_proto_and_checker_maker.OpRole
67 |     main_prog._current_role = OpRole.Backward
68 |     for p, g in params_grads:
69 |         # create master parameters
70 |         master_param = copy_to_master_param(p, main_prog.global_block())
71 |         startup_master_param = startup_prog.global_block()._clone_variable(
72 |             master_param)
73 |         startup_p = startup_prog.global_block().var(p.name)
74 |         cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
75 |         # cast fp16 gradients to fp32 before applying gradients
76 |         if g.name.find("layer_norm") > -1:
77 |             if loss_scaling > 1:
78 |                 scaled_g = g / float(loss_scaling)
79 |             else:
80 |                 scaled_g = g
81 |             master_params_grads.append([p, scaled_g])
82 |             continue
83 |         master_grad = fluid.layers.cast(g, "float32")
84 |         if loss_scaling > 1:
85 |             master_grad = master_grad / float(loss_scaling)
86 |         master_params_grads.append([master_param, master_grad])
87 |     main_prog._current_role = tmp_role
88 |     return master_params_grads
89 | 
90 | 
91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog):
92 |     for idx, m_p_g in enumerate(master_params_grads):
93 |         train_p, _ = params_grads[idx]
94 |         if train_p.name.find("layer_norm") > -1:
95 |             continue
96 |         with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
97 |             cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
98 | 
--------------------------------------------------------------------------------
/single_domain/utils/fp16.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/utils/fp16.pyc
--------------------------------------------------------------------------------
/single_domain/utils/init.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
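# (editor's note, not part of the original file) The two loaders below differ
# in scope: init_checkpoint() restores every persistable variable found in the
# checkpoint directory (weights plus optimizer state, minus the learning-rate
# schedule counters) and is meant for resuming training, while
# init_pretraining_params() restores only model Parameters and is meant for
# initializing fine-tuning or inference from a released checkpoint.
# Hypothetical call, assuming the downloaded BERT checkpoint layout:
#
#   init_pretraining_params(exe, "uncased_L-12_H-768_A-12/params", test_prog)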
14 | 
15 | from __future__ import print_function
16 | 
17 | import os
18 | import six
19 | import ast
20 | import copy
21 | 
22 | import numpy as np
23 | import paddle.fluid as fluid
24 | 
25 | 
26 | def cast_fp32_to_fp16(exe, main_program):
27 |     print("Cast parameters to float16 data format.")
28 |     for param in main_program.global_block().all_parameters():
29 |         if not param.name.endswith(".master"):
30 |             param_t = fluid.global_scope().find_var(param.name).get_tensor()
31 |             data = np.array(param_t)
32 |             if param.name.find("layer_norm") == -1:
33 |                 param_t.set(np.float16(data).view(np.uint16), exe.place)
34 |             master_param_var = fluid.global_scope().find_var(param.name +
35 |                                                              ".master")
36 |             if master_param_var is not None:
37 |                 master_param_var.get_tensor().set(data, exe.place)
38 | 
39 | 
40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
41 |     assert os.path.exists(
42 |         init_checkpoint_path), "[%s] can't be found." % init_checkpoint_path
43 | 
44 |     def existed_persistables(var):
45 |         if not fluid.io.is_persistable(var) or "scheduled_learning_rate" in var.name or "@LR_DECAY_COUNTER@" in var.name:
46 |             return False
47 |         return os.path.exists(os.path.join(init_checkpoint_path, var.name))
48 | 
49 |     fluid.io.load_vars(
50 |         exe,
51 |         init_checkpoint_path,
52 |         main_program=main_program,
53 |         predicate=existed_persistables)
54 |     print("Load model from {}".format(init_checkpoint_path))
55 | 
56 |     if use_fp16:
57 |         cast_fp32_to_fp16(exe, main_program)
58 | 
59 | 
60 | def init_pretraining_params(exe,
61 |                             pretraining_params_path,
62 |                             main_program,
63 |                             use_fp16=False):
64 |     assert os.path.exists(pretraining_params_path
65 |                           ), "[%s] can't be found." % pretraining_params_path
66 | 
67 |     def existed_params(var):
68 |         if not isinstance(var, fluid.framework.Parameter):
69 |             return False
70 |         return os.path.exists(os.path.join(pretraining_params_path, var.name))
71 | 
72 |     fluid.io.load_vars(
73 |         exe,
74 |         pretraining_params_path,
75 |         main_program=main_program,
76 |         predicate=existed_params)
77 |     print("Load pretraining parameters from {}.".format(
78 |         pretraining_params_path))
79 | 
80 |     if use_fp16:
81 |         cast_fp32_to_fp16(exe, main_program)
82 | 
--------------------------------------------------------------------------------
/single_domain/utils/init.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XMUDeepLIT/IMM/42af48a7b1df5eca3e3d677f0606594e6b95d6a6/single_domain/utils/init.pyc
--------------------------------------------------------------------------------
/small_clean.sh:
--------------------------------------------------------------------------------
1 | rm -r align*
2 | rm -r NewsQA_small
3 | rm -r SQuAD_small
4 | rm -r HotpotQA_small
5 | rm -r NaturalQuestions_small
6 | rm -r TriviaQA_small
7 | rm total_logits/*
8 | rm id_map
9 | rm base_train.log
10 | rm test.log
11 | rm ./data/All_domain.raw.json_*
12 | rm ./data/*4cb*
13 | 
--------------------------------------------------------------------------------
/small_prepare_dir.sh:
--------------------------------------------------------------------------------
1 | cp -r single_domain SQuAD_small
2 | cp -r single_domain NewsQA_small
3 | cp -r single_domain HotpotQA_small
4 | cp -r single_domain NaturalQuestions_small
5 | cp -r single_domain TriviaQA_small
6 | cp -r checkpoints/SQuAD/step_best/ SQuAD_small/squad/checkpoints/
7 | cp -r checkpoints/NewsQA/step_best/ NewsQA_small/squad/checkpoints/
8 | cp -r checkpoints/HotpotQA/step_best/ HotpotQA_small/squad/checkpoints/
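# (editor's note, not part of the original script) Each domain receives its
# own copy of single_domain/ plus that domain's best single-domain checkpoint,
# so the five per-domain workers launched by main_iter.sh can fine-tune in
# parallel, one GPU per domain.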
9 | cp -r checkpoints/NaturalQuestions/step_best/ NaturalQuestions_small/squad/checkpoints/
10 | cp -r checkpoints/TriviaQA/step_best/ TriviaQA_small/squad/checkpoints/
11 | 
--------------------------------------------------------------------------------
/small_run.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | bash small_prepare_dir.sh
3 | nohup bash main_iter.sh SQuAD_small NewsQA_small HotpotQA_small NaturalQuestions_small TriviaQA_small > test.log &
4 | #bash main_iter.sh SQuAD_small NewsQA_small
5 | 
--------------------------------------------------------------------------------
/split_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import json
4 | import sys
5 | import argparse
6 | import os
7 | 
8 | reload(sys)
9 | sys.setdefaultencoding('utf8')
10 | 
11 | def parse_args():
12 |     parser = argparse.ArgumentParser()
13 |     parser.add_argument('--path', type=str, help='directory containing the raw json files')
14 |     parser.add_argument('--domains', nargs='+', type=str, help='domain names to process')
15 |     args = parser.parse_args()
16 |     return args
17 | 
18 | dev_size = 500
19 | 
20 | if __name__ == '__main__':
21 |     args = parse_args()
22 |     file_nums = len(args.domains)
23 |     max_len = -9999
24 |     for i in range(0, file_nums):
25 |         now_data_list = []
26 |         file_name = os.path.join(args.path, (args.domains[i]+'.raw.json'))
27 |         now_dataset = json.load(open(file_name,'r'))
28 |         article_list = now_dataset['data']
29 |         for article in article_list:
30 |             title = article['title']
31 |             for paragraph in article['paragraphs']:
32 |                 context = paragraph['context']
33 |                 for qas in paragraph['qas']:
34 |                     gen_paragraph = {'context': context, 'qas':[qas]}
35 |                     data_item = {'title': title, 'paragraphs':[gen_paragraph]}
36 |                     now_data_list.append(data_item)
37 |         # dev_set = now_data_list[:dev_size]
38 |         # train_set = now_data_list[dev_size:]
39 |         split_len = len(now_data_list) / 5  # Python 2 integer division
40 |         now_idx = split_len
41 |         before_idx = 0
42 |         for part_idx in range(5):  # emit five near-even splits per domain
43 |             if part_idx == 4:
44 |                 now_idx = len(now_data_list)  # last split takes the remainder
45 |             part = now_data_list[before_idx:now_idx]
46 |             before_idx += split_len
47 |             now_idx += split_len
48 |             json.dump({"data":part}, open(file_name + "_" + str(part_idx), 'w'), indent=4, ensure_ascii=False)
49 | 
50 | 
--------------------------------------------------------------------------------
/upsample.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import json
4 | import sys
5 | import argparse
6 | import random
7 | import os
8 | 
9 | reload(sys)
10 | sys.setdefaultencoding('utf8')
11 | 
12 | def parse_args():
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument('--path', type=str, help='directory containing the raw json files')
15 |     parser.add_argument('--domains', nargs='+', type=str, help='domain names to process')
16 |     args = parser.parse_args()
17 |     return args
18 | 
19 | if __name__ == '__main__':
20 |     args = parse_args()
21 |     file_nums = len(args.domains)
22 |     max_len = -9999
23 |     for i in range(file_nums):  # first pass: count QA pairs per domain
24 |         now_len = 0
25 |         file_name = os.path.join(args.path, (args.domains[i]+'.raw.json'))
26 |         now_dataset = json.load(open(file_name,'r'))
27 |         article_list = now_dataset['data']
28 |         for article in article_list:
29 |             for paragraph in article['paragraphs']:
30 |                 for qas in paragraph['qas']:
31 |                     now_len += 1
32 |         if now_len > max_len:
33 |             max_len = now_len
34 | 
35 |     # total_data_list = []
36 | 
37 |     for i in range(0, file_nums):
38 |         now_data_list = []
39 | 
40 |         file_name = os.path.join(args.path, (args.domains[i]+'.raw.json'))
41 |         now_dataset = json.load(open(file_name,'r'))
42 |         article_list = now_dataset['data']
43 |         for batch_idx in range(2):  # build two shuffled copies of each domain
44 |             one_batch = []
45 |             for article in article_list:
46 |                 title = article['title']
47 |                 for paragraph in article['paragraphs']:
48 |                     context = paragraph['context']
49 |                     for qas in paragraph['qas']:
50 |                         gen_paragraph = {'context': context, 'qas':[qas]}
51 |                         data_item = {'title': title, 'paragraphs':[gen_paragraph]}
52 |                         one_batch.append(data_item)
53 |             random.shuffle(one_batch)
54 |             if len(one_batch) < max_len:  # upsample smaller domains to max_len
55 |                 times = max_len / len(one_batch)  # Python 2 integer division
56 |                 extends = max_len % len(one_batch)
57 |                 append_data = []
58 |                 for t in range(times - 1):
59 |                     append_data = append_data + one_batch
60 |                 append_data = append_data + one_batch[:extends]
61 |                 one_batch = one_batch + append_data
62 |                 random.shuffle(one_batch)
63 |             now_data_list = now_data_list + one_batch
64 |         file_name = file_name.replace('.raw.json','.4cb.raw.json')
65 |         json.dump({"data":now_data_list}, open(file_name, 'w'), indent=4, ensure_ascii=False)
66 | 
--------------------------------------------------------------------------------