├── .gitignore ├── ACL2019-KTNET ├── downloaded_files.md5 ├── images │ └── architecture.png ├── reading_comprehension │ ├── eval_record_nell.sh │ ├── eval_record_twomemory.sh │ ├── eval_record_wordnet.sh │ ├── eval_squad_nell.sh │ ├── eval_squad_twomemory.sh │ ├── eval_squad_wordnet.sh │ ├── run_record_nell.sh │ ├── run_record_nell_finetune.sh │ ├── run_record_nell_pretrain.sh │ ├── run_record_twomemory.sh │ ├── run_record_twomemory_finetune.sh │ ├── run_record_twomemory_pretrain.sh │ ├── run_record_wordnet.sh │ ├── run_record_wordnet_finetune.sh │ ├── run_record_wordnet_pretrain.sh │ ├── run_squad_nell.sh │ ├── run_squad_nell_finetune.sh │ ├── run_squad_nell_pretrain.sh │ ├── run_squad_twomemory.sh │ ├── run_squad_twomemory_finetune.sh │ ├── run_squad_twomemory_pretrain.sh │ ├── run_squad_wordnet.sh │ ├── run_squad_wordnet_finetune.sh │ ├── run_squad_wordnet_pretrain.sh │ └── src │ │ ├── batching.py │ │ ├── batching_twomemory.py │ │ ├── eval │ │ └── __init__.py │ │ ├── model │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── layers.py │ │ └── transformer_encoder.py │ │ ├── optimization.py │ │ ├── reader │ │ ├── __init__.py │ │ ├── record.py │ │ ├── record_twomemory.py │ │ ├── squad.py │ │ └── squad_twomemory.py │ │ ├── run_record.py │ │ ├── run_record_twomemory.py │ │ ├── run_squad.py │ │ ├── run_squad_twomemory.py │ │ ├── tokenization.py │ │ └── utils │ │ ├── __init__.py │ │ ├── args.py │ │ ├── fp16.py │ │ └── init.py ├── readme.md └── retrieve_concepts │ ├── ner_tagging_squad │ └── tagging.py │ ├── retrieve_nell │ ├── nell_concept_list.txt │ └── retrieve.py │ ├── retrieve_wordnet │ ├── retrieve.py │ └── wordnet-mlj12-definitions.txt │ ├── tokenization_record │ ├── do_tokenization.py │ ├── tokenization.py │ ├── vocab.cased.txt │ └── vocab.uncased.txt │ └── tokenization_squad │ ├── do_tokenization.py │ ├── tokenization.py │ ├── vocab.cased.txt │ └── vocab.uncased.txt ├── DuQM ├── README.md ├── data.py ├── model.py ├── predict.py └── train.py ├── DuReader-2.0 ├── .gitignore ├── LICENSE ├── README.md ├── data │ ├── demo │ │ ├── devset │ │ │ └── search.dev.json │ │ ├── testset │ │ │ └── search.test.json │ │ └── trainset │ │ │ └── search.train.json │ ├── download.sh │ └── md5sum.txt ├── paddle │ ├── UPDATES.md │ ├── args.py │ ├── dataset.py │ ├── paragraph_extraction.py │ ├── preprocess.py │ ├── rc_model.py │ ├── run.py │ ├── run.sh │ └── vocab.py ├── tensorflow │ ├── dataset.py │ ├── layers │ │ ├── __init__.py │ │ ├── basic_rnn.py │ │ ├── match_layer.py │ │ └── pointer_net.py │ ├── rc_model.py │ ├── run.py │ └── vocab.py └── utils │ ├── __init__.py │ ├── download_thirdparty.sh │ ├── dureader_eval.py │ ├── get_vocab.py │ ├── marco_tokenize_data.py │ ├── marcov1_to_dureader.py │ ├── marcov2_to_v1_tojsonl.py │ ├── preprocess.py │ └── run_marco2dureader_preprocess.sh ├── DuReader-Checklist ├── README.md ├── checklist.png ├── download.sh ├── evaluate.py ├── predict.sh ├── run_eval.sh ├── src │ ├── args.py │ ├── models.py │ ├── run.py │ └── squad.py └── train.sh ├── DuReader-Retrieval ├── README.md └── figures │ └── example.png ├── DuReader-Robust ├── README.md ├── download.sh ├── evaluate.py ├── md5.txt ├── paddlehub_baseline │ ├── demo_dataset.py │ ├── paddlehub_reading_comprehension.sh │ └── reading_comprehension.py ├── predict.sh ├── src │ ├── __init__.py │ ├── _ce.py │ ├── batching.py │ ├── convert_params.py │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── ernie.py │ │ └── transformer_encoder.py │ ├── optimization.py │ ├── reader │ │ ├── __init__.py │ │ └── squad.py │ ├── run_mrc.py │ ├── 
tokenization.py │ └── utils │ │ ├── __init__.py │ │ ├── args.py │ │ ├── cards.py │ │ ├── fp16.py │ │ └── init.py └── train.sh ├── DuReader-vis ├── README.md └── images │ ├── intro-vis.png │ └── intro.png ├── MRQA2019-D-NET ├── README.md ├── images │ ├── D-NET_framework.png │ └── D-NET_server.png ├── multi_task_learning │ ├── README.md │ ├── configs │ │ ├── answer_matching.yaml │ │ ├── mask_language_model.yaml │ │ ├── mtl_config.yaml │ │ └── reading_comprehension.yaml │ ├── run_build_palm.sh │ ├── run_evaluation.sh │ ├── run_multi_task.sh │ ├── scripts │ │ ├── args.py │ │ ├── combine.py │ │ ├── combine.sh │ │ ├── convert_model_params.py │ │ ├── convert_mrqa2squad.py │ │ ├── convert_mrqa2squad.sh │ │ ├── dev │ │ │ └── md5sum_dev.txt │ │ ├── download_data.sh │ │ ├── evaluate-v1.1.py │ │ ├── macro_avg.py │ │ └── train │ │ │ └── md5sum_train.txt │ ├── wget_data.sh │ └── wget_pretrained_model.sh └── server │ ├── README.md │ ├── bert_server │ ├── model_wrapper.py │ ├── mrc_service.py │ ├── pdnlp │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── algorithm │ │ │ ├── __init__.py │ │ │ ├── multitask.py │ │ │ └── optimization.py │ │ ├── extension │ │ │ ├── __init__.py │ │ │ └── fp16.py │ │ ├── module │ │ │ ├── __init__.py │ │ │ └── transformer_encoder.py │ │ ├── nets │ │ │ ├── __init__.py │ │ │ ├── bert.py │ │ │ └── transformer_encoder.py │ │ └── toolkit │ │ │ ├── __init__.py │ │ │ ├── configure.py │ │ │ ├── init.py │ │ │ └── placeholder.py │ ├── start.sh │ ├── start_service.py │ └── task_reader │ │ ├── __init__.py │ │ ├── batching.py │ │ ├── mrqa.py │ │ └── tokenization.py │ ├── client │ ├── client.py │ └── demo.txt │ ├── ernie_server │ ├── model_wrapper.py │ ├── mrc_service.py │ ├── pdnlp │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── algorithm │ │ │ ├── __init__.py │ │ │ ├── multitask.py │ │ │ └── optimization.py │ │ ├── extension │ │ │ ├── __init__.py │ │ │ └── fp16.py │ │ ├── module │ │ │ ├── __init__.py │ │ │ └── transformer_encoder.py │ │ ├── nets │ │ │ ├── __init__.py │ │ │ └── bert.py │ │ └── toolkit │ │ │ ├── __init__.py │ │ │ ├── configure.py │ │ │ ├── init.py │ │ │ └── placeholder.py │ ├── start.sh │ ├── start_service.py │ └── task_reader │ │ ├── __init__.py │ │ ├── batching.py │ │ ├── mrqa_infer.py │ │ └── tokenization.py │ ├── main_server.py │ ├── start.sh │ ├── wget_server_inference_model.sh │ └── xlnet_server │ ├── __init__.py │ ├── data_utils.py │ ├── model │ ├── __init__.py │ ├── transformer_encoder.py │ └── xlnet.py │ ├── modeling.py │ ├── prepro_utils.py │ ├── serve.py │ ├── server_utils.py │ ├── squad_reader.py │ ├── squad_utils.py │ ├── start.sh │ ├── wrapper.py │ └── xlnet_config │ ├── spiece.model │ └── xlnet_config.json └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | models/ 2 | preprocessed/ 3 | raw/ 4 | *.pyc 5 | vocab.search 6 | local 7 | bleu_metric 8 | rouge_metric 9 | .idea/ 10 | add_copyright.py 11 | copyright 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /ACL2019-KTNET/downloaded_files.md5: -------------------------------------------------------------------------------- 1 | ad550852cf26241b20e8364e40340a99 train.json 2 | 60c70c4a7e8190483f9899a1c9bc4178 dev.json 3 | df45d93b87ca3c47b54a33e03fabf719 record_official_evaluate.py 4 | 981b29407e0affa3b1b156f72073b945 train-v1.1.json 5 | 3e85deb501d4e538b6bc56f786231552 dev-v1.1.json 6 | afb04912d18ff20696f7f88eed49bea9 squad_v1_official_evaluate.py 7 | 64010b964ae2ebf00148b3519a4aafc8 
KTNET_preprocess_squad_tagging_output.tar.gz 8 | e9352221127b7620427c18e39bfae7fc KTNET_preprocess_tokenize_result_record.tar.gz 9 | e52da2b1d096e889d32df267b82f9c77 KTNET_preprocess_tokenize_result_squad.tar.gz 10 | 89db2f5cfb07f0c44998d7f49098eb90 KTNET_preprocess_wordnet_concepts.tar.gz 11 | fb62db2fe82d88480ec853f3c6fa237a NELL.08m.1115.esv.csv.gz 12 | a68e68f9dcf4524b356163369c7f9f50 KTNET_preprocess_nell_concepts.tar.gz 13 | d9b62183c6367ffac3ee6f864c9425a5 wn_concept2vec.txt 14 | 1f69c3d092089b0a0652616b72d61bd8 nell_concept2vec.txt 15 | 5405c050e64fee4ffec17ee50f079b64 cased_L-24_H-1024_A-16.tar.gz 16 | 4bd6e911cdad39c543ba8922a70580cd KTNET_fine-tuned-model_record_both.tar.gz 17 | 43fa464d6aeabe6dc7a15315d4ea8288 KTNET_fine-tuned-model_record_nell.tar.gz 18 | 20aaefead331f64e435a94ac8a7b58aa KTNET_fine-tuned-model_record_wordnet.tar.gz 19 | 3abdb7be3fc5e3b98633c918acc25af4 KTNET_fine-tuned-model_squad_both.tar.gz 20 | 9232cf27adda9d64265ccb315e1b9c81 KTNET_fine-tuned-model_squad_nell.tar.gz 21 | a36fdd6d5c88e3e931bb3b28f9aeb4e2 KTNET_fine-tuned-model_squad_wordnet.tar.gz 22 | -------------------------------------------------------------------------------- /ACL2019-KTNET/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/images/architecture.png -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_record_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_record.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/ReCoRD/train.json \ 52 | --predict_file $DATA/ReCoRD/dev.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_nell true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_record_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | CKPT_DIR=$1 43 | 44 | python3 src/run_record_twomemory.py \ 45 | --batch_size 6 \ 46 | --do_train false \ 47 | --do_predict true \ 48 | --use_ema false \ 49 | --do_lower_case false \ 50 | --init_pretraining_params $BERT_DIR/params \ 51 | --init_checkpoint $CKPT_DIR \ 52 | --train_file $DATA/ReCoRD/train.json \ 53 | --predict_file $DATA/ReCoRD/dev.json \ 54 | --vocab_path $BERT_DIR/vocab.txt \ 55 | --bert_config_path $BERT_DIR/bert_config.json \ 56 | --freeze false \ 57 | --max_seq_len 384 \ 58 | --doc_stride 128 \ 59 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 60 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_record_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_record.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/ReCoRD/train.json \ 52 | --predict_file $DATA/ReCoRD/dev.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_wordnet true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_squad_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_squad.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/SQuAD/train-v1.1.json \ 52 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_nell true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_squad_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | CKPT_DIR=$1 43 | 44 | python3 src/run_squad_twomemory.py \ 45 | --batch_size 6 \ 46 | --do_train false \ 47 | --do_predict true \ 48 | --use_ema false \ 49 | --do_lower_case false \ 50 | --init_pretraining_params $BERT_DIR/params \ 51 | --init_checkpoint $CKPT_DIR \ 52 | --train_file $DATA/SQuAD/train-v1.1.json \ 53 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 54 | --vocab_path $BERT_DIR/vocab.txt \ 55 | --bert_config_path $BERT_DIR/bert_config.json \ 56 | --freeze false \ 57 | --max_seq_len 384 \ 58 | --doc_stride 128 \ 59 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 60 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_squad_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_squad.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/SQuAD/train-v1.1.json \ 52 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_wordnet true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_nell_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint record_nell_first_stage_output/step_41970 \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_nell_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d record_nell_first_stage_log ]; then 23 | mkdir record_nell_first_stage_log 24 | else 25 | rm -r record_nell_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d record_nell_first_stage_output ]; then 29 | mkdir record_nell_first_stage_output 30 | else 31 | rm -r record_nell_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-4 \ 58 | --epoch 10 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints record_nell_first_stage_output/ 1>$PWD_DIR/record_nell_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_record_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 4 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_twomemory_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_record_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_checkpoint record_both_first_stage_output/step_41970 \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 4 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_twomemory_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d record_both_first_stage_log ]; then 23 | mkdir record_both_first_stage_log 24 | else 25 | rm -r record_both_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d record_both_first_stage_output ]; then 29 | mkdir record_both_first_stage_output 30 | else 31 | rm -r record_both_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_record_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --train_file $DATA/ReCoRD/train.json \ 51 | --predict_file $DATA/ReCoRD/dev.json \ 52 | --vocab_path $BERT_DIR/vocab.txt \ 53 | --bert_config_path $BERT_DIR/bert_config.json \ 54 | --freeze true \ 55 | --save_steps 4000 \ 56 | --weight_decay 0.01 \ 57 | --warmup_proportion 0.0 \ 58 | --learning_rate 3e-4 \ 59 | --epoch 10 \ 60 | --max_seq_len 384 \ 61 | --doc_stride 128 \ 62 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 63 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 64 | --use_wordnet true \ 65 | --use_nell true \ 66 | --random_seed 45 \ 67 | --checkpoints record_both_first_stage_output/ 1>$PWD_DIR/record_both_first_stage_log/train.log 2>&1 68 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_wordnet_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint record_wn_first_stage_output/step_41970 \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_wordnet_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d record_wn_first_stage_log ]; then 23 | mkdir record_wn_first_stage_log 24 | else 25 | rm -r record_wn_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d record_wn_first_stage_output ]; then 29 | mkdir record_wn_first_stage_output 30 | else 31 | rm -r record_wn_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-4 \ 58 | --epoch 10 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_wordnet true \ 63 | --random_seed 45 \ 64 | --checkpoints record_wn_first_stage_output/ 1>$PWD_DIR/record_wn_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_nell_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint sqd_nell_first_stage_output/step_3649 \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_nell_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d sqd_nell_first_stage_log ]; then 23 | mkdir sqd_nell_first_stage_log 24 | else 25 | rm -r sqd_nell_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d sqd_nell_first_stage_output ]; then 29 | mkdir sqd_nell_first_stage_output 30 | else 31 | rm -r sqd_nell_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 1 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints sqd_nell_first_stage_output/ 1>$PWD_DIR/sqd_nell_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_squad_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 3 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_twomemory_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_squad_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_checkpoint sqd_both_first_stage_output/step_3649 \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 3 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_twomemory_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d sqd_both_first_stage_log ]; then 23 | mkdir sqd_both_first_stage_log 24 | else 25 | rm -r sqd_both_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d sqd_both_first_stage_output ]; then 29 | mkdir sqd_both_first_stage_output 30 | else 31 | rm -r sqd_both_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_squad_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --train_file $DATA/SQuAD/train-v1.1.json \ 51 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 52 | --vocab_path $BERT_DIR/vocab.txt \ 53 | --bert_config_path $BERT_DIR/bert_config.json \ 54 | --freeze true \ 55 | --save_steps 4000 \ 56 | --weight_decay 0.01 \ 57 | --warmup_proportion 0.0 \ 58 | --learning_rate 3e-5 \ 59 | --epoch 1 \ 60 | --max_seq_len 384 \ 61 | --doc_stride 128 \ 62 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 63 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 64 | --use_wordnet true \ 65 | --use_nell true \ 66 | --random_seed 45 \ 67 | --checkpoints sqd_both_first_stage_output/ 1>$PWD_DIR/sqd_both_first_stage_log/train.log 2>&1 68 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_wordnet_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint sqd_wn_first_stage_output/step_3649 \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_wordnet_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d sqd_wn_first_stage_log ]; then 23 | mkdir sqd_wn_first_stage_log 24 | else 25 | rm -r sqd_wn_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d sqd_wn_first_stage_output ]; then 29 | mkdir sqd_wn_first_stage_output 30 | else 31 | rm -r sqd_wn_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 1 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_wordnet true \ 63 | --random_seed 45 \ 64 | --checkpoints sqd_wn_first_stage_output/ 1>$PWD_DIR/sqd_wn_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/eval/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/model/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/reader/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/utils/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | import logging 23 | 24 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 25 | datefmt = '%m/%d/%Y %H:%M:%S', 26 | level = logging.INFO) 27 | logging.getLogger().setLevel(logging.INFO) 28 | logger = logging.getLogger(__name__) 29 | 30 | def str2bool(v): 31 | # because argparse does not support to parse "true, False" as python 32 | # boolean directly 33 | return v.lower() in ("true", "t", "1") 34 | 35 | 36 | class ArgumentGroup(object): 37 | def __init__(self, parser, title, des): 38 | self._group = parser.add_argument_group(title=title, description=des) 39 | 40 | def add_arg(self, name, type, default, help, **kwargs): 41 | type = str2bool if type == bool else type 42 | self._group.add_argument( 43 | "--" + name, 44 | default=default, 45 | type=type, 46 | help=help + ' Default: %(default)s.', 47 | **kwargs) 48 | 49 | 50 | def print_arguments(args): 51 | logger.info('----------- Configuration Arguments -----------') 52 | for arg, value in sorted(six.iteritems(vars(args))): 53 | logger.info('%s: %s' % (arg, value)) 54 | logger.info('------------------------------------------------') 55 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
% p.name) 46 | new_p = fluid.framework.Parameter( 47 | block=block, 48 | shape=v.shape, 49 | dtype=fluid.core.VarDesc.VarType.FP32, 50 | type=v.type, 51 | lod_level=v.lod_level, 52 | stop_gradient=p.stop_gradient, 53 | trainable=p.trainable, 54 | optimize_attr=p.optimize_attr, 55 | regularizer=p.regularizer, 56 | gradient_clip_attr=p.gradient_clip_attr, 57 | error_clip=p.error_clip, 58 | name=v.name + ".master") 59 | return new_p 60 | 61 | 62 | def create_master_params_grads(params_grads, main_prog, startup_prog, 63 | loss_scaling): 64 | master_params_grads = [] 65 | tmp_role = main_prog._current_role 66 | OpRole = fluid.core.op_proto_and_checker_maker.OpRole 67 | main_prog._current_role = OpRole.Backward 68 | for p, g in params_grads: 69 | # create master parameters 70 | master_param = copy_to_master_param(p, main_prog.global_block()) 71 | startup_master_param = startup_prog.global_block()._clone_variable( 72 | master_param) 73 | startup_p = startup_prog.global_block().var(p.name) 74 | cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) 75 | # cast fp16 gradients to fp32 before apply gradients 76 | if g.name.find("layer_norm") > -1: 77 | if loss_scaling > 1: 78 | scaled_g = g / float(loss_scaling) 79 | else: 80 | scaled_g = g 81 | master_params_grads.append([p, scaled_g]) 82 | continue 83 | master_grad = fluid.layers.cast(g, "float32") 84 | if loss_scaling > 1: 85 | master_grad = master_grad / float(loss_scaling) 86 | master_params_grads.append([master_param, master_grad]) 87 | main_prog._current_role = tmp_role 88 | return master_params_grads 89 | 90 | 91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog): 92 | for idx, m_p_g in enumerate(master_params_grads): 93 | train_p, _ = params_grads[idx] 94 | if train_p.name.find("layer_norm") > -1: 95 | continue 96 | with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): 97 | cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) 98 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | import logging 22 | 23 | import numpy as np 24 | import paddle.fluid as fluid 25 | 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logging.getLogger().setLevel(logging.INFO) 30 | logger = logging.getLogger(__name__) 31 | 32 | def cast_fp32_to_fp16(exe, main_program): 33 | logger.info("Cast parameters to float16 data format.") 34 | for param in main_program.global_block().all_parameters(): 35 | if not param.name.endswith(".master"): 36 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 37 | data = np.array(param_t) 38 | if param.name.find("layer_norm") == -1: 39 | param_t.set(np.float16(data).view(np.uint16), exe.place) 40 | master_param_var = fluid.global_scope().find_var(param.name + 41 | ".master") 42 | if master_param_var is not None: 43 | master_param_var.get_tensor().set(data, exe.place) 44 | 45 | 46 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): 47 | assert os.path.exists( 48 | init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path 49 | 50 | def existed_persitables(var): 51 | if not fluid.io.is_persistable(var): 52 | return False 53 | return os.path.exists(os.path.join(init_checkpoint_path, var.name)) 54 | 55 | fluid.io.load_vars( 56 | exe, 57 | init_checkpoint_path, 58 | main_program=main_program, 59 | predicate=existed_persitables) 60 | logger.info("Load model from {}".format(init_checkpoint_path)) 61 | 62 | if use_fp16: 63 | cast_fp32_to_fp16(exe, main_program) 64 | 65 | 66 | def init_pretraining_params(exe, 67 | pretraining_params_path, 68 | main_program, 69 | use_fp16=False): 70 | assert os.path.exists(pretraining_params_path 71 | ), "[%s] cann't be found." % pretraining_params_path 72 | 73 | def existed_params(var): 74 | if not isinstance(var, fluid.framework.Parameter): 75 | return False 76 | return os.path.exists(os.path.join(pretraining_params_path, var.name)) 77 | 78 | fluid.io.load_vars( 79 | exe, 80 | pretraining_params_path, 81 | main_program=main_program, 82 | predicate=existed_params) 83 | logger.info("Load pretraining parameters from {}.".format( 84 | pretraining_params_path)) 85 | 86 | if use_fp16: 87 | cast_fp32_to_fp16(exe, main_program) 88 | -------------------------------------------------------------------------------- /ACL2019-KTNET/retrieve_concepts/retrieve_wordnet/retrieve.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | import pickle 19 | import argparse 20 | import os 21 | import nltk 22 | import logging 23 | import string 24 | from tqdm import tqdm 25 | from nltk.corpus import wordnet as wn 26 | 27 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 28 | datefmt = '%m/%d/%Y %H:%M:%S', 29 | level = logging.INFO) 30 | logger = logging.getLogger(__name__) 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--train_token', type=str, default='../tokenization_record/tokens/train.tokenization.uncased.data', help='token file of train set') 35 | parser.add_argument('--eval_token', type=str, default='../tokenization_record/tokens/dev.tokenization.uncased.data', help='token file of dev set') 36 | parser.add_argument('--output_dir', type=str, default='output_record/', help='output directory') 37 | parser.add_argument('--no_stopwords', action='store_true', help='ignore stopwords') 38 | parser.add_argument('--ignore_length', type=int, default=0, help='ignore words with length <= ignore_length') 39 | args = parser.parse_args() 40 | 41 | # initialize mapping from offset id to wn18 synset name 42 | offset_to_wn18name_dict = {} 43 | fin = open('wordnet-mlj12-definitions.txt') 44 | for line in fin: 45 | info = line.strip().split('\t') 46 | offset_str, synset_name = info[0], info[1] 47 | offset_to_wn18name_dict[offset_str] = synset_name 48 | logger.info('Finished loading wn18 definition file.') 49 | 50 | 51 | # load pickled samples 52 | logger.info('Begin to load tokenization results...') 53 | train_samples = pickle.load(open(args.train_token, 'rb')) 54 | dev_samples = pickle.load(open(args.eval_token, 'rb')) 55 | logger.info('Finished loading tokenization results.') 56 | 57 | # build token set 58 | all_token_set = set() 59 | for sample in train_samples + dev_samples: 60 | for token in sample['query_tokens'] + sample['document_tokens']: 61 | all_token_set.add(token) 62 | logger.info('Finished making tokenization results into token set.') 63 | 64 | # load stopwords 65 | stopwords = set(nltk.corpus.stopwords.words('english')) 66 | logger.info('Finished loading stopwords list.') 67 | 68 | # retrive synsets 69 | logger.info('Begin to retrieve synsets...') 70 | token2synset = dict() 71 | stopword_cnt = 0 72 | punctuation_cnt = 0 73 | for token in tqdm(all_token_set): 74 | if token in set(string.punctuation): 75 | logger.info('{} is punctuation, skipped!'.format(token)) 76 | punctuation_cnt += 1 77 | continue 78 | if args.no_stopwords and token in stopwords: 79 | logger.info('{} is stopword, skipped!'.format(token)) 80 | stopword_cnt += 1 81 | continue 82 | if args.ignore_length > 0 and len(token) <= args.ignore_length: 83 | logger.info('{} is too short, skipped!'.format(token)) 84 | continue 85 | synsets = wn.synsets(token) 86 | wn18synset_names = [] 87 | for synset in synsets: 88 | offset_str = str(synset.offset()).zfill(8) 89 | if offset_str in offset_to_wn18name_dict: 90 | wn18synset_names.append(offset_to_wn18name_dict[offset_str]) 91 | if len(wn18synset_names) > 0: 92 | token2synset[token] = wn18synset_names 93 | logger.info('Finished retrieving sysnets.') 94 | logger.info('{} / {} tokens retrieved at lease 1 synset. 
{} stopwords and {} punctuations skipped.'.format(len(token2synset), len(all_token_set), stopword_cnt, punctuation_cnt)) 95 | 96 | if not os.path.exists(args.output_dir): 97 | os.makedirs(args.output_dir) 98 | 99 | with open(os.path.join(args.output_dir, 'retrived_synsets.data'), 'wb') as fout: 100 | pickle.dump(token2synset, fout) 101 | logger.info('Finished dumping retrieved synsets.') 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /DuQM/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import paddle 16 | import numpy as np 17 | 18 | from paddlenlp.datasets import MapDataset 19 | 20 | 21 | def create_dataloader(dataset, 22 | mode='train', 23 | batch_size=1, 24 | batchify_fn=None, 25 | trans_fn=None): 26 | if trans_fn: 27 | dataset = dataset.map(trans_fn) 28 | 29 | shuffle = True if mode == 'train' else False 30 | if mode == 'train': 31 | batch_sampler = paddle.io.DistributedBatchSampler( 32 | dataset, batch_size=batch_size, shuffle=shuffle) 33 | else: 34 | batch_sampler = paddle.io.BatchSampler( 35 | dataset, batch_size=batch_size, shuffle=shuffle) 36 | 37 | return paddle.io.DataLoader( 38 | dataset=dataset, 39 | batch_sampler=batch_sampler, 40 | collate_fn=batchify_fn, 41 | return_list=True) 42 | 43 | 44 | def read_text_pair(data_path, is_test=False): 45 | """Reads data.""" 46 | with open(data_path, 'r', encoding='utf-8') as f: 47 | for line in f: 48 | data = line.rstrip().split("\t") 49 | if is_test == False: 50 | if len(data) != 3: 51 | continue 52 | yield {'query1': data[0], 'query2': data[1], 'label': data[2]} 53 | else: 54 | if len(data) != 2: 55 | continue 56 | yield {'query1': data[0], 'query2': data[1]} 57 | 58 | 59 | 60 | def convert_example(example, tokenizer, max_seq_length=512, is_test=False): 61 | 62 | query, title = example["query1"], example["query2"] 63 | 64 | encoded_inputs = tokenizer( 65 | text=query, text_pair=title, max_seq_len=max_seq_length) 66 | 67 | input_ids = encoded_inputs["input_ids"] 68 | token_type_ids = encoded_inputs["token_type_ids"] 69 | 70 | if not is_test: 71 | label = np.array([example["label"]], dtype="int64") 72 | return input_ids, token_type_ids, label 73 | else: 74 | return input_ids, token_type_ids -------------------------------------------------------------------------------- /DuQM/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License" 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import paddle 16 | import paddle.nn as nn 17 | import paddle.nn.functional as F 18 | 19 | import paddlenlp as ppnlp 20 | 21 | 22 | class QuestionMatching(nn.Layer): 23 | def __init__(self, pretrained_model, dropout=None, rdrop_coef=0.0): 24 | super().__init__() 25 | self.ptm = pretrained_model 26 | self.dropout = nn.Dropout(dropout if dropout is not None else 0.1) 27 | 28 | # num_labels = 2 (similar or dissimilar) 29 | self.classifier = nn.Linear(self.ptm.config["hidden_size"], 2) 30 | self.rdrop_coef = rdrop_coef 31 | self.rdrop_loss = ppnlp.losses.RDropLoss() 32 | 33 | def forward(self, 34 | input_ids, 35 | token_type_ids=None, 36 | position_ids=None, 37 | attention_mask=None, 38 | do_evaluate=False): 39 | 40 | _, cls_embedding1 = self.ptm(input_ids, token_type_ids, position_ids, 41 | attention_mask) 42 | cls_embedding1 = self.dropout(cls_embedding1) 43 | logits1 = self.classifier(cls_embedding1) 44 | 45 | # For more information about R-drop please refer to this paper: https://arxiv.org/abs/2106.14448 46 | # Original implementation please refer to this code: https://github.com/dropreg/R-Drop 47 | if self.rdrop_coef > 0 and not do_evaluate: 48 | _, cls_embedding2 = self.ptm(input_ids, token_type_ids, position_ids, 49 | attention_mask) 50 | cls_embedding2 = self.dropout(cls_embedding2) 51 | logits2 = self.classifier(cls_embedding2) 52 | kl_loss = self.rdrop_loss(logits1, logits2) 53 | else: 54 | kl_loss = 0.0 55 | 56 | return logits1, kl_loss 57 | -------------------------------------------------------------------------------- /DuQM/predict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from functools import partial 16 | import argparse 17 | import sys 18 | import os 19 | import random 20 | import time 21 | 22 | import numpy as np 23 | import paddle 24 | import paddle.nn.functional as F 25 | import paddlenlp as ppnlp 26 | from paddlenlp.datasets import load_dataset 27 | from paddlenlp.data import Stack, Tuple, Pad 28 | 29 | from data import create_dataloader, read_text_pair, convert_example 30 | from model import QuestionMatching 31 | 32 | # yapf: disable 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--input_file", type=str, required=True, help="The full path of input file") 35 | parser.add_argument("--result_file", type=str, required=True, help="The result file name") 36 | parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.") 37 | parser.add_argument("--max_seq_length", default=256, type=int, help="The maximum total input sequence length after tokenization. " 38 | "Sequences longer than this will be truncated, sequences shorter will be padded.") 39 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") 40 | parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") 41 | args = parser.parse_args() 42 | # yapf: enable 43 | 44 | 45 | def predict(model, data_loader): 46 | """ 47 | Predicts the data labels. 48 | 49 | Args: 50 | model (obj:`QuestionMatching`): A model to calculate whether the question pair is semantically similar or not. 51 | data_loader (obj:`DataLoader`): An iterable that yields the processed ids of each text pair: [input_ids, token_type_ids]. 52 | Returns: 53 | results(obj:`numpy.ndarray`): the predicted logits of each text pair.
54 | """ 55 | batch_logits = [] 56 | 57 | model.eval() 58 | 59 | with paddle.no_grad(): 60 | for batch_data in data_loader: 61 | input_ids, token_type_ids = batch_data 62 | 63 | input_ids = paddle.to_tensor(input_ids) 64 | token_type_ids = paddle.to_tensor(token_type_ids) 65 | 66 | batch_logit, _ = model( 67 | input_ids=input_ids, token_type_ids=token_type_ids) 68 | 69 | batch_logits.append(batch_logit.numpy()) 70 | 71 | batch_logits = np.concatenate(batch_logits, axis=0) 72 | 73 | return batch_logits 74 | 75 | 76 | if __name__ == "__main__": 77 | paddle.set_device(args.device) 78 | 79 | pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained( 80 | 'ernie-gram-zh') 81 | tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained( 82 | 'ernie-gram-zh') 83 | 84 | trans_func = partial( 85 | convert_example, 86 | tokenizer=tokenizer, 87 | max_seq_length=args.max_seq_length, 88 | is_test=True) 89 | 90 | batchify_fn = lambda samples, fn=Tuple( 91 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids 92 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment_ids 93 | ): [data for data in fn(samples)] 94 | 95 | test_ds = load_dataset( 96 | read_text_pair, data_path=args.input_file, is_test=True, lazy=False) 97 | 98 | test_data_loader = create_dataloader( 99 | test_ds, 100 | mode='predict', 101 | batch_size=args.batch_size, 102 | batchify_fn=batchify_fn, 103 | trans_fn=trans_func) 104 | 105 | model = QuestionMatching(pretrained_model) 106 | 107 | if args.params_path and os.path.isfile(args.params_path): 108 | state_dict = paddle.load(args.params_path) 109 | model.set_dict(state_dict) 110 | print("Loaded parameters from %s" % args.params_path) 111 | else: 112 | raise ValueError( 113 | "Please set --params_path with correct pretrained model file") 114 | 115 | y_probs = predict(model, test_data_loader) 116 | y_preds = np.argmax(y_probs, axis=1) 117 | 118 | with open(args.result_file, 'w', encoding="utf-8") as f: 119 | for y_pred in y_preds: 120 | f.write(str(y_pred) + "\n") 121 | -------------------------------------------------------------------------------- /DuReader-2.0/.gitignore: -------------------------------------------------------------------------------- 1 | models/ 2 | preprocessed/ 3 | raw/ 4 | *.pyc 5 | vocab.search 6 | local 7 | bleu_metric 8 | rouge_metric 9 | .idea/ 10 | add_copyright.py 11 | copyright 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /DuReader-2.0/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | 19 | if [[ -d preprocessed ]] && [[ -d raw ]]; then 20 | echo "data exist" 21 | exit 0 22 | else 23 | wget -c https://dataset-bj.cdn.bcebos.com/dureader/dureader_raw.zip 24 | wget -c https://dataset-bj.cdn.bcebos.com/dureader/dureader_preprocessed.zip 25 | fi 26 | 27 | if md5sum --status -c md5sum.txt; then 28 | unzip dureader_raw.zip 29 | unzip dureader_preprocessed.zip 30 | else 31 | echo "download data error!" >> /dev/stderr 32 | exit 1 33 | fi 34 | -------------------------------------------------------------------------------- /DuReader-2.0/data/md5sum.txt: -------------------------------------------------------------------------------- 1 | dc7658b8cdf4f94b8714d130b7d15196 dureader_raw.zip 2 | 3db9a32e5a7c5375a604a70687b45479 dureader_preprocessed.zip 3 | -------------------------------------------------------------------------------- /DuReader-2.0/paddle/UPDATES.md: -------------------------------------------------------------------------------- 1 | # Notes on the updates of the PaddlePaddle baseline 2 | 3 | ## Updates 4 | 5 | We implement a BiDAF model with PaddlePaddle. Note that we have an update on the PaddlePaddle baseline (Feb 25, 2019). In this document, we give the details of the major updates: 6 | 7 | ### 1 Paragraph Extraction 8 | 9 | The first update is that we incorporate a paragraph extraction strategy to improve model performance (see the file `paddle/paragraph_extraction.py`). A similar strategy has been used in the top-1 system (Liu et al. 2018) at the [2018 Machine Reading Challenge](http://mrc2018.cipsc.org.cn/). 10 | 11 | The original baseline of DuReader (He et al. 2018) employed a simple strategy to select paragraphs for model training and testing. However, the paragraphs that include the true answers may not be selected. Hence, we want to make as much information as possible available for answer extraction. 12 | 13 | The details of the new paragraph extraction strategy are as follows. We apply the strategy to each document independently. For each document, 14 | - We remove the duplicated paragraphs in the document. 15 | - We concatenate the title and all paragraphs in the document with a predefined splitter if the resulting concatenation is shorter than a predefined maximum length. Otherwise, 16 | - We compute the F1 score of each paragraph relative to the question; 17 | - We concatenate the title and the top-K paragraphs (by F1 score) with a predefined splitter to form an extracted paragraph that is shorter than the predefined maximum length (a minimal illustrative sketch of this procedure is given after the references below). 18 | 19 | ### 2 The Prior of Document Ranking 20 | 21 | We also introduce a document-ranking prior derived from the search engine (see line #176 in `paddle/run.py`). The documents in DuReader are collected from search results, so the prior scores of document ranking are an important feature. We compute the prior scores from the training data and apply them in the testing stage. 22 | 23 | ## References 24 | 25 | - Liu, J., Wei, W., Sun, M., Chen, H., Du, Y. and Lin, D., 2018. A Multi-answer Multi-task Framework for Real-world Machine Reading Comprehension. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (pp. 2109-2118). 26 | 27 | - He, W., Liu, K., Liu, J., Lyu, Y., Zhao, S., Xiao, X., Liu, Y., Wang, Y., Wu, H., She, Q. and Liu, X., 2017. DuReader: a Chinese machine reading comprehension dataset from real-world applications. arXiv preprint arXiv:1711.05073.
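For illustration, here is a minimal, hypothetical sketch of the paragraph extraction strategy described in Section 1 above. It is not the repository's `paragraph_extraction.py`; the function names, the token-level F1 computation, and the length budget of 500 tokens are assumptions made for this example, the inputs are assumed to be already-segmented token lists, and K is determined implicitly by the length budget.

```python
from collections import Counter

def f1_score(para_tokens, question_tokens):
    """Token-level F1 between a paragraph and the question."""
    common = Counter(para_tokens) & Counter(question_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(para_tokens)
    recall = num_same / len(question_tokens)
    return 2 * precision * recall / (precision + recall)

def extract_paragraphs(title, paragraphs, question, max_len=500, splitter='<splitter>'):
    """Return one token list: title + selected paragraphs joined by a splitter token."""
    # 1. Remove duplicated paragraphs, keeping the original order.
    seen, unique_paras = set(), []
    for para in paragraphs:
        key = ''.join(para)
        if key not in seen:
            seen.add(key)
            unique_paras.append(para)
    # 2. If the title plus all paragraphs fit into the budget, keep everything.
    total_len = len(title) + sum(len(p) + 1 for p in unique_paras)
    if total_len <= max_len:
        selected = unique_paras
    else:
        # 3. Otherwise rank paragraphs by F1 against the question and greedily
        #    keep the best ones while staying under the maximum length.
        ranked = sorted(unique_paras, key=lambda p: f1_score(p, question), reverse=True)
        selected, used = [], len(title)
        for para in ranked:
            if used + len(para) + 1 > max_len:
                break
            selected.append(para)
            used += len(para) + 1
    # 4. Concatenate the title and the selected paragraphs with the splitter token.
    result = list(title)
    for para in selected:
        result.append(splitter)
        result.extend(para)
    return result
```

In the repository the extraction step is run offline through `run.sh --para_extraction`, which pipes the preprocessed JSON files through `paragraph_extraction.py` and writes the extracted data to `../data/extracted` (see `DuReader-2.0/paddle/run.sh`).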
28 | 29 | -------------------------------------------------------------------------------- /DuReader-2.0/paddle/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import argparse 20 | import distutils.util 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser(description=__doc__) 25 | parser.add_argument( 26 | '--prepare', 27 | action='store_true', 28 | help='create the directories, prepare the vocabulary and embeddings') 29 | parser.add_argument('--train', action='store_true', help='train the model') 30 | parser.add_argument('--evaluate', action='store_true', help='evaluate the model on dev set') 31 | parser.add_argument('--predict', action='store_true', 32 | help='predict the answers for test set with trained model') 33 | 34 | parser.add_argument("--embed_size", type=int, default=300, 35 | help="The dimension of embedding table. (default: %(default)d)") 36 | parser.add_argument("--hidden_size", type=int, default=150, 37 | help="The size of rnn hidden unit. (default: %(default)d)") 38 | parser.add_argument("--learning_rate", type=float, default=0.001, 39 | help="Learning rate used to train the model. (default: %(default)f)") 40 | parser.add_argument('--optim', default='adam', help='optimizer type') 41 | parser.add_argument("--weight_decay", type=float, default=0.0001, 42 | help="Weight decay. (default: %(default)f)") 43 | 44 | parser.add_argument('--drop_rate', type=float, default=0.0, help="Dropout probability") 45 | parser.add_argument('--random_seed', type=int, default=123) 46 | parser.add_argument("--batch_size", type=int, default=32, 47 | help="The sequence number of a mini-batch data. (default: %(default)d)") 48 | parser.add_argument("--pass_num", type=int, default=5, 49 | help="The number epochs to train. (default: %(default)d)") 50 | parser.add_argument("--use_gpu", type=distutils.util.strtobool, default=True, 51 | help="Whether to use gpu. (default: %(default)d)") 52 | parser.add_argument("--log_interval", type=int, default=50, 53 | help="log the train loss every n batches. (default: %(default)d)") 54 | 55 | parser.add_argument('--max_p_num', type=int, default=5) 56 | parser.add_argument('--max_a_len', type=int, default=200) 57 | parser.add_argument('--max_p_len', type=int, default=500) 58 | parser.add_argument('--max_q_len', type=int, default=60) 59 | parser.add_argument('--doc_num', type=int, default=5) 60 | 61 | parser.add_argument('--vocab_dir', default='../data/vocab', help='vocabulary') 62 | parser.add_argument("--save_dir", type=str, default="../data/models", 63 | help="Specify the path to save trained models.") 64 | parser.add_argument("--save_interval", type=int, default=1, 65 | help="Save the trained model every n passes. 
(default: %(default)d)") 66 | parser.add_argument("--load_dir", type=str, default="", 67 | help="Specify the path to load trained models.") 68 | parser.add_argument('--log_path', 69 | help='path of the log file. If not set, logs are printed to console') 70 | parser.add_argument('--result_dir', default='../data/results/', 71 | help='the dir to output the results') 72 | parser.add_argument('--result_name', default='test_result', 73 | help='the file name of the predicted results') 74 | 75 | parser.add_argument('--trainset', nargs='+', 76 | default=['../data/demo/trainset/search.train.json'], 77 | help='train dataset') 78 | parser.add_argument('--devset', nargs='+', 79 | default=['../data/demo/devset/search.dev.json'], 80 | help='dev dataset') 81 | parser.add_argument('--testset', nargs='+', 82 | default=['../data/demo/testset/search.test.json'], 83 | help='test dataset') 84 | 85 | parser.add_argument("--enable_ce", action='store_true', 86 | help="If set, run the task with continuous evaluation logs.") 87 | parser.add_argument('--para_print', action='store_true', help="Print debug info") 88 | parser.add_argument("--dev_interval", type=int, default=-1, 89 | help="evaluate on dev set loss every n batches. (default: %(default)d)") 90 | args = parser.parse_args() 91 | return args 92 | -------------------------------------------------------------------------------- /DuReader-2.0/paddle/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | paragraph_extraction () 5 | { 6 | SOURCE_DIR=$1 7 | TARGET_DIR=$2 8 | echo "Start paragraph extraction, this may take a few hours" 9 | echo "Source dir: $SOURCE_DIR" 10 | echo "Target dir: $TARGET_DIR" 11 | mkdir -p $TARGET_DIR/trainset 12 | mkdir -p $TARGET_DIR/devset 13 | mkdir -p $TARGET_DIR/testset 14 | 15 | echo "Processing trainset" 16 | cat $SOURCE_DIR/trainset/search.train.json | python paragraph_extraction.py train \ 17 | > $TARGET_DIR/trainset/search.train.json 18 | cat $SOURCE_DIR/trainset/zhidao.train.json | python paragraph_extraction.py train \ 19 | > $TARGET_DIR/trainset/zhidao.train.json 20 | 21 | echo "Processing devset" 22 | cat $SOURCE_DIR/devset/search.dev.json | python paragraph_extraction.py dev \ 23 | > $TARGET_DIR/devset/search.dev.json 24 | cat $SOURCE_DIR/devset/zhidao.dev.json | python paragraph_extraction.py dev \ 25 | > $TARGET_DIR/devset/zhidao.dev.json 26 | 27 | echo "Processing testset" 28 | cat $SOURCE_DIR/testset/search.test.json | python paragraph_extraction.py test \ 29 | > $TARGET_DIR/testset/search.test.json 30 | cat $SOURCE_DIR/testset/zhidao.test.json | python paragraph_extraction.py test \ 31 | > $TARGET_DIR/testset/zhidao.test.json 32 | echo "Paragraph extraction done!" 33 | } 34 | 35 | 36 | PROCESS_NAME="$1" 37 | case $PROCESS_NAME in 38 | --para_extraction) 39 | # Start paragraph extraction 40 | if [ ! 
-d ../data/preprocessed ]; then 41 | echo "Please download the preprocessed data first (See README - Preprocess)" 42 | exit 1 43 | fi 44 | paragraph_extraction ../data/preprocessed ../data/extracted 45 | ;; 46 | --prepare|--train|--evaluate|--predict) 47 | # Start Paddle baseline 48 | python run.py $@ 49 | ;; 50 | *) 51 | echo $"Usage: $0 {--para_extraction|--prepare|--train|--evaluate|--predict}" 52 | esac 53 | -------------------------------------------------------------------------------- /DuReader-2.0/tensorflow/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Empty __init__.py file 19 | 20 | Authors: Yizhong Wang(wangyizhong01@baidu.com) 21 | Date: 2017/09/20 12:00:00 22 | """ 23 | -------------------------------------------------------------------------------- /DuReader-2.0/tensorflow/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module provides wrappers for variants of RNN in Tensorflow 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 26 | """ 27 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 28 | Args: 29 | rnn_type: the type of rnn 30 | inputs: padded inputs into rnn 31 | length: the valid length of the inputs 32 | hidden_size: the size of hidden units 33 | layer_num: multiple rnn layer are stacked if layer_num > 1 34 | dropout_keep_prob: 35 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 36 | concatenated if this is True, else we add them. 
37 | Returns: 38 | RNN outputs and final state 39 | """ 40 | if not rnn_type.startswith('bi'): 41 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 42 | outputs, states = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 43 | if rnn_type.endswith('lstm'): 44 | c = [state.c for state in states] 45 | h = [state.h for state in states] 46 | states = h 47 | else: 48 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 49 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 50 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 51 | cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32 52 | ) 53 | states_fw, states_bw = states 54 | if rnn_type.endswith('lstm'): 55 | c_fw = [state_fw.c for state_fw in states_fw] 56 | h_fw = [state_fw.h for state_fw in states_fw] 57 | c_bw = [state_bw.c for state_bw in states_bw] 58 | h_bw = [state_bw.h for state_bw in states_bw] 59 | states_fw, states_bw = h_fw, h_bw 60 | if concat: 61 | outputs = tf.concat(outputs, 2) 62 | states = tf.concat([states_fw, states_bw], 1) 63 | else: 64 | outputs = outputs[0] + outputs[1] 65 | states = states_fw + states_bw 66 | return outputs, states 67 | 68 | 69 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 70 | """ 71 | Gets the RNN Cell 72 | Args: 73 | rnn_type: 'lstm', 'gru' or 'rnn' 74 | hidden_size: The size of hidden units 75 | layer_num: a MultiRNNCell is used if layer_num > 1 76 | dropout_keep_prob: dropout in RNN 77 | Returns: 78 | An RNN Cell 79 | """ 80 | cells = [] 81 | for i in range(layer_num): 82 | if rnn_type.endswith('lstm'): 83 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 84 | elif rnn_type.endswith('gru'): 85 | cell = tc.rnn.GRUCell(num_units=hidden_size) 86 | elif rnn_type.endswith('rnn'): 87 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 88 | else: 89 | raise NotImplementedError('Unsupported rnn type: {}'.format(rnn_type)) 90 | if dropout_keep_prob is not None: 91 | cell = tc.rnn.DropoutWrapper(cell, 92 | input_keep_prob=dropout_keep_prob, 93 | output_keep_prob=dropout_keep_prob) 94 | cells.append(cell) 95 | cells = tc.rnn.MultiRNNCell(cells, state_is_tuple=True) 96 | return cells 97 | 98 | 99 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This package implements some utility functions shared by PaddlePaddle 19 | and Tensorflow model implementations.
20 | 21 | Authors: liuyuan(liuyuan04@baidu.com) 22 | Date: 2017/10/06 18:23:06 23 | """ 24 | 25 | 26 | from .dureader_eval import compute_bleu_rouge 27 | from .dureader_eval import normalize 28 | from .preprocess import find_fake_answer 29 | from .preprocess import find_best_question_match 30 | 31 | __all__ = [ 32 | 'compute_bleu_rouge', 33 | 'normalize', 34 | 'find_fake_answer', 35 | 'find_best_question_match', 36 | ] 37 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/download_thirdparty.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # We use Bleu and Rouge as evaluation metrics, the calculation of these metrics 19 | # relies on the scoring scripts under "https://github.com/tylin/coco-caption" 20 | 21 | bleu_base_url='https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/bleu' 22 | bleu_files=("LICENSE" "__init__.py" "bleu.py" "bleu_scorer.py") 23 | 24 | rouge_base_url="https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/rouge" 25 | rouge_files=("__init__.py" "rouge.py") 26 | 27 | download() { 28 | local metric=$1; shift; 29 | local base_url=$1; shift; 30 | local fnames=($@); 31 | 32 | mkdir -p ${metric} 33 | for fname in ${fnames[@]}; 34 | do 35 | printf "downloading: %s\n" ${base_url}/${fname} 36 | wget --no-check-certificate ${base_url}/${fname} -O ${metric}/${fname} 37 | done 38 | } 39 | 40 | # prepare rouge 41 | download "rouge_metric" ${rouge_base_url} ${rouge_files[@]} 42 | 43 | # prepare bleu 44 | download "bleu_metric" ${bleu_base_url} ${bleu_files[@]} 45 | 46 | # convert python 2.x source code to python 3.x 47 | 2to3 -w "../utils/bleu_metric/bleu_scorer.py" 48 | 2to3 -w "../utils/bleu_metric/bleu.py" 49 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/get_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Utility function to generate vocabulary file. 19 | """ 20 | 21 | 22 | import argparse 23 | import sys 24 | import json 25 | 26 | from itertools import chain 27 | 28 | 29 | def get_vocab(files, vocab_file): 30 | """ 31 | Builds vocabulary file from field 'segmented_paragraphs' 32 | and 'segmented_question'. 33 | 34 | Args: 35 | files: A list of file names. 36 | vocab_file: The file that stores the vocabulary. 37 | """ 38 | vocab = {} 39 | for f in files: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | obj = json.loads(line.strip()) 43 | paras = [ 44 | chain(*d['segmented_paragraphs']) 45 | for d in obj['documents']] 46 | doc_tokens = chain(*paras) 47 | question_tokens = obj['segmented_question'] 48 | for t in list(doc_tokens) + question_tokens: 49 | vocab[t] = vocab.get(t, 0) + 1 50 | # output 51 | sorted_vocab = sorted([(v, c) for v, c in vocab.items()], 52 | key=lambda x: x[1], 53 | reverse=True) 54 | with open(vocab_file, 'w') as outf: 55 | for w, c in sorted_vocab: 56 | print >> outf, '{}\t{}'.format(w.encode('utf8'), c) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--files', nargs='+', required=True, 62 | help='file list to count vocab from.') 63 | parser.add_argument('--vocab', required=True, 64 | help='file to store counted vocab.') 65 | args = parser.parse_args() 66 | get_vocab(args.files, args.vocab) 67 | 68 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/marco_tokenize_data.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | 3 | import os, sys, json 4 | import nltk 5 | 6 | def _nltk_tokenize(sequence): 7 | tokens = nltk.word_tokenize(sequence) 8 | 9 | cur_char_offset = 0 10 | token_offsets = [] 11 | token_words = [] 12 | for token in tokens: 13 | cur_char_offset = sequence.find(token, cur_char_offset) 14 | token_offsets.append([cur_char_offset, cur_char_offset + len(token) - 1]) 15 | token_words.append(token) 16 | return token_offsets, token_words 17 | 18 | def segment(input_js): 19 | _, input_js['segmented_question'] = _nltk_tokenize(input_js['question']) 20 | for doc_id, doc in enumerate(input_js['documents']): 21 | doc['segmented_title'] = [] 22 | doc['segmented_paragraphs'] = [] 23 | for para_id, para in enumerate(doc['paragraphs']): 24 | _, seg_para = _nltk_tokenize(para) 25 | doc['segmented_paragraphs'].append(seg_para) 26 | if 'answers' in input_js: 27 | input_js['segmented_answers'] = [] 28 | for answer_id, answer in enumerate(input_js['answers']): 29 | _, seg_answer = _nltk_tokenize(answer) 30 | input_js['segmented_answers'].append(seg_answer) 31 | 32 | 33 | if __name__ == '__main__': 34 | if len(sys.argv) != 2: 35 | print('Usage: tokenize_data.py ') 36 | exit() 37 | 38 | nltk.download('punkt') 39 | 40 | for line in open(sys.argv[1]): 41 | dureader_js = json.loads(line.strip()) 42 | segment(dureader_js) 43 | print(json.dumps(dureader_js)) 44 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/marcov1_to_dureader.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | 8 | def trans(input_js): 9 | output_js = {} 10 | 
output_js['question'] = input_js['query'] 11 | output_js['question_type'] = input_js['query_type'] 12 | output_js['question_id'] = input_js['query_id'] 13 | output_js['fact_or_opinion'] = "" 14 | output_js['documents'] = [] 15 | for para_id, para in enumerate(input_js['passages']): 16 | doc = {} 17 | doc['title'] = "" 18 | if 'is_selected' in para: 19 | doc['is_selected'] = True if para['is_selected'] != 0 else False 20 | doc['paragraphs'] = [para['passage_text']] 21 | output_js['documents'].append(doc) 22 | 23 | if 'answers' in input_js: 24 | output_js['answers'] = input_js['answers'] 25 | return output_js 26 | 27 | 28 | if __name__ == '__main__': 29 | if len(sys.argv) != 2: 30 | print('Usage: marcov1_to_dureader.py ') 31 | exit() 32 | 33 | df = pd.read_json(sys.argv[1]) 34 | for row in df.iterrows(): 35 | marco_js = json.loads(row[1].to_json()) 36 | dureader_js = trans(marco_js) 37 | print(json.dumps(dureader_js)) 38 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/marcov2_to_v1_tojsonl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import pandas as pd 4 | 5 | if __name__ == '__main__': 6 | if len(sys.argv) != 3: 7 | print('Usage: tojson.py ') 8 | exit() 9 | infile = sys.argv[1] 10 | outfile = sys.argv[2] 11 | df = pd.read_json(infile) 12 | with open(outfile, 'w') as f: 13 | for row in df.iterrows(): 14 | f.write(row[1].to_json() + '\n') -------------------------------------------------------------------------------- /DuReader-2.0/utils/run_marco2dureader_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | input_file=$1 4 | output_file=$2 5 | 6 | # convert the data from MARCO V2 (json) format to MARCO V1 (jsonl) format. 7 | # the script was forked from MARCO repo. 8 | # the format of MARCO V1 is much more easier to explore. 9 | python3 marcov2_to_v1_tojsonl.py $input_file $input_file.marcov1 10 | 11 | # convert the data from MARCO V1 format to DuReader format. 12 | python3 marcov1_to_dureader.py $input_file.marcov1 >$input_file.dureader_raw 13 | 14 | # tokenize the data. 15 | python3 marco_tokenize_data.py $input_file.dureader_raw >$input_file.segmented 16 | 17 | # find fake answers (indicating the start and end positions of answers in the document) for train and dev sets. 18 | # note that this should not be applied for test set, since there is no ground truth in test set. 19 | python preprocess.py $input_file.segmented >$output_file 20 | 21 | # remove the temporal data files. 22 | rm -rf $input_file.dureader_raw $input_file.segmented 23 | -------------------------------------------------------------------------------- /DuReader-Checklist/README.md: -------------------------------------------------------------------------------- 1 | # 阅读理解 DuReaderchecklist 2 | 3 | # 简介 4 | 5 | ## 1. 任务说明 6 | 机器阅读理解 (Machine Reading Comprehension) 是指让机器阅读文本,然后回答和阅读内容相关的问题。阅读理解是自然语言处理和人工智能领域的重要前沿课题,对于提升机器的智能水平、使机器具有持续知识获取的能力等具有重要价值,近年来受到学术界和工业界的广泛关注。 7 | 8 | 自然语言理解对机器学习模型各方面的能力均有极高的要求。然而,当前的机器阅读理解数据集大多都只采用单一的指标来评测模型的好坏,缺乏对模型语言理解能力的细粒度、多维度评测,导致模型的具体缺陷很难被发现和改进。为了解决这个问题,我们建立了细粒度的、多维度的评测数据集,从词汇理解、短语理解、语义角色理解、逻辑推理等多个维度检测模型的不足之处,从而推动阅读理解评测进入“精细化“时代。 9 | 10 | ## 2. 
数据集 11 | DuReaderchecklist数据集旨在通过建立checklist评测体系,系统性地评估当前模型能力的不足之处。目前checklist体系中涉及到的自然语言理解能力包含:词汇理解、短语理解、语义角色理解以及推理能力等等。具体的分类体系可参考下图: 12 | ![checklist_framwork](checklist.png) 13 | 14 | DuReaderchecklist数据集包含训练集、开发集以及测试集。其中开发集和测试集中,既包含和训练集同分布的in-domain样本,也包含了按照checklist体系分类后的样本。对于一个给定的问题q、一个篇章p及其标题t,系统需要根据篇章内容,判断该篇章p中是否包含给定问题的答案,如果是,则给出该问题的答案a;否则输出“no answer”。数据集中的每个样本,是一个四元组,例如: 15 | 16 | * * * 17 | 18 | **问题 q**: 番石榴汁热量 19 | 20 | **篇章 p**: 番石榴性温,味甜、酸、涩…,最重要的是番石榴所含的脂肪热量较低,一个番石榴所含的脂肪约0.9克重或84卡路里。比起苹果,番石榴所含有的脂肪少38%,卡路里少42%。 21 | 22 | **标题 t**: 番石榴汁的热量 - 妈妈网百科 23 | 24 | 25 | **参考答案 a**: [‘一个番石榴所含的脂肪约0.9克重或84卡路里’] 26 | 27 | * * * 28 | 29 | **问题 q**: 云南文山市多少人口? 30 | 31 | **篇章 p**: 云南省下辖8个市、8个少数民族自治州,面积39万平方千米,总人口4596万人,云南汉族人口为3062.9万人,占云南省总人口的66.63%... 32 | 33 | **标题 t**: 云南总人口数多少人,2019年云南人口数量统计(最新) 34 | 35 | 36 | **参考答案 a**: [‘无答案’] 37 | 38 | * * * 39 | 40 | 41 | # 快速开始 42 | 43 | ### 安装说明 44 | 45 | * PaddlePaddle 安装 46 | 47 | 本项目依赖于 PaddlePaddle 2.0 及以上版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装 48 | 49 | * PaddleNLP 安装 50 | 51 | ```shell 52 | pip install --upgrade paddlenlp -i https://pypi.org/simple 53 | ``` 54 | 55 | * 环境依赖 56 | 57 | Python的版本要求 3.6+ 58 | 59 | ### 目录结构 60 | 61 | ```text 62 | ├── README.md # 说明文档 63 | ├── evaluate.py # 评测脚本 64 | ├── run_eval.sh # 评测入口 65 | ├── train.sh # 训练入口 66 | ├── predict.sh # 预测入口 67 | ├── src/run_du.py # 训练、预测逻辑 68 | ├── src/squad.py # reader、后处理等 69 | ├── src/args.py # 超参配置 70 | ├── src/models.py # 模型 71 | ``` 72 | 73 | 74 | ### 数据准备 75 | 在运行基线之前,需要下载DuReaderchecklist数据集,运行 76 | 77 | ``` 78 | sh download.sh 79 | ``` 80 | 81 | 该命令完成之后,数据集会被保存到```dataset/```文件夹中。此外,基于[ERNIE-1.0](https://arxiv.org/abs/1904.09223)微调后的基线模型参数也会被保存在`finetuned_model/ `文件夹中,可供直接预测使用。 82 | 83 | ### 模型训练 84 | 85 | * 按如下方式可以使用默认配置进行训练,并在开发集做预测: 86 | 87 | ``` 88 | sh train.sh 89 | ``` 90 | 其中训练好的模型参数以及预测结果会被保存在`output/`件夹中。 91 | 92 | * 如需使用其他数据集进行数据增强 (例如[DuReaderrobust](https://github.com/PaddlePaddle/Research/tree/master/NLP/DuReader-Robust-BASELINE)训练集),可以使用以下命令 (数据格式需保持兼容): 93 | 94 | ``` 95 | sh train.sh --train_file path_to_dataset_file 96 | ``` 97 | 其中`path_to_dataset_file `是数据集路径,例如`dataset/train.json`。 98 | 99 | * 如需使用前一阶段训练好的参数进行热启动训练,可运行以下命令: 100 | 101 | ``` 102 | sh train.sh --model_name_or_path path_to_model_ckpt 103 | ``` 104 | 其中`path_to_model_ckpt`是模型参数路径,例如`output/model_2000`。 105 | 106 | 更为详细的参数配置可参考`train.sh`以及`args.py`。 107 | 108 | 109 | ### 模型预测 110 | * 如需使用训练好的参数进行预测,可参考以下命令: 111 | 112 | ``` 113 | sh predict.sh --model_name_or_path path_to_model_ckpt --predict_file path_to_dataset_file 114 | ``` 115 | 其中`path_to_model_ckpt`是模型参数路径,`path_to_dataset_file `是数据集路径。 116 | 117 | * 为了方便测试,我们也提供了已经微调好的模型参数。运行以下命令即可直接进行预测 118 | 119 | ``` 120 | sh predict.sh --model_name_or_path finetuned_model --predict_file dataset/dev.json 121 | ``` 122 | 预测结果会被保存在`output/`件夹中。 123 | 124 | ### 结果评估 125 | 评估脚本的运行参考以下命令: 126 | 127 | ``` 128 | sh run_eval.sh dataset_file pred_file 129 | ``` 130 | 131 | 其中`dataset_file `是数据集文件,`pred_file`是模型预测结果,例如 132 | 133 | ``` 134 | sh run_eval.sh dataset/dev.json output/dev_predictions.json 135 | ``` 136 | 下表是ERNIE-1.0基线模型在dev集合的效果: 137 | 138 | | Dataset | Num_examples | F1 | EM | 139 | | --- | :---: | --- | --- | 140 | | All | 1130 | 64.080 | 55.221 | 141 | | in-domain | 1000 |65.809 | 57.000 | 142 | | vocab | 35 | 44.113 |42.857 | 143 | | phrase | 35 | 63.345 | 62.857 | 144 | |semantic-role | 20 | 41.827 | 25.000 | 145 | |fault-tolerant | 20 | 46.741 | 25.000 | 146 | |reasoning| 20 | 53.429 | 35.000 | 147 | 148 | 149 | 150 | # 
其他 151 | 152 | ## 如何贡献代码 153 | 154 | 如果你可以修复某个issue或者增加一个新功能,欢迎给我们提交PR。如果对应的PR被接受了,我们将根据贡献的质量和难度进行打分(0-5分,越高越好)。如果你累计获得了10分,可以联系我们获得面试机会或者为你写推荐信。 155 | -------------------------------------------------------------------------------- /DuReader-Checklist/checklist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Checklist/checklist.png -------------------------------------------------------------------------------- /DuReader-Checklist/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download dataset and model parameters 3 | set -e 4 | 5 | echo "Download DuReader-checklist dataset" 6 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/lic2021/dureader_checklist.dataset.tar.gz 7 | tar -zxvf dureader_checklist.dataset.tar.gz 8 | rm dureader_checklist.dataset.tar.gz 9 | 10 | echo "Download fine-tuned parameters" 11 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/lic2021/dureader_checklist.finetuned_model.tar.gz 12 | tar -zxvf dureader_checklist.finetuned_model.tar.gz 13 | rm dureader_checklist.finetuned_model.tar.gz 14 | -------------------------------------------------------------------------------- /DuReader-Checklist/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | if [ -z "$CUDA_VISIBLE_DEVICES" ];then 5 | export CUDA_VISIBLE_DEVICES=0 6 | fi 7 | 8 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 9 | 10 | python -u src/run.py \ 11 | --model_type ernie \ 12 | --max_seq_length 512 \ 13 | --batch_size 4 \ 14 | --logging_steps 50 \ 15 | --max_answer_length 512 \ 16 | --output_dir output \ 17 | --version_2_with_negative \ 18 | --do_pred \ 19 | --cls_threshold 0.7 \ 20 | $@ 21 | -------------------------------------------------------------------------------- /DuReader-Checklist/run_eval.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -lt 2 ]; then 2 | echo "Usage: $0 dataset_file pred_file" 3 | exit 1 4 | fi 5 | python evaluate.py $1 $2 6 | for tag in 'in-domain' 'vocab' 'phrase' 'semantic-role' 'fault-tolerant' 'reasoning' 7 | do 8 | python evaluate.py $1 $2 --tag $tag 9 | done 10 | 11 | -------------------------------------------------------------------------------- /DuReader-Checklist/src/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_args(): 4 | parser = argparse.ArgumentParser(description=__doc__) 5 | parser.add_argument( 6 | "--train_file", 7 | type=str, 8 | default=None, 9 | help="Train data path.") 10 | parser.add_argument( 11 | "--predict_file", 12 | type=str, 13 | default=None, 14 | help="Predict data path.", 15 | nargs='+') 16 | parser.add_argument( 17 | "--model_type", 18 | default=None, 19 | type=str, 20 | required=True, 21 | help="Type of pre-trained model.") 22 | parser.add_argument( 23 | "--model_name_or_path", 24 | default=None, 25 | type=str, 26 | required=True, 27 | help="Path to pre-trained model or shortcut name of model.") 28 | parser.add_argument( 29 | "--output_dir", 30 | default=None, 31 | type=str, 32 | required=True, 33 | help="The output directory where the model predictions and checkpoints will be written." 
34 | ) 35 | parser.add_argument( 36 | "--max_seq_length", 37 | default=128, 38 | type=int, 39 | help="The maximum total input sequence length after tokenization. Sequences longer " 40 | "than this will be truncated, sequences shorter will be padded.") 41 | parser.add_argument( 42 | "--batch_size", 43 | default=8, 44 | type=int, 45 | help="Batch size per GPU/CPU for training.") 46 | parser.add_argument( 47 | "--learning_rate", 48 | default=5e-5, 49 | type=float, 50 | help="The initial learning rate for Adam.") 51 | parser.add_argument( 52 | "--weight_decay", 53 | default=0.0, 54 | type=float, 55 | help="Weight decay if we apply some.") 56 | parser.add_argument( 57 | "--adam_epsilon", 58 | default=1e-8, 59 | type=float, 60 | help="Epsilon for Adam optimizer.") 61 | parser.add_argument( 62 | "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 63 | parser.add_argument( 64 | "--num_train_epochs", 65 | default=3, 66 | type=int, 67 | help="Total number of training epochs to perform.") 68 | parser.add_argument( 69 | "--max_steps", 70 | default=-1, 71 | type=int, 72 | help="If > 0: set total number of training steps to perform. Override num_train_epochs." 73 | ) 74 | parser.add_argument( 75 | "--warmup_proportion", 76 | default=0.0, 77 | type=float, 78 | help="Proportion of training steps to perform linear learning rate warmup for." 79 | ) 80 | parser.add_argument( 81 | "--logging_steps", 82 | type=int, 83 | default=500, 84 | help="Log every X updates steps.") 85 | parser.add_argument( 86 | "--save_steps", 87 | type=int, 88 | default=500, 89 | help="Save checkpoint every X updates steps.") 90 | parser.add_argument( 91 | "--seed", type=int, default=42, help="random seed for initialization") 92 | parser.add_argument( 93 | "--device", 94 | type=str, 95 | default="gpu", 96 | help="Device for selecting for the training.") 97 | parser.add_argument( 98 | "--doc_stride", 99 | type=int, 100 | default=128, 101 | help="When splitting up a long document into chunks, how much stride to take between chunks." 102 | ) 103 | parser.add_argument( 104 | "--n_best_size", 105 | type=int, 106 | default=20, 107 | help="The total number of n-best predictions to generate in the nbest_predictions.json output file." 108 | ) 109 | parser.add_argument( 110 | "--max_query_length", type=int, default=64, help="Max query length.") 111 | parser.add_argument( 112 | "--max_answer_length", type=int, default=30, help="Max answer length.") 113 | parser.add_argument( 114 | "--cls_threshold", type=float, default=0.5, help="No answer threshold") 115 | parser.add_argument( 116 | "--do_lower_case", 117 | action='store_false', 118 | help="Whether to lower case the input text. Should be True for uncased models and False for cased models." 119 | ) 120 | parser.add_argument( 121 | "--verbose", action='store_true', help="Whether to output verbose log.") 122 | parser.add_argument( 123 | "--version_2_with_negative", 124 | action='store_true', 125 | help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true." 
126 | ) 127 | parser.add_argument( 128 | "--do_train", action='store_true', help="Whether to train the model.") 129 | 130 | parser.add_argument( 131 | "--do_pred", action='store_true', help="Whether to predict.") 132 | args = parser.parse_args() 133 | return args 134 | -------------------------------------------------------------------------------- /DuReader-Checklist/src/models.py: -------------------------------------------------------------------------------- 1 | from paddlenlp.transformers import ErniePretrainedModel, BertPretrainedModel, RobertaPretrainedModel 2 | from paddle import nn 3 | import paddle 4 | 5 | class ErnieForQuestionAnswering(ErniePretrainedModel): 6 | def __init__(self, ernie): 7 | super(ErnieForQuestionAnswering, self).__init__() 8 | self.ernie = ernie # allow ernie to be config 9 | self.classifier = nn.Linear(self.ernie.config["hidden_size"], 2) 10 | self.classifier_cls = nn.Linear(self.ernie.config["hidden_size"], 2) 11 | self.apply(self.init_weights) 12 | 13 | def forward(self, 14 | input_ids, 15 | token_type_ids=None, 16 | position_ids=None, 17 | attention_mask=None): 18 | sequence_output, pooled_output = self.ernie( 19 | input_ids, 20 | token_type_ids=token_type_ids, 21 | position_ids=position_ids, 22 | attention_mask=attention_mask) 23 | 24 | logits = self.classifier(sequence_output) 25 | logits = paddle.transpose(logits, perm=[2, 0, 1]) 26 | start_logits, end_logits = paddle.unstack(x=logits, axis=0) 27 | cls_logits = self.classifier_cls(pooled_output) 28 | 29 | return start_logits, end_logits, cls_logits 30 | 31 | class BertForQuestionAnswering(BertPretrainedModel): 32 | def __init__(self, bert): 33 | super(BertForQuestionAnswering, self).__init__() 34 | self.bert = bert # allow bert to be config 35 | self.classifier = nn.Linear(self.bert.config["hidden_size"], 2) 36 | self.classifier_cls = nn.Linear(self.bert.config["hidden_size"], 2) 37 | self.apply(self.init_weights) 38 | 39 | def forward(self, 40 | input_ids, 41 | token_type_ids=None, 42 | position_ids=None, 43 | attention_mask=None): 44 | sequence_output, pooled_output = self.bert( 45 | input_ids, 46 | token_type_ids=token_type_ids, 47 | position_ids=position_ids, 48 | attention_mask=attention_mask) 49 | 50 | logits = self.classifier(sequence_output) 51 | logits = paddle.transpose(logits, perm=[2, 0, 1]) 52 | start_logits, end_logits = paddle.unstack(x=logits, axis=0) 53 | cls_logits = self.classifier_cls(pooled_output) 54 | 55 | return start_logits, end_logits, cls_logits 56 | 57 | class RobertaForQuestionAnswering(RobertaPretrainedModel): 58 | def __init__(self, roberta): 59 | super(RobertaForQuestionAnswering, self).__init__() 60 | self.roberta = roberta # allow roberta to be config 61 | self.classifier = nn.Linear(self.roberta.config["hidden_size"], 2) 62 | self.classifier_cls = nn.Linear(self.roberta.config["hidden_size"], 2) 63 | self.apply(self.init_weights) 64 | 65 | def forward(self, 66 | input_ids, 67 | token_type_ids=None, 68 | position_ids=None, 69 | attention_mask=None): 70 | sequence_output, pooled_output = self.roberta( 71 | input_ids, 72 | token_type_ids=token_type_ids, 73 | position_ids=position_ids, 74 | attention_mask=attention_mask) 75 | 76 | logits = self.classifier(sequence_output) 77 | logits = paddle.transpose(logits, perm=[2, 0, 1]) 78 | start_logits, end_logits = paddle.unstack(x=logits, axis=0) 79 | cls_logits = self.classifier_cls(pooled_output) 80 | 81 | return start_logits, end_logits, cls_logits 
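# ---------------------------------------------------------------------------
# Illustrative decoding sketch (not part of the baseline): shows how the
# start/end logits and the [CLS] logits returned by the models above can be
# turned into a token-level answer span or a "no answer" decision via a
# threshold (cf. --cls_threshold in src/args.py). The greedy span search and
# the assumption that index 1 of the CLS head means "unanswerable" are
# simplifications; the actual post-processing lives in src/squad.py.
def decode_answer_span(start_logits, end_logits, cls_logits,
                       cls_threshold=0.5, max_answer_length=30):
    import paddle.nn.functional as F
    # Probability that the example has no answer, taken from the [CLS] head.
    no_answer_prob = F.softmax(cls_logits, axis=-1)[:, 1]
    spans = []
    for i in range(start_logits.shape[0]):
        start = int(paddle.argmax(start_logits[i]))
        # Greedy end search, restricted to a short window after the start token.
        window = end_logits[i][start:start + max_answer_length]
        end = start + int(paddle.argmax(window))
        # Fall back to "no answer" when the CLS head is confident enough.
        spans.append(None if float(no_answer_prob[i]) > cls_threshold else (start, end))
    return spans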
-------------------------------------------------------------------------------- /DuReader-Checklist/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | unset CUDA_VISIBLE_DEVICES 5 | 6 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 7 | 8 | python -m paddle.distributed.launch --gpus "0" src/run.py \ 9 | --model_type ernie \ 10 | --model_name_or_path ernie-1.0 \ 11 | --max_seq_length 512 \ 12 | --batch_size 2 \ 13 | --learning_rate 3e-5 \ 14 | --num_train_epochs 2 \ 15 | --logging_steps 50 \ 16 | --save_steps 1000 \ 17 | --warmup_proportion 0.1 \ 18 | --max_answer_length 512 \ 19 | --weight_decay 0.01 \ 20 | --output_dir output \ 21 | --version_2_with_negative \ 22 | --do_train \ 23 | --do_pred \ 24 | --train_file dataset/train.json \ 25 | --predict_file dataset/dev.json \ 26 | --cls_threshold 0.7 \ 27 | --device gpu \ 28 | $@ 29 | -------------------------------------------------------------------------------- /DuReader-Retrieval/figures/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Retrieval/figures/example.png -------------------------------------------------------------------------------- /DuReader-Robust/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download dataset and model parameters 3 | set -e 4 | 5 | echo "Download ERNIE 1.0" 6 | mkdir pretrained_model 7 | cd pretrained_model 8 | wget --no-check-certificate https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz 9 | tar -zxvf ERNIE_1.0_max-len-512.tar.gz 10 | rm ERNIE_1.0_max-len-512.tar.gz 11 | cd .. 
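# Note (illustrative, not part of the original script): md5.txt in this directory
# lists the expected checksums of the three downloaded archives. Each archive could
# be verified before its "rm" step, e.g. for the ERNIE archive while still inside
# pretrained_model/:
#     grep ERNIE_1.0_max-len-512.tar.gz ../md5.txt | md5sum -c -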
12 | 13 | echo "Download DuReader-robust dataset" 14 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/dureader_robust/data/dureader_robust-data.tar.gz 15 | tar -zxvf dureader_robust-data.tar.gz 16 | mv dureader_robust-data data 17 | rm dureader_robust-data.tar.gz 18 | 19 | echo "Download fine-tuned parameters" 20 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/dureader_robust/baseline/dureader_robust-baseline-finetuned.tar.gz 21 | tar -zxvf dureader_robust-baseline-finetuned.tar.gz 22 | rm dureader_robust-baseline-finetuned.tar.gz 23 | -------------------------------------------------------------------------------- /DuReader-Robust/md5.txt: -------------------------------------------------------------------------------- 1 | 553223945508e49483e899ae8548e5a9 dureader_robust-baseline-finetuned.tar.gz 2 | 020b26396f1b5932e451dba84d0b3dc8 dureader_robust-data.tar.gz 3 | b6d1da2fbc610ac13b86b5113f8819f7 ERNIE_1.0_max-len-512.tar.gz 4 | -------------------------------------------------------------------------------- /DuReader-Robust/paddlehub_baseline/paddlehub_reading_comprehension.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_eager_delete_tensor_gb=0.0 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | DATASET_PATH="../data" 5 | 6 | python -u reading_comprehension.py \ 7 | --dataset_path=${DATASET_PATH} \ 8 | --batch_size=8 \ 9 | --use_gpu=True \ 10 | --checkpoint_dir="./ckpt_dureader" \ 11 | --learning_rate=3e-5 \ 12 | --weight_decay=0.01 \ 13 | --warmup_proportion=0.1 \ 14 | --num_epoch=5 \ 15 | --max_seq_len=512 \ 16 | --use_data_parallel=False 17 | 18 | -------------------------------------------------------------------------------- /DuReader-Robust/paddlehub_baseline/reading_comprehension.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Finetuning on reading comprehension task """ 16 | 17 | import argparse 18 | import ast 19 | import json 20 | import os 21 | 22 | import paddle.fluid as fluid 23 | import paddlehub as hub 24 | 25 | from demo_dataset import DuReader 26 | 27 | # yapf: disable 28 | parser = argparse.ArgumentParser(__doc__) 29 | parser.add_argument("--dataset_path", type=str, default=None, help="The diretory to DuReader robust dataset") 30 | parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") 31 | parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") 32 | parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate used to train with warmup.") 33 | parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") 34 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") 35 | parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") 36 | parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.") 37 | parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.") 38 | parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.") 39 | args = parser.parse_args() 40 | # yapf: enable. 41 | 42 | 43 | if __name__ == '__main__': 44 | # 加载PaddleHub ERNIE预训练模型 45 | module = hub.Module(name="ernie") 46 | 47 | # ERNIE预训练模型输入变量inputs、输出变量outputs、以及模型program 48 | inputs, outputs, program = module.context( 49 | trainable=True, max_seq_len=args.max_seq_len) 50 | 51 | # 加载竞赛数据集并使用ReadingComprehensionReader读取数据 52 | dataset = DuReader(dataset_path=args.dataset_path) 53 | reader = hub.reader.ReadingComprehensionReader( 54 | dataset=dataset, 55 | vocab_path=module.get_vocab_path(), 56 | max_seq_len=args.max_seq_len, 57 | doc_stride=128, 58 | max_query_length=64) 59 | 60 | # 取ERNIE的字级别预训练输出 61 | seq_output = outputs["sequence_output"] 62 | 63 | # 设置运行program所需的feed_list 64 | feed_list = [ 65 | inputs["input_ids"].name, 66 | inputs["position_ids"].name, 67 | inputs["segment_ids"].name, 68 | inputs["input_mask"].name, 69 | ] 70 | 71 | # 选择Fine-tune优化策略 72 | strategy = hub.AdamWeightDecayStrategy( 73 | weight_decay=args.weight_decay, 74 | learning_rate=args.learning_rate, 75 | warmup_proportion=args.warmup_proportion) 76 | 77 | # 设置运行配置 78 | config = hub.RunConfig( 79 | eval_interval=500, 80 | use_pyreader=False, 81 | use_data_parallel=args.use_data_parallel, 82 | use_cuda=args.use_gpu, 83 | num_epoch=args.num_epoch, 84 | batch_size=args.batch_size, 85 | checkpoint_dir=args.checkpoint_dir, 86 | save_ckpt_interval=500, 87 | strategy=strategy) 88 | 89 | # 定义阅读理解Fine-tune Task 90 | # 由于竞赛数据集与cmrc2018数据集格式比较相似,此处sub_task应为cmrc2018 91 | # 否则运行可能出错 92 | reading_comprehension_task = hub.ReadingComprehensionTask( 93 | data_reader=reader, 94 | feature=seq_output, 95 | feed_list=feed_list, 96 | config=config, 97 | sub_task="cmrc2018", 98 | ) 99 | 100 | # 调用finetune_and_eval API,将会自动进行训练、评估以及保存最佳模型 101 | reading_comprehension_task.finetune_and_eval() 102 | 103 | # 数据集验证集部分数据用于预测 104 | data = dataset.get_dev_examples() 105 | # 调用predict接口, 打开return_result(True),将自动返回预测结果 106 | all_prediction = reading_comprehension_task.predict(data=data, load_best_model=False, return_result=True) 107 
| # 写入预测结果 108 | json.dump(all_prediction, open('submit.json', 'w'), ensure_ascii=False) 109 | -------------------------------------------------------------------------------- /DuReader-Robust/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | if [ -z "$CUDA_VISIBLE_DEVICES" ];then 5 | export CUDA_VISIBLE_DEVICES=0 6 | fi 7 | 8 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 9 | 10 | 11 | if [ -z "$PRETRAINED_MODEL_PATH" ];then 12 | PRETRAINED_MODEL_PATH="./pretrained_model" 13 | fi 14 | echo "PRETRAINED_MODEL_PATH=$PRETRAINED_MODEL_PATH" 15 | 16 | if [ -z "$CKPT" ];then 17 | CKPT="./finetuned_model" 18 | fi 19 | echo "CKPT=$CKPT" 20 | 21 | python -u src/run_mrc.py --use_cuda true \ 22 | --batch_size 24 \ 23 | --checkpoints output \ 24 | --init_checkpoint ${CKPT} \ 25 | --vocab_path ${PRETRAINED_MODEL_PATH}/vocab.txt \ 26 | --ernie_config ${PRETRAINED_MODEL_PATH}/ernie_config.json \ 27 | --max_seq_len 512 \ 28 | --do_lower_case true \ 29 | --doc_stride 128 \ 30 | --max_answer_length 30 \ 31 | --do_train false \ 32 | --do_predict true \ 33 | $@ 34 | 35 | 36 | -------------------------------------------------------------------------------- /DuReader-Robust/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/_ce.py: -------------------------------------------------------------------------------- 1 | ####this file is only used for continuous evaluation test! 2 | 3 | import os 4 | import sys 5 | sys.path.insert(0, os.environ['ceroot']) 6 | #sys.path.append('.') 7 | from kpi import CostKpi, DurationKpi, AccKpi 8 | 9 | #### NOTE kpi.py should shared in models in some way!!!! 10 | 11 | train_cost_xnli_card1_kpi = CostKpi( 12 | 'train_cost_xnli_card1', 0.002, 0, actived=True) 13 | train_acc_xnli_card1_kpi = AccKpi( 14 | 'train_acc_xnli_card1', 0.002, 0, actived=True) 15 | train_duration_xnli_card1_kpi = DurationKpi( 16 | 'train_duration_xnli_card1', 0.01, 0, actived=True) 17 | train_cost_xnli_card4_kpi = CostKpi( 18 | 'train_cost_xnli_card4', 0.002, 0, actived=True) 19 | train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True) 20 | train_duration_xnli_card4_kpi = DurationKpi( 21 | 'train_duration_xnli_card4', 0.03, 0, actived=True) 22 | 23 | tracking_kpis = [ 24 | train_cost_xnli_card1_kpi, 25 | train_acc_xnli_card1_kpi, 26 | train_duration_xnli_card1_kpi, 27 | train_cost_xnli_card4_kpi, 28 | train_acc_xnli_card4_kpi, 29 | train_duration_xnli_card4_kpi, 30 | ] 31 | 32 | 33 | def parse_log(log): 34 | ''' 35 | This method should be implemented by model developers. 
36 | The suggestion: 37 | each line in the log should be key, value, for example: 38 | " 39 | train_cost\t1.0 40 | test_cost\t1.0 41 | train_cost\t1.0 42 | train_cost\t1.0 43 | train_acc\t1.2 44 | " 45 | ''' 46 | for line in log.split('\n'): 47 | fs = line.strip().split('\t') 48 | print(fs) 49 | if len(fs) == 3 and fs[0] == 'kpis': 50 | print("-----%s" % fs) 51 | kpi_name = fs[1] 52 | kpi_value = float(fs[2]) 53 | yield kpi_name, kpi_value 54 | 55 | 56 | def log_to_ce(log): 57 | kpi_tracker = {} 58 | for kpi in tracking_kpis: 59 | kpi_tracker[kpi.name] = kpi 60 | 61 | for (kpi_name, kpi_value) in parse_log(log): 62 | print(kpi_name, kpi_value) 63 | kpi_tracker[kpi_name].add_record(kpi_value) 64 | kpi_tracker[kpi_name].persist() 65 | 66 | 67 | if __name__ == '__main__': 68 | log = sys.stdin.read() 69 | print("*****") 70 | print(log) 71 | print("****") 72 | log_to_ce(log) 73 | -------------------------------------------------------------------------------- /DuReader-Robust/src/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. 2 | # 3 | #Licensed under the Apache License, Version 2.0 (the "License"); 4 | #you may not use this file except in compliance with the License. 5 | #You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | #Unless required by applicable law or agreed to in writing, software 10 | #distributed under the License is distributed on an "AS IS" BASIS, 11 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | #See the License for the specific language governing permissions and 13 | #limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | import os 19 | import paddle.fluid as fluid 20 | 21 | 22 | def nccl2_prepare(trainer_id, startup_prog, main_prog): 23 | config = fluid.DistributeTranspilerConfig() 24 | config.mode = "nccl2" 25 | t = fluid.DistributeTranspiler(config=config) 26 | t.transpile( 27 | trainer_id, 28 | trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'), 29 | current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'), 30 | startup_program=startup_prog, 31 | program=main_prog) 32 | 33 | 34 | def prepare_for_multi_process(exe, build_strategy, train_prog): 35 | # prepare for multi-process 36 | trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) 37 | num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) 38 | if num_trainers < 2: return 39 | print("PADDLE_TRAINERS_NUM", num_trainers) 40 | print("PADDLE_TRAINER_ID", trainer_id) 41 | build_strategy.num_trainers = num_trainers 42 | build_strategy.trainer_id = trainer_id 43 | # NOTE(zcd): use multi processes to train the model, 44 | # and each process use one GPU card. 45 | startup_prog = fluid.Program() 46 | nccl2_prepare(trainer_id, startup_prog, train_prog) 47 | # the startup_prog are run two times, but it doesn't matter. 
48 | exe.run(startup_prog) 49 | -------------------------------------------------------------------------------- /DuReader-Robust/src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/model/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/reader/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/utils/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | 23 | import paddle.fluid as fluid 24 | 25 | 26 | def str2bool(v): 27 | # because argparse does not support to parse "true, False" as python 28 | # boolean directly 29 | return v.lower() in ("true", "t", "1") 30 | 31 | 32 | class ArgumentGroup(object): 33 | def __init__(self, parser, title, des): 34 | self._group = parser.add_argument_group(title=title, description=des) 35 | 36 | def add_arg(self, name, type, default, help, **kwargs): 37 | type = str2bool if type == bool else type 38 | self._group.add_argument( 39 | "--" + name, 40 | default=default, 41 | type=type, 42 | help=help + ' Default: %(default)s.', 43 | **kwargs) 44 | 45 | 46 | def print_arguments(args): 47 | print('----------- Configuration Arguments -----------') 48 | for arg, value in sorted(six.iteritems(vars(args))): 49 | print('%s: %s' % (arg, value)) 50 | print('------------------------------------------------') 51 | 52 | def check_cuda(use_cuda, err = \ 53 | "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ 54 | Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. 
Set use_cuda = False to run models on CPU.\n" 55 | ): 56 | try: 57 | if use_cuda == True and fluid.is_compiled_with_cuda() == False: 58 | print(err) 59 | sys.exit(1) 60 | except Exception as e: 61 | pass 62 | -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/cards.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | 18 | def get_cards(): 19 | """ 20 | get gpu cards number 21 | """ 22 | num = 0 23 | cards = os.environ.get('CUDA_VISIBLE_DEVICES', '') 24 | if cards != '': 25 | num = len(cards.split(",")) 26 | return num 27 | -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | 22 | import numpy as np 23 | import paddle.fluid as fluid 24 | 25 | 26 | def cast_fp32_to_fp16(exe, main_program): 27 | print("Cast parameters to float16 data format.") 28 | for param in main_program.global_block().all_parameters(): 29 | if not param.name.endswith(".master"): 30 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 31 | data = np.array(param_t) 32 | if param.name.find("layer_norm") == -1: 33 | param_t.set(np.float16(data).view(np.uint16), exe.place) 34 | master_param_var = fluid.global_scope().find_var(param.name + 35 | ".master") 36 | if master_param_var is not None: 37 | master_param_var.get_tensor().set(data, exe.place) 38 | 39 | 40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): 41 | assert os.path.exists( 42 | init_checkpoint_path), "[%s] cann't be found." 
% init_checkpoint_path 43 | 44 | def existed_persitables(var): 45 | if not fluid.io.is_persistable(var): 46 | return False 47 | if os.path.exists(os.path.join(init_checkpoint_path, var.name)): 48 | print("INIT {}".format(var.name)) 49 | return True 50 | 51 | fluid.io.load_vars( 52 | exe, 53 | init_checkpoint_path, 54 | main_program=main_program, 55 | predicate=existed_persitables) 56 | print("Load model from {}".format(init_checkpoint_path)) 57 | 58 | if use_fp16: 59 | cast_fp32_to_fp16(exe, main_program) 60 | 61 | 62 | def init_pretraining_params(exe, 63 | pretraining_params_path, 64 | main_program, 65 | use_fp16=False): 66 | assert os.path.exists(pretraining_params_path 67 | ), "[%s] cann't be found." % pretraining_params_path 68 | 69 | def existed_params(var): 70 | if not isinstance(var, fluid.framework.Parameter): 71 | return False 72 | if os.path.exists(os.path.join(pretraining_params_path, var.name)): 73 | print("INIT {}".format(var.name)) 74 | return True 75 | else: 76 | print("SKIP {}".format(var.name)) 77 | return False 78 | 79 | fluid.io.load_vars( 80 | exe, 81 | pretraining_params_path, 82 | main_program=main_program, 83 | predicate=existed_params) 84 | print("Load pretraining parameters from {}.".format( 85 | pretraining_params_path)) 86 | 87 | if use_fp16: 88 | cast_fp32_to_fp16(exe, main_program) 89 | -------------------------------------------------------------------------------- /DuReader-Robust/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | if [ -z "$CUDA_VISIBLE_DEVICES" ];then 5 | export CUDA_VISIBLE_DEVICES=0 6 | fi 7 | 8 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 9 | 10 | 11 | if [ -z "$PRETRAINED_MODEL_PATH" ];then 12 | PRETRAINED_MODEL_PATH="./pretrained_model" 13 | fi 14 | echo "PRETRAINED_MODEL_PATH=$PRETRAINED_MODEL_PATH" 15 | 16 | python -u src/run_mrc.py --use_cuda true \ 17 | --batch_size 12 \ 18 | --checkpoints output \ 19 | --init_pretraining_params ${PRETRAINED_MODEL_PATH}/params \ 20 | --vocab_path ${PRETRAINED_MODEL_PATH}/vocab.txt \ 21 | --ernie_config ${PRETRAINED_MODEL_PATH}/ernie_config.json \ 22 | --save_steps 10000 \ 23 | --warmup_proportion 0.1 \ 24 | --weight_decay 0.01 \ 25 | --epoch 2 \ 26 | --max_seq_len 512 \ 27 | --do_lower_case true \ 28 | --doc_stride 128 \ 29 | --learning_rate 3e-5 \ 30 | --skip_steps 25 \ 31 | --max_answer_length 30 \ 32 | --do_train true \ 33 | --do_predict true \ 34 | $@ 35 | 36 | -------------------------------------------------------------------------------- /DuReader-vis/images/intro-vis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-vis/images/intro-vis.png -------------------------------------------------------------------------------- /DuReader-vis/images/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-vis/images/intro.png -------------------------------------------------------------------------------- /MRQA2019-D-NET/README.md: -------------------------------------------------------------------------------- 1 | # D-NET 2 | 3 | ## Introduction 4 | D-NET is a simple pre-training and fine-tuning framework that Baidu used for the MRQA (Machine Reading for Question Answering) 2019 Shared Task, which focused on the generalization of 
machine reading comprehension (MRC) models. Our system ranked first among all participants in terms of averaged F1 score. Additionally, we won first place on 10 of the 12 test sets and second place on the other two in terms of F1 score. 5 | 6 | In this repository, we release the code, data and model parameters that were used in the D-NET framework. 7 | 8 | ## Framework 9 | An overview of the D-NET framework is shown in the figure below. To improve the generalization capability of an MRC system, we mainly use two techniques, i.e. **multi-task learning (MTL)** and **ensemble of multiple pre-trained models**. 10 | 11 |

12 | ![D-NET framework overview](./images/D-NET_framework.png) 13 |

14 | 15 | 16 | #### Multi-task learning 17 | In addition to the MRC task, we further introduce several auxiliary tasks in the fine-tuning stage to learn more general language representations. Specifically, we use the following auxiliary tasks: 18 | 19 | - Unsupervised task: masked language model 20 | - Supervised tasks: 21 | - natural language inference 22 | - paragraph ranking 23 | 24 | We use the [PALM](https://github.com/PaddlePaddle/PALM) multi-task learning library based on [PaddlePaddle](https://www.paddlepaddle.org.cn/) in our experiments, which makes implementing new tasks and pre-trained models much easier than building them from scratch. To train on the MRQA datasets with MTL, please refer to the instructions [here](multi_task_learning) (under `multi_task_learning/`); an illustrative sketch of how the task mix ratios control batch sampling is included at the end of this file. 25 | 26 | #### Ensemble of multiple pre-trained models 27 | In our experiments, we found that an ensemble system based on different pre-trained models shows better generalization capability than a system based on any single one. In this repository, we provide the parameters of three models that are fine-tuned on the MRQA in-domain data, based on ERNIE2.0, XL-NET and BERT, respectively. The ensemble of these models is implemented as a set of servers. Please refer to the instructions [here](server) (under `server/`) for more details. 28 | 29 | ## Directory structure 30 | ``` 31 | ├── multi_task_learning/ # scripts for multi-task learning 32 | │ ├── configs/ # PALM config files 33 | │ ├── scripts/ # auxiliary scripts 34 | │ ├── wget_pretrained_model.sh # download pretrained model 35 | │ ├── wget_data.sh # download data for MTL 36 | │ ├── run_build_palm.sh # MTL preparation 37 | │ ├── run_evaluation.sh # evaluation 38 | │ ├── run_multi_task.sh # start MTL training 39 | ├── server/ # scripts for the ensemble of multiple pretrained models 40 | │ ├── ernie_server/ # ERNIE model server 41 | │ ├── xlnet_server/ # XL-NET model server 42 | │ ├── bert_server/ # BERT model server 43 | │ ├── main_server.py # main server scripts for ensemble 44 | │ ├── client/ # client scripts which read examples and make requests 45 | │ ├── wget_server_inference_model.sh # script for downloading model parameters 46 | │ ├── start.sh # script for launching all the servers 47 | ``` 48 | ## Copyright and License 49 | Copyright 2019 Baidu.com, Inc. All Rights Reserved Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
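The `mix_ratio` values in `multi_task_learning/configs/*.yaml` control how often each task contributes a training batch relative to the main MRC task. The snippet below is only a minimal sketch of this kind of proportional task sampling, with made-up function names; it is not the PALM scheduler itself.

```python
import random

# Relative sampling weights, mirroring the mix_ratio values in configs/*.yaml.
MIX_RATIOS = {
    "reading_comprehension": 1.0,  # main MRC task
    "mask_language_model": 2.0,    # auxiliary: masked language model
    "answer_matching": 0.8,        # auxiliary: answer/paragraph matching
}

def sample_task(mix_ratios):
    """Pick the task that supplies the next training batch, proportionally to its weight."""
    tasks, weights = zip(*mix_ratios.items())
    return random.choices(tasks, weights=weights, k=1)[0]

# Roughly 1 : 2 : 0.8 of the training batches come from the three tasks.
print(sample_task(MIX_RATIOS))
```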
50 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/images/D-NET_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/images/D-NET_framework.png -------------------------------------------------------------------------------- /MRQA2019-D-NET/images/D-NET_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/images/D-NET_server.png -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/answer_matching.yaml: -------------------------------------------------------------------------------- 1 | train_file: "data/am4mrqa/train.txt" 2 | mix_ratio: 0.8 3 | batch_size: 4 4 | in_tokens: False 5 | generate_neg_sample: False 6 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/mask_language_model.yaml: -------------------------------------------------------------------------------- 1 | train_file: "data/mlm4mrqa" 2 | mix_ratio: 2.0 3 | batch_size: 4 4 | in_tokens: False 5 | generate_neg_sample: False 6 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/mtl_config.yaml: -------------------------------------------------------------------------------- 1 | main_task: "reading_comprehension" 2 | auxiliary_task: "mask_language_model answer_matching" 3 | 4 | do_train: True 5 | do_predict: True 6 | 7 | checkpoint_path: "output" 8 | 9 | backbone_model: "bert_model" 10 | pretrain_model_path: "pretrain_model/squad2_model" 11 | pretrain_config_path: "pretrain_model/squad2_model/bert_config.json" 12 | vocab_path: "pretrain_model/squad2_model/vocab.txt" 13 | 14 | optimizer: "bert_optimizer" 15 | learning_rate: 3e-5 16 | lr_scheduler: "linear_warmup_decay" 17 | skip_steps: 100 18 | save_steps: 10000 19 | epoch: 2 20 | use_cuda: True 21 | warmup_proportion: 0.1 22 | weight_decay: 0.1 23 | do_lower_case: False 24 | max_seq_len: 512 25 | use_ema: True 26 | ema_decay: 0.9999 27 | random_seed: 0 28 | use_fp16: False 29 | loss_scaling: 1.0 30 | 31 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/reading_comprehension.yaml: -------------------------------------------------------------------------------- 1 | train_file: "data/mrqa/mrqa-combined.train.raw.json" 2 | predict_file: "data/mrqa/mrqa-combined.dev.raw.json" 3 | sample_rate: 0.02 4 | mix_ratio: 1.0 5 | batch_size: 4 6 | in_tokens: false 7 | doc_stride: 128 8 | with_negative: false 9 | max_query_length: 64 10 | max_answer_length: 30 11 | n_best_size: 20 12 | null_score_diff_threshold: 0.0 13 | verbose: False 14 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/run_build_palm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cp -r configs/* PALM/config/ 4 | cp configs/mtl_config.yaml PALM/ 5 | rm -rf PALM/data 6 | mv data PALM/ 7 | mv squad2_model PALM/pretrain_model 8 | cp run_multi_task.sh PALM/ 9 | -------------------------------------------------------------------------------- 
/MRQA2019-D-NET/multi_task_learning/run_evaluation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # path of dev data 19 | PATH_dev=./PALM/data/mrqa_dev 20 | # path of dev prediction 21 | BERT_MLM_PATH_prediction=./prediction_results/BERT_MLM_ema_predictions.json 22 | BERT_MLM_ParaRank_PATH_prediction=./prediction_results/BERT_MLM_ParaRank_ema_predictions.json 23 | 24 | files=$(ls ./prediction_results/*.log 2> /dev/null | wc -l) 25 | if [ "$files" != "0" ]; 26 | then 27 | rm prediction_results/BERT_MLM*.log 28 | fi 29 | 30 | # evaluation BERT_MLM 31 | echo "evaluate BERT_MLM model........................................." 32 | for dataset in `ls $PATH_dev/in_domain_dev/*.raw.json`;do 33 | echo $dataset >> prediction_results/BERT_MLM.log 34 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_PATH_prediction >> prediction_results/BERT_MLM.log 35 | done 36 | 37 | for dataset in `ls $PATH_dev/out_of_domain_dev/*.raw.json`;do 38 | echo $dataset >> prediction_results/BERT_MLM.log 39 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_PATH_prediction >> prediction_results/BERT_MLM.log 40 | done 41 | python scripts/macro_avg.py prediction_results/BERT_MLM.log 42 | 43 | # evaluation BERT_MLM_ParaRank_PATH_prediction 44 | echo "evaluate BERT_MLM_ParaRank model................................" 
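# Score the predictions against every in-domain dev set, then every out-of-domain
# dev set; the per-dataset F1/EM lines are appended to
# prediction_results/BERT_MLM_ParaRank.log and macro-averaged by scripts/macro_avg.py below.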
45 | for dataset in `ls $PATH_dev/in_domain_dev/*.raw.json`;do 46 | echo $dataset >> prediction_results/BERT_MLM_ParaRank.log 47 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_ParaRank_PATH_prediction >> prediction_results/BERT_MLM_ParaRank.log 48 | done 49 | 50 | 51 | for dataset in `ls $PATH_dev/out_of_domain_dev/*.raw.json`;do 52 | echo $dataset >> prediction_results/BERT_MLM_ParaRank.log 53 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_ParaRank_PATH_prediction >> prediction_results/BERT_MLM_ParaRank.log 54 | done 55 | python scripts/macro_avg.py prediction_results/BERT_MLM_ParaRank.log 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/run_multi_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # for gpu memory optimization 4 | export FLAGS_sync_nccl_allreduce=0 5 | export FLAGS_eager_delete_tensor_gb=1 6 | 7 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 8 | 9 | python -u mtl_run.py 10 | 11 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | 23 | 24 | def str2bool(v): 25 | # because argparse does not support to parse "true, False" as python 26 | # boolean directly 27 | return v.lower() in ("true", "t", "1") 28 | 29 | 30 | class ArgumentGroup(object): 31 | def __init__(self, parser, title, des): 32 | self._group = parser.add_argument_group(title=title, description=des) 33 | 34 | def add_arg(self, name, type, default, help, **kwargs): 35 | type = str2bool if type == bool else type 36 | self._group.add_argument( 37 | "--" + name, 38 | default=default, 39 | type=type, 40 | help=help + ' Default: %(default)s.', 41 | **kwargs) 42 | 43 | 44 | def print_arguments(args): 45 | print('----------- Configuration Arguments -----------') 46 | for arg, value in sorted(six.iteritems(vars(args))): 47 | print('%s: %s' % (arg, value)) 48 | print('------------------------------------------------') 49 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/combine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # ============================================================================== 4 | # Copyright 2017 Baidu.com, Inc. 
All Rights Reserved 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | """ 19 | This module add all train/dev data to a file named "mrqa-combined.raw.json". 20 | """ 21 | 22 | import json 23 | import argparse 24 | import glob 25 | 26 | # path of train/dev data 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('path', help='the path of train/dev data') 29 | args = parser.parse_args() 30 | path = args.path 31 | 32 | # all train/dev data files 33 | files = glob.glob(path + '/*.raw.json') 34 | print ('files:', files) 35 | 36 | # add all train/dev data to "datasets" 37 | with open(files[0]) as fin: 38 | datasets = json.load(fin) 39 | for i in range(1, len(files)): 40 | with open(files[i]) as fin: 41 | dataset = json.load(fin) 42 | datasets['data'].extend(dataset['data']) 43 | 44 | # save to "mrqa-combined.raw.json" 45 | with open(path + '/mrqa-combined.raw.json', 'w') as fout: 46 | json.dump(datasets, fout, indent=4) 47 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/combine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # path of train and dev data 19 | PATH_train=train 20 | PATH_dev=dev 21 | 22 | # add all train data to a file "$PATH_train/mrqa-combined.raw.json". 23 | python combine.py $PATH_train 24 | 25 | # add all dev data to a file "$PATH_dev/mrqa-combined.raw.json". 26 | python combine.py $PATH_dev -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/convert_mrqa2squad.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # path of train and dev data 19 | PATH_train=train 20 | PATH_dev=dev 21 | 22 | # Convert train data from MRQA format to SQuAD format 23 | NAME_LIST_train="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions" 24 | for name in $NAME_LIST_train;do 25 | echo "Converting training data from MRQA format to SQuAD format: ""$name" 26 | python convert_mrqa2squad.py $PATH_train/$name.jsonl 27 | done 28 | 29 | # Convert dev data from MRQA format to SQuAD format 30 | NAME_LIST_dev="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions BioASQ TextbookQA RelationExtraction DROP DuoRC RACE" 31 | for name in $NAME_LIST_dev;do 32 | echo "Converting development data from MRQA format to SQuAD format: ""$name" 33 | python convert_mrqa2squad.py --dev $PATH_dev/$name.jsonl 34 | done 35 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/dev/md5sum_dev.txt: -------------------------------------------------------------------------------- 1 | 05f3f16c5c31ba8e46ff5fa80647ac46 SQuAD.jsonl.gz 2 | 5c188c92a84ddffe2ab590ac7598bde2 NewsQA.jsonl.gz 3 | a7a3bd90db58524f666e757db659b047 TriviaQA.jsonl.gz 4 | bfcb304f1b3167693b627cbf0f98bc9e SearchQA.jsonl.gz 5 | 675de35c3605353ec039ca4d2854072d HotpotQA.jsonl.gz 6 | c0347eebbca02d10d1b07b9a64efe61d NaturalQuestions.jsonl.gz 7 | 6408dc4fcf258535d0ea8b125bba5fbb BioASQ.jsonl.gz 8 | 76ca9cc16625dd8da75758d64676e6a1 TextbookQA.jsonl.gz 9 | 128d318ea1391bf77234d8c1b69a45df RelationExtraction.jsonl.gz 10 | 8b03867e4da2817ef341707040d99785 DROP.jsonl.gz 11 | 9e66769a70fdfdec4906a4bcef5f3d71 DuoRC.jsonl.gz 12 | 94a7ef9b9ea9402671e5b0248b6a5395 RACE.jsonl.gz -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | 19 | # path to save data 20 | OUTPUT_train=train 21 | OUTPUT_dev=dev 22 | 23 | DATA_URL="https://s3.us-east-2.amazonaws.com/mrqa/release/v2" 24 | shopt -s expand_aliases; alias wget="wget -c --no-check-certificate"  # aliases are not expanded in non-interactive bash unless expand_aliases is set 25 | # download training datasets 26 | wget $DATA_URL/train/SQuAD.jsonl.gz -O $OUTPUT_train/SQuAD.jsonl.gz 27 | wget $DATA_URL/train/NewsQA.jsonl.gz -O $OUTPUT_train/NewsQA.jsonl.gz 28 | wget $DATA_URL/train/TriviaQA-web.jsonl.gz -O $OUTPUT_train/TriviaQA.jsonl.gz 29 | wget $DATA_URL/train/SearchQA.jsonl.gz -O $OUTPUT_train/SearchQA.jsonl.gz 30 | wget $DATA_URL/train/HotpotQA.jsonl.gz -O $OUTPUT_train/HotpotQA.jsonl.gz 31 | wget $DATA_URL/train/NaturalQuestionsShort.jsonl.gz -O $OUTPUT_train/NaturalQuestions.jsonl.gz 32 | 33 | # download the in-domain development data 34 | wget $DATA_URL/dev/SQuAD.jsonl.gz -O $OUTPUT_dev/SQuAD.jsonl.gz 35 | wget $DATA_URL/dev/NewsQA.jsonl.gz -O $OUTPUT_dev/NewsQA.jsonl.gz 36 | wget $DATA_URL/dev/TriviaQA-web.jsonl.gz -O $OUTPUT_dev/TriviaQA.jsonl.gz 37 | wget $DATA_URL/dev/SearchQA.jsonl.gz -O $OUTPUT_dev/SearchQA.jsonl.gz 38 | wget $DATA_URL/dev/HotpotQA.jsonl.gz -O $OUTPUT_dev/HotpotQA.jsonl.gz 39 | wget $DATA_URL/dev/NaturalQuestionsShort.jsonl.gz -O $OUTPUT_dev/NaturalQuestions.jsonl.gz 40 | 41 | # download the out-of-domain development data 42 | wget http://participants-area.bioasq.org/MRQA2019/ -O $OUTPUT_dev/BioASQ.jsonl.gz 43 | wget $DATA_URL/dev/TextbookQA.jsonl.gz -O $OUTPUT_dev/TextbookQA.jsonl.gz 44 | wget $DATA_URL/dev/RelationExtraction.jsonl.gz -O $OUTPUT_dev/RelationExtraction.jsonl.gz 45 | wget $DATA_URL/dev/DROP.jsonl.gz -O $OUTPUT_dev/DROP.jsonl.gz 46 | wget $DATA_URL/dev/DuoRC.ParaphraseRC.jsonl.gz -O $OUTPUT_dev/DuoRC.jsonl.gz 47 | wget $DATA_URL/dev/RACE.jsonl.gz -O $OUTPUT_dev/RACE.jsonl.gz 48 | 49 | # check md5sum for training datasets 50 | cd $OUTPUT_train 51 | if md5sum --status -c md5sum_train.txt; then 52 | echo "finished downloading training data" 53 | else 54 | echo "md5sum check failed!" 55 | fi 56 | cd .. 57 | 58 | # check md5sum for development data 59 | cd $OUTPUT_dev 60 | if md5sum --status -c md5sum_dev.txt; then 61 | echo "finished downloading development data" 62 | else 63 | echo "md5sum check failed!" 64 | fi 65 | cd .. 66 | 67 | # decompress training datasets 68 | echo "unzipping train data" 69 | NAME_LIST_train="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions" 70 | for name in $NAME_LIST_train;do 71 | gzip -d $OUTPUT_train/$name.jsonl.gz 72 | done 73 | 74 | # decompress development data 75 | echo "unzipping dev data" 76 | NAME_LIST_dev="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions BioASQ TextbookQA RelationExtraction DROP DuoRC RACE" 77 | for name in $NAME_LIST_dev;do 78 | gzip -d $OUTPUT_dev/$name.jsonl.gz 79 | done 80 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/macro_avg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import json 4 | import re 5 | 6 | def extract_score(line): 7 | score_json = json.loads(line) 8 | f1 = score_json['f1'] 9 | em = score_json['exact_match'] 10 | return float(f1), float(em) 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser( 15 | description='Calculate macro average for MRQA') 16 | parser.add_argument('input_file', help='Score file') 17 | args = parser.parse_args() 18 | with open(args.input_file) as fin: 19 | lines = list(map(str.strip, fin.readlines())) 20 | in_domain_scores = {} 21 | for dataset_id in range(0, 12, 2):  # 6 in-domain datasets: a dataset-name line followed by a score line 22 | f1, em = extract_score(lines[dataset_id+1]) 23 | in_domain_scores[lines[dataset_id]] = f1 24 | out_of_domain_scores = {} 25 | for dataset_id in range(12, 24, 2):  # 6 out-of-domain datasets 26 | f1, em = extract_score(lines[dataset_id+1]) 27 | out_of_domain_scores[lines[dataset_id]] = f1 28 | print('In domain avg: {}'.format(sum(in_domain_scores.values()) / len(in_domain_scores.values()))) 29 | print('Out of domain avg: {}'.format(sum(out_of_domain_scores.values()) / len(out_of_domain_scores.values()))) 30 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/train/md5sum_train.txt: -------------------------------------------------------------------------------- 1 | efd6a551d2697c20a694e933210489f8 SQuAD.jsonl.gz 2 | 182f4e977b849cb1dbfb796030b91444 NewsQA.jsonl.gz 3 | e18f586152612a9358c22f5536bfd32a TriviaQA.jsonl.gz 4 | 612245315e6e7c4d8446e5fcc3dc1086 SearchQA.jsonl.gz 5 | d212c7b3fc949bd0dc47d124e8c34907 HotpotQA.jsonl.gz 6 | e27d27bf7c49eb5ead43cef3f41de6be NaturalQuestions.jsonl.gz -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/wget_data.sh: -------------------------------------------------------------------------------- 1 | # wget train data 2 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/mrqa_multi_task_dataset.tar.gz 3 | tar -xvf 
mrqa_multi_task_dataset.tar.gz 4 | rm mrqa_multi_task_dataset.tar.gz 5 | 6 | # wget prediction results 7 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/muiti_task_prediction_results.tar.gz 8 | tar -xvf muiti_task_prediction_results.tar.gz 9 | rm muiti_task_prediction_results.tar.gz 10 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/wget_pretrained_model.sh: -------------------------------------------------------------------------------- 1 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/squad2_model.tar.gz 2 | tar -xvf squad2_model.tar.gz 3 | rm squad2_model.tar.gz 4 | 5 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/README.md: -------------------------------------------------------------------------------- 1 | # ensemble server system 2 | This directory contains the ensemble system for the three models that are fine-tuned on the MRQA in-domain data (i.e. models based on ERNIE2.0, XL-NET and BERT). The architecture of the ensemble system is shown in the figure below. We first start 3 independent model servers for ERNIE, XL-NET and BERT. We then start a main server to receive client requests, invoke the model servers and ensemble the model results. 3 | For convenience, users are able to explore **any ensemble combination** (e.g. ERNIE+XL-NET, BERT+XL-NET) by simply modifying the configurations. 4 | 5 | 

6 | [figure: architecture of the ensemble server (a main server in front of the ERNIE, XL-NET and BERT model servers)] 7 | 

8 | 9 | 10 | ## Environment 11 | In our test environment, we use 12 | 13 | - Python 2.7.13 14 | - PaddlePaddle 1.5.2 15 | - sentencepiece 0.1.83 16 | - flask 1.1.1 17 | - CUDA 9.0 18 | - cuDNN 7.0 19 | 20 | ## Download model parameters 21 | To download the model parameters that are fine-tuned on the MRQA in-domain data, run 22 | 23 | ``` 24 | bash wget_server_inference_model.sh 25 | ``` 26 | A folder named `infer_model` will appear in `ernie_server/`, `xlnet_server/` and `bert_server/`. 27 | 28 | ## Start servers 29 | 30 | Before starting the servers, please make sure the ports `5118` to `5121` are available, and specify the `gpu_id` in `start.sh` (by default `GPU 0` on the machine will be used). 31 | 32 | To start the servers, run 33 | 34 | ``` 35 | bash start.sh 36 | ``` 37 | The log for the main server will be saved in `main_server.log`, and the logs for the 3 model servers will be saved in `ernie_server/ernie.log`, `xlnet_server/xlnet.log` and `bert_server/bert.log`. 38 | 39 | By default, the main server will ensemble the results from ERNIE and XL-NET. To explore other ensemble combinations, one can change the configuration in `start.sh` (e.g. `python main_server.py --ernie --xlnet --bert` for 3 models, `python main_server.py --bert --xlnet` for BERT and XL-NET only). 40 | 41 | Note that in our test environment, we use a Tesla K40 (12G) and the three models are able to fit on a single card. For GPUs with smaller RAM, one can choose to put the three models on different cards by modifying the configurations in `start.sh`. 42 | 43 | ## Send requests 44 | Once the servers are successfully launched, one can use the client script to send requests. 45 | 46 | ``` 47 | cd client 48 | python client.py demo.txt results.txt 5121 49 | ``` 50 | This will read the examples in `demo.txt`, send requests to the main server, and save the results into `results.txt`. The format of the input file (i.e. `demo.txt`) needs to be in [MRQA official format](https://github.com/mrqa/MRQA-Shared-Task-2019). 
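For reference, the request/response cycle that `client.py` wraps can be reproduced in a few lines. This is a minimal sketch and not part of the original repository; the `context`/`qas`/`qid`/`question` fields are assumed to follow the MRQA jsonl schema, and the main server is assumed to be listening on its default port `5121`:

```
# ensemble_request_demo.py (hypothetical helper, for illustration only)
import json
import requests

# one MRQA-style example: a passage plus a list of questions about it
example = {
    "context": "Robert Boulter is an English film, television and theatre actor.",
    "qas": [{"qid": "demo-0", "question": "What nationality is Robert Boulter?"}]
}

# the main server replies with a {qid: answer_text} mapping for the example
response = requests.post("http://127.0.0.1:5121", json=example)
print(json.dumps(response.json(), indent=1))
```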
-------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/__main__.py: -------------------------------------------------------------------------------- 1 | from algorithm import optimization 2 | from algorithm import multitask 3 | from extension import fp16 4 | from module import transformer_encoder 5 | from toolkit import configure 6 | from toolkit import init 7 | from toolkit import placeholder 8 | from nets import bert 9 | 10 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/algorithm/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/algorithm/multitask.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | import os 4 | import sys 5 | import random 6 | from copy import deepcopy as copy 7 | import numpy as np 8 | import paddle 9 | import paddle.fluid as fluid 10 | import multiprocessing 11 | 12 | class Task: 13 | 14 | def __init__( 15 | self, 16 | conf, 17 | name = "", 18 | is_training = False, 19 | _DataProcesser = None, 20 | shared_name = ""): 21 | 22 | self.conf = copy(conf) 23 | 24 | self.name = name 25 | self.shared_name = shared_name 26 | 27 | self.is_training = is_training 28 | self.DataProcesser = _DataProcesser 29 | 30 | def _create_reader(self): 31 | raise NotImplementedError("Task:_create_reader not implemented") 32 | 33 | def _create_model(self): 34 | raise NotImplementedError("Task:_create_model not implemented") 35 | 36 | def prepare(self, args): 37 | raise NotImplementedError("Task:prepare not implemented") 38 | 39 | def train_step(self, args): 40 | raise NotImplementedError("Task:train_step not implemented") 41 | 42 | def predict(self, args): 43 | raise NotImplementedError("Task:_predict not implemented") 44 | 45 | 46 | class JointTask: 47 | 48 | def __init__(self): 49 | 50 | self.tasks = [] 51 | 52 | #self.startup_exe = None 53 | #self.train_exe = None 54 | 55 | self.exe = None 56 | 57 | self.share_vars_from = None 58 | 59 | self.startup_prog = fluid.Program() 60 | 61 | def __add__(self, task): 62 | 63 | assert isinstance(task, Task) 64 | 65 | self.tasks.append(task) 66 | 67 | return self 68 | 69 | def prepare(self, args): 70 | 71 | if args.use_cuda: 72 | place = fluid.CUDAPlace(0) 73 | dev_count = fluid.core.get_cuda_device_count() 74 | else: 75 | place = fluid.CPUPlace() 76 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) 77 | 78 | #self.startup_exe = fluid.Executor(place) 79 | self.exe = fluid.Executor(place) 80 | 81 | for idx, task in enumerate(self.tasks): 82 | if idx == 0: 83 | print("for idx : %d" % idx) 84 | task.prepare(args, exe = self.exe) 85 | self.share_vars_from = task.compiled_train_prog 86 | else: 87 | print("for idx : %d" % idx) 88 | task.prepare(args, exe = self.exe, 
share_vars_from = self.share_vars_from) 89 | 90 | def train(self, args): 91 | 92 | joint_steps = [] 93 | for i in xrange(0, len(self.tasks)): 94 | for _ in xrange(0, self.tasks[i].max_train_steps): 95 | joint_steps.append(i) 96 | 97 | self.tasks[0].train_step(args, exe = self.exe) 98 | 99 | random.shuffle(joint_steps) 100 | for next_task_id in joint_steps: 101 | self.tasks[next_task_id].train_step(args, exe = self.exe) 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | basetask_a = Task(None) 107 | 108 | basetask_b = Task(None) 109 | 110 | joint_tasks = JointTask() 111 | 112 | joint_tasks += basetask_a 113 | 114 | print(joint_tasks.tasks) 115 | 116 | joint_tasks += basetask_b 117 | 118 | print(joint_tasks.tasks) 119 | 120 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/extension/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/extension/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
% p.name) 46 | new_p = fluid.framework.Parameter( 47 | block=block, 48 | shape=v.shape, 49 | dtype=fluid.core.VarDesc.VarType.FP32, 50 | type=v.type, 51 | lod_level=v.lod_level, 52 | stop_gradient=p.stop_gradient, 53 | trainable=p.trainable, 54 | optimize_attr=p.optimize_attr, 55 | regularizer=p.regularizer, 56 | gradient_clip_attr=p.gradient_clip_attr, 57 | error_clip=p.error_clip, 58 | name=v.name + ".master") 59 | return new_p 60 | 61 | 62 | def create_master_params_grads(params_grads, main_prog, startup_prog, 63 | loss_scaling): 64 | master_params_grads = [] 65 | tmp_role = main_prog._current_role 66 | OpRole = fluid.core.op_proto_and_checker_maker.OpRole 67 | main_prog._current_role = OpRole.Backward 68 | for p, g in params_grads: 69 | # create master parameters 70 | master_param = copy_to_master_param(p, main_prog.global_block()) 71 | startup_master_param = startup_prog.global_block()._clone_variable( 72 | master_param) 73 | startup_p = startup_prog.global_block().var(p.name) 74 | cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) 75 | # cast fp16 gradients to fp32 before apply gradients 76 | if g.name.find("layer_norm") > -1: 77 | if loss_scaling > 1: 78 | scaled_g = g / float(loss_scaling) 79 | else: 80 | scaled_g = g 81 | master_params_grads.append([p, scaled_g]) 82 | continue 83 | master_grad = fluid.layers.cast(g, "float32") 84 | if loss_scaling > 1: 85 | master_grad = master_grad / float(loss_scaling) 86 | master_params_grads.append([master_param, master_grad]) 87 | main_prog._current_role = tmp_role 88 | return master_params_grads 89 | 90 | 91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog): 92 | for idx, m_p_g in enumerate(master_params_grads): 93 | train_p, _ = params_grads[idx] 94 | if train_p.name.find("layer_norm") > -1: 95 | continue 96 | with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): 97 | cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) 98 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/module/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/nets/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/toolkit/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | 22 | import numpy as np 23 | import paddle.fluid as fluid 24 | 25 | 26 | def cast_fp32_to_fp16(exe, main_program): 27 | print("Cast parameters to float16 data format.") 28 | for param in main_program.global_block().all_parameters(): 29 | if not param.name.endswith(".master"): 30 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 31 | data = np.array(param_t) 32 | if param.name.find("layer_norm") == -1: 33 | param_t.set(np.float16(data).view(np.uint16), exe.place) 34 | master_param_var = fluid.global_scope().find_var(param.name + 35 | ".master") 36 | if master_param_var is not None: 37 | master_param_var.get_tensor().set(data, exe.place) 38 | 39 | 40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False, skip_list = []): 41 | assert os.path.exists( 42 | init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path 43 | 44 | def existed_persitables(var): 45 | if not fluid.io.is_persistable(var): 46 | return False 47 | if var.name in skip_list: 48 | return False 49 | return os.path.exists(os.path.join(init_checkpoint_path, var.name)) 50 | 51 | fluid.io.load_vars( 52 | exe, 53 | init_checkpoint_path, 54 | main_program=main_program, 55 | predicate=existed_persitables) 56 | print("Load model from {}".format(init_checkpoint_path)) 57 | 58 | if use_fp16: 59 | cast_fp32_to_fp16(exe, main_program) 60 | 61 | 62 | def init_pretraining_params(exe, 63 | pretraining_params_path, 64 | main_program, 65 | use_fp16=False): 66 | assert os.path.exists(pretraining_params_path 67 | ), "[%s] cann't be found." 
% pretraining_params_path 68 | 69 | def existed_params(var): 70 | if not isinstance(var, fluid.framework.Parameter): 71 | return False 72 | return os.path.exists(os.path.join(pretraining_params_path, var.name)) 73 | 74 | fluid.io.load_vars( 75 | exe, 76 | pretraining_params_path, 77 | main_program=main_program, 78 | predicate=existed_params) 79 | print("Load pretraining parameters from {}.".format( 80 | pretraining_params_path)) 81 | 82 | if use_fp16: 83 | cast_fp32_to_fp16(exe, main_program) 84 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/toolkit/placeholder.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import six 7 | import ast 8 | import copy 9 | 10 | import numpy as np 11 | import paddle.fluid as fluid 12 | 13 | 14 | class Placeholder(object): 15 | 16 | def __init__(self): 17 | self.shapes = [] 18 | self.dtypes = [] 19 | self.lod_levels = [] 20 | self.names = [] 21 | 22 | def __init__(self, input_shapes): 23 | 24 | self.shapes = [] 25 | self.dtypes = [] 26 | self.lod_levels = [] 27 | self.names = [] 28 | 29 | for new_holder in input_shapes: 30 | shape = new_holder[0] 31 | dtype = new_holder[1] 32 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 33 | name = new_holder[3] if len(new_holder) >= 4 else "" 34 | 35 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 36 | 37 | def append_placeholder(self, shape, dtype, lod_level = 0, name = ""): 38 | self.shapes.append(shape) 39 | self.dtypes.append(dtype) 40 | self.lod_levels.append(lod_level) 41 | self.names.append(name) 42 | 43 | 44 | def build(self, capacity, reader_name, use_double_buffer = False): 45 | pyreader = fluid.layers.py_reader( 46 | capacity = capacity, 47 | shapes = self.shapes, 48 | dtypes = self.dtypes, 49 | lod_levels = self.lod_levels, 50 | name = reader_name, 51 | use_double_buffer = use_double_buffer) 52 | 53 | return [pyreader, fluid.layers.read_file(pyreader)] 54 | 55 | 56 | def __add__(self, new_holder): 57 | assert isinstance(new_holder, tuple) or isinstance(new_holder, list) 58 | assert len(new_holder) >= 2 59 | 60 | shape = new_holder[0] 61 | dtype = new_holder[1] 62 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 63 | name = new_holder[3] if len(new_holder) >= 4 else "" 64 | 65 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 66 | 67 | 68 | if __name__ == "__main__": 69 | print("hello world!") 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/start.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_fraction_of_gpu_memory_to_use=0.1 2 | port=$1 3 | gpu=$2 4 | export CUDA_VISIBLE_DEVICES=$gpu 5 | python start_service.py ./infer_model $port 6 | 7 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/start_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | BERT model service 5 | """ 6 | import json 7 | import sys 8 | import logging 9 | logging.basicConfig( 10 | level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 11 | ) 12 | import requests 13 | from flask import Flask 14 | from flask import Response 15 | from 
flask import request 16 | import mrc_service 17 | import model_wrapper 18 | import argparse 19 | 20 | 21 | assert len(sys.argv) == 3 or len(sys.argv) == 4, "Usage: python serve.py [process_mode]" 22 | if len(sys.argv) == 3: 23 | _, model_dir, port = sys.argv 24 | mode = 'parallel' 25 | else: 26 | _, model_dir, port, mode = sys.argv 27 | 28 | max_batch_size = 5 29 | 30 | app = Flask(__name__) 31 | app.logger.setLevel(logging.INFO) 32 | model = model_wrapper.BertModelWrapper(model_dir=model_dir) 33 | server = mrc_service.MRQAService('MRQA service', app.logger) 34 | 35 | @app.route('/', methods=['POST']) 36 | def mrqa_service(): 37 | """Description""" 38 | return server(model, process_mode=mode, max_batch_size=max_batch_size) 39 | 40 | 41 | if __name__ == '__main__': 42 | app.run(port=port, debug=False, threaded=False, processes=1) 43 | 44 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/task_reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/task_reader/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/client/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Query the MRQA model server to generate predictions. 5 | """ 6 | import argparse 7 | import json 8 | import requests 9 | import time 10 | 11 | 12 | if __name__ == '__main__': 13 | parse = argparse.ArgumentParser("") 14 | parse.add_argument("dataset") 15 | parse.add_argument("output_file") 16 | parse.add_argument("port", type=int) 17 | args = parse.parse_args() 18 | 19 | all_predictions = {} 20 | contexts = [] 21 | f = open(args.dataset) 22 | for example in f: 23 | context = json.loads(example) 24 | if 'header' in context: 25 | continue 26 | contexts.append(context) 27 | f.close() 28 | 29 | results = {} 30 | cnt = 0 31 | for context in contexts: 32 | cnt += 1 33 | start = time.time() 34 | pred = requests.post('http://127.0.0.1:%d' % args.port, json=context) 35 | result = pred.json() 36 | results.update(result) 37 | end=time.time() 38 | print('----- request cnt: {}, time elapsed: {:.2f} ms -----'.format(cnt, (end - start)*1000)) 39 | for qid, answer in result.items(): 40 | print('{}: {}'.format(qid, answer.encode('utf-8'))) 41 | with open(args.output_file,'w') as f: 42 | json.dump(results, f, indent=1) 43 | 44 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/__main__.py: -------------------------------------------------------------------------------- 1 | from algorithm import optimization 2 | from algorithm import multitask 3 | from extension import fp16 4 | from module import transformer_encoder 5 | from toolkit import configure 6 | from toolkit import init 7 | from toolkit import placeholder 8 | from nets import bert 9 | 10 | 
-------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/algorithm/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/algorithm/multitask.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | import os 4 | import sys 5 | import random 6 | from copy import deepcopy as copy 7 | import numpy as np 8 | import paddle 9 | import paddle.fluid as fluid 10 | import multiprocessing 11 | 12 | class Task: 13 | 14 | def __init__( 15 | self, 16 | conf, 17 | name = "", 18 | is_training = False, 19 | _DataProcesser = None, 20 | shared_name = ""): 21 | 22 | self.conf = copy(conf) 23 | 24 | self.name = name 25 | self.shared_name = shared_name 26 | 27 | self.is_training = is_training 28 | self.DataProcesser = _DataProcesser 29 | 30 | def _create_reader(self): 31 | raise NotImplementedError("Task:_create_reader not implemented") 32 | 33 | def _create_model(self): 34 | raise NotImplementedError("Task:_create_model not implemented") 35 | 36 | def prepare(self, args): 37 | raise NotImplementedError("Task:prepare not implemented") 38 | 39 | def train_step(self, args): 40 | raise NotImplementedError("Task:train_step not implemented") 41 | 42 | def predict(self, args): 43 | raise NotImplementedError("Task:_predict not implemented") 44 | 45 | 46 | class JointTask: 47 | 48 | def __init__(self): 49 | 50 | self.tasks = [] 51 | 52 | #self.startup_exe = None 53 | #self.train_exe = None 54 | 55 | self.exe = None 56 | 57 | self.share_vars_from = None 58 | 59 | self.startup_prog = fluid.Program() 60 | 61 | def __add__(self, task): 62 | 63 | assert isinstance(task, Task) 64 | 65 | self.tasks.append(task) 66 | 67 | return self 68 | 69 | def prepare(self, args): 70 | 71 | if args.use_cuda: 72 | place = fluid.CUDAPlace(0) 73 | dev_count = fluid.core.get_cuda_device_count() 74 | else: 75 | place = fluid.CPUPlace() 76 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) 77 | 78 | #self.startup_exe = fluid.Executor(place) 79 | self.exe = fluid.Executor(place) 80 | 81 | for idx, task in enumerate(self.tasks): 82 | if idx == 0: 83 | print("for idx : %d" % idx) 84 | task.prepare(args, exe = self.exe) 85 | self.share_vars_from = task.compiled_train_prog 86 | else: 87 | print("for idx : %d" % idx) 88 | task.prepare(args, exe = self.exe, share_vars_from = self.share_vars_from) 89 | 90 | def train(self, args): 91 | 92 | joint_steps = [] 93 | for i in xrange(0, len(self.tasks)): 94 | for _ in xrange(0, self.tasks[i].max_train_steps): 95 | joint_steps.append(i) 96 | 97 | self.tasks[0].train_step(args, exe = self.exe) 98 | 99 | random.shuffle(joint_steps) 100 | for next_task_id in joint_steps: 101 | self.tasks[next_task_id].train_step(args, exe = self.exe) 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | basetask_a = Task(None) 107 | 108 | basetask_b = Task(None) 109 | 110 | joint_tasks = JointTask() 111 | 112 | joint_tasks += basetask_a 113 | 114 | print(joint_tasks.tasks) 115 | 116 | joint_tasks += basetask_b 117 | 118 | print(joint_tasks.tasks) 119 | 120 | -------------------------------------------------------------------------------- 
/MRQA2019-D-NET/server/ernie_server/pdnlp/extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/extension/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/extension/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
% p.name) 46 | new_p = fluid.framework.Parameter( 47 | block=block, 48 | shape=v.shape, 49 | dtype=fluid.core.VarDesc.VarType.FP32, 50 | type=v.type, 51 | lod_level=v.lod_level, 52 | stop_gradient=p.stop_gradient, 53 | trainable=p.trainable, 54 | optimize_attr=p.optimize_attr, 55 | regularizer=p.regularizer, 56 | gradient_clip_attr=p.gradient_clip_attr, 57 | error_clip=p.error_clip, 58 | name=v.name + ".master") 59 | return new_p 60 | 61 | 62 | def create_master_params_grads(params_grads, main_prog, startup_prog, 63 | loss_scaling): 64 | master_params_grads = [] 65 | tmp_role = main_prog._current_role 66 | OpRole = fluid.core.op_proto_and_checker_maker.OpRole 67 | main_prog._current_role = OpRole.Backward 68 | for p, g in params_grads: 69 | # create master parameters 70 | master_param = copy_to_master_param(p, main_prog.global_block()) 71 | startup_master_param = startup_prog.global_block()._clone_variable( 72 | master_param) 73 | startup_p = startup_prog.global_block().var(p.name) 74 | cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) 75 | # cast fp16 gradients to fp32 before apply gradients 76 | if g.name.find("layer_norm") > -1: 77 | if loss_scaling > 1: 78 | scaled_g = g / float(loss_scaling) 79 | else: 80 | scaled_g = g 81 | master_params_grads.append([p, scaled_g]) 82 | continue 83 | master_grad = fluid.layers.cast(g, "float32") 84 | if loss_scaling > 1: 85 | master_grad = master_grad / float(loss_scaling) 86 | master_params_grads.append([master_param, master_grad]) 87 | main_prog._current_role = tmp_role 88 | return master_params_grads 89 | 90 | 91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog): 92 | for idx, m_p_g in enumerate(master_params_grads): 93 | train_p, _ = params_grads[idx] 94 | if train_p.name.find("layer_norm") > -1: 95 | continue 96 | with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): 97 | cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) 98 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/module/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/nets/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/toolkit/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | 22 | import numpy as np 23 | import paddle.fluid as fluid 24 | 25 | 26 | def cast_fp32_to_fp16(exe, main_program): 27 | print("Cast parameters to float16 data format.") 28 | for param in main_program.global_block().all_parameters(): 29 | if not param.name.endswith(".master"): 30 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 31 | data = np.array(param_t) 32 | if param.name.find("layer_norm") == -1: 33 | param_t.set(np.float16(data).view(np.uint16), exe.place) 34 | master_param_var = fluid.global_scope().find_var(param.name + 35 | ".master") 36 | if master_param_var is not None: 37 | master_param_var.get_tensor().set(data, exe.place) 38 | 39 | 40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False, skip_list = []): 41 | assert os.path.exists( 42 | init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path 43 | 44 | def existed_persitables(var): 45 | if not fluid.io.is_persistable(var): 46 | return False 47 | if var.name in skip_list: 48 | return False 49 | return os.path.exists(os.path.join(init_checkpoint_path, var.name)) 50 | 51 | fluid.io.load_vars( 52 | exe, 53 | init_checkpoint_path, 54 | main_program=main_program, 55 | predicate=existed_persitables) 56 | print("Load model from {}".format(init_checkpoint_path)) 57 | 58 | if use_fp16: 59 | cast_fp32_to_fp16(exe, main_program) 60 | 61 | 62 | def init_pretraining_params(exe, 63 | pretraining_params_path, 64 | main_program, 65 | use_fp16=False): 66 | assert os.path.exists(pretraining_params_path 67 | ), "[%s] cann't be found." 
% pretraining_params_path 68 | 69 | def existed_params(var): 70 | if not isinstance(var, fluid.framework.Parameter): 71 | return False 72 | return os.path.exists(os.path.join(pretraining_params_path, var.name)) 73 | 74 | fluid.io.load_vars( 75 | exe, 76 | pretraining_params_path, 77 | main_program=main_program, 78 | predicate=existed_params) 79 | print("Load pretraining parameters from {}.".format( 80 | pretraining_params_path)) 81 | 82 | if use_fp16: 83 | cast_fp32_to_fp16(exe, main_program) 84 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/toolkit/placeholder.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import six 7 | import ast 8 | import copy 9 | 10 | import numpy as np 11 | import paddle.fluid as fluid 12 | 13 | 14 | class Placeholder(object): 15 | 16 | def __init__(self): 17 | self.shapes = [] 18 | self.dtypes = [] 19 | self.lod_levels = [] 20 | self.names = [] 21 | 22 | def __init__(self, input_shapes): 23 | 24 | self.shapes = [] 25 | self.dtypes = [] 26 | self.lod_levels = [] 27 | self.names = [] 28 | 29 | for new_holder in input_shapes: 30 | shape = new_holder[0] 31 | dtype = new_holder[1] 32 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 33 | name = new_holder[3] if len(new_holder) >= 4 else "" 34 | 35 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 36 | 37 | def append_placeholder(self, shape, dtype, lod_level = 0, name = ""): 38 | self.shapes.append(shape) 39 | self.dtypes.append(dtype) 40 | self.lod_levels.append(lod_level) 41 | self.names.append(name) 42 | 43 | 44 | def build(self, capacity, reader_name, use_double_buffer = False): 45 | pyreader = fluid.layers.py_reader( 46 | capacity = capacity, 47 | shapes = self.shapes, 48 | dtypes = self.dtypes, 49 | lod_levels = self.lod_levels, 50 | name = reader_name, 51 | use_double_buffer = use_double_buffer) 52 | 53 | return [pyreader, fluid.layers.read_file(pyreader)] 54 | 55 | 56 | def __add__(self, new_holder): 57 | assert isinstance(new_holder, tuple) or isinstance(new_holder, list) 58 | assert len(new_holder) >= 2 59 | 60 | shape = new_holder[0] 61 | dtype = new_holder[1] 62 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 63 | name = new_holder[3] if len(new_holder) >= 4 else "" 64 | 65 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 66 | 67 | 68 | if __name__ == "__main__": 69 | print("hello world!") 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/start.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_fraction_of_gpu_memory_to_use=0.1 2 | port=$1 3 | gpu=$2 4 | export CUDA_VISIBLE_DEVICES=$gpu 5 | python start_service.py ./infer_model $port 6 | 7 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/start_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | ERNIE model service 5 | """ 6 | import json 7 | import sys 8 | import logging 9 | logging.basicConfig( 10 | level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 11 | ) 12 | import requests 13 | from flask import Flask 14 | from flask import Response 15 | from 
flask import request 16 | import mrc_service 17 | import model_wrapper as ernie_wrapper 18 | 19 | assert len(sys.argv) == 3 or len(sys.argv) == 4, "Usage: python serve.py [process_mode]" 20 | if len(sys.argv) == 3: 21 | _, model_dir, port = sys.argv 22 | mode = 'parallel' 23 | else: 24 | _, model_dir, port, mode = sys.argv 25 | 26 | app = Flask(__name__) 27 | app.logger.setLevel(logging.INFO) 28 | ernie_model = ernie_wrapper.ERNIEModelWrapper(model_dir=model_dir) 29 | server = mrc_service.BasicMRCService('Short answer MRC service', app.logger) 30 | 31 | @app.route('/', methods=['POST']) 32 | def mrqa_service(): 33 | """Description""" 34 | model = ernie_model 35 | return server(model, process_mode=mode, max_batch_size=5) 36 | 37 | 38 | if __name__ == '__main__': 39 | app.run(port=port, debug=False, threaded=False, processes=1) 40 | 41 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/task_reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/task_reader/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/main_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import sys 5 | import logging 6 | logging.basicConfig( 7 | level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 8 | ) 9 | import requests 10 | from flask import Flask 11 | from flask import Response 12 | from flask import request 13 | import numpy as np 14 | import argparse 15 | from multiprocessing.dummy import Pool as ThreadPool 16 | 17 | app = Flask(__name__) 18 | 19 | logger = logging.getLogger('flask') 20 | 21 | 22 | def ensemble_example(answers, n_models=None): 23 | if n_models is None: 24 | n_models = len(answers) 25 | answer_dict = dict() 26 | for nbest_predictions in answers: 27 | for prediction in nbest_predictions: 28 | score_list = answer_dict.setdefault(prediction['text'], []) 29 | score_list.append(prediction['probability']) 30 | 31 | ensemble_nbest_predictions = [] 32 | for answer, scores in answer_dict.items(): 33 | prediction = dict() 34 | prediction['text'] = answer 35 | prediction['probability'] = np.sum(scores) / n_models 36 | ensemble_nbest_predictions.append(prediction) 37 | 38 | ensemble_nbest_predictions = \ 39 | sorted(ensemble_nbest_predictions, key=lambda item: item['probability'], reverse=True) 40 | return ensemble_nbest_predictions 41 | 42 | 43 | @app.route('/', methods=['POST']) 44 | def mrqa_main(): 45 | """Description""" 46 | # parse input data 47 | pred = {} 48 | def _call_model(url, input_json): 49 | nbest = requests.post(url, json=input_json) 50 | return nbest 51 | try: 52 | input_json = request.get_json(silent=True) 53 | n_models = len(urls) 54 | pool = ThreadPool(n_models) 55 | results = [] 56 | for url in urls: 57 | result = pool.apply_async(_call_model, (url, input_json)) 58 | results.append(result.get()) 59 | pool.close() 60 | pool.join() 61 | nbests = [nbest.json()['results'] for nbest in results] 62 | qids = list(nbests[0].keys()) 63 | for qid in qids: 64 | ensemble_nbest = ensemble_example([nbest[qid] for nbest in nbests], n_models=n_models) 65 | pred[qid] = ensemble_nbest[0]['text'] 66 | except Exception as e: 67 | pred['error'] = 'empty' 68 | 
logger.exception(e) 69 | 70 | return Response(json.dumps(pred), mimetype='application/json') 71 | 72 | 73 | if __name__ == '__main__': 74 | url_1 = 'http://127.0.0.1:5118' # url for ernie 75 | url_2 = 'http://127.0.0.1:5119' # url for xl-net 76 | url_3 = 'http://127.0.0.1:5120' # url for bert 77 | parser = argparse.ArgumentParser('main server') 78 | parser.add_argument('--ernie', action='store_true', default=False, help="Include ERNIE") 79 | parser.add_argument('--xlnet', action='store_true', default=False, help="Include XL-NET") 80 | parser.add_argument('--bert', action='store_true', default=False, help="Include BERT") 81 | args = parser.parse_args() 82 | urls = [] 83 | if args.ernie: 84 | print('Include ERNIE model') 85 | urls.append(url_1) 86 | if args.xlnet: 87 | print('Include XL-NET model') 88 | urls.append(url_2) 89 | if args.bert: 90 | print('Include BERT model') 91 | urls.append(url_3) 92 | assert len(urls) > 0, "At least one model is required" 93 | app.run(host='127.0.0.1', port=5121, debug=False, threaded=False, processes=1) 94 | 95 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_id=0 4 | 5 | # start ernie service 6 | # usage: sh start.sh port gpu_id 7 | cd ernie_server 8 | nohup sh start.sh 5118 $gpu_id > ernie.log 2>&1 & 9 | cd .. 10 | 11 | # start xlnet service 12 | cd xlnet_server 13 | nohup sh start.sh 5119 $gpu_id > xlnet.log 2>&1 & 14 | cd .. 15 | 16 | # start bert service 17 | cd bert_server 18 | nohup sh start.sh 5120 $gpu_id > bert.log 2>&1 & 19 | cd .. 20 | 21 | sleep 3 22 | # start main server 23 | # usage: python main_server.py [--ernie] [--xlnet] [--bert] 24 | # the flags specify which models are included in the ensemble. 
25 | nohup python main_server.py --ernie --xlnet > main_server.log 2>&1 & 26 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/wget_server_inference_model.sh: -------------------------------------------------------------------------------- 1 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/mrqa2019_inference_model.tar.gz 2 | tar -xvf mrqa2019_inference_model.tar.gz 3 | rm mrqa2019_inference_model.tar.gz 4 | mv bert_infer_model bert_server/infer_model 5 | mv xlnet_infer_model xlnet_server/infer_model 6 | mv ernie_infer_model ernie_server/infer_model 7 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/xlnet_server/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/data_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | 6 | special_symbols = { 7 | "<unk>" : 0, 8 | "<s>" : 1, 9 | "</s>" : 2, 10 | "<cls>" : 3, 11 | "<sep>" : 4, 12 | "<pad>" : 5, 13 | "<mask>" : 6, 14 | "<eod>" : 7, 15 | "<eop>" : 8, 16 | } 17 | 18 | VOCAB_SIZE = 32000 19 | UNK_ID = special_symbols["<unk>"] 20 | CLS_ID = special_symbols["<cls>"] 21 | SEP_ID = special_symbols["<sep>"] 22 | MASK_ID = special_symbols["<mask>"] 23 | EOD_ID = special_symbols["<eod>"] 24 | 25 | 26 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/xlnet_server/model/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/serve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | XL-NET model service 5 | """ 6 | import json 7 | import sys 8 | import logging 9 | logging.basicConfig( 10 | level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 11 | ) 12 | import requests 13 | from flask import Flask 14 | from flask import Response 15 | from flask import request 16 | import server_utils 17 | import wrapper as bert_wrapper 18 | 19 | assert len(sys.argv) == 3 or len(sys.argv) == 4, "Usage: python serve.py <model_dir> <port> [process_mode]" 20 | if len(sys.argv) == 3: 21 | _, model_dir, port = sys.argv 22 | mode = 'parallel' 23 | else: 24 | _, model_dir, port, mode = sys.argv 25 | 26 | app = Flask(__name__) 27 | app.logger.setLevel(logging.INFO) 28 | bert_model = bert_wrapper.BertModelWrapper(model_dir=model_dir) 29 | server = server_utils.BasicMRCService('Short answer MRC service', app.logger) 30 | 31 | @app.route('/', methods=['POST']) 32 | def mrqa_service(): 33 | """Description""" 34 | model = bert_model 35 | return server(model, process_mode=mode, max_batch_size=5) 36 | # return server(model) 37 | 38 | 39 | if __name__ == '__main__': 40 | app.run(port=port, debug=False, threaded=False, processes=1) 41 | 42 | 
-------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/start.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_sync_nccl_allreduce=0 2 | export FLAGS_eager_delete_tensor_gb=1 3 | export FLAGS_fraction_of_gpu_memory_to_use=0.1 4 | port=$1 5 | gpu=$2 6 | export CUDA_VISIBLE_DEVICES=$gpu 7 | 8 | python serve.py ./infer_model $port 9 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/xlnet_config/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/xlnet_server/xlnet_config/spiece.model -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/xlnet_config/xlnet_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "d_head": 64, 3 | "d_inner": 4096, 4 | "d_model": 1024, 5 | "ff_activation": "gelu", 6 | "n_head": 16, 7 | "n_layer": 24, 8 | "n_token": 32000, 9 | "untie_r": true 10 | } --------------------------------------------------------------------------------
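As a side note on the configuration above, the fields follow the usual transformer layout: 16 attention heads of width 64 give the model width of 1024 (`n_head * d_head == d_model`). Below is a small hedged sketch of loading and sanity-checking the file; the relation check is an assumption about how the fields relate, not something the serving code is known to perform:

```
# check_xlnet_config.py (illustrative only)
import json

# load the XL-NET hyper-parameters shipped with the server
with open("xlnet_server/xlnet_config/xlnet_config.json") as f:
    config = json.load(f)

# 16 heads * 64 dims per head == 1024 model dims for this config
assert config["n_head"] * config["d_head"] == config["d_model"]
print("layers:", config["n_layer"], "vocab size:", config["n_token"])
```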