├── .gitignore ├── ACL2019-KTNET ├── downloaded_files.md5 ├── images │ └── architecture.png ├── reading_comprehension │ ├── eval_record_nell.sh │ ├── eval_record_twomemory.sh │ ├── eval_record_wordnet.sh │ ├── eval_squad_nell.sh │ ├── eval_squad_twomemory.sh │ ├── eval_squad_wordnet.sh │ ├── run_record_nell.sh │ ├── run_record_nell_finetune.sh │ ├── run_record_nell_pretrain.sh │ ├── run_record_twomemory.sh │ ├── run_record_twomemory_finetune.sh │ ├── run_record_twomemory_pretrain.sh │ ├── run_record_wordnet.sh │ ├── run_record_wordnet_finetune.sh │ ├── run_record_wordnet_pretrain.sh │ ├── run_squad_nell.sh │ ├── run_squad_nell_finetune.sh │ ├── run_squad_nell_pretrain.sh │ ├── run_squad_twomemory.sh │ ├── run_squad_twomemory_finetune.sh │ ├── run_squad_twomemory_pretrain.sh │ ├── run_squad_wordnet.sh │ ├── run_squad_wordnet_finetune.sh │ ├── run_squad_wordnet_pretrain.sh │ └── src │ │ ├── batching.py │ │ ├── batching_twomemory.py │ │ ├── eval │ │ └── __init__.py │ │ ├── model │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── layers.py │ │ └── transformer_encoder.py │ │ ├── optimization.py │ │ ├── reader │ │ ├── __init__.py │ │ ├── record.py │ │ ├── record_twomemory.py │ │ ├── squad.py │ │ └── squad_twomemory.py │ │ ├── run_record.py │ │ ├── run_record_twomemory.py │ │ ├── run_squad.py │ │ ├── run_squad_twomemory.py │ │ ├── tokenization.py │ │ └── utils │ │ ├── __init__.py │ │ ├── args.py │ │ ├── fp16.py │ │ └── init.py ├── readme.md └── retrieve_concepts │ ├── ner_tagging_squad │ └── tagging.py │ ├── retrieve_nell │ ├── nell_concept_list.txt │ └── retrieve.py │ ├── retrieve_wordnet │ ├── retrieve.py │ └── wordnet-mlj12-definitions.txt │ ├── tokenization_record │ ├── do_tokenization.py │ ├── tokenization.py │ ├── vocab.cased.txt │ └── vocab.uncased.txt │ └── tokenization_squad │ ├── do_tokenization.py │ ├── tokenization.py │ ├── vocab.cased.txt │ └── vocab.uncased.txt ├── DuQM ├── README.md ├── data.py ├── model.py ├── predict.py └── train.py ├── DuReader-2.0 ├── .gitignore ├── LICENSE ├── README.md ├── data │ ├── demo │ │ ├── devset │ │ │ └── search.dev.json │ │ ├── testset │ │ │ └── search.test.json │ │ └── trainset │ │ │ └── search.train.json │ ├── download.sh │ └── md5sum.txt ├── paddle │ ├── UPDATES.md │ ├── args.py │ ├── dataset.py │ ├── paragraph_extraction.py │ ├── preprocess.py │ ├── rc_model.py │ ├── run.py │ ├── run.sh │ └── vocab.py ├── tensorflow │ ├── dataset.py │ ├── layers │ │ ├── __init__.py │ │ ├── basic_rnn.py │ │ ├── match_layer.py │ │ └── pointer_net.py │ ├── rc_model.py │ ├── run.py │ └── vocab.py └── utils │ ├── __init__.py │ ├── download_thirdparty.sh │ ├── dureader_eval.py │ ├── get_vocab.py │ ├── marco_tokenize_data.py │ ├── marcov1_to_dureader.py │ ├── marcov2_to_v1_tojsonl.py │ ├── preprocess.py │ └── run_marco2dureader_preprocess.sh ├── DuReader-Checklist ├── README.md ├── checklist.png ├── download.sh ├── evaluate.py ├── predict.sh ├── run_eval.sh ├── src │ ├── args.py │ ├── models.py │ ├── run.py │ └── squad.py └── train.sh ├── DuReader-Retrieval ├── README.md └── figures │ └── example.png ├── DuReader-Robust ├── README.md ├── download.sh ├── evaluate.py ├── md5.txt ├── paddlehub_baseline │ ├── demo_dataset.py │ ├── paddlehub_reading_comprehension.sh │ └── reading_comprehension.py ├── predict.sh ├── src │ ├── __init__.py │ ├── _ce.py │ ├── batching.py │ ├── convert_params.py │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── ernie.py │ │ └── transformer_encoder.py │ ├── optimization.py │ ├── reader │ │ ├── __init__.py │ │ └── squad.py │ ├── run_mrc.py │ ├── 
tokenization.py │ └── utils │ │ ├── __init__.py │ │ ├── args.py │ │ ├── cards.py │ │ ├── fp16.py │ │ └── init.py └── train.sh ├── DuReader-vis ├── README.md └── images │ ├── intro-vis.png │ └── intro.png ├── MRQA2019-D-NET ├── README.md ├── images │ ├── D-NET_framework.png │ └── D-NET_server.png ├── multi_task_learning │ ├── README.md │ ├── configs │ │ ├── answer_matching.yaml │ │ ├── mask_language_model.yaml │ │ ├── mtl_config.yaml │ │ └── reading_comprehension.yaml │ ├── run_build_palm.sh │ ├── run_evaluation.sh │ ├── run_multi_task.sh │ ├── scripts │ │ ├── args.py │ │ ├── combine.py │ │ ├── combine.sh │ │ ├── convert_model_params.py │ │ ├── convert_mrqa2squad.py │ │ ├── convert_mrqa2squad.sh │ │ ├── dev │ │ │ └── md5sum_dev.txt │ │ ├── download_data.sh │ │ ├── evaluate-v1.1.py │ │ ├── macro_avg.py │ │ └── train │ │ │ └── md5sum_train.txt │ ├── wget_data.sh │ └── wget_pretrained_model.sh └── server │ ├── README.md │ ├── bert_server │ ├── model_wrapper.py │ ├── mrc_service.py │ ├── pdnlp │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── algorithm │ │ │ ├── __init__.py │ │ │ ├── multitask.py │ │ │ └── optimization.py │ │ ├── extension │ │ │ ├── __init__.py │ │ │ └── fp16.py │ │ ├── module │ │ │ ├── __init__.py │ │ │ └── transformer_encoder.py │ │ ├── nets │ │ │ ├── __init__.py │ │ │ ├── bert.py │ │ │ └── transformer_encoder.py │ │ └── toolkit │ │ │ ├── __init__.py │ │ │ ├── configure.py │ │ │ ├── init.py │ │ │ └── placeholder.py │ ├── start.sh │ ├── start_service.py │ └── task_reader │ │ ├── __init__.py │ │ ├── batching.py │ │ ├── mrqa.py │ │ └── tokenization.py │ ├── client │ ├── client.py │ └── demo.txt │ ├── ernie_server │ ├── model_wrapper.py │ ├── mrc_service.py │ ├── pdnlp │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── algorithm │ │ │ ├── __init__.py │ │ │ ├── multitask.py │ │ │ └── optimization.py │ │ ├── extension │ │ │ ├── __init__.py │ │ │ └── fp16.py │ │ ├── module │ │ │ ├── __init__.py │ │ │ └── transformer_encoder.py │ │ ├── nets │ │ │ ├── __init__.py │ │ │ └── bert.py │ │ └── toolkit │ │ │ ├── __init__.py │ │ │ ├── configure.py │ │ │ ├── init.py │ │ │ └── placeholder.py │ ├── start.sh │ ├── start_service.py │ └── task_reader │ │ ├── __init__.py │ │ ├── batching.py │ │ ├── mrqa_infer.py │ │ └── tokenization.py │ ├── main_server.py │ ├── start.sh │ ├── wget_server_inference_model.sh │ └── xlnet_server │ ├── __init__.py │ ├── data_utils.py │ ├── model │ ├── __init__.py │ ├── transformer_encoder.py │ └── xlnet.py │ ├── modeling.py │ ├── prepro_utils.py │ ├── serve.py │ ├── server_utils.py │ ├── squad_reader.py │ ├── squad_utils.py │ ├── start.sh │ ├── wrapper.py │ └── xlnet_config │ ├── spiece.model │ └── xlnet_config.json └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | models/ 2 | preprocessed/ 3 | raw/ 4 | *.pyc 5 | vocab.search 6 | local 7 | bleu_metric 8 | rouge_metric 9 | .idea/ 10 | add_copyright.py 11 | copyright 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /ACL2019-KTNET/downloaded_files.md5: -------------------------------------------------------------------------------- 1 | ad550852cf26241b20e8364e40340a99 train.json 2 | 60c70c4a7e8190483f9899a1c9bc4178 dev.json 3 | df45d93b87ca3c47b54a33e03fabf719 record_official_evaluate.py 4 | 981b29407e0affa3b1b156f72073b945 train-v1.1.json 5 | 3e85deb501d4e538b6bc56f786231552 dev-v1.1.json 6 | afb04912d18ff20696f7f88eed49bea9 squad_v1_official_evaluate.py 7 | 64010b964ae2ebf00148b3519a4aafc8 
KTNET_preprocess_squad_tagging_output.tar.gz 8 | e9352221127b7620427c18e39bfae7fc KTNET_preprocess_tokenize_result_record.tar.gz 9 | e52da2b1d096e889d32df267b82f9c77 KTNET_preprocess_tokenize_result_squad.tar.gz 10 | 89db2f5cfb07f0c44998d7f49098eb90 KTNET_preprocess_wordnet_concepts.tar.gz 11 | fb62db2fe82d88480ec853f3c6fa237a NELL.08m.1115.esv.csv.gz 12 | a68e68f9dcf4524b356163369c7f9f50 KTNET_preprocess_nell_concepts.tar.gz 13 | d9b62183c6367ffac3ee6f864c9425a5 wn_concept2vec.txt 14 | 1f69c3d092089b0a0652616b72d61bd8 nell_concept2vec.txt 15 | 5405c050e64fee4ffec17ee50f079b64 cased_L-24_H-1024_A-16.tar.gz 16 | 4bd6e911cdad39c543ba8922a70580cd KTNET_fine-tuned-model_record_both.tar.gz 17 | 43fa464d6aeabe6dc7a15315d4ea8288 KTNET_fine-tuned-model_record_nell.tar.gz 18 | 20aaefead331f64e435a94ac8a7b58aa KTNET_fine-tuned-model_record_wordnet.tar.gz 19 | 3abdb7be3fc5e3b98633c918acc25af4 KTNET_fine-tuned-model_squad_both.tar.gz 20 | 9232cf27adda9d64265ccb315e1b9c81 KTNET_fine-tuned-model_squad_nell.tar.gz 21 | a36fdd6d5c88e3e931bb3b28f9aeb4e2 KTNET_fine-tuned-model_squad_wordnet.tar.gz 22 | -------------------------------------------------------------------------------- /ACL2019-KTNET/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/images/architecture.png -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_record_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_record.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/ReCoRD/train.json \ 52 | --predict_file $DATA/ReCoRD/dev.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_nell true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_record_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | CKPT_DIR=$1 43 | 44 | python3 src/run_record_twomemory.py \ 45 | --batch_size 6 \ 46 | --do_train false \ 47 | --do_predict true \ 48 | --use_ema false \ 49 | --do_lower_case false \ 50 | --init_pretraining_params $BERT_DIR/params \ 51 | --init_checkpoint $CKPT_DIR \ 52 | --train_file $DATA/ReCoRD/train.json \ 53 | --predict_file $DATA/ReCoRD/dev.json \ 54 | --vocab_path $BERT_DIR/vocab.txt \ 55 | --bert_config_path $BERT_DIR/bert_config.json \ 56 | --freeze false \ 57 | --max_seq_len 384 \ 58 | --doc_stride 128 \ 59 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 60 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_record_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_record.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/ReCoRD/train.json \ 52 | --predict_file $DATA/ReCoRD/dev.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_wordnet true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_squad_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_squad.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/SQuAD/train-v1.1.json \ 52 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_nell true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_squad_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | CKPT_DIR=$1 43 | 44 | python3 src/run_squad_twomemory.py \ 45 | --batch_size 6 \ 46 | --do_train false \ 47 | --do_predict true \ 48 | --use_ema false \ 49 | --do_lower_case false \ 50 | --init_pretraining_params $BERT_DIR/params \ 51 | --init_checkpoint $CKPT_DIR \ 52 | --train_file $DATA/SQuAD/train-v1.1.json \ 53 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 54 | --vocab_path $BERT_DIR/vocab.txt \ 55 | --bert_config_path $BERT_DIR/bert_config.json \ 56 | --freeze false \ 57 | --max_seq_len 384 \ 58 | --doc_stride 128 \ 59 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 60 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/eval_squad_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | CKPT_DIR=$1 42 | 43 | python3 src/run_squad.py \ 44 | --batch_size 6 \ 45 | --do_train false \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --init_checkpoint $CKPT_DIR \ 51 | --train_file $DATA/SQuAD/train-v1.1.json \ 52 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 53 | --vocab_path $BERT_DIR/vocab.txt \ 54 | --bert_config_path $BERT_DIR/bert_config.json \ 55 | --freeze false \ 56 | --max_seq_len 384 \ 57 | --doc_stride 128 \ 58 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 59 | --use_wordnet true \ 60 | --random_seed 45 \ 61 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 62 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_nell_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint record_nell_first_stage_output/step_41970 \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_nell_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d record_nell_first_stage_log ]; then 23 | mkdir record_nell_first_stage_log 24 | else 25 | rm -r record_nell_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d record_nell_first_stage_output ]; then 29 | mkdir record_nell_first_stage_output 30 | else 31 | rm -r record_nell_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-4 \ 58 | --epoch 10 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints record_nell_first_stage_output/ 1>$PWD_DIR/record_nell_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_record_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 4 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_twomemory_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_record_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_checkpoint record_both_first_stage_output/step_41970 \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 4 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_twomemory_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d record_both_first_stage_log ]; then 23 | mkdir record_both_first_stage_log 24 | else 25 | rm -r record_both_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d record_both_first_stage_output ]; then 29 | mkdir record_both_first_stage_output 30 | else 31 | rm -r record_both_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_record_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --train_file $DATA/ReCoRD/train.json \ 51 | --predict_file $DATA/ReCoRD/dev.json \ 52 | --vocab_path $BERT_DIR/vocab.txt \ 53 | --bert_config_path $BERT_DIR/bert_config.json \ 54 | --freeze true \ 55 | --save_steps 4000 \ 56 | --weight_decay 0.01 \ 57 | --warmup_proportion 0.0 \ 58 | --learning_rate 3e-4 \ 59 | --epoch 10 \ 60 | --max_seq_len 384 \ 61 | --doc_stride 128 \ 62 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 63 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 64 | --use_wordnet true \ 65 | --use_nell true \ 66 | --random_seed 45 \ 67 | --checkpoints record_both_first_stage_output/ 1>$PWD_DIR/record_both_first_stage_log/train.log 2>&1 68 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_wordnet_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint record_wn_first_stage_output/step_41970 \ 48 | --train_file $DATA/ReCoRD/train.json \ 49 | --predict_file $DATA/ReCoRD/dev.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 4 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_record_wordnet_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d record_wn_first_stage_log ]; then 23 | mkdir record_wn_first_stage_log 24 | else 25 | rm -r record_wn_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d record_wn_first_stage_output ]; then 29 | mkdir record_wn_first_stage_output 30 | else 31 | rm -r record_wn_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_record.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/ReCoRD/train.json \ 50 | --predict_file $DATA/ReCoRD/dev.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-4 \ 58 | --epoch 10 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_wordnet true \ 63 | --random_seed 45 \ 64 | --checkpoints record_wn_first_stage_output/ 1>$PWD_DIR/record_wn_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_nell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_nell_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint sqd_nell_first_stage_output/step_3649 \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_nell true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_nell_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d sqd_nell_first_stage_log ]; then 23 | mkdir sqd_nell_first_stage_log 24 | else 25 | rm -r sqd_nell_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d sqd_nell_first_stage_output ]; then 29 | mkdir sqd_nell_first_stage_output 30 | else 31 | rm -r sqd_nell_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 1 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_nell true \ 63 | --random_seed 45 \ 64 | --checkpoints sqd_nell_first_stage_output/ 1>$PWD_DIR/sqd_nell_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_twomemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_squad_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 3 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_twomemory_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_squad_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --do_lower_case false \ 48 | --init_checkpoint sqd_both_first_stage_output/step_3649 \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze false \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.1 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 3 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 62 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 63 | --use_wordnet true \ 64 | --use_nell true \ 65 | --random_seed 45 \ 66 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 67 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_twomemory_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d sqd_both_first_stage_log ]; then 23 | mkdir sqd_both_first_stage_log 24 | else 25 | rm -r sqd_both_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d sqd_both_first_stage_output ]; then 29 | mkdir sqd_both_first_stage_output 30 | else 31 | rm -r sqd_both_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt 42 | 43 | python3 src/run_squad_twomemory.py \ 44 | --batch_size 6 \ 45 | --do_train true \ 46 | --do_predict true \ 47 | --use_ema false \ 48 | --do_lower_case false \ 49 | --init_pretraining_params $BERT_DIR/params \ 50 | --train_file $DATA/SQuAD/train-v1.1.json \ 51 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 52 | --vocab_path $BERT_DIR/vocab.txt \ 53 | --bert_config_path $BERT_DIR/bert_config.json \ 54 | --freeze true \ 55 | --save_steps 4000 \ 56 | --weight_decay 0.01 \ 57 | --warmup_proportion 0.0 \ 58 | --learning_rate 3e-5 \ 59 | --epoch 1 \ 60 | --max_seq_len 384 \ 61 | --doc_stride 128 \ 62 | --wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \ 63 | --nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \ 64 | --use_wordnet true \ 65 | --use_nell true \ 66 | --random_seed 45 \ 67 | --checkpoints sqd_both_first_stage_output/ 1>$PWD_DIR/sqd_both_first_stage_log/train.log 2>&1 68 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_wordnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_pretraining_params $BERT_DIR/params \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_wordnet_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d log ]; then 23 | mkdir log 24 | else 25 | rm -r log/* 26 | fi 27 | 28 | if [ ! 
-d output ]; then 29 | mkdir output 30 | else 31 | rm -r output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --do_lower_case false \ 47 | --init_checkpoint sqd_wn_first_stage_output/step_3649 \ 48 | --train_file $DATA/SQuAD/train-v1.1.json \ 49 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 50 | --vocab_path $BERT_DIR/vocab.txt \ 51 | --bert_config_path $BERT_DIR/bert_config.json \ 52 | --freeze false \ 53 | --save_steps 4000 \ 54 | --weight_decay 0.01 \ 55 | --warmup_proportion 0.1 \ 56 | --learning_rate 3e-5 \ 57 | --epoch 3 \ 58 | --max_seq_len 384 \ 59 | --doc_stride 128 \ 60 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 61 | --use_wordnet true \ 62 | --random_seed 45 \ 63 | --checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1 64 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/run_squad_wordnet_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | export LANG=en_US.UTF-8 19 | export LC_ALL=en_US.UTF-8 20 | export LC_CTYPE=en_US.UTF-8 21 | 22 | if [ ! -d sqd_wn_first_stage_log ]; then 23 | mkdir sqd_wn_first_stage_log 24 | else 25 | rm -r sqd_wn_first_stage_log/* 26 | fi 27 | 28 | if [ ! 
-d sqd_wn_first_stage_output ]; then 29 | mkdir sqd_wn_first_stage_output 30 | else 31 | rm -r sqd_wn_first_stage_output/* 32 | fi 33 | 34 | export FLAGS_cudnn_deterministic=true 35 | export FLAGS_cpu_deterministic=true 36 | 37 | PWD_DIR=`pwd` 38 | DATA=../data/ 39 | BERT_DIR=cased_L-24_H-1024_A-16 40 | CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt 41 | 42 | python3 src/run_squad.py \ 43 | --batch_size 6 \ 44 | --do_train true \ 45 | --do_predict true \ 46 | --use_ema false \ 47 | --do_lower_case false \ 48 | --init_pretraining_params $BERT_DIR/params \ 49 | --train_file $DATA/SQuAD/train-v1.1.json \ 50 | --predict_file $DATA/SQuAD/dev-v1.1.json \ 51 | --vocab_path $BERT_DIR/vocab.txt \ 52 | --bert_config_path $BERT_DIR/bert_config.json \ 53 | --freeze true \ 54 | --save_steps 4000 \ 55 | --weight_decay 0.01 \ 56 | --warmup_proportion 0.0 \ 57 | --learning_rate 3e-5 \ 58 | --epoch 1 \ 59 | --max_seq_len 384 \ 60 | --doc_stride 128 \ 61 | --concept_embedding_path $CPT_EMBEDDING_PATH \ 62 | --use_wordnet true \ 63 | --random_seed 45 \ 64 | --checkpoints sqd_wn_first_stage_output/ 1>$PWD_DIR/sqd_wn_first_stage_log/train.log 2>&1 65 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/eval/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/model/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/reader/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/ACL2019-KTNET/reading_comprehension/src/utils/__init__.py -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | import logging 23 | 24 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 25 | datefmt = '%m/%d/%Y %H:%M:%S', 26 | level = logging.INFO) 27 | logging.getLogger().setLevel(logging.INFO) 28 | logger = logging.getLogger(__name__) 29 | 30 | def str2bool(v): 31 | # because argparse does not support to parse "true, False" as python 32 | # boolean directly 33 | return v.lower() in ("true", "t", "1") 34 | 35 | 36 | class ArgumentGroup(object): 37 | def __init__(self, parser, title, des): 38 | self._group = parser.add_argument_group(title=title, description=des) 39 | 40 | def add_arg(self, name, type, default, help, **kwargs): 41 | type = str2bool if type == bool else type 42 | self._group.add_argument( 43 | "--" + name, 44 | default=default, 45 | type=type, 46 | help=help + ' Default: %(default)s.', 47 | **kwargs) 48 | 49 | 50 | def print_arguments(args): 51 | logger.info('----------- Configuration Arguments -----------') 52 | for arg, value in sorted(six.iteritems(vars(args))): 53 | logger.info('%s: %s' % (arg, value)) 54 | logger.info('------------------------------------------------') 55 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
% p.name) 46 | new_p = fluid.framework.Parameter( 47 | block=block, 48 | shape=v.shape, 49 | dtype=fluid.core.VarDesc.VarType.FP32, 50 | type=v.type, 51 | lod_level=v.lod_level, 52 | stop_gradient=p.stop_gradient, 53 | trainable=p.trainable, 54 | optimize_attr=p.optimize_attr, 55 | regularizer=p.regularizer, 56 | gradient_clip_attr=p.gradient_clip_attr, 57 | error_clip=p.error_clip, 58 | name=v.name + ".master") 59 | return new_p 60 | 61 | 62 | def create_master_params_grads(params_grads, main_prog, startup_prog, 63 | loss_scaling): 64 | master_params_grads = [] 65 | tmp_role = main_prog._current_role 66 | OpRole = fluid.core.op_proto_and_checker_maker.OpRole 67 | main_prog._current_role = OpRole.Backward 68 | for p, g in params_grads: 69 | # create master parameters 70 | master_param = copy_to_master_param(p, main_prog.global_block()) 71 | startup_master_param = startup_prog.global_block()._clone_variable( 72 | master_param) 73 | startup_p = startup_prog.global_block().var(p.name) 74 | cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) 75 | # cast fp16 gradients to fp32 before apply gradients 76 | if g.name.find("layer_norm") > -1: 77 | if loss_scaling > 1: 78 | scaled_g = g / float(loss_scaling) 79 | else: 80 | scaled_g = g 81 | master_params_grads.append([p, scaled_g]) 82 | continue 83 | master_grad = fluid.layers.cast(g, "float32") 84 | if loss_scaling > 1: 85 | master_grad = master_grad / float(loss_scaling) 86 | master_params_grads.append([master_param, master_grad]) 87 | main_prog._current_role = tmp_role 88 | return master_params_grads 89 | 90 | 91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog): 92 | for idx, m_p_g in enumerate(master_params_grads): 93 | train_p, _ = params_grads[idx] 94 | if train_p.name.find("layer_norm") > -1: 95 | continue 96 | with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): 97 | cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) 98 | -------------------------------------------------------------------------------- /ACL2019-KTNET/reading_comprehension/src/utils/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | import logging 22 | 23 | import numpy as np 24 | import paddle.fluid as fluid 25 | 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logging.getLogger().setLevel(logging.INFO) 30 | logger = logging.getLogger(__name__) 31 | 32 | def cast_fp32_to_fp16(exe, main_program): 33 | logger.info("Cast parameters to float16 data format.") 34 | for param in main_program.global_block().all_parameters(): 35 | if not param.name.endswith(".master"): 36 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 37 | data = np.array(param_t) 38 | if param.name.find("layer_norm") == -1: 39 | param_t.set(np.float16(data).view(np.uint16), exe.place) 40 | master_param_var = fluid.global_scope().find_var(param.name + 41 | ".master") 42 | if master_param_var is not None: 43 | master_param_var.get_tensor().set(data, exe.place) 44 | 45 | 46 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): 47 | assert os.path.exists( 48 | init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path 49 | 50 | def existed_persitables(var): 51 | if not fluid.io.is_persistable(var): 52 | return False 53 | return os.path.exists(os.path.join(init_checkpoint_path, var.name)) 54 | 55 | fluid.io.load_vars( 56 | exe, 57 | init_checkpoint_path, 58 | main_program=main_program, 59 | predicate=existed_persitables) 60 | logger.info("Load model from {}".format(init_checkpoint_path)) 61 | 62 | if use_fp16: 63 | cast_fp32_to_fp16(exe, main_program) 64 | 65 | 66 | def init_pretraining_params(exe, 67 | pretraining_params_path, 68 | main_program, 69 | use_fp16=False): 70 | assert os.path.exists(pretraining_params_path 71 | ), "[%s] cann't be found." % pretraining_params_path 72 | 73 | def existed_params(var): 74 | if not isinstance(var, fluid.framework.Parameter): 75 | return False 76 | return os.path.exists(os.path.join(pretraining_params_path, var.name)) 77 | 78 | fluid.io.load_vars( 79 | exe, 80 | pretraining_params_path, 81 | main_program=main_program, 82 | predicate=existed_params) 83 | logger.info("Load pretraining parameters from {}.".format( 84 | pretraining_params_path)) 85 | 86 | if use_fp16: 87 | cast_fp32_to_fp16(exe, main_program) 88 | -------------------------------------------------------------------------------- /ACL2019-KTNET/retrieve_concepts/retrieve_wordnet/retrieve.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ============================================================================== 3 | # Copyright 2019 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | import pickle 19 | import argparse 20 | import os 21 | import nltk 22 | import logging 23 | import string 24 | from tqdm import tqdm 25 | from nltk.corpus import wordnet as wn 26 | 27 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 28 | datefmt = '%m/%d/%Y %H:%M:%S', 29 | level = logging.INFO) 30 | logger = logging.getLogger(__name__) 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--train_token', type=str, default='../tokenization_record/tokens/train.tokenization.uncased.data', help='token file of train set') 35 | parser.add_argument('--eval_token', type=str, default='../tokenization_record/tokens/dev.tokenization.uncased.data', help='token file of dev set') 36 | parser.add_argument('--output_dir', type=str, default='output_record/', help='output directory') 37 | parser.add_argument('--no_stopwords', action='store_true', help='ignore stopwords') 38 | parser.add_argument('--ignore_length', type=int, default=0, help='ignore words with length <= ignore_length') 39 | args = parser.parse_args() 40 | 41 | # initialize mapping from offset id to wn18 synset name 42 | offset_to_wn18name_dict = {} 43 | fin = open('wordnet-mlj12-definitions.txt') 44 | for line in fin: 45 | info = line.strip().split('\t') 46 | offset_str, synset_name = info[0], info[1] 47 | offset_to_wn18name_dict[offset_str] = synset_name 48 | logger.info('Finished loading wn18 definition file.') 49 | 50 | 51 | # load pickled samples 52 | logger.info('Begin to load tokenization results...') 53 | train_samples = pickle.load(open(args.train_token, 'rb')) 54 | dev_samples = pickle.load(open(args.eval_token, 'rb')) 55 | logger.info('Finished loading tokenization results.') 56 | 57 | # build token set 58 | all_token_set = set() 59 | for sample in train_samples + dev_samples: 60 | for token in sample['query_tokens'] + sample['document_tokens']: 61 | all_token_set.add(token) 62 | logger.info('Finished making tokenization results into token set.') 63 | 64 | # load stopwords 65 | stopwords = set(nltk.corpus.stopwords.words('english')) 66 | logger.info('Finished loading stopwords list.') 67 | 68 | # retrive synsets 69 | logger.info('Begin to retrieve synsets...') 70 | token2synset = dict() 71 | stopword_cnt = 0 72 | punctuation_cnt = 0 73 | for token in tqdm(all_token_set): 74 | if token in set(string.punctuation): 75 | logger.info('{} is punctuation, skipped!'.format(token)) 76 | punctuation_cnt += 1 77 | continue 78 | if args.no_stopwords and token in stopwords: 79 | logger.info('{} is stopword, skipped!'.format(token)) 80 | stopword_cnt += 1 81 | continue 82 | if args.ignore_length > 0 and len(token) <= args.ignore_length: 83 | logger.info('{} is too short, skipped!'.format(token)) 84 | continue 85 | synsets = wn.synsets(token) 86 | wn18synset_names = [] 87 | for synset in synsets: 88 | offset_str = str(synset.offset()).zfill(8) 89 | if offset_str in offset_to_wn18name_dict: 90 | wn18synset_names.append(offset_to_wn18name_dict[offset_str]) 91 | if len(wn18synset_names) > 0: 92 | token2synset[token] = wn18synset_names 93 | logger.info('Finished retrieving sysnets.') 94 | logger.info('{} / {} tokens retrieved at lease 1 synset. 
{} stopwords and {} punctuations skipped.'.format(len(token2synset), len(all_token_set), stopword_cnt, punctuation_cnt)) 95 | 96 | if not os.path.exists(args.output_dir): 97 | os.makedirs(args.output_dir) 98 | 99 | with open(os.path.join(args.output_dir, 'retrived_synsets.data'), 'wb') as fout: 100 | pickle.dump(token2synset, fout) 101 | logger.info('Finished dumping retrieved synsets.') 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /DuQM/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import paddle 16 | import numpy as np 17 | 18 | from paddlenlp.datasets import MapDataset 19 | 20 | 21 | def create_dataloader(dataset, 22 | mode='train', 23 | batch_size=1, 24 | batchify_fn=None, 25 | trans_fn=None): 26 | if trans_fn: 27 | dataset = dataset.map(trans_fn) 28 | 29 | shuffle = True if mode == 'train' else False 30 | if mode == 'train': 31 | batch_sampler = paddle.io.DistributedBatchSampler( 32 | dataset, batch_size=batch_size, shuffle=shuffle) 33 | else: 34 | batch_sampler = paddle.io.BatchSampler( 35 | dataset, batch_size=batch_size, shuffle=shuffle) 36 | 37 | return paddle.io.DataLoader( 38 | dataset=dataset, 39 | batch_sampler=batch_sampler, 40 | collate_fn=batchify_fn, 41 | return_list=True) 42 | 43 | 44 | def read_text_pair(data_path, is_test=False): 45 | """Reads data.""" 46 | with open(data_path, 'r', encoding='utf-8') as f: 47 | for line in f: 48 | data = line.rstrip().split("\t") 49 | if is_test == False: 50 | if len(data) != 3: 51 | continue 52 | yield {'query1': data[0], 'query2': data[1], 'label': data[2]} 53 | else: 54 | if len(data) != 2: 55 | continue 56 | yield {'query1': data[0], 'query2': data[1]} 57 | 58 | 59 | 60 | def convert_example(example, tokenizer, max_seq_length=512, is_test=False): 61 | 62 | query, title = example["query1"], example["query2"] 63 | 64 | encoded_inputs = tokenizer( 65 | text=query, text_pair=title, max_seq_len=max_seq_length) 66 | 67 | input_ids = encoded_inputs["input_ids"] 68 | token_type_ids = encoded_inputs["token_type_ids"] 69 | 70 | if not is_test: 71 | label = np.array([example["label"]], dtype="int64") 72 | return input_ids, token_type_ids, label 73 | else: 74 | return input_ids, token_type_ids -------------------------------------------------------------------------------- /DuQM/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License" 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import paddle 16 | import paddle.nn as nn 17 | import paddle.nn.functional as F 18 | 19 | import paddlenlp as ppnlp 20 | 21 | 22 | class QuestionMatching(nn.Layer): 23 | def __init__(self, pretrained_model, dropout=None, rdrop_coef=0.0): 24 | super().__init__() 25 | self.ptm = pretrained_model 26 | self.dropout = nn.Dropout(dropout if dropout is not None else 0.1) 27 | 28 | # num_labels = 2 (similar or dissimilar) 29 | self.classifier = nn.Linear(self.ptm.config["hidden_size"], 2) 30 | self.rdrop_coef = rdrop_coef 31 | self.rdrop_loss = ppnlp.losses.RDropLoss() 32 | 33 | def forward(self, 34 | input_ids, 35 | token_type_ids=None, 36 | position_ids=None, 37 | attention_mask=None, 38 | do_evaluate=False): 39 | 40 | _, cls_embedding1 = self.ptm(input_ids, token_type_ids, position_ids, 41 | attention_mask) 42 | cls_embedding1 = self.dropout(cls_embedding1) 43 | logits1 = self.classifier(cls_embedding1) 44 | 45 | # For more information about R-drop please refer to this paper: https://arxiv.org/abs/2106.14448 46 | # Original implementation please refer to this code: https://github.com/dropreg/R-Drop 47 | if self.rdrop_coef > 0 and not do_evaluate: 48 | _, cls_embedding2 = self.ptm(input_ids, token_type_ids, position_ids, 49 | attention_mask) 50 | cls_embedding2 = self.dropout(cls_embedding2) 51 | logits2 = self.classifier(cls_embedding2) 52 | kl_loss = self.rdrop_loss(logits1, logits2) 53 | else: 54 | kl_loss = 0.0 55 | 56 | return logits1, kl_loss 57 | -------------------------------------------------------------------------------- /DuQM/predict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from functools import partial 16 | import argparse 17 | import sys 18 | import os 19 | import random 20 | import time 21 | 22 | import numpy as np 23 | import paddle 24 | import paddle.nn.functional as F 25 | import paddlenlp as ppnlp 26 | from paddlenlp.datasets import load_dataset 27 | from paddlenlp.data import Stack, Tuple, Pad 28 | 29 | from data import create_dataloader, read_text_pair, convert_example 30 | from model import QuestionMatching 31 | 32 | # yapf: disable 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--input_file", type=str, required=True, help="The full path of input file") 35 | parser.add_argument("--result_file", type=str, required=True, help="The result file name") 36 | parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.") 37 | parser.add_argument("--max_seq_length", default=256, type=int, help="The maximum total input sequence length after tokenization. " 38 | "Sequences longer than this will be truncated, sequences shorter will be padded.") 39 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") 40 | parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") 41 | args = parser.parse_args() 42 | # yapf: enable 43 | 44 | 45 | def predict(model, data_loader): 46 | """ 47 | Predicts the data labels. 48 | 49 | Args: 50 | model (obj:`QuestionMatching`): A model to calculate whether the question pair is semantically similar or not. 51 | data_loader (obj:`DataLoader`): An iterable that yields the processed ids of each text pair: [input_ids, token_type_ids]. 52 | Returns: 53 | results(obj:`numpy.ndarray`): the predicted logits of each text pair.
54 | """ 55 | batch_logits = [] 56 | 57 | model.eval() 58 | 59 | with paddle.no_grad(): 60 | for batch_data in data_loader: 61 | input_ids, token_type_ids = batch_data 62 | 63 | input_ids = paddle.to_tensor(input_ids) 64 | token_type_ids = paddle.to_tensor(token_type_ids) 65 | 66 | batch_logit, _ = model( 67 | input_ids=input_ids, token_type_ids=token_type_ids) 68 | 69 | batch_logits.append(batch_logit.numpy()) 70 | 71 | batch_logits = np.concatenate(batch_logits, axis=0) 72 | 73 | return batch_logits 74 | 75 | 76 | if __name__ == "__main__": 77 | paddle.set_device(args.device) 78 | 79 | pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained( 80 | 'ernie-gram-zh') 81 | tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained( 82 | 'ernie-gram-zh') 83 | 84 | trans_func = partial( 85 | convert_example, 86 | tokenizer=tokenizer, 87 | max_seq_length=args.max_seq_length, 88 | is_test=True) 89 | 90 | batchify_fn = lambda samples, fn=Tuple( 91 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids 92 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment_ids 93 | ): [data for data in fn(samples)] 94 | 95 | test_ds = load_dataset( 96 | read_text_pair, data_path=args.input_file, is_test=True, lazy=False) 97 | 98 | test_data_loader = create_dataloader( 99 | test_ds, 100 | mode='predict', 101 | batch_size=args.batch_size, 102 | batchify_fn=batchify_fn, 103 | trans_fn=trans_func) 104 | 105 | model = QuestionMatching(pretrained_model) 106 | 107 | if args.params_path and os.path.isfile(args.params_path): 108 | state_dict = paddle.load(args.params_path) 109 | model.set_dict(state_dict) 110 | print("Loaded parameters from %s" % args.params_path) 111 | else: 112 | raise ValueError( 113 | "Please set --params_path with correct pretrained model file") 114 | 115 | y_probs = predict(model, test_data_loader) 116 | y_preds = np.argmax(y_probs, axis=1) 117 | 118 | with open(args.result_file, 'w', encoding="utf-8") as f: 119 | for y_pred in y_preds: 120 | f.write(str(y_pred) + "\n") 121 | -------------------------------------------------------------------------------- /DuReader-2.0/.gitignore: -------------------------------------------------------------------------------- 1 | models/ 2 | preprocessed/ 3 | raw/ 4 | *.pyc 5 | vocab.search 6 | local 7 | bleu_metric 8 | rouge_metric 9 | .idea/ 10 | add_copyright.py 11 | copyright 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /DuReader-2.0/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | 19 | if [[ -d preprocessed ]] && [[ -d raw ]]; then 20 | echo "data exist" 21 | exit 0 22 | else 23 | wget -c https://dataset-bj.cdn.bcebos.com/dureader/dureader_raw.zip 24 | wget -c https://dataset-bj.cdn.bcebos.com/dureader/dureader_preprocessed.zip 25 | fi 26 | 27 | if md5sum --status -c md5sum.txt; then 28 | unzip dureader_raw.zip 29 | unzip dureader_preprocessed.zip 30 | else 31 | echo "download data error!" >> /dev/stderr 32 | exit 1 33 | fi 34 | -------------------------------------------------------------------------------- /DuReader-2.0/data/md5sum.txt: -------------------------------------------------------------------------------- 1 | dc7658b8cdf4f94b8714d130b7d15196 dureader_raw.zip 2 | 3db9a32e5a7c5375a604a70687b45479 dureader_preprocessed.zip 3 | -------------------------------------------------------------------------------- /DuReader-2.0/paddle/UPDATES.md: -------------------------------------------------------------------------------- 1 | # Notes on the updates of the PaddlePaddle baseline 2 | 3 | ## Updates 4 | 5 | We implement a BiDAF model with PaddlePaddle. Note that we have an update on the PaddlePaddle baseline (Feb 25, 2019). In this document, we give the details of the major updates: 6 | 7 | ### 1 Paragraph Extraction 8 | 9 | The first update is that we incorporate a paragraph extraction strategy to improve model performance (see the file `paddle/paragraph_extraction.py`). A similar strategy has been used in the top-1 system (Liu et al. 2018) at the [2018 Machine Reading Challenge](http://mrc2018.cipsc.org.cn/). 10 | 11 | The original baseline of DuReader (He et al. 2018) employed a simple strategy to select paragraphs for model training and testing. However, the paragraphs that include the true answers may not be selected. Hence, we want to make as much information as possible available for answer extraction. 12 | 13 | The details of the new paragraph extraction strategy are as follows. We apply the strategy to each document independently. For each document, 14 | - We remove the duplicated paragraphs in the document. 15 | - We concatenate the title and all paragraphs in the document with a predefined splitter if the resulting concatenation is shorter than a predefined maximum length. Otherwise, 16 | - We compute the F1 score of each paragraph relative to the question; 17 | - We concatenate the title and the top-K paragraphs (by F1 score) with a predefined splitter to form an extracted paragraph that is shorter than the predefined maximum length (a minimal illustrative sketch of this procedure is given after the references below). 18 | 19 | ### 2 The Prior of Document Ranking 20 | 21 | We also introduce a document-ranking prior derived from the search engine (see line #176 in `paddle/run.py`). The documents in DuReader are collected from search results, so the prior scores of document ranking are an important feature. We compute the prior scores from the training data and apply them in the testing stage. 22 | 23 | ## References 24 | 25 | - Liu, J., Wei, W., Sun, M., Chen, H., Du, Y. and Lin, D., 2018. A Multi-answer Multi-task Framework for Real-world Machine Reading Comprehension. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (pp. 2109-2118). 26 | 27 | - He, W., Liu, K., Liu, J., Lyu, Y., Zhao, S., Xiao, X., Liu, Y., Wang, Y., Wu, H., She, Q. and Liu, X., 2017. DuReader: a Chinese machine reading comprehension dataset from real-world applications. arXiv preprint arXiv:1711.05073.
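For illustration, here is a minimal, hypothetical sketch of the paragraph extraction strategy described in Section 1 above. It is not the repository's `paragraph_extraction.py`; the function names, the token-level F1 computation, and the length budget of 500 tokens are assumptions made for this example, the inputs are assumed to be already-segmented token lists, and K is determined implicitly by the length budget.

```python
from collections import Counter

def f1_score(para_tokens, question_tokens):
    """Token-level F1 between a paragraph and the question."""
    common = Counter(para_tokens) & Counter(question_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(para_tokens)
    recall = num_same / len(question_tokens)
    return 2 * precision * recall / (precision + recall)

def extract_paragraphs(title, paragraphs, question, max_len=500, splitter='<splitter>'):
    """Return one token list: title + selected paragraphs joined by a splitter token."""
    # 1. Remove duplicated paragraphs, keeping the original order.
    seen, unique_paras = set(), []
    for para in paragraphs:
        key = ''.join(para)
        if key not in seen:
            seen.add(key)
            unique_paras.append(para)
    # 2. If the title plus all paragraphs fit into the budget, keep everything.
    total_len = len(title) + sum(len(p) + 1 for p in unique_paras)
    if total_len <= max_len:
        selected = unique_paras
    else:
        # 3. Otherwise rank paragraphs by F1 against the question and greedily
        #    keep the best ones while staying under the maximum length.
        ranked = sorted(unique_paras, key=lambda p: f1_score(p, question), reverse=True)
        selected, used = [], len(title)
        for para in ranked:
            if used + len(para) + 1 > max_len:
                break
            selected.append(para)
            used += len(para) + 1
    # 4. Concatenate the title and the selected paragraphs with the splitter token.
    result = list(title)
    for para in selected:
        result.append(splitter)
        result.extend(para)
    return result
```

In the repository the extraction step is run offline through `run.sh --para_extraction`, which pipes the preprocessed JSON files through `paragraph_extraction.py` and writes the extracted data to `../data/extracted` (see `DuReader-2.0/paddle/run.sh`).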
28 | 29 | -------------------------------------------------------------------------------- /DuReader-2.0/paddle/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import argparse 20 | import distutils.util 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser(description=__doc__) 25 | parser.add_argument( 26 | '--prepare', 27 | action='store_true', 28 | help='create the directories, prepare the vocabulary and embeddings') 29 | parser.add_argument('--train', action='store_true', help='train the model') 30 | parser.add_argument('--evaluate', action='store_true', help='evaluate the model on dev set') 31 | parser.add_argument('--predict', action='store_true', 32 | help='predict the answers for test set with trained model') 33 | 34 | parser.add_argument("--embed_size", type=int, default=300, 35 | help="The dimension of embedding table. (default: %(default)d)") 36 | parser.add_argument("--hidden_size", type=int, default=150, 37 | help="The size of rnn hidden unit. (default: %(default)d)") 38 | parser.add_argument("--learning_rate", type=float, default=0.001, 39 | help="Learning rate used to train the model. (default: %(default)f)") 40 | parser.add_argument('--optim', default='adam', help='optimizer type') 41 | parser.add_argument("--weight_decay", type=float, default=0.0001, 42 | help="Weight decay. (default: %(default)f)") 43 | 44 | parser.add_argument('--drop_rate', type=float, default=0.0, help="Dropout probability") 45 | parser.add_argument('--random_seed', type=int, default=123) 46 | parser.add_argument("--batch_size", type=int, default=32, 47 | help="The sequence number of a mini-batch data. (default: %(default)d)") 48 | parser.add_argument("--pass_num", type=int, default=5, 49 | help="The number epochs to train. (default: %(default)d)") 50 | parser.add_argument("--use_gpu", type=distutils.util.strtobool, default=True, 51 | help="Whether to use gpu. (default: %(default)d)") 52 | parser.add_argument("--log_interval", type=int, default=50, 53 | help="log the train loss every n batches. (default: %(default)d)") 54 | 55 | parser.add_argument('--max_p_num', type=int, default=5) 56 | parser.add_argument('--max_a_len', type=int, default=200) 57 | parser.add_argument('--max_p_len', type=int, default=500) 58 | parser.add_argument('--max_q_len', type=int, default=60) 59 | parser.add_argument('--doc_num', type=int, default=5) 60 | 61 | parser.add_argument('--vocab_dir', default='../data/vocab', help='vocabulary') 62 | parser.add_argument("--save_dir", type=str, default="../data/models", 63 | help="Specify the path to save trained models.") 64 | parser.add_argument("--save_interval", type=int, default=1, 65 | help="Save the trained model every n passes. 
(default: %(default)d)") 66 | parser.add_argument("--load_dir", type=str, default="", 67 | help="Specify the path to load trained models.") 68 | parser.add_argument('--log_path', 69 | help='path of the log file. If not set, logs are printed to console') 70 | parser.add_argument('--result_dir', default='../data/results/', 71 | help='the dir to output the results') 72 | parser.add_argument('--result_name', default='test_result', 73 | help='the file name of the predicted results') 74 | 75 | parser.add_argument('--trainset', nargs='+', 76 | default=['../data/demo/trainset/search.train.json'], 77 | help='train dataset') 78 | parser.add_argument('--devset', nargs='+', 79 | default=['../data/demo/devset/search.dev.json'], 80 | help='dev dataset') 81 | parser.add_argument('--testset', nargs='+', 82 | default=['../data/demo/testset/search.test.json'], 83 | help='test dataset') 84 | 85 | parser.add_argument("--enable_ce", action='store_true', 86 | help="If set, run the task with continuous evaluation logs.") 87 | parser.add_argument('--para_print', action='store_true', help="Print debug info") 88 | parser.add_argument("--dev_interval", type=int, default=-1, 89 | help="evaluate on dev set loss every n batches. (default: %(default)d)") 90 | args = parser.parse_args() 91 | return args 92 | -------------------------------------------------------------------------------- /DuReader-2.0/paddle/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | paragraph_extraction () 5 | { 6 | SOURCE_DIR=$1 7 | TARGET_DIR=$2 8 | echo "Start paragraph extraction, this may take a few hours" 9 | echo "Source dir: $SOURCE_DIR" 10 | echo "Target dir: $TARGET_DIR" 11 | mkdir -p $TARGET_DIR/trainset 12 | mkdir -p $TARGET_DIR/devset 13 | mkdir -p $TARGET_DIR/testset 14 | 15 | echo "Processing trainset" 16 | cat $SOURCE_DIR/trainset/search.train.json | python paragraph_extraction.py train \ 17 | > $TARGET_DIR/trainset/search.train.json 18 | cat $SOURCE_DIR/trainset/zhidao.train.json | python paragraph_extraction.py train \ 19 | > $TARGET_DIR/trainset/zhidao.train.json 20 | 21 | echo "Processing devset" 22 | cat $SOURCE_DIR/devset/search.dev.json | python paragraph_extraction.py dev \ 23 | > $TARGET_DIR/devset/search.dev.json 24 | cat $SOURCE_DIR/devset/zhidao.dev.json | python paragraph_extraction.py dev \ 25 | > $TARGET_DIR/devset/zhidao.dev.json 26 | 27 | echo "Processing testset" 28 | cat $SOURCE_DIR/testset/search.test.json | python paragraph_extraction.py test \ 29 | > $TARGET_DIR/testset/search.test.json 30 | cat $SOURCE_DIR/testset/zhidao.test.json | python paragraph_extraction.py test \ 31 | > $TARGET_DIR/testset/zhidao.test.json 32 | echo "Paragraph extraction done!" 33 | } 34 | 35 | 36 | PROCESS_NAME="$1" 37 | case $PROCESS_NAME in 38 | --para_extraction) 39 | # Start paragraph extraction 40 | if [ ! 
-d ../data/preprocessed ]; then 41 | echo "Please download the preprocessed data first (See README - Preprocess)" 42 | exit 1 43 | fi 44 | paragraph_extraction ../data/preprocessed ../data/extracted 45 | ;; 46 | --prepare|--train|--evaluate|--predict) 47 | # Start Paddle baseline 48 | python run.py $@ 49 | ;; 50 | *) 51 | echo $"Usage: $0 {--para_extraction|--prepare|--train|--evaluate|--predict}" 52 | esac 53 | -------------------------------------------------------------------------------- /DuReader-2.0/tensorflow/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Empty __init__.py file 19 | 20 | Authors: Yizhong Wang(wangyizhong01@baidu.com) 21 | Date: 2017/09/20 12:00:00 22 | """ 23 | -------------------------------------------------------------------------------- /DuReader-2.0/tensorflow/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module provides wrappers for variants of RNN in Tensorflow 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 26 | """ 27 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 28 | Args: 29 | rnn_type: the type of rnn 30 | inputs: padded inputs into rnn 31 | length: the valid length of the inputs 32 | hidden_size: the size of hidden units 33 | layer_num: multiple rnn layer are stacked if layer_num > 1 34 | dropout_keep_prob: 35 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 36 | concatenated if this is True, else we add them. 
37 | Returns: 38 | RNN outputs and final state 39 | """ 40 | if not rnn_type.startswith('bi'): 41 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 42 | outputs, states = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 43 | if rnn_type.endswith('lstm'): 44 | c = [state.c for state in states] 45 | h = [state.h for state in states] 46 | states = h 47 | else: 48 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 49 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 50 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 51 | cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32 52 | ) 53 | states_fw, states_bw = states 54 | if rnn_type.endswith('lstm'): 55 | c_fw = [state_fw.c for state_fw in states_fw] 56 | h_fw = [state_fw.h for state_fw in states_fw] 57 | c_bw = [state_bw.c for state_bw in states_bw] 58 | h_bw = [state_bw.h for state_bw in states_bw] 59 | states_fw, states_bw = h_fw, h_bw 60 | if concat: 61 | outputs = tf.concat(outputs, 2) 62 | states = tf.concat([states_fw, states_bw], 1) 63 | else: 64 | outputs = outputs[0] + outputs[1] 65 | states = states_fw + states_bw 66 | return outputs, states 67 | 68 | 69 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 70 | """ 71 | Gets the RNN Cell 72 | Args: 73 | rnn_type: 'lstm', 'gru' or 'rnn' 74 | hidden_size: The size of hidden units 75 | layer_num: a MultiRNNCell is used if layer_num > 1 76 | dropout_keep_prob: dropout in RNN 77 | Returns: 78 | An RNN Cell 79 | """ 80 | cells = [] 81 | for i in range(layer_num): 82 | if rnn_type.endswith('lstm'): 83 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 84 | elif rnn_type.endswith('gru'): 85 | cell = tc.rnn.GRUCell(num_units=hidden_size) 86 | elif rnn_type.endswith('rnn'): 87 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 88 | else: 89 | raise NotImplementedError('Unsupported rnn type: {}'.format(rnn_type)) 90 | if dropout_keep_prob is not None: 91 | cell = tc.rnn.DropoutWrapper(cell, 92 | input_keep_prob=dropout_keep_prob, 93 | output_keep_prob=dropout_keep_prob) 94 | cells.append(cell) 95 | cells = tc.rnn.MultiRNNCell(cells, state_is_tuple=True) 96 | return cells 97 | 98 | 99 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This package implements some utility functions shared by PaddlePaddle 19 | and Tensorflow model implementations.
20 | 21 | Authors: liuyuan(liuyuan04@baidu.com) 22 | Date: 2017/10/06 18:23:06 23 | """ 24 | 25 | 26 | from .dureader_eval import compute_bleu_rouge 27 | from .dureader_eval import normalize 28 | from .preprocess import find_fake_answer 29 | from .preprocess import find_best_question_match 30 | 31 | __all__ = [ 32 | 'compute_bleu_rouge', 33 | 'normalize', 34 | 'find_fake_answer', 35 | 'find_best_question_match', 36 | ] 37 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/download_thirdparty.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # We use Bleu and Rouge as evaluation metrics, the calculation of these metrics 19 | # relies on the scoring scripts under "https://github.com/tylin/coco-caption" 20 | 21 | bleu_base_url='https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/bleu' 22 | bleu_files=("LICENSE" "__init__.py" "bleu.py" "bleu_scorer.py") 23 | 24 | rouge_base_url="https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/rouge" 25 | rouge_files=("__init__.py" "rouge.py") 26 | 27 | download() { 28 | local metric=$1; shift; 29 | local base_url=$1; shift; 30 | local fnames=($@); 31 | 32 | mkdir -p ${metric} 33 | for fname in ${fnames[@]}; 34 | do 35 | printf "downloading: %s\n" ${base_url}/${fname} 36 | wget --no-check-certificate ${base_url}/${fname} -O ${metric}/${fname} 37 | done 38 | } 39 | 40 | # prepare rouge 41 | download "rouge_metric" ${rouge_base_url} ${rouge_files[@]} 42 | 43 | # prepare bleu 44 | download "bleu_metric" ${bleu_base_url} ${bleu_files[@]} 45 | 46 | # convert python 2.x source code to python 3.x 47 | 2to3 -w "../utils/bleu_metric/bleu_scorer.py" 48 | 2to3 -w "../utils/bleu_metric/bleu.py" 49 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/get_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Utility function to generate vocabulary file. 19 | """ 20 | 21 | 22 | import argparse 23 | import sys 24 | import json 25 | 26 | from itertools import chain 27 | 28 | 29 | def get_vocab(files, vocab_file): 30 | """ 31 | Builds vocabulary file from field 'segmented_paragraphs' 32 | and 'segmented_question'. 33 | 34 | Args: 35 | files: A list of file names. 36 | vocab_file: The file that stores the vocabulary. 37 | """ 38 | vocab = {} 39 | for f in files: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | obj = json.loads(line.strip()) 43 | paras = [ 44 | chain(*d['segmented_paragraphs']) 45 | for d in obj['documents']] 46 | doc_tokens = chain(*paras) 47 | question_tokens = obj['segmented_question'] 48 | for t in list(doc_tokens) + question_tokens: 49 | vocab[t] = vocab.get(t, 0) + 1 50 | # output 51 | sorted_vocab = sorted([(v, c) for v, c in vocab.items()], 52 | key=lambda x: x[1], 53 | reverse=True) 54 | with open(vocab_file, 'w') as outf: 55 | for w, c in sorted_vocab: 56 | print >> outf, '{}\t{}'.format(w.encode('utf8'), c) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--files', nargs='+', required=True, 62 | help='file list to count vocab from.') 63 | parser.add_argument('--vocab', required=True, 64 | help='file to store counted vocab.') 65 | args = parser.parse_args() 66 | get_vocab(args.files, args.vocab) 67 | 68 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/marco_tokenize_data.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | 3 | import os, sys, json 4 | import nltk 5 | 6 | def _nltk_tokenize(sequence): 7 | tokens = nltk.word_tokenize(sequence) 8 | 9 | cur_char_offset = 0 10 | token_offsets = [] 11 | token_words = [] 12 | for token in tokens: 13 | cur_char_offset = sequence.find(token, cur_char_offset) 14 | token_offsets.append([cur_char_offset, cur_char_offset + len(token) - 1]) 15 | token_words.append(token) 16 | return token_offsets, token_words 17 | 18 | def segment(input_js): 19 | _, input_js['segmented_question'] = _nltk_tokenize(input_js['question']) 20 | for doc_id, doc in enumerate(input_js['documents']): 21 | doc['segmented_title'] = [] 22 | doc['segmented_paragraphs'] = [] 23 | for para_id, para in enumerate(doc['paragraphs']): 24 | _, seg_para = _nltk_tokenize(para) 25 | doc['segmented_paragraphs'].append(seg_para) 26 | if 'answers' in input_js: 27 | input_js['segmented_answers'] = [] 28 | for answer_id, answer in enumerate(input_js['answers']): 29 | _, seg_answer = _nltk_tokenize(answer) 30 | input_js['segmented_answers'].append(seg_answer) 31 | 32 | 33 | if __name__ == '__main__': 34 | if len(sys.argv) != 2: 35 | print('Usage: tokenize_data.py ') 36 | exit() 37 | 38 | nltk.download('punkt') 39 | 40 | for line in open(sys.argv[1]): 41 | dureader_js = json.loads(line.strip()) 42 | segment(dureader_js) 43 | print(json.dumps(dureader_js)) 44 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/marcov1_to_dureader.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | 8 | def trans(input_js): 9 | output_js = {} 10 | 
output_js['question'] = input_js['query'] 11 | output_js['question_type'] = input_js['query_type'] 12 | output_js['question_id'] = input_js['query_id'] 13 | output_js['fact_or_opinion'] = "" 14 | output_js['documents'] = [] 15 | for para_id, para in enumerate(input_js['passages']): 16 | doc = {} 17 | doc['title'] = "" 18 | if 'is_selected' in para: 19 | doc['is_selected'] = True if para['is_selected'] != 0 else False 20 | doc['paragraphs'] = [para['passage_text']] 21 | output_js['documents'].append(doc) 22 | 23 | if 'answers' in input_js: 24 | output_js['answers'] = input_js['answers'] 25 | return output_js 26 | 27 | 28 | if __name__ == '__main__': 29 | if len(sys.argv) != 2: 30 | print('Usage: marcov1_to_dureader.py ') 31 | exit() 32 | 33 | df = pd.read_json(sys.argv[1]) 34 | for row in df.iterrows(): 35 | marco_js = json.loads(row[1].to_json()) 36 | dureader_js = trans(marco_js) 37 | print(json.dumps(dureader_js)) 38 | -------------------------------------------------------------------------------- /DuReader-2.0/utils/marcov2_to_v1_tojsonl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import pandas as pd 4 | 5 | if __name__ == '__main__': 6 | if len(sys.argv) != 3: 7 | print('Usage: tojson.py ') 8 | exit() 9 | infile = sys.argv[1] 10 | outfile = sys.argv[2] 11 | df = pd.read_json(infile) 12 | with open(outfile, 'w') as f: 13 | for row in df.iterrows(): 14 | f.write(row[1].to_json() + '\n') -------------------------------------------------------------------------------- /DuReader-2.0/utils/run_marco2dureader_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | input_file=$1 4 | output_file=$2 5 | 6 | # convert the data from MARCO V2 (json) format to MARCO V1 (jsonl) format. 7 | # the script was forked from MARCO repo. 8 | # the format of MARCO V1 is much more easier to explore. 9 | python3 marcov2_to_v1_tojsonl.py $input_file $input_file.marcov1 10 | 11 | # convert the data from MARCO V1 format to DuReader format. 12 | python3 marcov1_to_dureader.py $input_file.marcov1 >$input_file.dureader_raw 13 | 14 | # tokenize the data. 15 | python3 marco_tokenize_data.py $input_file.dureader_raw >$input_file.segmented 16 | 17 | # find fake answers (indicating the start and end positions of answers in the document) for train and dev sets. 18 | # note that this should not be applied for test set, since there is no ground truth in test set. 19 | python preprocess.py $input_file.segmented >$output_file 20 | 21 | # remove the temporal data files. 22 | rm -rf $input_file.dureader_raw $input_file.segmented 23 | -------------------------------------------------------------------------------- /DuReader-Checklist/README.md: -------------------------------------------------------------------------------- 1 | # 阅读理解 DuReaderchecklist 2 | 3 | # 简介 4 | 5 | ## 1. 任务说明 6 | 机器阅读理解 (Machine Reading Comprehension) 是指让机器阅读文本,然后回答和阅读内容相关的问题。阅读理解是自然语言处理和人工智能领域的重要前沿课题,对于提升机器的智能水平、使机器具有持续知识获取的能力等具有重要价值,近年来受到学术界和工业界的广泛关注。 7 | 8 | 自然语言理解对机器学习模型各方面的能力均有极高的要求。然而,当前的机器阅读理解数据集大多都只采用单一的指标来评测模型的好坏,缺乏对模型语言理解能力的细粒度、多维度评测,导致模型的具体缺陷很难被发现和改进。为了解决这个问题,我们建立了细粒度的、多维度的评测数据集,从词汇理解、短语理解、语义角色理解、逻辑推理等多个维度检测模型的不足之处,从而推动阅读理解评测进入“精细化“时代。 9 | 10 | ## 2. 
数据集 11 | DuReaderchecklist数据集旨在通过建立checklist评测体系,系统性地评估当前模型能力的不足之处。目前checklist体系中涉及到的自然语言理解能力包含:词汇理解、短语理解、语义角色理解以及推理能力等等。具体的分类体系可参考下图: 12 | ![checklist_framwork](checklist.png) 13 | 14 | DuReaderchecklist数据集包含训练集、开发集以及测试集。其中开发集和测试集中,既包含和训练集同分布的in-domain样本,也包含了按照checklist体系分类后的样本。对于一个给定的问题q、一个篇章p及其标题t,系统需要根据篇章内容,判断该篇章p中是否包含给定问题的答案,如果是,则给出该问题的答案a;否则输出“no answer”。数据集中的每个样本,是一个四元组,例如: 15 | 16 | * * * 17 | 18 | **问题 q**: 番石榴汁热量 19 | 20 | **篇章 p**: 番石榴性温,味甜、酸、涩…,最重要的是番石榴所含的脂肪热量较低,一个番石榴所含的脂肪约0.9克重或84卡路里。比起苹果,番石榴所含有的脂肪少38%,卡路里少42%。 21 | 22 | **标题 t**: 番石榴汁的热量 - 妈妈网百科 23 | 24 | 25 | **参考答案 a**: [‘一个番石榴所含的脂肪约0.9克重或84卡路里’] 26 | 27 | * * * 28 | 29 | **问题 q**: 云南文山市多少人口? 30 | 31 | **篇章 p**: 云南省下辖8个市、8个少数民族自治州,面积39万平方千米,总人口4596万人,云南汉族人口为3062.9万人,占云南省总人口的66.63%... 32 | 33 | **标题 t**: 云南总人口数多少人,2019年云南人口数量统计(最新) 34 | 35 | 36 | **参考答案 a**: [‘无答案’] 37 | 38 | * * * 39 | 40 | 41 | # 快速开始 42 | 43 | ### 安装说明 44 | 45 | * PaddlePaddle 安装 46 | 47 | 本项目依赖于 PaddlePaddle 2.0 及以上版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装 48 | 49 | * PaddleNLP 安装 50 | 51 | ```shell 52 | pip install --upgrade paddlenlp -i https://pypi.org/simple 53 | ``` 54 | 55 | * 环境依赖 56 | 57 | Python的版本要求 3.6+ 58 | 59 | ### 目录结构 60 | 61 | ```text 62 | ├── README.md # 说明文档 63 | ├── evaluate.py # 评测脚本 64 | ├── run_eval.sh # 评测入口 65 | ├── train.sh # 训练入口 66 | ├── predict.sh # 预测入口 67 | ├── src/run_du.py # 训练、预测逻辑 68 | ├── src/squad.py # reader、后处理等 69 | ├── src/args.py # 超参配置 70 | ├── src/models.py # 模型 71 | ``` 72 | 73 | 74 | ### 数据准备 75 | 在运行基线之前,需要下载DuReaderchecklist数据集,运行 76 | 77 | ``` 78 | sh download.sh 79 | ``` 80 | 81 | 该命令完成之后,数据集会被保存到```dataset/```文件夹中。此外,基于[ERNIE-1.0](https://arxiv.org/abs/1904.09223)微调后的基线模型参数也会被保存在`finetuned_model/ `文件夹中,可供直接预测使用。 82 | 83 | ### 模型训练 84 | 85 | * 按如下方式可以使用默认配置进行训练,并在开发集做预测: 86 | 87 | ``` 88 | sh train.sh 89 | ``` 90 | 其中训练好的模型参数以及预测结果会被保存在`output/`件夹中。 91 | 92 | * 如需使用其他数据集进行数据增强 (例如[DuReaderrobust](https://github.com/PaddlePaddle/Research/tree/master/NLP/DuReader-Robust-BASELINE)训练集),可以使用以下命令 (数据格式需保持兼容): 93 | 94 | ``` 95 | sh train.sh --train_file path_to_dataset_file 96 | ``` 97 | 其中`path_to_dataset_file `是数据集路径,例如`dataset/train.json`。 98 | 99 | * 如需使用前一阶段训练好的参数进行热启动训练,可运行以下命令: 100 | 101 | ``` 102 | sh train.sh --model_name_or_path path_to_model_ckpt 103 | ``` 104 | 其中`path_to_model_ckpt`是模型参数路径,例如`output/model_2000`。 105 | 106 | 更为详细的参数配置可参考`train.sh`以及`args.py`。 107 | 108 | 109 | ### 模型预测 110 | * 如需使用训练好的参数进行预测,可参考以下命令: 111 | 112 | ``` 113 | sh predict.sh --model_name_or_path path_to_model_ckpt --predict_file path_to_dataset_file 114 | ``` 115 | 其中`path_to_model_ckpt`是模型参数路径,`path_to_dataset_file `是数据集路径。 116 | 117 | * 为了方便测试,我们也提供了已经微调好的模型参数。运行以下命令即可直接进行预测 118 | 119 | ``` 120 | sh predict.sh --model_name_or_path finetuned_model --predict_file dataset/dev.json 121 | ``` 122 | 预测结果会被保存在`output/`件夹中。 123 | 124 | ### 结果评估 125 | 评估脚本的运行参考以下命令: 126 | 127 | ``` 128 | sh run_eval.sh dataset_file pred_file 129 | ``` 130 | 131 | 其中`dataset_file `是数据集文件,`pred_file`是模型预测结果,例如 132 | 133 | ``` 134 | sh run_eval.sh dataset/dev.json output/dev_predictions.json 135 | ``` 136 | 下表是ERNIE-1.0基线模型在dev集合的效果: 137 | 138 | | Dataset | Num_examples | F1 | EM | 139 | | --- | :---: | --- | --- | 140 | | All | 1130 | 64.080 | 55.221 | 141 | | in-domain | 1000 |65.809 | 57.000 | 142 | | vocab | 35 | 44.113 |42.857 | 143 | | phrase | 35 | 63.345 | 62.857 | 144 | |semantic-role | 20 | 41.827 | 25.000 | 145 | |fault-tolerant | 20 | 46.741 | 25.000 | 146 | |reasoning| 20 | 53.429 | 35.000 | 147 | 148 | 149 | 150 | # 
其他 151 | 152 | ## 如何贡献代码 153 | 154 | 如果你可以修复某个issue或者增加一个新功能,欢迎给我们提交PR。如果对应的PR被接受了,我们将根据贡献的质量和难度进行打分(0-5分,越高越好)。如果你累计获得了10分,可以联系我们获得面试机会或者为你写推荐信。 155 | -------------------------------------------------------------------------------- /DuReader-Checklist/checklist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Checklist/checklist.png -------------------------------------------------------------------------------- /DuReader-Checklist/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download dataset and model parameters 3 | set -e 4 | 5 | echo "Download DuReader-checklist dataset" 6 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/lic2021/dureader_checklist.dataset.tar.gz 7 | tar -zxvf dureader_checklist.dataset.tar.gz 8 | rm dureader_checklist.dataset.tar.gz 9 | 10 | echo "Download fine-tuned parameters" 11 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/lic2021/dureader_checklist.finetuned_model.tar.gz 12 | tar -zxvf dureader_checklist.finetuned_model.tar.gz 13 | rm dureader_checklist.finetuned_model.tar.gz 14 | -------------------------------------------------------------------------------- /DuReader-Checklist/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | if [ -z "$CUDA_VISIBLE_DEVICES" ];then 5 | export CUDA_VISIBLE_DEVICES=0 6 | fi 7 | 8 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 9 | 10 | python -u src/run.py \ 11 | --model_type ernie \ 12 | --max_seq_length 512 \ 13 | --batch_size 4 \ 14 | --logging_steps 50 \ 15 | --max_answer_length 512 \ 16 | --output_dir output \ 17 | --version_2_with_negative \ 18 | --do_pred \ 19 | --cls_threshold 0.7 \ 20 | $@ 21 | -------------------------------------------------------------------------------- /DuReader-Checklist/run_eval.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -lt 2 ]; then 2 | echo "Usage: $0 dataset_file pred_file" 3 | exit 1 4 | fi 5 | python evaluate.py $1 $2 6 | for tag in 'in-domain' 'vocab' 'phrase' 'semantic-role' 'fault-tolerant' 'reasoning' 7 | do 8 | python evaluate.py $1 $2 --tag $tag 9 | done 10 | 11 | -------------------------------------------------------------------------------- /DuReader-Checklist/src/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_args(): 4 | parser = argparse.ArgumentParser(description=__doc__) 5 | parser.add_argument( 6 | "--train_file", 7 | type=str, 8 | default=None, 9 | help="Train data path.") 10 | parser.add_argument( 11 | "--predict_file", 12 | type=str, 13 | default=None, 14 | help="Predict data path.", 15 | nargs='+') 16 | parser.add_argument( 17 | "--model_type", 18 | default=None, 19 | type=str, 20 | required=True, 21 | help="Type of pre-trained model.") 22 | parser.add_argument( 23 | "--model_name_or_path", 24 | default=None, 25 | type=str, 26 | required=True, 27 | help="Path to pre-trained model or shortcut name of model.") 28 | parser.add_argument( 29 | "--output_dir", 30 | default=None, 31 | type=str, 32 | required=True, 33 | help="The output directory where the model predictions and checkpoints will be written." 
34 | ) 35 | parser.add_argument( 36 | "--max_seq_length", 37 | default=128, 38 | type=int, 39 | help="The maximum total input sequence length after tokenization. Sequences longer " 40 | "than this will be truncated, sequences shorter will be padded.") 41 | parser.add_argument( 42 | "--batch_size", 43 | default=8, 44 | type=int, 45 | help="Batch size per GPU/CPU for training.") 46 | parser.add_argument( 47 | "--learning_rate", 48 | default=5e-5, 49 | type=float, 50 | help="The initial learning rate for Adam.") 51 | parser.add_argument( 52 | "--weight_decay", 53 | default=0.0, 54 | type=float, 55 | help="Weight decay if we apply some.") 56 | parser.add_argument( 57 | "--adam_epsilon", 58 | default=1e-8, 59 | type=float, 60 | help="Epsilon for Adam optimizer.") 61 | parser.add_argument( 62 | "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 63 | parser.add_argument( 64 | "--num_train_epochs", 65 | default=3, 66 | type=int, 67 | help="Total number of training epochs to perform.") 68 | parser.add_argument( 69 | "--max_steps", 70 | default=-1, 71 | type=int, 72 | help="If > 0: set total number of training steps to perform. Override num_train_epochs." 73 | ) 74 | parser.add_argument( 75 | "--warmup_proportion", 76 | default=0.0, 77 | type=float, 78 | help="Proportion of training steps to perform linear learning rate warmup for." 79 | ) 80 | parser.add_argument( 81 | "--logging_steps", 82 | type=int, 83 | default=500, 84 | help="Log every X updates steps.") 85 | parser.add_argument( 86 | "--save_steps", 87 | type=int, 88 | default=500, 89 | help="Save checkpoint every X updates steps.") 90 | parser.add_argument( 91 | "--seed", type=int, default=42, help="random seed for initialization") 92 | parser.add_argument( 93 | "--device", 94 | type=str, 95 | default="gpu", 96 | help="Device for selecting for the training.") 97 | parser.add_argument( 98 | "--doc_stride", 99 | type=int, 100 | default=128, 101 | help="When splitting up a long document into chunks, how much stride to take between chunks." 102 | ) 103 | parser.add_argument( 104 | "--n_best_size", 105 | type=int, 106 | default=20, 107 | help="The total number of n-best predictions to generate in the nbest_predictions.json output file." 108 | ) 109 | parser.add_argument( 110 | "--max_query_length", type=int, default=64, help="Max query length.") 111 | parser.add_argument( 112 | "--max_answer_length", type=int, default=30, help="Max answer length.") 113 | parser.add_argument( 114 | "--cls_threshold", type=float, default=0.5, help="No answer threshold") 115 | parser.add_argument( 116 | "--do_lower_case", 117 | action='store_false', 118 | help="Whether to lower case the input text. Should be True for uncased models and False for cased models." 119 | ) 120 | parser.add_argument( 121 | "--verbose", action='store_true', help="Whether to output verbose log.") 122 | parser.add_argument( 123 | "--version_2_with_negative", 124 | action='store_true', 125 | help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true." 
126 | ) 127 | parser.add_argument( 128 | "--do_train", action='store_true', help="Whether to train the model.") 129 | 130 | parser.add_argument( 131 | "--do_pred", action='store_true', help="Whether to predict.") 132 | args = parser.parse_args() 133 | return args 134 | -------------------------------------------------------------------------------- /DuReader-Checklist/src/models.py: -------------------------------------------------------------------------------- 1 | from paddlenlp.transformers import ErniePretrainedModel, BertPretrainedModel, RobertaPretrainedModel 2 | from paddle import nn 3 | import paddle 4 | 5 | class ErnieForQuestionAnswering(ErniePretrainedModel): 6 | def __init__(self, ernie): 7 | super(ErnieForQuestionAnswering, self).__init__() 8 | self.ernie = ernie # allow ernie to be config 9 | self.classifier = nn.Linear(self.ernie.config["hidden_size"], 2) 10 | self.classifier_cls = nn.Linear(self.ernie.config["hidden_size"], 2) 11 | self.apply(self.init_weights) 12 | 13 | def forward(self, 14 | input_ids, 15 | token_type_ids=None, 16 | position_ids=None, 17 | attention_mask=None): 18 | sequence_output, pooled_output = self.ernie( 19 | input_ids, 20 | token_type_ids=token_type_ids, 21 | position_ids=position_ids, 22 | attention_mask=attention_mask) 23 | 24 | logits = self.classifier(sequence_output) 25 | logits = paddle.transpose(logits, perm=[2, 0, 1]) 26 | start_logits, end_logits = paddle.unstack(x=logits, axis=0) 27 | cls_logits = self.classifier_cls(pooled_output) 28 | 29 | return start_logits, end_logits, cls_logits 30 | 31 | class BertForQuestionAnswering(BertPretrainedModel): 32 | def __init__(self, bert): 33 | super(BertForQuestionAnswering, self).__init__() 34 | self.bert = bert # allow bert to be config 35 | self.classifier = nn.Linear(self.bert.config["hidden_size"], 2) 36 | self.classifier_cls = nn.Linear(self.bert.config["hidden_size"], 2) 37 | self.apply(self.init_weights) 38 | 39 | def forward(self, 40 | input_ids, 41 | token_type_ids=None, 42 | position_ids=None, 43 | attention_mask=None): 44 | sequence_output, pooled_output = self.bert( 45 | input_ids, 46 | token_type_ids=token_type_ids, 47 | position_ids=position_ids, 48 | attention_mask=attention_mask) 49 | 50 | logits = self.classifier(sequence_output) 51 | logits = paddle.transpose(logits, perm=[2, 0, 1]) 52 | start_logits, end_logits = paddle.unstack(x=logits, axis=0) 53 | cls_logits = self.classifier_cls(pooled_output) 54 | 55 | return start_logits, end_logits, cls_logits 56 | 57 | class RobertaForQuestionAnswering(RobertaPretrainedModel): 58 | def __init__(self, roberta): 59 | super(RobertaForQuestionAnswering, self).__init__() 60 | self.roberta = roberta # allow roberta to be config 61 | self.classifier = nn.Linear(self.roberta.config["hidden_size"], 2) 62 | self.classifier_cls = nn.Linear(self.roberta.config["hidden_size"], 2) 63 | self.apply(self.init_weights) 64 | 65 | def forward(self, 66 | input_ids, 67 | token_type_ids=None, 68 | position_ids=None, 69 | attention_mask=None): 70 | sequence_output, pooled_output = self.roberta( 71 | input_ids, 72 | token_type_ids=token_type_ids, 73 | position_ids=position_ids, 74 | attention_mask=attention_mask) 75 | 76 | logits = self.classifier(sequence_output) 77 | logits = paddle.transpose(logits, perm=[2, 0, 1]) 78 | start_logits, end_logits = paddle.unstack(x=logits, axis=0) 79 | cls_logits = self.classifier_cls(pooled_output) 80 | 81 | return start_logits, end_logits, cls_logits 
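# ---------------------------------------------------------------------------
# Illustrative decoding sketch (not part of the baseline): shows how the
# start/end logits and the [CLS] logits returned by the models above can be
# turned into a token-level answer span or a "no answer" decision via a
# threshold (cf. --cls_threshold in src/args.py). The greedy span search and
# the assumption that index 1 of the CLS head means "unanswerable" are
# simplifications; the actual post-processing lives in src/squad.py.
def decode_answer_span(start_logits, end_logits, cls_logits,
                       cls_threshold=0.5, max_answer_length=30):
    import paddle.nn.functional as F
    # Probability that the example has no answer, taken from the [CLS] head.
    no_answer_prob = F.softmax(cls_logits, axis=-1)[:, 1]
    spans = []
    for i in range(start_logits.shape[0]):
        start = int(paddle.argmax(start_logits[i]))
        # Greedy end search, restricted to a short window after the start token.
        window = end_logits[i][start:start + max_answer_length]
        end = start + int(paddle.argmax(window))
        # Fall back to "no answer" when the CLS head is confident enough.
        spans.append(None if float(no_answer_prob[i]) > cls_threshold else (start, end))
    return spans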
-------------------------------------------------------------------------------- /DuReader-Checklist/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | unset CUDA_VISIBLE_DEVICES 5 | 6 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 7 | 8 | python -m paddle.distributed.launch --gpus "0" src/run.py \ 9 | --model_type ernie \ 10 | --model_name_or_path ernie-1.0 \ 11 | --max_seq_length 512 \ 12 | --batch_size 2 \ 13 | --learning_rate 3e-5 \ 14 | --num_train_epochs 2 \ 15 | --logging_steps 50 \ 16 | --save_steps 1000 \ 17 | --warmup_proportion 0.1 \ 18 | --max_answer_length 512 \ 19 | --weight_decay 0.01 \ 20 | --output_dir output \ 21 | --version_2_with_negative \ 22 | --do_train \ 23 | --do_pred \ 24 | --train_file dataset/train.json \ 25 | --predict_file dataset/dev.json \ 26 | --cls_threshold 0.7 \ 27 | --device gpu \ 28 | $@ 29 | -------------------------------------------------------------------------------- /DuReader-Retrieval/figures/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Retrieval/figures/example.png -------------------------------------------------------------------------------- /DuReader-Robust/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download dataset and model parameters 3 | set -e 4 | 5 | echo "Download ERNIE 1.0" 6 | mkdir pretrained_model 7 | cd pretrained_model 8 | wget --no-check-certificate https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz 9 | tar -zxvf ERNIE_1.0_max-len-512.tar.gz 10 | rm ERNIE_1.0_max-len-512.tar.gz 11 | cd .. 
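# Note (illustrative, not part of the original script): md5.txt in this directory
# lists the expected checksums of the three downloaded archives. Each archive could
# be verified before its "rm" step, e.g. for the ERNIE archive while still inside
# pretrained_model/:
#     grep ERNIE_1.0_max-len-512.tar.gz ../md5.txt | md5sum -c -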
12 | 13 | echo "Download DuReader-robust dataset" 14 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/dureader_robust/data/dureader_robust-data.tar.gz 15 | tar -zxvf dureader_robust-data.tar.gz 16 | mv dureader_robust-data data 17 | rm dureader_robust-data.tar.gz 18 | 19 | echo "Download fine-tuned parameters" 20 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/dureader_robust/baseline/dureader_robust-baseline-finetuned.tar.gz 21 | tar -zxvf dureader_robust-baseline-finetuned.tar.gz 22 | rm dureader_robust-baseline-finetuned.tar.gz 23 | -------------------------------------------------------------------------------- /DuReader-Robust/md5.txt: -------------------------------------------------------------------------------- 1 | 553223945508e49483e899ae8548e5a9 dureader_robust-baseline-finetuned.tar.gz 2 | 020b26396f1b5932e451dba84d0b3dc8 dureader_robust-data.tar.gz 3 | b6d1da2fbc610ac13b86b5113f8819f7 ERNIE_1.0_max-len-512.tar.gz 4 | -------------------------------------------------------------------------------- /DuReader-Robust/paddlehub_baseline/paddlehub_reading_comprehension.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_eager_delete_tensor_gb=0.0 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | DATASET_PATH="../data" 5 | 6 | python -u reading_comprehension.py \ 7 | --dataset_path=${DATASET_PATH} \ 8 | --batch_size=8 \ 9 | --use_gpu=True \ 10 | --checkpoint_dir="./ckpt_dureader" \ 11 | --learning_rate=3e-5 \ 12 | --weight_decay=0.01 \ 13 | --warmup_proportion=0.1 \ 14 | --num_epoch=5 \ 15 | --max_seq_len=512 \ 16 | --use_data_parallel=False 17 | 18 | -------------------------------------------------------------------------------- /DuReader-Robust/paddlehub_baseline/reading_comprehension.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Finetuning on reading comprehension task """ 16 | 17 | import argparse 18 | import ast 19 | import json 20 | import os 21 | 22 | import paddle.fluid as fluid 23 | import paddlehub as hub 24 | 25 | from demo_dataset import DuReader 26 | 27 | # yapf: disable 28 | parser = argparse.ArgumentParser(__doc__) 29 | parser.add_argument("--dataset_path", type=str, default=None, help="The diretory to DuReader robust dataset") 30 | parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") 31 | parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") 32 | parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate used to train with warmup.") 33 | parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") 34 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") 35 | parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") 36 | parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.") 37 | parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.") 38 | parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.") 39 | args = parser.parse_args() 40 | # yapf: enable. 41 | 42 | 43 | if __name__ == '__main__': 44 | # 加载PaddleHub ERNIE预训练模型 45 | module = hub.Module(name="ernie") 46 | 47 | # ERNIE预训练模型输入变量inputs、输出变量outputs、以及模型program 48 | inputs, outputs, program = module.context( 49 | trainable=True, max_seq_len=args.max_seq_len) 50 | 51 | # 加载竞赛数据集并使用ReadingComprehensionReader读取数据 52 | dataset = DuReader(dataset_path=args.dataset_path) 53 | reader = hub.reader.ReadingComprehensionReader( 54 | dataset=dataset, 55 | vocab_path=module.get_vocab_path(), 56 | max_seq_len=args.max_seq_len, 57 | doc_stride=128, 58 | max_query_length=64) 59 | 60 | # 取ERNIE的字级别预训练输出 61 | seq_output = outputs["sequence_output"] 62 | 63 | # 设置运行program所需的feed_list 64 | feed_list = [ 65 | inputs["input_ids"].name, 66 | inputs["position_ids"].name, 67 | inputs["segment_ids"].name, 68 | inputs["input_mask"].name, 69 | ] 70 | 71 | # 选择Fine-tune优化策略 72 | strategy = hub.AdamWeightDecayStrategy( 73 | weight_decay=args.weight_decay, 74 | learning_rate=args.learning_rate, 75 | warmup_proportion=args.warmup_proportion) 76 | 77 | # 设置运行配置 78 | config = hub.RunConfig( 79 | eval_interval=500, 80 | use_pyreader=False, 81 | use_data_parallel=args.use_data_parallel, 82 | use_cuda=args.use_gpu, 83 | num_epoch=args.num_epoch, 84 | batch_size=args.batch_size, 85 | checkpoint_dir=args.checkpoint_dir, 86 | save_ckpt_interval=500, 87 | strategy=strategy) 88 | 89 | # 定义阅读理解Fine-tune Task 90 | # 由于竞赛数据集与cmrc2018数据集格式比较相似,此处sub_task应为cmrc2018 91 | # 否则运行可能出错 92 | reading_comprehension_task = hub.ReadingComprehensionTask( 93 | data_reader=reader, 94 | feature=seq_output, 95 | feed_list=feed_list, 96 | config=config, 97 | sub_task="cmrc2018", 98 | ) 99 | 100 | # 调用finetune_and_eval API,将会自动进行训练、评估以及保存最佳模型 101 | reading_comprehension_task.finetune_and_eval() 102 | 103 | # 数据集验证集部分数据用于预测 104 | data = dataset.get_dev_examples() 105 | # 调用predict接口, 打开return_result(True),将自动返回预测结果 106 | all_prediction = reading_comprehension_task.predict(data=data, load_best_model=False, return_result=True) 107 
| # 写入预测结果 108 | json.dump(all_prediction, open('submit.json', 'w'), ensure_ascii=False) 109 | -------------------------------------------------------------------------------- /DuReader-Robust/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | if [ -z "$CUDA_VISIBLE_DEVICES" ];then 5 | export CUDA_VISIBLE_DEVICES=0 6 | fi 7 | 8 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 9 | 10 | 11 | if [ -z "$PRETRAINED_MODEL_PATH" ];then 12 | PRETRAINED_MODEL_PATH="./pretrained_model" 13 | fi 14 | echo "PRETRAINED_MODEL_PATH=$PRETRAINED_MODEL_PATH" 15 | 16 | if [ -z "$CKPT" ];then 17 | CKPT="./finetuned_model" 18 | fi 19 | echo "CKPT=$CKPT" 20 | 21 | python -u src/run_mrc.py --use_cuda true \ 22 | --batch_size 24 \ 23 | --checkpoints output \ 24 | --init_checkpoint ${CKPT} \ 25 | --vocab_path ${PRETRAINED_MODEL_PATH}/vocab.txt \ 26 | --ernie_config ${PRETRAINED_MODEL_PATH}/ernie_config.json \ 27 | --max_seq_len 512 \ 28 | --do_lower_case true \ 29 | --doc_stride 128 \ 30 | --max_answer_length 30 \ 31 | --do_train false \ 32 | --do_predict true \ 33 | $@ 34 | 35 | 36 | -------------------------------------------------------------------------------- /DuReader-Robust/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/_ce.py: -------------------------------------------------------------------------------- 1 | ####this file is only used for continuous evaluation test! 2 | 3 | import os 4 | import sys 5 | sys.path.insert(0, os.environ['ceroot']) 6 | #sys.path.append('.') 7 | from kpi import CostKpi, DurationKpi, AccKpi 8 | 9 | #### NOTE kpi.py should shared in models in some way!!!! 10 | 11 | train_cost_xnli_card1_kpi = CostKpi( 12 | 'train_cost_xnli_card1', 0.002, 0, actived=True) 13 | train_acc_xnli_card1_kpi = AccKpi( 14 | 'train_acc_xnli_card1', 0.002, 0, actived=True) 15 | train_duration_xnli_card1_kpi = DurationKpi( 16 | 'train_duration_xnli_card1', 0.01, 0, actived=True) 17 | train_cost_xnli_card4_kpi = CostKpi( 18 | 'train_cost_xnli_card4', 0.002, 0, actived=True) 19 | train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True) 20 | train_duration_xnli_card4_kpi = DurationKpi( 21 | 'train_duration_xnli_card4', 0.03, 0, actived=True) 22 | 23 | tracking_kpis = [ 24 | train_cost_xnli_card1_kpi, 25 | train_acc_xnli_card1_kpi, 26 | train_duration_xnli_card1_kpi, 27 | train_cost_xnli_card4_kpi, 28 | train_acc_xnli_card4_kpi, 29 | train_duration_xnli_card4_kpi, 30 | ] 31 | 32 | 33 | def parse_log(log): 34 | ''' 35 | This method should be implemented by model developers. 
36 | The suggestion: 37 | each line in the log should be key, value, for example: 38 | " 39 | train_cost\t1.0 40 | test_cost\t1.0 41 | train_cost\t1.0 42 | train_cost\t1.0 43 | train_acc\t1.2 44 | " 45 | ''' 46 | for line in log.split('\n'): 47 | fs = line.strip().split('\t') 48 | print(fs) 49 | if len(fs) == 3 and fs[0] == 'kpis': 50 | print("-----%s" % fs) 51 | kpi_name = fs[1] 52 | kpi_value = float(fs[2]) 53 | yield kpi_name, kpi_value 54 | 55 | 56 | def log_to_ce(log): 57 | kpi_tracker = {} 58 | for kpi in tracking_kpis: 59 | kpi_tracker[kpi.name] = kpi 60 | 61 | for (kpi_name, kpi_value) in parse_log(log): 62 | print(kpi_name, kpi_value) 63 | kpi_tracker[kpi_name].add_record(kpi_value) 64 | kpi_tracker[kpi_name].persist() 65 | 66 | 67 | if __name__ == '__main__': 68 | log = sys.stdin.read() 69 | print("*****") 70 | print(log) 71 | print("****") 72 | log_to_ce(log) 73 | -------------------------------------------------------------------------------- /DuReader-Robust/src/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. 2 | # 3 | #Licensed under the Apache License, Version 2.0 (the "License"); 4 | #you may not use this file except in compliance with the License. 5 | #You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | #Unless required by applicable law or agreed to in writing, software 10 | #distributed under the License is distributed on an "AS IS" BASIS, 11 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | #See the License for the specific language governing permissions and 13 | #limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | import os 19 | import paddle.fluid as fluid 20 | 21 | 22 | def nccl2_prepare(trainer_id, startup_prog, main_prog): 23 | config = fluid.DistributeTranspilerConfig() 24 | config.mode = "nccl2" 25 | t = fluid.DistributeTranspiler(config=config) 26 | t.transpile( 27 | trainer_id, 28 | trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'), 29 | current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'), 30 | startup_program=startup_prog, 31 | program=main_prog) 32 | 33 | 34 | def prepare_for_multi_process(exe, build_strategy, train_prog): 35 | # prepare for multi-process 36 | trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) 37 | num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) 38 | if num_trainers < 2: return 39 | print("PADDLE_TRAINERS_NUM", num_trainers) 40 | print("PADDLE_TRAINER_ID", trainer_id) 41 | build_strategy.num_trainers = num_trainers 42 | build_strategy.trainer_id = trainer_id 43 | # NOTE(zcd): use multi processes to train the model, 44 | # and each process use one GPU card. 45 | startup_prog = fluid.Program() 46 | nccl2_prepare(trainer_id, startup_prog, train_prog) 47 | # the startup_prog are run two times, but it doesn't matter. 
48 | exe.run(startup_prog) 49 | -------------------------------------------------------------------------------- /DuReader-Robust/src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/model/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/reader/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-Robust/src/utils/__init__.py -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | 23 | import paddle.fluid as fluid 24 | 25 | 26 | def str2bool(v): 27 | # because argparse does not support to parse "true, False" as python 28 | # boolean directly 29 | return v.lower() in ("true", "t", "1") 30 | 31 | 32 | class ArgumentGroup(object): 33 | def __init__(self, parser, title, des): 34 | self._group = parser.add_argument_group(title=title, description=des) 35 | 36 | def add_arg(self, name, type, default, help, **kwargs): 37 | type = str2bool if type == bool else type 38 | self._group.add_argument( 39 | "--" + name, 40 | default=default, 41 | type=type, 42 | help=help + ' Default: %(default)s.', 43 | **kwargs) 44 | 45 | 46 | def print_arguments(args): 47 | print('----------- Configuration Arguments -----------') 48 | for arg, value in sorted(six.iteritems(vars(args))): 49 | print('%s: %s' % (arg, value)) 50 | print('------------------------------------------------') 51 | 52 | def check_cuda(use_cuda, err = \ 53 | "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ 54 | Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. 
Set use_cuda = False to run models on CPU.\n" 55 | ): 56 | try: 57 | if use_cuda == True and fluid.is_compiled_with_cuda() == False: 58 | print(err) 59 | sys.exit(1) 60 | except Exception as e: 61 | pass 62 | -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/cards.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | 18 | def get_cards(): 19 | """ 20 | get gpu cards number 21 | """ 22 | num = 0 23 | cards = os.environ.get('CUDA_VISIBLE_DEVICES', '') 24 | if cards != '': 25 | num = len(cards.split(",")) 26 | return num 27 | -------------------------------------------------------------------------------- /DuReader-Robust/src/utils/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | 22 | import numpy as np 23 | import paddle.fluid as fluid 24 | 25 | 26 | def cast_fp32_to_fp16(exe, main_program): 27 | print("Cast parameters to float16 data format.") 28 | for param in main_program.global_block().all_parameters(): 29 | if not param.name.endswith(".master"): 30 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 31 | data = np.array(param_t) 32 | if param.name.find("layer_norm") == -1: 33 | param_t.set(np.float16(data).view(np.uint16), exe.place) 34 | master_param_var = fluid.global_scope().find_var(param.name + 35 | ".master") 36 | if master_param_var is not None: 37 | master_param_var.get_tensor().set(data, exe.place) 38 | 39 | 40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): 41 | assert os.path.exists( 42 | init_checkpoint_path), "[%s] cann't be found." 
% init_checkpoint_path 43 | 44 | def existed_persitables(var): 45 | if not fluid.io.is_persistable(var): 46 | return False 47 | if os.path.exists(os.path.join(init_checkpoint_path, var.name)): 48 | print("INIT {}".format(var.name)) 49 | return True 50 | 51 | fluid.io.load_vars( 52 | exe, 53 | init_checkpoint_path, 54 | main_program=main_program, 55 | predicate=existed_persitables) 56 | print("Load model from {}".format(init_checkpoint_path)) 57 | 58 | if use_fp16: 59 | cast_fp32_to_fp16(exe, main_program) 60 | 61 | 62 | def init_pretraining_params(exe, 63 | pretraining_params_path, 64 | main_program, 65 | use_fp16=False): 66 | assert os.path.exists(pretraining_params_path 67 | ), "[%s] cann't be found." % pretraining_params_path 68 | 69 | def existed_params(var): 70 | if not isinstance(var, fluid.framework.Parameter): 71 | return False 72 | if os.path.exists(os.path.join(pretraining_params_path, var.name)): 73 | print("INIT {}".format(var.name)) 74 | return True 75 | else: 76 | print("SKIP {}".format(var.name)) 77 | return False 78 | 79 | fluid.io.load_vars( 80 | exe, 81 | pretraining_params_path, 82 | main_program=main_program, 83 | predicate=existed_params) 84 | print("Load pretraining parameters from {}.".format( 85 | pretraining_params_path)) 86 | 87 | if use_fp16: 88 | cast_fp32_to_fp16(exe, main_program) 89 | -------------------------------------------------------------------------------- /DuReader-Robust/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONIOENCODING=utf-8 3 | 4 | if [ -z "$CUDA_VISIBLE_DEVICES" ];then 5 | export CUDA_VISIBLE_DEVICES=0 6 | fi 7 | 8 | echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 9 | 10 | 11 | if [ -z "$PRETRAINED_MODEL_PATH" ];then 12 | PRETRAINED_MODEL_PATH="./pretrained_model" 13 | fi 14 | echo "PRETRAINED_MODEL_PATH=$PRETRAINED_MODEL_PATH" 15 | 16 | python -u src/run_mrc.py --use_cuda true \ 17 | --batch_size 12 \ 18 | --checkpoints output \ 19 | --init_pretraining_params ${PRETRAINED_MODEL_PATH}/params \ 20 | --vocab_path ${PRETRAINED_MODEL_PATH}/vocab.txt \ 21 | --ernie_config ${PRETRAINED_MODEL_PATH}/ernie_config.json \ 22 | --save_steps 10000 \ 23 | --warmup_proportion 0.1 \ 24 | --weight_decay 0.01 \ 25 | --epoch 2 \ 26 | --max_seq_len 512 \ 27 | --do_lower_case true \ 28 | --doc_stride 128 \ 29 | --learning_rate 3e-5 \ 30 | --skip_steps 25 \ 31 | --max_answer_length 30 \ 32 | --do_train true \ 33 | --do_predict true \ 34 | $@ 35 | 36 | -------------------------------------------------------------------------------- /DuReader-vis/images/intro-vis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-vis/images/intro-vis.png -------------------------------------------------------------------------------- /DuReader-vis/images/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/DuReader-vis/images/intro.png -------------------------------------------------------------------------------- /MRQA2019-D-NET/README.md: -------------------------------------------------------------------------------- 1 | # D-NET 2 | 3 | ## Introduction 4 | D-NET is a simple pre-training and fine-tuning framework that Baidu used for the MRQA (Machine Reading for Question Answering) 2019 Shared Task, which focused on the generalization of 
machine reading comprehension (MRC) models. Our system ranked first among all participants in terms of averaged F1 score. Additionally, we won first place on 10 of the 12 test sets and second place on the other two in terms of F1 score. 5 | 6 | In this repository, we release the code, data and model parameters that were used in the D-NET framework. 7 | 8 | ## Framework 9 | An overview of the D-NET framework is shown in the figure below. To improve the generalization capability of an MRC system, we mainly use two techniques, i.e. **multi-task learning (MTL)** and **ensemble of multiple pre-trained models**. 10 | 11 |

12 | ![D-NET framework overview](./images/D-NET_framework.png) 13 |

14 | 15 | 16 | #### Multi-task learning 17 | In addition to the MRC task, we further introduce several auxiliary tasks in the fine-tuning stage to learn more general language representations. Specifically, we use the following auxiliary tasks: 18 | 19 | - Unsupervised task: masked language model 20 | - Supervised tasks: 21 | - natural language inference 22 | - paragraph ranking 23 | 24 | We use the [PALM](https://github.com/PaddlePaddle/PALM) multi-task learning library based on [PaddlePaddle](https://www.paddlepaddle.org.cn/) in our experiments, which makes implementing new tasks and pre-trained models much easier than building them from scratch. To train on the MRQA datasets with MTL, please refer to the instructions [here](multi_task_learning) (under `multi_task_learning/`); an illustrative sketch of how the task mix ratios control batch sampling is included at the end of this file. 25 | 26 | #### Ensemble of multiple pre-trained models 27 | In our experiments, we found that an ensemble system based on different pre-trained models shows better generalization capability than a system based on any single one. In this repository, we provide the parameters of three models that are fine-tuned on the MRQA in-domain data, based on ERNIE2.0, XL-NET and BERT, respectively. The ensemble of these models is implemented as a set of servers. Please refer to the instructions [here](server) (under `server/`) for more details. 28 | 29 | ## Directory structure 30 | ``` 31 | ├── multi_task_learning/ # scripts for multi-task learning 32 | │ ├── configs/ # PALM config files 33 | │ ├── scripts/ # auxiliary scripts 34 | │ ├── wget_pretrained_model.sh # download pretrained model 35 | │ ├── wget_data.sh # download data for MTL 36 | │ ├── run_build_palm.sh # MTL preparation 37 | │ ├── run_evaluation.sh # evaluation 38 | │ ├── run_multi_task.sh # start MTL training 39 | ├── server/ # scripts for the ensemble of multiple pretrained models 40 | │ ├── ernie_server/ # ERNIE model server 41 | │ ├── xlnet_server/ # XL-NET model server 42 | │ ├── bert_server/ # BERT model server 43 | │ ├── main_server.py # main server scripts for ensemble 44 | │ ├── client/ # client scripts which read examples and make requests 45 | │ ├── wget_server_inference_model.sh # script for downloading model parameters 46 | │ ├── start.sh # script for launching all the servers 47 | ``` 48 | ## Copyright and License 49 | Copyright 2019 Baidu.com, Inc. All Rights Reserved Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
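The `mix_ratio` values in `multi_task_learning/configs/*.yaml` control how often each task contributes a training batch relative to the main MRC task. The snippet below is only a minimal sketch of this kind of proportional task sampling, with made-up function names; it is not the PALM scheduler itself.

```python
import random

# Relative sampling weights, mirroring the mix_ratio values in configs/*.yaml.
MIX_RATIOS = {
    "reading_comprehension": 1.0,  # main MRC task
    "mask_language_model": 2.0,    # auxiliary: masked language model
    "answer_matching": 0.8,        # auxiliary: answer/paragraph matching
}

def sample_task(mix_ratios):
    """Pick the task that supplies the next training batch, proportionally to its weight."""
    tasks, weights = zip(*mix_ratios.items())
    return random.choices(tasks, weights=weights, k=1)[0]

# Roughly 1 : 2 : 0.8 of the training batches come from the three tasks.
print(sample_task(MIX_RATIOS))
```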
50 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/images/D-NET_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/images/D-NET_framework.png -------------------------------------------------------------------------------- /MRQA2019-D-NET/images/D-NET_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/images/D-NET_server.png -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/answer_matching.yaml: -------------------------------------------------------------------------------- 1 | train_file: "data/am4mrqa/train.txt" 2 | mix_ratio: 0.8 3 | batch_size: 4 4 | in_tokens: False 5 | generate_neg_sample: False 6 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/mask_language_model.yaml: -------------------------------------------------------------------------------- 1 | train_file: "data/mlm4mrqa" 2 | mix_ratio: 2.0 3 | batch_size: 4 4 | in_tokens: False 5 | generate_neg_sample: False 6 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/mtl_config.yaml: -------------------------------------------------------------------------------- 1 | main_task: "reading_comprehension" 2 | auxiliary_task: "mask_language_model answer_matching" 3 | 4 | do_train: True 5 | do_predict: True 6 | 7 | checkpoint_path: "output" 8 | 9 | backbone_model: "bert_model" 10 | pretrain_model_path: "pretrain_model/squad2_model" 11 | pretrain_config_path: "pretrain_model/squad2_model/bert_config.json" 12 | vocab_path: "pretrain_model/squad2_model/vocab.txt" 13 | 14 | optimizer: "bert_optimizer" 15 | learning_rate: 3e-5 16 | lr_scheduler: "linear_warmup_decay" 17 | skip_steps: 100 18 | save_steps: 10000 19 | epoch: 2 20 | use_cuda: True 21 | warmup_proportion: 0.1 22 | weight_decay: 0.1 23 | do_lower_case: False 24 | max_seq_len: 512 25 | use_ema: True 26 | ema_decay: 0.9999 27 | random_seed: 0 28 | use_fp16: False 29 | loss_scaling: 1.0 30 | 31 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/configs/reading_comprehension.yaml: -------------------------------------------------------------------------------- 1 | train_file: "data/mrqa/mrqa-combined.train.raw.json" 2 | predict_file: "data/mrqa/mrqa-combined.dev.raw.json" 3 | sample_rate: 0.02 4 | mix_ratio: 1.0 5 | batch_size: 4 6 | in_tokens: false 7 | doc_stride: 128 8 | with_negative: false 9 | max_query_length: 64 10 | max_answer_length: 30 11 | n_best_size: 20 12 | null_score_diff_threshold: 0.0 13 | verbose: False 14 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/run_build_palm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cp -r configs/* PALM/config/ 4 | cp configs/mtl_config.yaml PALM/ 5 | rm -rf PALM/data 6 | mv data PALM/ 7 | mv squad2_model PALM/pretrain_model 8 | cp run_multi_task.sh PALM/ 9 | -------------------------------------------------------------------------------- 
/MRQA2019-D-NET/multi_task_learning/run_evaluation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # path of dev data 19 | PATH_dev=./PALM/data/mrqa_dev 20 | # path of dev prediction 21 | BERT_MLM_PATH_prediction=./prediction_results/BERT_MLM_ema_predictions.json 22 | BERT_MLM_ParaRank_PATH_prediction=./prediction_results/BERT_MLM_ParaRank_ema_predictions.json 23 | 24 | files=$(ls ./prediction_results/*.log 2> /dev/null | wc -l) 25 | if [ "$files" != "0" ]; 26 | then 27 | rm prediction_results/BERT_MLM*.log 28 | fi 29 | 30 | # evaluation BERT_MLM 31 | echo "evaluate BERT_MLM model........................................." 32 | for dataset in `ls $PATH_dev/in_domain_dev/*.raw.json`;do 33 | echo $dataset >> prediction_results/BERT_MLM.log 34 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_PATH_prediction >> prediction_results/BERT_MLM.log 35 | done 36 | 37 | for dataset in `ls $PATH_dev/out_of_domain_dev/*.raw.json`;do 38 | echo $dataset >> prediction_results/BERT_MLM.log 39 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_PATH_prediction >> prediction_results/BERT_MLM.log 40 | done 41 | python scripts/macro_avg.py prediction_results/BERT_MLM.log 42 | 43 | # evaluation BERT_MLM_ParaRank_PATH_prediction 44 | echo "evaluate BERT_MLM_ParaRank model................................" 
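# Score the predictions against every in-domain dev set, then every out-of-domain
# dev set; the per-dataset F1/EM lines are appended to
# prediction_results/BERT_MLM_ParaRank.log and macro-averaged by scripts/macro_avg.py below.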
45 | for dataset in `ls $PATH_dev/in_domain_dev/*.raw.json`;do 46 | echo $dataset >> prediction_results/BERT_MLM_ParaRank.log 47 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_ParaRank_PATH_prediction >> prediction_results/BERT_MLM_ParaRank.log 48 | done 49 | 50 | 51 | for dataset in `ls $PATH_dev/out_of_domain_dev/*.raw.json`;do 52 | echo $dataset >> prediction_results/BERT_MLM_ParaRank.log 53 | python scripts/evaluate-v1.1.py $dataset $BERT_MLM_ParaRank_PATH_prediction >> prediction_results/BERT_MLM_ParaRank.log 54 | done 55 | python scripts/macro_avg.py prediction_results/BERT_MLM_ParaRank.log 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/run_multi_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # for gpu memory optimization 4 | export FLAGS_sync_nccl_allreduce=0 5 | export FLAGS_eager_delete_tensor_gb=1 6 | 7 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 8 | 9 | python -u mtl_run.py 10 | 11 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Arguments for configuration.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import argparse 22 | 23 | 24 | def str2bool(v): 25 | # because argparse does not support to parse "true, False" as python 26 | # boolean directly 27 | return v.lower() in ("true", "t", "1") 28 | 29 | 30 | class ArgumentGroup(object): 31 | def __init__(self, parser, title, des): 32 | self._group = parser.add_argument_group(title=title, description=des) 33 | 34 | def add_arg(self, name, type, default, help, **kwargs): 35 | type = str2bool if type == bool else type 36 | self._group.add_argument( 37 | "--" + name, 38 | default=default, 39 | type=type, 40 | help=help + ' Default: %(default)s.', 41 | **kwargs) 42 | 43 | 44 | def print_arguments(args): 45 | print('----------- Configuration Arguments -----------') 46 | for arg, value in sorted(six.iteritems(vars(args))): 47 | print('%s: %s' % (arg, value)) 48 | print('------------------------------------------------') 49 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/combine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # ============================================================================== 4 | # Copyright 2017 Baidu.com, Inc. 
All Rights Reserved 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | """ 19 | This module add all train/dev data to a file named "mrqa-combined.raw.json". 20 | """ 21 | 22 | import json 23 | import argparse 24 | import glob 25 | 26 | # path of train/dev data 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('path', help='the path of train/dev data') 29 | args = parser.parse_args() 30 | path = args.path 31 | 32 | # all train/dev data files 33 | files = glob.glob(path + '/*.raw.json') 34 | print ('files:', files) 35 | 36 | # add all train/dev data to "datasets" 37 | with open(files[0]) as fin: 38 | datasets = json.load(fin) 39 | for i in range(1, len(files)): 40 | with open(files[i]) as fin: 41 | dataset = json.load(fin) 42 | datasets['data'].extend(dataset['data']) 43 | 44 | # save to "mrqa-combined.raw.json" 45 | with open(path + '/mrqa-combined.raw.json', 'w') as fout: 46 | json.dump(datasets, fout, indent=4) 47 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/combine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # path of train and dev data 19 | PATH_train=train 20 | PATH_dev=dev 21 | 22 | # add all train data to a file "$PATH_train/mrqa-combined.raw.json". 23 | python combine.py $PATH_train 24 | 25 | # add all dev data to a file "$PATH_dev/mrqa-combined.raw.json". 26 | python combine.py $PATH_dev -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/convert_mrqa2squad.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | # path of train and dev data 19 | PATH_train=train 20 | PATH_dev=dev 21 | 22 | # Convert train data from MRQA format to SQuAD format 23 | NAME_LIST_train="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions" 24 | for name in $NAME_LIST_train;do 25 | echo "Converting training data from MRQA format to SQuAD format: ""$name" 26 | python convert_mrqa2squad.py $PATH_train/$name.jsonl 27 | done 28 | 29 | # Convert dev data from MRQA format to SQuAD format 30 | NAME_LIST_dev="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions BioASQ TextbookQA RelationExtraction DROP DuoRC RACE" 31 | for name in $NAME_LIST_dev;do 32 | echo "Converting development data from MRQA format to SQuAD format: ""$name" 33 | python convert_mrqa2squad.py --dev $PATH_dev/$name.jsonl 34 | done 35 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/dev/md5sum_dev.txt: -------------------------------------------------------------------------------- 1 | 05f3f16c5c31ba8e46ff5fa80647ac46 SQuAD.jsonl.gz 2 | 5c188c92a84ddffe2ab590ac7598bde2 NewsQA.jsonl.gz 3 | a7a3bd90db58524f666e757db659b047 TriviaQA.jsonl.gz 4 | bfcb304f1b3167693b627cbf0f98bc9e SearchQA.jsonl.gz 5 | 675de35c3605353ec039ca4d2854072d HotpotQA.jsonl.gz 6 | c0347eebbca02d10d1b07b9a64efe61d NaturalQuestions.jsonl.gz 7 | 6408dc4fcf258535d0ea8b125bba5fbb BioASQ.jsonl.gz 8 | 76ca9cc16625dd8da75758d64676e6a1 TextbookQA.jsonl.gz 9 | 128d318ea1391bf77234d8c1b69a45df RelationExtraction.jsonl.gz 10 | 8b03867e4da2817ef341707040d99785 DROP.jsonl.gz 11 | 9e66769a70fdfdec4906a4bcef5f3d71 DuoRC.jsonl.gz 12 | 94a7ef9b9ea9402671e5b0248b6a5395 RACE.jsonl.gz -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | 19 | # path to save data 20 | OUTPUT_train=train 21 | OUTPUT_dev=dev 22 | 23 | DATA_URL="https://s3.us-east-2.amazonaws.com/mrqa/release/v2" 24 | shopt -s expand_aliases; alias wget="wget -c --no-check-certificate"  # aliases are not expanded in non-interactive bash unless expand_aliases is set 25 | # download training datasets 26 | wget $DATA_URL/train/SQuAD.jsonl.gz -O $OUTPUT_train/SQuAD.jsonl.gz 27 | wget $DATA_URL/train/NewsQA.jsonl.gz -O $OUTPUT_train/NewsQA.jsonl.gz 28 | wget $DATA_URL/train/TriviaQA-web.jsonl.gz -O $OUTPUT_train/TriviaQA.jsonl.gz 29 | wget $DATA_URL/train/SearchQA.jsonl.gz -O $OUTPUT_train/SearchQA.jsonl.gz 30 | wget $DATA_URL/train/HotpotQA.jsonl.gz -O $OUTPUT_train/HotpotQA.jsonl.gz 31 | wget $DATA_URL/train/NaturalQuestionsShort.jsonl.gz -O $OUTPUT_train/NaturalQuestions.jsonl.gz 32 | 33 | # download the in-domain development data 34 | wget $DATA_URL/dev/SQuAD.jsonl.gz -O $OUTPUT_dev/SQuAD.jsonl.gz 35 | wget $DATA_URL/dev/NewsQA.jsonl.gz -O $OUTPUT_dev/NewsQA.jsonl.gz 36 | wget $DATA_URL/dev/TriviaQA-web.jsonl.gz -O $OUTPUT_dev/TriviaQA.jsonl.gz 37 | wget $DATA_URL/dev/SearchQA.jsonl.gz -O $OUTPUT_dev/SearchQA.jsonl.gz 38 | wget $DATA_URL/dev/HotpotQA.jsonl.gz -O $OUTPUT_dev/HotpotQA.jsonl.gz 39 | wget $DATA_URL/dev/NaturalQuestionsShort.jsonl.gz -O $OUTPUT_dev/NaturalQuestions.jsonl.gz 40 | 41 | # download the out-of-domain development data 42 | wget http://participants-area.bioasq.org/MRQA2019/ -O $OUTPUT_dev/BioASQ.jsonl.gz 43 | wget $DATA_URL/dev/TextbookQA.jsonl.gz -O $OUTPUT_dev/TextbookQA.jsonl.gz 44 | wget $DATA_URL/dev/RelationExtraction.jsonl.gz -O $OUTPUT_dev/RelationExtraction.jsonl.gz 45 | wget $DATA_URL/dev/DROP.jsonl.gz -O $OUTPUT_dev/DROP.jsonl.gz 46 | wget $DATA_URL/dev/DuoRC.ParaphraseRC.jsonl.gz -O $OUTPUT_dev/DuoRC.jsonl.gz 47 | wget $DATA_URL/dev/RACE.jsonl.gz -O $OUTPUT_dev/RACE.jsonl.gz 48 | 49 | # check md5sum for training datasets 50 | cd $OUTPUT_train 51 | if md5sum --status -c md5sum_train.txt; then 52 | echo "finished downloading training data" 53 | else 54 | echo "md5sum check failed!" 55 | fi 56 | cd .. 57 | 58 | # check md5sum for development data 59 | cd $OUTPUT_dev 60 | if md5sum --status -c md5sum_dev.txt; then 61 | echo "finished downloading development data" 62 | else 63 | echo "md5sum check failed!" 64 | fi 65 | cd .. 66 | 67 | # decompress training datasets 68 | echo "unzipping train data" 69 | NAME_LIST_train="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions" 70 | for name in $NAME_LIST_train;do 71 | gzip -d $OUTPUT_train/$name.jsonl.gz 72 | done 73 | 74 | # decompress development data 75 | echo "unzipping dev data" 76 | NAME_LIST_dev="SQuAD NewsQA TriviaQA SearchQA HotpotQA NaturalQuestions BioASQ TextbookQA RelationExtraction DROP DuoRC RACE" 77 | for name in $NAME_LIST_dev;do 78 | gzip -d $OUTPUT_dev/$name.jsonl.gz 79 | done 80 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/macro_avg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import json 4 | import re 5 | 6 | def extract_score(line): 7 | score_json = json.loads(line) 8 | f1 = score_json['f1'] 9 | em = score_json['exact_match'] 10 | return float(f1), float(em) 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser( 15 | description='Calculate macro average for MRQA') 16 | parser.add_argument('input_file', help='Score file') 17 | args = parser.parse_args() 18 | with open(args.input_file) as fin: 19 | lines = list(map(str.strip, fin.readlines())) 20 | in_domain_scores = {} 21 | for dataset_id in range(0, 12, 2):  # 6 in-domain datasets: a dataset-name line followed by a score line 22 | f1, em = extract_score(lines[dataset_id+1]) 23 | in_domain_scores[lines[dataset_id]] = f1 24 | out_of_domain_scores = {} 25 | for dataset_id in range(12, 24, 2):  # 6 out-of-domain datasets 26 | f1, em = extract_score(lines[dataset_id+1]) 27 | out_of_domain_scores[lines[dataset_id]] = f1 28 | print('In domain avg: {}'.format(sum(in_domain_scores.values()) / len(in_domain_scores.values()))) 29 | print('Out of domain avg: {}'.format(sum(out_of_domain_scores.values()) / len(out_of_domain_scores.values()))) 30 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/scripts/train/md5sum_train.txt: -------------------------------------------------------------------------------- 1 | efd6a551d2697c20a694e933210489f8 SQuAD.jsonl.gz 2 | 182f4e977b849cb1dbfb796030b91444 NewsQA.jsonl.gz 3 | e18f586152612a9358c22f5536bfd32a TriviaQA.jsonl.gz 4 | 612245315e6e7c4d8446e5fcc3dc1086 SearchQA.jsonl.gz 5 | d212c7b3fc949bd0dc47d124e8c34907 HotpotQA.jsonl.gz 6 | e27d27bf7c49eb5ead43cef3f41de6be NaturalQuestions.jsonl.gz -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/wget_data.sh: -------------------------------------------------------------------------------- 1 | # wget train data 2 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/mrqa_multi_task_dataset.tar.gz 3 | tar -xvf 
mrqa_multi_task_dataset.tar.gz 4 | rm mrqa_multi_task_dataset.tar.gz 5 | 6 | # wget prediction results 7 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/muiti_task_prediction_results.tar.gz 8 | tar -xvf muiti_task_prediction_results.tar.gz 9 | rm muiti_task_prediction_results.tar.gz 10 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/multi_task_learning/wget_pretrained_model.sh: -------------------------------------------------------------------------------- 1 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/squad2_model.tar.gz 2 | tar -xvf squad2_model.tar.gz 3 | rm squad2_model.tar.gz 4 | 5 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/README.md: -------------------------------------------------------------------------------- 1 | # ensemble server system 2 | This directory contains the ensemble system for the three models that are fine-tuned on the MRQA in-domain data (i.e. models based on ERNIE2.0, XL-NET and BERT). The architecture of the ensemble system is shown in the figure below. We first start 3 independent model servers for ERNIE, XL-NET and BERT. We then start a main server to receive client requests, invoke the model servers and ensemble the model results. 3 | For convenience, users are able to explore **any ensemble combination** (e.g. ERNIE+XL-NET, BERT+XL-NET) by simply modifying the configurations. 4 | 5 | 

6 | [figure: architecture of the ensemble server (a main server in front of the ERNIE, XL-NET and BERT model servers)] 7 | 

8 | 9 | 10 | ## Environment 11 | In our test environment, we use 12 | 13 | - Python 2.7.13 14 | - PaddlePaddle 1.5.2 15 | - sentencepiece 0.1.83 16 | - flask 1.1.1 17 | - CUDA 9.0 18 | - cuDNN 7.0 19 | 20 | ## Download model parameters 21 | To download the model parameters that are fine-tuned on the MRQA in-domain data, run 22 | 23 | ``` 24 | bash wget_server_inference_model.sh 25 | ``` 26 | A folder named `infer_model` will appear in `ernie_server/`, `xlnet_server/` and `bert_server/`. 27 | 28 | ## Start servers 29 | 30 | Before starting the servers, please make sure the ports `5118` to `5121` are available, and specify the `gpu_id` in `start.sh` (by default `GPU 0` on the machine will be used). 31 | 32 | To start the servers, run 33 | 34 | ``` 35 | bash start.sh 36 | ``` 37 | The log for the main server will be saved in `main_server.log`, and the logs for the 3 model servers will be saved in `ernie_server/ernie.log`, `xlnet_server/xlnet.log` and `bert_server/bert.log`. 38 | 39 | By default, the main server will ensemble the results from ERNIE and XL-NET. To explore other ensemble combinations, one can change the configuration in `start.sh` (e.g. `python main_server.py --ernie --xlnet --bert` for 3 models, `python main_server.py --bert --xlnet` for BERT and XL-NET only). 40 | 41 | Note that in our test environment, we use a Tesla K40 (12G) and the three models are able to fit on a single card. For GPUs with smaller RAM, one can choose to put the three models on different cards by modifying the configurations in `start.sh`. 42 | 43 | ## Send requests 44 | Once the servers are successfully launched, one can use the client script to send requests. 45 | 46 | ``` 47 | cd client 48 | python client.py demo.txt results.txt 5121 49 | ``` 50 | This will read the examples in `demo.txt`, send requests to the main server, and save the results into `results.txt`. The format of the input file (i.e. `demo.txt`) needs to be in [MRQA official format](https://github.com/mrqa/MRQA-Shared-Task-2019). 
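For reference, the request/response cycle that `client.py` wraps can be reproduced in a few lines. This is a minimal sketch and not part of the original repository; the `context`/`qas`/`qid`/`question` fields are assumed to follow the MRQA jsonl schema, and the main server is assumed to be listening on its default port `5121`:

```
# ensemble_request_demo.py (hypothetical helper, for illustration only)
import json
import requests

# one MRQA-style example: a passage plus a list of questions about it
example = {
    "context": "Robert Boulter is an English film, television and theatre actor.",
    "qas": [{"qid": "demo-0", "question": "What nationality is Robert Boulter?"}]
}

# the main server replies with a {qid: answer_text} mapping for the example
response = requests.post("http://127.0.0.1:5121", json=example)
print(json.dumps(response.json(), indent=1))
```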
-------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/__main__.py: -------------------------------------------------------------------------------- 1 | from algorithm import optimization 2 | from algorithm import multitask 3 | from extension import fp16 4 | from module import transformer_encoder 5 | from toolkit import configure 6 | from toolkit import init 7 | from toolkit import placeholder 8 | from nets import bert 9 | 10 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/algorithm/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/algorithm/multitask.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | import os 4 | import sys 5 | import random 6 | from copy import deepcopy as copy 7 | import numpy as np 8 | import paddle 9 | import paddle.fluid as fluid 10 | import multiprocessing 11 | 12 | class Task: 13 | 14 | def __init__( 15 | self, 16 | conf, 17 | name = "", 18 | is_training = False, 19 | _DataProcesser = None, 20 | shared_name = ""): 21 | 22 | self.conf = copy(conf) 23 | 24 | self.name = name 25 | self.shared_name = shared_name 26 | 27 | self.is_training = is_training 28 | self.DataProcesser = _DataProcesser 29 | 30 | def _create_reader(self): 31 | raise NotImplementedError("Task:_create_reader not implemented") 32 | 33 | def _create_model(self): 34 | raise NotImplementedError("Task:_create_model not implemented") 35 | 36 | def prepare(self, args): 37 | raise NotImplementedError("Task:prepare not implemented") 38 | 39 | def train_step(self, args): 40 | raise NotImplementedError("Task:train_step not implemented") 41 | 42 | def predict(self, args): 43 | raise NotImplementedError("Task:_predict not implemented") 44 | 45 | 46 | class JointTask: 47 | 48 | def __init__(self): 49 | 50 | self.tasks = [] 51 | 52 | #self.startup_exe = None 53 | #self.train_exe = None 54 | 55 | self.exe = None 56 | 57 | self.share_vars_from = None 58 | 59 | self.startup_prog = fluid.Program() 60 | 61 | def __add__(self, task): 62 | 63 | assert isinstance(task, Task) 64 | 65 | self.tasks.append(task) 66 | 67 | return self 68 | 69 | def prepare(self, args): 70 | 71 | if args.use_cuda: 72 | place = fluid.CUDAPlace(0) 73 | dev_count = fluid.core.get_cuda_device_count() 74 | else: 75 | place = fluid.CPUPlace() 76 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) 77 | 78 | #self.startup_exe = fluid.Executor(place) 79 | self.exe = fluid.Executor(place) 80 | 81 | for idx, task in enumerate(self.tasks): 82 | if idx == 0: 83 | print("for idx : %d" % idx) 84 | task.prepare(args, exe = self.exe) 85 | self.share_vars_from = task.compiled_train_prog 86 | else: 87 | print("for idx : %d" % idx) 88 | task.prepare(args, exe = self.exe, 
share_vars_from = self.share_vars_from) 89 | 90 | def train(self, args): 91 | 92 | joint_steps = [] 93 | for i in xrange(0, len(self.tasks)): 94 | for _ in xrange(0, self.tasks[i].max_train_steps): 95 | joint_steps.append(i) 96 | 97 | self.tasks[0].train_step(args, exe = self.exe) 98 | 99 | random.shuffle(joint_steps) 100 | for next_task_id in joint_steps: 101 | self.tasks[next_task_id].train_step(args, exe = self.exe) 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | basetask_a = Task(None) 107 | 108 | basetask_b = Task(None) 109 | 110 | joint_tasks = JointTask() 111 | 112 | joint_tasks += basetask_a 113 | 114 | print(joint_tasks.tasks) 115 | 116 | joint_tasks += basetask_b 117 | 118 | print(joint_tasks.tasks) 119 | 120 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/extension/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/extension/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
% p.name) 46 | new_p = fluid.framework.Parameter( 47 | block=block, 48 | shape=v.shape, 49 | dtype=fluid.core.VarDesc.VarType.FP32, 50 | type=v.type, 51 | lod_level=v.lod_level, 52 | stop_gradient=p.stop_gradient, 53 | trainable=p.trainable, 54 | optimize_attr=p.optimize_attr, 55 | regularizer=p.regularizer, 56 | gradient_clip_attr=p.gradient_clip_attr, 57 | error_clip=p.error_clip, 58 | name=v.name + ".master") 59 | return new_p 60 | 61 | 62 | def create_master_params_grads(params_grads, main_prog, startup_prog, 63 | loss_scaling): 64 | master_params_grads = [] 65 | tmp_role = main_prog._current_role 66 | OpRole = fluid.core.op_proto_and_checker_maker.OpRole 67 | main_prog._current_role = OpRole.Backward 68 | for p, g in params_grads: 69 | # create master parameters 70 | master_param = copy_to_master_param(p, main_prog.global_block()) 71 | startup_master_param = startup_prog.global_block()._clone_variable( 72 | master_param) 73 | startup_p = startup_prog.global_block().var(p.name) 74 | cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) 75 | # cast fp16 gradients to fp32 before apply gradients 76 | if g.name.find("layer_norm") > -1: 77 | if loss_scaling > 1: 78 | scaled_g = g / float(loss_scaling) 79 | else: 80 | scaled_g = g 81 | master_params_grads.append([p, scaled_g]) 82 | continue 83 | master_grad = fluid.layers.cast(g, "float32") 84 | if loss_scaling > 1: 85 | master_grad = master_grad / float(loss_scaling) 86 | master_params_grads.append([master_param, master_grad]) 87 | main_prog._current_role = tmp_role 88 | return master_params_grads 89 | 90 | 91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog): 92 | for idx, m_p_g in enumerate(master_params_grads): 93 | train_p, _ = params_grads[idx] 94 | if train_p.name.find("layer_norm") > -1: 95 | continue 96 | with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): 97 | cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) 98 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/module/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/pdnlp/nets/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/toolkit/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | 22 | import numpy as np 23 | import paddle.fluid as fluid 24 | 25 | 26 | def cast_fp32_to_fp16(exe, main_program): 27 | print("Cast parameters to float16 data format.") 28 | for param in main_program.global_block().all_parameters(): 29 | if not param.name.endswith(".master"): 30 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 31 | data = np.array(param_t) 32 | if param.name.find("layer_norm") == -1: 33 | param_t.set(np.float16(data).view(np.uint16), exe.place) 34 | master_param_var = fluid.global_scope().find_var(param.name + 35 | ".master") 36 | if master_param_var is not None: 37 | master_param_var.get_tensor().set(data, exe.place) 38 | 39 | 40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False, skip_list = []): 41 | assert os.path.exists( 42 | init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path 43 | 44 | def existed_persitables(var): 45 | if not fluid.io.is_persistable(var): 46 | return False 47 | if var.name in skip_list: 48 | return False 49 | return os.path.exists(os.path.join(init_checkpoint_path, var.name)) 50 | 51 | fluid.io.load_vars( 52 | exe, 53 | init_checkpoint_path, 54 | main_program=main_program, 55 | predicate=existed_persitables) 56 | print("Load model from {}".format(init_checkpoint_path)) 57 | 58 | if use_fp16: 59 | cast_fp32_to_fp16(exe, main_program) 60 | 61 | 62 | def init_pretraining_params(exe, 63 | pretraining_params_path, 64 | main_program, 65 | use_fp16=False): 66 | assert os.path.exists(pretraining_params_path 67 | ), "[%s] cann't be found." 
% pretraining_params_path 68 | 69 | def existed_params(var): 70 | if not isinstance(var, fluid.framework.Parameter): 71 | return False 72 | return os.path.exists(os.path.join(pretraining_params_path, var.name)) 73 | 74 | fluid.io.load_vars( 75 | exe, 76 | pretraining_params_path, 77 | main_program=main_program, 78 | predicate=existed_params) 79 | print("Load pretraining parameters from {}.".format( 80 | pretraining_params_path)) 81 | 82 | if use_fp16: 83 | cast_fp32_to_fp16(exe, main_program) 84 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/pdnlp/toolkit/placeholder.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import six 7 | import ast 8 | import copy 9 | 10 | import numpy as np 11 | import paddle.fluid as fluid 12 | 13 | 14 | class Placeholder(object): 15 | 16 | def __init__(self): 17 | self.shapes = [] 18 | self.dtypes = [] 19 | self.lod_levels = [] 20 | self.names = [] 21 | 22 | def __init__(self, input_shapes): 23 | 24 | self.shapes = [] 25 | self.dtypes = [] 26 | self.lod_levels = [] 27 | self.names = [] 28 | 29 | for new_holder in input_shapes: 30 | shape = new_holder[0] 31 | dtype = new_holder[1] 32 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 33 | name = new_holder[3] if len(new_holder) >= 4 else "" 34 | 35 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 36 | 37 | def append_placeholder(self, shape, dtype, lod_level = 0, name = ""): 38 | self.shapes.append(shape) 39 | self.dtypes.append(dtype) 40 | self.lod_levels.append(lod_level) 41 | self.names.append(name) 42 | 43 | 44 | def build(self, capacity, reader_name, use_double_buffer = False): 45 | pyreader = fluid.layers.py_reader( 46 | capacity = capacity, 47 | shapes = self.shapes, 48 | dtypes = self.dtypes, 49 | lod_levels = self.lod_levels, 50 | name = reader_name, 51 | use_double_buffer = use_double_buffer) 52 | 53 | return [pyreader, fluid.layers.read_file(pyreader)] 54 | 55 | 56 | def __add__(self, new_holder): 57 | assert isinstance(new_holder, tuple) or isinstance(new_holder, list) 58 | assert len(new_holder) >= 2 59 | 60 | shape = new_holder[0] 61 | dtype = new_holder[1] 62 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 63 | name = new_holder[3] if len(new_holder) >= 4 else "" 64 | 65 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 66 | 67 | 68 | if __name__ == "__main__": 69 | print("hello world!") 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/start.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_fraction_of_gpu_memory_to_use=0.1 2 | port=$1 3 | gpu=$2 4 | export CUDA_VISIBLE_DEVICES=$gpu 5 | python start_service.py ./infer_model $port 6 | 7 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/start_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | BERT model service 5 | """ 6 | import json 7 | import sys 8 | import logging 9 | logging.basicConfig( 10 | level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 11 | ) 12 | import requests 13 | from flask import Flask 14 | from flask import Response 15 | from 
flask import request 16 | import mrc_service 17 | import model_wrapper 18 | import argparse 19 | 20 | 21 | assert len(sys.argv) == 3 or len(sys.argv) == 4, "Usage: python serve.py [process_mode]" 22 | if len(sys.argv) == 3: 23 | _, model_dir, port = sys.argv 24 | mode = 'parallel' 25 | else: 26 | _, model_dir, port, mode = sys.argv 27 | 28 | max_batch_size = 5 29 | 30 | app = Flask(__name__) 31 | app.logger.setLevel(logging.INFO) 32 | model = model_wrapper.BertModelWrapper(model_dir=model_dir) 33 | server = mrc_service.MRQAService('MRQA service', app.logger) 34 | 35 | @app.route('/', methods=['POST']) 36 | def mrqa_service(): 37 | """Description""" 38 | return server(model, process_mode=mode, max_batch_size=max_batch_size) 39 | 40 | 41 | if __name__ == '__main__': 42 | app.run(port=port, debug=False, threaded=False, processes=1) 43 | 44 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/bert_server/task_reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/bert_server/task_reader/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/client/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Query the MRQA model server to generate predictions. 5 | """ 6 | import argparse 7 | import json 8 | import requests 9 | import time 10 | 11 | 12 | if __name__ == '__main__': 13 | parse = argparse.ArgumentParser("") 14 | parse.add_argument("dataset") 15 | parse.add_argument("output_file") 16 | parse.add_argument("port", type=int) 17 | args = parse.parse_args() 18 | 19 | all_predictions = {} 20 | contexts = [] 21 | f = open(args.dataset) 22 | for example in f: 23 | context = json.loads(example) 24 | if 'header' in context: 25 | continue 26 | contexts.append(context) 27 | f.close() 28 | 29 | results = {} 30 | cnt = 0 31 | for context in contexts: 32 | cnt += 1 33 | start = time.time() 34 | pred = requests.post('http://127.0.0.1:%d' % args.port, json=context) 35 | result = pred.json() 36 | results.update(result) 37 | end=time.time() 38 | print('----- request cnt: {}, time elapsed: {:.2f} ms -----'.format(cnt, (end - start)*1000)) 39 | for qid, answer in result.items(): 40 | print('{}: {}'.format(qid, answer.encode('utf-8'))) 41 | with open(args.output_file,'w') as f: 42 | json.dump(results, f, indent=1) 43 | 44 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/__main__.py: -------------------------------------------------------------------------------- 1 | from algorithm import optimization 2 | from algorithm import multitask 3 | from extension import fp16 4 | from module import transformer_encoder 5 | from toolkit import configure 6 | from toolkit import init 7 | from toolkit import placeholder 8 | from nets import bert 9 | 10 | 
-------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/algorithm/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/algorithm/multitask.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | import os 4 | import sys 5 | import random 6 | from copy import deepcopy as copy 7 | import numpy as np 8 | import paddle 9 | import paddle.fluid as fluid 10 | import multiprocessing 11 | 12 | class Task: 13 | 14 | def __init__( 15 | self, 16 | conf, 17 | name = "", 18 | is_training = False, 19 | _DataProcesser = None, 20 | shared_name = ""): 21 | 22 | self.conf = copy(conf) 23 | 24 | self.name = name 25 | self.shared_name = shared_name 26 | 27 | self.is_training = is_training 28 | self.DataProcesser = _DataProcesser 29 | 30 | def _create_reader(self): 31 | raise NotImplementedError("Task:_create_reader not implemented") 32 | 33 | def _create_model(self): 34 | raise NotImplementedError("Task:_create_model not implemented") 35 | 36 | def prepare(self, args): 37 | raise NotImplementedError("Task:prepare not implemented") 38 | 39 | def train_step(self, args): 40 | raise NotImplementedError("Task:train_step not implemented") 41 | 42 | def predict(self, args): 43 | raise NotImplementedError("Task:_predict not implemented") 44 | 45 | 46 | class JointTask: 47 | 48 | def __init__(self): 49 | 50 | self.tasks = [] 51 | 52 | #self.startup_exe = None 53 | #self.train_exe = None 54 | 55 | self.exe = None 56 | 57 | self.share_vars_from = None 58 | 59 | self.startup_prog = fluid.Program() 60 | 61 | def __add__(self, task): 62 | 63 | assert isinstance(task, Task) 64 | 65 | self.tasks.append(task) 66 | 67 | return self 68 | 69 | def prepare(self, args): 70 | 71 | if args.use_cuda: 72 | place = fluid.CUDAPlace(0) 73 | dev_count = fluid.core.get_cuda_device_count() 74 | else: 75 | place = fluid.CPUPlace() 76 | dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) 77 | 78 | #self.startup_exe = fluid.Executor(place) 79 | self.exe = fluid.Executor(place) 80 | 81 | for idx, task in enumerate(self.tasks): 82 | if idx == 0: 83 | print("for idx : %d" % idx) 84 | task.prepare(args, exe = self.exe) 85 | self.share_vars_from = task.compiled_train_prog 86 | else: 87 | print("for idx : %d" % idx) 88 | task.prepare(args, exe = self.exe, share_vars_from = self.share_vars_from) 89 | 90 | def train(self, args): 91 | 92 | joint_steps = [] 93 | for i in xrange(0, len(self.tasks)): 94 | for _ in xrange(0, self.tasks[i].max_train_steps): 95 | joint_steps.append(i) 96 | 97 | self.tasks[0].train_step(args, exe = self.exe) 98 | 99 | random.shuffle(joint_steps) 100 | for next_task_id in joint_steps: 101 | self.tasks[next_task_id].train_step(args, exe = self.exe) 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | basetask_a = Task(None) 107 | 108 | basetask_b = Task(None) 109 | 110 | joint_tasks = JointTask() 111 | 112 | joint_tasks += basetask_a 113 | 114 | print(joint_tasks.tasks) 115 | 116 | joint_tasks += basetask_b 117 | 118 | print(joint_tasks.tasks) 119 | 120 | -------------------------------------------------------------------------------- 
/MRQA2019-D-NET/server/ernie_server/pdnlp/extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/extension/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/extension/fp16.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import paddle 17 | import paddle.fluid as fluid 18 | 19 | 20 | def cast_fp16_to_fp32(i, o, prog): 21 | prog.global_block().append_op( 22 | type="cast", 23 | inputs={"X": i}, 24 | outputs={"Out": o}, 25 | attrs={ 26 | "in_dtype": fluid.core.VarDesc.VarType.FP16, 27 | "out_dtype": fluid.core.VarDesc.VarType.FP32 28 | }) 29 | 30 | 31 | def cast_fp32_to_fp16(i, o, prog): 32 | prog.global_block().append_op( 33 | type="cast", 34 | inputs={"X": i}, 35 | outputs={"Out": o}, 36 | attrs={ 37 | "in_dtype": fluid.core.VarDesc.VarType.FP32, 38 | "out_dtype": fluid.core.VarDesc.VarType.FP16 39 | }) 40 | 41 | 42 | def copy_to_master_param(p, block): 43 | v = block.vars.get(p.name, None) 44 | if v is None: 45 | raise ValueError("no param name %s found!" 
% p.name) 46 | new_p = fluid.framework.Parameter( 47 | block=block, 48 | shape=v.shape, 49 | dtype=fluid.core.VarDesc.VarType.FP32, 50 | type=v.type, 51 | lod_level=v.lod_level, 52 | stop_gradient=p.stop_gradient, 53 | trainable=p.trainable, 54 | optimize_attr=p.optimize_attr, 55 | regularizer=p.regularizer, 56 | gradient_clip_attr=p.gradient_clip_attr, 57 | error_clip=p.error_clip, 58 | name=v.name + ".master") 59 | return new_p 60 | 61 | 62 | def create_master_params_grads(params_grads, main_prog, startup_prog, 63 | loss_scaling): 64 | master_params_grads = [] 65 | tmp_role = main_prog._current_role 66 | OpRole = fluid.core.op_proto_and_checker_maker.OpRole 67 | main_prog._current_role = OpRole.Backward 68 | for p, g in params_grads: 69 | # create master parameters 70 | master_param = copy_to_master_param(p, main_prog.global_block()) 71 | startup_master_param = startup_prog.global_block()._clone_variable( 72 | master_param) 73 | startup_p = startup_prog.global_block().var(p.name) 74 | cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) 75 | # cast fp16 gradients to fp32 before apply gradients 76 | if g.name.find("layer_norm") > -1: 77 | if loss_scaling > 1: 78 | scaled_g = g / float(loss_scaling) 79 | else: 80 | scaled_g = g 81 | master_params_grads.append([p, scaled_g]) 82 | continue 83 | master_grad = fluid.layers.cast(g, "float32") 84 | if loss_scaling > 1: 85 | master_grad = master_grad / float(loss_scaling) 86 | master_params_grads.append([master_param, master_grad]) 87 | main_prog._current_role = tmp_role 88 | return master_params_grads 89 | 90 | 91 | def master_param_to_train_param(master_params_grads, params_grads, main_prog): 92 | for idx, m_p_g in enumerate(master_params_grads): 93 | train_p, _ = params_grads[idx] 94 | if train_p.name.find("layer_norm") > -1: 95 | continue 96 | with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): 97 | cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) 98 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/module/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/pdnlp/nets/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/toolkit/init.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import os 18 | import six 19 | import ast 20 | import copy 21 | 22 | import numpy as np 23 | import paddle.fluid as fluid 24 | 25 | 26 | def cast_fp32_to_fp16(exe, main_program): 27 | print("Cast parameters to float16 data format.") 28 | for param in main_program.global_block().all_parameters(): 29 | if not param.name.endswith(".master"): 30 | param_t = fluid.global_scope().find_var(param.name).get_tensor() 31 | data = np.array(param_t) 32 | if param.name.find("layer_norm") == -1: 33 | param_t.set(np.float16(data).view(np.uint16), exe.place) 34 | master_param_var = fluid.global_scope().find_var(param.name + 35 | ".master") 36 | if master_param_var is not None: 37 | master_param_var.get_tensor().set(data, exe.place) 38 | 39 | 40 | def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False, skip_list = []): 41 | assert os.path.exists( 42 | init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path 43 | 44 | def existed_persitables(var): 45 | if not fluid.io.is_persistable(var): 46 | return False 47 | if var.name in skip_list: 48 | return False 49 | return os.path.exists(os.path.join(init_checkpoint_path, var.name)) 50 | 51 | fluid.io.load_vars( 52 | exe, 53 | init_checkpoint_path, 54 | main_program=main_program, 55 | predicate=existed_persitables) 56 | print("Load model from {}".format(init_checkpoint_path)) 57 | 58 | if use_fp16: 59 | cast_fp32_to_fp16(exe, main_program) 60 | 61 | 62 | def init_pretraining_params(exe, 63 | pretraining_params_path, 64 | main_program, 65 | use_fp16=False): 66 | assert os.path.exists(pretraining_params_path 67 | ), "[%s] cann't be found." 
% pretraining_params_path 68 | 69 | def existed_params(var): 70 | if not isinstance(var, fluid.framework.Parameter): 71 | return False 72 | return os.path.exists(os.path.join(pretraining_params_path, var.name)) 73 | 74 | fluid.io.load_vars( 75 | exe, 76 | pretraining_params_path, 77 | main_program=main_program, 78 | predicate=existed_params) 79 | print("Load pretraining parameters from {}.".format( 80 | pretraining_params_path)) 81 | 82 | if use_fp16: 83 | cast_fp32_to_fp16(exe, main_program) 84 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/pdnlp/toolkit/placeholder.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import six 7 | import ast 8 | import copy 9 | 10 | import numpy as np 11 | import paddle.fluid as fluid 12 | 13 | 14 | class Placeholder(object): 15 | 16 | def __init__(self): 17 | self.shapes = [] 18 | self.dtypes = [] 19 | self.lod_levels = [] 20 | self.names = [] 21 | 22 | def __init__(self, input_shapes): 23 | 24 | self.shapes = [] 25 | self.dtypes = [] 26 | self.lod_levels = [] 27 | self.names = [] 28 | 29 | for new_holder in input_shapes: 30 | shape = new_holder[0] 31 | dtype = new_holder[1] 32 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 33 | name = new_holder[3] if len(new_holder) >= 4 else "" 34 | 35 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 36 | 37 | def append_placeholder(self, shape, dtype, lod_level = 0, name = ""): 38 | self.shapes.append(shape) 39 | self.dtypes.append(dtype) 40 | self.lod_levels.append(lod_level) 41 | self.names.append(name) 42 | 43 | 44 | def build(self, capacity, reader_name, use_double_buffer = False): 45 | pyreader = fluid.layers.py_reader( 46 | capacity = capacity, 47 | shapes = self.shapes, 48 | dtypes = self.dtypes, 49 | lod_levels = self.lod_levels, 50 | name = reader_name, 51 | use_double_buffer = use_double_buffer) 52 | 53 | return [pyreader, fluid.layers.read_file(pyreader)] 54 | 55 | 56 | def __add__(self, new_holder): 57 | assert isinstance(new_holder, tuple) or isinstance(new_holder, list) 58 | assert len(new_holder) >= 2 59 | 60 | shape = new_holder[0] 61 | dtype = new_holder[1] 62 | lod_level = new_holder[2] if len(new_holder) >= 3 else 0 63 | name = new_holder[3] if len(new_holder) >= 4 else "" 64 | 65 | self.append_placeholder(shape, dtype, lod_level = lod_level, name = name) 66 | 67 | 68 | if __name__ == "__main__": 69 | print("hello world!") 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/start.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_fraction_of_gpu_memory_to_use=0.1 2 | port=$1 3 | gpu=$2 4 | export CUDA_VISIBLE_DEVICES=$gpu 5 | python start_service.py ./infer_model $port 6 | 7 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/start_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | ERNIE model service 5 | """ 6 | import json 7 | import sys 8 | import logging 9 | logging.basicConfig( 10 | level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 11 | ) 12 | import requests 13 | from flask import Flask 14 | from flask import Response 15 | from 
flask import request 16 | import mrc_service 17 | import model_wrapper as ernie_wrapper 18 | 19 | assert len(sys.argv) == 3 or len(sys.argv) == 4, "Usage: python serve.py [process_mode]" 20 | if len(sys.argv) == 3: 21 | _, model_dir, port = sys.argv 22 | mode = 'parallel' 23 | else: 24 | _, model_dir, port, mode = sys.argv 25 | 26 | app = Flask(__name__) 27 | app.logger.setLevel(logging.INFO) 28 | ernie_model = ernie_wrapper.ERNIEModelWrapper(model_dir=model_dir) 29 | server = mrc_service.BasicMRCService('Short answer MRC service', app.logger) 30 | 31 | @app.route('/', methods=['POST']) 32 | def mrqa_service(): 33 | """Description""" 34 | model = ernie_model 35 | return server(model, process_mode=mode, max_batch_size=5) 36 | 37 | 38 | if __name__ == '__main__': 39 | app.run(port=port, debug=False, threaded=False, processes=1) 40 | 41 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/ernie_server/task_reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/ernie_server/task_reader/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/main_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import sys 5 | import logging 6 | logging.basicConfig( 7 | level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 8 | ) 9 | import requests 10 | from flask import Flask 11 | from flask import Response 12 | from flask import request 13 | import numpy as np 14 | import argparse 15 | from multiprocessing.dummy import Pool as ThreadPool 16 | 17 | app = Flask(__name__) 18 | 19 | logger = logging.getLogger('flask') 20 | 21 | 22 | def ensemble_example(answers, n_models=None): 23 | if n_models is None: 24 | n_models = len(answers) 25 | answer_dict = dict() 26 | for nbest_predictions in answers: 27 | for prediction in nbest_predictions: 28 | score_list = answer_dict.setdefault(prediction['text'], []) 29 | score_list.append(prediction['probability']) 30 | 31 | ensemble_nbest_predictions = [] 32 | for answer, scores in answer_dict.items(): 33 | prediction = dict() 34 | prediction['text'] = answer 35 | prediction['probability'] = np.sum(scores) / n_models 36 | ensemble_nbest_predictions.append(prediction) 37 | 38 | ensemble_nbest_predictions = \ 39 | sorted(ensemble_nbest_predictions, key=lambda item: item['probability'], reverse=True) 40 | return ensemble_nbest_predictions 41 | 42 | 43 | @app.route('/', methods=['POST']) 44 | def mrqa_main(): 45 | """Description""" 46 | # parse input data 47 | pred = {} 48 | def _call_model(url, input_json): 49 | nbest = requests.post(url, json=input_json) 50 | return nbest 51 | try: 52 | input_json = request.get_json(silent=True) 53 | n_models = len(urls) 54 | pool = ThreadPool(n_models) 55 | results = [] 56 | for url in urls: 57 | result = pool.apply_async(_call_model, (url, input_json)) 58 | results.append(result.get()) 59 | pool.close() 60 | pool.join() 61 | nbests = [nbest.json()['results'] for nbest in results] 62 | qids = list(nbests[0].keys()) 63 | for qid in qids: 64 | ensemble_nbest = ensemble_example([nbest[qid] for nbest in nbests], n_models=n_models) 65 | pred[qid] = ensemble_nbest[0]['text'] 66 | except Exception as e: 67 | pred['error'] = 'empty' 68 | 
logger.exception(e) 69 | 70 | return Response(json.dumps(pred), mimetype='application/json') 71 | 72 | 73 | if __name__ == '__main__': 74 | url_1 = 'http://127.0.0.1:5118' # url for ernie 75 | url_2 = 'http://127.0.0.1:5119' # url for xl-net 76 | url_3 = 'http://127.0.0.1:5120' # url for bert 77 | parser = argparse.ArgumentParser('main server') 78 | parser.add_argument('--ernie', action='store_true', default=False, help="Include ERNIE") 79 | parser.add_argument('--xlnet', action='store_true', default=False, help="Include XL-NET") 80 | parser.add_argument('--bert', action='store_true', default=False, help="Include BERT") 81 | args = parser.parse_args() 82 | urls = [] 83 | if args.ernie: 84 | print('Include ERNIE model') 85 | urls.append(url_1) 86 | if args.xlnet: 87 | print('Include XL-NET model') 88 | urls.append(url_2) 89 | if args.bert: 90 | print('Include BERT model') 91 | urls.append(url_3) 92 | assert len(urls) > 0, "At least one model is required" 93 | app.run(host='127.0.0.1', port=5121, debug=False, threaded=False, processes=1) 94 | 95 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_id=0 4 | 5 | # start ernie service 6 | # usage: sh start.sh port gpu_id 7 | cd ernie_server 8 | nohup sh start.sh 5118 $gpu_id > ernie.log 2>&1 & 9 | cd .. 10 | 11 | # start xlnet service 12 | cd xlnet_server 13 | nohup sh start.sh 5119 $gpu_id > xlnet.log 2>&1 & 14 | cd .. 15 | 16 | # start bert service 17 | cd bert_server 18 | nohup sh start.sh 5120 $gpu_id > bert.log 2>&1 & 19 | cd .. 20 | 21 | sleep 3 22 | # start main server 23 | # usage: python main_server.py [--ernie] [--xlnet] [--bert] 24 | # the flags specify which models are included in the ensemble. 
25 | nohup python main_server.py --ernie --xlnet > main_server.log 2>&1 & 26 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/wget_server_inference_model.sh: -------------------------------------------------------------------------------- 1 | wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/D-Net/mrqa2019_inference_model.tar.gz 2 | tar -xvf mrqa2019_inference_model.tar.gz 3 | rm mrqa2019_inference_model.tar.gz 4 | mv bert_infer_model bert_server/infer_model 5 | mv xlnet_infer_model xlnet_server/infer_model 6 | mv ernie_infer_model ernie_server/infer_model 7 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/xlnet_server/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/data_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | 6 | special_symbols = { 7 | "<unk>" : 0, 8 | "<s>" : 1, 9 | "</s>" : 2, 10 | "<cls>" : 3, 11 | "<sep>" : 4, 12 | "<pad>" : 5, 13 | "<mask>" : 6, 14 | "<eod>" : 7, 15 | "<eop>" : 8, 16 | } 17 | 18 | VOCAB_SIZE = 32000 19 | UNK_ID = special_symbols["<unk>"] 20 | CLS_ID = special_symbols["<cls>"] 21 | SEP_ID = special_symbols["<sep>"] 22 | MASK_ID = special_symbols["<mask>"] 23 | EOD_ID = special_symbols["<eod>"] 24 | 25 | 26 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/xlnet_server/model/__init__.py -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/serve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | XL-NET model service 5 | """ 6 | import json 7 | import sys 8 | import logging 9 | logging.basicConfig( 10 | level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 11 | ) 12 | import requests 13 | from flask import Flask 14 | from flask import Response 15 | from flask import request 16 | import server_utils 17 | import wrapper as bert_wrapper 18 | 19 | assert len(sys.argv) == 3 or len(sys.argv) == 4, "Usage: python serve.py <model_dir> <port> [process_mode]" 20 | if len(sys.argv) == 3: 21 | _, model_dir, port = sys.argv 22 | mode = 'parallel' 23 | else: 24 | _, model_dir, port, mode = sys.argv 25 | 26 | app = Flask(__name__) 27 | app.logger.setLevel(logging.INFO) 28 | bert_model = bert_wrapper.BertModelWrapper(model_dir=model_dir) 29 | server = server_utils.BasicMRCService('Short answer MRC service', app.logger) 30 | 31 | @app.route('/', methods=['POST']) 32 | def mrqa_service(): 33 | """Description""" 34 | model = bert_model 35 | return server(model, process_mode=mode, max_batch_size=5) 36 | # return server(model) 37 | 38 | 39 | if __name__ == '__main__': 40 | app.run(port=port, debug=False, threaded=False, processes=1) 41 | 42 | 
-------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/start.sh: -------------------------------------------------------------------------------- 1 | export FLAGS_sync_nccl_allreduce=0 2 | export FLAGS_eager_delete_tensor_gb=1 3 | export FLAGS_fraction_of_gpu_memory_to_use=0.1 4 | port=$1 5 | gpu=$2 6 | export CUDA_VISIBLE_DEVICES=$gpu 7 | 8 | python serve.py ./infer_model $port 9 | -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/xlnet_config/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu/DuReader/c625076b06da8f56d59f19c41c73bd580a98a347/MRQA2019-D-NET/server/xlnet_server/xlnet_config/spiece.model -------------------------------------------------------------------------------- /MRQA2019-D-NET/server/xlnet_server/xlnet_config/xlnet_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "d_head": 64, 3 | "d_inner": 4096, 4 | "d_model": 1024, 5 | "ff_activation": "gelu", 6 | "n_head": 16, 7 | "n_layer": 24, 8 | "n_token": 32000, 9 | "untie_r": true 10 | } --------------------------------------------------------------------------------
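As a side note on the configuration above, the fields follow the usual transformer layout: 16 attention heads of width 64 give the model width of 1024 (`n_head * d_head == d_model`). Below is a small hedged sketch of loading and sanity-checking the file; the relation check is an assumption about how the fields relate, not something the serving code is known to perform:

```
# check_xlnet_config.py (illustrative only)
import json

# load the XL-NET hyper-parameters shipped with the server
with open("xlnet_server/xlnet_config/xlnet_config.json") as f:
    config = json.load(f)

# 16 heads * 64 dims per head == 1024 model dims for this config
assert config["n_head"] * config["d_head"] == config["d_model"]
print("layers:", config["n_layer"], "vocab size:", config["n_token"])
```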