├── .gitignore ├── mrc ├── requirements.txt ├── run_spanmrc.sh ├── run_seq2seqmrc.sh ├── run_qa.sh ├── mrc_task_config.py ├── trainer_qa.py ├── trainer_seq2seq_qa.py ├── README.md ├── utils_qa.py ├── run_qa_beam_search.py ├── run_qa.py └── run_seq2seq_qa.py ├── requirements.txt ├── seqtag ├── run_trigger_extraction.sh ├── run_argument_extraction.sh ├── ner_task_config.py └── run_ner.py ├── README.md └── evaluate.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | *.ipynb 3 | *.csv 4 | *.json 5 | -------------------------------------------------------------------------------- /mrc/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.8.0 3 | torch >= 1.3.0 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | transformers==4.19.0 3 | datasets>=1.8.0 4 | evaluate 5 | seqeval 6 | tabulate 7 | -------------------------------------------------------------------------------- /seqtag/run_trigger_extraction.sh: -------------------------------------------------------------------------------- 1 | # run trigger extraction using sequence-tagging model 2 | MODEL=bert-base-chinese 3 | OUTPUT_DIR=./output/trg_seqtag/$MODEL 4 | 5 | python run_ner.py \ 6 | --model_name_or_path $MODEL \ 7 | --task_name trg \ 8 | --output_dir $OUTPUT_DIR \ 9 | --overwrite_output_dir \ 10 | --train_file ../dataset/tagging/tagging_train.csv \ 11 | --validation_file ../dataset/tagging/tagging_dev.csv \ 12 | --test_file ../dataset/tagging/tagging_test.csv \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --output_filename trg_predictions.csv \ 17 | --per_device_train_batch_size=32 \ 18 | --per_device_eval_batch_size=32 \ 19 | --num_train_epochs 30 \ 20 | --save_strategy epoch \ 21 | --logging_strategy epoch \ 22 | --evaluation_strategy epoch \ 23 | --save_total_limit 1 \ 24 | --load_best_model_at_end \ 25 | --metric_for_best_model f1 \ 26 | --text_column_name tokens 27 | -------------------------------------------------------------------------------- /mrc/run_spanmrc.sh: -------------------------------------------------------------------------------- 1 | # first, lets train argument extraction SpanMRC model on golden triggers 2 | python run_qa.py \ 3 | --model_name_or_path bert-base-chinese \ 4 | --output_dir ./output/arg_spanmrc/bert-base-chinese \ 5 | --overwrite_output_dir \ 6 | --version_2_with_negative \ 7 | --train_file ../dataset/train.csv \ 8 | --validation_file ../dataset/dev.csv \ 9 | --test_file ../dataset/test.csv \ 10 | --do_train \ 11 | --do_eval \ 12 | --do_predict \ 13 | --output_filename arg_predictions.csv \ 14 | --per_device_train_batch_size=32 \ 15 | --per_device_eval_batch_size=32 \ 16 | --num_train_epochs 30 \ 17 | --save_strategy epoch \ 18 | --logging_strategy epoch \ 19 | --evaluation_strategy epoch \ 20 | --save_total_limit 1 \ 21 | --load_best_model_at_end \ 22 | --metric_for_best_model eval_f1 23 | 24 | # then, given the extraced triggers, we could use this model to predict arguments in a pipeline manner 25 | PRED_TRIGGER_FILE=../seqtag/output/trg_seqtag/bert-base-chinese/trg_predictions.csv 26 | python run_qa.py \ 27 | --model_name_or_path ./output/arg_spanmrc/bert-base-chinese \ 28 | --output_dir ./output/arg_spanmrc/bert-base-chinese \ 29 | --overwrite_output_dir \ 30 | --version_2_with_negative \ 31 | 
--train_file ../dataset/train.csv \ 32 | --validation_file ../dataset/dev.csv \ 33 | --test_file ../dataset/test.csv \ 34 | --pred_trg_file $PRED_TRIGGER_FILE \ 35 | --do_train false \ 36 | --do_eval false \ 37 | --do_predict \ 38 | --output_filename pipeline_predictions.csv \ 39 | --per_device_train_batch_size=32 \ 40 | --per_device_eval_batch_size=32 \ 41 | --num_train_epochs 30 \ 42 | --save_strategy epoch \ 43 | --logging_strategy epoch \ 44 | --evaluation_strategy epoch \ 45 | --save_total_limit 1 \ 46 | --load_best_model_at_end \ 47 | --metric_for_best_model eval_f1 -------------------------------------------------------------------------------- /seqtag/run_argument_extraction.sh: -------------------------------------------------------------------------------- 1 | 2 | # first, lets train argument extraction model on golden triggers 3 | python3 run_ner.py \ 4 | --model_name_or_path bert-base-chinese \ 5 | --task_name arg \ 6 | --output_dir ./output/arg_seqtag/bert-base-chinese \ 7 | --overwrite_output_dir \ 8 | --train_file ../dataset/tagging/tagging_train.csv \ 9 | --validation_file ../dataset/tagging/tagging_dev.csv \ 10 | --test_file ../dataset/tagging/tagging_test.csv \ 11 | --do_train \ 12 | --do_eval \ 13 | --do_predict \ 14 | --output_filename arg_predictions.csv \ 15 | --per_device_train_batch_size=32 \ 16 | --per_device_eval_batch_size=32 \ 17 | --num_train_epochs 30 \ 18 | --save_strategy epoch \ 19 | --logging_strategy epoch \ 20 | --evaluation_strategy epoch \ 21 | --save_total_limit 1 \ 22 | --load_best_model_at_end \ 23 | --metric_for_best_model f1 \ 24 | --text_column_name tokens 25 | 26 | 27 | # then, given the extraced triggers, we could use this model to predict arguments in a pipeline manner 28 | PRED_TRIGGER_FILE=./output/trg_seqtag/bert-base-chinese/trg_predictions.csv 29 | python3 run_ner.py \ 30 | --model_name_or_path ./output/arg_seqtag/bert-base-chinese \ 31 | --task_name arg \ 32 | --output_dir ./output/arg_seqtag/bert-base-chinese \ 33 | --overwrite_output_dir \ 34 | --train_file ../dataset/tagging/tagging_train.csv \ 35 | --validation_file ../dataset/tagging/tagging_dev.csv \ 36 | --test_file ../dataset/tagging/tagging_test.csv \ 37 | --pred_trg_file $PRED_TRIGGER_FILE \ 38 | --do_train false \ 39 | --do_eval false \ 40 | --do_predict \ 41 | --output_filename pipeline_predictions.csv \ 42 | --per_device_train_batch_size=32 \ 43 | --per_device_eval_batch_size=32 \ 44 | --num_train_epochs 30 \ 45 | --save_strategy epoch \ 46 | --logging_strategy epoch \ 47 | --evaluation_strategy epoch \ 48 | --save_total_limit 1 \ 49 | --load_best_model_at_end \ 50 | --metric_for_best_model f1 \ 51 | --text_column_name tokens -------------------------------------------------------------------------------- /mrc/run_seq2seqmrc.sh: -------------------------------------------------------------------------------- 1 | # first, lets train argument extraction Seq2SeqMRC model on golden triggers 2 | python run_seq2seq_qa.py \ 3 | --model_name_or_path google/mt5-base \ 4 | --output_dir ./output/arg_seq2seqmrc/mt5-base \ 5 | --overwrite_output_dir \ 6 | --version_2_with_negative \ 7 | --train_file ../dataset/train.csv \ 8 | --validation_file ../dataset/dev.csv \ 9 | --test_file ../dataset/test.csv \ 10 | --output_filename arg_predictions.csv \ 11 | --eval_accumulation_steps 1 \ 12 | --predict_with_generate \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --learning_rate 1e-4 \ 17 | --per_device_train_batch_size 32 \ 18 | --per_device_eval_batch_size 8 \ 19 | 
--num_train_epochs 30 \ 20 | --save_strategy epoch \ 21 | --logging_strategy epoch \ 22 | --evaluation_strategy epoch \ 23 | --save_total_limit 1 \ 24 | --load_best_model_at_end \ 25 | --metric_for_best_model eval_f1 26 | 27 | # then, given the extraced triggers, we could use this model to predict arguments in a pipeline manner 28 | PRED_TRIGGER_FILE=../seqtag/output/trg_seqtag/bert-base-chinese/trg_predictions.csv 29 | python run_seq2seq_qa.py \ 30 | --model_name_or_path ./output/arg_seq2seqmrc/mt5-base \ 31 | --output_dir ./output/arg_seq2seqmrc/mt5-base \ 32 | --overwrite_output_dir \ 33 | --version_2_with_negative \ 34 | --train_file ../dataset/train.csv \ 35 | --validation_file ../dataset/dev.csv \ 36 | --test_file ../dataset/test.csv \ 37 | --pred_trg_file $PRED_TRIGGER_FILE \ 38 | --output_filename pipeline_predictions.csv \ 39 | --eval_accumulation_steps 1 \ 40 | --predict_with_generate \ 41 | --do_train false \ 42 | --do_eval false \ 43 | --do_predict \ 44 | --learning_rate 1e-4 \ 45 | --per_device_train_batch_size 32 \ 46 | --per_device_eval_batch_size 8 \ 47 | --num_train_epochs 30 \ 48 | --save_strategy epoch \ 49 | --logging_strategy epoch \ 50 | --evaluation_strategy epoch \ 51 | --save_total_limit 1 \ 52 | --load_best_model_at_end \ 53 | --metric_for_best_model eval_f1 -------------------------------------------------------------------------------- /mrc/run_qa.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=5 python run_seq2seq_qa.py \ 3 | --model_name_or_path ../output/arg_seq2seq_qa/mt5-base \ 4 | --output_dir ../output/arg_seq2seq_qa/mt5-base \ 5 | --overwrite_output_dir \ 6 | --version_2_with_negative \ 7 | --train_file ../../datasets/Title2Event/train.csv \ 8 | --validation_file ../../datasets/Title2Event/dev.csv \ 9 | --test_file ../../datasets/Title2Event/test.csv \ 10 | --eval_accumulation_steps 1 \ 11 | --predict_with_generate \ 12 | --version_2_with_negative \ 13 | --do_train False --do_eval False \ 14 | --learning_rate 1e-4 \ 15 | --do_predict \ 16 | --per_device_train_batch_size 32 \ 17 | --per_device_eval_batch_size 8 \ 18 | --num_train_epochs 30 \ 19 | --save_strategy epoch \ 20 | --logging_strategy epoch \ 21 | --evaluation_strategy epoch \ 22 | --save_total_limit 1 \ 23 | --load_best_model_at_end \ 24 | --metric_for_best_model eval_f1 \ 25 | --pred_trg_file ../output/trg_ner/bert-base-chinese/trg_predictions.csv \ 26 | 27 | # CUDA_VISIBLE_DEVICES=2 python run_qa.py \ 28 | # --model_name_or_path ../output/arg_qa/bert-base-chinese \ 29 | # --output_dir ../output/arg_qa/bert-base-chinese \ 30 | # --overwrite_output_dir \ 31 | # --version_2_with_negative \ 32 | # --train_file ../../datasets/Title2Event/train.csv \ 33 | # --validation_file ../../datasets/Title2Event/dev.csv \ 34 | # --test_file ../../datasets/Title2Event/test.csv \ 35 | # --do_train False --do_eval False \ 36 | # --do_predict \ 37 | # --per_device_train_batch_size=32 \ 38 | # --per_device_eval_batch_size=32 \ 39 | # --num_train_epochs 30 \ 40 | # --save_strategy epoch \ 41 | # --logging_strategy epoch \ 42 | # --evaluation_strategy epoch \ 43 | # --save_total_limit 1 \ 44 | # --load_best_model_at_end \ 45 | # --metric_for_best_model eval_f1 \ 46 | # --pred_trg_file ../output/trg_ner/bert-base-chinese/trg_predictions.csv \ 47 | 48 | 49 | # CUDA_VISIBLE_DEVICES=7 python run_seq2seq_qa.py \ 50 | # --model_name_or_path t5-base \ 51 | # --dataset_name squad_v2 \ 52 | # --context_column context \ 53 | # --question_column question \ 54 | # 
--answer_column answers \ 55 | # --do_train \ 56 | # --do_eval \ 57 | # --per_device_train_batch_size 12 \ 58 | # --learning_rate 3e-5 \ 59 | # --num_train_epochs 2 \ 60 | # --max_seq_length 384 \ 61 | # --doc_stride 128 \ 62 | # --output_dir ../output/tmp/t5-base -------------------------------------------------------------------------------- /mrc/mrc_task_config.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import Dataset 3 | 4 | def load_my_datasets_for_mrc(data_args, task_args): 5 | raw_dataset = {} 6 | if data_args.test_file is not None: 7 | tst_df = pd.read_csv(data_args.test_file).fillna("None") 8 | tst_df = preprocess(tst_df, task_args, is_pred_data=True) 9 | test_dataset = Dataset.from_pandas(tst_df) 10 | raw_dataset['test'] = test_dataset 11 | if data_args.train_file is not None: 12 | trn_df = pd.read_csv(data_args.train_file).fillna("None") 13 | trn_df = preprocess(trn_df, task_args) 14 | train_dataset = Dataset.from_pandas(trn_df) 15 | raw_dataset['train'] = train_dataset 16 | if data_args.validation_file is not None: 17 | val_df = pd.read_csv(data_args.validation_file).fillna("None") 18 | val_df = val_df 19 | val_df = preprocess(val_df, task_args) 20 | valid_dataset = Dataset.from_pandas(val_df) 21 | raw_dataset['validation'] = valid_dataset 22 | return raw_dataset 23 | 24 | def preprocess(df, task_arg, is_pred_data=False): 25 | def find_answers(row): 26 | sbj_ans, obj_ans = {"text": [], "answer_start": []}, {"text": [], "answer_start": []} 27 | if task_arg.is_extractive: 28 | if row.triple and row.triple[0] and row.triple[0] in row.title: 29 | sbj_ans['text'] = [row.triple[0]] 30 | sbj_ans['answer_start'] = [row.title.index(row.triple[0])] 31 | if row.triple and row.triple[2] and row.triple[2] in row.title: 32 | obj_ans['text'] = [row.triple[2]] 33 | obj_ans['answer_start'] = [row.title.index(row.triple[2])] 34 | else: 35 | if row.triple and row.triple[0]: sbj_ans['text'] = [row.triple[0]] 36 | if row.triple and row.triple[2]: obj_ans['text'] = [row.triple[2]] 37 | return [sbj_ans, obj_ans] 38 | for col in df.columns: 39 | if col.endswith('triple'): 40 | df[col] = df[col].apply(eval) 41 | df['triple'] = df[[f'event{i}_triple' for i in range(1,7)]].apply(lambda row: [x for x in row], axis=1) 42 | df['gold_answer_triples'] = df['triple'].apply(lambda x: x if None not in x else x[:x.index(None)]) 43 | if is_pred_data and task_arg.pred_trg_file: 44 | pred_trg_df = pd.read_csv(task_arg.pred_trg_file) 45 | df['trigger'] = pred_trg_df[task_arg.pred_trg_col].apply(eval) 46 | df['trigger'] = df['trigger'].apply(lambda x: (x + [None] * (6-len(x)))[:6]) 47 | df['triple'] = df[[f'event{i}_triple' for i in range(1,7)]].apply(lambda row: [x for x in row if x !=None], axis=1) 48 | df['triple'] = df['triple'].apply(lambda x: (x + [[]] * (6-len(x)))[:6]) 49 | 50 | else: 51 | df['trigger'] = df['triple'].apply(lambda x: [tu[1] if tu else None for tu in x]) 52 | df['id'] = df['triple'].apply(lambda x: [str(i) for i, _ in enumerate(x)]) 53 | df = df.explode(['triple', 'id', 'trigger'])[['id', 'title_id', 'title', 'trigger', 'triple', 'gold_answer_triples']].dropna().reset_index(drop=True) 54 | df['title_id'] = df['title_id'].apply(lambda x: str(x)) 55 | df['id'] = df['title_id'] + "-" + df['id'] 56 | 57 | df['question'] = df['trigger'].apply(lambda x: [f"动作{x}的主体是?", f"动作{x}的客体是?"]) 58 | df['q_id'] = [["sbj", "obj"]]*len(df) 59 | df['answers'] = df.apply(find_answers, axis=1) 60 | df.rename({"title": "context"}, 
axis=1, inplace=True) 61 | df = df.explode(['question', 'answers', 'q_id']).reset_index(drop=True) 62 | df['id'] = df['id'] + "_" + df['q_id'] 63 | df = df.drop(labels='q_id', axis=1) 64 | return df -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Title2Event 2 | This is the repository for the paper [Title2Event: Benchmarking Open Event Extraction with a Large-scale Chinese Title Dataset](https://aclanthology.org/2022.emnlp-main.437/) 3 | ## Quick Start 4 | ### Download the dataset 5 | You can obtain the dataset from our [webpage](https://open-event-hub.github.io/title2event/) \ 6 | Note that the dataset is provided in both `csv` and `json` format, but currently the baseline code reads `csv` files by default. \ 7 | You can also find `tagging_train.csv`,`tagging_dev.csv` ,`tagging_test.csv`, these files contain the `BIO` labels needed to train tagging-based models, and are used by the `SeqTag` model. 8 | ### Requirements 9 | The code is modified from [examples of huggingface transformers](https://github.com/huggingface/transformers/tree/main/examples) \ 10 | In your preferred environment, run 11 | ``` 12 | pip3 install -r requirements.txt 13 | ``` 14 | ### Trigger Extraction 15 | #### Sequence-tagging model 16 | Note that the trigger prediction file is needed for pipeline inference. 17 | ``` 18 | cd seqtag 19 | bash run_trigger_extraction.sh 20 | ``` 21 | ### Argument Extraction 22 | All the following scripts will output two files: \ 23 | **arg_predictions.csv**: the model predictions with golden triggers \ 24 | **pipeline_predictions.csv**: the model predictions given the triggers predicted by the Trigger Extraction model \ 25 | The above files are used in **Evaluation** 26 | #### Sequence-tagging model 27 | ``` 28 | cd seqtag 29 | bash run_argument_extraction.sh 30 | ``` 31 | #### Span MRC model 32 | ``` 33 | cd mrc 34 | bash run_spanmrc.sh 35 | ``` 36 | #### Seq2Seq MRC model 37 | ``` 38 | cd mrc 39 | bash run_seq2seqmrc.sh 40 | ``` 41 | 42 | ### Evaluation 43 | ``` 44 | python3 evaluate.py -f [path of file1] [path of file2] ... 45 | ``` 46 | ## Citation 47 | ``` 48 | @inproceedings{deng-etal-2022-title2event, 49 | title = "{T}itle2{E}vent: Benchmarking Open Event Extraction with a Large-scale {C}hinese Title Dataset", 50 | author = "Deng, Haolin and 51 | Zhang, Yanan and 52 | Zhang, Yangfan and 53 | Ying, Wangyang and 54 | Yu, Changlong and 55 | Gao, Jun and 56 | Wang, Wei and 57 | Bai, Xiaoling and 58 | Yang, Nan and 59 | Ma, Jin and 60 | Chen, Xiang and 61 | Zhou, Tianhua", 62 | booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", 63 | month = dec, 64 | year = "2022", 65 | address = "Abu Dhabi, United Arab Emirates", 66 | publisher = "Association for Computational Linguistics", 67 | url = "https://aclanthology.org/2022.emnlp-main.437", 68 | pages = "6511--6524", 69 | abstract = "Event extraction (EE) is crucial to downstream tasks such as new aggregation and event knowledge graph construction. Most existing EE datasets manually define fixed event types and design specific schema for each of them, failing to cover diverse events emerging from the online text. Moreover, news titles, an important source of event mentions, have not gained enough attention in current EE research. 
In this paper, we present Title2Event, a large-scale sentence-level dataset benchmarking Open Event Extraction without restricting event types. Title2Event contains more than 42,000 news titles in 34 topics collected from Chinese web pages. To the best of our knowledge, it is currently the largest manually annotated Chinese dataset for open event extraction. We further conduct experiments on Title2Event with different models and show that the characteristics of titles make it challenging for event extraction, addressing the significance of advanced study on this problem. The dataset and baseline codes are available at https://open-event-hub.github.io/title2event.", 70 | } 71 | 72 | ``` 73 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | parser = ArgumentParser() 6 | parser.add_argument('-f', '--file_paths', dest='file_paths', nargs='+', type=str, help="path(s) of model prediction file(s) to evaluate") 7 | parser.add_argument('--pred_col', default='pred_event_triples', help='column name for model predictions') 8 | parser.add_argument('--ans_col', default='event_triples', help='column name for golden answers') 9 | 10 | def read_data(path): 11 | if path.endswith('.csv'): 12 | df = pd.read_csv(path) 13 | return df 14 | 15 | 16 | def evaluate(df: pd.DataFrame, pred_col='pred_event_triples', ans_col='event_triples'): 17 | """ 18 | compute precision, recall, and F1-score for model predictions, based on exact match, and print the tabulated results 19 | args: 20 | df: the dataframe containing a column of model predictions and a column of golden answers, where each cell of both columns should be a list of triplets 21 | pred_col: column name for model predictions 22 | ans_col: column name for golden answers 23 | return: F1-scores of trigger extraction, argument extraction and triplet extraction 24 | """ 25 | gold_trp_num = sum(df[ans_col].apply(lambda x: len(x))) 26 | pred_trp_num = sum(df[pred_col].apply(lambda x: len(x))) 27 | gold_trg_num, gold_arg_num, pred_trg_num, pred_arg_num = 0, 0, 0, 0 28 | trg_match_cnt, arg_match_cnt, triple_match_cnt = 0, 0, 0 29 | 30 | for idx, row in df.iterrows(): 31 | local_triple_match_cnt = 0 32 | gold_trips = list(row[ans_col]) 33 | gold_trgs = [trp[1] for trp in row[ans_col]] 34 | gold_sbjs = [{"P": trp[1], "S": trp[0]} for trp in row[ans_col] if trp[0]!=''] 35 | gold_objs = [{"P": trp[1], "O": trp[2]} for trp in row[ans_col] if trp[2]!=''] 36 | gold_trg_num += len(gold_trgs) 37 | gold_arg_num += (len(gold_sbjs) + len(gold_objs)) 38 | for pred in row[pred_col]: 39 | pred_trg_num += 1 40 | if len(pred) == 1: 41 | pred = ["", pred[0], ""] 42 | elif len(pred) == 2: 43 | pred.append("") 44 | if pred in gold_trips: 45 | local_triple_match_cnt += 1 46 | triple_match_cnt += 1 47 | gold_trips.remove(pred) 48 | 49 | if pred[0] != '': pred_arg_num += 1 50 | if pred[2] != '': pred_arg_num += 1 51 | if pred[1] in gold_trgs: 52 | trg_match_cnt += 1 53 | gold_trgs.remove(pred[1]) 54 | if {"P": pred[1], "S": pred[0]} in gold_sbjs: 55 | arg_match_cnt += 1 56 | gold_sbjs.remove({"P": pred[1], "S": pred[0]}) 57 | if {"P": pred[1], "O": pred[2]} in gold_objs: 58 | arg_match_cnt += 1 59 | gold_objs.remove({"P": pred[1], "O": pred[2]}) 60 | 61 | F1 = lambda p,r: "{:.5f}".format(2*p*r/(p+r)) 62 | trg_p, arg_p, trp_p = trg_match_cnt/pred_trg_num, 
arg_match_cnt/pred_arg_num, triple_match_cnt/pred_trp_num 63 | trg_r, arg_r, trp_r = trg_match_cnt/gold_trg_num, arg_match_cnt/gold_arg_num, triple_match_cnt/gold_trp_num 64 | trg_f, arg_f, trp_f = F1(trg_p, trg_r), F1(arg_p, arg_r), F1(trp_p, trp_r) 65 | 66 | header = ["task", "Precision", "Recall", "F1"] 67 | rows = [ 68 | ("Trigger Identification", trg_p, trg_r, trg_f), 69 | ("Argument Identification", arg_p, arg_r, arg_f), 70 | ("Triple Identification", trp_p, trp_r, trp_f), 71 | ] 72 | 73 | print(tabulate(rows, headers=header)) 74 | print("gold num > {}".format({"trp": gold_trp_num, "trg": gold_trg_num, "arg": gold_arg_num})) 75 | print("pred num > {}".format({"trp": pred_trp_num, "trg": pred_trg_num, "arg": pred_arg_num})) 76 | print("trg match: %d, arg match: %d, trp match: %d" % (trg_match_cnt, arg_match_cnt, triple_match_cnt)) 77 | 78 | return trg_f, arg_f, trp_f 79 | 80 | if __name__ == '__main__': 81 | args = parser.parse_args() 82 | pred_col, ans_col = args.pred_col, args.ans_col 83 | for file_path in args.file_paths: 84 | df = read_data(file_path) 85 | # post-process: 86 | # 1. convert str object "[1,2,3]" to list object [1,2,3] 87 | # 2. unify Chinese and English punctuations 88 | # 3. unify letters to lower case 89 | # 4. discard empty predictions 90 | df[ans_col] = df[ans_col].apply(lambda x: eval(x.lower().replace(" ", "").replace("：", ":"))) 91 | df['pred_event_triples'] = df['pred_event_triples'].apply(lambda x: eval(x.lower().replace(" ", "").replace("：", ":"))) 92 | df['pred_event_triples'] = df['pred_event_triples'].apply(lambda x: [i for i in x if i!=[]]) 93 | 94 | # evaluate 95 | print("************* {} ***********".format(file_path)) 96 | trg_f, arg_f, trp_f = evaluate(df, pred_col=pred_col, ans_col=ans_col) 97 | 98 | 99 | -------------------------------------------------------------------------------- /mrc/trainer_qa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
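#
# Note: this trainer is taken from the HuggingFace question-answering examples; the
# Title2Event MRC baseline relies on its `post_process_function` hook so that raw
# start/end logits are converted to answer strings before QA metrics are computed.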
15 | """ 16 | A subclass of `Trainer` specific to Question-Answering tasks 17 | """ 18 | 19 | from transformers import Trainer, is_torch_tpu_available 20 | from transformers.trainer_utils import PredictionOutput 21 | 22 | 23 | if is_torch_tpu_available(): 24 | import torch_xla.core.xla_model as xm 25 | import torch_xla.debug.metrics as met 26 | 27 | 28 | class QuestionAnsweringTrainer(Trainer): 29 | def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): 30 | super().__init__(*args, **kwargs) 31 | self.eval_examples = eval_examples 32 | self.post_process_function = post_process_function 33 | 34 | def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): 35 | eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset 36 | eval_dataloader = self.get_eval_dataloader(eval_dataset) 37 | eval_examples = self.eval_examples if eval_examples is None else eval_examples 38 | 39 | # Temporarily disable metric computation, we will do it in the loop here. 40 | compute_metrics = self.compute_metrics 41 | self.compute_metrics = None 42 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 43 | try: 44 | output = eval_loop( 45 | eval_dataloader, 46 | description="Evaluation", 47 | # No point gathering the predictions if there are no metrics, otherwise we defer to 48 | # self.args.prediction_loss_only 49 | prediction_loss_only=True if compute_metrics is None else None, 50 | ignore_keys=ignore_keys, 51 | ) 52 | finally: 53 | self.compute_metrics = compute_metrics 54 | 55 | if self.post_process_function is not None and self.compute_metrics is not None: 56 | eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) 57 | metrics = self.compute_metrics(eval_preds) 58 | 59 | # Prefix all keys with metric_key_prefix + '_' 60 | for key in list(metrics.keys()): 61 | if not key.startswith(f"{metric_key_prefix}_"): 62 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 63 | 64 | self.log(metrics) 65 | else: 66 | metrics = {} 67 | 68 | if self.args.tpu_metrics_debug or self.args.debug: 69 | # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 70 | xm.master_print(met.metrics_report()) 71 | 72 | self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) 73 | return metrics 74 | 75 | def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): 76 | predict_dataloader = self.get_test_dataloader(predict_dataset) 77 | 78 | # Temporarily disable metric computation, we will do it in the loop here. 
79 | compute_metrics = self.compute_metrics 80 | self.compute_metrics = None 81 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 82 | try: 83 | output = eval_loop( 84 | predict_dataloader, 85 | description="Prediction", 86 | # No point gathering the predictions if there are no metrics, otherwise we defer to 87 | # self.args.prediction_loss_only 88 | prediction_loss_only=True if compute_metrics is None else None, 89 | ignore_keys=ignore_keys, 90 | ) 91 | finally: 92 | self.compute_metrics = compute_metrics 93 | 94 | if self.post_process_function is None or self.compute_metrics is None: 95 | return output 96 | 97 | predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") 98 | metrics = self.compute_metrics(predictions) 99 | 100 | # Prefix all keys with metric_key_prefix + '_' 101 | for key in list(metrics.keys()): 102 | if not key.startswith(f"{metric_key_prefix}_"): 103 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 104 | 105 | return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) 106 | -------------------------------------------------------------------------------- /mrc/trainer_seq2seq_qa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The HuggingFace Team All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ 16 | A subclass of `Trainer` specific to Question-Answering tasks 17 | """ 18 | from typing import Dict, List, Optional 19 | 20 | from torch.utils.data import Dataset 21 | 22 | from transformers import Seq2SeqTrainer, is_torch_tpu_available 23 | from transformers.trainer_utils import PredictionOutput 24 | 25 | 26 | if is_torch_tpu_available(): 27 | import torch_xla.core.xla_model as xm 28 | import torch_xla.debug.metrics as met 29 | 30 | 31 | class QuestionAnsweringSeq2SeqTrainer(Seq2SeqTrainer): 32 | def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): 33 | super().__init__(*args, **kwargs) 34 | self.eval_examples = eval_examples 35 | self.post_process_function = post_process_function 36 | 37 | # def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): 38 | def evaluate( 39 | self, 40 | eval_dataset: Optional[Dataset] = None, 41 | eval_examples=None, 42 | ignore_keys: Optional[List[str]] = None, 43 | metric_key_prefix: str = "eval", 44 | max_length: Optional[int] = None, 45 | num_beams: Optional[int] = None, 46 | ) -> Dict[str, float]: 47 | self._max_length = max_length if max_length is not None else self.args.generation_max_length 48 | self._num_beams = num_beams if num_beams is not None else self.args.generation_num_beams 49 | 50 | eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset 51 | eval_dataloader = self.get_eval_dataloader(eval_dataset) 52 | eval_examples = self.eval_examples if eval_examples is None else eval_examples 53 | 54 | # Temporarily disable metric computation, we will do it in the loop here. 55 | compute_metrics = self.compute_metrics 56 | self.compute_metrics = None 57 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 58 | try: 59 | output = eval_loop( 60 | eval_dataloader, 61 | description="Evaluation", 62 | # No point gathering the predictions if there are no metrics, otherwise we defer to 63 | # self.args.prediction_loss_only 64 | prediction_loss_only=True if compute_metrics is None else None, 65 | ignore_keys=ignore_keys, 66 | ) 67 | finally: 68 | self.compute_metrics = compute_metrics 69 | 70 | if self.post_process_function is not None and self.compute_metrics is not None: 71 | eval_preds = self.post_process_function(eval_examples, eval_dataset, output) 72 | metrics = self.compute_metrics(eval_preds) 73 | 74 | # Prefix all keys with metric_key_prefix + '_' 75 | for key in list(metrics.keys()): 76 | if not key.startswith(f"{metric_key_prefix}_"): 77 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 78 | 79 | self.log(metrics) 80 | else: 81 | metrics = {} 82 | 83 | if self.args.tpu_metrics_debug or self.args.debug: 84 | # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 85 | xm.master_print(met.metrics_report()) 86 | 87 | self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) 88 | return metrics 89 | 90 | def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): 91 | predict_dataloader = self.get_test_dataloader(predict_dataset) 92 | 93 | # Temporarily disable metric computation, we will do it in the loop here. 
94 | compute_metrics = self.compute_metrics 95 | self.compute_metrics = None 96 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 97 | try: 98 | output = eval_loop( 99 | predict_dataloader, 100 | description="Prediction", 101 | # No point gathering the predictions if there are no metrics, otherwise we defer to 102 | # self.args.prediction_loss_only 103 | prediction_loss_only=True if compute_metrics is None else None, 104 | ignore_keys=ignore_keys, 105 | ) 106 | finally: 107 | self.compute_metrics = compute_metrics 108 | 109 | if self.post_process_function is None or self.compute_metrics is None: 110 | return output 111 | 112 | predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") 113 | metrics = self.compute_metrics(predictions) 114 | 115 | # Prefix all keys with metric_key_prefix + '_' 116 | for key in list(metrics.keys()): 117 | if not key.startswith(f"{metric_key_prefix}_"): 118 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 119 | 120 | return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) 121 | -------------------------------------------------------------------------------- /seqtag/ner_task_config.py: -------------------------------------------------------------------------------- 1 | from unittest import result 2 | import datasets 3 | from datasets import ClassLabel, Dataset, load_dataset, load_metric, Features 4 | import pandas as pd 5 | import pdb 6 | from transformers import AutoTokenizer 7 | dataset_tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext') 8 | dataset_tokenizer.add_special_tokens({"additional_special_tokens": ['“', '”', "…", "……", "—", "——"]}) # unrecognized by tokenizer 9 | 10 | Task2LabelCol = {'trg': "trg_tags", 'arg': "arg_tags", 'joint': ""} 11 | trg_tags = ('O', 'B-T1', 'I-T1', 'B-T2', 'I-T2', 'B-T3', 'I-T3', 'B-T4', 'I-T4', 'B-T5', 'I-T5', 'B-T6', 'I-T6') 12 | arg_tags = ('O', 'B-sbj', 'I-sbj', 'B-obj', 'I-obj') 13 | 14 | 15 | Task2Features = { 16 | # trigger extraction 17 | "trg": Features( 18 | { 19 | # "id": datasets.Value("string"), 20 | "tokens": datasets.Sequence(datasets.Value("string")), 21 | Task2LabelCol['trg']: datasets.Sequence( 22 | feature=datasets.features.ClassLabel( 23 | names=sorted(list(trg_tags)) 24 | ) 25 | ) 26 | } 27 | ), 28 | # argument extraction 29 | "arg": Features( 30 | { 31 | # "id": datasets.Value("string"), 32 | "tokens": datasets.Sequence(datasets.Value("string")), 33 | Task2LabelCol['arg']: datasets.Sequence( 34 | feature=datasets.features.ClassLabel( 35 | names=sorted(list(arg_tags)) 36 | ) 37 | ) 38 | } 39 | ) 40 | } 41 | 42 | def load_my_datasets_for_ner(data_args, task_args): 43 | if data_args.test_file is not None: 44 | tst_df = pd.read_csv(data_args.test_file) 45 | tst_df = preprocess(tst_df, task_args, is_test_data=True) 46 | test_dataset = Dataset.from_pandas(tst_df) 47 | # load dataframes 48 | if data_args.train_file is not None: 49 | trn_df = pd.read_csv(data_args.train_file) 50 | trn_df = preprocess(trn_df, task_args) 51 | train_dataset = Dataset.from_pandas(trn_df) 52 | if data_args.validation_file is not None: 53 | val_df = pd.read_csv(data_args.validation_file) 54 | val_df = preprocess(val_df, task_args) 55 | valid_dataset = Dataset.from_pandas(val_df) 56 | raw_dataset = {"train": train_dataset, 'validation': valid_dataset, 'test': test_dataset} 57 | return raw_dataset 58 | 59 | def preprocess(df: pd.DataFrame, task_args, 
is_test_data=False): 60 | for col in df.columns: 61 | if col.endswith('tags') or col.endswith('triple') or col=='tokens': 62 | df[col] = df[col].fillna("None").apply(eval) 63 | if task_args.just_infer and is_test_data: 64 | df['tokens'] = df['title'].apply(lambda x: dataset_tokenizer.tokenize(x)) 65 | df[Task2LabelCol[task_args.task_name]] = df['tokens'].apply(lambda x: ["O"]*len(x)) 66 | 67 | if is_test_data and task_args.task_name == 'arg' and task_args.pred_trg_file: # do argument extraction based on predicted triggers rather than golden labeled triggers 68 | pred_trg_df = pd.read_csv(task_args.pred_trg_file) 69 | pred_trg_df[task_args.pred_trg_tag_col] = pred_trg_df[task_args.pred_trg_tag_col].apply(eval) 70 | df['trg_tags'] = pred_trg_df[task_args.pred_trg_tag_col] 71 | df = expand_arg_tags(df, ignore_arg_tags=True) 72 | 73 | elif task_args.task_name == 'arg': 74 | df = expand_arg_tags(df) 75 | return df 76 | 77 | def expand_arg_tags(df: pd.DataFrame, ignore_arg_tags=False): 78 | """ 79 | expand arg_tags for all events in a single sentence 80 | args: 81 | df: the loaded dataframe 82 | ignore_arg_tags: if set to True, will not align all golden labeled arguments to their corresponding triggers, 83 | should be used when doing argument extraction based on predicted triggers rather than golden labeled triggers, 84 | since the number of predicted triggers may be less than total number of golden labeled triggers 85 | """ 86 | df['trg_tags'] = df['trg_tags'].apply(split_trg_tags) 87 | if ignore_arg_tags: 88 | df = df.explode('trg_tags') 89 | # df['trg_tags'].fillna(value=None, inplace=True) 90 | df['arg_tags'] = df.tokens.apply(lambda x: ['O']*len(x)) 91 | df = df[~(df['trg_tags'].isna())].reset_index() 92 | else: 93 | df['arg_tags'] = df[[f'event{i}_arg_tags' for i in range(1,7)]].values.tolist() 94 | df['event_triples'] = df[[f'event{i}_triple' for i in range(1,7)]].values.tolist() 95 | df['event_triples'] = df['event_triples'].apply(lambda x: [list(i) for i in x]) 96 | df = df.explode(['trg_tags', 'arg_tags', 'event_triples']).reset_index() 97 | df = df[~(df['trg_tags'].isna()) & ~(df['arg_tags']!=None).isna()].reset_index() 98 | return df 99 | 100 | def split_trg_tags(trg_tags): 101 | splited_tags = [] 102 | if trg_tags: 103 | for i in range(1,7): 104 | tags = None 105 | if f'B-T{i}' in trg_tags or f'I-T{i}' in trg_tags: 106 | tags = [tag if str(i) in tag else "O" for tag in trg_tags] 107 | splited_tags.append(tags) 108 | if splited_tags == [None]*6: # if do argument extraction based on predicted triggers, and there's no predicted trigger for this instance, should at least keep one trg_tags so that this instance is not lost 109 | splited_tags[0] = trg_tags 110 | return splited_tags 111 | 112 | def tags2text(tags, tokenized_src_text): 113 | """ 114 | convert BIO tags into a list of tokens (only convert B and I which represent event elements, skip O) 115 | args: 116 | tags: BIO tags of the example on token level 117 | tokenized_src_text: tokenized input text 118 | """ 119 | assert len(tags)==len(tokenized_src_text), "tags and tokenized input text should be of same length" 120 | result = [] 121 | cur_tkns = [] 122 | cur_tags = [] 123 | for i,tkn in enumerate(tokenized_src_text): 124 | if tags[i]=='O': continue 125 | if cur_tags==[] or tags[i].split('-')[1] == cur_tags[-1].split('-')[1]: 126 | cur_tags.append(tags[i]) 127 | cur_tkns.append(tkn) 128 | else: 129 | result.append(dataset_tokenizer.convert_tokens_to_string(cur_tkns).replace(" ", "")) 130 | cur_tkns = [tkn] 131 | cur_tags = 
[tags[i]] 132 | if cur_tkns: result.append(dataset_tokenizer.convert_tokens_to_string(cur_tkns).replace(" ", "")) 133 | return result 134 | 135 | def combine_trg_args(row): 136 | """ 137 | combine triggers (golden labeled or predicted) with predicted arguments to form triplets 138 | """ 139 | triple = row.pred_arguments 140 | if triple: 141 | if row.triggers: 142 | triple.insert(1, row.triggers[0]) 143 | else: 144 | triple.insert(1, "") 145 | if len(triple) < 3: triple.append("") 146 | return triple 147 | 148 | def agg_triples(row): 149 | """ 150 | aggregate triplets belonging to the same source text 151 | """ 152 | triples = [] 153 | for i in range(1,7): 154 | if len(row[f'event{i}_triple'])>0: 155 | triples.append(list(row[f'event{i}_triple'])) 156 | return triples -------------------------------------------------------------------------------- /mrc/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Question answering 18 | 19 | This folder contains several scripts that showcase how to fine-tune a 🤗 Transformers model on a question answering dataset, 20 | like SQuAD. 21 | 22 | ## Trainer-based scripts 23 | 24 | The [`run_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa.py), 25 | [`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search.py) and [`run_seq2seq_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py) leverage the 🤗 [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) for fine-tuning. 26 | 27 | ### Fine-tuning BERT on SQuAD1.0 28 | 29 | The [`run_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa.py) script 30 | allows to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture has a `ForQuestionAnswering` version in the library) on a question-answering dataset (such as SQuAD, or any other QA dataset available in the `datasets` library, or your own csv/jsonlines files) as long as they are structured the same way as SQuAD. You might need to tweak the data processing inside the script if your data is structured differently. 31 | 32 | **Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it 33 | uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in 34 | [this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version of the script which can be found [here](https://github.com/huggingface/transformers/tree/main/examples/legacy/question-answering). 35 | 36 | Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`. 37 | 38 | This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 39 | on a single tesla V100 16GB. 
40 | 41 | ```bash 42 | python run_qa.py \ 43 | --model_name_or_path bert-base-uncased \ 44 | --dataset_name squad \ 45 | --do_train \ 46 | --do_eval \ 47 | --per_device_train_batch_size 12 \ 48 | --learning_rate 3e-5 \ 49 | --num_train_epochs 2 \ 50 | --max_seq_length 384 \ 51 | --doc_stride 128 \ 52 | --output_dir /tmp/debug_squad/ 53 | ``` 54 | 55 | Training with the previously defined hyper-parameters yields the following results: 56 | 57 | ```bash 58 | f1 = 88.52 59 | exact_match = 81.22 60 | ``` 61 | 62 | ### Fine-tuning XLNet with beam search on SQuAD 63 | 64 | The [`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search.py) script is only meant to fine-tune XLNet, which is a special encoder-only Transformer model. The example code below fine-tunes XLNet on the SQuAD1.0 and SQuAD2.0 datasets. 65 | 66 | #### Command for SQuAD1.0: 67 | 68 | ```bash 69 | python run_qa_beam_search.py \ 70 | --model_name_or_path xlnet-large-cased \ 71 | --dataset_name squad \ 72 | --do_train \ 73 | --do_eval \ 74 | --learning_rate 3e-5 \ 75 | --num_train_epochs 2 \ 76 | --max_seq_length 384 \ 77 | --doc_stride 128 \ 78 | --output_dir ./wwm_cased_finetuned_squad/ \ 79 | --per_device_eval_batch_size=4 \ 80 | --per_device_train_batch_size=4 \ 81 | --save_steps 5000 82 | ``` 83 | 84 | #### Command for SQuAD2.0: 85 | 86 | ```bash 87 | export SQUAD_DIR=/path/to/SQUAD 88 | 89 | python run_qa_beam_search.py \ 90 | --model_name_or_path xlnet-large-cased \ 91 | --dataset_name squad_v2 \ 92 | --do_train \ 93 | --do_eval \ 94 | --version_2_with_negative \ 95 | --learning_rate 3e-5 \ 96 | --num_train_epochs 4 \ 97 | --max_seq_length 384 \ 98 | --doc_stride 128 \ 99 | --output_dir ./wwm_cased_finetuned_squad/ \ 100 | --per_device_eval_batch_size=2 \ 101 | --per_device_train_batch_size=2 \ 102 | --save_steps 5000 103 | ``` 104 | 105 | ### Fine-tuning T5 on SQuAD2.0 106 | 107 | The [`run_seq2seq_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These 108 | models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer. 109 | 110 | This example code fine-tunes T5 on the SQuAD2.0 dataset. 111 | 112 | ```bash 113 | python run_seq2seq_qa.py \ 114 | --model_name_or_path t5-small \ 115 | --dataset_name squad_v2 \ 116 | --context_column context \ 117 | --question_column question \ 118 | --answer_column answer \ 119 | --do_train \ 120 | --do_eval \ 121 | --per_device_train_batch_size 12 \ 122 | --learning_rate 3e-5 \ 123 | --num_train_epochs 2 \ 124 | --max_seq_length 384 \ 125 | --doc_stride 128 \ 126 | --output_dir /tmp/debug_seq2seq_squad/ 127 | ``` 128 | 129 | ## Accelerate-based scripts 130 | 131 | Based on the scripts `run_qa_no_trainer.py` and `run_qa_beam_search_no_trainer.py`. 132 | 133 | Like `run_qa.py` and `run_qa_beam_search.py`, these scripts allow you to fine-tune any of the models supported on a 134 | SQuAD or a similar dataset, the main difference is that this script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like. 
It offers less options than the script with `Trainer` (for instance you can easily change the options for the optimizer or the dataloaders directly in the script), but still run in a distributed setup, on TPU and supports mixed precision by leveraging the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. 135 | 136 | You can use the script normally after installing it: 137 | 138 | ```bash 139 | pip install accelerate 140 | ``` 141 | 142 | then 143 | 144 | ```bash 145 | python run_qa_no_trainer.py \ 146 | --model_name_or_path bert-base-uncased \ 147 | --dataset_name squad \ 148 | --max_seq_length 384 \ 149 | --doc_stride 128 \ 150 | --output_dir ~/tmp/debug_squad 151 | ``` 152 | 153 | You can then use your usual launchers to run in it in a distributed environment, but the easiest way is to run 154 | 155 | ```bash 156 | accelerate config 157 | ``` 158 | 159 | and reply to the questions asked. Then 160 | 161 | ```bash 162 | accelerate test 163 | ``` 164 | 165 | that will check everything is ready for training. Finally, you can launch training with 166 | 167 | ```bash 168 | accelerate launch run_qa_no_trainer.py \ 169 | --model_name_or_path bert-base-uncased \ 170 | --dataset_name squad \ 171 | --max_seq_length 384 \ 172 | --doc_stride 128 \ 173 | --output_dir ~/tmp/debug_squad 174 | ``` 175 | 176 | This command is the same and will work for: 177 | 178 | - a CPU-only setup 179 | - a setup with one GPU 180 | - a distributed training with several GPUs (single or multi node) 181 | - a training on TPUs 182 | 183 | Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it. 184 | -------------------------------------------------------------------------------- /mrc/utils_qa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Post-processing utilities for question answering. 17 | """ 18 | import collections 19 | import json 20 | import logging 21 | import os 22 | from typing import Optional, Tuple 23 | 24 | import numpy as np 25 | from tqdm.auto import tqdm 26 | 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def postprocess_qa_predictions( 32 | examples, 33 | features, 34 | predictions: Tuple[np.ndarray, np.ndarray], 35 | version_2_with_negative: bool = False, 36 | n_best_size: int = 20, 37 | max_answer_length: int = 30, 38 | null_score_diff_threshold: float = 0.0, 39 | output_dir: Optional[str] = None, 40 | prefix: Optional[str] = None, 41 | log_level: Optional[int] = logging.WARNING, 42 | ): 43 | """ 44 | Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the 45 | original contexts. This is the base postprocessing functions for models that only return start and end logits. 
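In this repository it is used by the SpanMRC baseline (`run_qa.py`) to turn those logits into
subject/object answer strings for the trigger-specific questions built in `mrc_task_config.py`.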
46 | 47 | Args: 48 | examples: The non-preprocessed dataset (see the main script for more information). 49 | features: The processed dataset (see the main script for more information). 50 | predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): 51 | The predictions of the model: two arrays containing the start logits and the end logits respectively. Its 52 | first dimension must match the number of elements of :obj:`features`. 53 | version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): 54 | Whether or not the underlying dataset contains examples with no answers. 55 | n_best_size (:obj:`int`, `optional`, defaults to 20): 56 | The total number of n-best predictions to generate when looking for an answer. 57 | max_answer_length (:obj:`int`, `optional`, defaults to 30): 58 | The maximum length of an answer that can be generated. This is needed because the start and end predictions 59 | are not conditioned on one another. 60 | null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): 61 | The threshold used to select the null answer: if the best answer has a score that is less than the score of 62 | the null answer minus this threshold, the null answer is selected for this example (note that the score of 63 | the null answer for an example giving several features is the minimum of the scores for the null answer on 64 | each feature: all features must be aligned on the fact they `want` to predict a null answer). 65 | 66 | Only useful when :obj:`version_2_with_negative` is :obj:`True`. 67 | output_dir (:obj:`str`, `optional`): 68 | If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if 69 | :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null 70 | answers, are saved in `output_dir`. 71 | prefix (:obj:`str`, `optional`): 72 | If provided, the dictionaries mentioned above are saved with `prefix` added to their names. 73 | log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): 74 | ``logging`` log level (e.g., ``logging.WARNING``) 75 | """ 76 | if len(predictions) != 2: 77 | raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") 78 | all_start_logits, all_end_logits = predictions 79 | 80 | if len(predictions[0]) != len(features): 81 | raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") 82 | 83 | # Build a map example to its corresponding features. 84 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])} 85 | features_per_example = collections.defaultdict(list) 86 | for i, feature in enumerate(features): 87 | features_per_example[example_id_to_index[feature["example_id"]]].append(i) 88 | 89 | # The dictionaries we have to fill. 90 | all_predictions = collections.OrderedDict() 91 | all_nbest_json = collections.OrderedDict() 92 | if version_2_with_negative: 93 | scores_diff_json = collections.OrderedDict() 94 | 95 | # Logging. 96 | logger.setLevel(log_level) 97 | logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") 98 | 99 | # Let's loop over all the examples! 100 | for example_index, example in enumerate(tqdm(examples)): 101 | # Those are the indices of the features associated to the current example. 102 | feature_indices = features_per_example[example_index] 103 | 104 | min_null_prediction = None 105 | prelim_predictions = [] 106 | 107 | # Looping through all the features associated to the current example. 
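# A long context may have been split into several overlapping features (controlled by
# `doc_stride`), so candidate spans are gathered from every feature and the null-answer
# score is tracked as the minimum feature null score.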
108 | for feature_index in feature_indices: 109 | # We grab the predictions of the model for this feature. 110 | start_logits = all_start_logits[feature_index] 111 | end_logits = all_end_logits[feature_index] 112 | # This is what will allow us to map some the positions in our logits to span of texts in the original 113 | # context. 114 | offset_mapping = features[feature_index]["offset_mapping"] 115 | # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context 116 | # available in the current feature. 117 | token_is_max_context = features[feature_index].get("token_is_max_context", None) 118 | 119 | # Update minimum null prediction. 120 | feature_null_score = start_logits[0] + end_logits[0] 121 | if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: 122 | min_null_prediction = { 123 | "offsets": (0, 0), 124 | "score": feature_null_score, 125 | "start_logit": start_logits[0], 126 | "end_logit": end_logits[0], 127 | } 128 | 129 | # Go through all possibilities for the `n_best_size` greater start and end logits. 130 | start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() 131 | end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() 132 | for start_index in start_indexes: 133 | for end_index in end_indexes: 134 | # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond 135 | # to part of the input_ids that are not in the context. 136 | if ( 137 | start_index >= len(offset_mapping) 138 | or end_index >= len(offset_mapping) 139 | or offset_mapping[start_index] is None 140 | or len(offset_mapping[start_index]) < 2 141 | or offset_mapping[end_index] is None 142 | or len(offset_mapping[end_index]) < 2 143 | ): 144 | continue 145 | # Don't consider answers with a length that is either < 0 or > max_answer_length. 146 | if end_index < start_index or end_index - start_index + 1 > max_answer_length: 147 | continue 148 | # Don't consider answer that don't have the maximum context available (if such information is 149 | # provided). 150 | if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): 151 | continue 152 | 153 | prelim_predictions.append( 154 | { 155 | "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), 156 | "score": start_logits[start_index] + end_logits[end_index], 157 | "start_logit": start_logits[start_index], 158 | "end_logit": end_logits[end_index], 159 | } 160 | ) 161 | if version_2_with_negative: 162 | # Add the minimum null prediction 163 | prelim_predictions.append(min_null_prediction) 164 | null_score = min_null_prediction["score"] 165 | 166 | # Only keep the best `n_best_size` predictions. 167 | predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] 168 | 169 | # Add back the minimum null prediction if it was removed because of its low score. 170 | if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): 171 | predictions.append(min_null_prediction) 172 | 173 | # Use the offsets to gather the answer text in the original context. 174 | context = example["context"] 175 | for pred in predictions: 176 | offsets = pred.pop("offsets") 177 | pred["text"] = context[offsets[0] : offsets[1]] 178 | 179 | # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid 180 | # failure. 
181 | if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): 182 | predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) 183 | 184 | # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using 185 | # the LogSumExp trick). 186 | scores = np.array([pred.pop("score") for pred in predictions]) 187 | exp_scores = np.exp(scores - np.max(scores)) 188 | probs = exp_scores / exp_scores.sum() 189 | 190 | # Include the probabilities in our predictions. 191 | for prob, pred in zip(probs, predictions): 192 | pred["probability"] = prob 193 | 194 | # Pick the best prediction. If the null answer is not possible, this is easy. 195 | if not version_2_with_negative: 196 | all_predictions[example["id"]] = predictions[0]["text"] 197 | else: 198 | # Otherwise we first need to find the best non-empty prediction. 199 | i = 0 200 | while predictions[i]["text"] == "": 201 | i += 1 202 | best_non_null_pred = predictions[i] 203 | 204 | # Then we compare to the null prediction using the threshold. 205 | score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] 206 | scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. 207 | if score_diff > null_score_diff_threshold: 208 | all_predictions[example["id"]] = "" 209 | else: 210 | all_predictions[example["id"]] = best_non_null_pred["text"] 211 | 212 | # Make `predictions` JSON-serializable by casting np.float back to float. 213 | all_nbest_json[example["id"]] = [ 214 | {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} 215 | for pred in predictions 216 | ] 217 | 218 | # If we have an output_dir, let's save all those dicts. 
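# `predictions.json` stores the final answer per example id, `nbest_predictions.json` the
# scored n-best candidates, and, when `version_2_with_negative` is set, `null_odds.json`
# the score margin used to choose between the best span and the empty answer.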
219 | if output_dir is not None: 220 | if not os.path.isdir(output_dir): 221 | raise EnvironmentError(f"{output_dir} is not a directory.") 222 | 223 | prediction_file = os.path.join( 224 | output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" 225 | ) 226 | nbest_file = os.path.join( 227 | output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" 228 | ) 229 | if version_2_with_negative: 230 | null_odds_file = os.path.join( 231 | output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" 232 | ) 233 | 234 | logger.info(f"Saving predictions to {prediction_file}.") 235 | with open(prediction_file, "w") as writer: 236 | writer.write(json.dumps(all_predictions, indent=4) + "\n") 237 | logger.info(f"Saving nbest_preds to {nbest_file}.") 238 | with open(nbest_file, "w") as writer: 239 | writer.write(json.dumps(all_nbest_json, indent=4) + "\n") 240 | if version_2_with_negative: 241 | logger.info(f"Saving null_odds to {null_odds_file}.") 242 | with open(null_odds_file, "w") as writer: 243 | writer.write(json.dumps(scores_diff_json, indent=4) + "\n") 244 | 245 | return all_predictions 246 | 247 | 248 | def postprocess_qa_predictions_with_beam_search( 249 | examples, 250 | features, 251 | predictions: Tuple[np.ndarray, np.ndarray], 252 | version_2_with_negative: bool = False, 253 | n_best_size: int = 20, 254 | max_answer_length: int = 30, 255 | start_n_top: int = 5, 256 | end_n_top: int = 5, 257 | output_dir: Optional[str] = None, 258 | prefix: Optional[str] = None, 259 | log_level: Optional[int] = logging.WARNING, 260 | ): 261 | """ 262 | Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the 263 | original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as 264 | cls token predictions. 265 | 266 | Args: 267 | examples: The non-preprocessed dataset (see the main script for more information). 268 | features: The processed dataset (see the main script for more information). 269 | predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): 270 | The predictions of the model: two arrays containing the start logits and the end logits respectively. Its 271 | first dimension must match the number of elements of :obj:`features`. 272 | version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): 273 | Whether or not the underlying dataset contains examples with no answers. 274 | n_best_size (:obj:`int`, `optional`, defaults to 20): 275 | The total number of n-best predictions to generate when looking for an answer. 276 | max_answer_length (:obj:`int`, `optional`, defaults to 30): 277 | The maximum length of an answer that can be generated. This is needed because the start and end predictions 278 | are not conditioned on one another. 279 | start_n_top (:obj:`int`, `optional`, defaults to 5): 280 | The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. 281 | end_n_top (:obj:`int`, `optional`, defaults to 5): 282 | The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. 283 | output_dir (:obj:`str`, `optional`): 284 | If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if 285 | :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null 286 | answers, are saved in `output_dir`. 
287 | prefix (:obj:`str`, `optional`): 288 | If provided, the dictionaries mentioned above are saved with `prefix` added to their names. 289 | log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): 290 | ``logging`` log level (e.g., ``logging.WARNING``) 291 | """ 292 | if len(predictions) != 5: 293 | raise ValueError("`predictions` should be a tuple with five elements.") 294 | start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions 295 | 296 | if len(predictions[0]) != len(features): 297 | raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") 298 | 299 | # Build a map example to its corresponding features. 300 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])} 301 | features_per_example = collections.defaultdict(list) 302 | for i, feature in enumerate(features): 303 | features_per_example[example_id_to_index[feature["example_id"]]].append(i) 304 | 305 | # The dictionaries we have to fill. 306 | all_predictions = collections.OrderedDict() 307 | all_nbest_json = collections.OrderedDict() 308 | scores_diff_json = collections.OrderedDict() if version_2_with_negative else None 309 | 310 | # Logging. 311 | logger.setLevel(log_level) 312 | logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") 313 | 314 | # Let's loop over all the examples! 315 | for example_index, example in enumerate(tqdm(examples)): 316 | # Those are the indices of the features associated to the current example. 317 | feature_indices = features_per_example[example_index] 318 | 319 | min_null_score = None 320 | prelim_predictions = [] 321 | 322 | # Looping through all the features associated to the current example. 323 | for feature_index in feature_indices: 324 | # We grab the predictions of the model for this feature. 325 | start_log_prob = start_top_log_probs[feature_index] 326 | start_indexes = start_top_index[feature_index] 327 | end_log_prob = end_top_log_probs[feature_index] 328 | end_indexes = end_top_index[feature_index] 329 | feature_null_score = cls_logits[feature_index] 330 | # This is what will allow us to map some the positions in our logits to span of texts in the original 331 | # context. 332 | offset_mapping = features[feature_index]["offset_mapping"] 333 | # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context 334 | # available in the current feature. 335 | token_is_max_context = features[feature_index].get("token_is_max_context", None) 336 | 337 | # Update minimum null prediction 338 | if min_null_score is None or feature_null_score < min_null_score: 339 | min_null_score = feature_null_score 340 | 341 | # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 342 | for i in range(start_n_top): 343 | for j in range(end_n_top): 344 | start_index = int(start_indexes[i]) 345 | j_index = i * end_n_top + j 346 | end_index = int(end_indexes[j_index]) 347 | # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the 348 | # p_mask but let's not take any risk) 349 | if ( 350 | start_index >= len(offset_mapping) 351 | or end_index >= len(offset_mapping) 352 | or offset_mapping[start_index] is None 353 | or offset_mapping[end_index] is None 354 | ): 355 | continue 356 | # Don't consider answers with a length negative or > max_answer_length. 
357 | if end_index < start_index or end_index - start_index + 1 > max_answer_length: 358 | continue 359 | # Don't consider answer that don't have the maximum context available (if such information is 360 | # provided). 361 | if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): 362 | continue 363 | prelim_predictions.append( 364 | { 365 | "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), 366 | "score": start_log_prob[i] + end_log_prob[j_index], 367 | "start_log_prob": start_log_prob[i], 368 | "end_log_prob": end_log_prob[j_index], 369 | } 370 | ) 371 | 372 | # Only keep the best `n_best_size` predictions. 373 | predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] 374 | 375 | # Use the offsets to gather the answer text in the original context. 376 | context = example["context"] 377 | for pred in predictions: 378 | offsets = pred.pop("offsets") 379 | pred["text"] = context[offsets[0] : offsets[1]] 380 | 381 | # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid 382 | # failure. 383 | if len(predictions) == 0: 384 | predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) 385 | 386 | # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using 387 | # the LogSumExp trick). 388 | scores = np.array([pred.pop("score") for pred in predictions]) 389 | exp_scores = np.exp(scores - np.max(scores)) 390 | probs = exp_scores / exp_scores.sum() 391 | 392 | # Include the probabilities in our predictions. 393 | for prob, pred in zip(probs, predictions): 394 | pred["probability"] = prob 395 | 396 | # Pick the best prediction and set the probability for the null answer. 397 | all_predictions[example["id"]] = predictions[0]["text"] 398 | if version_2_with_negative: 399 | scores_diff_json[example["id"]] = float(min_null_score) 400 | 401 | # Make `predictions` JSON-serializable by casting np.float back to float. 402 | all_nbest_json[example["id"]] = [ 403 | {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} 404 | for pred in predictions 405 | ] 406 | 407 | # If we have an output_dir, let's save all those dicts. 
408 | if output_dir is not None: 409 | if not os.path.isdir(output_dir): 410 | raise EnvironmentError(f"{output_dir} is not a directory.") 411 | 412 | prediction_file = os.path.join( 413 | output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" 414 | ) 415 | nbest_file = os.path.join( 416 | output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" 417 | ) 418 | if version_2_with_negative: 419 | null_odds_file = os.path.join( 420 | output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" 421 | ) 422 | 423 | logger.info(f"Saving predictions to {prediction_file}.") 424 | with open(prediction_file, "w") as writer: 425 | writer.write(json.dumps(all_predictions, indent=4) + "\n") 426 | logger.info(f"Saving nbest_preds to {nbest_file}.") 427 | with open(nbest_file, "w") as writer: 428 | writer.write(json.dumps(all_nbest_json, indent=4) + "\n") 429 | if version_2_with_negative: 430 | logger.info(f"Saving null_odds to {null_odds_file}.") 431 | with open(null_odds_file, "w") as writer: 432 | writer.write(json.dumps(scores_diff_json, indent=4) + "\n") 433 | 434 | return all_predictions, scores_diff_json 435 | -------------------------------------------------------------------------------- /seqtag/run_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | # Copyright 2020 The HuggingFace Team All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """ 18 | Fine-tuning the library models for token classification. 19 | """ 20 | # You can also adapt this script on your own token classification task and datasets. Pointers for this are left as 21 | # comments. 
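# Input note (repo-specific; the exact schema lives in ner_task_config.py): the tagging CSVs read by this script are expected to provide at least a `tokens` column, `trg_tags`/`arg_tags` label columns, and per-event columns such as `event1_trigger`, which the prediction-saving code further down relies on.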
22 | 23 | import logging 24 | import os 25 | import sys 26 | from pathlib import Path 27 | from dataclasses import dataclass, field 28 | from typing import Optional 29 | import pdb 30 | import numpy as np 31 | from tqdm import tqdm 32 | import datasets  # imported explicitly: datasets.utils.logging is used in main() 33 | 34 | from transformers import ( 35 | AutoConfig, 36 | AutoModelForTokenClassification, 37 | AutoTokenizer, 38 | DataCollatorForTokenClassification, 39 | HfArgumentParser, 40 | PretrainedConfig, 41 | PreTrainedTokenizerFast, 42 | Trainer, 43 | TrainingArguments, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint 47 | from transformers.utils import check_min_version 48 | from transformers.utils.versions import require_version 49 | import transformers 50 | import evaluate 51 | 52 | from ner_task_config import * 53 | 54 | 55 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") 56 | 57 | logger = logging.getLogger(__name__) 58 | 59 | 60 | @dataclass 61 | class TaskArguments: 62 | """ 63 | Arguments pertaining to which task to perform (e.g., trigger extraction, argument extraction) 64 | """ 65 | task_name: Optional[str] = field( 66 | default="trg", 67 | metadata={"help": "one of {}".format(list(Task2LabelCol.keys()))} 68 | ) 69 | pred_trg_file: Optional[str] = field( 70 | default=None, 71 | metadata={"help": "trigger prediction file; if specified and task_name is 'arg', argument extraction is based on the predicted triggers rather than the gold-labeled triggers"} 72 | ) 73 | pred_trg_tag_col: Optional[str] = field( 74 | default='pred_trg_tags', 75 | metadata={"help": "column name of the predicted trigger tags in pred_trg_file"} 76 | ) 77 | output_filename: Optional[str] = field( 78 | default=None, 79 | metadata={"help": "file name to use for the saved model predictions"} 80 | ) 81 | def __post_init__(self): 82 | self.task_name = self.task_name.lower() 83 | 84 | @dataclass 85 | class ModelArguments: 86 | """ 87 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 88 | """ 89 | 90 | model_name_or_path: str = field( 91 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 92 | ) 93 | config_name: Optional[str] = field( 94 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 95 | ) 96 | tokenizer_name: Optional[str] = field( 97 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 98 | ) 99 | cache_dir: Optional[str] = field( 100 | default=None, 101 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 102 | ) 103 | model_revision: str = field( 104 | default="main", 105 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 106 | ) 107 | use_auth_token: bool = field( 108 | default=False, 109 | metadata={ 110 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 111 | "with private models)." 112 | }, 113 | ) 114 | 115 | 116 | @dataclass 117 | class DataTrainingArguments: 118 | """ 119 | Arguments pertaining to what data we are going to input our model for training and eval.
120 | """ 121 | dataset_name: Optional[str] = field( 122 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 123 | ) 124 | dataset_config_name: Optional[str] = field( 125 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 126 | ) 127 | train_file: Optional[str] = field( 128 | default=None, metadata={"help": "The input training data file (a csv or JSON file)."} 129 | ) 130 | validation_file: Optional[str] = field( 131 | default=None, 132 | metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, 133 | ) 134 | test_file: Optional[str] = field( 135 | default=None, 136 | metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, 137 | ) 138 | text_column_name: Optional[str] = field( 139 | default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."} 140 | ) 141 | label_column_name: Optional[str] = field( 142 | default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."} 143 | ) 144 | overwrite_cache: bool = field( 145 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 146 | ) 147 | preprocessing_num_workers: Optional[int] = field( 148 | default=None, 149 | metadata={"help": "The number of processes to use for the preprocessing."}, 150 | ) 151 | max_seq_length: int = field( 152 | default=None, 153 | metadata={ 154 | "help": "The maximum total input sequence length after tokenization. If set, sequences longer " 155 | "than this will be truncated, sequences shorter will be padded." 156 | }, 157 | ) 158 | pad_to_max_length: bool = field( 159 | default=False, 160 | metadata={ 161 | "help": "Whether to pad all samples to model maximum sentence length. " 162 | "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " 163 | "efficient on GPU but very bad for TPU." 164 | }, 165 | ) 166 | max_train_samples: Optional[int] = field( 167 | default=None, 168 | metadata={ 169 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 170 | "value if set." 171 | }, 172 | ) 173 | max_eval_samples: Optional[int] = field( 174 | default=None, 175 | metadata={ 176 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 177 | "value if set." 178 | }, 179 | ) 180 | max_predict_samples: Optional[int] = field( 181 | default=None, 182 | metadata={ 183 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 184 | "value if set." 185 | }, 186 | ) 187 | label_all_tokens: bool = field( 188 | default=False, 189 | metadata={ 190 | "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " 191 | "one (in which case the other tokens will have a padding index)." 
192 | }, 193 | ) 194 | return_entity_level_metrics: bool = field( 195 | default=False, 196 | metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, 197 | ) 198 | 199 | def __post_init__(self): 200 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 201 | raise ValueError("Need either a dataset name or a training/validation file.") 202 | else: 203 | if self.train_file is not None: 204 | extension = self.train_file.split(".")[-1] 205 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 206 | if self.validation_file is not None: 207 | extension = self.validation_file.split(".")[-1] 208 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 209 | 210 | 211 | def main(): 212 | # See all possible arguments in src/transformers/training_args.py 213 | # or by passing the --help flag to this script. 214 | # We now keep distinct sets of args, for a cleaner separation of concerns. 215 | 216 | parser = HfArgumentParser((TaskArguments, ModelArguments, DataTrainingArguments, TrainingArguments)) 217 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 218 | # If we pass only one argument to the script and it's the path to a json file, 219 | # let's parse it to get our arguments. 220 | task_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 221 | else: 222 | task_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses() 223 | 224 | task_args.just_infer = (not training_args.do_train and training_args.do_predict) 225 | # if task_args.just_infer: 226 | # training_args.do_train, training_args.do_eval, training_args.do_predict = False, False, True 227 | 228 | # Setup logging 229 | logging.basicConfig( 230 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 231 | datefmt="%m/%d/%Y %H:%M:%S", 232 | handlers=[logging.StreamHandler(sys.stdout)], 233 | ) 234 | 235 | log_level = training_args.get_process_log_level() 236 | logger.setLevel(log_level) 237 | datasets.utils.logging.set_verbosity(log_level) 238 | transformers.utils.logging.set_verbosity(log_level) 239 | transformers.utils.logging.enable_default_handler() 240 | transformers.utils.logging.enable_explicit_format() 241 | 242 | # Log on each process the small summary: 243 | logger.warning( 244 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 245 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 246 | ) 247 | logger.info(f"Training/evaluation parameters {training_args}") 248 | 249 | # Detecting last checkpoint. 250 | last_checkpoint = None 251 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 252 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 253 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 254 | raise ValueError( 255 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 256 | "Use --overwrite_output_dir to overcome." 257 | ) 258 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 259 | logger.info( 260 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 261 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
262 | ) 263 | 264 | # Set seed before initializing model. 265 | set_seed(training_args.seed) 266 | 267 | # modified: loading and processing data 268 | logger.info("****** loading and processing data ******") 269 | raw_datasets = load_my_datasets_for_ner(data_args, task_args) 270 | 271 | if training_args.do_train: 272 | column_names = raw_datasets["train"].column_names 273 | # features = raw_datasets["train"].features 274 | else: 275 | column_names = raw_datasets["test"].column_names 276 | # features = raw_datasets["validation"].features 277 | 278 | if data_args.text_column_name is not None: 279 | text_column_name = data_args.text_column_name 280 | elif "tokens" in column_names: 281 | text_column_name = "tokens" 282 | else: 283 | text_column_name = column_names[0] 284 | 285 | if data_args.label_column_name is not None: 286 | label_column_name = data_args.label_column_name 287 | else: 288 | label_column_name = Task2LabelCol[task_args.task_name] 289 | 290 | 291 | # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the 292 | # unique labels. 293 | def get_label_list(labels): 294 | unique_labels = set() 295 | for label in labels: 296 | unique_labels = unique_labels | set(label) 297 | label_list = list(unique_labels) 298 | label_list.sort() 299 | return label_list 300 | 301 | # If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere. 302 | # Otherwise, we have to get the list of labels manually. 303 | # labels_are_int = isinstance(features[label_column_name].feature, ClassLabel) 304 | labels_are_int = False 305 | if labels_are_int: 306 | label_list = Task2Features[task_args.task_name][label_column_name].feature.names 307 | label_to_id = {i: i for i in range(len(label_list))} 308 | else: 309 | # label_list = get_label_list(raw_datasets["train"][label_column_name]) 310 | label_list = Task2Features[task_args.task_name][label_column_name].feature.names 311 | label_to_id = {l: i for i, l in enumerate(label_list)} 312 | 313 | num_labels = len(label_list) 314 | 315 | # Load pretrained model and tokenizer 316 | # 317 | # Distributed training: 318 | # The .from_pretrained methods guarantee that only one local process can concurrently 319 | # download model & vocab. 
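# (Descriptive note: `num_labels` below is the size of the tag inventory defined for the chosen task in ner_task_config.py (`Task2Features`), so the token-classification head is sized for that task's B-/I-/O label set.)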
320 | config = AutoConfig.from_pretrained( 321 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 322 | num_labels=num_labels, 323 | finetuning_task=task_args.task_name, 324 | cache_dir=model_args.cache_dir, 325 | revision=model_args.model_revision, 326 | use_auth_token=True if model_args.use_auth_token else None, 327 | ) 328 | 329 | tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path 330 | if config.model_type in {"gpt2", "roberta"}: 331 | tokenizer = AutoTokenizer.from_pretrained( 332 | tokenizer_name_or_path, 333 | cache_dir=model_args.cache_dir, 334 | use_fast=True, 335 | revision=model_args.model_revision, 336 | use_auth_token=True if model_args.use_auth_token else None, 337 | add_prefix_space=True, 338 | ) 339 | else: 340 | tokenizer = AutoTokenizer.from_pretrained( 341 | tokenizer_name_or_path, 342 | cache_dir=model_args.cache_dir, 343 | use_fast=True, 344 | revision=model_args.model_revision, 345 | use_auth_token=True if model_args.use_auth_token else None, 346 | ) 347 | 348 | model = AutoModelForTokenClassification.from_pretrained( 349 | model_args.model_name_or_path, 350 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 351 | config=config, 352 | cache_dir=model_args.cache_dir, 353 | revision=model_args.model_revision, 354 | use_auth_token=True if model_args.use_auth_token else None, 355 | ) 356 | 357 | # Tokenizer check: this script requires a fast tokenizer. 358 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 359 | raise ValueError( 360 | "This example script only works for models that have a fast tokenizer. Check out the big table of models " 361 | "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " 362 | "requirement." 363 | ) 364 | 365 | # Model has labels -> use them. 366 | if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: 367 | if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)): 368 | # Reorganize `label_list` to match the ordering of the model. 369 | if labels_are_int: 370 | label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)} 371 | label_list = [model.config.id2label[i] for i in range(num_labels)] 372 | else: 373 | label_list = [model.config.id2label[i] for i in range(num_labels)] 374 | label_to_id = {l: i for i, l in enumerate(label_list)} 375 | else: 376 | logger.warning( 377 | "Your model seems to have been trained with labels, but they don't match the dataset: " 378 | f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels: {list(sorted(label_list))}." 379 | "\nIgnoring the model labels as a result." 380 | ) 381 | 382 | # Set the correspondences label/ID inside the model config 383 | model.config.label2id = {l: i for i, l in enumerate(label_list)} 384 | model.config.id2label = {i: l for i, l in enumerate(label_list)} 385 | 386 | # Map that sends B-Xxx label to its I-Xxx counterpart 387 | b_to_i_label = [] 388 | for idx, label in enumerate(label_list): 389 | if label.startswith("B-") and label.replace("B-", "I-") in label_list: 390 | b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) 391 | else: 392 | b_to_i_label.append(idx) 393 | 394 | # Preprocessing the dataset 395 | # Padding strategy 396 | padding = "max_length" if data_args.pad_to_max_length else False 397 | 398 | # Tokenize all texts and align the labels with them.
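# Illustrative example of the alignment below (hypothetical tags): if a word labeled B-XXX is split into two sub-tokens, only the first sub-token keeps B-XXX; the second one is labeled -100 (ignored by the loss) unless --label_all_tokens is passed, in which case it receives the I-XXX counterpart.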
399 | def tokenize_and_align_labels(examples): 400 | tokenized_inputs = tokenizer( 401 | examples[text_column_name], 402 | padding=padding, 403 | truncation=True, 404 | max_length=data_args.max_seq_length, 405 | # We use this argument because the texts in our dataset are lists of words (with a label for each word). 406 | is_split_into_words=True, 407 | ) 408 | labels = [] 409 | 410 | for i, label in enumerate(examples[label_column_name]): 411 | word_ids = tokenized_inputs.word_ids(batch_index=i) 412 | previous_word_idx = None 413 | label_ids = [] 414 | for word_idx in word_ids: 415 | # Special tokens have a word id that is None. We set the label to -100 so they are automatically 416 | # ignored in the loss function. 417 | if word_idx is None: 418 | label_ids.append(-100) 419 | # We set the label for the first token of each word. 420 | elif word_idx != previous_word_idx: 421 | label_ids.append(label_to_id[label[word_idx]]) 422 | 423 | # For the other tokens in a word, we set the label to either the current label or -100, depending on 424 | # the label_all_tokens flag. 425 | else: 426 | if data_args.label_all_tokens: 427 | label_ids.append(b_to_i_label[label_to_id[label[word_idx]]]) 428 | else: 429 | label_ids.append(-100) 430 | previous_word_idx = word_idx 431 | 432 | labels.append(label_ids) 433 | 434 | # set token_type_ids of trigger tokens to 1, used in argument extraction task 435 | if task_args.task_name == 'arg': 436 | trigger_word_ids = [] 437 | for idx, tag in enumerate(examples['trg_tags'][i]): 438 | if tag != 'O': 439 | trigger_word_ids.append(idx) 440 | for idx, _ in enumerate(tokenized_inputs['token_type_ids'][i]): 441 | if word_ids[idx] in trigger_word_ids: 442 | tokenized_inputs['token_type_ids'][i][idx] = 1 443 | tokenized_inputs["labels"] = labels 444 | return tokenized_inputs 445 | 446 | if training_args.do_train: 447 | if "train" not in raw_datasets: 448 | raise ValueError("--do_train requires a train dataset") 449 | train_dataset = raw_datasets["train"] 450 | if data_args.max_train_samples is not None: 451 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 452 | train_dataset = train_dataset.select(range(max_train_samples)) 453 | with training_args.main_process_first(desc="train dataset map pre-processing"): 454 | train_dataset = train_dataset.map( 455 | tokenize_and_align_labels, 456 | batched=True, 457 | num_proc=data_args.preprocessing_num_workers, 458 | load_from_cache_file=not data_args.overwrite_cache, 459 | desc="Running tokenizer on train dataset", 460 | ) 461 | 462 | if training_args.do_eval: 463 | if "validation" not in raw_datasets: 464 | raise ValueError("--do_eval requires a validation dataset") 465 | eval_dataset = raw_datasets["validation"] 466 | if data_args.max_eval_samples is not None: 467 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 468 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 469 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 470 | eval_dataset = eval_dataset.map( 471 | tokenize_and_align_labels, 472 | batched=True, 473 | num_proc=data_args.preprocessing_num_workers, 474 | load_from_cache_file=not data_args.overwrite_cache, 475 | desc="Running tokenizer on validation dataset", 476 | ) 477 | 478 | if training_args.do_predict: 479 | if "test" not in raw_datasets: 480 | raise ValueError("--do_predict requires a test dataset") 481 | predict_dataset = raw_datasets["test"] 482 | if data_args.max_predict_samples is not None: 483 | 
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 484 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 485 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 486 | predict_dataset = predict_dataset.map( 487 | tokenize_and_align_labels, 488 | batched=True, 489 | num_proc=data_args.preprocessing_num_workers, 490 | load_from_cache_file=not data_args.overwrite_cache, 491 | desc="Running tokenizer on prediction dataset", 492 | ) 493 | # Data collator 494 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 495 | 496 | # Metrics 497 | logger.info("*** loading metric ***") 498 | metric = evaluate.load('seqeval') 499 | 500 | 501 | def compute_metrics(p): 502 | predictions, labels = p 503 | predictions = np.argmax(predictions, axis=2) 504 | 505 | # Remove ignored index (special tokens) 506 | true_predictions = [ 507 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 508 | for prediction, label in zip(predictions, labels) 509 | ] 510 | true_labels = [ 511 | [label_list[l] for (p, l) in zip(prediction, label) if l != -100] 512 | for prediction, label in zip(predictions, labels) 513 | ] 514 | 515 | results = metric.compute(predictions=true_predictions, references=true_labels) 516 | if data_args.return_entity_level_metrics: 517 | # Unpack nested dictionaries 518 | final_results = {} 519 | for key, value in results.items(): 520 | if isinstance(value, dict): 521 | for n, v in value.items(): 522 | final_results[f"{key}_{n}"] = v 523 | else: 524 | final_results[key] = value 525 | return final_results 526 | else: 527 | return { 528 | "precision": results["overall_precision"], 529 | "recall": results["overall_recall"], 530 | "f1": results["overall_f1"], 531 | "accuracy": results["overall_accuracy"], 532 | } 533 | 534 | # Initialize our Trainer 535 | logger.info("*** initializing trainer ***") 536 | trainer = Trainer( 537 | model=model, 538 | args=training_args, 539 | train_dataset=train_dataset if training_args.do_train else None, 540 | eval_dataset=eval_dataset if training_args.do_eval else None, 541 | tokenizer=tokenizer, 542 | data_collator=data_collator, 543 | # compute_metrics=None if (task_args.task_name=='arg' and task_args.pred_trg_file) or task_args.just_infer else compute_metrics 544 | compute_metrics=compute_metrics, 545 | ) 546 | 547 | # Training 548 | if training_args.do_train: 549 | checkpoint = None 550 | if training_args.resume_from_checkpoint is not None: 551 | checkpoint = training_args.resume_from_checkpoint 552 | elif last_checkpoint is not None: 553 | checkpoint = last_checkpoint 554 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 555 | metrics = train_result.metrics 556 | trainer.save_model() # Saves the tokenizer too for easy upload 557 | 558 | max_train_samples = ( 559 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 560 | ) 561 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 562 | 563 | trainer.log_metrics("train", metrics) 564 | trainer.save_metrics("train", metrics) 565 | trainer.save_state() 566 | 567 | # Evaluation 568 | if training_args.do_eval: 569 | logger.info("*** Evaluate ***") 570 | 571 | metrics = trainer.evaluate() 572 | 573 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 574 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 575 | 576 
| trainer.log_metrics("eval", metrics) 577 | trainer.save_metrics("eval", metrics) 578 | 579 | # Predict 580 | if training_args.do_predict: 581 | logger.info("*** Predict ***") 582 | 583 | predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") 584 | predictions = np.argmax(predictions, axis=2) 585 | 586 | # Remove ignored index (special tokens) 587 | true_predictions = [ 588 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 589 | for prediction, label in zip(predictions, labels) 590 | ] 591 | 592 | trainer.log_metrics("predict", metrics) 593 | trainer.save_metrics("predict", metrics) 594 | 595 | # Save predictions 596 | if task_args.task_name == 'trg': 597 | pred_df = predict_dataset.to_pandas() 598 | # pred_df['triggers'] = pred_df[['trg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.trg_tags, tokens=x.tokens), axis=1) 599 | pred_df['triggers'] = pred_df[[f'event{i}_trigger' for i in range(1,7)]].apply(lambda row: [item for item in row if item!=None], axis=1) 600 | pred_df['pred_trg_tags'] = true_predictions 601 | pred_df['pred_triggers'] = pred_df[['pred_trg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.pred_trg_tags, tokenized_src_text=x.tokens), axis=1) 602 | output_predictions_file = os.path.join(training_args.output_dir, task_args.output_filename) if task_args.output_filename else os.path.join(training_args.output_dir, "trg_predictions.csv") 603 | pred_df[['title_id', 'title', 'triggers', 'pred_triggers', 'trg_tags', 'pred_trg_tags']].to_csv(output_predictions_file, index=False) 604 | elif task_args.task_name == 'arg': 605 | pred_df = predict_dataset.to_pandas() 606 | pred_df['pred_arg_tags'] = true_predictions 607 | pred_df['pred_arguments'] = pred_df[['pred_arg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.pred_arg_tags, tokenized_src_text=x.tokens), axis=1) 608 | if task_args.pred_trg_file: # if use predicted triggers 609 | pred_df['triggers'] = pred_df[['trg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.trg_tags, tokenized_src_text=x.tokens), axis=1) 610 | pred_df['event_triples'] = pred_df[[f'event{i}_triple' for i in range(1,7)]].apply(lambda row: [list(item) for item in row if list(item)!=[]], axis=1) 611 | # pred_df['event_triples'] = pred_df['event_triples'].apply(list) 612 | pred_df['pred_event_triples'] = pred_df.apply(combine_trg_args, axis=1) 613 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='index', axis=0).reset_index() 614 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: x[0]) 615 | # if not task_args.just_infer: pred_df['event_triples'] = pred_df.apply(agg_triples, axis=1) 616 | # if not task_args.just_infer: pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x:x[0]) 617 | output_predictions_file = os.path.join(training_args.output_dir, task_args.output_filename) if task_args.output_filename else os.path.join(training_args.output_dir, "pipeline_predictions.csv") 618 | pred_df[['title_id', 'title', 'event_triples', 'pred_event_triples']].to_csv(output_predictions_file, index=False) 619 | else: # if use golden triggers 620 | # pred_df['arguments'] = pred_df.apply(lambda x: tags2text(tags=x.arg_tags, tokens=x.tokens), axis=1) 621 | pred_df['triggers'] = pred_df[[f'event{i}_trigger' for i in range(1,7)]].apply(lambda row: [item for item in row if item!=None], axis=1) 622 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='index', axis=0).reset_index() 623 | pred_df['triggers'] = 
pred_df['triggers'].apply(lambda x: x[0]) 624 | pred_df['pred_event_triples'] = "" 625 | for idx, row in tqdm(pred_df.iterrows(), total=len(pred_df)): 626 | pred_triples = [] 627 | for i, arg in enumerate(row.pred_arguments): 628 | trp = arg 629 | trp.insert(1, row.triggers[i]) 630 | pred_triples.append(trp) 631 | pred_df.at[idx, "pred_event_triples"] = pred_triples 632 | pred_df['pred_event_triples'] = pred_df['pred_event_triples'].apply(lambda x: [list(i) for i in x]) 633 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: [i.tolist() for i in x]) 634 | 635 | output_predictions_file = os.path.join(training_args.output_dir, task_args.output_filename) if task_args.output_filename else os.path.join(training_args.output_dir, "arg_predictions.csv") 636 | pred_df[['title_id', 'title', 'event_triples', 'pred_event_triples', 'arg_tags', 'pred_arg_tags']].to_csv(output_predictions_file, index=False) 637 | logger.info("save prediction output to %s" % output_predictions_file) 638 | 639 | 640 | 641 | kwargs = {"finetuned_from": Path(model_args.model_name_or_path).name, "tasks": "token-classification"} 642 | if data_args.dataset_name is not None: 643 | kwargs["dataset_tags"] = data_args.dataset_name 644 | if data_args.dataset_config_name is not None: 645 | kwargs["dataset_args"] = data_args.dataset_config_name 646 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 647 | else: 648 | kwargs["dataset"] = data_args.dataset_name 649 | 650 | if training_args.push_to_hub: 651 | trainer.push_to_hub(**kwargs) 652 | else: 653 | trainer.create_model_card(**kwargs) 654 | 655 | if __name__ == "__main__": 656 | main() 657 | -------------------------------------------------------------------------------- /mrc/run_qa_beam_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning XLNet for question answering with beam search using a slightly adapted version of the 🤗 Trainer. 18 | """ 19 | # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
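# Note: this script is tied to XLNet. XLNetForQuestionAnswering returns top-k start/end log probabilities and indices plus a CLS logit, which is why its predictions go through `postprocess_qa_predictions_with_beam_search` rather than the standard start/end-logit post-processing.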
20 | 21 | import logging 22 | import os 23 | import sys 24 | from dataclasses import dataclass, field 25 | from typing import Optional 26 | 27 | import datasets 28 | from datasets import load_dataset, load_metric 29 | 30 | import transformers 31 | from trainer_qa import QuestionAnsweringTrainer 32 | from transformers import ( 33 | DataCollatorWithPadding, 34 | EvalPrediction, 35 | HfArgumentParser, 36 | TrainingArguments, 37 | XLNetConfig, 38 | XLNetForQuestionAnswering, 39 | XLNetTokenizerFast, 40 | default_data_collator, 41 | set_seed, 42 | ) 43 | from transformers.trainer_utils import get_last_checkpoint 44 | from transformers.utils import check_min_version 45 | from transformers.utils.versions import require_version 46 | from utils_qa import postprocess_qa_predictions_with_beam_search 47 | 48 | 49 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 50 | check_min_version("4.19.0.dev0") 51 | 52 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | @dataclass 58 | class ModelArguments: 59 | """ 60 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 61 | """ 62 | 63 | model_name_or_path: str = field( 64 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 65 | ) 66 | config_name: Optional[str] = field( 67 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 68 | ) 69 | tokenizer_name: Optional[str] = field( 70 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 71 | ) 72 | cache_dir: Optional[str] = field( 73 | default=None, 74 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 75 | ) 76 | model_revision: str = field( 77 | default="main", 78 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 79 | ) 80 | use_auth_token: bool = field( 81 | default=False, 82 | metadata={ 83 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 84 | "with private models)." 85 | }, 86 | ) 87 | 88 | 89 | @dataclass 90 | class DataTrainingArguments: 91 | """ 92 | Arguments pertaining to what data we are going to input our model for training and eval. 
93 | """ 94 | 95 | dataset_name: Optional[str] = field( 96 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 97 | ) 98 | dataset_config_name: Optional[str] = field( 99 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 100 | ) 101 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 102 | validation_file: Optional[str] = field( 103 | default=None, 104 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 105 | ) 106 | test_file: Optional[str] = field( 107 | default=None, 108 | metadata={"help": "An optional input test data file to test the perplexity on (a text file)."}, 109 | ) 110 | overwrite_cache: bool = field( 111 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 112 | ) 113 | preprocessing_num_workers: Optional[int] = field( 114 | default=None, 115 | metadata={"help": "The number of processes to use for the preprocessing."}, 116 | ) 117 | max_seq_length: int = field( 118 | default=384, 119 | metadata={ 120 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 121 | "than this will be truncated, sequences shorter will be padded." 122 | }, 123 | ) 124 | pad_to_max_length: bool = field( 125 | default=True, 126 | metadata={ 127 | "help": "Whether to pad all samples to `max_seq_length`. " 128 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 129 | "be faster on GPU but will be slower on TPU)." 130 | }, 131 | ) 132 | max_train_samples: Optional[int] = field( 133 | default=None, 134 | metadata={ 135 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 136 | "value if set." 137 | }, 138 | ) 139 | max_eval_samples: Optional[int] = field( 140 | default=None, 141 | metadata={ 142 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 143 | "value if set." 144 | }, 145 | ) 146 | max_predict_samples: Optional[int] = field( 147 | default=None, 148 | metadata={ 149 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 150 | "value if set." 151 | }, 152 | ) 153 | version_2_with_negative: bool = field( 154 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 155 | ) 156 | null_score_diff_threshold: float = field( 157 | default=0.0, 158 | metadata={ 159 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 160 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 161 | "Only useful when `version_2_with_negative=True`." 162 | }, 163 | ) 164 | doc_stride: int = field( 165 | default=128, 166 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 167 | ) 168 | n_best_size: int = field( 169 | default=20, 170 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 171 | ) 172 | max_answer_length: int = field( 173 | default=30, 174 | metadata={ 175 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 176 | "and end predictions are not conditioned on one another." 
177 | }, 178 | ) 179 | 180 | def __post_init__(self): 181 | if ( 182 | self.dataset_name is None 183 | and self.train_file is None 184 | and self.validation_file is None 185 | and self.test_file is None 186 | ): 187 | raise ValueError("Need either a dataset name or a training/validation/test file.") 188 | else: 189 | if self.train_file is not None: 190 | extension = self.train_file.split(".")[-1] 191 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 192 | if self.validation_file is not None: 193 | extension = self.validation_file.split(".")[-1] 194 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 195 | if self.test_file is not None: 196 | extension = self.test_file.split(".")[-1] 197 | assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 198 | 199 | 200 | def main(): 201 | # See all possible arguments in src/transformers/training_args.py 202 | # or by passing the --help flag to this script. 203 | # We now keep distinct sets of args, for a cleaner separation of concerns. 204 | 205 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 206 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 207 | # If we pass only one argument to the script and it's the path to a json file, 208 | # let's parse it to get our arguments. 209 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 210 | else: 211 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 212 | 213 | # Setup logging 214 | logging.basicConfig( 215 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 216 | datefmt="%m/%d/%Y %H:%M:%S", 217 | handlers=[logging.StreamHandler(sys.stdout)], 218 | ) 219 | log_level = training_args.get_process_log_level() 220 | logger.setLevel(log_level) 221 | datasets.utils.logging.set_verbosity(log_level) 222 | transformers.utils.logging.set_verbosity(log_level) 223 | transformers.utils.logging.enable_default_handler() 224 | transformers.utils.logging.enable_explicit_format() 225 | 226 | # Log on each process the small summary: 227 | logger.warning( 228 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 229 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 230 | ) 231 | logger.info(f"Training/evaluation parameters {training_args}") 232 | 233 | # Detecting last checkpoint. 234 | last_checkpoint = None 235 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 236 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 237 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 238 | raise ValueError( 239 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 240 | "Use --overwrite_output_dir to overcome." 241 | ) 242 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 243 | logger.info( 244 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 245 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 246 | ) 247 | 248 | # Set seed before initializing model. 
249 | set_seed(training_args.seed) 250 | 251 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 252 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 253 | # (the dataset will be downloaded automatically from the datasets Hub). 254 | # 255 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 256 | # 'text' is found. You can easily tweak this behavior (see below). 257 | # 258 | # In distributed training, the load_dataset function guarantees that only one local process can concurrently 259 | # download the dataset. 260 | if data_args.dataset_name is not None: 261 | # Downloading and loading a dataset from the hub. 262 | raw_datasets = load_dataset( 263 | data_args.dataset_name, 264 | data_args.dataset_config_name, 265 | cache_dir=model_args.cache_dir, 266 | use_auth_token=True if model_args.use_auth_token else None, 267 | ) 268 | else: 269 | data_files = {} 270 | if data_args.train_file is not None: 271 | data_files["train"] = data_args.train_file 272 | extension = data_args.train_file.split(".")[-1] 273 | if data_args.validation_file is not None: 274 | data_files["validation"] = data_args.validation_file 275 | extension = data_args.validation_file.split(".")[-1] 276 | if data_args.test_file is not None: 277 | data_files["test"] = data_args.test_file 278 | extension = data_args.test_file.split(".")[-1] 279 | raw_datasets = load_dataset( 280 | extension, 281 | data_files=data_files, 282 | field="data", 283 | cache_dir=model_args.cache_dir, 284 | use_auth_token=True if model_args.use_auth_token else None, 285 | ) 286 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 287 | # https://huggingface.co/docs/datasets/loading_datasets.html. 288 | 289 | # Load pretrained model and tokenizer 290 | # 291 | # Distributed training: 292 | # The .from_pretrained methods guarantee that only one local process can concurrently 293 | # download model & vocab. 294 | config = XLNetConfig.from_pretrained( 295 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 296 | cache_dir=model_args.cache_dir, 297 | revision=model_args.model_revision, 298 | use_auth_token=True if model_args.use_auth_token else None, 299 | ) 300 | tokenizer = XLNetTokenizerFast.from_pretrained( 301 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 302 | cache_dir=model_args.cache_dir, 303 | revision=model_args.model_revision, 304 | use_auth_token=True if model_args.use_auth_token else None, 305 | ) 306 | model = XLNetForQuestionAnswering.from_pretrained( 307 | model_args.model_name_or_path, 308 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 309 | config=config, 310 | cache_dir=model_args.cache_dir, 311 | revision=model_args.model_revision, 312 | use_auth_token=True if model_args.use_auth_token else None, 313 | ) 314 | 315 | # Preprocessing the datasets. 316 | # Preprocessing is slightly different for training and evaluation.
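# (Descriptive note: both feature builders below record cls_index and p_mask for XLNet; training features additionally need gold start/end positions and the is_impossible flag, while evaluation features keep example_id and the offset mappings so predicted spans can be mapped back to the original context.)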
317 | if training_args.do_train: 318 | column_names = raw_datasets["train"].column_names 319 | elif training_args.do_eval: 320 | column_names = raw_datasets["validation"].column_names 321 | else: 322 | column_names = raw_datasets["test"].column_names 323 | question_column_name = "question" if "question" in column_names else column_names[0] 324 | context_column_name = "context" if "context" in column_names else column_names[1] 325 | answer_column_name = "answers" if "answers" in column_names else column_names[2] 326 | 327 | # Padding side determines if we do (question|context) or (context|question). 328 | pad_on_right = tokenizer.padding_side == "right" 329 | 330 | if data_args.max_seq_length > tokenizer.model_max_length: 331 | logger.warning( 332 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 333 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 334 | ) 335 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 336 | 337 | # Training preprocessing 338 | def prepare_train_features(examples): 339 | # Some of the questions have lots of whitespace on the left, which is not useful and will make the 340 | # truncation of the context fail (the tokenized question will take a lot of space). So we remove that 341 | # left whitespace. 342 | examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] 343 | 344 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 345 | # in one example possibly giving several features when a context is long, each of those features having a 346 | # context that overlaps a bit with the context of the previous feature. 347 | tokenized_examples = tokenizer( 348 | examples[question_column_name if pad_on_right else context_column_name], 349 | examples[context_column_name if pad_on_right else question_column_name], 350 | truncation="only_second" if pad_on_right else "only_first", 351 | max_length=max_seq_length, 352 | stride=data_args.doc_stride, 353 | return_overflowing_tokens=True, 354 | return_offsets_mapping=True, 355 | return_special_tokens_mask=True, 356 | return_token_type_ids=True, 357 | padding="max_length", 358 | ) 359 | 360 | # Since one example might give us several features if it has a long context, we need a map from a feature to 361 | # its corresponding example. This key gives us just that. 362 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 363 | # The offset mappings will give us a map from token to character position in the original context. This will 364 | # help us compute the start_positions and end_positions. 365 | offset_mapping = tokenized_examples.pop("offset_mapping") 366 | # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). 367 | special_tokens = tokenized_examples.pop("special_tokens_mask") 368 | 369 | # Let's label those examples! 370 | tokenized_examples["start_positions"] = [] 371 | tokenized_examples["end_positions"] = [] 372 | tokenized_examples["is_impossible"] = [] 373 | tokenized_examples["cls_index"] = [] 374 | tokenized_examples["p_mask"] = [] 375 | 376 | for i, offsets in enumerate(offset_mapping): 377 | # We will label impossible answers with the index of the CLS token.
378 | input_ids = tokenized_examples["input_ids"][i] 379 | cls_index = input_ids.index(tokenizer.cls_token_id) 380 | tokenized_examples["cls_index"].append(cls_index) 381 | 382 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 383 | sequence_ids = tokenized_examples["token_type_ids"][i] 384 | for k, s in enumerate(special_tokens[i]): 385 | if s: 386 | sequence_ids[k] = 3 387 | context_idx = 1 if pad_on_right else 0 388 | 389 | # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. 390 | # The cls token gets 1.0 too (for predictions of empty answers). 391 | tokenized_examples["p_mask"].append( 392 | [ 393 | 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 394 | for k, s in enumerate(sequence_ids) 395 | ] 396 | ) 397 | 398 | # One example can give several spans, this is the index of the example containing this span of text. 399 | sample_index = sample_mapping[i] 400 | answers = examples[answer_column_name][sample_index] 401 | # If no answers are given, set the cls_index as answer. 402 | if len(answers["answer_start"]) == 0: 403 | tokenized_examples["start_positions"].append(cls_index) 404 | tokenized_examples["end_positions"].append(cls_index) 405 | tokenized_examples["is_impossible"].append(1.0) 406 | else: 407 | # Start/end character index of the answer in the text. 408 | start_char = answers["answer_start"][0] 409 | end_char = start_char + len(answers["text"][0]) 410 | 411 | # Start token index of the current span in the text. 412 | token_start_index = 0 413 | while sequence_ids[token_start_index] != context_idx: 414 | token_start_index += 1 415 | 416 | # End token index of the current span in the text. 417 | token_end_index = len(input_ids) - 1 418 | while sequence_ids[token_end_index] != context_idx: 419 | token_end_index -= 1 420 | # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 421 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 422 | tokenized_examples["start_positions"].append(cls_index) 423 | tokenized_examples["end_positions"].append(cls_index) 424 | tokenized_examples["is_impossible"].append(1.0) 425 | else: 426 | # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 427 | # Note: we could go after the last offset if the answer is the last word (edge case). 
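# Illustrative example (made-up offsets): if the context tokens at positions 5-7 carry offsets (0, 2), (2, 5), (5, 9) and the answer spans characters 2-9, the two loops below end up storing start_position 6 and end_position 7.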
428 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 429 | token_start_index += 1 430 | tokenized_examples["start_positions"].append(token_start_index - 1) 431 | while offsets[token_end_index][1] >= end_char: 432 | token_end_index -= 1 433 | tokenized_examples["end_positions"].append(token_end_index + 1) 434 | tokenized_examples["is_impossible"].append(0.0) 435 | 436 | return tokenized_examples 437 | 438 | if training_args.do_train: 439 | if "train" not in raw_datasets: 440 | raise ValueError("--do_train requires a train dataset") 441 | train_dataset = raw_datasets["train"] 442 | if data_args.max_train_samples is not None: 443 | # Select samples from Dataset, This will help to decrease processing time 444 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 445 | train_dataset = train_dataset.select(range(max_train_samples)) 446 | # Create Training Features 447 | with training_args.main_process_first(desc="train dataset map pre-processing"): 448 | train_dataset = train_dataset.map( 449 | prepare_train_features, 450 | batched=True, 451 | num_proc=data_args.preprocessing_num_workers, 452 | remove_columns=column_names, 453 | load_from_cache_file=not data_args.overwrite_cache, 454 | desc="Running tokenizer on train dataset", 455 | ) 456 | if data_args.max_train_samples is not None: 457 | # Select samples from dataset again since Feature Creation might increase number of features 458 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 459 | train_dataset = train_dataset.select(range(max_train_samples)) 460 | 461 | # Validation preprocessing 462 | def prepare_validation_features(examples): 463 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 464 | # in one example possible giving several features when a context is long, each of those features having a 465 | # context that overlaps a bit the context of the previous feature. 466 | tokenized_examples = tokenizer( 467 | examples[question_column_name if pad_on_right else context_column_name], 468 | examples[context_column_name if pad_on_right else question_column_name], 469 | truncation="only_second" if pad_on_right else "only_first", 470 | max_length=max_seq_length, 471 | stride=data_args.doc_stride, 472 | return_overflowing_tokens=True, 473 | return_offsets_mapping=True, 474 | return_special_tokens_mask=True, 475 | return_token_type_ids=True, 476 | padding="max_length", 477 | ) 478 | 479 | # Since one example might give us several features if it has a long context, we need a map from a feature to 480 | # its corresponding example. This key gives us just that. 481 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 482 | 483 | # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). 484 | special_tokens = tokenized_examples.pop("special_tokens_mask") 485 | 486 | # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the 487 | # corresponding example_id and we will store the offset mappings. 488 | tokenized_examples["example_id"] = [] 489 | 490 | # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. 491 | tokenized_examples["cls_index"] = [] 492 | tokenized_examples["p_mask"] = [] 493 | 494 | for i, input_ids in enumerate(tokenized_examples["input_ids"]): 495 | # Find the CLS token in the input ids. 
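            # (This beam-search script targets XLNet-style models, whose CLS token sits at the end of the
            # sequence rather than at position 0, which is why it is looked up explicitly here.)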
496 | cls_index = input_ids.index(tokenizer.cls_token_id) 497 | tokenized_examples["cls_index"].append(cls_index) 498 | 499 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 500 | sequence_ids = tokenized_examples["token_type_ids"][i] 501 | for k, s in enumerate(special_tokens[i]): 502 | if s: 503 | sequence_ids[k] = 3 504 | context_idx = 1 if pad_on_right else 0 505 | 506 | # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. 507 | tokenized_examples["p_mask"].append( 508 | [ 509 | 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 510 | for k, s in enumerate(sequence_ids) 511 | ] 512 | ) 513 | 514 | # One example can give several spans, this is the index of the example containing this span of text. 515 | sample_index = sample_mapping[i] 516 | tokenized_examples["example_id"].append(examples["id"][sample_index]) 517 | 518 | # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token 519 | # position is part of the context or not. 520 | tokenized_examples["offset_mapping"][i] = [ 521 | (o if sequence_ids[k] == context_idx else None) 522 | for k, o in enumerate(tokenized_examples["offset_mapping"][i]) 523 | ] 524 | 525 | return tokenized_examples 526 | 527 | if training_args.do_eval: 528 | if "validation" not in raw_datasets: 529 | raise ValueError("--do_eval requires a validation dataset") 530 | eval_examples = raw_datasets["validation"] 531 | if data_args.max_eval_samples is not None: 532 | # Selecting Eval Samples from Dataset 533 | max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) 534 | eval_examples = eval_examples.select(range(max_eval_samples)) 535 | # Create Features from Eval Dataset 536 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 537 | eval_dataset = eval_examples.map( 538 | prepare_validation_features, 539 | batched=True, 540 | num_proc=data_args.preprocessing_num_workers, 541 | remove_columns=column_names, 542 | load_from_cache_file=not data_args.overwrite_cache, 543 | desc="Running tokenizer on validation dataset", 544 | ) 545 | if data_args.max_eval_samples is not None: 546 | # Selecting Samples from Dataset again since Feature Creation might increase samples size 547 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 548 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 549 | 550 | if training_args.do_predict: 551 | if "test" not in raw_datasets: 552 | raise ValueError("--do_predict requires a test dataset") 553 | predict_examples = raw_datasets["test"] 554 | if data_args.max_predict_samples is not None: 555 | # We will select sample from whole data 556 | predict_examples = predict_examples.select(range(data_args.max_predict_samples)) 557 | # Test Feature Creation 558 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 559 | predict_dataset = predict_examples.map( 560 | prepare_validation_features, 561 | batched=True, 562 | num_proc=data_args.preprocessing_num_workers, 563 | remove_columns=column_names, 564 | load_from_cache_file=not data_args.overwrite_cache, 565 | desc="Running tokenizer on prediction dataset", 566 | ) 567 | if data_args.max_predict_samples is not None: 568 | # During Feature creation dataset samples might increase, we will select required samples again 569 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 570 | predict_dataset = 
predict_dataset.select(range(max_predict_samples)) 571 | 572 | # Data collator 573 | # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data 574 | # collator. 575 | data_collator = ( 576 | default_data_collator 577 | if data_args.pad_to_max_length 578 | else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 579 | ) 580 | 581 | # Post-processing: 582 | def post_processing_function(examples, features, predictions, stage="eval"): 583 | # Post-processing: we match the start logits and end logits to answers in the original context. 584 | predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( 585 | examples=examples, 586 | features=features, 587 | predictions=predictions, 588 | version_2_with_negative=data_args.version_2_with_negative, 589 | n_best_size=data_args.n_best_size, 590 | max_answer_length=data_args.max_answer_length, 591 | start_n_top=model.config.start_n_top, 592 | end_n_top=model.config.end_n_top, 593 | output_dir=training_args.output_dir, 594 | log_level=log_level, 595 | prefix=stage, 596 | ) 597 | # Format the result to the format the metric expects. 598 | if data_args.version_2_with_negative: 599 | formatted_predictions = [ 600 | {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]} 601 | for k, v in predictions.items() 602 | ] 603 | else: 604 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 605 | 606 | references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] 607 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 608 | 609 | metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") 610 | 611 | def compute_metrics(p: EvalPrediction): 612 | return metric.compute(predictions=p.predictions, references=p.label_ids) 613 | 614 | # Initialize our Trainer 615 | trainer = QuestionAnsweringTrainer( 616 | model=model, 617 | args=training_args, 618 | train_dataset=train_dataset if training_args.do_train else None, 619 | eval_dataset=eval_dataset if training_args.do_eval else None, 620 | eval_examples=eval_examples if training_args.do_eval else None, 621 | tokenizer=tokenizer, 622 | data_collator=data_collator, 623 | post_process_function=post_processing_function, 624 | compute_metrics=compute_metrics, 625 | ) 626 | 627 | # Training 628 | if training_args.do_train: 629 | checkpoint = None 630 | if training_args.resume_from_checkpoint is not None: 631 | checkpoint = training_args.resume_from_checkpoint 632 | elif last_checkpoint is not None: 633 | checkpoint = last_checkpoint 634 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 635 | trainer.save_model() # Saves the tokenizer too for easy upload 636 | 637 | metrics = train_result.metrics 638 | 639 | max_train_samples = ( 640 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 641 | ) 642 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 643 | 644 | trainer.log_metrics("train", metrics) 645 | trainer.save_metrics("train", metrics) 646 | trainer.save_state() 647 | 648 | # Evaluation 649 | if training_args.do_eval: 650 | logger.info("*** Evaluate ***") 651 | metrics = trainer.evaluate() 652 | 653 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 654 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 655 | 656 | 
trainer.log_metrics("eval", metrics) 657 | trainer.save_metrics("eval", metrics) 658 | 659 | # Prediction 660 | if training_args.do_predict: 661 | logger.info("*** Predict ***") 662 | results = trainer.predict(predict_dataset, predict_examples) 663 | metrics = results.metrics 664 | 665 | max_predict_samples = ( 666 | data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) 667 | ) 668 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 669 | 670 | trainer.log_metrics("predict", metrics) 671 | trainer.save_metrics("predict", metrics) 672 | 673 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} 674 | if data_args.dataset_name is not None: 675 | kwargs["dataset_tags"] = data_args.dataset_name 676 | if data_args.dataset_config_name is not None: 677 | kwargs["dataset_args"] = data_args.dataset_config_name 678 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 679 | else: 680 | kwargs["dataset"] = data_args.dataset_name 681 | 682 | if training_args.push_to_hub: 683 | trainer.push_to_hub(**kwargs) 684 | else: 685 | trainer.create_model_card(**kwargs) 686 | 687 | 688 | def _mp_fn(index): 689 | # For xla_spawn (TPUs) 690 | main() 691 | 692 | 693 | if __name__ == "__main__": 694 | main() 695 | -------------------------------------------------------------------------------- /mrc/run_qa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer. 18 | """ 19 | # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
20 | 21 | import logging 22 | import os 23 | import pdb 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | import datasets 28 | from datasets import load_metric 29 | import transformers 30 | from trainer_qa import QuestionAnsweringTrainer 31 | from transformers import ( 32 | AutoConfig, 33 | AutoModelForQuestionAnswering, 34 | AutoTokenizer, 35 | DataCollatorWithPadding, 36 | EvalPrediction, 37 | HfArgumentParser, 38 | PreTrainedTokenizerFast, 39 | TrainingArguments, 40 | default_data_collator, 41 | set_seed, 42 | ) 43 | from transformers.trainer_utils import get_last_checkpoint 44 | from transformers.utils import check_min_version 45 | from transformers.utils.versions import require_version 46 | from utils_qa import postprocess_qa_predictions 47 | 48 | import evaluate 49 | from mrc_task_config import * 50 | 51 | 52 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | @dataclass 58 | class TaskArguments: 59 | is_extractive: bool = field( 60 | default=True, 61 | metadata={"help": "Used in data preprocessing; determines whether to compute the answer offsets in the source text. Set to True for the SpanMRC model and to False for the Seq2Seq model."} 62 | ) 63 | pred_trg_file: Optional[str] = field( 64 | default=None, 65 | metadata={"help": "Path of the trigger prediction file. If specified, argument extraction is run on the predicted triggers rather than on the gold-labeled triggers."} 66 | ) 67 | pred_trg_col: Optional[str] = field( 68 | default='pred_triggers', 69 | metadata={"help": "Column name of the predicted triggers in pred_trg_file."} 70 | ) 71 | output_filename: Optional[str] = field( 72 | default=None, 73 | metadata={"help": "Optional file name for the model prediction output."} 74 | ) 75 | @dataclass 76 | class ModelArguments: 77 | """ 78 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 79 | """ 80 | 81 | model_name_or_path: str = field( 82 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 83 | ) 84 | config_name: Optional[str] = field( 85 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 86 | ) 87 | tokenizer_name: Optional[str] = field( 88 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 89 | ) 90 | cache_dir: Optional[str] = field( 91 | default=None, 92 | metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, 93 | ) 94 | model_revision: str = field( 95 | default="main", 96 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 97 | ) 98 | use_auth_token: bool = field( 99 | default=False, 100 | metadata={ 101 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 102 | "with private models)." 103 | }, 104 | ) 105 | 106 | 107 | @dataclass 108 | class DataTrainingArguments: 109 | """ 110 | Arguments pertaining to what data we are going to input our model for training and eval.
111 | """ 112 | 113 | dataset_name: Optional[str] = field( 114 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 115 | ) 116 | dataset_config_name: Optional[str] = field( 117 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 118 | ) 119 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 120 | validation_file: Optional[str] = field( 121 | default=None, 122 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 123 | ) 124 | test_file: Optional[str] = field( 125 | default=None, 126 | metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, 127 | ) 128 | overwrite_cache: bool = field( 129 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 130 | ) 131 | preprocessing_num_workers: Optional[int] = field( 132 | default=None, 133 | metadata={"help": "The number of processes to use for the preprocessing."}, 134 | ) 135 | max_seq_length: int = field( 136 | default=128, 137 | metadata={ 138 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 139 | "than this will be truncated, sequences shorter will be padded." 140 | }, 141 | ) 142 | pad_to_max_length: bool = field( 143 | default=True, 144 | metadata={ 145 | "help": "Whether to pad all samples to `max_seq_length`. " 146 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 147 | "be faster on GPU but will be slower on TPU)." 148 | }, 149 | ) 150 | max_train_samples: Optional[int] = field( 151 | default=None, 152 | metadata={ 153 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 154 | "value if set." 155 | }, 156 | ) 157 | max_eval_samples: Optional[int] = field( 158 | default=None, 159 | metadata={ 160 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 161 | "value if set." 162 | }, 163 | ) 164 | max_predict_samples: Optional[int] = field( 165 | default=None, 166 | metadata={ 167 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 168 | "value if set." 169 | }, 170 | ) 171 | version_2_with_negative: bool = field( 172 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 173 | ) 174 | null_score_diff_threshold: float = field( 175 | default=0.0, 176 | metadata={ 177 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 178 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 179 | "Only useful when `version_2_with_negative=True`." 180 | }, 181 | ) 182 | doc_stride: int = field( 183 | default=128, 184 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 185 | ) 186 | n_best_size: int = field( 187 | default=20, 188 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 189 | ) 190 | max_answer_length: int = field( 191 | default=30, 192 | metadata={ 193 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 194 | "and end predictions are not conditioned on one another." 
195 | }, 196 | ) 197 | 198 | def __post_init__(self): 199 | if ( 200 | self.dataset_name is None 201 | and self.train_file is None 202 | and self.validation_file is None 203 | and self.test_file is None 204 | ): 205 | raise ValueError("Need either a dataset name or a training/validation file/test_file.") 206 | else: 207 | if self.train_file is not None: 208 | extension = self.train_file.split(".")[-1] 209 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 210 | if self.validation_file is not None: 211 | extension = self.validation_file.split(".")[-1] 212 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 213 | if self.test_file is not None: 214 | extension = self.test_file.split(".")[-1] 215 | assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 216 | 217 | 218 | def main(): 219 | # See all possible arguments in src/transformers/training_args.py 220 | # or by passing the --help flag to this script. 221 | # We now keep distinct sets of args, for a cleaner separation of concerns. 222 | 223 | parser = HfArgumentParser((TaskArguments, ModelArguments, DataTrainingArguments, TrainingArguments)) 224 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 225 | # If we pass only one argument to the script and it's the path to a json file, 226 | # let's parse it to get our arguments. 227 | task_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 228 | else: 229 | task_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses() 230 | 231 | task_args.just_infer = (not training_args.do_train and training_args.do_predict) 232 | 233 | # Setup logging 234 | logging.basicConfig( 235 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 236 | datefmt="%m/%d/%Y %H:%M:%S", 237 | handlers=[logging.StreamHandler(sys.stdout)], 238 | ) 239 | 240 | log_level = training_args.get_process_log_level() 241 | logger.setLevel(log_level) 242 | datasets.utils.logging.set_verbosity(log_level) 243 | transformers.utils.logging.set_verbosity(log_level) 244 | transformers.utils.logging.enable_default_handler() 245 | transformers.utils.logging.enable_explicit_format() 246 | 247 | # Log on each process the small summary: 248 | logger.warning( 249 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 250 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 251 | ) 252 | logger.info(f"Training/evaluation parameters {training_args}") 253 | 254 | # Detecting last checkpoint. 255 | last_checkpoint = None 256 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 257 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 258 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 259 | raise ValueError( 260 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 261 | "Use --overwrite_output_dir to overcome." 262 | ) 263 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 264 | logger.info( 265 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 266 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 267 | ) 268 | 269 | # Set seed before initializing model. 
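# Note on the pipeline setting: when the script is run with --do_train false --do_predict and a
# --pred_trg_file, task_args.just_infer is True above, and load_my_datasets_for_mrc (defined in
# mrc_task_config.py, not shown here) is presumably what rebuilds the test questions from the
# predicted triggers instead of the gold ones.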
270 | set_seed(training_args.seed) 271 | 272 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 273 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 274 | # (the dataset will be downloaded automatically from the datasets Hub). 275 | # 276 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 277 | # 'text' is found. You can easily tweak this behavior (see below). 278 | # 279 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 280 | # download the dataset. 281 | 282 | raw_datasets = load_my_datasets_for_mrc(data_args, task_args) 283 | 284 | 285 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 286 | # https://huggingface.co/docs/datasets/loading_datasets.html. 287 | 288 | # Load pretrained model and tokenizer 289 | # 290 | # Distributed training: 291 | # The .from_pretrained methods guarantee that only one local process can concurrently 292 | # download model & vocab. 293 | config = AutoConfig.from_pretrained( 294 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 295 | cache_dir=model_args.cache_dir, 296 | revision=model_args.model_revision, 297 | use_auth_token=True if model_args.use_auth_token else None, 298 | ) 299 | tokenizer = AutoTokenizer.from_pretrained( 300 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 301 | cache_dir=model_args.cache_dir, 302 | use_fast=True, 303 | revision=model_args.model_revision, 304 | use_auth_token=True if model_args.use_auth_token else None, 305 | ) 306 | model = AutoModelForQuestionAnswering.from_pretrained( 307 | model_args.model_name_or_path, 308 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 309 | config=config, 310 | cache_dir=model_args.cache_dir, 311 | revision=model_args.model_revision, 312 | use_auth_token=True if model_args.use_auth_token else None, 313 | ) 314 | 315 | # Tokenizer check: this script requires a fast tokenizer. 316 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 317 | raise ValueError( 318 | "This example script only works for models that have a fast tokenizer. Checkout the big table of models " 319 | "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " 320 | "requirement" 321 | ) 322 | 323 | # Preprocessing the datasets. 324 | # Preprocessing is slighlty different for training and evaluation. 325 | if training_args.do_train: 326 | column_names = raw_datasets["train"].column_names 327 | elif training_args.do_eval: 328 | column_names = raw_datasets["validation"].column_names 329 | else: 330 | column_names = raw_datasets["test"].column_names 331 | question_column_name = "question" if "question" in column_names else column_names[0] 332 | context_column_name = "context" if "context" in column_names else column_names[1] 333 | answer_column_name = "answers" if "answers" in column_names else column_names[2] 334 | 335 | # Padding side determines if we do (question|context) or (context|question). 
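    # bert-base-chinese pads on the right, so pad_on_right is True and the pair is encoded as
    # [CLS] question [SEP] context [SEP]; left-padding tokenizers such as XLNet's flip the order.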
336 | pad_on_right = tokenizer.padding_side == "right" 337 | 338 | if data_args.max_seq_length > tokenizer.model_max_length: 339 | logger.warning( 340 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 341 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 342 | ) 343 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 344 | 345 | # Training preprocessing 346 | def prepare_train_features(examples): 347 | # Some of the questions have lots of whitespace on the left, which is not useful and will make the 348 | # truncation of the context fail (the tokenized question will take a lots of space). So we remove that 349 | # left whitespace 350 | examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] 351 | 352 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 353 | # in one example possible giving several features when a context is long, each of those features having a 354 | # context that overlaps a bit the context of the previous feature. 355 | tokenized_examples = tokenizer( 356 | examples[question_column_name if pad_on_right else context_column_name], 357 | examples[context_column_name if pad_on_right else question_column_name], 358 | truncation="only_second" if pad_on_right else "only_first", 359 | max_length=max_seq_length, 360 | stride=data_args.doc_stride, 361 | return_overflowing_tokens=True, 362 | return_offsets_mapping=True, 363 | padding="max_length" if data_args.pad_to_max_length else False, 364 | ) 365 | 366 | # Since one example might give us several features if it has a long context, we need a map from a feature to 367 | # its corresponding example. This key gives us just that. 368 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 369 | # The offset mappings will give us a map from token to character position in the original context. This will 370 | # help us compute the start_positions and end_positions. 371 | offset_mapping = tokenized_examples.pop("offset_mapping") 372 | 373 | # Let's label those examples! 374 | tokenized_examples["start_positions"] = [] 375 | tokenized_examples["end_positions"] = [] 376 | 377 | for i, offsets in enumerate(offset_mapping): 378 | # We will label impossible answers with the index of the CLS token. 379 | input_ids = tokenized_examples["input_ids"][i] 380 | cls_index = input_ids.index(tokenizer.cls_token_id) 381 | 382 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 383 | sequence_ids = tokenized_examples.sequence_ids(i) 384 | 385 | # One example can give several spans, this is the index of the example containing this span of text. 386 | sample_index = sample_mapping[i] 387 | answers = examples[answer_column_name][sample_index] 388 | # If no answers are given, set the cls_index as answer. 389 | if len(answers["answer_start"]) == 0: 390 | tokenized_examples["start_positions"].append(cls_index) 391 | tokenized_examples["end_positions"].append(cls_index) 392 | else: 393 | # Start/end character index of the answer in the text. 394 | start_char = answers["answer_start"][0] 395 | end_char = start_char + len(answers["text"][0]) 396 | 397 | # Start token index of the current span in the text. 
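                # sequence_ids(i) above marks every position as None (special token), 0 (first sequence,
                # here the question when pad_on_right) or 1 (second sequence, here the context),
                # e.g. [None, 0, 0, 0, None, 1, 1, 1, 1, None, ...], so the two loops below simply skip
                # everything that is not part of the context.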
398 | token_start_index = 0 399 | while sequence_ids[token_start_index] != (1 if pad_on_right else 0): 400 | token_start_index += 1 401 | 402 | # End token index of the current span in the text. 403 | token_end_index = len(input_ids) - 1 404 | while sequence_ids[token_end_index] != (1 if pad_on_right else 0): 405 | token_end_index -= 1 406 | 407 | # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 408 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 409 | tokenized_examples["start_positions"].append(cls_index) 410 | tokenized_examples["end_positions"].append(cls_index) 411 | else: 412 | # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 413 | # Note: we could go after the last offset if the answer is the last word (edge case). 414 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 415 | token_start_index += 1 416 | tokenized_examples["start_positions"].append(token_start_index - 1) 417 | while offsets[token_end_index][1] >= end_char: 418 | token_end_index -= 1 419 | tokenized_examples["end_positions"].append(token_end_index + 1) 420 | 421 | return tokenized_examples 422 | 423 | if training_args.do_train: 424 | if "train" not in raw_datasets: 425 | raise ValueError("--do_train requires a train dataset") 426 | train_dataset = raw_datasets["train"] 427 | if data_args.max_train_samples is not None: 428 | # We will select sample from whole data if argument is specified 429 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 430 | train_dataset = train_dataset.select(range(max_train_samples)) 431 | # Create train feature from dataset 432 | with training_args.main_process_first(desc="train dataset map pre-processing"): 433 | train_dataset = train_dataset.map( 434 | prepare_train_features, 435 | batched=True, 436 | num_proc=data_args.preprocessing_num_workers, 437 | remove_columns=column_names, 438 | load_from_cache_file=not data_args.overwrite_cache, 439 | desc="Running tokenizer on train dataset", 440 | ) 441 | if data_args.max_train_samples is not None: 442 | # Number of samples might increase during Feature Creation, We select only specified max samples 443 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 444 | train_dataset = train_dataset.select(range(max_train_samples)) 445 | 446 | # Validation preprocessing 447 | def prepare_validation_features(examples): 448 | # Some of the questions have lots of whitespace on the left, which is not useful and will make the 449 | # truncation of the context fail (the tokenized question will take a lots of space). So we remove that 450 | # left whitespace 451 | examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] 452 | 453 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 454 | # in one example possible giving several features when a context is long, each of those features having a 455 | # context that overlaps a bit the context of the previous feature. 
456 | tokenized_examples = tokenizer( 457 | examples[question_column_name if pad_on_right else context_column_name], 458 | examples[context_column_name if pad_on_right else question_column_name], 459 | truncation="only_second" if pad_on_right else "only_first", 460 | max_length=max_seq_length, 461 | stride=data_args.doc_stride, 462 | return_overflowing_tokens=True, 463 | return_offsets_mapping=True, 464 | padding="max_length" if data_args.pad_to_max_length else False, 465 | ) 466 | 467 | # Since one example might give us several features if it has a long context, we need a map from a feature to 468 | # its corresponding example. This key gives us just that. 469 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 470 | 471 | # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the 472 | # corresponding example_id and we will store the offset mappings. 473 | tokenized_examples["example_id"] = [] 474 | 475 | for i in range(len(tokenized_examples["input_ids"])): 476 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 477 | sequence_ids = tokenized_examples.sequence_ids(i) 478 | context_index = 1 if pad_on_right else 0 479 | 480 | # One example can give several spans, this is the index of the example containing this span of text. 481 | sample_index = sample_mapping[i] 482 | tokenized_examples["example_id"].append(examples["id"][sample_index]) 483 | 484 | # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token 485 | # position is part of the context or not. 486 | tokenized_examples["offset_mapping"][i] = [ 487 | (o if sequence_ids[k] == context_index else None) 488 | for k, o in enumerate(tokenized_examples["offset_mapping"][i]) 489 | ] 490 | 491 | return tokenized_examples 492 | 493 | if training_args.do_eval: 494 | if "validation" not in raw_datasets: 495 | raise ValueError("--do_eval requires a validation dataset") 496 | eval_examples = raw_datasets["validation"] 497 | if data_args.max_eval_samples is not None: 498 | # We will select sample from whole data 499 | max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) 500 | eval_examples = eval_examples.select(range(max_eval_samples)) 501 | # Validation Feature Creation 502 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 503 | eval_dataset = eval_examples.map( 504 | prepare_validation_features, 505 | batched=True, 506 | num_proc=data_args.preprocessing_num_workers, 507 | remove_columns=column_names, 508 | load_from_cache_file=not data_args.overwrite_cache, 509 | desc="Running tokenizer on validation dataset", 510 | ) 511 | if data_args.max_eval_samples is not None: 512 | # During Feature creation dataset samples might increase, we will select required samples again 513 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 514 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 515 | 516 | if training_args.do_predict: 517 | if "test" not in raw_datasets: 518 | raise ValueError("--do_predict requires a test dataset") 519 | predict_examples = raw_datasets["test"] 520 | if data_args.max_predict_samples is not None: 521 | # We will select sample from whole data 522 | predict_examples = predict_examples.select(range(data_args.max_predict_samples)) 523 | # Predict Feature Creation 524 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 525 | predict_dataset = 
predict_examples.map( 526 | prepare_validation_features, 527 | batched=True, 528 | num_proc=data_args.preprocessing_num_workers, 529 | remove_columns=column_names, 530 | load_from_cache_file=not data_args.overwrite_cache, 531 | desc="Running tokenizer on prediction dataset", 532 | ) 533 | if data_args.max_predict_samples is not None: 534 | # During Feature creation dataset samples might increase, we will select required samples again 535 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 536 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 537 | 538 | # Data collator 539 | # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data 540 | # collator. 541 | data_collator = ( 542 | default_data_collator 543 | if data_args.pad_to_max_length 544 | else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 545 | ) 546 | 547 | # Post-processing: 548 | def post_processing_function(examples, features, predictions, stage="eval"): 549 | # Post-processing: we match the start logits and end logits to answers in the original context. 550 | predictions = postprocess_qa_predictions( 551 | examples=examples, 552 | features=features, 553 | predictions=predictions, 554 | version_2_with_negative=data_args.version_2_with_negative, 555 | n_best_size=data_args.n_best_size, 556 | max_answer_length=data_args.max_answer_length, 557 | null_score_diff_threshold=data_args.null_score_diff_threshold, 558 | output_dir=training_args.output_dir, 559 | log_level=log_level, 560 | prefix=stage, 561 | ) 562 | # Format the result to the format the metric expects. 563 | if data_args.version_2_with_negative: 564 | formatted_predictions = [ 565 | {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() 566 | ] 567 | else: 568 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 569 | 570 | references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] 571 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 572 | 573 | logger.info("****** loading metric ******") 574 | # metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") 575 | metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") 576 | 577 | 578 | def compute_metrics(p: EvalPrediction): 579 | return metric.compute(predictions=p.predictions, references=p.label_ids) 580 | 581 | # Initialize our Trainer 582 | logger.info("****** initializing trainer ******") 583 | trainer = QuestionAnsweringTrainer( 584 | model=model, 585 | args=training_args, 586 | train_dataset=train_dataset if training_args.do_train else None, 587 | eval_dataset=eval_dataset if training_args.do_eval else None, 588 | eval_examples=eval_examples if training_args.do_eval else None, 589 | tokenizer=tokenizer, 590 | data_collator=data_collator, 591 | post_process_function=post_processing_function, 592 | compute_metrics=compute_metrics, 593 | ) 594 | 595 | 596 | # Training 597 | if training_args.do_train: 598 | checkpoint = None 599 | if training_args.resume_from_checkpoint is not None: 600 | checkpoint = training_args.resume_from_checkpoint 601 | elif last_checkpoint is not None: 602 | checkpoint = last_checkpoint 603 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 604 | trainer.save_model() # Saves the tokenizer too for easy upload 605 | 606 | metrics = train_result.metrics 607 | 
max_train_samples = ( 608 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 609 | ) 610 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 611 | 612 | trainer.log_metrics("train", metrics) 613 | trainer.save_metrics("train", metrics) 614 | trainer.save_state() 615 | 616 | # Evaluation 617 | if training_args.do_eval: 618 | logger.info("*** Evaluate ***") 619 | metrics = trainer.evaluate() 620 | 621 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 622 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 623 | 624 | trainer.log_metrics("eval", metrics) 625 | trainer.save_metrics("eval", metrics) 626 | 627 | # Prediction 628 | if training_args.do_predict: 629 | logger.info("*** Predict ***") 630 | results = trainer.predict(predict_dataset, predict_examples) 631 | # pdb.set_trace() 632 | metrics = results.metrics 633 | 634 | max_predict_samples = ( 635 | data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) 636 | ) 637 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 638 | 639 | trainer.log_metrics("predict", metrics) 640 | trainer.save_metrics("predict", metrics) 641 | 642 | # **** save prediction file **** 643 | # read predictions 644 | logger.info("postprocessing output") 645 | ids = [] 646 | preds = [] 647 | title_id_lst = [] 648 | title_lst = [] 649 | trigger_lst = [] 650 | gold_triple_lst = [] 651 | event_triple_lst = [] 652 | for idx, item in enumerate(raw_datasets['test']): 653 | ids.append(results[0][idx]['id']) 654 | ans = results[0][idx]['prediction_text'] 655 | if not ans: ans="" 656 | preds.append(ans) 657 | title_id_lst.append(int(item['title_id'])) 658 | title_lst.append(item['context']) 659 | gold_triple_lst.append(item['gold_answer_triples']) 660 | trigger_lst.append(item['trigger']) 661 | event_triple_lst.append(item['triple']) 662 | pred_df = pd.DataFrame({"title_id": title_id_lst, "title": title_lst, "trigger": trigger_lst, "event_triples": event_triple_lst, "gold_answer_triples": gold_triple_lst, "preds": preds}) 663 | # agg sbj answer and obj answer for same triple 664 | pred_df['idx'] = pred_df.index.tolist() 665 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: str(x)) 666 | pred_df = pred_df.groupby(['title_id', 'title', 'event_triples']).agg(list).sort_values(by='idx', axis=0).reset_index() 667 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: eval(x)) 668 | pred_df['trigger'] = pred_df['trigger'].apply(lambda x: x[0]) 669 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 670 | pred_df.apply(lambda row: row.preds.insert(1, row.trigger), axis=1) 671 | pred_df.rename({"preds": "pred_event_triples"}, axis=1, inplace=True) 672 | pred_df['idx'] = pred_df.index.tolist() 673 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='idx', axis=0).reset_index() 674 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 675 | pred_df = pred_df[['title_id', 'title', 'gold_answer_triples', 'pred_event_triples']] 676 | pred_df.rename({"gold_answer_triples": "event_triples"}, axis=1, inplace=True) 677 | 678 | if task_args.output_filename: 679 | output_name = task_args.output_filename 680 | else: 681 | output_name = "pipeline_predictions.csv" if task_args.pred_trg_file else "arg_predictions.csv" 682 | output_predictions_file = 
os.path.join(training_args.output_dir, output_name) 683 | pred_df.to_csv(output_predictions_file, index=False) 684 | logger.info("save output to %s" % output_predictions_file) 685 | 686 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} 687 | 688 | if data_args.dataset_name is not None: 689 | kwargs["dataset_tags"] = data_args.dataset_name 690 | if data_args.dataset_config_name is not None: 691 | kwargs["dataset_args"] = data_args.dataset_config_name 692 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 693 | else: 694 | kwargs["dataset"] = data_args.dataset_name 695 | 696 | if training_args.push_to_hub: 697 | trainer.push_to_hub(**kwargs) 698 | else: 699 | trainer.create_model_card(**kwargs) 700 | 701 | 702 | def _mp_fn(index): 703 | # For xla_spawn (TPUs) 704 | main() 705 | 706 | 707 | if __name__ == "__main__": 708 | main() 709 | -------------------------------------------------------------------------------- /mrc/run_seq2seq_qa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2021 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library's seq2seq models for question answering using the 🤗 Seq2SeqTrainer. 18 | """ 19 | # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
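# Unlike the extractive SpanMRC script, this model generates the answer text directly. Each example is
# linearised as "question: <question> context: <context>" (see preprocess_squad_batch below) and the
# target is the answer string, empty for unanswerable questions; illustrative values:
#
#     input : "question: 收购的主体是什么? context: A公司宣布收购B公司"
#     target: "A公司"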
20 | 21 | import logging 22 | import os 23 | import sys 24 | from dataclasses import dataclass, field 25 | from typing import List, Optional, Tuple 26 | 27 | import datasets 28 | from datasets import load_dataset, load_metric 29 | import evaluate 30 | import transformers 31 | from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer 32 | from transformers import ( 33 | AutoConfig, 34 | AutoModelForSeq2SeqLM, 35 | AutoTokenizer, 36 | T5TokenizerFast, 37 | DataCollatorForSeq2Seq, 38 | HfArgumentParser, 39 | Seq2SeqTrainingArguments, 40 | set_seed, 41 | ) 42 | from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint 43 | from transformers.utils import check_min_version 44 | from transformers.utils.versions import require_version 45 | 46 | import pdb 47 | from mrc_task_config import * 48 | 49 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") 50 | 51 | logger = logging.getLogger(__name__) 52 | 53 | @dataclass 54 | class TaskArguments: 55 | is_extractive: bool = field( 56 | default=False, 57 | metadata={"help": "Used in data preprocessing; determines whether to compute the answer offsets in the source text. Set to True for the SpanMRC model and to False for the Seq2Seq model."} 58 | ) 59 | pred_trg_file: Optional[str] = field( 60 | default=None, 61 | metadata={"help": "Path of the trigger prediction file. If specified and task_name is 'arg', argument extraction is run on the predicted triggers rather than on the gold-labeled triggers."} 62 | ) 63 | pred_trg_col: Optional[str] = field( 64 | default='pred_triggers', 65 | metadata={"help": "Column name of the predicted triggers in pred_trg_file."} 66 | ) 67 | output_filename: Optional[str] = field( 68 | default=None, 69 | metadata={"help": "Optional file name for the model prediction output."} 70 | ) 71 | 72 | @dataclass 73 | class ModelArguments: 74 | """ 75 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 76 | """ 77 | 78 | model_name_or_path: str = field( 79 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 80 | ) 81 | config_name: Optional[str] = field( 82 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 83 | ) 84 | tokenizer_name: Optional[str] = field( 85 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 86 | ) 87 | cache_dir: Optional[str] = field( 88 | default=None, 89 | metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, 90 | ) 91 | use_fast_tokenizer: bool = field( 92 | default=True, 93 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 94 | ) 95 | model_revision: str = field( 96 | default="main", 97 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 98 | ) 99 | use_auth_token: bool = field( 100 | default=False, 101 | metadata={ 102 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 103 | "with private models)." 104 | }, 105 | ) 106 | 107 | 108 | @dataclass 109 | class DataTrainingArguments: 110 | """ 111 | Arguments pertaining to what data we are going to input our model for training and eval.
112 | """ 113 | 114 | dataset_name: Optional[str] = field( 115 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 116 | ) 117 | dataset_config_name: Optional[str] = field( 118 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 119 | ) 120 | context_column: Optional[str] = field( 121 | default="context", 122 | metadata={"help": "The name of the column in the datasets containing the contexts (for question answering)."}, 123 | ) 124 | question_column: Optional[str] = field( 125 | default="question", 126 | metadata={"help": "The name of the column in the datasets containing the questions (for question answering)."}, 127 | ) 128 | answer_column: Optional[str] = field( 129 | default="answers", 130 | metadata={"help": "The name of the column in the datasets containing the answers (for question answering)."}, 131 | ) 132 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 133 | validation_file: Optional[str] = field( 134 | default=None, 135 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 136 | ) 137 | test_file: Optional[str] = field( 138 | default=None, 139 | metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, 140 | ) 141 | overwrite_cache: bool = field( 142 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 143 | ) 144 | preprocessing_num_workers: Optional[int] = field( 145 | default=None, 146 | metadata={"help": "The number of processes to use for the preprocessing."}, 147 | ) 148 | max_seq_length: int = field( 149 | default=128, 150 | metadata={ 151 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 152 | "than this will be truncated, sequences shorter will be padded." 153 | }, 154 | ) 155 | max_answer_length: int = field( 156 | default=30, 157 | metadata={ 158 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 159 | "and end predictions are not conditioned on one another." 160 | }, 161 | ) 162 | val_max_answer_length: Optional[int] = field( 163 | default=None, 164 | metadata={ 165 | "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " 166 | "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`." 167 | "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " 168 | "during ``evaluate`` and ``predict``." 169 | }, 170 | ) 171 | pad_to_max_length: bool = field( 172 | default=True, 173 | metadata={ 174 | "help": "Whether to pad all samples to `max_seq_length`. " 175 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 176 | "be faster on GPU but will be slower on TPU)." 177 | }, 178 | ) 179 | max_train_samples: Optional[int] = field( 180 | default=None, 181 | metadata={ 182 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 183 | "value if set." 184 | }, 185 | ) 186 | max_eval_samples: Optional[int] = field( 187 | default=None, 188 | metadata={ 189 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 190 | "value if set." 
191 | }, 192 | ) 193 | max_predict_samples: Optional[int] = field( 194 | default=None, 195 | metadata={ 196 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 197 | "value if set." 198 | }, 199 | ) 200 | version_2_with_negative: bool = field( 201 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 202 | ) 203 | null_score_diff_threshold: float = field( 204 | default=0.0, 205 | metadata={ 206 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 207 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 208 | "Only useful when `version_2_with_negative=True`." 209 | }, 210 | ) 211 | doc_stride: int = field( 212 | default=128, 213 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 214 | ) 215 | n_best_size: int = field( 216 | default=20, 217 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 218 | ) 219 | num_beams: Optional[int] = field( 220 | default=None, 221 | metadata={ 222 | "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " 223 | "which is used during ``evaluate`` and ``predict``." 224 | }, 225 | ) 226 | ignore_pad_token_for_loss: bool = field( 227 | default=True, 228 | metadata={ 229 | "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." 230 | }, 231 | ) 232 | 233 | def __post_init__(self): 234 | if ( 235 | self.dataset_name is None 236 | and self.train_file is None 237 | and self.validation_file is None 238 | and self.test_file is None 239 | ): 240 | raise ValueError("Need either a dataset name or a training/validation file/test_file.") 241 | else: 242 | if self.train_file is not None: 243 | extension = self.train_file.split(".")[-1] 244 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 245 | if self.validation_file is not None: 246 | extension = self.validation_file.split(".")[-1] 247 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 248 | if self.test_file is not None: 249 | extension = self.test_file.split(".")[-1] 250 | assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 251 | if self.val_max_answer_length is None: 252 | self.val_max_answer_length = self.max_answer_length 253 | 254 | 255 | question_answering_column_name_mapping = { 256 | "squad_v2": ("question", "context", "answer"), 257 | } 258 | 259 | 260 | def main(): 261 | # See all possible arguments in src/transformers/training_args.py 262 | # or by passing the --help flag to this script. 263 | # We now keep distinct sets of args, for a cleaner separation of concerns. 264 | 265 | parser = HfArgumentParser((TaskArguments, ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) 266 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 267 | # If we pass only one argument to the script and it's the path to a json file, 268 | # let's parse it to get our arguments. 
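        # e.g. `python run_seq2seq_qa.py args.json`, where args.json maps the dataclass fields above to
        # values (illustrative file, not part of the repository):
        #
        #     {"model_name_or_path": "google/mt5-base", "output_dir": "./output/arg_seq2seqmrc/mt5-base",
        #      "train_file": "../dataset/train.csv", "validation_file": "../dataset/dev.csv",
        #      "test_file": "../dataset/test.csv", "do_train": true, "predict_with_generate": true}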
269 | task_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 270 | else: 271 | task_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses() 272 | 273 | # training_args.do_eval = False 274 | 275 | # Setup logging 276 | logging.basicConfig( 277 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 278 | datefmt="%m/%d/%Y %H:%M:%S", 279 | handlers=[logging.StreamHandler(sys.stdout)], 280 | ) 281 | 282 | log_level = training_args.get_process_log_level() 283 | logger.setLevel(log_level) 284 | datasets.utils.logging.set_verbosity(log_level) 285 | transformers.utils.logging.set_verbosity(log_level) 286 | transformers.utils.logging.enable_default_handler() 287 | transformers.utils.logging.enable_explicit_format() 288 | 289 | # Log on each process the small summary: 290 | logger.warning( 291 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 292 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 293 | ) 294 | logger.info(f"Training/evaluation parameters {training_args}") 295 | 296 | # Detecting last checkpoint. 297 | last_checkpoint = None 298 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 299 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 300 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 301 | raise ValueError( 302 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 303 | "Use --overwrite_output_dir to overcome." 304 | ) 305 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 306 | logger.info( 307 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 308 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 309 | ) 310 | 311 | # Set seed before initializing model. 312 | set_seed(training_args.seed) 313 | 314 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 315 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 316 | # (the dataset will be downloaded automatically from the datasets Hub). 317 | # 318 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 319 | # 'text' is found. You can easily tweak this behavior (see below). 320 | # 321 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 322 | # download the dataset. 323 | raw_datasets = load_my_datasets_for_mrc(data_args, task_args) 324 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 325 | # https://huggingface.co/docs/datasets/loading_datasets.html. 326 | 327 | # Load pretrained model and tokenizer 328 | # 329 | # Distributed training: 330 | # The .from_pretrained methods guarantee that only one local process can concurrently 331 | # download model & vocab. 
332 | logger.info("loading pretrained model and tokenizer") 333 | config = AutoConfig.from_pretrained( 334 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 335 | cache_dir=model_args.cache_dir, 336 | revision=model_args.model_revision, 337 | use_auth_token=True if model_args.use_auth_token else None, 338 | ) 339 | tokenizer = AutoTokenizer.from_pretrained( 340 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 341 | use_fast=True, 342 | cache_dir=model_args.cache_dir, 343 | revision=model_args.model_revision, 344 | use_auth_token=True if model_args.use_auth_token else None, 345 | ) 346 | model = AutoModelForSeq2SeqLM.from_pretrained( 347 | model_args.model_name_or_path, 348 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 349 | config=config, 350 | cache_dir=model_args.cache_dir, 351 | revision=model_args.model_revision, 352 | use_auth_token=True if model_args.use_auth_token else None, 353 | ) 354 | 355 | model.resize_token_embeddings(len(tokenizer)) 356 | 357 | if model.config.decoder_start_token_id is None: 358 | raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") 359 | 360 | # Preprocessing the datasets. 361 | # We need to generate and tokenize inputs and targets. 362 | if training_args.do_train: 363 | column_names = raw_datasets["train"].column_names 364 | elif training_args.do_eval: 365 | column_names = raw_datasets["validation"].column_names 366 | elif training_args.do_predict: 367 | column_names = raw_datasets["test"].column_names 368 | else: 369 | logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") 370 | return 371 | 372 | # Get the column names for input/target. 373 | dataset_columns = question_answering_column_name_mapping.get(data_args.dataset_name, None) 374 | if data_args.question_column is None: 375 | question_column = dataset_columns[0] if dataset_columns is not None else column_names[0] 376 | else: 377 | question_column = data_args.question_column 378 | if question_column not in column_names: 379 | raise ValueError( 380 | f"--question_column' value '{data_args.question_column}' needs to be one of: {', '.join(column_names)}" 381 | ) 382 | if data_args.context_column is None: 383 | context_column = dataset_columns[1] if dataset_columns is not None else column_names[1] 384 | else: 385 | context_column = data_args.context_column 386 | if context_column not in column_names: 387 | raise ValueError( 388 | f"--context_column' value '{data_args.context_column}' needs to be one of: {', '.join(column_names)}" 389 | ) 390 | if data_args.answer_column is None: 391 | answer_column = dataset_columns[2] if dataset_columns is not None else column_names[2] 392 | else: 393 | answer_column = data_args.answer_column 394 | if answer_column not in column_names: 395 | raise ValueError( 396 | f"--answer_column' value '{data_args.answer_column}' needs to be one of: {', '.join(column_names)}" 397 | ) 398 | 399 | # Temporarily set max_answer_length for training. 400 | max_answer_length = data_args.max_answer_length 401 | padding = "max_length" if data_args.pad_to_max_length else False 402 | 403 | if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): 404 | logger.warning( 405 | "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" 406 | f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" 407 | ) 408 | 409 | if data_args.max_seq_length > tokenizer.model_max_length: 410 | logger.warning( 411 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 412 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 413 | ) 414 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 415 | 416 | def preprocess_squad_batch( 417 | examples, 418 | question_column: str, 419 | context_column: str, 420 | answer_column: str, 421 | ) -> Tuple[List[str], List[str]]: 422 | questions = examples[question_column] 423 | contexts = examples[context_column] 424 | answers = examples[answer_column] 425 | 426 | def generate_input(_question, _context): 427 | return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) 428 | 429 | inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] 430 | # Use the first gold answer as the generation target; unanswerable questions map to an empty string. 431 | targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] 432 | 433 | return inputs, targets 434 | 435 | def preprocess_function(examples): 436 | # modified: convert answers serialized as strings by the CSV round-trip (e.g. "{'text': [...], 'answer_start': [...]}") back into dicts 437 | if isinstance(examples[answer_column][0], str): 438 | for i in range(len(examples[answer_column])): 439 | examples[answer_column][i] = eval(examples[answer_column][i]) 440 | # (ast.literal_eval would be a safer drop-in for eval on these pandas-serialized dicts) 441 | 442 | inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) 443 | 444 | model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) 445 | # Setup the tokenizer for targets 446 | with tokenizer.as_target_tokenizer(): 447 | labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) 448 | 449 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore 450 | # padding in the loss. 451 | if padding == "max_length" and data_args.ignore_pad_token_for_loss: 452 | labels["input_ids"] = [ 453 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 454 | ] 455 | 456 | model_inputs["labels"] = labels["input_ids"] 457 | return model_inputs 458 | 459 | # Validation preprocessing 460 | def preprocess_validation_function(examples): 461 | # modified: convert answers serialized as strings by the CSV back into dicts (see preprocess_function) 462 | if isinstance(examples[answer_column][0], str): 463 | for i in range(len(examples[answer_column])): 464 | examples[answer_column][i] = eval(examples[answer_column][i]) 465 | inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) 466 | 467 | model_inputs = tokenizer( 468 | inputs, 469 | max_length=max_seq_length, 470 | padding=padding, 471 | truncation=True, 472 | return_overflowing_tokens=True, 473 | return_offsets_mapping=True, 474 | ) 475 | # Setup the tokenizer for targets 476 | with tokenizer.as_target_tokenizer(): 477 | labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) 478 | 479 | # Since one example might give us several features if it has a long context, we need a map from a feature to 480 | # its corresponding example. This key gives us just that.
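# Illustrative (hypothetical) values: with return_overflowing_tokens=True, a batch of two examples
# in which the first one overflows into two features would come back with
#   model_inputs["overflow_to_sample_mapping"] == [0, 0, 1]
# so features 0 and 1 are mapped to examples["id"][0] below, and feature 2 to examples["id"][1].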
481 | sample_mapping = model_inputs.pop("overflow_to_sample_mapping") 482 | 483 | # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the 484 | # corresponding example_id and we will store the offset mappings. 485 | model_inputs["example_id"] = [] 486 | 487 | for i in range(len(model_inputs["input_ids"])): 488 | # One example can give several spans, this is the index of the example containing this span of text. 489 | sample_index = sample_mapping[i] 490 | model_inputs["example_id"].append(examples["id"][sample_index]) 491 | 492 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore 493 | # padding in the loss. 494 | if padding == "max_length" and data_args.ignore_pad_token_for_loss: 495 | labels["input_ids"] = [ 496 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 497 | ] 498 | 499 | model_inputs["labels"] = labels["input_ids"] 500 | return model_inputs 501 | 502 | if training_args.do_train: 503 | if "train" not in raw_datasets: 504 | raise ValueError("--do_train requires a train dataset") 505 | train_dataset = raw_datasets["train"] 506 | if data_args.max_train_samples is not None: 507 | # We will select sample from whole data if agument is specified 508 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 509 | train_dataset = train_dataset.select(range(max_train_samples)) 510 | # Create train feature from dataset 511 | with training_args.main_process_first(desc="train dataset map pre-processing"): 512 | train_dataset = train_dataset.map( 513 | preprocess_function, 514 | batched=True, 515 | num_proc=data_args.preprocessing_num_workers, 516 | remove_columns=column_names, 517 | load_from_cache_file=not data_args.overwrite_cache, 518 | desc="Running tokenizer on train dataset", 519 | ) 520 | if data_args.max_train_samples is not None: 521 | # Number of samples might increase during Feature Creation, We select only specified max samples 522 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 523 | train_dataset = train_dataset.select(range(max_train_samples)) 524 | 525 | if training_args.do_eval: 526 | if "validation" not in raw_datasets: 527 | raise ValueError("--do_eval requires a validation dataset") 528 | eval_examples = raw_datasets["validation"] 529 | if data_args.max_eval_samples is not None: 530 | # We will select sample from whole data 531 | max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) 532 | eval_examples = eval_examples.select(range(max_eval_samples)) 533 | # Validation Feature Creation 534 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 535 | eval_dataset = eval_examples.map( 536 | preprocess_validation_function, 537 | batched=True, 538 | num_proc=data_args.preprocessing_num_workers, 539 | remove_columns=column_names, 540 | load_from_cache_file=not data_args.overwrite_cache, 541 | desc="Running tokenizer on validation dataset", 542 | ) 543 | if data_args.max_eval_samples is not None: 544 | # During Feature creation dataset samples might increase, we will select required samples again 545 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 546 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 547 | 548 | if training_args.do_predict: 549 | if "test" not in raw_datasets: 550 | raise ValueError("--do_predict requires a test dataset") 551 | predict_examples = raw_datasets["test"] 552 | if 
data_args.max_predict_samples is not None: 553 | # We will select samples from the whole data (capped at the dataset size, as for train/eval) 554 | predict_examples = predict_examples.select(range(min(len(predict_examples), data_args.max_predict_samples))) 555 | # Predict Feature Creation 556 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 557 | predict_dataset = predict_examples.map( 558 | preprocess_validation_function, 559 | batched=True, 560 | num_proc=data_args.preprocessing_num_workers, 561 | remove_columns=column_names, 562 | load_from_cache_file=not data_args.overwrite_cache, 563 | desc="Running tokenizer on prediction dataset", 564 | ) 565 | if data_args.max_predict_samples is not None: 566 | # Feature creation might increase the number of samples, so we select the required number again 567 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 568 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 569 | 570 | # Data collator 571 | label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id 572 | data_collator = DataCollatorForSeq2Seq( 573 | tokenizer, 574 | model=model, 575 | label_pad_token_id=label_pad_token_id, 576 | pad_to_multiple_of=8 if training_args.fp16 else None, 577 | ) 578 | 579 | logger.info("***** loading metric *****") 580 | # squad_v2 additionally scores unanswerable questions (via no_answer_probability) 581 | metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") 582 | 583 | def compute_metrics(p: EvalPrediction): 584 | if isinstance(p.label_ids[0]['answers'], str):  # references may still carry CSV-serialized answer dicts 585 | for i in range(len(p.label_ids)): 586 | p.label_ids[i]['answers'] = eval(p.label_ids[i]['answers']) 587 | return metric.compute(predictions=p.predictions, references=p.label_ids) 588 | 589 | # Post-processing: 590 | def post_processing_function( 591 | examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval" 592 | ): 593 | 594 | # Decode the predicted tokens. `outputs` may arrive as an EvalLoopOutput-like 595 | # tuple, in which case its first element holds the generated token ids 596 | # (the upstream script reads `outputs.predictions` explicitly). 597 | preds = outputs 598 | if isinstance(preds, tuple): 599 | preds = preds[0] 600 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 601 | 602 | # Build a map from each example to its corresponding features. 603 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])} 604 | feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} 605 | predictions = {} 606 | # Let's loop over all the examples! 607 | for example_index, example in enumerate(examples): 608 | # This is the index of the feature associated with the current example. 609 | feature_index = feature_per_example[example_index] 610 | predictions[example["id"]] = decoded_preds[feature_index] 611 | 612 | # Format the result to the format the metric expects.
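# For reference, the squad_v2 metric expects entries shaped like the following (hypothetical values):
#   predictions: {"id": "123_0", "prediction_text": "answer text", "no_answer_probability": 0.0}
#   references:  {"id": "123_0", "answers": {"text": ["answer text"], "answer_start": [5]}}
# The plain squad metric uses the same shapes without the no_answer_probability field.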
613 | if data_args.version_2_with_negative: 614 | formatted_predictions = [ 615 | {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() 616 | ] 617 | else: 618 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 619 | 620 | references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples] 621 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 622 | 623 | # Initialize our Trainer 624 | trainer = QuestionAnsweringSeq2SeqTrainer( 625 | model=model, 626 | args=training_args, 627 | train_dataset=train_dataset if training_args.do_train else None, 628 | eval_dataset=eval_dataset if training_args.do_eval else None, 629 | eval_examples=eval_examples if training_args.do_eval else None, 630 | tokenizer=tokenizer, 631 | data_collator=data_collator, 632 | compute_metrics=compute_metrics, 633 | post_process_function=post_processing_function, 634 | ) 635 | 636 | # Training 637 | if training_args.do_train: 638 | checkpoint = None 639 | if training_args.resume_from_checkpoint is not None: 640 | checkpoint = training_args.resume_from_checkpoint 641 | elif last_checkpoint is not None: 642 | checkpoint = last_checkpoint 643 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 644 | trainer.save_model() # Saves the tokenizer too for easy upload 645 | 646 | metrics = train_result.metrics 647 | max_train_samples = ( 648 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 649 | ) 650 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 651 | 652 | trainer.log_metrics("train", metrics) 653 | trainer.save_metrics("train", metrics) 654 | trainer.save_state() 655 | 656 | # Evaluation 657 | results = {} 658 | max_length = ( 659 | training_args.generation_max_length 660 | if training_args.generation_max_length is not None 661 | else data_args.val_max_answer_length 662 | ) 663 | num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams 664 | if training_args.do_eval: 665 | logger.info("*** Evaluate ***") 666 | metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") 667 | 668 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 669 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 670 | 671 | trainer.log_metrics("eval", metrics) 672 | trainer.save_metrics("eval", metrics) 673 | 674 | # Prediction 675 | if training_args.do_predict: 676 | logger.info("*** Predict ***") 677 | results = trainer.predict(predict_dataset, predict_examples) 678 | metrics = results.metrics 679 | 680 | max_predict_samples = ( 681 | data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) 682 | ) 683 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 684 | 685 | trainer.log_metrics("predict", metrics) 686 | trainer.save_metrics("predict", metrics) 687 | 688 | # **** save prediction file **** 689 | # read predictions 690 | logger.info("postprocessing output") 691 | ids = [] 692 | preds = [] 693 | title_id_lst = [] 694 | title_lst = [] 695 | trigger_lst = [] 696 | gold_triple_lst = [] 697 | event_triple_lst = [] 698 | for idx, item in enumerate(raw_datasets['test']): 699 | ids.append(results[0][idx]['id']) 700 | ans = results[0][idx]['prediction_text'] 701 | if not ans: ans = "" 702 | preds.append(ans) 703 | 
title_id_lst.append(int(item['title_id'])) 704 | title_lst.append(item['context']) 705 | gold_triple_lst.append(item['gold_answer_triples']) 706 | trigger_lst.append(item['trigger']) 707 | event_triple_lst.append(item['triple']) 708 | pred_df = pd.DataFrame({"title_id": title_id_lst, "title": title_lst, "trigger": trigger_lst, "event_triples": event_triple_lst, "gold_answer_triples": gold_triple_lst, "preds": preds}) 709 | # Aggregate the subject and object answers predicted for the same event triple (re-inserting the 710 | # trigger between them), then collect all predicted triples belonging to the same title into one row. 711 | pred_df['idx'] = pred_df.index.tolist() 712 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: str(x))  # stringify so the triple can serve as a groupby key 713 | pred_df = pred_df.groupby(['title_id', 'title', 'event_triples']).agg(list).sort_values(by='idx', axis=0).reset_index() 714 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: eval(x)) 715 | pred_df['trigger'] = pred_df['trigger'].apply(lambda x: x[0]) 716 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 717 | pred_df.apply(lambda row: row.preds.insert(1, row.trigger), axis=1)  # in-place: insert the trigger between the two role answers to form a predicted triple 718 | pred_df.rename({"preds": "pred_event_triples"}, axis=1, inplace=True) 719 | pred_df['idx'] = pred_df.index.tolist() 720 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='idx', axis=0).reset_index() 721 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 722 | pred_df = pred_df[['title_id', 'title', 'gold_answer_triples', 'pred_event_triples']] 723 | pred_df.rename({"gold_answer_triples": "event_triples"}, axis=1, inplace=True) 724 | 725 | if task_args.output_filename: 726 | output_name = task_args.output_filename 727 | else: 728 | output_name = "pipeline_predictions.csv" if task_args.pred_trg_file else "arg_predictions.csv" 729 | 730 | output_predictions_file = os.path.join(training_args.output_dir, output_name) 731 | pred_df.to_csv(output_predictions_file, index=False) 732 | logger.info("saved predictions to %s" % output_predictions_file) 733 | 734 | if training_args.push_to_hub: 735 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} 736 | if data_args.dataset_name is not None: 737 | kwargs["dataset_tags"] = data_args.dataset_name 738 | if data_args.dataset_config_name is not None: 739 | kwargs["dataset_args"] = data_args.dataset_config_name 740 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 741 | else: 742 | kwargs["dataset"] = data_args.dataset_name 743 | 744 | trainer.push_to_hub(**kwargs) 745 | 746 | 747 | def _mp_fn(index): 748 | # For xla_spawn (TPUs) 749 | main() 750 | 751 | 752 | if __name__ == "__main__": 753 | main() 754 | --------------------------------------------------------------------------------