├── .gitignore ├── mrc ├── requirements.txt ├── run_spanmrc.sh ├── run_seq2seqmrc.sh ├── run_qa.sh ├── mrc_task_config.py ├── trainer_qa.py ├── trainer_seq2seq_qa.py ├── README.md ├── utils_qa.py ├── run_qa_beam_search.py ├── run_qa.py └── run_seq2seq_qa.py ├── requirements.txt ├── seqtag ├── run_trigger_extraction.sh ├── run_argument_extraction.sh ├── ner_task_config.py └── run_ner.py ├── README.md └── evaluate.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | *.ipynb 3 | *.csv 4 | *.json 5 | -------------------------------------------------------------------------------- /mrc/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.8.0 3 | torch >= 1.3.0 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | transformers==4.19.0 3 | datasets>=1.8.0 4 | evaluate 5 | seqeval 6 | tabulate 7 | -------------------------------------------------------------------------------- /seqtag/run_trigger_extraction.sh: -------------------------------------------------------------------------------- 1 | # run trigger extraction using sequence-tagging model 2 | MODEL=bert-base-chinese 3 | OUTPUT_DIR=./output/trg_seqtag/$MODEL 4 | 5 | python run_ner.py \ 6 | --model_name_or_path $MODEL \ 7 | --task_name trg \ 8 | --output_dir $OUTPUT_DIR \ 9 | --overwrite_output_dir \ 10 | --train_file ../dataset/tagging/tagging_train.csv \ 11 | --validation_file ../dataset/tagging/tagging_dev.csv \ 12 | --test_file ../dataset/tagging/tagging_test.csv \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --output_filename trg_predictions.csv \ 17 | --per_device_train_batch_size=32 \ 18 | --per_device_eval_batch_size=32 \ 19 | --num_train_epochs 30 \ 20 | --save_strategy epoch \ 21 | --logging_strategy epoch \ 22 | --evaluation_strategy epoch \ 23 | --save_total_limit 1 \ 24 | --load_best_model_at_end \ 25 | --metric_for_best_model f1 \ 26 | --text_column_name tokens 27 | -------------------------------------------------------------------------------- /mrc/run_spanmrc.sh: -------------------------------------------------------------------------------- 1 | # first, lets train argument extraction SpanMRC model on golden triggers 2 | python run_qa.py \ 3 | --model_name_or_path bert-base-chinese \ 4 | --output_dir ./output/arg_spanmrc/bert-base-chinese \ 5 | --overwrite_output_dir \ 6 | --version_2_with_negative \ 7 | --train_file ../dataset/train.csv \ 8 | --validation_file ../dataset/dev.csv \ 9 | --test_file ../dataset/test.csv \ 10 | --do_train \ 11 | --do_eval \ 12 | --do_predict \ 13 | --output_filename arg_predictions.csv \ 14 | --per_device_train_batch_size=32 \ 15 | --per_device_eval_batch_size=32 \ 16 | --num_train_epochs 30 \ 17 | --save_strategy epoch \ 18 | --logging_strategy epoch \ 19 | --evaluation_strategy epoch \ 20 | --save_total_limit 1 \ 21 | --load_best_model_at_end \ 22 | --metric_for_best_model eval_f1 23 | 24 | # then, given the extraced triggers, we could use this model to predict arguments in a pipeline manner 25 | PRED_TRIGGER_FILE=../seqtag/output/trg_seqtag/bert-base-chinese/trg_predictions.csv 26 | python run_qa.py \ 27 | --model_name_or_path ./output/arg_spanmrc/bert-base-chinese \ 28 | --output_dir ./output/arg_spanmrc/bert-base-chinese \ 29 | --overwrite_output_dir \ 30 | --version_2_with_negative \ 31 | 
--train_file ../dataset/train.csv \ 32 | --validation_file ../dataset/dev.csv \ 33 | --test_file ../dataset/test.csv \ 34 | --pred_trg_file $PRED_TRIGGER_FILE \ 35 | --do_train false \ 36 | --do_eval false \ 37 | --do_predict \ 38 | --output_filename pipeline_predictions.csv \ 39 | --per_device_train_batch_size=32 \ 40 | --per_device_eval_batch_size=32 \ 41 | --num_train_epochs 30 \ 42 | --save_strategy epoch \ 43 | --logging_strategy epoch \ 44 | --evaluation_strategy epoch \ 45 | --save_total_limit 1 \ 46 | --load_best_model_at_end \ 47 | --metric_for_best_model eval_f1 -------------------------------------------------------------------------------- /seqtag/run_argument_extraction.sh: -------------------------------------------------------------------------------- 1 | 2 | # first, lets train argument extraction model on golden triggers 3 | python3 run_ner.py \ 4 | --model_name_or_path bert-base-chinese \ 5 | --task_name arg \ 6 | --output_dir ./output/arg_seqtag/bert-base-chinese \ 7 | --overwrite_output_dir \ 8 | --train_file ../dataset/tagging/tagging_train.csv \ 9 | --validation_file ../dataset/tagging/tagging_dev.csv \ 10 | --test_file ../dataset/tagging/tagging_test.csv \ 11 | --do_train \ 12 | --do_eval \ 13 | --do_predict \ 14 | --output_filename arg_predictions.csv \ 15 | --per_device_train_batch_size=32 \ 16 | --per_device_eval_batch_size=32 \ 17 | --num_train_epochs 30 \ 18 | --save_strategy epoch \ 19 | --logging_strategy epoch \ 20 | --evaluation_strategy epoch \ 21 | --save_total_limit 1 \ 22 | --load_best_model_at_end \ 23 | --metric_for_best_model f1 \ 24 | --text_column_name tokens 25 | 26 | 27 | # then, given the extraced triggers, we could use this model to predict arguments in a pipeline manner 28 | PRED_TRIGGER_FILE=./output/trg_seqtag/bert-base-chinese/trg_predictions.csv 29 | python3 run_ner.py \ 30 | --model_name_or_path ./output/arg_seqtag/bert-base-chinese \ 31 | --task_name arg \ 32 | --output_dir ./output/arg_seqtag/bert-base-chinese \ 33 | --overwrite_output_dir \ 34 | --train_file ../dataset/tagging/tagging_train.csv \ 35 | --validation_file ../dataset/tagging/tagging_dev.csv \ 36 | --test_file ../dataset/tagging/tagging_test.csv \ 37 | --pred_trg_file $PRED_TRIGGER_FILE \ 38 | --do_train false \ 39 | --do_eval false \ 40 | --do_predict \ 41 | --output_filename pipeline_predictions.csv \ 42 | --per_device_train_batch_size=32 \ 43 | --per_device_eval_batch_size=32 \ 44 | --num_train_epochs 30 \ 45 | --save_strategy epoch \ 46 | --logging_strategy epoch \ 47 | --evaluation_strategy epoch \ 48 | --save_total_limit 1 \ 49 | --load_best_model_at_end \ 50 | --metric_for_best_model f1 \ 51 | --text_column_name tokens -------------------------------------------------------------------------------- /mrc/run_seq2seqmrc.sh: -------------------------------------------------------------------------------- 1 | # first, lets train argument extraction Seq2SeqMRC model on golden triggers 2 | python run_seq2seq_qa.py \ 3 | --model_name_or_path google/mt5-base \ 4 | --output_dir ./output/arg_seq2seqmrc/mt5-base \ 5 | --overwrite_output_dir \ 6 | --version_2_with_negative \ 7 | --train_file ../dataset/train.csv \ 8 | --validation_file ../dataset/dev.csv \ 9 | --test_file ../dataset/test.csv \ 10 | --output_filename arg_predictions.csv \ 11 | --eval_accumulation_steps 1 \ 12 | --predict_with_generate \ 13 | --do_train \ 14 | --do_eval \ 15 | --do_predict \ 16 | --learning_rate 1e-4 \ 17 | --per_device_train_batch_size 32 \ 18 | --per_device_eval_batch_size 8 \ 19 | 
--num_train_epochs 30 \ 20 | --save_strategy epoch \ 21 | --logging_strategy epoch \ 22 | --evaluation_strategy epoch \ 23 | --save_total_limit 1 \ 24 | --load_best_model_at_end \ 25 | --metric_for_best_model eval_f1 26 | 27 | # then, given the extraced triggers, we could use this model to predict arguments in a pipeline manner 28 | PRED_TRIGGER_FILE=../seqtag/output/trg_seqtag/bert-base-chinese/trg_predictions.csv 29 | python run_seq2seq_qa.py \ 30 | --model_name_or_path ./output/arg_seq2seqmrc/mt5-base \ 31 | --output_dir ./output/arg_seq2seqmrc/mt5-base \ 32 | --overwrite_output_dir \ 33 | --version_2_with_negative \ 34 | --train_file ../dataset/train.csv \ 35 | --validation_file ../dataset/dev.csv \ 36 | --test_file ../dataset/test.csv \ 37 | --pred_trg_file $PRED_TRIGGER_FILE \ 38 | --output_filename pipeline_predictions.csv \ 39 | --eval_accumulation_steps 1 \ 40 | --predict_with_generate \ 41 | --do_train false \ 42 | --do_eval false \ 43 | --do_predict \ 44 | --learning_rate 1e-4 \ 45 | --per_device_train_batch_size 32 \ 46 | --per_device_eval_batch_size 8 \ 47 | --num_train_epochs 30 \ 48 | --save_strategy epoch \ 49 | --logging_strategy epoch \ 50 | --evaluation_strategy epoch \ 51 | --save_total_limit 1 \ 52 | --load_best_model_at_end \ 53 | --metric_for_best_model eval_f1 -------------------------------------------------------------------------------- /mrc/run_qa.sh: -------------------------------------------------------------------------------- 1 | 2 | CUDA_VISIBLE_DEVICES=5 python run_seq2seq_qa.py \ 3 | --model_name_or_path ../output/arg_seq2seq_qa/mt5-base \ 4 | --output_dir ../output/arg_seq2seq_qa/mt5-base \ 5 | --overwrite_output_dir \ 6 | --version_2_with_negative \ 7 | --train_file ../../datasets/Title2Event/train.csv \ 8 | --validation_file ../../datasets/Title2Event/dev.csv \ 9 | --test_file ../../datasets/Title2Event/test.csv \ 10 | --eval_accumulation_steps 1 \ 11 | --predict_with_generate \ 12 | --version_2_with_negative \ 13 | --do_train False --do_eval False \ 14 | --learning_rate 1e-4 \ 15 | --do_predict \ 16 | --per_device_train_batch_size 32 \ 17 | --per_device_eval_batch_size 8 \ 18 | --num_train_epochs 30 \ 19 | --save_strategy epoch \ 20 | --logging_strategy epoch \ 21 | --evaluation_strategy epoch \ 22 | --save_total_limit 1 \ 23 | --load_best_model_at_end \ 24 | --metric_for_best_model eval_f1 \ 25 | --pred_trg_file ../output/trg_ner/bert-base-chinese/trg_predictions.csv \ 26 | 27 | # CUDA_VISIBLE_DEVICES=2 python run_qa.py \ 28 | # --model_name_or_path ../output/arg_qa/bert-base-chinese \ 29 | # --output_dir ../output/arg_qa/bert-base-chinese \ 30 | # --overwrite_output_dir \ 31 | # --version_2_with_negative \ 32 | # --train_file ../../datasets/Title2Event/train.csv \ 33 | # --validation_file ../../datasets/Title2Event/dev.csv \ 34 | # --test_file ../../datasets/Title2Event/test.csv \ 35 | # --do_train False --do_eval False \ 36 | # --do_predict \ 37 | # --per_device_train_batch_size=32 \ 38 | # --per_device_eval_batch_size=32 \ 39 | # --num_train_epochs 30 \ 40 | # --save_strategy epoch \ 41 | # --logging_strategy epoch \ 42 | # --evaluation_strategy epoch \ 43 | # --save_total_limit 1 \ 44 | # --load_best_model_at_end \ 45 | # --metric_for_best_model eval_f1 \ 46 | # --pred_trg_file ../output/trg_ner/bert-base-chinese/trg_predictions.csv \ 47 | 48 | 49 | # CUDA_VISIBLE_DEVICES=7 python run_seq2seq_qa.py \ 50 | # --model_name_or_path t5-base \ 51 | # --dataset_name squad_v2 \ 52 | # --context_column context \ 53 | # --question_column question \ 54 | # 
--answer_column answers \ 55 | # --do_train \ 56 | # --do_eval \ 57 | # --per_device_train_batch_size 12 \ 58 | # --learning_rate 3e-5 \ 59 | # --num_train_epochs 2 \ 60 | # --max_seq_length 384 \ 61 | # --doc_stride 128 \ 62 | # --output_dir ../output/tmp/t5-base -------------------------------------------------------------------------------- /mrc/mrc_task_config.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import Dataset 3 | 4 | def load_my_datasets_for_mrc(data_args, task_args): 5 | raw_dataset = {} 6 | if data_args.test_file is not None: 7 | tst_df = pd.read_csv(data_args.test_file).fillna("None") 8 | tst_df = preprocess(tst_df, task_args, is_pred_data=True) 9 | test_dataset = Dataset.from_pandas(tst_df) 10 | raw_dataset['test'] = test_dataset 11 | if data_args.train_file is not None: 12 | trn_df = pd.read_csv(data_args.train_file).fillna("None") 13 | trn_df = preprocess(trn_df, task_args) 14 | train_dataset = Dataset.from_pandas(trn_df) 15 | raw_dataset['train'] = train_dataset 16 | if data_args.validation_file is not None: 17 | val_df = pd.read_csv(data_args.validation_file).fillna("None") 18 | val_df = val_df 19 | val_df = preprocess(val_df, task_args) 20 | valid_dataset = Dataset.from_pandas(val_df) 21 | raw_dataset['validation'] = valid_dataset 22 | return raw_dataset 23 | 24 | def preprocess(df, task_arg, is_pred_data=False): 25 | def find_answers(row): 26 | sbj_ans, obj_ans = {"text": [], "answer_start": []}, {"text": [], "answer_start": []} 27 | if task_arg.is_extractive: 28 | if row.triple and row.triple[0] and row.triple[0] in row.title: 29 | sbj_ans['text'] = [row.triple[0]] 30 | sbj_ans['answer_start'] = [row.title.index(row.triple[0])] 31 | if row.triple and row.triple[2] and row.triple[2] in row.title: 32 | obj_ans['text'] = [row.triple[2]] 33 | obj_ans['answer_start'] = [row.title.index(row.triple[2])] 34 | else: 35 | if row.triple and row.triple[0]: sbj_ans['text'] = [row.triple[0]] 36 | if row.triple and row.triple[2]: obj_ans['text'] = [row.triple[2]] 37 | return [sbj_ans, obj_ans] 38 | for col in df.columns: 39 | if col.endswith('triple'): 40 | df[col] = df[col].apply(eval) 41 | df['triple'] = df[[f'event{i}_triple' for i in range(1,7)]].apply(lambda row: [x for x in row], axis=1) 42 | df['gold_answer_triples'] = df['triple'].apply(lambda x: x if None not in x else x[:x.index(None)]) 43 | if is_pred_data and task_arg.pred_trg_file: 44 | pred_trg_df = pd.read_csv(task_arg.pred_trg_file) 45 | df['trigger'] = pred_trg_df[task_arg.pred_trg_col].apply(eval) 46 | df['trigger'] = df['trigger'].apply(lambda x: (x + [None] * (6-len(x)))[:6]) 47 | df['triple'] = df[[f'event{i}_triple' for i in range(1,7)]].apply(lambda row: [x for x in row if x !=None], axis=1) 48 | df['triple'] = df['triple'].apply(lambda x: (x + [[]] * (6-len(x)))[:6]) 49 | 50 | else: 51 | df['trigger'] = df['triple'].apply(lambda x: [tu[1] if tu else None for tu in x]) 52 | df['id'] = df['triple'].apply(lambda x: [str(i) for i, _ in enumerate(x)]) 53 | df = df.explode(['triple', 'id', 'trigger'])[['id', 'title_id', 'title', 'trigger', 'triple', 'gold_answer_triples']].dropna().reset_index(drop=True) 54 | df['title_id'] = df['title_id'].apply(lambda x: str(x)) 55 | df['id'] = df['title_id'] + "-" + df['id'] 56 | 57 | df['question'] = df['trigger'].apply(lambda x: [f"动作{x}的主体是?", f"动作{x}的客体是?"]) 58 | df['q_id'] = [["sbj", "obj"]]*len(df) 59 | df['answers'] = df.apply(find_answers, axis=1) 60 | df.rename({"title": "context"}, 
axis=1, inplace=True) 61 | df = df.explode(['question', 'answers', 'q_id']).reset_index(drop=True) 62 | df['id'] = df['id'] + "_" + df['q_id'] 63 | df = df.drop(labels='q_id', axis=1) 64 | return df -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Title2Event 2 | This is the repository for the paper [Title2Event: Benchmarking Open Event Extraction with a Large-scale Chinese Title Dataset](https://aclanthology.org/2022.emnlp-main.437/) 3 | ## Quick Start 4 | ### Download the dataset 5 | You can obtain the dataset from our [webpage](https://open-event-hub.github.io/title2event/) \ 6 | Note that the dataset is provided in both `csv` and `json` format, but currently the baseline code reads `csv` files by default. \ 7 | You can also find `tagging_train.csv`,`tagging_dev.csv` ,`tagging_test.csv`, these files contain the `BIO` labels needed to train tagging-based models, and are used by the `SeqTag` model. 8 | ### Requirements 9 | The code is modified from [examples of huggingface transformers](https://github.com/huggingface/transformers/tree/main/examples) \ 10 | In your preferred environment, run 11 | ``` 12 | pip3 install -r requirements.txt 13 | ``` 14 | ### Trigger Extraction 15 | #### Sequence-tagging model 16 | Note that the trigger prediction file is needed for pipeline inference. 17 | ``` 18 | cd seqtag 19 | bash run_trigger_extraction.sh 20 | ``` 21 | ### Argument Extraction 22 | All the following scripts will output two files: \ 23 | **arg_predictions.csv**: the model predictions with golden triggers \ 24 | **pipeline_predictions.csv**: the model predictions given the triggers predicted by the Trigger Extraction model \ 25 | The above files are used in **Evaluation** 26 | #### Sequence-tagging model 27 | ``` 28 | cd seqtag 29 | bash run_argument_extraction.sh 30 | ``` 31 | #### Span MRC model 32 | ``` 33 | cd mrc 34 | bash run_spanmrc.sh 35 | ``` 36 | #### Seq2Seq MRC model 37 | ``` 38 | cd mrc 39 | bash run_seq2seqmrc.sh 40 | ``` 41 | 42 | ### Evaluation 43 | ``` 44 | python3 evaluate.py -f [path of file1] [path of file2] ... 45 | ``` 46 | ## Citation 47 | ``` 48 | @inproceedings{deng-etal-2022-title2event, 49 | title = "{T}itle2{E}vent: Benchmarking Open Event Extraction with a Large-scale {C}hinese Title Dataset", 50 | author = "Deng, Haolin and 51 | Zhang, Yanan and 52 | Zhang, Yangfan and 53 | Ying, Wangyang and 54 | Yu, Changlong and 55 | Gao, Jun and 56 | Wang, Wei and 57 | Bai, Xiaoling and 58 | Yang, Nan and 59 | Ma, Jin and 60 | Chen, Xiang and 61 | Zhou, Tianhua", 62 | booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", 63 | month = dec, 64 | year = "2022", 65 | address = "Abu Dhabi, United Arab Emirates", 66 | publisher = "Association for Computational Linguistics", 67 | url = "https://aclanthology.org/2022.emnlp-main.437", 68 | pages = "6511--6524", 69 | abstract = "Event extraction (EE) is crucial to downstream tasks such as new aggregation and event knowledge graph construction. Most existing EE datasets manually define fixed event types and design specific schema for each of them, failing to cover diverse events emerging from the online text. Moreover, news titles, an important source of event mentions, have not gained enough attention in current EE research. 
In this paper, we present Title2Event, a large-scale sentence-level dataset benchmarking Open Event Extraction without restricting event types. Title2Event contains more than 42,000 news titles in 34 topics collected from Chinese web pages. To the best of our knowledge, it is currently the largest manually annotated Chinese dataset for open event extraction. We further conduct experiments on Title2Event with different models and show that the characteristics of titles make it challenging for event extraction, addressing the significance of advanced study on this problem. The dataset and baseline codes are available at https://open-event-hub.github.io/title2event.", 70 | } 71 | 72 | ``` 73 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | parser = ArgumentParser() 6 | parser.add_argument('-f', '--file_paths', dest='file_paths', nargs='+', type=str, help="path(s) of model prediction file(s) to evaluate") 7 | parser.add_argument('--pred_col', default='pred_event_triples', help='column name for model predictions') 8 | parser.add_argument('--ans_col', default='event_triples', help='column name for golden answers') 9 | 10 | def read_data(path): 11 | if path.endswith('.csv'): 12 | df = pd.read_csv(path) 13 | return df 14 | 15 | 16 | def evaluate(df: pd.DataFrame, pred_col='pred_event_triples', ans_col='event_triples'): 17 | """ 18 | compute precision, recall, and F1-score for model predictions, based on exact match, and print the tabulated results 19 | args: 20 | df: the dataframe containing a column of model predictions and a column of golden answers, where each cell of both columns should be a list of triplets 21 | pred_col: column name for model predictions 22 | ans_col: column name for golden answers 23 | return: F1-scores of trigger extraction, argument extraction and triplet extraction 24 | """ 25 | gold_trp_num = sum(df[ans_col].apply(lambda x: len(x))) 26 | pred_trp_num = sum(df[pred_col].apply(lambda x: len(x))) 27 | gold_trg_num, gold_arg_num, pred_trg_num, pred_arg_num = 0, 0, 0, 0 28 | trg_match_cnt, arg_match_cnt, triple_match_cnt = 0, 0, 0 29 | 30 | for idx, row in df.iterrows(): 31 | local_triple_match_cnt = 0 32 | gold_trips = list(row[ans_col]) 33 | gold_trgs = [trp[1] for trp in row[ans_col]] 34 | gold_sbjs = [{"P": trp[1], "S": trp[0]} for trp in row[ans_col] if trp[0]!=''] 35 | gold_objs = [{"P": trp[1], "O": trp[2]} for trp in row[ans_col] if trp[2]!=''] 36 | gold_trg_num += len(gold_trgs) 37 | gold_arg_num += (len(gold_sbjs) + len(gold_objs)) 38 | for pred in row[pred_col]: 39 | pred_trg_num += 1 40 | if len(pred) == 1: 41 | pred = ["", pred[0], ""] 42 | elif len(pred) == 2: 43 | pred.append("") 44 | if pred in gold_trips: 45 | local_triple_match_cnt += 1 46 | triple_match_cnt += 1 47 | gold_trips.remove(pred) 48 | 49 | if pred[0] != '': pred_arg_num += 1 50 | if pred[2] != '': pred_arg_num += 1 51 | if pred[1] in gold_trgs: 52 | trg_match_cnt += 1 53 | gold_trgs.remove(pred[1]) 54 | if {"P": pred[1], "S": pred[0]} in gold_sbjs: 55 | arg_match_cnt += 1 56 | gold_sbjs.remove({"P": pred[1], "S": pred[0]}) 57 | if {"P": pred[1], "O": pred[2]} in gold_objs: 58 | arg_match_cnt += 1 59 | gold_objs.remove({"P": pred[1], "O": pred[2]}) 60 | 61 | F1 = lambda p,r: "{:.5f}".format(2*p*r/(p+r)) 62 | trg_p, arg_p, trp_p = trg_match_cnt/pred_trg_num, 
arg_match_cnt/pred_arg_num, triple_match_cnt/pred_trp_num 63 | trg_r, arg_r, trp_r = trg_match_cnt/gold_trg_num, arg_match_cnt/gold_arg_num, triple_match_cnt/gold_trp_num 64 | trg_f, arg_f, trp_f = F1(trg_p, trg_r), F1(arg_p, arg_r), F1(trp_p, trp_r) 65 | 66 | header = ["task", "Precision", "Recall", "F1"] 67 | rows = [ 68 | ("Trigger Identification", trg_p, trg_r, trg_f), 69 | ("Argument Identification", arg_p, arg_r, arg_f), 70 | ("Triple Identification", trp_p, trp_r, trp_f), 71 | ] 72 | 73 | print(tabulate(rows, headers=header)) 74 | print("gold num > {}".format({"trp": gold_trp_num, "trg": gold_trg_num, "arg": gold_arg_num})) 75 | print("pred num > {}".format({"trp": pred_trp_num, "trg": pred_trg_num, "arg": pred_arg_num})) 76 | print("trg match: %d, arg match: %d, trp match: %d" % (trg_match_cnt, arg_match_cnt, triple_match_cnt)) 77 | 78 | return trg_f, arg_f, trp_f 79 | 80 | if __name__ == '__main__': 81 | args = parser.parse_args() 82 | pred_col, ans_col = args.pred_col, args.ans_col 83 | for file_path in args.file_paths: 84 | df = read_data(file_path) 85 | # post-process: 86 | # 1. convert str object "[1,2,3]" to list object [1,2,3] 87 | # 2. unify Chinese and English punctuations 88 | # 3. unify letters to lower case 89 | # 4. discard empty predictions 90 | df[ans_col] = df[ans_col].apply(lambda x: eval(x.lower().replace(" ", "").replace("：", ":"))) 91 | df['pred_event_triples'] = df['pred_event_triples'].apply(lambda x: eval(x.lower().replace(" ", "").replace("：", ":"))) 92 | df['pred_event_triples'] = df['pred_event_triples'].apply(lambda x: [i for i in x if i!=[]]) 93 | 94 | # evaluate 95 | print("************* {} ***********".format(file_path)) 96 | trg_f, arg_f, trp_f = evaluate(df, pred_col=pred_col, ans_col=ans_col) 97 | 98 | 99 | -------------------------------------------------------------------------------- /mrc/trainer_qa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
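#
# Note: this trainer is taken from the HuggingFace question-answering examples; the
# Title2Event MRC baseline relies on its `post_process_function` hook so that raw
# start/end logits are converted to answer strings before QA metrics are computed.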
15 | """ 16 | A subclass of `Trainer` specific to Question-Answering tasks 17 | """ 18 | 19 | from transformers import Trainer, is_torch_tpu_available 20 | from transformers.trainer_utils import PredictionOutput 21 | 22 | 23 | if is_torch_tpu_available(): 24 | import torch_xla.core.xla_model as xm 25 | import torch_xla.debug.metrics as met 26 | 27 | 28 | class QuestionAnsweringTrainer(Trainer): 29 | def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): 30 | super().__init__(*args, **kwargs) 31 | self.eval_examples = eval_examples 32 | self.post_process_function = post_process_function 33 | 34 | def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): 35 | eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset 36 | eval_dataloader = self.get_eval_dataloader(eval_dataset) 37 | eval_examples = self.eval_examples if eval_examples is None else eval_examples 38 | 39 | # Temporarily disable metric computation, we will do it in the loop here. 40 | compute_metrics = self.compute_metrics 41 | self.compute_metrics = None 42 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 43 | try: 44 | output = eval_loop( 45 | eval_dataloader, 46 | description="Evaluation", 47 | # No point gathering the predictions if there are no metrics, otherwise we defer to 48 | # self.args.prediction_loss_only 49 | prediction_loss_only=True if compute_metrics is None else None, 50 | ignore_keys=ignore_keys, 51 | ) 52 | finally: 53 | self.compute_metrics = compute_metrics 54 | 55 | if self.post_process_function is not None and self.compute_metrics is not None: 56 | eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) 57 | metrics = self.compute_metrics(eval_preds) 58 | 59 | # Prefix all keys with metric_key_prefix + '_' 60 | for key in list(metrics.keys()): 61 | if not key.startswith(f"{metric_key_prefix}_"): 62 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 63 | 64 | self.log(metrics) 65 | else: 66 | metrics = {} 67 | 68 | if self.args.tpu_metrics_debug or self.args.debug: 69 | # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 70 | xm.master_print(met.metrics_report()) 71 | 72 | self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) 73 | return metrics 74 | 75 | def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): 76 | predict_dataloader = self.get_test_dataloader(predict_dataset) 77 | 78 | # Temporarily disable metric computation, we will do it in the loop here. 
79 | compute_metrics = self.compute_metrics 80 | self.compute_metrics = None 81 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 82 | try: 83 | output = eval_loop( 84 | predict_dataloader, 85 | description="Prediction", 86 | # No point gathering the predictions if there are no metrics, otherwise we defer to 87 | # self.args.prediction_loss_only 88 | prediction_loss_only=True if compute_metrics is None else None, 89 | ignore_keys=ignore_keys, 90 | ) 91 | finally: 92 | self.compute_metrics = compute_metrics 93 | 94 | if self.post_process_function is None or self.compute_metrics is None: 95 | return output 96 | 97 | predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") 98 | metrics = self.compute_metrics(predictions) 99 | 100 | # Prefix all keys with metric_key_prefix + '_' 101 | for key in list(metrics.keys()): 102 | if not key.startswith(f"{metric_key_prefix}_"): 103 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 104 | 105 | return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) 106 | -------------------------------------------------------------------------------- /mrc/trainer_seq2seq_qa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The HuggingFace Team All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ 16 | A subclass of `Trainer` specific to Question-Answering tasks 17 | """ 18 | from typing import Dict, List, Optional 19 | 20 | from torch.utils.data import Dataset 21 | 22 | from transformers import Seq2SeqTrainer, is_torch_tpu_available 23 | from transformers.trainer_utils import PredictionOutput 24 | 25 | 26 | if is_torch_tpu_available(): 27 | import torch_xla.core.xla_model as xm 28 | import torch_xla.debug.metrics as met 29 | 30 | 31 | class QuestionAnsweringSeq2SeqTrainer(Seq2SeqTrainer): 32 | def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): 33 | super().__init__(*args, **kwargs) 34 | self.eval_examples = eval_examples 35 | self.post_process_function = post_process_function 36 | 37 | # def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): 38 | def evaluate( 39 | self, 40 | eval_dataset: Optional[Dataset] = None, 41 | eval_examples=None, 42 | ignore_keys: Optional[List[str]] = None, 43 | metric_key_prefix: str = "eval", 44 | max_length: Optional[int] = None, 45 | num_beams: Optional[int] = None, 46 | ) -> Dict[str, float]: 47 | self._max_length = max_length if max_length is not None else self.args.generation_max_length 48 | self._num_beams = num_beams if num_beams is not None else self.args.generation_num_beams 49 | 50 | eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset 51 | eval_dataloader = self.get_eval_dataloader(eval_dataset) 52 | eval_examples = self.eval_examples if eval_examples is None else eval_examples 53 | 54 | # Temporarily disable metric computation, we will do it in the loop here. 55 | compute_metrics = self.compute_metrics 56 | self.compute_metrics = None 57 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 58 | try: 59 | output = eval_loop( 60 | eval_dataloader, 61 | description="Evaluation", 62 | # No point gathering the predictions if there are no metrics, otherwise we defer to 63 | # self.args.prediction_loss_only 64 | prediction_loss_only=True if compute_metrics is None else None, 65 | ignore_keys=ignore_keys, 66 | ) 67 | finally: 68 | self.compute_metrics = compute_metrics 69 | 70 | if self.post_process_function is not None and self.compute_metrics is not None: 71 | eval_preds = self.post_process_function(eval_examples, eval_dataset, output) 72 | metrics = self.compute_metrics(eval_preds) 73 | 74 | # Prefix all keys with metric_key_prefix + '_' 75 | for key in list(metrics.keys()): 76 | if not key.startswith(f"{metric_key_prefix}_"): 77 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 78 | 79 | self.log(metrics) 80 | else: 81 | metrics = {} 82 | 83 | if self.args.tpu_metrics_debug or self.args.debug: 84 | # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 85 | xm.master_print(met.metrics_report()) 86 | 87 | self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) 88 | return metrics 89 | 90 | def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): 91 | predict_dataloader = self.get_test_dataloader(predict_dataset) 92 | 93 | # Temporarily disable metric computation, we will do it in the loop here. 
94 | compute_metrics = self.compute_metrics 95 | self.compute_metrics = None 96 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 97 | try: 98 | output = eval_loop( 99 | predict_dataloader, 100 | description="Prediction", 101 | # No point gathering the predictions if there are no metrics, otherwise we defer to 102 | # self.args.prediction_loss_only 103 | prediction_loss_only=True if compute_metrics is None else None, 104 | ignore_keys=ignore_keys, 105 | ) 106 | finally: 107 | self.compute_metrics = compute_metrics 108 | 109 | if self.post_process_function is None or self.compute_metrics is None: 110 | return output 111 | 112 | predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") 113 | metrics = self.compute_metrics(predictions) 114 | 115 | # Prefix all keys with metric_key_prefix + '_' 116 | for key in list(metrics.keys()): 117 | if not key.startswith(f"{metric_key_prefix}_"): 118 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 119 | 120 | return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) 121 | -------------------------------------------------------------------------------- /seqtag/ner_task_config.py: -------------------------------------------------------------------------------- 1 | from unittest import result 2 | import datasets 3 | from datasets import ClassLabel, Dataset, load_dataset, load_metric, Features 4 | import pandas as pd 5 | import pdb 6 | from transformers import AutoTokenizer 7 | dataset_tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext') 8 | dataset_tokenizer.add_special_tokens({"additional_special_tokens": ['“', '”', "…", "……", "—", "——"]}) # unrecognized by tokenizer 9 | 10 | Task2LabelCol = {'trg': "trg_tags", 'arg': "arg_tags", 'joint': ""} 11 | trg_tags = ('O', 'B-T1', 'I-T1', 'B-T2', 'I-T2', 'B-T3', 'I-T3', 'B-T4', 'I-T4', 'B-T5', 'I-T5', 'B-T6', 'I-T6') 12 | arg_tags = ('O', 'B-sbj', 'I-sbj', 'B-obj', 'I-obj') 13 | 14 | 15 | Task2Features = { 16 | # trigger extraction 17 | "trg": Features( 18 | { 19 | # "id": datasets.Value("string"), 20 | "tokens": datasets.Sequence(datasets.Value("string")), 21 | Task2LabelCol['trg']: datasets.Sequence( 22 | feature=datasets.features.ClassLabel( 23 | names=sorted(list(trg_tags)) 24 | ) 25 | ) 26 | } 27 | ), 28 | # argument extraction 29 | "arg": Features( 30 | { 31 | # "id": datasets.Value("string"), 32 | "tokens": datasets.Sequence(datasets.Value("string")), 33 | Task2LabelCol['arg']: datasets.Sequence( 34 | feature=datasets.features.ClassLabel( 35 | names=sorted(list(arg_tags)) 36 | ) 37 | ) 38 | } 39 | ) 40 | } 41 | 42 | def load_my_datasets_for_ner(data_args, task_args): 43 | if data_args.test_file is not None: 44 | tst_df = pd.read_csv(data_args.test_file) 45 | tst_df = preprocess(tst_df, task_args, is_test_data=True) 46 | test_dataset = Dataset.from_pandas(tst_df) 47 | # load dataframes 48 | if data_args.train_file is not None: 49 | trn_df = pd.read_csv(data_args.train_file) 50 | trn_df = preprocess(trn_df, task_args) 51 | train_dataset = Dataset.from_pandas(trn_df) 52 | if data_args.validation_file is not None: 53 | val_df = pd.read_csv(data_args.validation_file) 54 | val_df = preprocess(val_df, task_args) 55 | valid_dataset = Dataset.from_pandas(val_df) 56 | raw_dataset = {"train": train_dataset, 'validation': valid_dataset, 'test': test_dataset} 57 | return raw_dataset 58 | 59 | def preprocess(df: pd.DataFrame, task_args, 
is_test_data=False): 60 | for col in df.columns: 61 | if col.endswith('tags') or col.endswith('triple') or col=='tokens': 62 | df[col] = df[col].fillna("None").apply(eval) 63 | if task_args.just_infer and is_test_data: 64 | df['tokens'] = df['title'].apply(lambda x: dataset_tokenizer.tokenize(x)) 65 | df[Task2LabelCol[task_args.task_name]] = df['tokens'].apply(lambda x: ["O"]*len(x)) 66 | 67 | if is_test_data and task_args.task_name == 'arg' and task_args.pred_trg_file: # do argument extraction based on predicted triggers rather than golden labeled triggers 68 | pred_trg_df = pd.read_csv(task_args.pred_trg_file) 69 | pred_trg_df[task_args.pred_trg_tag_col] = pred_trg_df[task_args.pred_trg_tag_col].apply(eval) 70 | df['trg_tags'] = pred_trg_df[task_args.pred_trg_tag_col] 71 | df = expand_arg_tags(df, ignore_arg_tags=True) 72 | 73 | elif task_args.task_name == 'arg': 74 | df = expand_arg_tags(df) 75 | return df 76 | 77 | def expand_arg_tags(df: pd.DataFrame, ignore_arg_tags=False): 78 | """ 79 | expand arg_tags for all events in a single sentence 80 | args: 81 | df: the loaded dataframe 82 | ignore_arg_tags: if set to True, will not align all golden labeled arguments to their corresponding triggers, 83 | should be used when doing argument extraction based on predicted triggers rather than golden labeled triggers, 84 | since the number of predicted triggers may be less than total number of golden labeled triggers 85 | """ 86 | df['trg_tags'] = df['trg_tags'].apply(split_trg_tags) 87 | if ignore_arg_tags: 88 | df = df.explode('trg_tags') 89 | # df['trg_tags'].fillna(value=None, inplace=True) 90 | df['arg_tags'] = df.tokens.apply(lambda x: ['O']*len(x)) 91 | df = df[~(df['trg_tags'].isna())].reset_index() 92 | else: 93 | df['arg_tags'] = df[[f'event{i}_arg_tags' for i in range(1,7)]].values.tolist() 94 | df['event_triples'] = df[[f'event{i}_triple' for i in range(1,7)]].values.tolist() 95 | df['event_triples'] = df['event_triples'].apply(lambda x: [list(i) for i in x]) 96 | df = df.explode(['trg_tags', 'arg_tags', 'event_triples']).reset_index() 97 | df = df[~(df['trg_tags'].isna()) & ~(df['arg_tags']!=None).isna()].reset_index() 98 | return df 99 | 100 | def split_trg_tags(trg_tags): 101 | splited_tags = [] 102 | if trg_tags: 103 | for i in range(1,7): 104 | tags = None 105 | if f'B-T{i}' in trg_tags or f'I-T{i}' in trg_tags: 106 | tags = [tag if str(i) in tag else "O" for tag in trg_tags] 107 | splited_tags.append(tags) 108 | if splited_tags == [None]*6: # if do argument extraction based on predicted triggers, and there's no predicted trigger for this instance, should at least keep one trg_tags so that this instance is not lost 109 | splited_tags[0] = trg_tags 110 | return splited_tags 111 | 112 | def tags2text(tags, tokenized_src_text): 113 | """ 114 | convert BIO tags into a list of tokens (only convert B and I which represent event elements, skip O) 115 | args: 116 | tags: BIO tags of the example on token level 117 | tokenized_src_text: tokenized input text 118 | """ 119 | assert len(tags)==len(tokenized_src_text), "tags and tokenized input text should be of same length" 120 | result = [] 121 | cur_tkns = [] 122 | cur_tags = [] 123 | for i,tkn in enumerate(tokenized_src_text): 124 | if tags[i]=='O': continue 125 | if cur_tags==[] or tags[i].split('-')[1] == cur_tags[-1].split('-')[1]: 126 | cur_tags.append(tags[i]) 127 | cur_tkns.append(tkn) 128 | else: 129 | result.append(dataset_tokenizer.convert_tokens_to_string(cur_tkns).replace(" ", "")) 130 | cur_tkns = [tkn] 131 | cur_tags = 
[tags[i]] 132 | if cur_tkns: result.append(dataset_tokenizer.convert_tokens_to_string(cur_tkns).replace(" ", "")) 133 | return result 134 | 135 | def combine_trg_args(row): 136 | """ 137 | combine triggers (golden labeled or predicted) with predicted arguments to form triplets 138 | """ 139 | triple = row.pred_arguments 140 | if triple: 141 | if row.triggers: 142 | triple.insert(1, row.triggers[0]) 143 | else: 144 | triple.insert(1, "") 145 | if len(triple) < 3: triple.append("") 146 | return triple 147 | 148 | def agg_triples(row): 149 | """ 150 | aggregate triplets belonging to the same source text 151 | """ 152 | triples = [] 153 | for i in range(1,7): 154 | if len(row[f'event{i}_triple'])>0: 155 | triples.append(list(row[f'event{i}_triple'])) 156 | return triples -------------------------------------------------------------------------------- /mrc/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Question answering 18 | 19 | This folder contains several scripts that showcase how to fine-tune a 🤗 Transformers model on a question answering dataset, 20 | like SQuAD. 21 | 22 | ## Trainer-based scripts 23 | 24 | The [`run_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa.py), 25 | [`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search.py) and [`run_seq2seq_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py) leverage the 🤗 [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) for fine-tuning. 26 | 27 | ### Fine-tuning BERT on SQuAD1.0 28 | 29 | The [`run_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa.py) script 30 | allows to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture has a `ForQuestionAnswering` version in the library) on a question-answering dataset (such as SQuAD, or any other QA dataset available in the `datasets` library, or your own csv/jsonlines files) as long as they are structured the same way as SQuAD. You might need to tweak the data processing inside the script if your data is structured differently. 31 | 32 | **Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it 33 | uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in 34 | [this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version of the script which can be found [here](https://github.com/huggingface/transformers/tree/main/examples/legacy/question-answering). 35 | 36 | Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`. 37 | 38 | This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 39 | on a single tesla V100 16GB. 
40 | 41 | ```bash 42 | python run_qa.py \ 43 | --model_name_or_path bert-base-uncased \ 44 | --dataset_name squad \ 45 | --do_train \ 46 | --do_eval \ 47 | --per_device_train_batch_size 12 \ 48 | --learning_rate 3e-5 \ 49 | --num_train_epochs 2 \ 50 | --max_seq_length 384 \ 51 | --doc_stride 128 \ 52 | --output_dir /tmp/debug_squad/ 53 | ``` 54 | 55 | Training with the previously defined hyper-parameters yields the following results: 56 | 57 | ```bash 58 | f1 = 88.52 59 | exact_match = 81.22 60 | ``` 61 | 62 | ### Fine-tuning XLNet with beam search on SQuAD 63 | 64 | The [`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search.py) script is only meant to fine-tune XLNet, which is a special encoder-only Transformer model. The example code below fine-tunes XLNet on the SQuAD1.0 and SQuAD2.0 datasets. 65 | 66 | #### Command for SQuAD1.0: 67 | 68 | ```bash 69 | python run_qa_beam_search.py \ 70 | --model_name_or_path xlnet-large-cased \ 71 | --dataset_name squad \ 72 | --do_train \ 73 | --do_eval \ 74 | --learning_rate 3e-5 \ 75 | --num_train_epochs 2 \ 76 | --max_seq_length 384 \ 77 | --doc_stride 128 \ 78 | --output_dir ./wwm_cased_finetuned_squad/ \ 79 | --per_device_eval_batch_size=4 \ 80 | --per_device_train_batch_size=4 \ 81 | --save_steps 5000 82 | ``` 83 | 84 | #### Command for SQuAD2.0: 85 | 86 | ```bash 87 | export SQUAD_DIR=/path/to/SQUAD 88 | 89 | python run_qa_beam_search.py \ 90 | --model_name_or_path xlnet-large-cased \ 91 | --dataset_name squad_v2 \ 92 | --do_train \ 93 | --do_eval \ 94 | --version_2_with_negative \ 95 | --learning_rate 3e-5 \ 96 | --num_train_epochs 4 \ 97 | --max_seq_length 384 \ 98 | --doc_stride 128 \ 99 | --output_dir ./wwm_cased_finetuned_squad/ \ 100 | --per_device_eval_batch_size=2 \ 101 | --per_device_train_batch_size=2 \ 102 | --save_steps 5000 103 | ``` 104 | 105 | ### Fine-tuning T5 on SQuAD2.0 106 | 107 | The [`run_seq2seq_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These 108 | models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer. 109 | 110 | This example code fine-tunes T5 on the SQuAD2.0 dataset. 111 | 112 | ```bash 113 | python run_seq2seq_qa.py \ 114 | --model_name_or_path t5-small \ 115 | --dataset_name squad_v2 \ 116 | --context_column context \ 117 | --question_column question \ 118 | --answer_column answer \ 119 | --do_train \ 120 | --do_eval \ 121 | --per_device_train_batch_size 12 \ 122 | --learning_rate 3e-5 \ 123 | --num_train_epochs 2 \ 124 | --max_seq_length 384 \ 125 | --doc_stride 128 \ 126 | --output_dir /tmp/debug_seq2seq_squad/ 127 | ``` 128 | 129 | ## Accelerate-based scripts 130 | 131 | Based on the scripts `run_qa_no_trainer.py` and `run_qa_beam_search_no_trainer.py`. 132 | 133 | Like `run_qa.py` and `run_qa_beam_search.py`, these scripts allow you to fine-tune any of the models supported on a 134 | SQuAD or a similar dataset, the main difference is that this script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like. 
It offers less options than the script with `Trainer` (for instance you can easily change the options for the optimizer or the dataloaders directly in the script), but still run in a distributed setup, on TPU and supports mixed precision by leveraging the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. 135 | 136 | You can use the script normally after installing it: 137 | 138 | ```bash 139 | pip install accelerate 140 | ``` 141 | 142 | then 143 | 144 | ```bash 145 | python run_qa_no_trainer.py \ 146 | --model_name_or_path bert-base-uncased \ 147 | --dataset_name squad \ 148 | --max_seq_length 384 \ 149 | --doc_stride 128 \ 150 | --output_dir ~/tmp/debug_squad 151 | ``` 152 | 153 | You can then use your usual launchers to run in it in a distributed environment, but the easiest way is to run 154 | 155 | ```bash 156 | accelerate config 157 | ``` 158 | 159 | and reply to the questions asked. Then 160 | 161 | ```bash 162 | accelerate test 163 | ``` 164 | 165 | that will check everything is ready for training. Finally, you can launch training with 166 | 167 | ```bash 168 | accelerate launch run_qa_no_trainer.py \ 169 | --model_name_or_path bert-base-uncased \ 170 | --dataset_name squad \ 171 | --max_seq_length 384 \ 172 | --doc_stride 128 \ 173 | --output_dir ~/tmp/debug_squad 174 | ``` 175 | 176 | This command is the same and will work for: 177 | 178 | - a CPU-only setup 179 | - a setup with one GPU 180 | - a distributed training with several GPUs (single or multi node) 181 | - a training on TPUs 182 | 183 | Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it. 184 | -------------------------------------------------------------------------------- /mrc/utils_qa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Post-processing utilities for question answering. 17 | """ 18 | import collections 19 | import json 20 | import logging 21 | import os 22 | from typing import Optional, Tuple 23 | 24 | import numpy as np 25 | from tqdm.auto import tqdm 26 | 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def postprocess_qa_predictions( 32 | examples, 33 | features, 34 | predictions: Tuple[np.ndarray, np.ndarray], 35 | version_2_with_negative: bool = False, 36 | n_best_size: int = 20, 37 | max_answer_length: int = 30, 38 | null_score_diff_threshold: float = 0.0, 39 | output_dir: Optional[str] = None, 40 | prefix: Optional[str] = None, 41 | log_level: Optional[int] = logging.WARNING, 42 | ): 43 | """ 44 | Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the 45 | original contexts. This is the base postprocessing functions for models that only return start and end logits. 
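In this repository it is used by the SpanMRC baseline (`run_qa.py`) to turn those logits into
subject/object answer strings for the trigger-specific questions built in `mrc_task_config.py`.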
46 | 47 | Args: 48 | examples: The non-preprocessed dataset (see the main script for more information). 49 | features: The processed dataset (see the main script for more information). 50 | predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): 51 | The predictions of the model: two arrays containing the start logits and the end logits respectively. Its 52 | first dimension must match the number of elements of :obj:`features`. 53 | version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): 54 | Whether or not the underlying dataset contains examples with no answers. 55 | n_best_size (:obj:`int`, `optional`, defaults to 20): 56 | The total number of n-best predictions to generate when looking for an answer. 57 | max_answer_length (:obj:`int`, `optional`, defaults to 30): 58 | The maximum length of an answer that can be generated. This is needed because the start and end predictions 59 | are not conditioned on one another. 60 | null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): 61 | The threshold used to select the null answer: if the best answer has a score that is less than the score of 62 | the null answer minus this threshold, the null answer is selected for this example (note that the score of 63 | the null answer for an example giving several features is the minimum of the scores for the null answer on 64 | each feature: all features must be aligned on the fact they `want` to predict a null answer). 65 | 66 | Only useful when :obj:`version_2_with_negative` is :obj:`True`. 67 | output_dir (:obj:`str`, `optional`): 68 | If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if 69 | :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null 70 | answers, are saved in `output_dir`. 71 | prefix (:obj:`str`, `optional`): 72 | If provided, the dictionaries mentioned above are saved with `prefix` added to their names. 73 | log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): 74 | ``logging`` log level (e.g., ``logging.WARNING``) 75 | """ 76 | if len(predictions) != 2: 77 | raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") 78 | all_start_logits, all_end_logits = predictions 79 | 80 | if len(predictions[0]) != len(features): 81 | raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") 82 | 83 | # Build a map example to its corresponding features. 84 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])} 85 | features_per_example = collections.defaultdict(list) 86 | for i, feature in enumerate(features): 87 | features_per_example[example_id_to_index[feature["example_id"]]].append(i) 88 | 89 | # The dictionaries we have to fill. 90 | all_predictions = collections.OrderedDict() 91 | all_nbest_json = collections.OrderedDict() 92 | if version_2_with_negative: 93 | scores_diff_json = collections.OrderedDict() 94 | 95 | # Logging. 96 | logger.setLevel(log_level) 97 | logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") 98 | 99 | # Let's loop over all the examples! 100 | for example_index, example in enumerate(tqdm(examples)): 101 | # Those are the indices of the features associated to the current example. 102 | feature_indices = features_per_example[example_index] 103 | 104 | min_null_prediction = None 105 | prelim_predictions = [] 106 | 107 | # Looping through all the features associated to the current example. 
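# A long context may have been split into several overlapping features (controlled by
# `doc_stride`), so candidate spans are gathered from every feature and the null-answer
# score is tracked as the minimum feature null score.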
108 | for feature_index in feature_indices: 109 | # We grab the predictions of the model for this feature. 110 | start_logits = all_start_logits[feature_index] 111 | end_logits = all_end_logits[feature_index] 112 | # This is what will allow us to map some the positions in our logits to span of texts in the original 113 | # context. 114 | offset_mapping = features[feature_index]["offset_mapping"] 115 | # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context 116 | # available in the current feature. 117 | token_is_max_context = features[feature_index].get("token_is_max_context", None) 118 | 119 | # Update minimum null prediction. 120 | feature_null_score = start_logits[0] + end_logits[0] 121 | if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: 122 | min_null_prediction = { 123 | "offsets": (0, 0), 124 | "score": feature_null_score, 125 | "start_logit": start_logits[0], 126 | "end_logit": end_logits[0], 127 | } 128 | 129 | # Go through all possibilities for the `n_best_size` greater start and end logits. 130 | start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() 131 | end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() 132 | for start_index in start_indexes: 133 | for end_index in end_indexes: 134 | # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond 135 | # to part of the input_ids that are not in the context. 136 | if ( 137 | start_index >= len(offset_mapping) 138 | or end_index >= len(offset_mapping) 139 | or offset_mapping[start_index] is None 140 | or len(offset_mapping[start_index]) < 2 141 | or offset_mapping[end_index] is None 142 | or len(offset_mapping[end_index]) < 2 143 | ): 144 | continue 145 | # Don't consider answers with a length that is either < 0 or > max_answer_length. 146 | if end_index < start_index or end_index - start_index + 1 > max_answer_length: 147 | continue 148 | # Don't consider answer that don't have the maximum context available (if such information is 149 | # provided). 150 | if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): 151 | continue 152 | 153 | prelim_predictions.append( 154 | { 155 | "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), 156 | "score": start_logits[start_index] + end_logits[end_index], 157 | "start_logit": start_logits[start_index], 158 | "end_logit": end_logits[end_index], 159 | } 160 | ) 161 | if version_2_with_negative: 162 | # Add the minimum null prediction 163 | prelim_predictions.append(min_null_prediction) 164 | null_score = min_null_prediction["score"] 165 | 166 | # Only keep the best `n_best_size` predictions. 167 | predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] 168 | 169 | # Add back the minimum null prediction if it was removed because of its low score. 170 | if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): 171 | predictions.append(min_null_prediction) 172 | 173 | # Use the offsets to gather the answer text in the original context. 174 | context = example["context"] 175 | for pred in predictions: 176 | offsets = pred.pop("offsets") 177 | pred["text"] = context[offsets[0] : offsets[1]] 178 | 179 | # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid 180 | # failure. 
181 | if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): 182 | predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) 183 | 184 | # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using 185 | # the LogSumExp trick). 186 | scores = np.array([pred.pop("score") for pred in predictions]) 187 | exp_scores = np.exp(scores - np.max(scores)) 188 | probs = exp_scores / exp_scores.sum() 189 | 190 | # Include the probabilities in our predictions. 191 | for prob, pred in zip(probs, predictions): 192 | pred["probability"] = prob 193 | 194 | # Pick the best prediction. If the null answer is not possible, this is easy. 195 | if not version_2_with_negative: 196 | all_predictions[example["id"]] = predictions[0]["text"] 197 | else: 198 | # Otherwise we first need to find the best non-empty prediction. 199 | i = 0 200 | while predictions[i]["text"] == "": 201 | i += 1 202 | best_non_null_pred = predictions[i] 203 | 204 | # Then we compare to the null prediction using the threshold. 205 | score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] 206 | scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. 207 | if score_diff > null_score_diff_threshold: 208 | all_predictions[example["id"]] = "" 209 | else: 210 | all_predictions[example["id"]] = best_non_null_pred["text"] 211 | 212 | # Make `predictions` JSON-serializable by casting np.float back to float. 213 | all_nbest_json[example["id"]] = [ 214 | {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} 215 | for pred in predictions 216 | ] 217 | 218 | # If we have an output_dir, let's save all those dicts. 
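# `predictions.json` stores the final answer per example id, `nbest_predictions.json` the
# scored n-best candidates, and, when `version_2_with_negative` is set, `null_odds.json`
# the score margin used to choose between the best span and the empty answer.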
219 | if output_dir is not None: 220 | if not os.path.isdir(output_dir): 221 | raise EnvironmentError(f"{output_dir} is not a directory.") 222 | 223 | prediction_file = os.path.join( 224 | output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" 225 | ) 226 | nbest_file = os.path.join( 227 | output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" 228 | ) 229 | if version_2_with_negative: 230 | null_odds_file = os.path.join( 231 | output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" 232 | ) 233 | 234 | logger.info(f"Saving predictions to {prediction_file}.") 235 | with open(prediction_file, "w") as writer: 236 | writer.write(json.dumps(all_predictions, indent=4) + "\n") 237 | logger.info(f"Saving nbest_preds to {nbest_file}.") 238 | with open(nbest_file, "w") as writer: 239 | writer.write(json.dumps(all_nbest_json, indent=4) + "\n") 240 | if version_2_with_negative: 241 | logger.info(f"Saving null_odds to {null_odds_file}.") 242 | with open(null_odds_file, "w") as writer: 243 | writer.write(json.dumps(scores_diff_json, indent=4) + "\n") 244 | 245 | return all_predictions 246 | 247 | 248 | def postprocess_qa_predictions_with_beam_search( 249 | examples, 250 | features, 251 | predictions: Tuple[np.ndarray, np.ndarray], 252 | version_2_with_negative: bool = False, 253 | n_best_size: int = 20, 254 | max_answer_length: int = 30, 255 | start_n_top: int = 5, 256 | end_n_top: int = 5, 257 | output_dir: Optional[str] = None, 258 | prefix: Optional[str] = None, 259 | log_level: Optional[int] = logging.WARNING, 260 | ): 261 | """ 262 | Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the 263 | original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as 264 | cls token predictions. 265 | 266 | Args: 267 | examples: The non-preprocessed dataset (see the main script for more information). 268 | features: The processed dataset (see the main script for more information). 269 | predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): 270 | The predictions of the model: two arrays containing the start logits and the end logits respectively. Its 271 | first dimension must match the number of elements of :obj:`features`. 272 | version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): 273 | Whether or not the underlying dataset contains examples with no answers. 274 | n_best_size (:obj:`int`, `optional`, defaults to 20): 275 | The total number of n-best predictions to generate when looking for an answer. 276 | max_answer_length (:obj:`int`, `optional`, defaults to 30): 277 | The maximum length of an answer that can be generated. This is needed because the start and end predictions 278 | are not conditioned on one another. 279 | start_n_top (:obj:`int`, `optional`, defaults to 5): 280 | The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. 281 | end_n_top (:obj:`int`, `optional`, defaults to 5): 282 | The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. 283 | output_dir (:obj:`str`, `optional`): 284 | If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if 285 | :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null 286 | answers, are saved in `output_dir`. 
287 | prefix (:obj:`str`, `optional`): 288 | If provided, the dictionaries mentioned above are saved with `prefix` added to their names. 289 | log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): 290 | ``logging`` log level (e.g., ``logging.WARNING``) 291 | """ 292 | if len(predictions) != 5: 293 | raise ValueError("`predictions` should be a tuple with five elements.") 294 | start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions 295 | 296 | if len(predictions[0]) != len(features): 297 | raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") 298 | 299 | # Build a map example to its corresponding features. 300 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])} 301 | features_per_example = collections.defaultdict(list) 302 | for i, feature in enumerate(features): 303 | features_per_example[example_id_to_index[feature["example_id"]]].append(i) 304 | 305 | # The dictionaries we have to fill. 306 | all_predictions = collections.OrderedDict() 307 | all_nbest_json = collections.OrderedDict() 308 | scores_diff_json = collections.OrderedDict() if version_2_with_negative else None 309 | 310 | # Logging. 311 | logger.setLevel(log_level) 312 | logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") 313 | 314 | # Let's loop over all the examples! 315 | for example_index, example in enumerate(tqdm(examples)): 316 | # Those are the indices of the features associated to the current example. 317 | feature_indices = features_per_example[example_index] 318 | 319 | min_null_score = None 320 | prelim_predictions = [] 321 | 322 | # Looping through all the features associated to the current example. 323 | for feature_index in feature_indices: 324 | # We grab the predictions of the model for this feature. 325 | start_log_prob = start_top_log_probs[feature_index] 326 | start_indexes = start_top_index[feature_index] 327 | end_log_prob = end_top_log_probs[feature_index] 328 | end_indexes = end_top_index[feature_index] 329 | feature_null_score = cls_logits[feature_index] 330 | # This is what will allow us to map some the positions in our logits to span of texts in the original 331 | # context. 332 | offset_mapping = features[feature_index]["offset_mapping"] 333 | # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context 334 | # available in the current feature. 335 | token_is_max_context = features[feature_index].get("token_is_max_context", None) 336 | 337 | # Update minimum null prediction 338 | if min_null_score is None or feature_null_score < min_null_score: 339 | min_null_score = feature_null_score 340 | 341 | # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 342 | for i in range(start_n_top): 343 | for j in range(end_n_top): 344 | start_index = int(start_indexes[i]) 345 | j_index = i * end_n_top + j 346 | end_index = int(end_indexes[j_index]) 347 | # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the 348 | # p_mask but let's not take any risk) 349 | if ( 350 | start_index >= len(offset_mapping) 351 | or end_index >= len(offset_mapping) 352 | or offset_mapping[start_index] is None 353 | or offset_mapping[end_index] is None 354 | ): 355 | continue 356 | # Don't consider answers with a length negative or > max_answer_length. 
357 | if end_index < start_index or end_index - start_index + 1 > max_answer_length: 358 | continue 359 | # Don't consider answer that don't have the maximum context available (if such information is 360 | # provided). 361 | if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): 362 | continue 363 | prelim_predictions.append( 364 | { 365 | "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), 366 | "score": start_log_prob[i] + end_log_prob[j_index], 367 | "start_log_prob": start_log_prob[i], 368 | "end_log_prob": end_log_prob[j_index], 369 | } 370 | ) 371 | 372 | # Only keep the best `n_best_size` predictions. 373 | predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] 374 | 375 | # Use the offsets to gather the answer text in the original context. 376 | context = example["context"] 377 | for pred in predictions: 378 | offsets = pred.pop("offsets") 379 | pred["text"] = context[offsets[0] : offsets[1]] 380 | 381 | # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid 382 | # failure. 383 | if len(predictions) == 0: 384 | predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) 385 | 386 | # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using 387 | # the LogSumExp trick). 388 | scores = np.array([pred.pop("score") for pred in predictions]) 389 | exp_scores = np.exp(scores - np.max(scores)) 390 | probs = exp_scores / exp_scores.sum() 391 | 392 | # Include the probabilities in our predictions. 393 | for prob, pred in zip(probs, predictions): 394 | pred["probability"] = prob 395 | 396 | # Pick the best prediction and set the probability for the null answer. 397 | all_predictions[example["id"]] = predictions[0]["text"] 398 | if version_2_with_negative: 399 | scores_diff_json[example["id"]] = float(min_null_score) 400 | 401 | # Make `predictions` JSON-serializable by casting np.float back to float. 402 | all_nbest_json[example["id"]] = [ 403 | {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} 404 | for pred in predictions 405 | ] 406 | 407 | # If we have an output_dir, let's save all those dicts. 
408 | if output_dir is not None: 409 | if not os.path.isdir(output_dir): 410 | raise EnvironmentError(f"{output_dir} is not a directory.") 411 | 412 | prediction_file = os.path.join( 413 | output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" 414 | ) 415 | nbest_file = os.path.join( 416 | output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" 417 | ) 418 | if version_2_with_negative: 419 | null_odds_file = os.path.join( 420 | output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" 421 | ) 422 | 423 | logger.info(f"Saving predictions to {prediction_file}.") 424 | with open(prediction_file, "w") as writer: 425 | writer.write(json.dumps(all_predictions, indent=4) + "\n") 426 | logger.info(f"Saving nbest_preds to {nbest_file}.") 427 | with open(nbest_file, "w") as writer: 428 | writer.write(json.dumps(all_nbest_json, indent=4) + "\n") 429 | if version_2_with_negative: 430 | logger.info(f"Saving null_odds to {null_odds_file}.") 431 | with open(null_odds_file, "w") as writer: 432 | writer.write(json.dumps(scores_diff_json, indent=4) + "\n") 433 | 434 | return all_predictions, scores_diff_json 435 | -------------------------------------------------------------------------------- /seqtag/run_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | # Copyright 2020 The HuggingFace Team All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """ 18 | Fine-tuning the library models for token classification. 19 | """ 20 | # You can also adapt this script on your own token classification task and datasets. Pointers for this are left as 21 | # comments. 
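# Input note (repo-specific; the exact schema lives in ner_task_config.py): the tagging CSVs read by this script are expected to provide at least a `tokens` column, `trg_tags`/`arg_tags` label columns, and per-event columns such as `event1_trigger`, which the prediction-saving code further down relies on.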
22 | 23 | import logging 24 | import os 25 | import sys 26 | from pathlib import Path 27 | from dataclasses import dataclass, field 28 | from typing import Optional 29 | import pdb 30 | import numpy as np 31 | from tqdm import tqdm 32 | import datasets  # imported explicitly: datasets.utils.logging is used in main() 33 | 34 | from transformers import ( 35 | AutoConfig, 36 | AutoModelForTokenClassification, 37 | AutoTokenizer, 38 | DataCollatorForTokenClassification, 39 | HfArgumentParser, 40 | PretrainedConfig, 41 | PreTrainedTokenizerFast, 42 | Trainer, 43 | TrainingArguments, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint 47 | from transformers.utils import check_min_version 48 | from transformers.utils.versions import require_version 49 | import transformers 50 | import evaluate 51 | 52 | from ner_task_config import * 53 | 54 | 55 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") 56 | 57 | logger = logging.getLogger(__name__) 58 | 59 | 60 | @dataclass 61 | class TaskArguments: 62 | """ 63 | Arguments pertaining to which task to perform (e.g., trigger extraction, argument extraction) 64 | """ 65 | task_name: Optional[str] = field( 66 | default="trg", 67 | metadata={"help": "one of {}".format(list(Task2LabelCol.keys()))} 68 | ) 69 | pred_trg_file: Optional[str] = field( 70 | default=None, 71 | metadata={"help": "trigger prediction file; if specified and task_name is 'arg', argument extraction is based on the predicted triggers rather than the gold-labeled triggers"} 72 | ) 73 | pred_trg_tag_col: Optional[str] = field( 74 | default='pred_trg_tags', 75 | metadata={"help": "column name of the predicted trigger tags in pred_trg_file"} 76 | ) 77 | output_filename: Optional[str] = field( 78 | default=None, 79 | metadata={"help": "file name to use for the saved model predictions"} 80 | ) 81 | def __post_init__(self): 82 | self.task_name = self.task_name.lower() 83 | 84 | @dataclass 85 | class ModelArguments: 86 | """ 87 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 88 | """ 89 | 90 | model_name_or_path: str = field( 91 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 92 | ) 93 | config_name: Optional[str] = field( 94 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 95 | ) 96 | tokenizer_name: Optional[str] = field( 97 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 98 | ) 99 | cache_dir: Optional[str] = field( 100 | default=None, 101 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 102 | ) 103 | model_revision: str = field( 104 | default="main", 105 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 106 | ) 107 | use_auth_token: bool = field( 108 | default=False, 109 | metadata={ 110 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 111 | "with private models)." 112 | }, 113 | ) 114 | 115 | 116 | @dataclass 117 | class DataTrainingArguments: 118 | """ 119 | Arguments pertaining to what data we are going to input our model for training and eval.
120 | """ 121 | dataset_name: Optional[str] = field( 122 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 123 | ) 124 | dataset_config_name: Optional[str] = field( 125 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 126 | ) 127 | train_file: Optional[str] = field( 128 | default=None, metadata={"help": "The input training data file (a csv or JSON file)."} 129 | ) 130 | validation_file: Optional[str] = field( 131 | default=None, 132 | metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, 133 | ) 134 | test_file: Optional[str] = field( 135 | default=None, 136 | metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, 137 | ) 138 | text_column_name: Optional[str] = field( 139 | default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."} 140 | ) 141 | label_column_name: Optional[str] = field( 142 | default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."} 143 | ) 144 | overwrite_cache: bool = field( 145 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 146 | ) 147 | preprocessing_num_workers: Optional[int] = field( 148 | default=None, 149 | metadata={"help": "The number of processes to use for the preprocessing."}, 150 | ) 151 | max_seq_length: int = field( 152 | default=None, 153 | metadata={ 154 | "help": "The maximum total input sequence length after tokenization. If set, sequences longer " 155 | "than this will be truncated, sequences shorter will be padded." 156 | }, 157 | ) 158 | pad_to_max_length: bool = field( 159 | default=False, 160 | metadata={ 161 | "help": "Whether to pad all samples to model maximum sentence length. " 162 | "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " 163 | "efficient on GPU but very bad for TPU." 164 | }, 165 | ) 166 | max_train_samples: Optional[int] = field( 167 | default=None, 168 | metadata={ 169 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 170 | "value if set." 171 | }, 172 | ) 173 | max_eval_samples: Optional[int] = field( 174 | default=None, 175 | metadata={ 176 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 177 | "value if set." 178 | }, 179 | ) 180 | max_predict_samples: Optional[int] = field( 181 | default=None, 182 | metadata={ 183 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 184 | "value if set." 185 | }, 186 | ) 187 | label_all_tokens: bool = field( 188 | default=False, 189 | metadata={ 190 | "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " 191 | "one (in which case the other tokens will have a padding index)." 
192 | }, 193 | ) 194 | return_entity_level_metrics: bool = field( 195 | default=False, 196 | metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, 197 | ) 198 | 199 | def __post_init__(self): 200 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 201 | raise ValueError("Need either a dataset name or a training/validation file.") 202 | else: 203 | if self.train_file is not None: 204 | extension = self.train_file.split(".")[-1] 205 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 206 | if self.validation_file is not None: 207 | extension = self.validation_file.split(".")[-1] 208 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 209 | 210 | 211 | def main(): 212 | # See all possible arguments in src/transformers/training_args.py 213 | # or by passing the --help flag to this script. 214 | # We now keep distinct sets of args, for a cleaner separation of concerns. 215 | 216 | parser = HfArgumentParser((TaskArguments, ModelArguments, DataTrainingArguments, TrainingArguments)) 217 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 218 | # If we pass only one argument to the script and it's the path to a json file, 219 | # let's parse it to get our arguments. 220 | task_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 221 | else: 222 | task_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses() 223 | 224 | task_args.just_infer = (not training_args.do_train and training_args.do_predict) 225 | # if task_args.just_infer: 226 | # training_args.do_train, training_args.do_eval, training_args.do_predict = False, False, True 227 | 228 | # Setup logging 229 | logging.basicConfig( 230 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 231 | datefmt="%m/%d/%Y %H:%M:%S", 232 | handlers=[logging.StreamHandler(sys.stdout)], 233 | ) 234 | 235 | log_level = training_args.get_process_log_level() 236 | logger.setLevel(log_level) 237 | datasets.utils.logging.set_verbosity(log_level) 238 | transformers.utils.logging.set_verbosity(log_level) 239 | transformers.utils.logging.enable_default_handler() 240 | transformers.utils.logging.enable_explicit_format() 241 | 242 | # Log on each process the small summary: 243 | logger.warning( 244 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 245 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 246 | ) 247 | logger.info(f"Training/evaluation parameters {training_args}") 248 | 249 | # Detecting last checkpoint. 250 | last_checkpoint = None 251 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 252 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 253 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 254 | raise ValueError( 255 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 256 | "Use --overwrite_output_dir to overcome." 257 | ) 258 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 259 | logger.info( 260 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 261 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
262 | ) 263 | 264 | # Set seed before initializing model. 265 | set_seed(training_args.seed) 266 | 267 | # modified: loading and processing data 268 | logger.info("****** loading and processing data ******") 269 | raw_datasets = load_my_datasets_for_ner(data_args, task_args) 270 | 271 | if training_args.do_train: 272 | column_names = raw_datasets["train"].column_names 273 | # features = raw_datasets["train"].features 274 | else: 275 | column_names = raw_datasets["test"].column_names 276 | # features = raw_datasets["validation"].features 277 | 278 | if data_args.text_column_name is not None: 279 | text_column_name = data_args.text_column_name 280 | elif "tokens" in column_names: 281 | text_column_name = "tokens" 282 | else: 283 | text_column_name = column_names[0] 284 | 285 | if data_args.label_column_name is not None: 286 | label_column_name = data_args.label_column_name 287 | else: 288 | label_column_name = Task2LabelCol[task_args.task_name] 289 | 290 | 291 | # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the 292 | # unique labels. 293 | def get_label_list(labels): 294 | unique_labels = set() 295 | for label in labels: 296 | unique_labels = unique_labels | set(label) 297 | label_list = list(unique_labels) 298 | label_list.sort() 299 | return label_list 300 | 301 | # If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere. 302 | # Otherwise, we have to get the list of labels manually. 303 | # labels_are_int = isinstance(features[label_column_name].feature, ClassLabel) 304 | labels_are_int = False 305 | if labels_are_int: 306 | label_list = Task2Features[task_args.task_name][label_column_name].feature.names 307 | label_to_id = {i: i for i in range(len(label_list))} 308 | else: 309 | # label_list = get_label_list(raw_datasets["train"][label_column_name]) 310 | label_list = Task2Features[task_args.task_name][label_column_name].feature.names 311 | label_to_id = {l: i for i, l in enumerate(label_list)} 312 | 313 | num_labels = len(label_list) 314 | 315 | # Load pretrained model and tokenizer 316 | # 317 | # Distributed training: 318 | # The .from_pretrained methods guarantee that only one local process can concurrently 319 | # download model & vocab. 
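# (Descriptive note: `num_labels` below is the size of the tag inventory defined for the chosen task in ner_task_config.py (`Task2Features`), so the token-classification head is sized for that task's B-/I-/O label set.)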
320 | config = AutoConfig.from_pretrained( 321 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 322 | num_labels=num_labels, 323 | finetuning_task=task_args.task_name, 324 | cache_dir=model_args.cache_dir, 325 | revision=model_args.model_revision, 326 | use_auth_token=True if model_args.use_auth_token else None, 327 | ) 328 | 329 | tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path 330 | if config.model_type in {"gpt2", "roberta"}: 331 | tokenizer = AutoTokenizer.from_pretrained( 332 | tokenizer_name_or_path, 333 | cache_dir=model_args.cache_dir, 334 | use_fast=True, 335 | revision=model_args.model_revision, 336 | use_auth_token=True if model_args.use_auth_token else None, 337 | add_prefix_space=True, 338 | ) 339 | else: 340 | tokenizer = AutoTokenizer.from_pretrained( 341 | tokenizer_name_or_path, 342 | cache_dir=model_args.cache_dir, 343 | use_fast=True, 344 | revision=model_args.model_revision, 345 | use_auth_token=True if model_args.use_auth_token else None, 346 | ) 347 | 348 | model = AutoModelForTokenClassification.from_pretrained( 349 | model_args.model_name_or_path, 350 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 351 | config=config, 352 | cache_dir=model_args.cache_dir, 353 | revision=model_args.model_revision, 354 | use_auth_token=True if model_args.use_auth_token else None, 355 | ) 356 | 357 | # Tokenizer check: this script requires a fast tokenizer. 358 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 359 | raise ValueError( 360 | "This example script only works for models that have a fast tokenizer. Check out the big table of models " 361 | "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " 362 | "requirement." 363 | ) 364 | 365 | # Model has labels -> use them. 366 | if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: 367 | if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)): 368 | # Reorganize `label_list` to match the ordering of the model. 369 | if labels_are_int: 370 | label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)} 371 | label_list = [model.config.id2label[i] for i in range(num_labels)] 372 | else: 373 | label_list = [model.config.id2label[i] for i in range(num_labels)] 374 | label_to_id = {l: i for i, l in enumerate(label_list)} 375 | else: 376 | logger.warning( 377 | "Your model seems to have been trained with labels, but they don't match the dataset: " 378 | f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels: {list(sorted(label_list))}." 379 | "\nIgnoring the model labels as a result." 380 | ) 381 | 382 | # Set the correspondences label/ID inside the model config 383 | model.config.label2id = {l: i for i, l in enumerate(label_list)} 384 | model.config.id2label = {i: l for i, l in enumerate(label_list)} 385 | 386 | # Map that sends B-Xxx label to its I-Xxx counterpart 387 | b_to_i_label = [] 388 | for idx, label in enumerate(label_list): 389 | if label.startswith("B-") and label.replace("B-", "I-") in label_list: 390 | b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) 391 | else: 392 | b_to_i_label.append(idx) 393 | 394 | # Preprocessing the dataset 395 | # Padding strategy 396 | padding = "max_length" if data_args.pad_to_max_length else False 397 | 398 | # Tokenize all texts and align the labels with them.
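# Illustrative example of the alignment below (hypothetical tags): if a word labeled B-XXX is split into two sub-tokens, only the first sub-token keeps B-XXX; the second one is labeled -100 (ignored by the loss) unless --label_all_tokens is passed, in which case it receives the I-XXX counterpart.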
399 | def tokenize_and_align_labels(examples): 400 | tokenized_inputs = tokenizer( 401 | examples[text_column_name], 402 | padding=padding, 403 | truncation=True, 404 | max_length=data_args.max_seq_length, 405 | # We use this argument because the texts in our dataset are lists of words (with a label for each word). 406 | is_split_into_words=True, 407 | ) 408 | labels = [] 409 | 410 | for i, label in enumerate(examples[label_column_name]): 411 | word_ids = tokenized_inputs.word_ids(batch_index=i) 412 | previous_word_idx = None 413 | label_ids = [] 414 | for word_idx in word_ids: 415 | # Special tokens have a word id that is None. We set the label to -100 so they are automatically 416 | # ignored in the loss function. 417 | if word_idx is None: 418 | label_ids.append(-100) 419 | # We set the label for the first token of each word. 420 | elif word_idx != previous_word_idx: 421 | label_ids.append(label_to_id[label[word_idx]]) 422 | 423 | # For the other tokens in a word, we set the label to either the current label or -100, depending on 424 | # the label_all_tokens flag. 425 | else: 426 | if data_args.label_all_tokens: 427 | label_ids.append(b_to_i_label[label_to_id[label[word_idx]]]) 428 | else: 429 | label_ids.append(-100) 430 | previous_word_idx = word_idx 431 | 432 | labels.append(label_ids) 433 | 434 | # set token_type_ids of trigger tokens to 1, used in argument extraction task 435 | if task_args.task_name == 'arg': 436 | trigger_word_ids = [] 437 | for idx, tag in enumerate(examples['trg_tags'][i]): 438 | if tag != 'O': 439 | trigger_word_ids.append(idx) 440 | for idx, _ in enumerate(tokenized_inputs['token_type_ids'][i]): 441 | if word_ids[idx] in trigger_word_ids: 442 | tokenized_inputs['token_type_ids'][i][idx] = 1 443 | tokenized_inputs["labels"] = labels 444 | return tokenized_inputs 445 | 446 | if training_args.do_train: 447 | if "train" not in raw_datasets: 448 | raise ValueError("--do_train requires a train dataset") 449 | train_dataset = raw_datasets["train"] 450 | if data_args.max_train_samples is not None: 451 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 452 | train_dataset = train_dataset.select(range(max_train_samples)) 453 | with training_args.main_process_first(desc="train dataset map pre-processing"): 454 | train_dataset = train_dataset.map( 455 | tokenize_and_align_labels, 456 | batched=True, 457 | num_proc=data_args.preprocessing_num_workers, 458 | load_from_cache_file=not data_args.overwrite_cache, 459 | desc="Running tokenizer on train dataset", 460 | ) 461 | 462 | if training_args.do_eval: 463 | if "validation" not in raw_datasets: 464 | raise ValueError("--do_eval requires a validation dataset") 465 | eval_dataset = raw_datasets["validation"] 466 | if data_args.max_eval_samples is not None: 467 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 468 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 469 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 470 | eval_dataset = eval_dataset.map( 471 | tokenize_and_align_labels, 472 | batched=True, 473 | num_proc=data_args.preprocessing_num_workers, 474 | load_from_cache_file=not data_args.overwrite_cache, 475 | desc="Running tokenizer on validation dataset", 476 | ) 477 | 478 | if training_args.do_predict: 479 | if "test" not in raw_datasets: 480 | raise ValueError("--do_predict requires a test dataset") 481 | predict_dataset = raw_datasets["test"] 482 | if data_args.max_predict_samples is not None: 483 | 
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 484 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 485 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 486 | predict_dataset = predict_dataset.map( 487 | tokenize_and_align_labels, 488 | batched=True, 489 | num_proc=data_args.preprocessing_num_workers, 490 | load_from_cache_file=not data_args.overwrite_cache, 491 | desc="Running tokenizer on prediction dataset", 492 | ) 493 | # Data collator 494 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 495 | 496 | # Metrics 497 | logger.info("*** loading metric ***") 498 | metric = evaluate.load('seqeval') 499 | 500 | 501 | def compute_metrics(p): 502 | predictions, labels = p 503 | predictions = np.argmax(predictions, axis=2) 504 | 505 | # Remove ignored index (special tokens) 506 | true_predictions = [ 507 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 508 | for prediction, label in zip(predictions, labels) 509 | ] 510 | true_labels = [ 511 | [label_list[l] for (p, l) in zip(prediction, label) if l != -100] 512 | for prediction, label in zip(predictions, labels) 513 | ] 514 | 515 | results = metric.compute(predictions=true_predictions, references=true_labels) 516 | if data_args.return_entity_level_metrics: 517 | # Unpack nested dictionaries 518 | final_results = {} 519 | for key, value in results.items(): 520 | if isinstance(value, dict): 521 | for n, v in value.items(): 522 | final_results[f"{key}_{n}"] = v 523 | else: 524 | final_results[key] = value 525 | return final_results 526 | else: 527 | return { 528 | "precision": results["overall_precision"], 529 | "recall": results["overall_recall"], 530 | "f1": results["overall_f1"], 531 | "accuracy": results["overall_accuracy"], 532 | } 533 | 534 | # Initialize our Trainer 535 | logger.info("*** initializing trainer ***") 536 | trainer = Trainer( 537 | model=model, 538 | args=training_args, 539 | train_dataset=train_dataset if training_args.do_train else None, 540 | eval_dataset=eval_dataset if training_args.do_eval else None, 541 | tokenizer=tokenizer, 542 | data_collator=data_collator, 543 | # compute_metrics=None if (task_args.task_name=='arg' and task_args.pred_trg_file) or task_args.just_infer else compute_metrics 544 | compute_metrics=compute_metrics, 545 | ) 546 | 547 | # Training 548 | if training_args.do_train: 549 | checkpoint = None 550 | if training_args.resume_from_checkpoint is not None: 551 | checkpoint = training_args.resume_from_checkpoint 552 | elif last_checkpoint is not None: 553 | checkpoint = last_checkpoint 554 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 555 | metrics = train_result.metrics 556 | trainer.save_model() # Saves the tokenizer too for easy upload 557 | 558 | max_train_samples = ( 559 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 560 | ) 561 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 562 | 563 | trainer.log_metrics("train", metrics) 564 | trainer.save_metrics("train", metrics) 565 | trainer.save_state() 566 | 567 | # Evaluation 568 | if training_args.do_eval: 569 | logger.info("*** Evaluate ***") 570 | 571 | metrics = trainer.evaluate() 572 | 573 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 574 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 575 | 576 
| trainer.log_metrics("eval", metrics) 577 | trainer.save_metrics("eval", metrics) 578 | 579 | # Predict 580 | if training_args.do_predict: 581 | logger.info("*** Predict ***") 582 | 583 | predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") 584 | predictions = np.argmax(predictions, axis=2) 585 | 586 | # Remove ignored index (special tokens) 587 | true_predictions = [ 588 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 589 | for prediction, label in zip(predictions, labels) 590 | ] 591 | 592 | trainer.log_metrics("predict", metrics) 593 | trainer.save_metrics("predict", metrics) 594 | 595 | # Save predictions 596 | if task_args.task_name == 'trg': 597 | pred_df = predict_dataset.to_pandas() 598 | # pred_df['triggers'] = pred_df[['trg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.trg_tags, tokens=x.tokens), axis=1) 599 | pred_df['triggers'] = pred_df[[f'event{i}_trigger' for i in range(1,7)]].apply(lambda row: [item for item in row if item!=None], axis=1) 600 | pred_df['pred_trg_tags'] = true_predictions 601 | pred_df['pred_triggers'] = pred_df[['pred_trg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.pred_trg_tags, tokenized_src_text=x.tokens), axis=1) 602 | output_predictions_file = os.path.join(training_args.output_dir, task_args.output_filename) if task_args.output_filename else os.path.join(training_args.output_dir, "trg_predictions.csv") 603 | pred_df[['title_id', 'title', 'triggers', 'pred_triggers', 'trg_tags', 'pred_trg_tags']].to_csv(output_predictions_file, index=False) 604 | elif task_args.task_name == 'arg': 605 | pred_df = predict_dataset.to_pandas() 606 | pred_df['pred_arg_tags'] = true_predictions 607 | pred_df['pred_arguments'] = pred_df[['pred_arg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.pred_arg_tags, tokenized_src_text=x.tokens), axis=1) 608 | if task_args.pred_trg_file: # if use predicted triggers 609 | pred_df['triggers'] = pred_df[['trg_tags', 'tokens']].apply(lambda x: tags2text(tags=x.trg_tags, tokenized_src_text=x.tokens), axis=1) 610 | pred_df['event_triples'] = pred_df[[f'event{i}_triple' for i in range(1,7)]].apply(lambda row: [list(item) for item in row if list(item)!=[]], axis=1) 611 | # pred_df['event_triples'] = pred_df['event_triples'].apply(list) 612 | pred_df['pred_event_triples'] = pred_df.apply(combine_trg_args, axis=1) 613 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='index', axis=0).reset_index() 614 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: x[0]) 615 | # if not task_args.just_infer: pred_df['event_triples'] = pred_df.apply(agg_triples, axis=1) 616 | # if not task_args.just_infer: pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x:x[0]) 617 | output_predictions_file = os.path.join(training_args.output_dir, task_args.output_filename) if task_args.output_filename else os.path.join(training_args.output_dir, "pipeline_predictions.csv") 618 | pred_df[['title_id', 'title', 'event_triples', 'pred_event_triples']].to_csv(output_predictions_file, index=False) 619 | else: # if use golden triggers 620 | # pred_df['arguments'] = pred_df.apply(lambda x: tags2text(tags=x.arg_tags, tokens=x.tokens), axis=1) 621 | pred_df['triggers'] = pred_df[[f'event{i}_trigger' for i in range(1,7)]].apply(lambda row: [item for item in row if item!=None], axis=1) 622 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='index', axis=0).reset_index() 623 | pred_df['triggers'] = 
pred_df['triggers'].apply(lambda x: x[0]) 624 | pred_df['pred_event_triples'] = "" 625 | for idx, row in tqdm(pred_df.iterrows(), total=len(pred_df)): 626 | pred_triples = [] 627 | for i, arg in enumerate(row.pred_arguments): 628 | trp = arg 629 | trp.insert(1, row.triggers[i]) 630 | pred_triples.append(trp) 631 | pred_df.at[idx, "pred_event_triples"] = pred_triples 632 | pred_df['pred_event_triples'] = pred_df['pred_event_triples'].apply(lambda x: [list(i) for i in x]) 633 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: [i.tolist() for i in x]) 634 | 635 | output_predictions_file = os.path.join(training_args.output_dir, task_args.output_filename) if task_args.output_filename else os.path.join(training_args.output_dir, "arg_predictions.csv") 636 | pred_df[['title_id', 'title', 'event_triples', 'pred_event_triples', 'arg_tags', 'pred_arg_tags']].to_csv(output_predictions_file, index=False) 637 | logger.info("save prediction output to %s" % output_predictions_file) 638 | 639 | 640 | 641 | kwargs = {"finetuned_from": Path(model_args.model_name_or_path).name, "tasks": "token-classification"} 642 | if data_args.dataset_name is not None: 643 | kwargs["dataset_tags"] = data_args.dataset_name 644 | if data_args.dataset_config_name is not None: 645 | kwargs["dataset_args"] = data_args.dataset_config_name 646 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 647 | else: 648 | kwargs["dataset"] = data_args.dataset_name 649 | 650 | if training_args.push_to_hub: 651 | trainer.push_to_hub(**kwargs) 652 | else: 653 | trainer.create_model_card(**kwargs) 654 | 655 | if __name__ == "__main__": 656 | main() 657 | -------------------------------------------------------------------------------- /mrc/run_qa_beam_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning XLNet for question answering with beam search using a slightly adapted version of the 🤗 Trainer. 18 | """ 19 | # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
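# Note: this script is tied to XLNet. XLNetForQuestionAnswering returns top-k start/end log probabilities and indices plus a CLS logit, which is why its predictions go through `postprocess_qa_predictions_with_beam_search` rather than the standard start/end-logit post-processing.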
20 | 21 | import logging 22 | import os 23 | import sys 24 | from dataclasses import dataclass, field 25 | from typing import Optional 26 | 27 | import datasets 28 | from datasets import load_dataset, load_metric 29 | 30 | import transformers 31 | from trainer_qa import QuestionAnsweringTrainer 32 | from transformers import ( 33 | DataCollatorWithPadding, 34 | EvalPrediction, 35 | HfArgumentParser, 36 | TrainingArguments, 37 | XLNetConfig, 38 | XLNetForQuestionAnswering, 39 | XLNetTokenizerFast, 40 | default_data_collator, 41 | set_seed, 42 | ) 43 | from transformers.trainer_utils import get_last_checkpoint 44 | from transformers.utils import check_min_version 45 | from transformers.utils.versions import require_version 46 | from utils_qa import postprocess_qa_predictions_with_beam_search 47 | 48 | 49 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 50 | check_min_version("4.19.0.dev0") 51 | 52 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | @dataclass 58 | class ModelArguments: 59 | """ 60 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 61 | """ 62 | 63 | model_name_or_path: str = field( 64 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 65 | ) 66 | config_name: Optional[str] = field( 67 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 68 | ) 69 | tokenizer_name: Optional[str] = field( 70 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 71 | ) 72 | cache_dir: Optional[str] = field( 73 | default=None, 74 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 75 | ) 76 | model_revision: str = field( 77 | default="main", 78 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 79 | ) 80 | use_auth_token: bool = field( 81 | default=False, 82 | metadata={ 83 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 84 | "with private models)." 85 | }, 86 | ) 87 | 88 | 89 | @dataclass 90 | class DataTrainingArguments: 91 | """ 92 | Arguments pertaining to what data we are going to input our model for training and eval. 
93 | """ 94 | 95 | dataset_name: Optional[str] = field( 96 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 97 | ) 98 | dataset_config_name: Optional[str] = field( 99 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 100 | ) 101 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 102 | validation_file: Optional[str] = field( 103 | default=None, 104 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 105 | ) 106 | test_file: Optional[str] = field( 107 | default=None, 108 | metadata={"help": "An optional input test data file to test the perplexity on (a text file)."}, 109 | ) 110 | overwrite_cache: bool = field( 111 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 112 | ) 113 | preprocessing_num_workers: Optional[int] = field( 114 | default=None, 115 | metadata={"help": "The number of processes to use for the preprocessing."}, 116 | ) 117 | max_seq_length: int = field( 118 | default=384, 119 | metadata={ 120 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 121 | "than this will be truncated, sequences shorter will be padded." 122 | }, 123 | ) 124 | pad_to_max_length: bool = field( 125 | default=True, 126 | metadata={ 127 | "help": "Whether to pad all samples to `max_seq_length`. " 128 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 129 | "be faster on GPU but will be slower on TPU)." 130 | }, 131 | ) 132 | max_train_samples: Optional[int] = field( 133 | default=None, 134 | metadata={ 135 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 136 | "value if set." 137 | }, 138 | ) 139 | max_eval_samples: Optional[int] = field( 140 | default=None, 141 | metadata={ 142 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 143 | "value if set." 144 | }, 145 | ) 146 | max_predict_samples: Optional[int] = field( 147 | default=None, 148 | metadata={ 149 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 150 | "value if set." 151 | }, 152 | ) 153 | version_2_with_negative: bool = field( 154 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 155 | ) 156 | null_score_diff_threshold: float = field( 157 | default=0.0, 158 | metadata={ 159 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 160 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 161 | "Only useful when `version_2_with_negative=True`." 162 | }, 163 | ) 164 | doc_stride: int = field( 165 | default=128, 166 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 167 | ) 168 | n_best_size: int = field( 169 | default=20, 170 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 171 | ) 172 | max_answer_length: int = field( 173 | default=30, 174 | metadata={ 175 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 176 | "and end predictions are not conditioned on one another." 
177 | }, 178 | ) 179 | 180 | def __post_init__(self): 181 | if ( 182 | self.dataset_name is None 183 | and self.train_file is None 184 | and self.validation_file is None 185 | and self.test_file is None 186 | ): 187 | raise ValueError("Need either a dataset name or a training/validation/test file.") 188 | else: 189 | if self.train_file is not None: 190 | extension = self.train_file.split(".")[-1] 191 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 192 | if self.validation_file is not None: 193 | extension = self.validation_file.split(".")[-1] 194 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 195 | if self.test_file is not None: 196 | extension = self.test_file.split(".")[-1] 197 | assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 198 | 199 | 200 | def main(): 201 | # See all possible arguments in src/transformers/training_args.py 202 | # or by passing the --help flag to this script. 203 | # We now keep distinct sets of args, for a cleaner separation of concerns. 204 | 205 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 206 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 207 | # If we pass only one argument to the script and it's the path to a json file, 208 | # let's parse it to get our arguments. 209 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 210 | else: 211 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 212 | 213 | # Setup logging 214 | logging.basicConfig( 215 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 216 | datefmt="%m/%d/%Y %H:%M:%S", 217 | handlers=[logging.StreamHandler(sys.stdout)], 218 | ) 219 | log_level = training_args.get_process_log_level() 220 | logger.setLevel(log_level) 221 | datasets.utils.logging.set_verbosity(log_level) 222 | transformers.utils.logging.set_verbosity(log_level) 223 | transformers.utils.logging.enable_default_handler() 224 | transformers.utils.logging.enable_explicit_format() 225 | 226 | # Log on each process the small summary: 227 | logger.warning( 228 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 229 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 230 | ) 231 | logger.info(f"Training/evaluation parameters {training_args}") 232 | 233 | # Detecting last checkpoint. 234 | last_checkpoint = None 235 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 236 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 237 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 238 | raise ValueError( 239 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 240 | "Use --overwrite_output_dir to overcome." 241 | ) 242 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 243 | logger.info( 244 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 245 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 246 | ) 247 | 248 | # Set seed before initializing model. 
249 | set_seed(training_args.seed) 250 | 251 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 252 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 253 | # (the dataset will be downloaded automatically from the datasets Hub). 254 | # 255 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 256 | # 'text' is found. You can easily tweak this behavior (see below). 257 | # 258 | # In distributed training, the load_dataset function guarantees that only one local process can concurrently 259 | # download the dataset. 260 | if data_args.dataset_name is not None: 261 | # Downloading and loading a dataset from the hub. 262 | raw_datasets = load_dataset( 263 | data_args.dataset_name, 264 | data_args.dataset_config_name, 265 | cache_dir=model_args.cache_dir, 266 | use_auth_token=True if model_args.use_auth_token else None, 267 | ) 268 | else: 269 | data_files = {} 270 | if data_args.train_file is not None: 271 | data_files["train"] = data_args.train_file 272 | extension = data_args.train_file.split(".")[-1] 273 | if data_args.validation_file is not None: 274 | data_files["validation"] = data_args.validation_file 275 | extension = data_args.validation_file.split(".")[-1] 276 | if data_args.test_file is not None: 277 | data_files["test"] = data_args.test_file 278 | extension = data_args.test_file.split(".")[-1] 279 | raw_datasets = load_dataset( 280 | extension, 281 | data_files=data_files, 282 | field="data", 283 | cache_dir=model_args.cache_dir, 284 | use_auth_token=True if model_args.use_auth_token else None, 285 | ) 286 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 287 | # https://huggingface.co/docs/datasets/loading_datasets.html. 288 | 289 | # Load pretrained model and tokenizer 290 | # 291 | # Distributed training: 292 | # The .from_pretrained methods guarantee that only one local process can concurrently 293 | # download model & vocab. 294 | config = XLNetConfig.from_pretrained( 295 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 296 | cache_dir=model_args.cache_dir, 297 | revision=model_args.model_revision, 298 | use_auth_token=True if model_args.use_auth_token else None, 299 | ) 300 | tokenizer = XLNetTokenizerFast.from_pretrained( 301 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 302 | cache_dir=model_args.cache_dir, 303 | revision=model_args.model_revision, 304 | use_auth_token=True if model_args.use_auth_token else None, 305 | ) 306 | model = XLNetForQuestionAnswering.from_pretrained( 307 | model_args.model_name_or_path, 308 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 309 | config=config, 310 | cache_dir=model_args.cache_dir, 311 | revision=model_args.model_revision, 312 | use_auth_token=True if model_args.use_auth_token else None, 313 | ) 314 | 315 | # Preprocessing the datasets. 316 | # Preprocessing is slightly different for training and evaluation.
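# (Descriptive note: both feature builders below record cls_index and p_mask for XLNet; training features additionally need gold start/end positions and the is_impossible flag, while evaluation features keep example_id and the offset mappings so predicted spans can be mapped back to the original context.)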
317 | if training_args.do_train: 318 | column_names = raw_datasets["train"].column_names 319 | elif training_args.do_eval: 320 | column_names = raw_datasets["validation"].column_names 321 | else: 322 | column_names = raw_datasets["test"].column_names 323 | question_column_name = "question" if "question" in column_names else column_names[0] 324 | context_column_name = "context" if "context" in column_names else column_names[1] 325 | answer_column_name = "answers" if "answers" in column_names else column_names[2] 326 | 327 | # Padding side determines if we do (question|context) or (context|question). 328 | pad_on_right = tokenizer.padding_side == "right" 329 | 330 | if data_args.max_seq_length > tokenizer.model_max_length: 331 | logger.warning( 332 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 333 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 334 | ) 335 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 336 | 337 | # Training preprocessing 338 | def prepare_train_features(examples): 339 | # Some of the questions have lots of whitespace on the left, which is not useful and will make the 340 | # truncation of the context fail (the tokenized question will take a lot of space). So we remove that 341 | # left whitespace. 342 | examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] 343 | 344 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 345 | # in one example possibly giving several features when a context is long, each of those features having a 346 | # context that overlaps a bit with the context of the previous feature. 347 | tokenized_examples = tokenizer( 348 | examples[question_column_name if pad_on_right else context_column_name], 349 | examples[context_column_name if pad_on_right else question_column_name], 350 | truncation="only_second" if pad_on_right else "only_first", 351 | max_length=max_seq_length, 352 | stride=data_args.doc_stride, 353 | return_overflowing_tokens=True, 354 | return_offsets_mapping=True, 355 | return_special_tokens_mask=True, 356 | return_token_type_ids=True, 357 | padding="max_length", 358 | ) 359 | 360 | # Since one example might give us several features if it has a long context, we need a map from a feature to 361 | # its corresponding example. This key gives us just that. 362 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 363 | # The offset mappings will give us a map from token to character position in the original context. This will 364 | # help us compute the start_positions and end_positions. 365 | offset_mapping = tokenized_examples.pop("offset_mapping") 366 | # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). 367 | special_tokens = tokenized_examples.pop("special_tokens_mask") 368 | 369 | # Let's label those examples! 370 | tokenized_examples["start_positions"] = [] 371 | tokenized_examples["end_positions"] = [] 372 | tokenized_examples["is_impossible"] = [] 373 | tokenized_examples["cls_index"] = [] 374 | tokenized_examples["p_mask"] = [] 375 | 376 | for i, offsets in enumerate(offset_mapping): 377 | # We will label impossible answers with the index of the CLS token.
378 | input_ids = tokenized_examples["input_ids"][i] 379 | cls_index = input_ids.index(tokenizer.cls_token_id) 380 | tokenized_examples["cls_index"].append(cls_index) 381 | 382 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 383 | sequence_ids = tokenized_examples["token_type_ids"][i] 384 | for k, s in enumerate(special_tokens[i]): 385 | if s: 386 | sequence_ids[k] = 3 387 | context_idx = 1 if pad_on_right else 0 388 | 389 | # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. 390 | # The cls token gets 1.0 too (for predictions of empty answers). 391 | tokenized_examples["p_mask"].append( 392 | [ 393 | 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 394 | for k, s in enumerate(sequence_ids) 395 | ] 396 | ) 397 | 398 | # One example can give several spans, this is the index of the example containing this span of text. 399 | sample_index = sample_mapping[i] 400 | answers = examples[answer_column_name][sample_index] 401 | # If no answers are given, set the cls_index as answer. 402 | if len(answers["answer_start"]) == 0: 403 | tokenized_examples["start_positions"].append(cls_index) 404 | tokenized_examples["end_positions"].append(cls_index) 405 | tokenized_examples["is_impossible"].append(1.0) 406 | else: 407 | # Start/end character index of the answer in the text. 408 | start_char = answers["answer_start"][0] 409 | end_char = start_char + len(answers["text"][0]) 410 | 411 | # Start token index of the current span in the text. 412 | token_start_index = 0 413 | while sequence_ids[token_start_index] != context_idx: 414 | token_start_index += 1 415 | 416 | # End token index of the current span in the text. 417 | token_end_index = len(input_ids) - 1 418 | while sequence_ids[token_end_index] != context_idx: 419 | token_end_index -= 1 420 | # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 421 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 422 | tokenized_examples["start_positions"].append(cls_index) 423 | tokenized_examples["end_positions"].append(cls_index) 424 | tokenized_examples["is_impossible"].append(1.0) 425 | else: 426 | # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 427 | # Note: we could go after the last offset if the answer is the last word (edge case). 
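# Illustrative example (made-up offsets): if the context tokens at positions 5-7 carry offsets (0, 2), (2, 5), (5, 9) and the answer spans characters 2-9, the two loops below end up storing start_position 6 and end_position 7.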
428 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 429 | token_start_index += 1 430 | tokenized_examples["start_positions"].append(token_start_index - 1) 431 | while offsets[token_end_index][1] >= end_char: 432 | token_end_index -= 1 433 | tokenized_examples["end_positions"].append(token_end_index + 1) 434 | tokenized_examples["is_impossible"].append(0.0) 435 | 436 | return tokenized_examples 437 | 438 | if training_args.do_train: 439 | if "train" not in raw_datasets: 440 | raise ValueError("--do_train requires a train dataset") 441 | train_dataset = raw_datasets["train"] 442 | if data_args.max_train_samples is not None: 443 | # Select samples from Dataset, This will help to decrease processing time 444 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 445 | train_dataset = train_dataset.select(range(max_train_samples)) 446 | # Create Training Features 447 | with training_args.main_process_first(desc="train dataset map pre-processing"): 448 | train_dataset = train_dataset.map( 449 | prepare_train_features, 450 | batched=True, 451 | num_proc=data_args.preprocessing_num_workers, 452 | remove_columns=column_names, 453 | load_from_cache_file=not data_args.overwrite_cache, 454 | desc="Running tokenizer on train dataset", 455 | ) 456 | if data_args.max_train_samples is not None: 457 | # Select samples from dataset again since Feature Creation might increase number of features 458 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 459 | train_dataset = train_dataset.select(range(max_train_samples)) 460 | 461 | # Validation preprocessing 462 | def prepare_validation_features(examples): 463 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 464 | # in one example possible giving several features when a context is long, each of those features having a 465 | # context that overlaps a bit the context of the previous feature. 466 | tokenized_examples = tokenizer( 467 | examples[question_column_name if pad_on_right else context_column_name], 468 | examples[context_column_name if pad_on_right else question_column_name], 469 | truncation="only_second" if pad_on_right else "only_first", 470 | max_length=max_seq_length, 471 | stride=data_args.doc_stride, 472 | return_overflowing_tokens=True, 473 | return_offsets_mapping=True, 474 | return_special_tokens_mask=True, 475 | return_token_type_ids=True, 476 | padding="max_length", 477 | ) 478 | 479 | # Since one example might give us several features if it has a long context, we need a map from a feature to 480 | # its corresponding example. This key gives us just that. 481 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 482 | 483 | # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). 484 | special_tokens = tokenized_examples.pop("special_tokens_mask") 485 | 486 | # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the 487 | # corresponding example_id and we will store the offset mappings. 488 | tokenized_examples["example_id"] = [] 489 | 490 | # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. 491 | tokenized_examples["cls_index"] = [] 492 | tokenized_examples["p_mask"] = [] 493 | 494 | for i, input_ids in enumerate(tokenized_examples["input_ids"]): 495 | # Find the CLS token in the input ids. 
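            # (This beam-search script targets XLNet-style models, whose CLS token sits at the end of the
            # sequence rather than at position 0, which is why it is looked up explicitly here.)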
496 | cls_index = input_ids.index(tokenizer.cls_token_id) 497 | tokenized_examples["cls_index"].append(cls_index) 498 | 499 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 500 | sequence_ids = tokenized_examples["token_type_ids"][i] 501 | for k, s in enumerate(special_tokens[i]): 502 | if s: 503 | sequence_ids[k] = 3 504 | context_idx = 1 if pad_on_right else 0 505 | 506 | # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. 507 | tokenized_examples["p_mask"].append( 508 | [ 509 | 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 510 | for k, s in enumerate(sequence_ids) 511 | ] 512 | ) 513 | 514 | # One example can give several spans, this is the index of the example containing this span of text. 515 | sample_index = sample_mapping[i] 516 | tokenized_examples["example_id"].append(examples["id"][sample_index]) 517 | 518 | # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token 519 | # position is part of the context or not. 520 | tokenized_examples["offset_mapping"][i] = [ 521 | (o if sequence_ids[k] == context_idx else None) 522 | for k, o in enumerate(tokenized_examples["offset_mapping"][i]) 523 | ] 524 | 525 | return tokenized_examples 526 | 527 | if training_args.do_eval: 528 | if "validation" not in raw_datasets: 529 | raise ValueError("--do_eval requires a validation dataset") 530 | eval_examples = raw_datasets["validation"] 531 | if data_args.max_eval_samples is not None: 532 | # Selecting Eval Samples from Dataset 533 | max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) 534 | eval_examples = eval_examples.select(range(max_eval_samples)) 535 | # Create Features from Eval Dataset 536 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 537 | eval_dataset = eval_examples.map( 538 | prepare_validation_features, 539 | batched=True, 540 | num_proc=data_args.preprocessing_num_workers, 541 | remove_columns=column_names, 542 | load_from_cache_file=not data_args.overwrite_cache, 543 | desc="Running tokenizer on validation dataset", 544 | ) 545 | if data_args.max_eval_samples is not None: 546 | # Selecting Samples from Dataset again since Feature Creation might increase samples size 547 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 548 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 549 | 550 | if training_args.do_predict: 551 | if "test" not in raw_datasets: 552 | raise ValueError("--do_predict requires a test dataset") 553 | predict_examples = raw_datasets["test"] 554 | if data_args.max_predict_samples is not None: 555 | # We will select sample from whole data 556 | predict_examples = predict_examples.select(range(data_args.max_predict_samples)) 557 | # Test Feature Creation 558 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 559 | predict_dataset = predict_examples.map( 560 | prepare_validation_features, 561 | batched=True, 562 | num_proc=data_args.preprocessing_num_workers, 563 | remove_columns=column_names, 564 | load_from_cache_file=not data_args.overwrite_cache, 565 | desc="Running tokenizer on prediction dataset", 566 | ) 567 | if data_args.max_predict_samples is not None: 568 | # During Feature creation dataset samples might increase, we will select required samples again 569 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 570 | predict_dataset = 
predict_dataset.select(range(max_predict_samples)) 571 | 572 | # Data collator 573 | # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data 574 | # collator. 575 | data_collator = ( 576 | default_data_collator 577 | if data_args.pad_to_max_length 578 | else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 579 | ) 580 | 581 | # Post-processing: 582 | def post_processing_function(examples, features, predictions, stage="eval"): 583 | # Post-processing: we match the start logits and end logits to answers in the original context. 584 | predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( 585 | examples=examples, 586 | features=features, 587 | predictions=predictions, 588 | version_2_with_negative=data_args.version_2_with_negative, 589 | n_best_size=data_args.n_best_size, 590 | max_answer_length=data_args.max_answer_length, 591 | start_n_top=model.config.start_n_top, 592 | end_n_top=model.config.end_n_top, 593 | output_dir=training_args.output_dir, 594 | log_level=log_level, 595 | prefix=stage, 596 | ) 597 | # Format the result to the format the metric expects. 598 | if data_args.version_2_with_negative: 599 | formatted_predictions = [ 600 | {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]} 601 | for k, v in predictions.items() 602 | ] 603 | else: 604 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 605 | 606 | references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] 607 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 608 | 609 | metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") 610 | 611 | def compute_metrics(p: EvalPrediction): 612 | return metric.compute(predictions=p.predictions, references=p.label_ids) 613 | 614 | # Initialize our Trainer 615 | trainer = QuestionAnsweringTrainer( 616 | model=model, 617 | args=training_args, 618 | train_dataset=train_dataset if training_args.do_train else None, 619 | eval_dataset=eval_dataset if training_args.do_eval else None, 620 | eval_examples=eval_examples if training_args.do_eval else None, 621 | tokenizer=tokenizer, 622 | data_collator=data_collator, 623 | post_process_function=post_processing_function, 624 | compute_metrics=compute_metrics, 625 | ) 626 | 627 | # Training 628 | if training_args.do_train: 629 | checkpoint = None 630 | if training_args.resume_from_checkpoint is not None: 631 | checkpoint = training_args.resume_from_checkpoint 632 | elif last_checkpoint is not None: 633 | checkpoint = last_checkpoint 634 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 635 | trainer.save_model() # Saves the tokenizer too for easy upload 636 | 637 | metrics = train_result.metrics 638 | 639 | max_train_samples = ( 640 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 641 | ) 642 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 643 | 644 | trainer.log_metrics("train", metrics) 645 | trainer.save_metrics("train", metrics) 646 | trainer.save_state() 647 | 648 | # Evaluation 649 | if training_args.do_eval: 650 | logger.info("*** Evaluate ***") 651 | metrics = trainer.evaluate() 652 | 653 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 654 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 655 | 656 | 
trainer.log_metrics("eval", metrics) 657 | trainer.save_metrics("eval", metrics) 658 | 659 | # Prediction 660 | if training_args.do_predict: 661 | logger.info("*** Predict ***") 662 | results = trainer.predict(predict_dataset, predict_examples) 663 | metrics = results.metrics 664 | 665 | max_predict_samples = ( 666 | data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) 667 | ) 668 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 669 | 670 | trainer.log_metrics("predict", metrics) 671 | trainer.save_metrics("predict", metrics) 672 | 673 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} 674 | if data_args.dataset_name is not None: 675 | kwargs["dataset_tags"] = data_args.dataset_name 676 | if data_args.dataset_config_name is not None: 677 | kwargs["dataset_args"] = data_args.dataset_config_name 678 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 679 | else: 680 | kwargs["dataset"] = data_args.dataset_name 681 | 682 | if training_args.push_to_hub: 683 | trainer.push_to_hub(**kwargs) 684 | else: 685 | trainer.create_model_card(**kwargs) 686 | 687 | 688 | def _mp_fn(index): 689 | # For xla_spawn (TPUs) 690 | main() 691 | 692 | 693 | if __name__ == "__main__": 694 | main() 695 | -------------------------------------------------------------------------------- /mrc/run_qa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer. 18 | """ 19 | # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
20 | 21 | import logging 22 | import os 23 | import pdb 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | import datasets 28 | from datasets import load_metric 29 | import transformers 30 | from trainer_qa import QuestionAnsweringTrainer 31 | from transformers import ( 32 | AutoConfig, 33 | AutoModelForQuestionAnswering, 34 | AutoTokenizer, 35 | DataCollatorWithPadding, 36 | EvalPrediction, 37 | HfArgumentParser, 38 | PreTrainedTokenizerFast, 39 | TrainingArguments, 40 | default_data_collator, 41 | set_seed, 42 | ) 43 | from transformers.trainer_utils import get_last_checkpoint 44 | from transformers.utils import check_min_version 45 | from transformers.utils.versions import require_version 46 | from utils_qa import postprocess_qa_predictions 47 | 48 | import evaluate 49 | from mrc_task_config import * 50 | 51 | 52 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") 53 | 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | @dataclass 58 | class TaskArguments: 59 | is_extractive: bool = field( 60 | default=True, 61 | metadata={"help": "Used in data preprocessing; determines whether to compute the answer offsets in the source text. Set to True for the SpanMRC model and to False for the Seq2Seq model."} 62 | ) 63 | pred_trg_file: Optional[str] = field( 64 | default=None, 65 | metadata={"help": "Path of the trigger prediction file. If specified, argument extraction is run on the predicted triggers rather than on the gold-labeled triggers."} 66 | ) 67 | pred_trg_col: Optional[str] = field( 68 | default='pred_triggers', 69 | metadata={"help": "Column name of the predicted triggers in pred_trg_file."} 70 | ) 71 | output_filename: Optional[str] = field( 72 | default=None, 73 | metadata={"help": "Optional file name for the model prediction output."} 74 | ) 75 | @dataclass 76 | class ModelArguments: 77 | """ 78 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 79 | """ 80 | 81 | model_name_or_path: str = field( 82 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 83 | ) 84 | config_name: Optional[str] = field( 85 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 86 | ) 87 | tokenizer_name: Optional[str] = field( 88 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 89 | ) 90 | cache_dir: Optional[str] = field( 91 | default=None, 92 | metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, 93 | ) 94 | model_revision: str = field( 95 | default="main", 96 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 97 | ) 98 | use_auth_token: bool = field( 99 | default=False, 100 | metadata={ 101 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 102 | "with private models)." 103 | }, 104 | ) 105 | 106 | 107 | @dataclass 108 | class DataTrainingArguments: 109 | """ 110 | Arguments pertaining to what data we are going to input our model for training and eval.
111 | """ 112 | 113 | dataset_name: Optional[str] = field( 114 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 115 | ) 116 | dataset_config_name: Optional[str] = field( 117 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 118 | ) 119 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 120 | validation_file: Optional[str] = field( 121 | default=None, 122 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 123 | ) 124 | test_file: Optional[str] = field( 125 | default=None, 126 | metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, 127 | ) 128 | overwrite_cache: bool = field( 129 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 130 | ) 131 | preprocessing_num_workers: Optional[int] = field( 132 | default=None, 133 | metadata={"help": "The number of processes to use for the preprocessing."}, 134 | ) 135 | max_seq_length: int = field( 136 | default=128, 137 | metadata={ 138 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 139 | "than this will be truncated, sequences shorter will be padded." 140 | }, 141 | ) 142 | pad_to_max_length: bool = field( 143 | default=True, 144 | metadata={ 145 | "help": "Whether to pad all samples to `max_seq_length`. " 146 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 147 | "be faster on GPU but will be slower on TPU)." 148 | }, 149 | ) 150 | max_train_samples: Optional[int] = field( 151 | default=None, 152 | metadata={ 153 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 154 | "value if set." 155 | }, 156 | ) 157 | max_eval_samples: Optional[int] = field( 158 | default=None, 159 | metadata={ 160 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 161 | "value if set." 162 | }, 163 | ) 164 | max_predict_samples: Optional[int] = field( 165 | default=None, 166 | metadata={ 167 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 168 | "value if set." 169 | }, 170 | ) 171 | version_2_with_negative: bool = field( 172 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 173 | ) 174 | null_score_diff_threshold: float = field( 175 | default=0.0, 176 | metadata={ 177 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 178 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 179 | "Only useful when `version_2_with_negative=True`." 180 | }, 181 | ) 182 | doc_stride: int = field( 183 | default=128, 184 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 185 | ) 186 | n_best_size: int = field( 187 | default=20, 188 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 189 | ) 190 | max_answer_length: int = field( 191 | default=30, 192 | metadata={ 193 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 194 | "and end predictions are not conditioned on one another." 
195 | }, 196 | ) 197 | 198 | def __post_init__(self): 199 | if ( 200 | self.dataset_name is None 201 | and self.train_file is None 202 | and self.validation_file is None 203 | and self.test_file is None 204 | ): 205 | raise ValueError("Need either a dataset name or a training/validation file/test_file.") 206 | else: 207 | if self.train_file is not None: 208 | extension = self.train_file.split(".")[-1] 209 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 210 | if self.validation_file is not None: 211 | extension = self.validation_file.split(".")[-1] 212 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 213 | if self.test_file is not None: 214 | extension = self.test_file.split(".")[-1] 215 | assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 216 | 217 | 218 | def main(): 219 | # See all possible arguments in src/transformers/training_args.py 220 | # or by passing the --help flag to this script. 221 | # We now keep distinct sets of args, for a cleaner separation of concerns. 222 | 223 | parser = HfArgumentParser((TaskArguments, ModelArguments, DataTrainingArguments, TrainingArguments)) 224 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 225 | # If we pass only one argument to the script and it's the path to a json file, 226 | # let's parse it to get our arguments. 227 | task_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 228 | else: 229 | task_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses() 230 | 231 | task_args.just_infer = (not training_args.do_train and training_args.do_predict) 232 | 233 | # Setup logging 234 | logging.basicConfig( 235 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 236 | datefmt="%m/%d/%Y %H:%M:%S", 237 | handlers=[logging.StreamHandler(sys.stdout)], 238 | ) 239 | 240 | log_level = training_args.get_process_log_level() 241 | logger.setLevel(log_level) 242 | datasets.utils.logging.set_verbosity(log_level) 243 | transformers.utils.logging.set_verbosity(log_level) 244 | transformers.utils.logging.enable_default_handler() 245 | transformers.utils.logging.enable_explicit_format() 246 | 247 | # Log on each process the small summary: 248 | logger.warning( 249 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 250 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 251 | ) 252 | logger.info(f"Training/evaluation parameters {training_args}") 253 | 254 | # Detecting last checkpoint. 255 | last_checkpoint = None 256 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 257 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 258 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 259 | raise ValueError( 260 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 261 | "Use --overwrite_output_dir to overcome." 262 | ) 263 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 264 | logger.info( 265 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 266 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 267 | ) 268 | 269 | # Set seed before initializing model. 
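# Note on the pipeline setting: when the script is run with --do_train false --do_predict and a
# --pred_trg_file, task_args.just_infer is True above, and load_my_datasets_for_mrc (defined in
# mrc_task_config.py, not shown here) is presumably what rebuilds the test questions from the
# predicted triggers instead of the gold ones.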
270 | set_seed(training_args.seed) 271 | 272 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 273 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 274 | # (the dataset will be downloaded automatically from the datasets Hub). 275 | # 276 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 277 | # 'text' is found. You can easily tweak this behavior (see below). 278 | # 279 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 280 | # download the dataset. 281 | 282 | raw_datasets = load_my_datasets_for_mrc(data_args, task_args) 283 | 284 | 285 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 286 | # https://huggingface.co/docs/datasets/loading_datasets.html. 287 | 288 | # Load pretrained model and tokenizer 289 | # 290 | # Distributed training: 291 | # The .from_pretrained methods guarantee that only one local process can concurrently 292 | # download model & vocab. 293 | config = AutoConfig.from_pretrained( 294 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 295 | cache_dir=model_args.cache_dir, 296 | revision=model_args.model_revision, 297 | use_auth_token=True if model_args.use_auth_token else None, 298 | ) 299 | tokenizer = AutoTokenizer.from_pretrained( 300 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 301 | cache_dir=model_args.cache_dir, 302 | use_fast=True, 303 | revision=model_args.model_revision, 304 | use_auth_token=True if model_args.use_auth_token else None, 305 | ) 306 | model = AutoModelForQuestionAnswering.from_pretrained( 307 | model_args.model_name_or_path, 308 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 309 | config=config, 310 | cache_dir=model_args.cache_dir, 311 | revision=model_args.model_revision, 312 | use_auth_token=True if model_args.use_auth_token else None, 313 | ) 314 | 315 | # Tokenizer check: this script requires a fast tokenizer. 316 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 317 | raise ValueError( 318 | "This example script only works for models that have a fast tokenizer. Checkout the big table of models " 319 | "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " 320 | "requirement" 321 | ) 322 | 323 | # Preprocessing the datasets. 324 | # Preprocessing is slighlty different for training and evaluation. 325 | if training_args.do_train: 326 | column_names = raw_datasets["train"].column_names 327 | elif training_args.do_eval: 328 | column_names = raw_datasets["validation"].column_names 329 | else: 330 | column_names = raw_datasets["test"].column_names 331 | question_column_name = "question" if "question" in column_names else column_names[0] 332 | context_column_name = "context" if "context" in column_names else column_names[1] 333 | answer_column_name = "answers" if "answers" in column_names else column_names[2] 334 | 335 | # Padding side determines if we do (question|context) or (context|question). 
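    # bert-base-chinese pads on the right, so pad_on_right is True and the pair is encoded as
    # [CLS] question [SEP] context [SEP]; left-padding tokenizers such as XLNet's flip the order.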
336 | pad_on_right = tokenizer.padding_side == "right" 337 | 338 | if data_args.max_seq_length > tokenizer.model_max_length: 339 | logger.warning( 340 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 341 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 342 | ) 343 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 344 | 345 | # Training preprocessing 346 | def prepare_train_features(examples): 347 | # Some of the questions have lots of whitespace on the left, which is not useful and will make the 348 | # truncation of the context fail (the tokenized question will take a lots of space). So we remove that 349 | # left whitespace 350 | examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] 351 | 352 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 353 | # in one example possible giving several features when a context is long, each of those features having a 354 | # context that overlaps a bit the context of the previous feature. 355 | tokenized_examples = tokenizer( 356 | examples[question_column_name if pad_on_right else context_column_name], 357 | examples[context_column_name if pad_on_right else question_column_name], 358 | truncation="only_second" if pad_on_right else "only_first", 359 | max_length=max_seq_length, 360 | stride=data_args.doc_stride, 361 | return_overflowing_tokens=True, 362 | return_offsets_mapping=True, 363 | padding="max_length" if data_args.pad_to_max_length else False, 364 | ) 365 | 366 | # Since one example might give us several features if it has a long context, we need a map from a feature to 367 | # its corresponding example. This key gives us just that. 368 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 369 | # The offset mappings will give us a map from token to character position in the original context. This will 370 | # help us compute the start_positions and end_positions. 371 | offset_mapping = tokenized_examples.pop("offset_mapping") 372 | 373 | # Let's label those examples! 374 | tokenized_examples["start_positions"] = [] 375 | tokenized_examples["end_positions"] = [] 376 | 377 | for i, offsets in enumerate(offset_mapping): 378 | # We will label impossible answers with the index of the CLS token. 379 | input_ids = tokenized_examples["input_ids"][i] 380 | cls_index = input_ids.index(tokenizer.cls_token_id) 381 | 382 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 383 | sequence_ids = tokenized_examples.sequence_ids(i) 384 | 385 | # One example can give several spans, this is the index of the example containing this span of text. 386 | sample_index = sample_mapping[i] 387 | answers = examples[answer_column_name][sample_index] 388 | # If no answers are given, set the cls_index as answer. 389 | if len(answers["answer_start"]) == 0: 390 | tokenized_examples["start_positions"].append(cls_index) 391 | tokenized_examples["end_positions"].append(cls_index) 392 | else: 393 | # Start/end character index of the answer in the text. 394 | start_char = answers["answer_start"][0] 395 | end_char = start_char + len(answers["text"][0]) 396 | 397 | # Start token index of the current span in the text. 
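                # sequence_ids(i) above marks every position as None (special token), 0 (first sequence,
                # here the question when pad_on_right) or 1 (second sequence, here the context),
                # e.g. [None, 0, 0, 0, None, 1, 1, 1, 1, None, ...], so the two loops below simply skip
                # everything that is not part of the context.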
398 | token_start_index = 0 399 | while sequence_ids[token_start_index] != (1 if pad_on_right else 0): 400 | token_start_index += 1 401 | 402 | # End token index of the current span in the text. 403 | token_end_index = len(input_ids) - 1 404 | while sequence_ids[token_end_index] != (1 if pad_on_right else 0): 405 | token_end_index -= 1 406 | 407 | # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 408 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 409 | tokenized_examples["start_positions"].append(cls_index) 410 | tokenized_examples["end_positions"].append(cls_index) 411 | else: 412 | # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 413 | # Note: we could go after the last offset if the answer is the last word (edge case). 414 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 415 | token_start_index += 1 416 | tokenized_examples["start_positions"].append(token_start_index - 1) 417 | while offsets[token_end_index][1] >= end_char: 418 | token_end_index -= 1 419 | tokenized_examples["end_positions"].append(token_end_index + 1) 420 | 421 | return tokenized_examples 422 | 423 | if training_args.do_train: 424 | if "train" not in raw_datasets: 425 | raise ValueError("--do_train requires a train dataset") 426 | train_dataset = raw_datasets["train"] 427 | if data_args.max_train_samples is not None: 428 | # We will select sample from whole data if argument is specified 429 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 430 | train_dataset = train_dataset.select(range(max_train_samples)) 431 | # Create train feature from dataset 432 | with training_args.main_process_first(desc="train dataset map pre-processing"): 433 | train_dataset = train_dataset.map( 434 | prepare_train_features, 435 | batched=True, 436 | num_proc=data_args.preprocessing_num_workers, 437 | remove_columns=column_names, 438 | load_from_cache_file=not data_args.overwrite_cache, 439 | desc="Running tokenizer on train dataset", 440 | ) 441 | if data_args.max_train_samples is not None: 442 | # Number of samples might increase during Feature Creation, We select only specified max samples 443 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 444 | train_dataset = train_dataset.select(range(max_train_samples)) 445 | 446 | # Validation preprocessing 447 | def prepare_validation_features(examples): 448 | # Some of the questions have lots of whitespace on the left, which is not useful and will make the 449 | # truncation of the context fail (the tokenized question will take a lots of space). So we remove that 450 | # left whitespace 451 | examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] 452 | 453 | # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results 454 | # in one example possible giving several features when a context is long, each of those features having a 455 | # context that overlaps a bit the context of the previous feature. 
456 | tokenized_examples = tokenizer( 457 | examples[question_column_name if pad_on_right else context_column_name], 458 | examples[context_column_name if pad_on_right else question_column_name], 459 | truncation="only_second" if pad_on_right else "only_first", 460 | max_length=max_seq_length, 461 | stride=data_args.doc_stride, 462 | return_overflowing_tokens=True, 463 | return_offsets_mapping=True, 464 | padding="max_length" if data_args.pad_to_max_length else False, 465 | ) 466 | 467 | # Since one example might give us several features if it has a long context, we need a map from a feature to 468 | # its corresponding example. This key gives us just that. 469 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 470 | 471 | # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the 472 | # corresponding example_id and we will store the offset mappings. 473 | tokenized_examples["example_id"] = [] 474 | 475 | for i in range(len(tokenized_examples["input_ids"])): 476 | # Grab the sequence corresponding to that example (to know what is the context and what is the question). 477 | sequence_ids = tokenized_examples.sequence_ids(i) 478 | context_index = 1 if pad_on_right else 0 479 | 480 | # One example can give several spans, this is the index of the example containing this span of text. 481 | sample_index = sample_mapping[i] 482 | tokenized_examples["example_id"].append(examples["id"][sample_index]) 483 | 484 | # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token 485 | # position is part of the context or not. 486 | tokenized_examples["offset_mapping"][i] = [ 487 | (o if sequence_ids[k] == context_index else None) 488 | for k, o in enumerate(tokenized_examples["offset_mapping"][i]) 489 | ] 490 | 491 | return tokenized_examples 492 | 493 | if training_args.do_eval: 494 | if "validation" not in raw_datasets: 495 | raise ValueError("--do_eval requires a validation dataset") 496 | eval_examples = raw_datasets["validation"] 497 | if data_args.max_eval_samples is not None: 498 | # We will select sample from whole data 499 | max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) 500 | eval_examples = eval_examples.select(range(max_eval_samples)) 501 | # Validation Feature Creation 502 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 503 | eval_dataset = eval_examples.map( 504 | prepare_validation_features, 505 | batched=True, 506 | num_proc=data_args.preprocessing_num_workers, 507 | remove_columns=column_names, 508 | load_from_cache_file=not data_args.overwrite_cache, 509 | desc="Running tokenizer on validation dataset", 510 | ) 511 | if data_args.max_eval_samples is not None: 512 | # During Feature creation dataset samples might increase, we will select required samples again 513 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 514 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 515 | 516 | if training_args.do_predict: 517 | if "test" not in raw_datasets: 518 | raise ValueError("--do_predict requires a test dataset") 519 | predict_examples = raw_datasets["test"] 520 | if data_args.max_predict_samples is not None: 521 | # We will select sample from whole data 522 | predict_examples = predict_examples.select(range(data_args.max_predict_samples)) 523 | # Predict Feature Creation 524 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 525 | predict_dataset = 
predict_examples.map( 526 | prepare_validation_features, 527 | batched=True, 528 | num_proc=data_args.preprocessing_num_workers, 529 | remove_columns=column_names, 530 | load_from_cache_file=not data_args.overwrite_cache, 531 | desc="Running tokenizer on prediction dataset", 532 | ) 533 | if data_args.max_predict_samples is not None: 534 | # During Feature creation dataset samples might increase, we will select required samples again 535 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 536 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 537 | 538 | # Data collator 539 | # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data 540 | # collator. 541 | data_collator = ( 542 | default_data_collator 543 | if data_args.pad_to_max_length 544 | else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 545 | ) 546 | 547 | # Post-processing: 548 | def post_processing_function(examples, features, predictions, stage="eval"): 549 | # Post-processing: we match the start logits and end logits to answers in the original context. 550 | predictions = postprocess_qa_predictions( 551 | examples=examples, 552 | features=features, 553 | predictions=predictions, 554 | version_2_with_negative=data_args.version_2_with_negative, 555 | n_best_size=data_args.n_best_size, 556 | max_answer_length=data_args.max_answer_length, 557 | null_score_diff_threshold=data_args.null_score_diff_threshold, 558 | output_dir=training_args.output_dir, 559 | log_level=log_level, 560 | prefix=stage, 561 | ) 562 | # Format the result to the format the metric expects. 563 | if data_args.version_2_with_negative: 564 | formatted_predictions = [ 565 | {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() 566 | ] 567 | else: 568 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 569 | 570 | references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] 571 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 572 | 573 | logger.info("****** loading metric ******") 574 | # metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") 575 | metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") 576 | 577 | 578 | def compute_metrics(p: EvalPrediction): 579 | return metric.compute(predictions=p.predictions, references=p.label_ids) 580 | 581 | # Initialize our Trainer 582 | logger.info("****** initializing trainer ******") 583 | trainer = QuestionAnsweringTrainer( 584 | model=model, 585 | args=training_args, 586 | train_dataset=train_dataset if training_args.do_train else None, 587 | eval_dataset=eval_dataset if training_args.do_eval else None, 588 | eval_examples=eval_examples if training_args.do_eval else None, 589 | tokenizer=tokenizer, 590 | data_collator=data_collator, 591 | post_process_function=post_processing_function, 592 | compute_metrics=compute_metrics, 593 | ) 594 | 595 | 596 | # Training 597 | if training_args.do_train: 598 | checkpoint = None 599 | if training_args.resume_from_checkpoint is not None: 600 | checkpoint = training_args.resume_from_checkpoint 601 | elif last_checkpoint is not None: 602 | checkpoint = last_checkpoint 603 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 604 | trainer.save_model() # Saves the tokenizer too for easy upload 605 | 606 | metrics = train_result.metrics 607 | 
max_train_samples = ( 608 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 609 | ) 610 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 611 | 612 | trainer.log_metrics("train", metrics) 613 | trainer.save_metrics("train", metrics) 614 | trainer.save_state() 615 | 616 | # Evaluation 617 | if training_args.do_eval: 618 | logger.info("*** Evaluate ***") 619 | metrics = trainer.evaluate() 620 | 621 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 622 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 623 | 624 | trainer.log_metrics("eval", metrics) 625 | trainer.save_metrics("eval", metrics) 626 | 627 | # Prediction 628 | if training_args.do_predict: 629 | logger.info("*** Predict ***") 630 | results = trainer.predict(predict_dataset, predict_examples) 631 | # pdb.set_trace() 632 | metrics = results.metrics 633 | 634 | max_predict_samples = ( 635 | data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) 636 | ) 637 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 638 | 639 | trainer.log_metrics("predict", metrics) 640 | trainer.save_metrics("predict", metrics) 641 | 642 | # **** save prediction file **** 643 | # read predictions 644 | logger.info("postprocessing output") 645 | ids = [] 646 | preds = [] 647 | title_id_lst = [] 648 | title_lst = [] 649 | trigger_lst = [] 650 | gold_triple_lst = [] 651 | event_triple_lst = [] 652 | for idx, item in enumerate(raw_datasets['test']): 653 | ids.append(results[0][idx]['id']) 654 | ans = results[0][idx]['prediction_text'] 655 | if not ans: ans="" 656 | preds.append(ans) 657 | title_id_lst.append(int(item['title_id'])) 658 | title_lst.append(item['context']) 659 | gold_triple_lst.append(item['gold_answer_triples']) 660 | trigger_lst.append(item['trigger']) 661 | event_triple_lst.append(item['triple']) 662 | pred_df = pd.DataFrame({"title_id": title_id_lst, "title": title_lst, "trigger": trigger_lst, "event_triples": event_triple_lst, "gold_answer_triples": gold_triple_lst, "preds": preds}) 663 | # agg sbj answer and obj answer for same triple 664 | pred_df['idx'] = pred_df.index.tolist() 665 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: str(x)) 666 | pred_df = pred_df.groupby(['title_id', 'title', 'event_triples']).agg(list).sort_values(by='idx', axis=0).reset_index() 667 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: eval(x)) 668 | pred_df['trigger'] = pred_df['trigger'].apply(lambda x: x[0]) 669 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 670 | pred_df.apply(lambda row: row.preds.insert(1, row.trigger), axis=1) 671 | pred_df.rename({"preds": "pred_event_triples"}, axis=1, inplace=True) 672 | pred_df['idx'] = pred_df.index.tolist() 673 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='idx', axis=0).reset_index() 674 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 675 | pred_df = pred_df[['title_id', 'title', 'gold_answer_triples', 'pred_event_triples']] 676 | pred_df.rename({"gold_answer_triples": "event_triples"}, axis=1, inplace=True) 677 | 678 | if task_args.output_filename: 679 | output_name = task_args.output_filename 680 | else: 681 | output_name = "pipeline_predictions.csv" if task_args.pred_trg_file else "arg_predictions.csv" 682 | output_predictions_file = 
os.path.join(training_args.output_dir, output_name) 683 | pred_df.to_csv(output_predictions_file, index=False) 684 | logger.info("save output to %s" % output_predictions_file) 685 | 686 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} 687 | 688 | if data_args.dataset_name is not None: 689 | kwargs["dataset_tags"] = data_args.dataset_name 690 | if data_args.dataset_config_name is not None: 691 | kwargs["dataset_args"] = data_args.dataset_config_name 692 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 693 | else: 694 | kwargs["dataset"] = data_args.dataset_name 695 | 696 | if training_args.push_to_hub: 697 | trainer.push_to_hub(**kwargs) 698 | else: 699 | trainer.create_model_card(**kwargs) 700 | 701 | 702 | def _mp_fn(index): 703 | # For xla_spawn (TPUs) 704 | main() 705 | 706 | 707 | if __name__ == "__main__": 708 | main() 709 | -------------------------------------------------------------------------------- /mrc/run_seq2seq_qa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2021 The HuggingFace Team All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library's seq2seq models for question answering using the 🤗 Seq2SeqTrainer. 18 | """ 19 | # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
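# Unlike the extractive SpanMRC script, this model generates the answer text directly. Each example is
# linearised as "question: <question> context: <context>" (see preprocess_squad_batch below) and the
# target is the answer string, empty for unanswerable questions; illustrative values:
#
#     input : "question: 收购的主体是什么? context: A公司宣布收购B公司"
#     target: "A公司"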
20 | 21 | import logging 22 | import os 23 | import sys 24 | from dataclasses import dataclass, field 25 | from typing import List, Optional, Tuple 26 | 27 | import datasets 28 | from datasets import load_dataset, load_metric 29 | import evaluate 30 | import transformers 31 | from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer 32 | from transformers import ( 33 | AutoConfig, 34 | AutoModelForSeq2SeqLM, 35 | AutoTokenizer, 36 | T5TokenizerFast, 37 | DataCollatorForSeq2Seq, 38 | HfArgumentParser, 39 | Seq2SeqTrainingArguments, 40 | set_seed, 41 | ) 42 | from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint 43 | from transformers.utils import check_min_version 44 | from transformers.utils.versions import require_version 45 | 46 | import pdb 47 | from mrc_task_config import * 48 | 49 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") 50 | 51 | logger = logging.getLogger(__name__) 52 | 53 | @dataclass 54 | class TaskArguments: 55 | is_extractive: bool = field( 56 | default=False, 57 | metadata={"help": "Used in data preprocessing; determines whether to compute the answer offsets in the source text. Set to True for the SpanMRC model and to False for the Seq2Seq model."} 58 | ) 59 | pred_trg_file: Optional[str] = field( 60 | default=None, 61 | metadata={"help": "Path of the trigger prediction file. If specified and task_name is 'arg', argument extraction is run on the predicted triggers rather than on the gold-labeled triggers."} 62 | ) 63 | pred_trg_col: Optional[str] = field( 64 | default='pred_triggers', 65 | metadata={"help": "Column name of the predicted triggers in pred_trg_file."} 66 | ) 67 | output_filename: Optional[str] = field( 68 | default=None, 69 | metadata={"help": "Optional file name for the model prediction output."} 70 | ) 71 | 72 | @dataclass 73 | class ModelArguments: 74 | """ 75 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 76 | """ 77 | 78 | model_name_or_path: str = field( 79 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 80 | ) 81 | config_name: Optional[str] = field( 82 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 83 | ) 84 | tokenizer_name: Optional[str] = field( 85 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 86 | ) 87 | cache_dir: Optional[str] = field( 88 | default=None, 89 | metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, 90 | ) 91 | use_fast_tokenizer: bool = field( 92 | default=True, 93 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 94 | ) 95 | model_revision: str = field( 96 | default="main", 97 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 98 | ) 99 | use_auth_token: bool = field( 100 | default=False, 101 | metadata={ 102 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 103 | "with private models)." 104 | }, 105 | ) 106 | 107 | 108 | @dataclass 109 | class DataTrainingArguments: 110 | """ 111 | Arguments pertaining to what data we are going to input our model for training and eval.
112 | """ 113 | 114 | dataset_name: Optional[str] = field( 115 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 116 | ) 117 | dataset_config_name: Optional[str] = field( 118 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 119 | ) 120 | context_column: Optional[str] = field( 121 | default="context", 122 | metadata={"help": "The name of the column in the datasets containing the contexts (for question answering)."}, 123 | ) 124 | question_column: Optional[str] = field( 125 | default="question", 126 | metadata={"help": "The name of the column in the datasets containing the questions (for question answering)."}, 127 | ) 128 | answer_column: Optional[str] = field( 129 | default="answers", 130 | metadata={"help": "The name of the column in the datasets containing the answers (for question answering)."}, 131 | ) 132 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 133 | validation_file: Optional[str] = field( 134 | default=None, 135 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 136 | ) 137 | test_file: Optional[str] = field( 138 | default=None, 139 | metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, 140 | ) 141 | overwrite_cache: bool = field( 142 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 143 | ) 144 | preprocessing_num_workers: Optional[int] = field( 145 | default=None, 146 | metadata={"help": "The number of processes to use for the preprocessing."}, 147 | ) 148 | max_seq_length: int = field( 149 | default=128, 150 | metadata={ 151 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 152 | "than this will be truncated, sequences shorter will be padded." 153 | }, 154 | ) 155 | max_answer_length: int = field( 156 | default=30, 157 | metadata={ 158 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 159 | "and end predictions are not conditioned on one another." 160 | }, 161 | ) 162 | val_max_answer_length: Optional[int] = field( 163 | default=None, 164 | metadata={ 165 | "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " 166 | "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`." 167 | "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " 168 | "during ``evaluate`` and ``predict``." 169 | }, 170 | ) 171 | pad_to_max_length: bool = field( 172 | default=True, 173 | metadata={ 174 | "help": "Whether to pad all samples to `max_seq_length`. " 175 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 176 | "be faster on GPU but will be slower on TPU)." 177 | }, 178 | ) 179 | max_train_samples: Optional[int] = field( 180 | default=None, 181 | metadata={ 182 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 183 | "value if set." 184 | }, 185 | ) 186 | max_eval_samples: Optional[int] = field( 187 | default=None, 188 | metadata={ 189 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 190 | "value if set." 
191 | }, 192 | ) 193 | max_predict_samples: Optional[int] = field( 194 | default=None, 195 | metadata={ 196 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 197 | "value if set." 198 | }, 199 | ) 200 | version_2_with_negative: bool = field( 201 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 202 | ) 203 | null_score_diff_threshold: float = field( 204 | default=0.0, 205 | metadata={ 206 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 207 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 208 | "Only useful when `version_2_with_negative=True`." 209 | }, 210 | ) 211 | doc_stride: int = field( 212 | default=128, 213 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 214 | ) 215 | n_best_size: int = field( 216 | default=20, 217 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 218 | ) 219 | num_beams: Optional[int] = field( 220 | default=None, 221 | metadata={ 222 | "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " 223 | "which is used during ``evaluate`` and ``predict``." 224 | }, 225 | ) 226 | ignore_pad_token_for_loss: bool = field( 227 | default=True, 228 | metadata={ 229 | "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." 230 | }, 231 | ) 232 | 233 | def __post_init__(self): 234 | if ( 235 | self.dataset_name is None 236 | and self.train_file is None 237 | and self.validation_file is None 238 | and self.test_file is None 239 | ): 240 | raise ValueError("Need either a dataset name or a training/validation file/test_file.") 241 | else: 242 | if self.train_file is not None: 243 | extension = self.train_file.split(".")[-1] 244 | assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." 245 | if self.validation_file is not None: 246 | extension = self.validation_file.split(".")[-1] 247 | assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 248 | if self.test_file is not None: 249 | extension = self.test_file.split(".")[-1] 250 | assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 251 | if self.val_max_answer_length is None: 252 | self.val_max_answer_length = self.max_answer_length 253 | 254 | 255 | question_answering_column_name_mapping = { 256 | "squad_v2": ("question", "context", "answer"), 257 | } 258 | 259 | 260 | def main(): 261 | # See all possible arguments in src/transformers/training_args.py 262 | # or by passing the --help flag to this script. 263 | # We now keep distinct sets of args, for a cleaner separation of concerns. 264 | 265 | parser = HfArgumentParser((TaskArguments, ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) 266 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 267 | # If we pass only one argument to the script and it's the path to a json file, 268 | # let's parse it to get our arguments. 
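        # e.g. `python run_seq2seq_qa.py args.json`, where args.json maps the dataclass fields above to
        # values (illustrative file, not part of the repository):
        #
        #     {"model_name_or_path": "google/mt5-base", "output_dir": "./output/arg_seq2seqmrc/mt5-base",
        #      "train_file": "../dataset/train.csv", "validation_file": "../dataset/dev.csv",
        #      "test_file": "../dataset/test.csv", "do_train": true, "predict_with_generate": true}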
269 | task_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 270 | else: 271 | task_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses() 272 | 273 | # training_args.do_eval = False 274 | 275 | # Setup logging 276 | logging.basicConfig( 277 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 278 | datefmt="%m/%d/%Y %H:%M:%S", 279 | handlers=[logging.StreamHandler(sys.stdout)], 280 | ) 281 | 282 | log_level = training_args.get_process_log_level() 283 | logger.setLevel(log_level) 284 | datasets.utils.logging.set_verbosity(log_level) 285 | transformers.utils.logging.set_verbosity(log_level) 286 | transformers.utils.logging.enable_default_handler() 287 | transformers.utils.logging.enable_explicit_format() 288 | 289 | # Log on each process the small summary: 290 | logger.warning( 291 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 292 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 293 | ) 294 | logger.info(f"Training/evaluation parameters {training_args}") 295 | 296 | # Detecting last checkpoint. 297 | last_checkpoint = None 298 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 299 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 300 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 301 | raise ValueError( 302 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 303 | "Use --overwrite_output_dir to overcome." 304 | ) 305 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 306 | logger.info( 307 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 308 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 309 | ) 310 | 311 | # Set seed before initializing model. 312 | set_seed(training_args.seed) 313 | 314 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 315 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 316 | # (the dataset will be downloaded automatically from the datasets Hub). 317 | # 318 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 319 | # 'text' is found. You can easily tweak this behavior (see below). 320 | # 321 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 322 | # download the dataset. 323 | raw_datasets = load_my_datasets_for_mrc(data_args, task_args) 324 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 325 | # https://huggingface.co/docs/datasets/loading_datasets.html. 326 | 327 | # Load pretrained model and tokenizer 328 | # 329 | # Distributed training: 330 | # The .from_pretrained methods guarantee that only one local process can concurrently 331 | # download model & vocab. 
332 | logger.info("loading pretrained model and tokenizer") 333 | config = AutoConfig.from_pretrained( 334 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 335 | cache_dir=model_args.cache_dir, 336 | revision=model_args.model_revision, 337 | use_auth_token=True if model_args.use_auth_token else None, 338 | ) 339 | tokenizer = AutoTokenizer.from_pretrained( 340 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 341 | use_fast=True, 342 | cache_dir=model_args.cache_dir, 343 | revision=model_args.model_revision, 344 | use_auth_token=True if model_args.use_auth_token else None, 345 | ) 346 | model = AutoModelForSeq2SeqLM.from_pretrained( 347 | model_args.model_name_or_path, 348 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 349 | config=config, 350 | cache_dir=model_args.cache_dir, 351 | revision=model_args.model_revision, 352 | use_auth_token=True if model_args.use_auth_token else None, 353 | ) 354 | 355 | model.resize_token_embeddings(len(tokenizer)) 356 | 357 | if model.config.decoder_start_token_id is None: 358 | raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") 359 | 360 | # Preprocessing the datasets. 361 | # We need to generate and tokenize inputs and targets. 362 | if training_args.do_train: 363 | column_names = raw_datasets["train"].column_names 364 | elif training_args.do_eval: 365 | column_names = raw_datasets["validation"].column_names 366 | elif training_args.do_predict: 367 | column_names = raw_datasets["test"].column_names 368 | else: 369 | logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") 370 | return 371 | 372 | # Get the column names for input/target. 373 | dataset_columns = question_answering_column_name_mapping.get(data_args.dataset_name, None) 374 | if data_args.question_column is None: 375 | question_column = dataset_columns[0] if dataset_columns is not None else column_names[0] 376 | else: 377 | question_column = data_args.question_column 378 | if question_column not in column_names: 379 | raise ValueError( 380 | f"--question_column' value '{data_args.question_column}' needs to be one of: {', '.join(column_names)}" 381 | ) 382 | if data_args.context_column is None: 383 | context_column = dataset_columns[1] if dataset_columns is not None else column_names[1] 384 | else: 385 | context_column = data_args.context_column 386 | if context_column not in column_names: 387 | raise ValueError( 388 | f"--context_column' value '{data_args.context_column}' needs to be one of: {', '.join(column_names)}" 389 | ) 390 | if data_args.answer_column is None: 391 | answer_column = dataset_columns[2] if dataset_columns is not None else column_names[2] 392 | else: 393 | answer_column = data_args.answer_column 394 | if answer_column not in column_names: 395 | raise ValueError( 396 | f"--answer_column' value '{data_args.answer_column}' needs to be one of: {', '.join(column_names)}" 397 | ) 398 | 399 | # Temporarily set max_answer_length for training. 400 | max_answer_length = data_args.max_answer_length 401 | padding = "max_length" if data_args.pad_to_max_length else False 402 | 403 | if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): 404 | logger.warning( 405 | "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" 406 | f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" 407 | ) 408 | 409 | if data_args.max_seq_length > tokenizer.model_max_length: 410 | logger.warning( 411 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 412 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 413 | ) 414 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 415 | 416 | def preprocess_squad_batch( 417 | examples, 418 | question_column: str, 419 | context_column: str, 420 | answer_column: str, 421 | ) -> Tuple[List[str], List[str]]: 422 | questions = examples[question_column] 423 | contexts = examples[context_column] 424 | answers = examples[answer_column] 425 | 426 | def generate_input(_question, _context): 427 | return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) 428 | 429 | inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] 430 | # Use the first gold answer as the generation target; unanswerable questions map to an empty string. 431 | targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] 432 | 433 | return inputs, targets 434 | 435 | def preprocess_function(examples): 436 | # modified: convert answers serialized as strings by the CSV round-trip (e.g. "{'text': [...], 'answer_start': [...]}") back into dicts 437 | if isinstance(examples[answer_column][0], str): 438 | for i in range(len(examples[answer_column])): 439 | examples[answer_column][i] = eval(examples[answer_column][i]) 440 | # (ast.literal_eval would be a safer drop-in for eval on these pandas-serialized dicts) 441 | 442 | inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) 443 | 444 | model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) 445 | # Setup the tokenizer for targets 446 | with tokenizer.as_target_tokenizer(): 447 | labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) 448 | 449 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore 450 | # padding in the loss. 451 | if padding == "max_length" and data_args.ignore_pad_token_for_loss: 452 | labels["input_ids"] = [ 453 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 454 | ] 455 | 456 | model_inputs["labels"] = labels["input_ids"] 457 | return model_inputs 458 | 459 | # Validation preprocessing 460 | def preprocess_validation_function(examples): 461 | # modified: convert answers serialized as strings by the CSV back into dicts (see preprocess_function) 462 | if isinstance(examples[answer_column][0], str): 463 | for i in range(len(examples[answer_column])): 464 | examples[answer_column][i] = eval(examples[answer_column][i]) 465 | inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) 466 | 467 | model_inputs = tokenizer( 468 | inputs, 469 | max_length=max_seq_length, 470 | padding=padding, 471 | truncation=True, 472 | return_overflowing_tokens=True, 473 | return_offsets_mapping=True, 474 | ) 475 | # Setup the tokenizer for targets 476 | with tokenizer.as_target_tokenizer(): 477 | labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) 478 | 479 | # Since one example might give us several features if it has a long context, we need a map from a feature to 480 | # its corresponding example. This key gives us just that.
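# Illustrative (hypothetical) values: with return_overflowing_tokens=True, a batch of two examples
# in which the first one overflows into two features would come back with
#   model_inputs["overflow_to_sample_mapping"] == [0, 0, 1]
# so features 0 and 1 are mapped to examples["id"][0] below, and feature 2 to examples["id"][1].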
481 | sample_mapping = model_inputs.pop("overflow_to_sample_mapping") 482 | 483 | # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the 484 | # corresponding example_id and we will store the offset mappings. 485 | model_inputs["example_id"] = [] 486 | 487 | for i in range(len(model_inputs["input_ids"])): 488 | # One example can give several spans, this is the index of the example containing this span of text. 489 | sample_index = sample_mapping[i] 490 | model_inputs["example_id"].append(examples["id"][sample_index]) 491 | 492 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore 493 | # padding in the loss. 494 | if padding == "max_length" and data_args.ignore_pad_token_for_loss: 495 | labels["input_ids"] = [ 496 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 497 | ] 498 | 499 | model_inputs["labels"] = labels["input_ids"] 500 | return model_inputs 501 | 502 | if training_args.do_train: 503 | if "train" not in raw_datasets: 504 | raise ValueError("--do_train requires a train dataset") 505 | train_dataset = raw_datasets["train"] 506 | if data_args.max_train_samples is not None: 507 | # We will select sample from whole data if agument is specified 508 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 509 | train_dataset = train_dataset.select(range(max_train_samples)) 510 | # Create train feature from dataset 511 | with training_args.main_process_first(desc="train dataset map pre-processing"): 512 | train_dataset = train_dataset.map( 513 | preprocess_function, 514 | batched=True, 515 | num_proc=data_args.preprocessing_num_workers, 516 | remove_columns=column_names, 517 | load_from_cache_file=not data_args.overwrite_cache, 518 | desc="Running tokenizer on train dataset", 519 | ) 520 | if data_args.max_train_samples is not None: 521 | # Number of samples might increase during Feature Creation, We select only specified max samples 522 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 523 | train_dataset = train_dataset.select(range(max_train_samples)) 524 | 525 | if training_args.do_eval: 526 | if "validation" not in raw_datasets: 527 | raise ValueError("--do_eval requires a validation dataset") 528 | eval_examples = raw_datasets["validation"] 529 | if data_args.max_eval_samples is not None: 530 | # We will select sample from whole data 531 | max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) 532 | eval_examples = eval_examples.select(range(max_eval_samples)) 533 | # Validation Feature Creation 534 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 535 | eval_dataset = eval_examples.map( 536 | preprocess_validation_function, 537 | batched=True, 538 | num_proc=data_args.preprocessing_num_workers, 539 | remove_columns=column_names, 540 | load_from_cache_file=not data_args.overwrite_cache, 541 | desc="Running tokenizer on validation dataset", 542 | ) 543 | if data_args.max_eval_samples is not None: 544 | # During Feature creation dataset samples might increase, we will select required samples again 545 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 546 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 547 | 548 | if training_args.do_predict: 549 | if "test" not in raw_datasets: 550 | raise ValueError("--do_predict requires a test dataset") 551 | predict_examples = raw_datasets["test"] 552 | if 
data_args.max_predict_samples is not None: 553 | # We will select samples from the whole data (capped at the dataset size, as for train/eval) 554 | predict_examples = predict_examples.select(range(min(len(predict_examples), data_args.max_predict_samples))) 555 | # Predict Feature Creation 556 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 557 | predict_dataset = predict_examples.map( 558 | preprocess_validation_function, 559 | batched=True, 560 | num_proc=data_args.preprocessing_num_workers, 561 | remove_columns=column_names, 562 | load_from_cache_file=not data_args.overwrite_cache, 563 | desc="Running tokenizer on prediction dataset", 564 | ) 565 | if data_args.max_predict_samples is not None: 566 | # Feature creation might increase the number of samples, so we select the required number again 567 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 568 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 569 | 570 | # Data collator 571 | label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id 572 | data_collator = DataCollatorForSeq2Seq( 573 | tokenizer, 574 | model=model, 575 | label_pad_token_id=label_pad_token_id, 576 | pad_to_multiple_of=8 if training_args.fp16 else None, 577 | ) 578 | 579 | logger.info("***** loading metric *****") 580 | # squad_v2 additionally scores unanswerable questions (via no_answer_probability) 581 | metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") 582 | 583 | def compute_metrics(p: EvalPrediction): 584 | if isinstance(p.label_ids[0]['answers'], str):  # references may still carry CSV-serialized answer dicts 585 | for i in range(len(p.label_ids)): 586 | p.label_ids[i]['answers'] = eval(p.label_ids[i]['answers']) 587 | return metric.compute(predictions=p.predictions, references=p.label_ids) 588 | 589 | # Post-processing: 590 | def post_processing_function( 591 | examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval" 592 | ): 593 | 594 | # Decode the predicted tokens. `outputs` may arrive as an EvalLoopOutput-like 595 | # tuple, in which case its first element holds the generated token ids 596 | # (the upstream script reads `outputs.predictions` explicitly). 597 | preds = outputs 598 | if isinstance(preds, tuple): 599 | preds = preds[0] 600 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 601 | 602 | # Build a map from each example to its corresponding features. 603 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])} 604 | feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} 605 | predictions = {} 606 | # Let's loop over all the examples! 607 | for example_index, example in enumerate(examples): 608 | # This is the index of the feature associated with the current example. 609 | feature_index = feature_per_example[example_index] 610 | predictions[example["id"]] = decoded_preds[feature_index] 611 | 612 | # Format the result to the format the metric expects.
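# For reference, the squad_v2 metric expects entries shaped like the following (hypothetical values):
#   predictions: {"id": "123_0", "prediction_text": "answer text", "no_answer_probability": 0.0}
#   references:  {"id": "123_0", "answers": {"text": ["answer text"], "answer_start": [5]}}
# The plain squad metric uses the same shapes without the no_answer_probability field.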
613 | if data_args.version_2_with_negative: 614 | formatted_predictions = [ 615 | {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() 616 | ] 617 | else: 618 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 619 | 620 | references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples] 621 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 622 | 623 | # Initialize our Trainer 624 | trainer = QuestionAnsweringSeq2SeqTrainer( 625 | model=model, 626 | args=training_args, 627 | train_dataset=train_dataset if training_args.do_train else None, 628 | eval_dataset=eval_dataset if training_args.do_eval else None, 629 | eval_examples=eval_examples if training_args.do_eval else None, 630 | tokenizer=tokenizer, 631 | data_collator=data_collator, 632 | compute_metrics=compute_metrics, 633 | post_process_function=post_processing_function, 634 | ) 635 | 636 | # Training 637 | if training_args.do_train: 638 | checkpoint = None 639 | if training_args.resume_from_checkpoint is not None: 640 | checkpoint = training_args.resume_from_checkpoint 641 | elif last_checkpoint is not None: 642 | checkpoint = last_checkpoint 643 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 644 | trainer.save_model() # Saves the tokenizer too for easy upload 645 | 646 | metrics = train_result.metrics 647 | max_train_samples = ( 648 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 649 | ) 650 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 651 | 652 | trainer.log_metrics("train", metrics) 653 | trainer.save_metrics("train", metrics) 654 | trainer.save_state() 655 | 656 | # Evaluation 657 | results = {} 658 | max_length = ( 659 | training_args.generation_max_length 660 | if training_args.generation_max_length is not None 661 | else data_args.val_max_answer_length 662 | ) 663 | num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams 664 | if training_args.do_eval: 665 | logger.info("*** Evaluate ***") 666 | metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") 667 | 668 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 669 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 670 | 671 | trainer.log_metrics("eval", metrics) 672 | trainer.save_metrics("eval", metrics) 673 | 674 | # Prediction 675 | if training_args.do_predict: 676 | logger.info("*** Predict ***") 677 | results = trainer.predict(predict_dataset, predict_examples) 678 | metrics = results.metrics 679 | 680 | max_predict_samples = ( 681 | data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) 682 | ) 683 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 684 | 685 | trainer.log_metrics("predict", metrics) 686 | trainer.save_metrics("predict", metrics) 687 | 688 | # **** save prediction file **** 689 | # read predictions 690 | logger.info("postprocessing output") 691 | ids = [] 692 | preds = [] 693 | title_id_lst = [] 694 | title_lst = [] 695 | trigger_lst = [] 696 | gold_triple_lst = [] 697 | event_triple_lst = [] 698 | for idx, item in enumerate(raw_datasets['test']): 699 | ids.append(results[0][idx]['id']) 700 | ans = results[0][idx]['prediction_text'] 701 | if not ans: ans = "" 702 | preds.append(ans) 703 | 
title_id_lst.append(int(item['title_id'])) 704 | title_lst.append(item['context']) 705 | gold_triple_lst.append(item['gold_answer_triples']) 706 | trigger_lst.append(item['trigger']) 707 | event_triple_lst.append(item['triple']) 708 | pred_df = pd.DataFrame({"title_id": title_id_lst, "title": title_lst, "trigger": trigger_lst, "event_triples": event_triple_lst, "gold_answer_triples": gold_triple_lst, "preds": preds}) 709 | # Aggregate the subject and object answers predicted for the same event triple (re-inserting the 710 | # trigger between them), then collect all predicted triples belonging to the same title into one row. 711 | pred_df['idx'] = pred_df.index.tolist() 712 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: str(x))  # stringify so the triple can serve as a groupby key 713 | pred_df = pred_df.groupby(['title_id', 'title', 'event_triples']).agg(list).sort_values(by='idx', axis=0).reset_index() 714 | pred_df['event_triples'] = pred_df['event_triples'].apply(lambda x: eval(x)) 715 | pred_df['trigger'] = pred_df['trigger'].apply(lambda x: x[0]) 716 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 717 | pred_df.apply(lambda row: row.preds.insert(1, row.trigger), axis=1)  # in-place: insert the trigger between the two role answers to form a predicted triple 718 | pred_df.rename({"preds": "pred_event_triples"}, axis=1, inplace=True) 719 | pred_df['idx'] = pred_df.index.tolist() 720 | pred_df = pred_df.groupby(['title_id', 'title']).agg(list).sort_values(by='idx', axis=0).reset_index() 721 | pred_df['gold_answer_triples'] = pred_df['gold_answer_triples'].apply(lambda x: x[0]) 722 | pred_df = pred_df[['title_id', 'title', 'gold_answer_triples', 'pred_event_triples']] 723 | pred_df.rename({"gold_answer_triples": "event_triples"}, axis=1, inplace=True) 724 | 725 | if task_args.output_filename: 726 | output_name = task_args.output_filename 727 | else: 728 | output_name = "pipeline_predictions.csv" if task_args.pred_trg_file else "arg_predictions.csv" 729 | 730 | output_predictions_file = os.path.join(training_args.output_dir, output_name) 731 | pred_df.to_csv(output_predictions_file, index=False) 732 | logger.info("saved predictions to %s" % output_predictions_file) 733 | 734 | if training_args.push_to_hub: 735 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} 736 | if data_args.dataset_name is not None: 737 | kwargs["dataset_tags"] = data_args.dataset_name 738 | if data_args.dataset_config_name is not None: 739 | kwargs["dataset_args"] = data_args.dataset_config_name 740 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 741 | else: 742 | kwargs["dataset"] = data_args.dataset_name 743 | 744 | trainer.push_to_hub(**kwargs) 745 | 746 | 747 | def _mp_fn(index): 748 | # For xla_spawn (TPUs) 749 | main() 750 | 751 | 752 | if __name__ == "__main__": 753 | main() 754 | --------------------------------------------------------------------------------