├── data └── .gitkeep ├── src ├── datatuner │ ├── __init__.py │ ├── lm │ │ ├── __init__.py │ │ ├── converters.py │ │ ├── launch_tokenizer.py │ │ ├── process_json.py │ │ ├── custom_gpt2.py │ │ ├── special_token_generator.py │ │ ├── custom_tokenizer.py │ │ ├── reranker.py │ │ ├── utils.py │ │ ├── model_loader.py │ │ ├── novograd.py │ │ ├── cross_entropy.py │ │ └── metrics.py │ ├── ops │ │ └── mlflow.py │ ├── classification │ │ ├── consistency_classifier.py │ │ ├── consistency_processor.py │ │ ├── distractors.py │ │ └── classify_generated.py │ └── utils.py └── external │ ├── __init__.py │ ├── ufal_dsg_tgen │ ├── __init__.py │ └── data.py │ ├── jjuraska_slug2slug │ ├── __init__.py │ ├── slot_aligner │ │ ├── alignment │ │ │ ├── numeric_slot.py │ │ │ ├── utils.py │ │ │ ├── list_slot.py │ │ │ ├── scalar_slot.py │ │ │ ├── categorical_slots.py │ │ │ ├── alternatives.json │ │ │ └── boolean_slot.py │ │ └── slot_extraction.py │ ├── config.py │ └── slug2slug_ser.py │ ├── shimorina_inlg_2018 │ ├── __init__.py │ └── webnlg_slot_error_rate.py │ ├── tuetschek_e2e_cleaning │ └── __init__.py │ ├── webnlg_webnlg_baseline │ ├── __init__.py │ ├── benchmark_reader.py │ └── webnlg_baseline_input.py │ ├── ukplab_emnlp2019_dualgraph │ ├── preprocess_LDC2017T10.sh │ ├── split_amr.py │ └── gen_LDC2017T10.sh │ └── README.md ├── paper ├── experiments │ ├── e2e │ │ ├── __init__.py │ │ └── preprocess.py │ ├── ldc │ │ ├── __init__.py │ │ └── preprocess.py │ ├── viggo │ │ ├── __init__.py │ │ └── preprocess.py │ ├── webnlg │ │ ├── __init__.py │ │ ├── webnlg_utils.py │ │ └── preprocess.py │ └── mturk │ │ ├── README.md │ │ └── text_stats.py ├── evaluate_lm_simple.sh ├── train_classifier.sh ├── eval_with_classifier.sh ├── task_configs │ ├── viggo.json │ ├── e2e.json │ ├── ldc.json │ ├── e2e_cg.json │ ├── viggo_cg.json │ ├── ldc_cg.json │ ├── webnlg.json │ └── webnlg_cg.json ├── lm_training_args │ ├── ldc │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json │ ├── viggo │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json │ ├── webnlg │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json │ └── e2e │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json ├── train_lm.sh ├── config.sh ├── preprocess.sh ├── classifier_training_args │ ├── e2e │ │ └── e2e_model_training_args.json │ ├── ldc │ │ └── ldc_model_training_args.json │ ├── webnlg │ │ └── webnlg_model_training_args.json │ └── viggo │ │ └── viggo_model_training_args.json ├── evaluate_lm.sh ├── retrieve.sh └── README.md ├── CODE_OF_CONDUCT.md ├── setup.py ├── environment.yml ├── setup.sh ├── CONTRIBUTING.md └── README.md /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatuner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatuner/lm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
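A note on the layout above: the Python sources live under src/, with the core package in src/datatuner and vendored third-party code under src/external, while paper/ holds the shell scripts, task configs, and training-argument files used to reproduce the experiments. Once the package is installed in development mode (setup.sh later in this dump runs python setup.py develop), modules are imported by these package paths, exactly as the repository's own files do. A minimal illustrative usage follows, assuming the finetune conda environment from environment.yml is active; the MR string passed to clean_mrl is a made-up example, not taken from any dataset.

from datatuner.lm.converters import clean_mrl    # defined in src/datatuner/lm/converters.py
from external.jjuraska_slug2slug import config    # defined in src/external/jjuraska_slug2slug/config.py

# clean_mrl spaces out parentheses and drops underscores/dots from a meaning representation
print(clean_mrl("inform(name[Example_Game])"))
print(config.SLOT_ALIGNER_ALTERNATIVES)           # path to the slot aligner's alternatives.json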
/paper/experiments/e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/experiments/ldc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/experiments/viggo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/experiments/webnlg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/ufal_dsg_tgen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/shimorina_inlg_2018/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/tuetschek_e2e_cleaning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/webnlg_webnlg_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatuner/lm/converters.py: -------------------------------------------------------------------------------- 1 | # Converter functions available to apply to text fields 2 | 3 | def clean_mrl(mrl): 4 | return mrl.replace("_", " ").replace(".", " ").replace("(", " ( ").replace(")", " ) ") 5 | 6 | 7 | converters = {"clean_mrl": clean_mrl} 8 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="datatuner", 5 | version="1.0", 6 | description="Natural Language Generation Library", 7 | packages=find_packages(), 8 | package_dir={"": "src"}, 9 | package_data={}, 10 | install_requires=[], 11 | extras_require={}, 12 | zip_safe=False, 13 | tests_require=[], 14 | ) 15 | -------------------------------------------------------------------------------- /paper/evaluate_lm_simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | 4 | TEST_FILE=$1 5 | MODEL=$2 6 | 7 | echo "Evaluating $TEST_FILE with the model in $MODEL" 8 | 9 | $python ../src/datatuner/lm/evaluate.py \ 10 | --filename $TEST_FILE \ 11 | --no_sample \ 12 | --model_checkpoint $MODEL \ 13 | --nbest 5 \ 14 | --beam_width 5 \ 15 | --per_step_predictions 5 \ 16 | --averaging default \ 17 | --beam_alpha 0.75 \ 18 | --model_type gpt2 -------------------------------------------------------------------------------- /src/datatuner/ops/mlflow.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | 3 | 4 | def get_artifact(run_id, path): 5 | client = mlflow.tracking.MlflowClient() 6 | return client.download_artifacts(run_id, path) 7 | 8 | 9 | def get_finished_models(experiments): 10 | client = mlflow.tracking.MlflowClient() 11 | runs = client.search_runs(experiments, filter_string="metrics.finished=1") 12 | run_ids = [x.info.run_id for x in runs] 13 | return run_ids 14 | -------------------------------------------------------------------------------- /src/datatuner/lm/launch_tokenizer.py: -------------------------------------------------------------------------------- 1 | from datatuner.lm.model_loader import load_pretrained_tokenizer 2 | from fire import Fire 3 | 4 | 5 | def launch(model_checkpoint, model_type="gpt2"): 6 | tokenizer = load_pretrained_tokenizer(model_checkpoint, model_type) 7 | 8 | while True: 9 | tokenized = tokenizer.tokenize(input("text >>> ")) 10 | 11 | print(tokenized) 12 | print(tokenizer.convert_tokens_to_ids(tokenized)) 13 | 14 | 15 | if __name__ == "__main__": 16 | Fire(launch) 17 | -------------------------------------------------------------------------------- /paper/train_classifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | TRAINING_DATA_FOLDER=$1 4 | OUTPUT_FOLDER=$2 5 | TRAINING_ARGS=$3 6 | NUM_PARALLEL=$4 7 | 8 | if [ -z "$NUM_PARALLEL" ]; then 9 | NUM_PARALLEL=1 10 | fi 11 | 12 | mkdir -p $OUTPUT_FOLDER 13 | 14 | echo "Training the classifier and writing the trained model to $OUTPUT_FOLDER" 15 | 16 | $python -m torch.distributed.launch --nproc_per_node=$NUM_PARALLEL ../src/datatuner/classification/run_classifier.py \ 17 | --data_dir $TRAINING_DATA_FOLDER \ 18 | --output_dir $OUTPUT_FOLDER \ 19 | --retrain_base $TRAINING_ARGS -------------------------------------------------------------------------------- /paper/eval_with_classifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # example: bash eval_with_classifier.sh ./data/consistency/viggo eval_results ~/trained_classifiers/viggo/ amrl text 3 | source ./config.sh 4 | 5 | TRAINING_DATA_FOLDER=$1 6 | GENERATED_DATA_FOLDER=$2 7 | MODEL_FOLDER=$3 8 
| DATA_KEY=$4 9 | TEXT_KEY=$5 10 | 11 | cp $TRAINING_DATA_FOLDER/labels.txt $GENERATED_DATA_FOLDER/labels.txt 12 | 13 | $python ../src/datatuner/classification/classify_generated.py generate \ 14 | --in_file $GENERATED_DATA_FOLDER/generated.json \ 15 | --model_folder $MODEL_FOLDER \ 16 | --data_key $DATA_KEY \ 17 | --text_key $TEXT_KEY -------------------------------------------------------------------------------- /paper/experiments/webnlg/webnlg_utils.py: -------------------------------------------------------------------------------- 1 | from xml.etree import ElementTree 2 | 3 | 4 | def camel_case_split(s): 5 | words = [[s[0]]] 6 | 7 | for c in s[1:]: 8 | if words[-1][-1].islower() and c.isupper(): 9 | words.append(list(c)) 10 | else: 11 | words[-1].append(c) 12 | 13 | return " ".join(["".join(word).lower() for word in words]) 14 | 15 | 16 | def cleanup(s): 17 | if type(s) != str: 18 | s = ElementTree.tostring(s, encoding="unicode") 19 | s = s.replace("\t", " ").replace("\n", " ").replace("_", " ") 20 | s = " ".join(s.split(" ")).strip() 21 | return s 22 | -------------------------------------------------------------------------------- /paper/task_configs/viggo.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "viggo", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /paper/task_configs/e2e.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "e2e_dataset", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /paper/task_configs/ldc.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ldc", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "linearized_amr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "answer_text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /paper/lm_training_args/ldc/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/ldc.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 3, 6 | "valid_batch_size": 4, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 20, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | 
"ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": 350 26 | } -------------------------------------------------------------------------------- /paper/task_configs/e2e_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "e2e_dataset_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "token_typing": "coarse_grained" 30 | } -------------------------------------------------------------------------------- /paper/task_configs/viggo_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "viggo_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "token_typing": "coarse_grained" 30 | } -------------------------------------------------------------------------------- /paper/lm_training_args/viggo/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/viggo.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/webnlg/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/webnlg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 2, 12 | "n_epochs": 15, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": false, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 3, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": 200 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/e2e/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/e2e_dataset_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 10, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/ldc/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/ldc_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 3, 6 | "valid_batch_size": 4, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 20, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": 350 26 | } -------------------------------------------------------------------------------- /paper/task_configs/ldc_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ldc_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "linearized_amr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "answer_text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "token_typing": "coarse_grained" 30 | } -------------------------------------------------------------------------------- /paper/lm_training_args/viggo/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/viggo_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": -1, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/webnlg/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/webnlg_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | 
"gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 2, 12 | "n_epochs": 15, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": -1, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": 200 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/e2e/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/e2e_dataset_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 10, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 1, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/train_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | 4 | DATASET=$1 5 | SYSTEM=$2 6 | OUTPUT_FOLDER=$3 7 | NUM_PARALLEL=$4 8 | 9 | if [ -z "$NUM_PARALLEL" ]; then 10 | NUM_PARALLEL=1 11 | fi 12 | 13 | SUFFIX="" 14 | if [[ "$SYSTEM" = "DataTuner_No_FC_No_FS" ]]; then 15 | SUFFIX="_cg" 16 | fi 17 | 18 | echo "Training the model for the dataset $DATASET and writing the trained model to $OUTPUT_FOLDER" 19 | 20 | $python -m torch.distributed.launch --nproc_per_node=$NUM_PARALLEL ../src/datatuner/lm/train.py \ 21 | --retrain_base ./lm_training_args/$DATASET/${SYSTEM}_model_training_args.json \ 22 | --logdir $OUTPUT_FOLDER \ 23 | --dataset_path ../data/$DATASET \ 24 | --task_config ./task_configs/${DATASET}${SUFFIX}.json \ 25 | --ignore_cache \ 26 | --overwrite_output_dir -------------------------------------------------------------------------------- /src/external/ukplab_emnlp2019_dualgraph/preprocess_LDC2017T10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo "./preprocess_LDC2017T10.sh " 5 | exit 2 6 | fi 7 | 8 | ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | 10 | bash ${ROOT_DIR}/process_amr/gen_LDC2017T10.sh ${1} 11 | 12 | python ${ROOT_DIR}/process_amr/generate_input_opennmt.py -i ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/ 13 | 14 | mkdir -p ${ROOT_DIR}/data/ldc2017t10 15 | mv ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/dev-* ${ROOT_DIR}/data/ldc2017t10 16 | mv ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/test-* ${ROOT_DIR}/data/ldc2017t10 17 | mv ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/train-* ${ROOT_DIR}/data/ldc2017t10 18 | 19 | rm -rf data/ldc2017t10.* -------------------------------------------------------------------------------- /src/external/README.md: -------------------------------------------------------------------------------- 1 | # External Packages 2 | 3 | This directory contains subsections of other repositories who provide 
tokenization and evaluation scripts for the data sets used. 4 | The required code is included here for ease of reproducibility. 5 | 6 | See also [NOTICE.txt](../../NOTICE.txt). 7 | 8 | ### Sources 9 | 10 | **/jjuraska_slug2slug** from https://github.com/jjuraska/slug2slug 11 | 12 | 13 | **/shimorina_inlg_2018** from https://gitlab.com/shimorina/inlg-2018 14 | 15 | **/webnlg_webnlg_baseline** from https://gitlab.com/webnlg/webnlg-baseline 16 | 17 | **/tuetschek_e2e_cleaning** from https://github.com/tuetschek/e2e-cleaning 18 | 19 | **/ufal_dsg_tgen** from https://github.com/UFAL-DSG/tgen 20 | 21 | **/ukplab_emnlp2019_dualgraph** from https://github.com/UKPLab/emnlp2019-dualgraph 22 | -------------------------------------------------------------------------------- /paper/experiments/mturk/README.md: -------------------------------------------------------------------------------- 1 | # Preparing MTurk Data 2 | 3 | This directory contains scripts used for sampling the system generated outputs for human annotation, and the results of these annotations. 4 | 5 | ## Generate fluency data for annotation 6 | 7 | `python experiments/mturk/prepare_mturk.py prepare ~/system_outputs/ ~/mturk_fluency/ fluency` 8 | 9 | 10 | ## Generate fidelity data for annotation 11 | `python experiments/mturk/prepare_mturk.py prepare ~/system_outputs/ ~/mturk_fidelity/ fidelity` 12 | 13 | 14 | ## Score the fluency annotations 15 | `python experiments/mturk/prepare_mturk.py score ~/system_outputs_test/ ~/mturk_fluency/ fluency ./experiments/mturk/` 16 | 17 | ## Score the fidelity annotations 18 | `python experiments/mturk/prepare_mturk.py score ~/system_outputs_test/ ~/mturk_fidelity/ fidelity ./experiments/mturk/` -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: finetune 2 | channels: 3 | - pytorch 4 | - powerai 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - fairseq 9 | - spacy=2.0.12 10 | - fire 11 | - pip 12 | - tqdm 13 | - python=3.7.3 14 | - cudatoolkit=9.2 15 | - pytorch>=1.1.0 16 | - tensorboardx=1.8 17 | - tensorflow=1.13.1 18 | - ftfy=5.5.1 19 | - mxnet 20 | - nltk 21 | - ipdb 22 | - ipython 23 | - isort 24 | - scikit-learn<=0.21.3 25 | - python-annoy 26 | - mlflow=1.3 27 | - scipy 28 | - pyspark 29 | - tabulate=0.8.6 30 | - pip: 31 | - sacrebleu 32 | - scikit-posthocs 33 | - pytorch-ignite==0.2.1 34 | - transformers==2.3.0 35 | - sentence-transformers==0.2.5 36 | - pandas_ml 37 | - streamlit==0.52.2 38 | - matplotlib 39 | - pandas==0.24.2 40 | - sentencepiece==0.1.91 41 | - textstat 42 | - mlxtend -------------------------------------------------------------------------------- /paper/task_configs/webnlg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webnlg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "modifiedtripleset", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "extra_fields": [ 30 | "category_type", 31 | "category", 32 | "num_triples" 33 | ], 34 | "metrics_fields": [ 35 | "category_type", 36 | "category", 37 | "num_triples" 38 | ] 39 | } 40 | 41 | -------------------------------------------------------------------------------- 
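The task configs above (viggo.json, e2e.json, ldc.json and webnlg.json, along with the _cg variants that follow) share one schema: data_shape lists, in order, the segments of a linearized training example. Segments of type "special" are marker tokens (their "id" values show up blank in this dump), segments of type "text" name the field to read from each JSON record, "learn": true appears to mark the segment the language model is trained to generate, and "metrics" lists how that segment is evaluated. The sketch below reads such a config and linearizes one record; it is illustrative only, not DataTuner's actual data loader, and the marker strings and the sample record are placeholders.

import json

def linearize(task_config_path, record, markers=("<data>", "<text>")):
    # Walk the data_shape and build the flat string fed to the LM.
    shape = json.load(open(task_config_path))["data_shape"]
    marker_iter = iter(markers)
    parts, target_field = [], None
    for seg in shape:
        if seg["type"] == "special":
            # the real special tokens are blank in this dump, so substitute placeholders
            parts.append(seg["id"] or next(marker_iter))
        else:  # type "text": pull the named field from the record
            parts.append(str(record[seg["id"]]))
            if seg.get("learn"):
                target_field = seg["id"]  # the segment the model learns to produce
    return " ".join(parts), target_field

# Example with the field names declared in webnlg.json (sample values invented):
record = {"modifiedtripleset": "Alan_Bean | occupation | Test_pilot",
          "text": "Alan Bean served as a test pilot."}
# linearize("./task_configs/webnlg.json", record)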
/paper/task_configs/webnlg_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webnlg_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "modifiedtripleset", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "extra_fields": [ 30 | "category_type", 31 | "category", 32 | "num_triples" 33 | ], 34 | "metrics_fields": [ 35 | "category_type", 36 | "category", 37 | "num_triples" 38 | ], 39 | "token_typing": "coarse_grained" 40 | } 41 | 42 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/numeric_slot.py: -------------------------------------------------------------------------------- 1 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list 2 | 3 | 4 | def align_numeric_slot_with_unit(text, text_tok, slot, value): 5 | value_number = value.split(' ')[0] 6 | try: 7 | float(value_number) 8 | except ValueError: 9 | return -1 10 | 11 | _, pos = find_first_in_list(value_number, text_tok) 12 | 13 | return pos 14 | 15 | 16 | def align_year_slot(text, text_tok, slot, value): 17 | try: 18 | int(value) 19 | except ValueError: 20 | return -1 21 | 22 | year_alternatives = [value] 23 | if len(value) == 4: 24 | year_alternatives.append('\'' + value[-2:]) 25 | year_alternatives.append(value[-2:]) 26 | 27 | for val in year_alternatives: 28 | if len(val) > 2: 29 | pos = text.find(val) 30 | else: 31 | _, pos = find_first_in_list(val, text_tok) 32 | 33 | if pos >= 0: 34 | return pos 35 | 36 | return -1 37 | -------------------------------------------------------------------------------- /paper/config.sh: -------------------------------------------------------------------------------- 1 | echo "reading configuration variables" 2 | 3 | # Folders of data that cannot be automatically downloaded 4 | # Change the two directories below to the correct ones in your case 5 | # Download it from https://catalog.ldc.upenn.edu/LDC2017T10" 6 | LDC2017_DATA_LOCATION=~/Downloads/abstract_meaning_representation_amr_2.0 7 | # Download it from https://nlds.soe.ucsc.edu/viggo" 8 | VIGGO_DATA_LOCATION=~/Downloads/viggo-v1/ 9 | 10 | python=~/miniconda3/envs/finetune/bin/python 11 | 12 | LM_MODELS_DIR=~/trained_lms 13 | CLASSIFIER_MODELS_DIR=~/trained_classifiers 14 | REPO_FOLDER=datatuner 15 | TMP_DATA_FOLDER=./tmp 16 | DATA_FOLDER=../data 17 | 18 | PAPER_FOLDER_PATTERN=$REPO_FOLDER/paper 19 | 20 | # Check if you're running in the correct folder 21 | assert_run_dir() { 22 | # params: current_dir 23 | if [[ "$PWD" != *$1 ]]; then 24 | echo "You should run this script from the folder '$1'. 
Exiting" 25 | exit 26 | fi 27 | } 28 | 29 | newline() { 30 | printf "\n" 31 | } 32 | 33 | assert_run_dir $PAPER_FOLDER_PATTERN 34 | 35 | -------------------------------------------------------------------------------- /src/external/ukplab_emnlp2019_dualgraph/split_amr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | 5 | INPUT = sys.argv[1] 6 | OUT_SURF = sys.argv[2] 7 | OUT_GRAPH = sys.argv[3] 8 | 9 | with open(INPUT) as f: 10 | lines = f.readlines() 11 | 12 | with open(OUT_SURF, "w") as surf, open(OUT_GRAPH, "w") as graph: 13 | amr_mode = False 14 | amr_tokens = [] 15 | for line in lines: 16 | if line.startswith("#"): 17 | if amr_mode: 18 | amr_mode = False 19 | amr = " ".join(amr_tokens) 20 | graph.write(amr + "\n") 21 | amr_tokens = [] 22 | tokens = line.split() 23 | if tokens[1] == "::snt": 24 | sent = " ".join(tokens[2:]) 25 | surf.write(sent + "\n") 26 | elif line.strip() == "": 27 | continue 28 | else: 29 | amr_mode = True 30 | amr_tokens.append(line.strip()) 31 | if amr_mode: 32 | amr_mode = False 33 | amr = " ".join(amr_tokens) 34 | graph.write(amr + "\n") 35 | amr_tokens = [] 36 | -------------------------------------------------------------------------------- /paper/preprocess.sh: -------------------------------------------------------------------------------- 1 | source ./config.sh 2 | 3 | assert_run_dir $PAPER_FOLDER_PATTERN 4 | 5 | echo "Running the data formatting for the LDC dataset" 6 | echo $TMP_DATA_FOLDER 7 | python experiments/ldc/preprocess.py --in_folder $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/data/amr_ldc2017t10 --out_folder $DATA_FOLDER/ldc/ --classification_dir $DATA_FOLDER/ldc_consistency 8 | 9 | newline 10 | 11 | echo "Running the data formatting for the WebNLG dataset" 12 | python experiments/webnlg/preprocess.py --in_folder $TMP_DATA_FOLDER/webnlg/data/v1.4/en/ --out_folder $DATA_FOLDER/webnlg --classification_dir $DATA_FOLDER/webnlg_consistency 13 | 14 | newline 15 | 16 | echo "Running the data formatting for the ViGGO dataset" 17 | python experiments/viggo/preprocess.py --in_folder $VIGGO_DATA_LOCATION --out_folder $DATA_FOLDER/viggo --classification_dir $DATA_FOLDER/viggo_consistency 18 | 19 | newline 20 | 21 | echo "Running the data formatting for the Cleaned E2E dataset" 22 | python experiments/e2e/preprocess.py --in_folder $TMP_DATA_FOLDER/e2e-cleaning/cleaned-data/ --out_folder $DATA_FOLDER/e2e --classification_dir $DATA_FOLDER/e2e_consistency 23 | 24 | newline 25 | echo "Finished preprocessing the training data" -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from external.jjuraska_slug2slug import config 4 | 5 | 6 | def find_first_in_list(val, lst): 7 | idx = -1 8 | pos = -1 9 | 10 | for i, elem in enumerate(lst): 11 | if val == elem: 12 | idx = i 13 | 14 | if idx >= 0: 15 | # Calculate approximate character position of the matched value 16 | punct_cnt = lst[:idx].count('.') + lst[:idx].count(',') 17 | pos = len(' '.join(lst[:idx])) + 1 - punct_cnt 18 | 19 | return idx, pos 20 | 21 | 22 | def find_all_in_list(val, lst): 23 | indexes = [] 24 | positions = [] 25 | 26 | for i, elem in enumerate(lst): 27 | if val == elem: 28 | indexes.append(i) 29 | 30 | # Calculate approximate character position of the matched value 31 | punct_cnt = lst[:i].count('.') + lst[:i].count(',') 
32 | positions.append(len(' '.join(lst[:i])) + 1 - punct_cnt) 33 | 34 | return indexes, positions 35 | 36 | 37 | def get_slot_value_alternatives(slot): 38 | with open(config.SLOT_ALIGNER_ALTERNATIVES, 'r') as f_alternatives: 39 | alternatives_dict = json.load(f_alternatives) 40 | 41 | return alternatives_dict.get(slot, {}) 42 | -------------------------------------------------------------------------------- /paper/classifier_training_args/e2e/e2e_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/e2e_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | "task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 200, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": false, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 8, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 500, 25 | "logging_steps": 50, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "train_batch_size": 8 45 | } -------------------------------------------------------------------------------- /paper/classifier_training_args/ldc/ldc_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/ldc_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | "task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 500, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": false, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 3, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 500, 25 | "logging_steps": 50, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "train_batch_size": 3 45 | } -------------------------------------------------------------------------------- /paper/classifier_training_args/webnlg/webnlg_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/webnlg_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | 
"task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 300, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": true, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 6, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 500, 25 | "logging_steps": 500, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "train_batch_size": 6 45 | } -------------------------------------------------------------------------------- /paper/classifier_training_args/viggo/viggo_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/viggo_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | "task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 200, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": false, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 8, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 200, 25 | "logging_steps": 50, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "passed_examples": false, 45 | "train_batch_size": 8 46 | } -------------------------------------------------------------------------------- /src/datatuner/lm/process_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from fire import Fire 5 | 6 | 7 | def split_list(data, n): 8 | vals_per_item = [0 for _ in range(n)] 9 | for ix, _ in enumerate(data): 10 | vals_per_item[ix % n] += 1 11 | ix = 0 12 | new_list = [] 13 | subset = [] 14 | for _, d in enumerate(data): 15 | if len(subset) < vals_per_item[ix]: 16 | subset.append(d) 17 | if len(subset) == vals_per_item[ix]: 18 | new_list.append(subset) 19 | ix += 1 20 | subset = [] 21 | return new_list 22 | 23 | 24 | def split(filename, out_folder, splits): 25 | j = json.load(open(filename)) 26 | out_folder = Path(out_folder) 27 | out_folder.mkdir(parents=True, exist_ok=True) 28 | 29 | chunks = split_list(j, splits) 30 | for i, chunk in enumerate(chunks): 31 | json.dump(chunk, open(out_folder / f"chunk_{i}.json", "w"), 
indent=2) 32 | 33 | 34 | def combine(base_folder_name, splits): 35 | output_data = [] 36 | for i in range(splits): 37 | folder = f"{base_folder_name}/chunks/chunk_{i}" 38 | folder = Path(folder) 39 | output_data.extend(json.load(open(folder / "generated.json"))) 40 | 41 | base_folder_name = Path(base_folder_name) 42 | base_folder_name.mkdir(parents=True, exist_ok=True) 43 | json.dump(output_data, open(base_folder_name / "generated.json", "w"), indent=2) 44 | 45 | 46 | if __name__ == "__main__": 47 | Fire() 48 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | CONDA_SH_FILE=$1 2 | source $CONDA_SH_FILE 3 | 4 | 5 | # Confirm external dependencies with user 6 | EXTERNAL_DEPS_MSG="""The scripts provided herein will retrieve several third-party libraries, 7 | environments, and/or other software packages at install-time or build-time (“External Dependencies”) 8 | from third-party sources. There are terms and conditions that you need to agree to 9 | abide by if you choose to install the External Dependencies. If you do not agree 10 | with every term and condition associated with the External Dependencies, 11 | enter “QUIT” in the command line when prompted by the script.""" 12 | 13 | confirm_external_dependencies() { 14 | echo 15 | echo $EXTERNAL_DEPS_MSG 16 | while true; do 17 | read -p "Do you want to PROCEED or QUIT? " yn 18 | case $yn in 19 | PROCEED) 20 | echo "Proceeding" 21 | break 22 | ;; 23 | QUIT) 24 | echo "Quitting" 25 | exit 26 | ;; 27 | esac 28 | done 29 | 30 | } 31 | 32 | confirm_external_dependencies 33 | 34 | echo "Creating the environment" 35 | conda env create --file environment.yml 36 | conda activate finetune 37 | 38 | printf "\n" 39 | 40 | echo "Downloading the spacy dependenices" 41 | python -m spacy download en_core_web_sm 42 | 43 | echo "Downloading the NLTK dependenices" 44 | python -m nltk.downloader punkt 45 | 46 | echo "Installing the code in development mode" 47 | 48 | printf "\n" 49 | 50 | python setup.py develop 51 | 52 | printf "\n" 53 | 54 | echo "Finished setup" -------------------------------------------------------------------------------- /src/datatuner/lm/custom_gpt2.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel 2 | 3 | from datatuner.lm.cross_entropy import CrossEntropyLoss 4 | 5 | 6 | def custom_gpt2_with_smoothing(smoothing=0.0): 7 | class GPT2LMHeadModelCustom(GPT2LMHeadModel): 8 | def forward( 9 | self, 10 | input_ids, 11 | past=None, 12 | attention_mask=None, 13 | token_type_ids=None, 14 | position_ids=None, 15 | head_mask=None, 16 | labels=None, 17 | ): 18 | 19 | transformer_outputs = self.transformer( 20 | input_ids, past=past, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask 21 | ) 22 | 23 | hidden_states = transformer_outputs[0] 24 | 25 | lm_logits = self.lm_head(hidden_states) 26 | 27 | outputs = (lm_logits,) + transformer_outputs[1:] 28 | if labels is not None: 29 | # Shift so that tokens < n predict n 30 | shift_logits = lm_logits[..., :-1, :].contiguous() 31 | shift_labels = labels[..., 1:].contiguous() 32 | # Flatten the tokens 33 | loss_fct = CrossEntropyLoss(ignore_index=-1, smooth_eps=smoothing, reduction="mean") 34 | 35 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) 36 | 37 | outputs = (loss,) + outputs 38 | 39 | return outputs # (loss), lm_logits, presents, (all 
hidden_states), (attentions) 40 | 41 | return GPT2LMHeadModelCustom 42 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | # Directory paths 5 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 6 | DATA_DIR = os.path.join(ROOT_DIR, 'data') 7 | EVAL_DIR = os.path.join(ROOT_DIR, 'eval') 8 | METRICS_DIR = os.path.join(ROOT_DIR, 'metrics') 9 | MODEL_DIR = os.path.join(ROOT_DIR, 'model') 10 | PREDICTIONS_DIR = os.path.join(ROOT_DIR, 'predictions') 11 | PREDICTIONS_BATCH_DIR = os.path.join(PREDICTIONS_DIR, 'batch') 12 | PREDICTIONS_BATCH_LEX_DIR = os.path.join(PREDICTIONS_DIR, 'batch_lex') 13 | PREDICTIONS_BATCH_EVENT_DIR = os.path.join(PREDICTIONS_DIR, 'batch_event') 14 | SLOT_ALIGNER_DIR = os.path.join(ROOT_DIR, 'slot_aligner') 15 | SLOT_ALIGNER_ALTERNATIVES = os.path.join(SLOT_ALIGNER_DIR, 'alignment', 'alternatives.json') 16 | T2T_DIR = os.path.join(ROOT_DIR, 't2t') 17 | TOOLS_DIR = os.path.join(ROOT_DIR, 'tools') 18 | TTEST_DIR = os.path.join(ROOT_DIR, 'ttest') 19 | TTEST_DATA_DIR = os.path.join(ROOT_DIR, 'ttest', 'data') 20 | TTEST_SCORES_DIR = os.path.join(ROOT_DIR, 'ttest', 'scores') 21 | 22 | # Dataset paths 23 | E2E_DATA_DIR = os.path.join(DATA_DIR, 'rest_e2e') 24 | TV_DATA_DIR = os.path.join(DATA_DIR, 'tv') 25 | LAPTOP_DATA_DIR = os.path.join(DATA_DIR, 'laptop') 26 | HOTEL_DATA_DIR = os.path.join(DATA_DIR, 'hotel') 27 | VIDEO_GAME_DATA_DIR = os.path.join(DATA_DIR, 'video_game') 28 | 29 | # Script paths 30 | METRICS_SCRIPT_PATH = os.path.join(METRICS_DIR, 'measure_scores.py') 31 | 32 | # Constants 33 | COMMA_PLACEHOLDER = ' __comma__' 34 | DELEX_PREFIX = '__slot_' # Important to use special symbols that do not get tokenized (such as '_') 35 | DELEX_SUFFIX = '__' 36 | EMPH_TOKEN = '__emph__' 37 | CONTRAST_TOKEN = '__contrast__' 38 | CONCESSION_TOKEN = '__concession__' 39 | -------------------------------------------------------------------------------- /src/datatuner/classification/consistency_classifier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from datatuner.classification.run_classifier import evaluate, main 5 | from transformers.data.processors.utils import InputExample 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | dataset_fields = { 10 | "webnlg": {"text": "text", "data": "modifiedtripleset", "original_data": "raw_modifiedtripleset"}, 11 | "ldc": {"text": "answer_text", "data": "linearized_amr", "original_data": "raw_amr"}, 12 | "viggo": {"text": "ref", "data": "new_mr", "original_data": "mr"}, 13 | "e2e": {"text": "ref", "data": "new_mr", "original_data": "mr"}, 14 | } 15 | 16 | 17 | def get_data_fields(): 18 | out = [] 19 | for x in dataset_fields: 20 | out.append(dataset_fields[x]["data"]) 21 | return out 22 | 23 | 24 | class ConsistencyClassifier: 25 | def __init__(self, args_dict): 26 | self.args_dict = args_dict 27 | sys.argv = [sys.argv[0]] 28 | _, self.model, self.tokenizer, self.args = main(args_dict) 29 | self.cache = {} 30 | 31 | def evaluate(self, items, set_type="test"): 32 | examples = [] 33 | for (i, item) in enumerate(items): 34 | guid = "%s-%s" % (set_type, str(i)) 35 | text_a = item["data"] 36 | text_b = item["text"] 37 | if self.args_dict["do_lower_case"]: 38 | text_a = text_a.lower() 39 | text_b = text_b.lower() 40 | label = "accurate" 41 | examples.append(InputExample(guid=guid, 
text_a=text_a, text_b=text_b, label=label)) 42 | 43 | self.args.examples = examples 44 | 45 | results = evaluate(self.args, self.model, self.tokenizer, prefix="") 46 | return results 47 | -------------------------------------------------------------------------------- /paper/evaluate_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | 4 | TEST_FILE=$1 5 | MODEL=$2 6 | NUM_GPUS=$3 7 | PER_GPU=$4 8 | MAX_DATA=$5 9 | SPLITS=$((NUM_GPUS * PER_GPU)) 10 | echo "SPLITS: " $SPLITS 11 | 12 | if [ -z "$MAX_DATA" ]; then 13 | MAX_DATA=0 14 | fi 15 | 16 | echo "MAX_DATA": $MAX_DATA 17 | 18 | CHUNKED_DATA_FOLDER=$(mktemp -d) 19 | echo "Chunked data outputted to the folder $CHUNKED_DATA_FOLDER" 20 | 21 | CODE_DIR=../src/datatuner/lm/ 22 | # Split data into chunks 23 | $python $CODE_DIR/process_json.py split $TEST_FILE $CHUNKED_DATA_FOLDER $SPLITS 24 | 25 | COMMON_ARGUMENTS="--model_checkpoint $MODEL \ 26 | --no_sample \ 27 | --beam_width 5 \ 28 | --nbest 5 \ 29 | --per_step_predictions 5 \ 30 | --model_type gpt2" 31 | 32 | pids= 33 | MAX_SPLITS=$((SPLITS - 1)) 34 | RESULTS_FOLDER=$MODEL/$(date +'%Y-%m-%d_%H-%M-%S') 35 | mkdir -p $RESULTS_FOLDER 36 | 37 | # Evaluate each chunk 38 | for ((i=0; i<=MAX_SPLITS; i++)); do 39 | echo "Chunk $i" 40 | CUDA_VISIBLE_DEVICES=$(($i % $NUM_GPUS)) $python $CODE_DIR/evaluate.py \ 41 | --filename $CHUNKED_DATA_FOLDER/chunk_$i.json \ 42 | --out_folder ${RESULTS_FOLDER}/chunks/chunk_$i \ 43 | --max_data $MAX_DATA \ 44 | ""$COMMON_ARGUMENTS"" & 45 | pids+=" $!" 46 | done 47 | wait $pids || { echo "there were errors" >&2; rm -rf ${RESULTS_FOLDER}; exit 1; } 48 | 49 | # Combine results from all chunks 50 | $python $CODE_DIR/process_json.py combine $RESULTS_FOLDER $SPLITS 51 | GLOBAL_MAX_DATA=$((SPLITS * MAX_DATA)) 52 | echo "GLOBAL_MAX_DATA": $GLOBAL_MAX_DATA 53 | CUDA_VISIBLE_DEVICES=0 54 | $python $CODE_DIR/evaluate.py \ 55 | --filename $TEST_FILE \ 56 | --out_folder ${RESULTS_FOLDER} \ 57 | --max_data $GLOBAL_MAX_DATA \ 58 | ""$COMMON_ARGUMENTS"" 59 | 60 | echo "removing intermediary results from ${RESULTS_FOLDER}/chunks" 61 | rm -rf ${RESULTS_FOLDER}/"chunks" 62 | 63 | echo "Final results available in" $RESULTS_FOLDER -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/list_slot.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | 3 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import get_slot_value_alternatives 4 | from external.jjuraska_slug2slug.slot_aligner.alignment.categorical_slots import find_value_alternative 5 | 6 | 7 | def align_list_slot(text, text_tok, slot, value, match_all=True, mode='exact_match', item_sep=', '): 8 | """ 9 | MR := slot[value] 10 | value := item || item; item;... 11 | item := tok || tok tok... 
12 | """ 13 | leftmost_pos = -1 14 | 15 | # TODO: load alternatives only once 16 | alternatives = get_slot_value_alternatives(slot) 17 | 18 | # Split the slot value into individual items 19 | items = [item.strip() for item in value.split(item_sep)] 20 | 21 | # Search for all individual items exhaustively 22 | for item in items: 23 | pos = find_value_alternative(text, text_tok, item, alternatives, mode=mode) 24 | 25 | if match_all and pos < 0: 26 | return -1 27 | 28 | if leftmost_pos < 0 or 0 <= pos < leftmost_pos: 29 | leftmost_pos = pos 30 | 31 | return leftmost_pos 32 | 33 | 34 | def align_list_with_conjunctions_slot(text, text_tok, slot, value, match_all=True): 35 | separators = [',', 'and', 'with'] 36 | 37 | value_tok = word_tokenize(value) 38 | value_items = [] 39 | end_of_prev_item = -1 40 | leftmost_pos = -1 41 | 42 | # Split the value into items 43 | for i, tok in enumerate(value_tok): 44 | if tok in separators and i > end_of_prev_item + 1: 45 | item = ' '.join(value_tok[end_of_prev_item + 1:i]) 46 | value_items.append(item) 47 | end_of_prev_item = i 48 | 49 | if end_of_prev_item < len(value_tok) - 1: 50 | item = ' '.join(value_tok[end_of_prev_item + 1:]) 51 | value_items.append(item) 52 | 53 | for item in value_items: 54 | pos = text.find(item) 55 | if 0 <= pos < leftmost_pos or leftmost_pos == -1: 56 | leftmost_pos = pos 57 | if match_all and pos < 0: 58 | return -1 59 | 60 | if leftmost_pos < 0: 61 | return -1 62 | 63 | return leftmost_pos 64 | -------------------------------------------------------------------------------- /src/datatuner/lm/special_token_generator.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | from pathlib import Path 4 | 5 | from datatuner.utils import bracket_contents 6 | from fire import Fire 7 | from tqdm import tqdm 8 | 9 | 10 | def get_custom_tags(s): 11 | """Get tags starting with a token and ending with another in the string""" 12 | return bracket_contents(s, opening="<", ending=">") 13 | 14 | 15 | fn_map = { 16 | "question_sig": [get_custom_tags], 17 | "amr": [get_custom_tags], 18 | } 19 | 20 | 21 | def generate_from_item(item, fields, all_tokens): 22 | for field_name in fields: 23 | if field_name in item: 24 | tokens = list(itertools.chain(*[fn(item[field_name]) for fn in fn_map[fields[field_name]]])) 25 | all_tokens.update(tokens) 26 | 27 | 28 | def generate_from_json(data_folder, outfile, fields={"mrl": "mrl"}): 29 | """Generate the special tokens from the given folder with files train.json, validation.json, and test.json 30 | The used field is defined by the key in the `fields` dictionary and the method used is defined based 31 | on that field. 32 | """ 33 | 34 | data_folder = Path(data_folder) 35 | all_tokens = set() 36 | 37 | for split in ["test", "train", "validation"]: 38 | try: 39 | data = json.load(open(data_folder / (split + ".json"), "r")) 40 | for item in data: 41 | generate_from_item(item, fields, all_tokens) 42 | except: 43 | print(f"file absent: {split}") 44 | 45 | Path(outfile).write_text("\n".join(all_tokens)) 46 | 47 | 48 | def generate_from_jsonl(data_file, outfile, fields={"mrl": "mrl"}, max_items=0): 49 | """Generate the special tokens from the given jsonl file. 50 | The used field is defined by the key in the `fields` dictionary and the method used is defined based 51 | on that field. 
52 | """ 53 | 54 | all_tokens = set() 55 | i = 0 56 | with open(data_file, "r") as f: 57 | for line in tqdm(f): 58 | item = json.loads(line.rstrip()) 59 | generate_from_item(item, fields, all_tokens) 60 | i += 1 61 | if max_items > 0 and i >= max_items: 62 | break 63 | Path(outfile).write_text("\n".join(all_tokens)) 64 | 65 | 66 | if __name__ == "__main__": 67 | Fire() 68 | -------------------------------------------------------------------------------- /paper/experiments/mturk/text_stats.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import Counter 3 | from pathlib import Path 4 | 5 | import textstat 6 | from fire import Fire 7 | 8 | from datatuner.classification.consistency_classifier import dataset_fields 9 | 10 | 11 | def count_words(text, casing="any"): 12 | """Counts word frequency using Counter from collections""" 13 | if casing == "any": 14 | text = text.lower() 15 | 16 | skips = [".", ", ", ":", ";", "'", '"'] 17 | for ch in skips: 18 | text = text.replace(ch, "") 19 | words = text.split(" ") 20 | if casing == "lower": 21 | words = [x for x in words if x and x[0].islower()] 22 | elif casing == "upper": 23 | words = [x for x in words if x and x[0].isupper()] 24 | word_counts = Counter(words) 25 | return word_counts 26 | 27 | 28 | def calculate_stats(data_folder): 29 | """Calculate stat of test.json file in a folder""" 30 | data_folder = Path(data_folder) 31 | for dataset in dataset_fields: 32 | print(f"loading {dataset}") 33 | field = dataset_fields[dataset]["text"].strip() 34 | sentences = [] 35 | for item in json.load(open(data_folder / dataset / "test.json")): 36 | sentences.append(item[field][-1] if type(item[field]) == list else item[field]) 37 | 38 | text = " ".join(sentences) 39 | lex_count = textstat.lexicon_count(text) 40 | print(lex_count) 41 | unique_words = count_words(text) 42 | print(f"all unique {len(unique_words)}") 43 | 44 | lower_unique_words = count_words(text, casing="lower") 45 | print(f"lowercase unique {len(lower_unique_words)}") 46 | 47 | upper_unique_words = count_words(text, casing="upper") 48 | print(f"uppercase unique {len(upper_unique_words)}") 49 | 50 | print(f"ratio {len(upper_unique_words) / len(unique_words)}") 51 | 52 | text_standard = textstat.text_standard(text, float_output=True) 53 | print(f"text_standard: {text_standard}") 54 | 55 | dale_chall_readability_score = textstat.dale_chall_readability_score(text) 56 | print(f"dale_chall_readability_score: {dale_chall_readability_score}") 57 | 58 | flesch_kincaid_grade = textstat.flesch_kincaid_grade(text) 59 | print(f"flesch_kincaid_grade: {flesch_kincaid_grade}") 60 | 61 | 62 | if __name__ == "__main__": 63 | Fire(calculate_stats) 64 | -------------------------------------------------------------------------------- /src/external/ukplab_emnlp2019_dualgraph/gen_LDC2017T10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | mkdir -p ${ROOT_DIR}/data 6 | REPO_DIR=${ROOT_DIR}/data/ 7 | 8 | DATA_DIR=${1} 9 | mkdir -p ${REPO_DIR}/tmp_amr 10 | PREPROC_DIR=${REPO_DIR}/tmp_amr 11 | ORIG_AMR_DIR=${DATA_DIR}/data/amrs/split 12 | mkdir -p ${REPO_DIR}/amr_ldc2017t10 13 | FINAL_AMR_DIR=${REPO_DIR}/amr_ldc2017t10 14 | 15 | 16 | mkdir -p ${PREPROC_DIR}/train 17 | mkdir -p ${PREPROC_DIR}/dev 18 | mkdir -p ${PREPROC_DIR}/test 19 | 20 | mkdir -p ${FINAL_AMR_DIR}/train 21 | mkdir -p ${FINAL_AMR_DIR}/dev 22 
| mkdir -p ${FINAL_AMR_DIR}/test 23 | 24 | 25 | cat ${ORIG_AMR_DIR}/training/amr-* > ${PREPROC_DIR}/train/raw_amrs.txt 26 | cat ${ORIG_AMR_DIR}/dev/amr-* > ${PREPROC_DIR}/dev/raw_amrs.txt 27 | # cat ${ORIG_AMR_DIR}/test/amr-* > ${PREPROC_DIR}/test/*_raw_amrs.txt 28 | # cat ${ORIG_AMR_DIR}/test/1_amr-release-2.0-alignments-test-proxy.txt ${ORIG_AMR_DIR}/test/2_amr-release-2.0-alignments-test-dfa.txt ${ORIG_AMR_DIR}/test/3_amr-release-2.0-alignments-test-bolt.txt ${ORIG_AMR_DIR}/test/4_amr-release-2.0-alignments-test-consensus.txt ${ORIG_AMR_DIR}/test/5_amr-release-2.0-alignments-test-xinhua.txt > ${PREPROC_DIR}/test/raw_amrs.txt 29 | cat ${ORIG_AMR_DIR}/test/1_amr-release-2.0-amrs-test-proxy.txt ${ORIG_AMR_DIR}/test/2_amr-release-2.0-amrs-test-dfa.txt ${ORIG_AMR_DIR}/test/3_amr-release-2.0-amrs-test-bolt.txt ${ORIG_AMR_DIR}/test/4_amr-release-2.0-amrs-test-xinhua.txt ${ORIG_AMR_DIR}/test/5_amr-release-2.0-amrs-test-consensus.txt> ${PREPROC_DIR}/test/raw_amrs.txt 30 | 31 | 32 | for SPLIT in test dev train ; do 33 | echo "processing $SPLIT..." 34 | # get the surface and the graphs separately 35 | python ${ROOT_DIR}/split_amr.py ${PREPROC_DIR}/${SPLIT}/raw_amrs.txt ${PREPROC_DIR}/${SPLIT}/surface.txt ${PREPROC_DIR}/${SPLIT}/graphs.txt 36 | 37 | python ${ROOT_DIR}/preproc_amr.py ${PREPROC_DIR}/${SPLIT}/graphs.txt ${PREPROC_DIR}/${SPLIT}/surface.txt ${FINAL_AMR_DIR}/${SPLIT}/nodes.pp.txt ${FINAL_AMR_DIR}/${SPLIT}/surface.pp.txt --mode LIN --triples-output ${FINAL_AMR_DIR}/${SPLIT}/triples.pp.txt 38 | # python ${ROOT_DIR}/preproc_amr.py ${PREPROC_DIR}/${SPLIT}/graphs.txt ${PREPROC_DIR}/${SPLIT}/surface.txt ${FINAL_AMR_DIR}/${SPLIT}/nodes.pp.txt ${FINAL_AMR_DIR}/${SPLIT}/surface.pp.txt --mode LINE_GRAPH --triples-output ${FINAL_AMR_DIR}/${SPLIT}/triples.pp.txt 39 | echo "done." 
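    # Descriptive note (added): each ${FINAL_AMR_DIR}/${SPLIT} directory now holds three aligned outputs of preproc_amr.py:
    # linearized graphs (nodes.pp.txt), surface texts (surface.pp.txt) and triples (triples.pp.txt).
    # The commented-out invocation above is the alternative LINE_GRAPH linearization; only one of the two modes should be active per run.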
40 | done -------------------------------------------------------------------------------- /src/datatuner/classification/consistency_processor.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | from transformers.data.processors.utils import DataProcessor, InputExample 7 | 8 | 9 | class ConsistencyProcessor(DataProcessor): 10 | """Processor for the Consistency Classification data set.""" 11 | 12 | def __init__(self, do_lower_case): 13 | self.do_lower_case = do_lower_case 14 | super(DataProcessor, self).__init__() 15 | 16 | def get_train_examples(self, data_dir): 17 | """See base class.""" 18 | return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 19 | 20 | @classmethod 21 | def _read_tsv(cls, input_file, quotechar=None): 22 | """Reads a tab separated value file.""" 23 | with open(input_file, "r", encoding="utf-8-sig") as f: 24 | reader = csv.reader(f, delimiter="|", quotechar=quotechar) 25 | lines = [] 26 | for line in reader: 27 | if sys.version_info[0] == 2: 28 | line = list(unicode(cell, "utf-8") for cell in line) 29 | lines.append(line) 30 | return lines 31 | 32 | def get_dev_examples(self, data_dir): 33 | """See base class.""" 34 | return self._create_examples(self._read_tsv(os.path.join(data_dir, "validation.tsv")), "dev") 35 | 36 | def get_test_examples(self, data_dir): 37 | """See base class.""" 38 | return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 39 | 40 | def get_labels(self, data_dir): 41 | """See base class.""" 42 | labels = (Path(data_dir) / "labels.txt").read_text().split("\n") 43 | labels = [x for x in labels if x] 44 | return labels 45 | 46 | def _create_examples(self, lines, set_type): 47 | """Creates examples for the training and dev sets.""" 48 | # order: ["label","data","text"] 49 | examples = [] 50 | for (i, line) in enumerate(lines): 51 | if i == 0: 52 | continue 53 | guid = "%s-%s" % (set_type, str(i)) 54 | text_a = line[1] 55 | text_b = line[2] 56 | if self.do_lower_case: 57 | text_a = text_a.lower() 58 | text_b = text_b.lower() 59 | label = line[0] 60 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 61 | return examples 62 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slug2slug_ser.py: -------------------------------------------------------------------------------- 1 | from external.jjuraska_slug2slug.slot_aligner.data_analysis import score_slot_realizations 2 | from tempfile import mkdtemp 3 | import pandas as pd 4 | from pathlib import Path 5 | from fire import Fire 6 | import json 7 | from datatuner.classification.consistency_classifier import dataset_fields 8 | import numpy as np 9 | 10 | 11 | def compute_ser(datafile, scored_file, mr_field, text_field): 12 | dataset = "viggo" if "viggo" in str(datafile) else "e2e" 13 | if mr_field is None: 14 | mr_field = dataset_fields[dataset]["original_data"] 15 | if text_field is None: 16 | text_field = dataset_fields[dataset]["text"] 17 | 18 | data = json.load(open(datafile)) 19 | if dataset == "viggo": 20 | subfolder = "video_game" 21 | elif dataset == "e2e": 22 | subfolder = "rest_e2e" 23 | tempdir = Path(mkdtemp()) / subfolder 24 | tempdir.mkdir(parents=True, exist_ok=True) 25 | new_items = [] 26 | for item in data: 27 | new_item = {} 28 | new_item[mr_field] = item[mr_field] 29 | text = item[text_field] 30 | if type(text) == 
list: 31 | text = text[-1] 32 | 33 | new_item[text_field] = text 34 | new_items.append(new_item) 35 | df = pd.DataFrame(new_items) 36 | 37 | out_file = tempdir / "test.csv" 38 | df.to_csv(out_file, index=False) 39 | 40 | score_slot_realizations(tempdir, "test.csv") 41 | err_df = pd.read_csv(tempdir / ("test [errors].csv")) 42 | 43 | assert len(err_df) == len(df) 44 | err_data = err_df.to_dict(orient="records") 45 | percent_correct_list = [] 46 | for err_item, item in zip(err_data, data): 47 | 48 | item["errors"] = err_item["errors"] 49 | if ( 50 | type(err_item["incorrect slots"]) == float 51 | and "nan" in str(err_item["incorrect slots"]).lower() 52 | ): 53 | err_item["incorrect slots"] = "?" 54 | 55 | else: 56 | item["incorrect_slots"] = ( 57 | err_item["incorrect slots"] if err_item["errors"] > 0 else "" 58 | ) 59 | 60 | item["ser_correct"] = int(item["errors"] == 0) 61 | 62 | item["ser"] = item["errors"] / err_item["mr"].count("[") 63 | 64 | 65 | percent_correct_list.append(item["ser_correct"]) 66 | 67 | datafile = Path(datafile) 68 | print(f"written to {scored_file}") 69 | json.dump(data, open(scored_file, "w"), indent=2) 70 | 71 | 72 | if __name__ == "__main__": 73 | # python ser.py data/e2e_dataset/test.json --dataset e2e 74 | Fire(compute_ser) 75 | -------------------------------------------------------------------------------- /src/datatuner/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | from pathlib import Path 4 | from time import gmtime, strftime 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def bracket_contents(string, level=None, opening="[", ending="]"): 11 | """Generate brackets' contents as strings""" 12 | stack = [] 13 | result = [] 14 | for i, c in enumerate(string): 15 | if c == opening: 16 | stack.append(i) 17 | elif c == ending and stack: 18 | start = stack.pop() 19 | result.append((len(stack), f"{opening}{string[start + 1: i]}{ending}")) 20 | 21 | if level is not None: 22 | result = [x for x in result if x[0] == level] 23 | 24 | return [x[1] for x in result] 25 | 26 | 27 | def uniquify_in_order(seq): 28 | """Get unique sequence from given sequence while preserving order""" 29 | seen = set() 30 | seen_add = seen.add 31 | return [x for x in seq if not (x in seen or seen_add(x))] 32 | 33 | 34 | def str_part_matches_array(s, arr): 35 | return any(s in x for x in arr) 36 | 37 | 38 | def str_start_matches_array(s, arr): 39 | return any(x.startswith(s) for x in arr) 40 | 41 | 42 | def arr_part_matches_string(s, arr): 43 | """True if some item in the array arr is a substring of s""" 44 | return any(x in s for x in arr) 45 | 46 | 47 | def ewm_mean(iterable, alpha=0.9): 48 | if len(iterable) > 0: 49 | df = pd.DataFrame({"B": iterable}) 50 | av = df.ewm(alpha=alpha).mean().B.iloc[-1] 51 | 52 | return av 53 | 54 | 55 | def geo_mean(iterable): 56 | a = np.array(iterable) 57 | return a.prod() ** (1.0 / len(a)) 58 | 59 | 60 | def newest_file(folder_path, pattern): 61 | folder_path = Path(folder_path) 62 | list_of_paths = folder_path.glob(pattern) 63 | latest_path = max(list_of_paths, key=lambda p: p.stat().st_ctime) 64 | return latest_path 65 | 66 | 67 | def flatten(d, parent_key="", sep="-"): 68 | items = [] 69 | for k, v in d.items(): 70 | new_key = k + sep + parent_key if parent_key else k 71 | if isinstance(v, collections.MutableMapping): 72 | items.extend(flatten(v, new_key, sep=sep).items()) 73 | else: 74 | items.append((new_key, v)) 75 | return dict(items) 76 | 77 | 78 | 
def get_curr_time(): 79 | return strftime("%Y-%m-%d_%H-%M-%S", gmtime()) 80 | 81 | 82 | def dedup_consecutive_data(our_data, key): 83 | dedup_our_data = [] 84 | cache = {} 85 | for i, item in enumerate(our_data): 86 | if item[key].replace(" ", "") in cache: 87 | continue 88 | else: 89 | dedup_our_data.append(item) 90 | cache[item[key].replace(" ", "")] = True 91 | 92 | return dedup_our_data 93 | 94 | 95 | def read_lines_from_file(file): 96 | file = Path(file) 97 | texts = file.read_text().split("\n") 98 | texts = [x for x in texts if x.strip()] 99 | return texts 100 | 101 | 102 | def is_empty_or_absent_dir(dir_name): 103 | return not os.path.exists(dir_name) or not os.listdir(dir_name) 104 | -------------------------------------------------------------------------------- /paper/experiments/ldc/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from copy import deepcopy 4 | from pathlib import Path 5 | 6 | from datatuner.classification.distractors import (get_distractors, 7 | write_classification_data) 8 | from datatuner.lm.special_token_generator import generate_from_json 9 | from datatuner.lm.utils import fix_text_in_dir 10 | from datatuner.utils import bracket_contents 11 | from fire import Fire 12 | 13 | random.seed(42) 14 | 15 | 16 | def get_entities(amr): 17 | options = bracket_contents(amr, opening="(", ending=")") 18 | options = [option.strip("() ") for option in options if option.count("(") == 1 and option.count("<") == 0] 19 | options = [option for option in options if option[0].isupper()] 20 | return options 21 | 22 | 23 | def preprocess(in_folder, out_folder, classification_dir, num_candidates=10, max_per_operation=10): 24 | """Linearize the data already processed into surface texts and AMRs into our format""" 25 | 26 | splits = {"test": "test", "dev": "validation", "train": "train"} 27 | 28 | in_folder = Path(in_folder) 29 | out_folder = Path(out_folder) 30 | out_folder.mkdir(parents=True, exist_ok=True) 31 | 32 | for split in splits: 33 | amrs = (in_folder / split / "nodes.pp.txt").read_text().split("\n") 34 | surfaces = (in_folder / split / "surface.pp.txt").read_text().split("\n") 35 | raw_amrs = (in_folder / ".." 
/ "tmp_amr" / split / "graphs.txt").read_text().split("\n") 36 | items = [ 37 | {"linearized_amr": amr, "answer_text": surface, "raw_amr": raw_amr} 38 | for amr, surface, raw_amr in zip(amrs, surfaces, raw_amrs) 39 | if amr and surface 40 | ] 41 | 42 | classification_data = [] 43 | original_items = deepcopy(items) 44 | for item in items: 45 | 46 | entities = get_entities(item["linearized_amr"]) 47 | 48 | swapping_candidates = [entities] 49 | cutting_candidates = [entities] 50 | 51 | rand_item = None 52 | while rand_item is None or rand_item == item: 53 | rand_item = random.choice(original_items) 54 | 55 | random_text = rand_item["answer_text"] 56 | 57 | distractors, classification_items = get_distractors( 58 | item["linearized_amr"], 59 | item["answer_text"], 60 | swapping_candidates, 61 | cutting_candidates, 62 | random_text, 63 | num_candidates=num_candidates, 64 | max_per_operation=max_per_operation, 65 | ) 66 | classification_data.extend(classification_items) 67 | 68 | item["answer_text"] = distractors + [item["answer_text"]] 69 | 70 | json.dump(items, open(out_folder / (splits[split] + ".json"), "w"), indent=2) 71 | write_classification_data(classification_data, classification_dir, splits[split].replace(".json", "")) 72 | 73 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"linearized_amr": "amr"}) 74 | fix_text_in_dir(out_folder) 75 | 76 | 77 | if __name__ == "__main__": 78 | Fire(preprocess) 79 | -------------------------------------------------------------------------------- /src/datatuner/lm/custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | from datatuner.lm.special_token_generator import get_custom_tags 2 | 3 | 4 | def tokenize(self, text, **kwargs): 5 | """ Converts a string in a sequence of tokens (string), using the tokenizer. 6 | Split in words for word-based vocabulary or sub-words for sub-word-based 7 | vocabularies (BPE/SentencePieces/WordPieces). 8 | 9 | Take care of added tokens. 10 | """ 11 | 12 | def split_on_token(tok, text): 13 | result = [] 14 | split_text = text.split(tok) 15 | for i, sub_text in enumerate(split_text): 16 | sub_text = sub_text.strip() 17 | if i == 0 and not sub_text: 18 | result += [tok] 19 | elif i == len(split_text) - 1: 20 | if sub_text: 21 | result += [sub_text] 22 | else: 23 | pass 24 | else: 25 | if sub_text: 26 | result += [sub_text] 27 | result += [tok] 28 | return result 29 | 30 | def split_on_tokens(tok_list, text): 31 | if not text: 32 | return [] 33 | if not tok_list: 34 | return self._tokenize(text, **kwargs) 35 | 36 | tokenized_text = [] 37 | text_list = [text] 38 | for tok in tok_list: 39 | tokenized_text = [] 40 | for sub_text in text_list: 41 | if sub_text not in self.added_tokens_encoder and sub_text not in self.all_special_tokens: 42 | tokenized_text += split_on_token(tok, sub_text) 43 | else: 44 | tokenized_text += [sub_text] 45 | text_list = tokenized_text 46 | 47 | return sum( 48 | ( 49 | self._tokenize(token, **kwargs) 50 | if token not in self.added_tokens_encoder and token not in self.all_special_tokens 51 | else [token] 52 | for token in tokenized_text 53 | ), 54 | [], 55 | ) 56 | 57 | def get_special_tokens(s): 58 | candidates = get_custom_tags(s) 59 | return [cand for cand in candidates if cand in self.added_tokens_encoder.keys()] 60 | 61 | # The below becomes very slow when we scale to thousands of special tokens (e.g. 
many node types/predicates) 62 | # self.added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens 63 | 64 | all_added = list(self.added_tokens_encoder.keys()) 65 | 66 | TOO_LARGE_NUM_TOKENS_THRESHOLD = 10 67 | # If we have a large number of special tokens, our current hack is to use task specific regexes to decide 68 | # candidates from the sentence first, and then match these candidates with the special tokens in the encoder. 69 | # That way we reduce the number of special tokens per iteration to a handful instead of thousands. 70 | if len(all_added) > TOO_LARGE_NUM_TOKENS_THRESHOLD: 71 | current_added_tokens = get_special_tokens(text) 72 | else: 73 | # Otherwise, we simply take all the added tokens, as is the original case in the library 74 | current_added_tokens = all_added 75 | 76 | added_tokens = current_added_tokens + self.all_special_tokens 77 | 78 | tokenized_text = split_on_tokens(added_tokens, text) 79 | return tokenized_text 80 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 
41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /src/datatuner/lm/reranker.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from datatuner.lm.data_loader import get_inputs 6 | from datatuner.lm.model_loader import load_pretrained 7 | from datatuner.lm.utils import custom_deep_copy, load_task_config, should_ignore_in_score 8 | from datatuner.utils import geo_mean 9 | 10 | 11 | class Reranker: 12 | def __init__(self, model_folder, device, is_local=True): 13 | with torch.no_grad(): 14 | self.model, self.tokenizer = load_pretrained(model_folder, model_type="gpt2") 15 | self.model_folder = Path(model_folder) 16 | try: 17 | self.task_config = load_task_config(model_folder / "task_config.json") 18 | except: 19 | self.task_config = None 20 | self.device = device 21 | self.is_local = is_local 22 | self.model.to(self.device) 23 | self.model.eval() 24 | 25 | self.NEWLINE = [198] 26 | self.SPACE = [220] 27 | 28 | def remove_unsupported_tokens(self, ids): 29 | new_ids = [] 30 | for j in ids: 31 | try: 32 | self.tokenizer.decode(j) 33 | new_ids.append(j) 34 | except: 35 | new_ids.append(self.SPACE[0]) 36 | return new_ids 37 | 38 | def create_input(self, input_ids, item): 39 | if not self.is_local: 40 | assert item is not None 41 | item = custom_deep_copy(item) 42 | item.update({"answer_text": input_ids}) 43 | input_ids, token_type_ids = get_inputs(item, self.device, self.tokenizer, self.task_config) 44 | context_len = len(input_ids[0]) - len(input_ids) 45 | 46 | else: 47 | if "linearized_amr" in item: 48 | context = [] 49 | context_len = 0 50 | 51 | input_ids = self.tokenizer.encode( 52 | self.tokenizer.decode(self.remove_unsupported_tokens(context + input_ids)) 53 | ) 54 | 55 | input_ids = torch.tensor(input_ids, device=self.device).unsqueeze(0) 56 | token_type_ids = None 57 | 58 | return input_ids, token_type_ids, context_len 59 | 60 | def score(self, input_ids, item): 61 | 62 | input_ids, token_type_ids, context_len = self.create_input(input_ids, item) 63 | 64 | model_outputs = self.model(input_ids, token_type_ids=token_type_ids) 65 | probs = F.softmax(model_outputs[0][0], dim=1) 66 | x = [] 67 | for i in range(context_len, len(input_ids[0])): 68 | next_token_id = input_ids[0][i].item() 69 | next_token_str = 
self.tokenizer.decode(next_token_id) 70 | prefix = input_ids[0][context_len:i] 71 | next_prob = probs[i - 1][next_token_id].item() 72 | if ( 73 | not should_ignore_in_score(prefix, self.tokenizer, next_token_str, next_token_id, next_prob) 74 | and input_ids[0][i] != self.SPACE[0] 75 | ): 76 | x.append(next_prob) 77 | score = geo_mean(x) 78 | return score 79 | 80 | def rerank(self, nbest_items, item): 81 | with torch.no_grad(): 82 | scores = [] 83 | 84 | for input_ids in nbest_items: 85 | scores.append(-self.score(input_ids, item)) 86 | 87 | nbest_items = [x for _, _, x in sorted(zip(scores, list(range(0, len(nbest_items))), nbest_items))] 88 | 89 | return nbest_items 90 | -------------------------------------------------------------------------------- /paper/experiments/viggo/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from copy import deepcopy 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | from datatuner.classification.distractors import (get_distractors, 8 | write_classification_data) 9 | from datatuner.lm.special_token_generator import generate_from_json 10 | from datatuner.lm.utils import fix_text_in_dir 11 | from fire import Fire 12 | 13 | random.seed(42) 14 | 15 | 16 | def parse_mr(mr): 17 | i = mr.index("(") 18 | intro, params_str = mr[:i], mr[i + 1: -1] 19 | j = 0 20 | scope = "key" 21 | current_key = "" 22 | current_val = "" 23 | keys = [] 24 | values = [] 25 | while j < len(params_str): 26 | next_char = params_str[j] 27 | if scope == "key": 28 | if next_char == "[": 29 | scope = "value" 30 | keys.append(current_key) 31 | current_key = "" 32 | else: 33 | current_key += next_char 34 | 35 | elif scope == "value": 36 | if next_char == "]": 37 | scope = "between" 38 | values.append(current_val) 39 | current_val = "" 40 | else: 41 | current_val += next_char 42 | 43 | elif scope == "between": 44 | if next_char == " ": 45 | scope = "key" 46 | 47 | j += 1 48 | assert len(keys) == len(values) 49 | return {"keys": keys, "values": values, "intro": intro} 50 | 51 | 52 | def preprocess(in_folder, out_folder, classification_dir): 53 | in_folder = Path(in_folder) 54 | out_folder = Path(out_folder) 55 | 56 | out_folder.mkdir(parents=True, exist_ok=True) 57 | 58 | splits = {"viggo-test.csv": "test.json", "viggo-train.csv": "train.json", "viggo-valid.csv": "validation.json"} 59 | for split in splits: 60 | df = pd.read_csv(in_folder / split) 61 | data = df.to_dict(orient="records") 62 | original_data = deepcopy(data) 63 | classification_data = [] 64 | 65 | for item in data: 66 | mr = item["mr"] 67 | parsed = parse_mr(mr) 68 | new_params = [ 69 | f"<{key}> {key.replace('_', ' ')}: [ {value} ]" for key, value in zip(parsed["keys"], parsed["values"]) 70 | ] 71 | new_mr = f"<{parsed['intro']}> {parsed['intro'].replace('_', ' ')} ( {', '.join(new_params)}> )" 72 | 73 | item["new_mr"] = new_mr 74 | 75 | valid_values = [x for x in parsed["values"] if x] 76 | swapping_candidates = [valid_values] 77 | cutting_candidates = [valid_values] 78 | 79 | rand_item = None 80 | while rand_item is None or rand_item == item: 81 | rand_item = random.choice(original_data) 82 | random_text = rand_item["ref"] 83 | 84 | distractors, classification_items = get_distractors( 85 | new_mr, 86 | item["ref"], 87 | swapping_candidates, 88 | cutting_candidates, 89 | random_text, 90 | num_candidates=10, 91 | max_per_operation=10, 92 | ) 93 | classification_data.extend(classification_items) 94 | 95 | item["ref"] = distractors + 
[item["ref"]] 96 | 97 | json.dump(data, open(out_folder / (splits[split]), "w"), indent=2) 98 | write_classification_data(classification_data, classification_dir, splits[split].replace(".json", "")) 99 | 100 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"new_mr": "amr"}) 101 | fix_text_in_dir(out_folder) 102 | 103 | 104 | if __name__ == "__main__": 105 | Fire(preprocess) 106 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/scalar_slot.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list, find_all_in_list, get_slot_value_alternatives 4 | 5 | 6 | DIST_IDX_THRESH = 10 7 | DIST_POS_THRESH = 30 8 | 9 | 10 | def align_scalar_slot(text, text_tok, slot, value, slot_mapping=None, value_mapping=None, slot_stem_only=False): 11 | slot_stem_indexes = [] 12 | slot_stem_positions = [] 13 | leftmost_pos = -1 14 | 15 | text = re.sub(r'\'', '', text) 16 | 17 | # Get the words that possibly realize the slot 18 | slot_stems = __get_scalar_slot_stems(slot) 19 | 20 | if slot_mapping is not None: 21 | slot = slot_mapping 22 | alternatives = get_slot_value_alternatives(slot) 23 | 24 | # Search for all possible slot realizations 25 | for slot_stem in slot_stems: 26 | if len(slot_stem) == 1 and not slot_stem.isalnum(): 27 | # Exception for single-letter special-character slot stems 28 | slot_stem_pos = [m.start() for m in re.finditer(slot_stem, text)] 29 | elif len(slot_stem) > 4 or ' ' in slot_stem: 30 | slot_stem_pos = [m.start() for m in re.finditer(slot_stem, text)] 31 | else: 32 | slot_stem_idx, slot_stem_pos = find_all_in_list(slot_stem, text_tok) 33 | if len(slot_stem_idx) > 0: 34 | slot_stem_indexes.extend(slot_stem_idx) 35 | 36 | if len(slot_stem_pos) > 0: 37 | slot_stem_positions.extend(slot_stem_pos) 38 | 39 | slot_stem_positions.sort() 40 | slot_stem_indexes.sort() 41 | 42 | # If it's only required that the slot stem is matched, don't search for the value 43 | if slot_stem_only and len(slot_stem_positions) > 0: 44 | return slot_stem_positions[0] 45 | 46 | # Get the value's alternative realizations 47 | value_alternatives = [value] 48 | if value_mapping is not None: 49 | value = value_mapping[value] 50 | value_alternatives.append(value) 51 | if value in alternatives: 52 | value_alternatives += alternatives[value] 53 | 54 | # Search for all possible value equivalents 55 | for val in value_alternatives: 56 | if len(val) > 4 or ' ' in val: 57 | # Search for multi-word values in the string representation 58 | val_positions = [m.start() for m in re.finditer(val, text)] 59 | for pos in val_positions: 60 | # Remember the leftmost value position as a fallback in case there is no nearby slot stem mention 61 | if pos < leftmost_pos or leftmost_pos == -1: 62 | leftmost_pos = pos 63 | 64 | # Find a slot stem mention within a certain distance from the value realization 65 | if len(slot_stem_positions) > 0: 66 | for slot_stem_pos in slot_stem_positions: 67 | if abs(pos - slot_stem_pos) < DIST_POS_THRESH: 68 | return pos 69 | else: 70 | # Search for single-word values in the tokenized representation 71 | val_indexes, val_positions = find_all_in_list(val, text_tok) 72 | for i, idx in enumerate(val_indexes): 73 | # Remember the leftmost value position as a fallback in case there is no nearby slot stem mention 74 | if val_positions[i] < leftmost_pos or leftmost_pos 
== -1: 75 | leftmost_pos = val_positions[i] 76 | 77 | # Find a slot stem mention within a certain distance from the value realization 78 | if len(slot_stem_indexes) > 0: 79 | for slot_stem_idx in slot_stem_indexes: 80 | if abs(idx - slot_stem_idx) < DIST_IDX_THRESH: 81 | return val_positions[i] 82 | 83 | return leftmost_pos 84 | 85 | 86 | def __get_scalar_slot_stems(slot): 87 | slot_stems = { 88 | 'esrb': ['esrb'], 89 | 'rating': ['rating', 'ratings', 'rated', 'rate', 'review', 'reviews'], 90 | 'customerrating': ['customer', 'rating', 'ratings', 'rated', 'rate', 'review', 'reviews', 'star', 'stars'], 91 | 'pricerange': ['price', 'pricing', 'cost', 'costs', 'dollars', 'pounds', 'euros', '\$', '£', '€'] 92 | } 93 | 94 | return slot_stems.get(slot, []) 95 | -------------------------------------------------------------------------------- /paper/experiments/e2e/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import random 4 | from collections import OrderedDict 5 | from copy import deepcopy 6 | from pathlib import Path 7 | 8 | import pandas as pd 9 | from datatuner.classification.distractors import (get_distractors, 10 | write_classification_data) 11 | from datatuner.lm.special_token_generator import generate_from_json 12 | from datatuner.lm.utils import fix_text_in_dir 13 | from fire import Fire 14 | 15 | 16 | def parse_mr(mr): 17 | params_str = mr 18 | j = 0 19 | scope = "key" 20 | current_key = "" 21 | current_val = "" 22 | keys = [] 23 | values = [] 24 | while j < len(params_str): 25 | next_char = params_str[j] 26 | if scope == "key": 27 | if next_char == "[": 28 | scope = "value" 29 | keys.append(current_key) 30 | current_key = "" 31 | else: 32 | current_key += next_char 33 | 34 | elif scope == "value": 35 | if next_char == "]": 36 | scope = "between" 37 | 38 | values.append(current_val) 39 | current_val = "" 40 | else: 41 | current_val += next_char 42 | elif scope == "between": 43 | scope = "key" 44 | j += 2 45 | continue 46 | 47 | j += 1 48 | 49 | assert len(keys) == len(values) 50 | return {"keys": keys, "values": values} 51 | 52 | 53 | def preprocess(in_folder, out_folder, classification_dir): 54 | in_folder = Path(in_folder) 55 | splits = {"train-fixed.no-ol": "train", "devel-fixed.no-ol": "validation", "test-fixed": "test"} 56 | 57 | for split in splits: 58 | classification_data = [] 59 | df = pd.read_csv(in_folder / (split + ".csv")) 60 | out_folder = Path(out_folder) 61 | out_folder.mkdir(parents=True, exist_ok=True) 62 | data = df.to_dict(orient="records") 63 | original_data = deepcopy(data) 64 | 65 | new_data = OrderedDict() 66 | print(len(data)) 67 | for item in data: 68 | key = item["mr"] 69 | if key in new_data: 70 | new_data[key].append(item) 71 | else: 72 | new_data[key] = [item] 73 | 74 | out_data = [] 75 | for mr_key in new_data: 76 | 77 | for item in new_data[mr_key]: 78 | 79 | mr = item["mr"] 80 | parsed = parse_mr(mr) 81 | new_params = [f"<{key}> {key} = [ {value} ]" for key, value in zip(parsed["keys"], parsed["values"])] 82 | new_mr = " ; ".join(new_params) 83 | item["new_mr"] = new_mr 84 | out_data.append(item) 85 | 86 | valid_values = [x for x in parsed["values"] if x] 87 | swapping_candidates = [valid_values] 88 | cutting_candidates = [valid_values] 89 | 90 | rand_item = None 91 | while rand_item is None or rand_item == item: 92 | rand_item = random.choice(original_data) 93 | random_text = rand_item["ref"] 94 | 95 | distractors, classification_items = get_distractors( 96 | new_mr, 
97 | item["ref"], 98 | swapping_candidates, 99 | cutting_candidates, 100 | random_text, 101 | num_candidates=1, 102 | max_per_operation=1, 103 | ) 104 | 105 | classification_data.extend(classification_items) 106 | 107 | print(f"written for {split}") 108 | json.dump(out_data, open(out_folder / (splits[split] + ".json"), "w"), indent=2) 109 | classification_data = random.sample(classification_data, int(math.ceil(0.7 * len(classification_data)))) 110 | write_classification_data(classification_data, classification_dir, splits[split].replace(".json", "")) 111 | 112 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"new_mr": "amr"}) 113 | fix_text_in_dir(out_folder) 114 | 115 | 116 | if __name__ == "__main__": 117 | Fire(preprocess) 118 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/categorical_slots.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.corpus import wordnet 3 | 4 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list, get_slot_value_alternatives 5 | 6 | 7 | def align_categorical_slot(text, text_tok, slot, value, mode='exact_match'): 8 | # TODO: load alternatives only once 9 | alternatives = get_slot_value_alternatives(slot) 10 | 11 | pos = find_value_alternative(text, text_tok, value, alternatives, mode=mode) 12 | 13 | return pos 14 | 15 | 16 | def find_value_alternative(text, text_tok, value, alternatives, mode): 17 | leftmost_pos = -1 18 | 19 | # Parse the item into tokens according to the selected mode 20 | if mode == 'first_word': 21 | value_alternatives = [value.split(' ')[0]] # Single-element list 22 | elif mode == 'any_word': 23 | value_alternatives = value.split(' ') # List of elements 24 | elif mode == 'all_words': 25 | value_alternatives = [value.split(' ')] # List of single-element lists 26 | else: 27 | value_alternatives = [value] # Single-element list 28 | 29 | # Merge the tokens with the item's alternatives 30 | if value in alternatives: 31 | value_alternatives += alternatives[value] 32 | 33 | # Iterate over individual tokens of the item 34 | for value_alt in value_alternatives: 35 | # If the item is composed of a single token, convert it to a single-element list 36 | if not isinstance(value_alt, list): 37 | value_alt = [value_alt] 38 | 39 | # Keep track of the positions of all the item's tokens 40 | positions = [] 41 | for tok in value_alt: 42 | if len(tok) > 4 or ' ' in tok: 43 | # Search for long and multi-word values in the string representation 44 | pos = text.find(tok) 45 | else: 46 | # Search for short single-word values in the tokenized representation 47 | _, pos = find_first_in_list(tok, text_tok) 48 | positions.append(pos) 49 | 50 | # If all tokens of one of the value's alternatives are matched, record the match and break 51 | if all([p >= 0 for p in positions]): 52 | leftmost_pos = min(positions) 53 | break 54 | 55 | return leftmost_pos 56 | 57 | 58 | # TODO @food has 24 failures which are acceptable to remove the slot 59 | def foodSlot(text, text_tok, value): 60 | value = value.lower() 61 | 62 | pos = text.find(value) 63 | if pos >= 0: 64 | return pos 65 | elif value == 'english': 66 | return text.find('british') 67 | elif value == 'fast food': 68 | return text.find('american style') 69 | else: 70 | text_tok = word_tokenize(text) 71 | for token in text_tok: 72 | # FIXME warning this will be slow on start up 73 | 
synsets = wordnet.synsets(token, pos='n') 74 | synset_ctr = 0 75 | 76 | for synset in synsets: 77 | synset_ctr += 1 78 | hypernyms = synset.hypernyms() 79 | 80 | # If none of the first 3 meanings of the word has "food" as hypernym, then we do not want to 81 | # identify the word as food-related (e.g. "center" has its 14th meaning associated with "food", 82 | # or "green" has its 7th meaning accociated with "food"). 83 | while synset_ctr <= 3 and len(hypernyms) > 0: 84 | lemmas = [l.name() for l in hypernyms[0].lemmas()] 85 | 86 | if 'food' in lemmas: 87 | # DEBUG PRINT 88 | # print(token) 89 | 90 | return text.find(token) 91 | # Skip false positives (e.g. "a" in the meaning of "vitamin A" has "food" as a hypernym, 92 | # or "coffee" in "coffee shop" has "food" as a hypernym). There are still false positives 93 | # triggered by proper nouns containing a food term, such as "Burger King" or "The Golden Curry". 94 | elif 'vitamin' in lemmas: 95 | break 96 | elif 'beverage' in lemmas: 97 | break 98 | 99 | # Follow the hypernyms recursively up to the root 100 | hypernyms = hypernyms[0].hypernyms() 101 | 102 | return pos 103 | -------------------------------------------------------------------------------- /src/datatuner/lm/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | from pathlib import Path 5 | 6 | import ftfy 7 | import torch 8 | 9 | logger = logging.getLogger(__file__) 10 | 11 | 12 | def average_distributed_scalar(scalar, args): 13 | """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ 14 | if args.local_rank == -1: 15 | return scalar 16 | scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size() 17 | torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) 18 | return scalar_t.item() 19 | 20 | 21 | def load_task_config(filename): 22 | """Load the task configuration from file""" 23 | task_config = json.load(open(filename, "r")) 24 | return task_config 25 | 26 | 27 | def is_middle_token(tokenizer, token_str, prefix): 28 | try: 29 | tokenizer_name = str(type(tokenizer)) 30 | 31 | if len(prefix) == 0: 32 | return False 33 | 34 | prev_token_str = tokenizer.decode(prefix[-1]) 35 | 36 | # If the previous token is not alphanumeric, it's not a middle token 37 | if not prev_token_str[-1].isalnum(): 38 | return False 39 | 40 | # The prev and current tokens should be of same type. 41 | if not ( 42 | (prev_token_str[-1].isalpha() and token_str[0].isalpha()) 43 | or (prev_token_str[-1].isdigit() and token_str[0].isdigit()) 44 | ): 45 | return False 46 | 47 | if "GPT2" in tokenizer_name: 48 | return not (token_str[0] in [" ", "\u0120"]) 49 | elif "OpenAIGPT" in tokenizer_name: 50 | return not prefix[-1].endswith("") 51 | else: 52 | raise Exception("non-supported tokenizer") 53 | except: 54 | return False 55 | 56 | 57 | def is_added_token(tokenizer, token_id): 58 | return token_id >= len(tokenizer.decoder) 59 | 60 | 61 | def should_stop_further_beam_exploration(prefix, tokenizer, next_token_str, next_token_id, next_prob, prob_thresh=0.9): 62 | """We stop exploring the beam further if the current string is a word continuation as we don't expect better 63 | continuations to appear. 64 | Example 1: if we get "Who is the res" and the next token is "ponsible", we stop exploring. 65 | Example 2: if we get "The airport code is 12" and the next token is "4", we stop exploring. 
66 | Example 3: if we get "The airport code is twenty" and the next token is ".", we stop exploring. 67 | Example 4: if we get "The airport code is 123" and the next token is ".", we stop exploring. 68 | """ 69 | return ( 70 | # The token is a middle token 71 | is_middle_token(tokenizer, next_token_str, prefix) 72 | # is not a special token 73 | and not is_added_token(tokenizer, next_token_id) 74 | and next_prob > prob_thresh 75 | ) 76 | 77 | 78 | def should_ignore_in_score(prefix, tokenizer, next_token_str, next_token_id, next_prob, prob_thresh=0.9): 79 | return ( 80 | # Probability is high enough 81 | # next_prob > prob_thresh 82 | # The token is a middle token 83 | is_middle_token(tokenizer, next_token_str, prefix) 84 | # is alphanumeric (avoid punctuations) 85 | and next_token_str.strip()[0].isalnum() 86 | # is not a special token 87 | and not is_added_token(tokenizer, next_token_id) 88 | and next_prob > prob_thresh 89 | ) 90 | 91 | 92 | def custom_deep_copy(d): 93 | if type(d) == dict: 94 | new_d = {} 95 | for key in d: 96 | try: 97 | new_d[key] = torch.clone(d[key]) 98 | except: 99 | new_d[key] = copy.deepcopy(d[key]) 100 | return new_d 101 | else: 102 | try: 103 | return torch.clone(d) 104 | except: 105 | return copy.deepcopy(d) 106 | 107 | 108 | def fix_text_in_dir(directory): 109 | """Fix text encoding with ftfy for the data splits withdirectory""" 110 | directory = Path(directory) 111 | for split in ["train.json", "validation.json", "test.json"]: 112 | data = json.load(open(directory / split)) 113 | for item in data: 114 | for k in item: 115 | if type(item[k]) == str: 116 | item[k] = ftfy.fix_text(item[k]) 117 | elif type(item[k]) == list: 118 | item[k] = [ftfy.fix_text(x) for x in item[k]] 119 | json.dump(data, open(directory / split, "w"), indent=2) 120 | -------------------------------------------------------------------------------- /paper/retrieve.sh: -------------------------------------------------------------------------------- 1 | source ./config.sh 2 | 3 | # Function Definitions 4 | 5 | # confirm external dependencies with user 6 | EXTERNAL_DEPS_MSG="""The scripts provided herein will retrieve several third-party libraries, 7 | environments, and/or other software packages at install-time or build-time (“External Dependencies”) 8 | from third-party sources. There are terms and conditions that you need to agree to 9 | abide by if you choose to install the External Dependencies. If you do not agree 10 | with every term and condition associated with the External Dependencies, 11 | enter “QUIT” in the command line when prompted by the script.""" 12 | 13 | confirm_external_dependencies() { 14 | echo 15 | echo $EXTERNAL_DEPS_MSG 16 | while true; do 17 | read -p "Do you want to PROCEED or QUIT? " yn 18 | case $yn in 19 | PROCEED) 20 | echo "Proceeding" 21 | break 22 | ;; 23 | QUIT) 24 | echo "Quitting" 25 | exit 26 | ;; 27 | esac 28 | done 29 | } 30 | 31 | # clone a specific repository commit to the give folder 32 | clone_repo_commit() { 33 | # params: repo_url, commit, folder 34 | git clone $1 $3 35 | cd $3 36 | git checkout $2 --quiet 37 | cd - 38 | } 39 | 40 | # check if directory exists and exit with a special message if not 41 | assert_dir_exists() { 42 | # params: directory, message on failure 43 | if [ ! -d $1 ]; then 44 | echo "Error: $1 does not exist." 
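        # Descriptive note (added): $2 is the caller-supplied hint (e.g. the dataset download URL) printed before exiting.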
45 | echo $2 46 | exit 47 | fi 48 | } 49 | 50 | 51 | ############################################################################################# 52 | 53 | confirm_external_dependencies 54 | 55 | # Check that the LDC2017T10 and ViGGO datasets have been manually downloaded and placed in the correct locations. 56 | assert_dir_exists $LDC2017_DATA_LOCATION "The folder $LDC2017_DATA_LOCATION should contain the LDC2017T10 dataset. Download it from https://catalog.ldc.upenn.edu/LDC2017T10" 57 | newline 58 | assert_dir_exists $VIGGO_DATA_LOCATION "The folder $VIGGO_DATA_LOCATION should contain the ViGGO dataset. Download it from https://nlds.soe.ucsc.edu/viggo" 59 | newline 60 | 61 | # Ask the user if the temporary folder exists so that we don't remove and retrieve the data again. 62 | if [ -d $TMP_DATA_FOLDER ]; then 63 | echo "Directory $TMP_DATA_FOLDER exists. Are you sure you want to delete the data and retrieve it again?" 64 | 65 | select yn in "Yes" "No"; do 66 | case $yn in 67 | Yes) 68 | echo "Alright. Continuing!" 69 | break 70 | ;; 71 | No) 72 | echo "Exiting" 73 | exit 74 | ;; 75 | esac 76 | done 77 | else 78 | echo "Directory $TMP_DATA_FOLDER does not exist." 79 | fi 80 | 81 | newline 82 | echo "Creating the folder $TMP_DATA_FOLDER for placing the data there." 83 | rm -rf ./tmp 84 | mkdir -p ./tmp 85 | 86 | newline 87 | 88 | MAIN_DIR=`pwd` 89 | ############################################################################################# 90 | 91 | echo "Processing LDC2017T10 dataset" 92 | 93 | echo "Getting the repository for data preprocessing" 94 | clone_repo_commit https://github.com/UKPLab/emnlp2019-dualgraph.git 0c58fb7f3ad3b9da3b92b2d2841558807fc79fd0 $TMP_DATA_FOLDER/emnlp2019-dualgraph 95 | 96 | echo "Copying the changes needed" 97 | cp ../src/external/ukplab_emnlp2019_dualgraph/split_amr.py $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/split_amr.py 98 | cp ../src/external/ukplab_emnlp2019_dualgraph/gen_LDC2017T10.sh $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/gen_LDC2017T10.sh 99 | cp ../src/external/ukplab_emnlp2019_dualgraph/preproc_amr.py $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/preproc_amr.py 100 | cp ../src/external/ukplab_emnlp2019_dualgraph/preprocess_LDC2017T10.sh $TMP_DATA_FOLDER/emnlp2019-dualgraph/preprocess_LDC2017T10.sh 101 | 102 | echo "Running the initial preprocessing" 103 | bash $TMP_DATA_FOLDER/emnlp2019-dualgraph/preprocess_LDC2017T10.sh $LDC2017_DATA_LOCATION ~/ 104 | 105 | ############################################################################################# 106 | 107 | newline 108 | cd $MAIN_DIR 109 | 110 | # WebNLG dataset 111 | echo "Processing WebNLG dataset" 112 | 113 | echo "Retrieving WebNLG data" 114 | clone_repo_commit https://github.com/ThiagoCF05/webnlg.git 12ca34880b225ebd1eb9db07c64e8dd76f7e5784 $TMP_DATA_FOLDER/webnlg 115 | 116 | ############################################################################################# 117 | 118 | newline 119 | cd $MAIN_DIR 120 | 121 | # Cleaned E2E 122 | echo "Processing Cleaned E2E dataset" 123 | clone_repo_commit https://github.com/tuetschek/e2e-cleaning.git c6f634ba16aec89f5ec5462e9c62fb3e8c5c5d16 $TMP_DATA_FOLDER/e2e-cleaning 124 | 125 | ############################################################################################# 126 | 127 | cd $MAIN_DIR 128 | 129 | # E2E Metrics 130 | echo "Getting E2E metrics repository" 131 | clone_repo_commit https://github.com/tuetschek/e2e-metrics.git dca5d301a97f7264b0827fb5589c0cc51008b5d7 $TMP_DATA_FOLDER/e2e-metrics 132 | 133 | 134 | 
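# Descriptive note (added): at this point $TMP_DATA_FOLDER should contain the emnlp2019-dualgraph, webnlg, e2e-cleaning and e2e-metrics checkouts retrieved above.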
newline 135 | echo "Successfully retrieved the data from their sources" 136 | -------------------------------------------------------------------------------- /src/external/shimorina_inlg_2018/webnlg_slot_error_rate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import re 4 | import sys 5 | 6 | from external.shimorina_webnlg_baseline.benchmark_reader import Benchmark 7 | from external.shimorina_webnlg_baseline.webnlg_baseline_input import select_files 8 | from nltk.tokenize import wordpunct_tokenize 9 | import json 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 12 | 13 | 14 | def substring_match(x, values): 15 | return any(x in v for v in values) 16 | 17 | 18 | verbose = False 19 | 20 | 21 | def calculate_ser(mr, pred): 22 | values = clean_mr(mr) 23 | total_n_slots = len(values) 24 | missing = 0 25 | hallucinated = 0 26 | for value in values: 27 | if value not in pred.lower(): 28 | if verbose: 29 | print("\n") 30 | print("Missing:", value) 31 | print(mr) 32 | print(value) 33 | print(pred) 34 | missing += 1 35 | # delete s and o that are present in MR 36 | # account for the case where the item is "texas" and the values have an entry: "abilene, texas". This is not hallucinated. 37 | all_subj_obj_not_pres = [ 38 | item for item in ENTITIES if not substring_match(item, values) 39 | ] 40 | # all_subj_obj_not_pres = [item for item in ENTITIES if item not in values] 41 | 42 | for entity in all_subj_obj_not_pres: 43 | if entity in pred.lower().split(): 44 | hallucinated += 1 45 | if verbose: 46 | print("\n") 47 | print("Hallucination:") 48 | print(mr) 49 | print(entity) 50 | print(pred) 51 | # print('COUNTS: Missing', missing, 'Hallucinated', hallucinated, 'Denominator', total_n_slots) 52 | ser = (missing + hallucinated) / total_n_slots 53 | return ser 54 | 55 | 56 | def clean_mr(mr): 57 | # (19255)_1994_VK8 | density | 2.0(gramPerCubicCentimetres) | | | 58 | # extract all subjects and objects and clean them 59 | subj_obj = [] 60 | triples = mr.strip().split("|||") # the last one is empty 61 | triples = [triple for triple in triples if triple] # delete empty triples 62 | for triple in triples: 63 | s, p, o = triple.split(" | ") 64 | s = s.lower().replace("_", " ") 65 | o = o.lower().replace("_", " ") 66 | # separate punct signs from text 67 | s = " ".join(re.split(r"(\W)", s)) 68 | o = " ".join(re.split(r"(\W)", o)) 69 | # Drop quotes 70 | s = s.replace('"', "") 71 | o = o.replace('"', "") 72 | # delete white spaces 73 | subj_obj.append(" ".join(s.split())) 74 | subj_obj.append(" ".join(o.split())) 75 | return subj_obj 76 | 77 | 78 | def get_all_subj_obj(): 79 | # read all the webnlg corpus 80 | # extract all subjects and objects 81 | base_path = "/paper/tmp/webnlg/data/v1.4/en/" 82 | path_train = base_path + "train" 83 | path_dev = base_path + "dev" 84 | path_test = base_path + "test" 85 | b = Benchmark() 86 | files_train = select_files(path_train) 87 | files_dev = select_files(path_dev) 88 | files_test = select_files(path_test) 89 | b.fill_benchmark(files_train + files_dev + files_test) 90 | subjs, objs = b.subjects_objects() 91 | # clean subj and obj 92 | subjs_cleaned = [] 93 | for subj in list(subjs): 94 | subjs_cleaned.append(clean(subj)) 95 | objs_cleaned = [] 96 | for obj in list(objs): 97 | objs_cleaned.append(clean(obj)) 98 | return subjs_cleaned, objs_cleaned 99 | 100 | 101 | def clean(entity): 102 | entity = entity.lower().replace("_", " ") 103 | # separate punct 
signs from text 104 | entity = " ".join(re.split(r"(\W)", entity)) 105 | entity = " ".join(entity.split()) # delete whitespaces 106 | return entity 107 | 108 | 109 | def get_all_entities_in_corpus(): 110 | # get all cleaned s and o from the whole corpus 111 | all_subj_cleaned, all_obj_cleaned = get_all_subj_obj() 112 | entities = list(set(all_subj_cleaned + all_obj_cleaned)) 113 | # delete all numbers from entities 114 | for i, entity in enumerate(entities): 115 | try: 116 | float(entity.replace(" ", "")) 117 | del entities[i] 118 | except ValueError: 119 | pass 120 | return entities 121 | 122 | 123 | ENTITIES = get_all_entities_in_corpus() 124 | 125 | 126 | def compute_ser(datafile, outfile, mr_field, text_field): 127 | df = pd.read_json(datafile, orient="records") 128 | 129 | df["ser"] = df.apply( 130 | lambda x: calculate_ser( 131 | x[mr_field], " ".join(wordpunct_tokenize(x[text_field][0])) 132 | ), 133 | axis=1, 134 | ) 135 | 136 | df["ser_correct"] = df["ser"].apply(lambda x: 0 if x > 0 else 1) 137 | 138 | results = {} 139 | results["mean_ser"] = round(df["ser"].mean(), 4) 140 | results["percent_correct_ser"] = round(len(df[df["ser"] == 0]) / len(df) * 100, 4) 141 | print(json.dumps(results, indent=2)) 142 | 143 | data_dict = df.to_dict(orient="records") 144 | json.dump(data_dict, open(outfile, "w"), indent=2) 145 | 146 | return results 147 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/alternatives.json: -------------------------------------------------------------------------------- 1 | { 2 | "genres": { 3 | "action adventure": [ 4 | [ 5 | "action", 6 | "adventur" 7 | ] 8 | ], 9 | "adventure": [ 10 | "adventur" 11 | ], 12 | "driving racing": [ 13 | "driving", 14 | "drive", 15 | "racing", 16 | "race" 17 | ], 18 | "fighting": [ 19 | "fight" 20 | ], 21 | "mmorpg": [ 22 | "massive" 23 | ], 24 | "platformer": [ 25 | "platforming" 26 | ], 27 | "real time strategy": [ 28 | "real time", 29 | "rts" 30 | ], 31 | "role playing": [ 32 | "roleplaying", 33 | "role play", 34 | "rpg", 35 | "rpgs" 36 | ], 37 | "shooter": [ 38 | "shoot", 39 | "fps" 40 | ], 41 | "simulation": [ 42 | "simulat", 43 | "sim" 44 | ], 45 | "strategy": [ 46 | "strateg" 47 | ], 48 | "tactical": [ 49 | "tactic" 50 | ], 51 | "trivia board game": [ 52 | "trivia", 53 | "board" 54 | ], 55 | "turn based strategy": [ 56 | "turn based" 57 | ], 58 | "vehicular combat": [ 59 | [ 60 | "vehic", 61 | "combat" 62 | ] 63 | ] 64 | }, 65 | "playerperspective": { 66 | "first person": [ 67 | "fps" 68 | ], 69 | "bird view": [ 70 | "top down" 71 | ] 72 | }, 73 | "rating": { 74 | "excellent": [ 75 | "5 out of", 76 | "5 star", 77 | "adore", 78 | "amazing", 79 | "attract", 80 | "awesome", 81 | "best", 82 | "fantastic", 83 | "favorite", 84 | "five", 85 | "great", 86 | "high", 87 | "highly", 88 | "love", 89 | "loved", 90 | "loving", 91 | "quality", 92 | "special", 93 | "superb", 94 | "top", 95 | "unique" 96 | ], 97 | "good": [ 98 | "acclaim", 99 | "cool", 100 | "enjoy", 101 | "fun", 102 | "like", 103 | "liked", 104 | "positive", 105 | "solid", 106 | "well" 107 | ], 108 | "average": [ 109 | "3 out of", 110 | "3 star", 111 | "all right", 112 | "alright", 113 | "decent", 114 | "kinda", 115 | "kind of", 116 | "lukewarm", 117 | "mediocre", 118 | "meh", 119 | "middle", 120 | "middling", 121 | "mixed", 122 | "moderate", 123 | "ok", 124 | "okay", 125 | "ordinary", 126 | "so so", 127 | "three", 128 | "unimpress" 129 | ], 130 | "poor": [ 131 | "1 out of", 132 | "1 
star", 133 | "avoid", 134 | "bad", 135 | "badly", 136 | "boring", 137 | "detest", 138 | "disappoint", 139 | "dislike", 140 | "dull", 141 | "hate", 142 | "hated", 143 | "hating", 144 | "lackluster", 145 | "lacking", 146 | "loathe", 147 | "low", 148 | "lowly", 149 | "negative", 150 | "one", 151 | "poorly", 152 | "underwhelm", 153 | "wrong" 154 | ] 155 | }, 156 | "esrb": { 157 | "e ( for everyone )": [ 158 | "e rated", 159 | "rated e", 160 | "e rating", 161 | "rating e", 162 | "everyone", 163 | "all" 164 | ], 165 | "e 10+ ( for everyone 10 and older )": [ 166 | "e10+", 167 | "e 10+", 168 | "e 10 plus", 169 | "everyone 10", 170 | "everyone above", 171 | "everyone over", 172 | "everyone older" 173 | ], 174 | "t ( for teen )": [ 175 | "t rated", 176 | "rated t", 177 | "t rating", 178 | "rating t", 179 | "teen", 180 | "teens", 181 | "teenagers" 182 | ], 183 | "m ( for mature )": [ 184 | "m rated", 185 | "rated m", 186 | "m rating", 187 | "rating m", 188 | "mature", 189 | "adult" 190 | ] 191 | }, 192 | "pricerange": { 193 | "cheap": [ 194 | "less than \u00a320", 195 | "inexpensive", 196 | "affordable", 197 | "low", 198 | "lower", 199 | "budget", 200 | "bargain" 201 | ], 202 | "moderate": [ 203 | "\u00a320-25", 204 | "average", 205 | "reasonable" 206 | ], 207 | "high": [ 208 | "more than \u00a330", 209 | "expensive", 210 | "costly", 211 | "pricey", 212 | "high", 213 | "higher" 214 | ], 215 | "less than \u00a320": [ 216 | "cheap", 217 | "inexpensive", 218 | "affordable", 219 | "low", 220 | "budget" 221 | ], 222 | "\u00a320-25": [ 223 | "moderate", 224 | "average", 225 | "reasonable" 226 | ], 227 | "more than \u00a330": [ 228 | "high", 229 | "expensive", 230 | "costly", 231 | "pricey", 232 | "high" 233 | ] 234 | }, 235 | "area": { 236 | "city centre": [ 237 | "center", 238 | "centre", 239 | "downtown", 240 | [ 241 | "middle", 242 | "city" 243 | ], 244 | [ 245 | "middle", 246 | "town" 247 | ] 248 | ], 249 | "riverside": [ 250 | "river" 251 | ] 252 | }, 253 | "type": { 254 | "television": [ 255 | "tv" 256 | ] 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /src/external/webnlg_webnlg_baseline/benchmark_reader.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as Et 2 | from collections import defaultdict 3 | 4 | 5 | class Triple: 6 | 7 | def __init__(self, s, p, o): 8 | self.s = s 9 | self.o = o 10 | self.p = p 11 | 12 | 13 | class Tripleset: 14 | 15 | def __init__(self): 16 | self.triples = [] 17 | 18 | def fill_tripleset(self, t): 19 | for xml_triple in t: 20 | s, p, o = xml_triple.text.split(' | ') 21 | triple = Triple(s, p, o) 22 | self.triples.append(triple) 23 | 24 | 25 | class Lexicalisation: 26 | 27 | def __init__(self, lex, comment, lid): 28 | self.lex = lex 29 | self.comment = comment 30 | self.id = lid 31 | 32 | 33 | class Entry: 34 | 35 | def __init__(self, category, size, eid): 36 | self.originaltripleset = [] 37 | self.modifiedtripleset = Tripleset() 38 | self.lexs = [] 39 | self.category = category 40 | self.size = size 41 | self.id = eid 42 | 43 | def fill_originaltriple(self, xml_t): 44 | otripleset = Tripleset() 45 | self.originaltripleset.append(otripleset) # multiple originaltriplesets for one entry 46 | otripleset.fill_tripleset(xml_t) 47 | 48 | def fill_modifiedtriple(self, xml_t): 49 | self.modifiedtripleset.fill_tripleset(xml_t) 50 | 51 | def create_lex(self, xml_lex): 52 | comment = xml_lex.attrib['comment'] 53 | lid = xml_lex.attrib['lid'] 54 | lex = 
Lexicalisation(xml_lex.text, comment, lid) 55 | self.lexs.append(lex) 56 | 57 | def count_lexs(self): 58 | return len(self.lexs) 59 | 60 | 61 | class Benchmark: 62 | 63 | def __init__(self): 64 | self.entries = [] 65 | 66 | def fill_benchmark(self, fileslist): 67 | for file in fileslist: 68 | tree = Et.parse(file[0] + '/' + file[1]) 69 | root = tree.getroot() 70 | for xml_entry in root.iter('entry'): 71 | # ignore triples with no lexicalisations 72 | lexfound = False 73 | for child in xml_entry: 74 | if child.tag == "lex": 75 | lexfound = True 76 | break 77 | if lexfound is False: 78 | continue 79 | 80 | entry_id = xml_entry.attrib['eid'] 81 | category = xml_entry.attrib['category'] 82 | size = xml_entry.attrib['size'] 83 | entry = Entry(category, size, entry_id) 84 | for child in xml_entry: 85 | if child.tag == 'originaltripleset': 86 | entry.fill_originaltriple(child) 87 | elif child.tag == 'modifiedtripleset': 88 | entry.fill_modifiedtriple(child) 89 | elif child.tag == 'lex': 90 | entry.create_lex(child) 91 | self.entries.append(entry) 92 | 93 | def total_lexcount(self): 94 | count = [entry.count_lexs() for entry in self.entries] 95 | return sum(count) 96 | 97 | def unique_p(self): 98 | properties = [triple.p for entry in self.entries for triple in entry.modifiedtripleset.triples] 99 | return len(set(properties)) 100 | 101 | def entry_count(self, size=None, cat=None): 102 | """ 103 | calculate the number of entries in benchmark 104 | :param size: size (should be string) 105 | :param cat: category 106 | :return: entry count 107 | """ 108 | if not size and cat: 109 | entries = [entry for entry in self.entries if entry.category == cat] 110 | elif not cat and size: 111 | entries = [entry for entry in self.entries if entry.size == size] 112 | elif not size and not cat: 113 | return len(self.entries) 114 | else: 115 | entries = [entry for entry in self.entries if entry.category == cat and entry.size == size] 116 | return len(entries) 117 | 118 | def lexcount_size_category(self, size='', cat=''): 119 | count = [entry.count_lexs() for entry in self.entries if entry.category == cat and entry.size == size] 120 | return len(count) 121 | 122 | def property_map(self): 123 | mprop_oprop = defaultdict(set) 124 | for entry in self.entries: 125 | for tripleset in entry.originaltripleset: 126 | for i, triple in enumerate(tripleset.triples): 127 | mprop_oprop[entry.modifiedtripleset.triples[i].p].add(triple.p) 128 | return mprop_oprop 129 | 130 | def subjects_objects(self): 131 | subjects = [] 132 | objects = [] 133 | for entry in self.entries: 134 | for tripleset in entry.originaltripleset: 135 | for triple in tripleset.triples: 136 | if triple.o not in objects: 137 | objects.append(triple.o) 138 | if triple.s not in subjects: 139 | subjects.append(triple.o) 140 | return [subjects, objects] -------------------------------------------------------------------------------- /src/datatuner/lm/model_loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | import mlflow 6 | from datatuner.lm import custom_tokenizer 7 | from datatuner.lm.custom_gpt2 import custom_gpt2_with_smoothing 8 | from datatuner.lm.data_loader import PAD_TOKEN 9 | from transformers import ( 10 | GPT2DoubleHeadsModel, 11 | GPT2LMHeadModel, 12 | GPT2Tokenizer, 13 | OpenAIGPTDoubleHeadsModel, 14 | OpenAIGPTLMHeadModel, 15 | OpenAIGPTTokenizer, 16 | GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, 17 | ) 18 | from transformers.tokenization_utils 
import PreTrainedTokenizer 19 | 20 | logger = logging.getLogger(__file__) 21 | 22 | 23 | def load_pretrained_tokenizer(model_checkpoint, model_type): 24 | """Load pretrained tokenizer""" 25 | 26 | tokenizer_class = OpenAIGPTTokenizer if "openai-gpt" in model_type else GPT2Tokenizer 27 | tokenizer = tokenizer_class.from_pretrained(model_checkpoint) 28 | PreTrainedTokenizer.tokenize = custom_tokenizer.tokenize 29 | return tokenizer 30 | 31 | 32 | def load_training_args(run_id): 33 | client = mlflow.tracking.MlflowClient() 34 | training_args_file = client.download_artifacts(run_id, "training/model_training_args.json") 35 | model_training_args = json.load(open(training_args_file)) 36 | return model_training_args 37 | 38 | 39 | def get_model_directory(model_checkpoint=None): 40 | """Get the model directory; if `model_checkpoint` is a folder, it is returned; 41 | if it is a shortcut name for a Hugging Face model, the name is returned for handling downstream; 42 | if it's a run_id, the folder is obtained from mlflow.""" 43 | is_local = True 44 | if Path(model_checkpoint).exists(): 45 | return Path(model_checkpoint), is_local 46 | elif model_checkpoint in GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys(): 47 | is_local = False 48 | return model_checkpoint, is_local 49 | else: 50 | client = mlflow.tracking.MlflowClient() 51 | run_info = client.get_run(model_checkpoint) 52 | is_local = False 53 | return Path(run_info.info.artifact_uri) / "training", is_local 54 | 55 | 56 | def read_special_tokens(task_config=None, special_tokens_file=None, dataset_path=None): 57 | """Read special tokens from file and from the task configuration""" 58 | tokens = [] 59 | # If no special tokens file is explicitly passed, we try finding a special_tokens.txt file in the model directory 60 | if special_tokens_file is None: 61 | if dataset_path is not None: 62 | special_tokens_file = Path(dataset_path) / "special_tokens.txt" 63 | 64 | # Add any special tokens indicated in the file 65 | if special_tokens_file is not None and special_tokens_file.exists(): 66 | tokens += [x for x in special_tokens_file.read_text().split("\n") if x.strip()] 67 | logger.info(f"read {len(tokens)} special tokens from {special_tokens_file}") 68 | 69 | if task_config is not None: 70 | # add any special tokens defined in the tokenization 71 | for item in task_config["data_shape"]: 72 | if item["type"] == "special": 73 | tokens += [item["id"]] 74 | 75 | if "extra_special_tokens" in task_config: 76 | tokens.extend(task_config["extra_special_tokens"]) 77 | 78 | # Add basic eos and padding tokens 79 | tokens += [PAD_TOKEN, ""] 80 | 81 | return tokens 82 | 83 | 84 | def load_pretrained( 85 | model_directory, 86 | model_type=None, 87 | smoothing=0.0, 88 | output_attentions=True, 89 | output_hidden_states=True, 90 | multitask=False, 91 | special_tokens_file=None, 92 | task_config=None, 93 | dataset_path=None, 94 | **kwargs, 95 | ): 96 | """Load pretrained model""" 97 | print("Get pretrained model and tokenizer") 98 | model_directory = str(model_directory) 99 | 100 | try: 101 | model_training_args = json.load(open(Path(model_directory) / "model_training_args.json")) 102 | if "gpt2" in model_training_args["model_directory"]: 103 | model_type = "gpt2" 104 | elif "openai-gpt" in model_training_args["model_directory"]: 105 | model_type = "openai-gpt" 106 | 107 | multitask = model_training_args["multitask"] 108 | except: 109 | pass 110 | 111 | if model_type is None: 112 | model_type = model_directory 113 | 114 | tokenizer = load_pretrained_tokenizer(model_directory, 
model_type) 115 | 116 | if smoothing > 0: 117 | model_class = custom_gpt2_with_smoothing(smoothing=smoothing) 118 | 119 | elif "gpt2" in model_type: 120 | if multitask: 121 | model_class = GPT2DoubleHeadsModel 122 | else: 123 | model_class = GPT2LMHeadModel 124 | elif "openai-gpt" in model_type: 125 | if multitask: 126 | model_class = OpenAIGPTDoubleHeadsModel 127 | else: 128 | model_class = OpenAIGPTLMHeadModel 129 | else: 130 | raise ValueError( 131 | "Invalid model type; make sure to pass the actual model_type if your checkpoint name or model name does not have the model type in them" 132 | ) 133 | 134 | model = model_class.from_pretrained( 135 | model_directory, output_attentions=output_attentions, output_hidden_states=output_hidden_states, **kwargs 136 | ) 137 | 138 | return model, tokenizer 139 | -------------------------------------------------------------------------------- /src/datatuner/lm/novograd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | import torch 17 | from torch.optim import Optimizer 18 | 19 | 20 | class Novograd(Optimizer): 21 | """ 22 | Implements Novograd algorithm. 23 | Args: 24 | params (iterable): iterable of parameters to optimize or dicts defining 25 | parameter groups 26 | lr (float, optional): learning rate (default: 1e-3) 27 | betas (Tuple[float, float], optional): coefficients used for computing 28 | running averages of gradient and its square (default: (0.95, 0)) 29 | eps (float, optional): term added to the denominator to improve 30 | numerical stability (default: 1e-8) 31 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 32 | grad_averaging: gradient averaging 33 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 34 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 35 | (default: False) 36 | """ 37 | 38 | def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, weight_decay=0, grad_averaging=False, amsgrad=False): 39 | if not 0.0 <= lr: 40 | raise ValueError("Invalid learning rate: {}".format(lr)) 41 | if not 0.0 <= eps: 42 | raise ValueError("Invalid epsilon value: {}".format(eps)) 43 | if not 0.0 <= betas[0] < 1.0: 44 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 45 | if not 0.0 <= betas[1] < 1.0: 46 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 47 | defaults = dict( 48 | lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging, amsgrad=amsgrad 49 | ) 50 | 51 | super(Novograd, self).__init__(params, defaults) 52 | 53 | def __setstate__(self, state): 54 | super(Novograd, self).__setstate__(state) 55 | for group in self.param_groups: 56 | group.setdefault("amsgrad", False) 57 | 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 
60 | Arguments: 61 | closure (callable, optional): A closure that reevaluates the model 62 | and returns the loss. 63 | """ 64 | loss = None 65 | if closure is not None: 66 | loss = closure() 67 | 68 | for group in self.param_groups: 69 | for p in group["params"]: 70 | if p.grad is None: 71 | continue 72 | grad = p.grad.data 73 | if grad.is_sparse: 74 | raise RuntimeError("Sparse gradients are not supported.") 75 | amsgrad = group["amsgrad"] 76 | 77 | state = self.state[p] 78 | 79 | # State initialization 80 | if len(state) == 0: 81 | state["step"] = 0 82 | # Exponential moving average of gradient values 83 | state["exp_avg"] = torch.zeros_like(p.data) 84 | # Exponential moving average of squared gradient values 85 | state["exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device) 86 | if amsgrad: 87 | # Maintains max of all exp. moving avg. of sq. grad. values 88 | state["max_exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device) 89 | 90 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 91 | if amsgrad: 92 | max_exp_avg_sq = state["max_exp_avg_sq"] 93 | beta1, beta2 = group["betas"] 94 | 95 | state["step"] += 1 96 | 97 | norm = torch.sum(torch.pow(grad, 2)) 98 | 99 | if exp_avg_sq == 0: 100 | exp_avg_sq.copy_(norm) 101 | else: 102 | exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) 103 | 104 | if amsgrad: 105 | # Maintains the maximum of all 2nd moment running avg. till now 106 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 107 | # Use the max. for normalizing running avg. of gradient 108 | denom = max_exp_avg_sq.sqrt().add_(group["eps"]) 109 | else: 110 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 111 | 112 | grad.div_(denom) 113 | if group["weight_decay"] != 0: 114 | grad.add_(group["weight_decay"], p.data) 115 | if group["grad_averaging"]: 116 | grad.mul_(1 - beta1) 117 | exp_avg.mul_(beta1).add_(grad) 118 | 119 | p.data.add_(-group["lr"], exp_avg) 120 | 121 | return loss 122 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/boolean_slot.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list, find_all_in_list 4 | 5 | 6 | NEG_IDX_FALSE_PRE_THRESH = 10 7 | NEG_POS_FALSE_PRE_THRESH = 30 8 | NEG_IDX_TRUE_PRE_THRESH = 5 9 | NEG_POS_TRUE_PRE_THRESH = 15 10 | NEG_IDX_POST_THRESH = 10 11 | NEG_POS_POST_THRESH = 30 12 | 13 | negation_cues_pre = [ 14 | 'no', 'not', 'non', 'none', 'neither', 'nor', 'never', 'n\'t', 'cannot', 15 | 'excluded', 'lack', 'lacks', 'lacking', 'unavailable', 'without', 'zero', 16 | 'everything but' 17 | ] 18 | negation_cues_post = [ 19 | 'not', 'nor', 'never', 'n\'t', 'cannot', 20 | 'excluded', 'unavailable' 21 | ] 22 | contrast_cues = [ 23 | 'but', 'however', 'although', 'though', 'nevertheless' 24 | ] 25 | 26 | 27 | def align_boolean_slot(text, text_tok, slot, value, true_val='yes', false_val='no'): 28 | pos = -1 29 | text = re.sub(r'\'', '', text) 30 | 31 | # Get the words that possibly realize the slot 32 | slot_stems = __get_boolean_slot_stems(slot) 33 | 34 | # Search for all possible slot realizations 35 | for slot_stem in slot_stems: 36 | idx, pos = find_first_in_list(slot_stem, text_tok) 37 | if pos >= 0: 38 | if value == true_val: 39 | # Match an instance of the slot stem without a preceding negation 40 | if not __find_negation(text, text_tok, idx, pos, expected_true=True, after=False): 41 | return pos 42 | 
else: 43 | # Match an instance of the slot stem with a preceding or a following negation 44 | if __find_negation(text, text_tok, idx, pos, expected_true=False, after=True): 45 | return pos 46 | 47 | # If no match found and the value ~ False, search for alternative expressions of the opposite 48 | if pos < 0 and value == false_val: 49 | slot_antonyms = __get_boolean_slot_antonyms(slot) 50 | for slot_antonym in slot_antonyms: 51 | if ' ' in slot_antonym: 52 | pos = text.find(slot_antonym) 53 | else: 54 | _, pos = find_first_in_list(slot_antonym, text_tok) 55 | 56 | if pos >= 0: 57 | return pos 58 | 59 | return -1 60 | 61 | 62 | def __find_negation(text, text_tok, idx, pos, expected_true=False, after=False): 63 | # Set the thresholds depending on the expected boolean value of the slot 64 | if expected_true: 65 | idx_pre_thresh = NEG_IDX_TRUE_PRE_THRESH 66 | pos_pre_thresh = NEG_POS_TRUE_PRE_THRESH 67 | else: 68 | idx_pre_thresh = NEG_IDX_FALSE_PRE_THRESH 69 | pos_pre_thresh = NEG_POS_FALSE_PRE_THRESH 70 | 71 | for negation in negation_cues_pre: 72 | if ' ' in negation: 73 | neg_pos = text.find(negation) 74 | if neg_pos >= 0: 75 | if 0 < (pos - neg_pos - text[neg_pos:pos].count(',')) <= pos_pre_thresh: 76 | # Look for a contrast cue between the negation and the slot realization 77 | neg_text_segment = text[neg_pos + len(negation):pos] 78 | if __has_contrast_after_negation(neg_text_segment): 79 | return False 80 | else: 81 | return True 82 | else: 83 | neg_idxs, _ = find_all_in_list(negation, text_tok) 84 | for neg_idx in neg_idxs: 85 | if 0 < (idx - neg_idx - text_tok[neg_idx + 1:idx].count(',')) <= idx_pre_thresh: 86 | # Look for a contrast cue between the negation and the slot realization 87 | neg_text_segment = text_tok[neg_idx + 1:idx] 88 | if __has_contrast_after_negation_tok(neg_text_segment): 89 | return False 90 | else: 91 | return True 92 | 93 | if after: 94 | for negation in negation_cues_post: 95 | if ' ' in negation: 96 | neg_pos = text.find(negation) 97 | if neg_pos >= 0: 98 | if 0 < (neg_pos - pos) < NEG_POS_POST_THRESH: 99 | return True 100 | else: 101 | neg_idxs, _ = find_all_in_list(negation, text_tok) 102 | for neg_idx in neg_idxs: 103 | if 0 < (neg_idx - idx) < NEG_IDX_POST_THRESH: 104 | return True 105 | 106 | return False 107 | 108 | 109 | def __has_contrast_after_negation(text): 110 | for contr_tok in contrast_cues: 111 | if text.find(contr_tok) >= 0: 112 | return True 113 | 114 | return False 115 | 116 | 117 | def __has_contrast_after_negation_tok(text_tok): 118 | for contr_tok in contrast_cues: 119 | if contr_tok in text_tok: 120 | return True 121 | 122 | return False 123 | 124 | 125 | def __get_boolean_slot_stems(slot): 126 | slot_stems = { 127 | 'familyfriendly': ['family', 'families', 'kid', 'kids', 'child', 'children'], 128 | 'hasusbport': ['usb'], 129 | 'isforbusinesscomputing': ['business'], 130 | 'hasmultiplayer': ['multiplayer', 'friends', 'others'], 131 | 'availableonsteam': ['steam'], 132 | 'haslinuxrelease': ['linux'], 133 | 'hasmacrelease': ['mac'] 134 | } 135 | 136 | return slot_stems.get(slot, []) 137 | 138 | 139 | def __get_boolean_slot_antonyms(slot): 140 | slot_antonyms = { 141 | 'familyfriendly': ['adult', 'adults'], 142 | 'isforbusinesscomputing': ['personal', 'general', 'home', 'nonbusiness'], 143 | 'hasmultiplayer': ['single player'] 144 | } 145 | 146 | return slot_antonyms.get(slot, []) 147 | -------------------------------------------------------------------------------- /src/datatuner/lm/cross_entropy.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def _is_long(x): 7 | if hasattr(x, "data"): 8 | x = x.data 9 | return isinstance(x, torch.LongTensor) or isinstance(x, torch.cuda.LongTensor) 10 | 11 | 12 | def onehot(indexes, N=None, ignore_index=None): 13 | """ 14 | Creates a one-representation of indexes with N possible entries 15 | if N is not specified, it will suit the maximum index appearing. 16 | indexes is a long-tensor of indexes 17 | ignore_index will be zero in onehot representation 18 | """ 19 | if N is None: 20 | N = indexes.max() + 1 21 | sz = list(indexes.size()) 22 | output = indexes.new().byte().resize_(*sz, N).zero_() 23 | output.scatter_(-1, indexes.unsqueeze(-1), 1) 24 | if ignore_index is not None: 25 | output.masked_fill_(indexes.eq(ignore_index).unsqueeze(-1), 0) 26 | return output 27 | 28 | 29 | def cross_entropy( 30 | inputs, 31 | target, 32 | weight=None, 33 | ignore_index=-100, 34 | reduction="mean", 35 | smooth_eps=None, 36 | smooth_dist=None, 37 | from_logits=True, 38 | ): 39 | """cross entropy loss, with support for target distributions and label smoothing https://arxiv.org/abs/1512.00567""" 40 | smooth_eps = smooth_eps or 0 41 | 42 | # ordinary log-liklihood - use cross_entropy from nn 43 | if _is_long(target) and smooth_eps == 0: 44 | if from_logits: 45 | return F.cross_entropy(inputs, target, weight, ignore_index=ignore_index, reduction=reduction) 46 | else: 47 | return F.nll_loss(inputs, target, weight, ignore_index=ignore_index, reduction=reduction) 48 | 49 | if from_logits: 50 | # log-softmax of inputs 51 | lsm = F.log_softmax(inputs, dim=-1) 52 | else: 53 | lsm = inputs 54 | 55 | masked_indices = None 56 | num_classes = inputs.size(-1) 57 | 58 | if _is_long(target): 59 | masked_indices = target.eq(ignore_index) 60 | target[masked_indices] = 0 61 | 62 | if smooth_eps > 0 and smooth_dist is not None: 63 | if _is_long(target): 64 | target = onehot(target, num_classes).type_as(inputs) 65 | if smooth_dist.dim() < target.dim(): 66 | smooth_dist = smooth_dist.unsqueeze(0) 67 | target.lerp_(smooth_dist, smooth_eps) 68 | 69 | if weight is not None: 70 | lsm = lsm * weight.unsqueeze(0) 71 | 72 | if _is_long(target): 73 | eps_sum = smooth_eps / num_classes 74 | eps_nll = 1.0 - eps_sum - smooth_eps 75 | likelihood = lsm.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1) 76 | loss = -(eps_nll * likelihood + eps_sum * lsm.sum(-1)) 77 | else: 78 | loss = -(target * lsm).sum(-1) 79 | 80 | if masked_indices is not None: 81 | loss.masked_fill_(masked_indices, 0) 82 | 83 | if reduction == "sum": 84 | loss = loss.sum() 85 | elif reduction == "mean": 86 | if masked_indices is None: 87 | loss = loss.mean() 88 | else: 89 | loss = loss.sum() / float(loss.size(0) - masked_indices.sum()) 90 | 91 | return loss 92 | 93 | 94 | class CrossEntropyLoss(nn.CrossEntropyLoss): 95 | """CrossEntropyLoss - with ability to recieve distrbution as targets, and optional label smoothing""" 96 | 97 | def __init__( 98 | self, weight=None, ignore_index=-100, reduction="mean", smooth_eps=None, smooth_dist=None, from_logits=True 99 | ): 100 | super(CrossEntropyLoss, self).__init__(weight=weight, ignore_index=ignore_index, reduction=reduction) 101 | self.smooth_eps = smooth_eps 102 | self.smooth_dist = smooth_dist 103 | self.from_logits = from_logits 104 | 105 | def forward(self, input, target, smooth_dist=None): 106 | if smooth_dist is None: 107 | smooth_dist = 
self.smooth_dist 108 | return cross_entropy( 109 | input, 110 | target, 111 | weight=self.weight, 112 | ignore_index=self.ignore_index, 113 | reduction=self.reduction, 114 | smooth_eps=self.smooth_eps, 115 | smooth_dist=smooth_dist, 116 | from_logits=self.from_logits, 117 | ) 118 | 119 | 120 | def binary_cross_entropy(inputs, target, weight=None, reduction="mean", smooth_eps=None, from_logits=False): 121 | """cross entropy loss, with support for label smoothing https://arxiv.org/abs/1512.00567""" 122 | smooth_eps = smooth_eps or 0 123 | if smooth_eps > 0: 124 | target = target.float() 125 | target.add_(smooth_eps).div_(2.0) 126 | if from_logits: 127 | return F.binary_cross_entropy_with_logits(inputs, target, weight=weight, reduction=reduction) 128 | else: 129 | return F.binary_cross_entropy(inputs, target, weight=weight, reduction=reduction) 130 | 131 | 132 | def binary_cross_entropy_with_logits(inputs, target, weight=None, reduction="mean", smooth_eps=None, from_logits=True): 133 | return binary_cross_entropy(inputs, target, weight, reduction, smooth_eps, from_logits) 134 | 135 | 136 | class BCELoss(nn.BCELoss): 137 | def __init__( 138 | self, weight=None, size_average=None, reduce=None, reduction="mean", smooth_eps=None, from_logits=False 139 | ): 140 | super(BCELoss, self).__init__(weight, size_average, reduce, reduction) 141 | self.smooth_eps = smooth_eps 142 | self.from_logits = from_logits 143 | 144 | def forward(self, input, target): 145 | return binary_cross_entropy( 146 | input, 147 | target, 148 | weight=self.weight, 149 | reduction=self.reduction, 150 | smooth_eps=self.smooth_eps, 151 | from_logits=self.from_logits, 152 | ) 153 | 154 | 155 | class BCEWithLogitsLoss(BCELoss): 156 | def __init__( 157 | self, weight=None, size_average=None, reduce=None, reduction="mean", smooth_eps=None, from_logits=True 158 | ): 159 | super(BCEWithLogitsLoss, self).__init__( 160 | weight, size_average, reduce, reduction, smooth_eps=smooth_eps, from_logits=from_logits 161 | ) 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataTuner 2 | 3 | You have just found the DataTuner. 4 | This repository provides tools for fine-tuning language models for a task. 5 | 6 | * See [LICENSE.txt](LICENSE.txt) for license details. 7 | 8 | * See [NOTICE.txt](NOTICE.txt) for details of third party code included in or downloaded by this code. 9 | 10 | * See [/paper/README.md](paper/README.md) for details about reproducing the results reported in the paper 11 | ["Have Your Text and Use It Too! End-to-End Neural Data-to-Text Generation with Semantic Fidelity" by Hamza Harkous, Isabel Groves and Amir Saffari.](https://www.aclweb.org/anthology/2020.coling-main.218/) 12 | 13 | 14 | 15 | ## Installation 16 | 17 | ### Environment Creation 18 | 19 | Assuming you have an existing `conda` setup, you can setup the environment with the following script. 
In order to activate the conda environment within the bash script, you need the location of the `conda.sh` file: 20 | 21 | ```bash 22 | bash setup.sh ~/miniconda3/etc/profile.d/conda.sh 23 | ``` 24 | 25 | You can update your existing environment: 26 | 27 | ```bash 28 | conda env update -f=environment.yml 29 | ``` 30 | 31 | To start development, activate your environment: 32 | 33 | ```bash 34 | conda activate finetune 35 | ``` 36 | 37 | Alternatively, you can always use the python binary with the absolute path, e.g.: `~/miniconda3/envs/finetune/bin/python`. 38 | 39 | ## Data 40 | 41 | For any task you want to fine-tune on, you need the data to be a json file containing a list of json objects, one per data point. For example: 42 | 43 | ```json 44 | [ 45 | { 46 | "question": "question text 1", 47 | "query": "query 1" 48 | }, 49 | { 50 | "question": "question text 2", 51 | "query": "query 2 with [SpecialToken example]" 52 | } 53 | ] 54 | ``` 55 | 56 | The library assumes that you have placed your data in a single directory with three files: ``train.json``, ``validation.json``, and ``test.json``. 57 | 58 | ## Configuration 59 | 60 | Now that we have the data in shape, we need to create a new task configuration file that specifies how we want the data to be formatted and what fields should be considered. You can create new config files in the folder ``src/datatuner/lm/task_configs``. 61 | 62 | A typical config file would look as follows: 63 | 64 | 65 | ```json 66 | { 67 | "name": "dataset_name", 68 | "data_shape": [ 69 | { 70 | "id": "", 71 | "type": "special", 72 | "learn": false 73 | }, 74 | { 75 | "id": "question", 76 | "type": "text", 77 | "learn": false 78 | }, 79 | { 80 | "id": "", 81 | "type": "special", 82 | "learn": false 83 | }, 84 | { 85 | "id": "query", 86 | "type": "text", 87 | "learn": true, 88 | "metrics": [ 89 | "match" 90 | ] 91 | } 92 | ], 93 | "extra_special_tokens": ["[SpecialToken"], 94 | "extra_fields": [] 95 | } 96 | ``` 97 | 98 | For each item in the data shape: 99 | 100 | - ``type`` (required): ``special`` if special token, ``text`` if normal text. 101 | - ``id`` (required): the special token ID if type is ``special``; the key for the text in the json data if type is ``text`` 102 | - ``learn`` (required): whether to allow the model to learn this part of the text. If false, the model masks that part during fine-tuning. 103 | - ``metrics`` (optional): the list of metrics that the model should compute upon evaluation. Each metric should have a corresponding function with the same name in ``metrics.py``. 104 | - ``converter`` (optional): the name of the converter function in ``converters.py`` to apply on that text field after reading the text from the file. 105 | 106 | The value of `extra_special_tokens` is a list of special tokens to be added to the vocabulary. 107 | Alternatively (especially if the list is too long or is generated automatically), you can create a text file with one special token per line and pass that as an argument during training via the `--special_tokens_file` argument. 108 | 109 | 110 | The value of `extra_fields` is a list of additional fields to include from the input `json` files to output during evaluation, aside from the main fields used as inputs/outputs. 111 | 112 | ## Training 113 | 114 | The training script `train.py` can be used in single GPU or multi GPU settings. 
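
Before launching it, it can be worth sanity-checking that the dataset files line up with the task configuration, i.e. that every `text` field listed in `data_shape` exists in each record of `train.json`, `validation.json`, and `test.json`. The snippet below is a minimal, illustrative check and is not part of the library; the `check_dataset` helper and the paths are placeholders to adapt to your own dataset and config:

```python
import json
from pathlib import Path


def check_dataset(dataset_dir, task_config_path):
    """Illustrative helper (not part of DataTuner): verify that every text field
    referenced in the task config exists in each record of every split."""
    config = json.loads(Path(task_config_path).read_text())
    text_fields = [item["id"] for item in config["data_shape"] if item["type"] == "text"]
    for split in ("train", "validation", "test"):
        records = json.loads((Path(dataset_dir) / f"{split}.json").read_text())
        for i, record in enumerate(records):
            missing = [field for field in text_fields if field not in record]
            if missing:
                raise ValueError(f"{split}.json, record {i}: missing fields {missing}")
        print(f"{split}.json: {len(records)} records OK")


check_dataset("../../../data/my_dataset", "./task_configs/my_task_config.json")
```

The training commands below can then be run as usual.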
115 | 116 | ```bash 117 | cd src/datatuner/lm 118 | 119 | # single gpu 120 | python train.py --model_checkpoint ~/data/openai-gpt/ --dataset_path ../../../data/my_dataset/ --task_config ./task_configs/my_task_config.json --n_epoch 3 --lr 1e-5 121 | 122 | # multi gpu 123 | python -m torch.distributed.launch --nproc_per_node=4 train.py --model_checkpoint ~/data/openai-gpt/ --dataset_path ../../../data/my_dataset/ --task_config ./task_configs/my_task_config.json --n_epoch 3 --lr 1e-5 124 | ``` 125 | 126 | 127 | ## Evaluating the Model 128 | 129 | You can run the following to evaluate the model on any test set. The data format is the same as the training data. Note that you currently have to specify the ``model_type`` parameter matching the model you're loading: 130 | 131 | ```bash 132 | cd src/datatuner/lm 133 | 134 | python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/2020-01-01_01-01-01 --filename ../../../data/my_dataset/test.json --max_length 200 --model_type gpt --top_k 1 135 | 136 | # or if you just want to evaluate the latest model you trained 137 | RUN=$(ls -t ./runs | head -1) && python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/$RUN --filename ../../../data/my_dataset/test.json --max_length 200 --model_type gpt --top_k 1 138 | 139 | # or if you want to use the latest intermediate checkpoint while the model is training: 140 | RUN=$(ls -t ./runs | head -1) && CHECKPOINT=$(ls -t ./runs/$RUN/checkpoint* | head -1) && cp $CHECKPOINT runs/$RUN/pytorch_model.bin 141 | ``` 142 | 143 | During evaluation, the outputs that do not exactly match the expected outputs will be printed. Also, 144 | the metrics will be printed (a dictionary keyed by the output field and the metric name). At the end of evaluation, you will find all the generated outputs in a file ending in `_generated.json` inside the corresponding subfolder of `eval_results/`. 145 | 146 | 147 | 148 | ## Interacting with the Model 149 | 150 | You can also interact with the models. The client will ask you to input the required fields, and it will generate the fields it has learnt. 
151 | 152 | ```bash 153 | cd src/datatuner/lm 154 | 155 | python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/2020-01-01_01-01-01 --max_length 200 --model_type gpt --top_k 1 --input 156 | 157 | # or if you just want to evaluate the latest model you trained 158 | RUN=$(ls -t ./runs | head -1) && python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/$RUN --max_length 200 --model_type gpt --top_k 1 --input 159 | ``` -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/slot_extraction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from nltk.tokenize import word_tokenize 4 | 5 | from external.jjuraska_slug2slug import config 6 | 7 | 8 | def extract_city(user_input, input_tokens, named_entities): 9 | city = None 10 | 11 | for ne in named_entities: 12 | if ne[0] == 'City': 13 | city = ne[2] 14 | break 15 | 16 | return city 17 | 18 | 19 | def extract_eat_type(user_input): 20 | bar_synonyms = ['bar', 'bistro', 'brasserie', 'inn', 'tavern'] 21 | coffee_shop_synonyms = ['café', 'cafe', 'coffee shop', 'coffeehouse', 'teahouse'] 22 | restaurant_synonyms = ['cafeteria', 'canteen', 'chophouse', 'coffee shop', 'diner', 'donut shop', 'drive-in', 23 | 'eatery', 'eating place', 'fast-food place', 'joint', 'pizzeria', 'place to eat', 24 | 'restaurant', 'steakhouse'] 25 | 26 | if any(x in user_input for x in bar_synonyms): 27 | return 'bar' 28 | elif any(x in user_input for x in coffee_shop_synonyms): 29 | return 'coffee shop' 30 | elif any(x in user_input for x in restaurant_synonyms): 31 | return 'restaurant' 32 | else: 33 | return None 34 | 35 | 36 | def extract_categories(user_input, input_tokens): 37 | # file_categories_restaurants = 'dialogue/dialogue_modules/slug2slug/data/yelp/categories_restaurants.json' 38 | file_categories_restaurants = os.path.join(config.DATA_DIR, 'yelp', 'categories_restaurants.json') 39 | 40 | with open(file_categories_restaurants, 'r') as f_categories: 41 | categories = json.load(f_categories) 42 | 43 | for i, token in enumerate(input_tokens): 44 | # search for single-word occurrences in the category list 45 | if token in categories: 46 | return {'title': token, 47 | 'ids': categories[token]} 48 | 49 | # search for bigram occurrences in the category list 50 | if i > 0: 51 | key = ' '.join(input_tokens[i-1:i+1]) 52 | if key in categories: 53 | return {'title': key, 54 | 'ids': categories[key]} 55 | 56 | return {'title': None, 57 | 'ids': []} 58 | 59 | 60 | def extract_price_range(user_input, input_tokens): 61 | CHEAP = ['1', '2'] 62 | MODERATE = ['2', '3'] 63 | HIGH = ['3', '4'] 64 | 65 | indicators_indep = {'cheap': CHEAP, 66 | 'inexpensive': CHEAP, 67 | 'affordable': CHEAP, 68 | 'modest': CHEAP, 69 | 'budget': CHEAP, 70 | 'economic': CHEAP, 71 | 'economical': CHEAP, 72 | 'expensive': HIGH, 73 | 'costly': HIGH, 74 | 'fancy': HIGH, 75 | 'posh': HIGH, 76 | 'stylish': HIGH, 77 | 'elegant': HIGH, 78 | 'extravagant': HIGH, 79 | 'luxury': HIGH, 80 | 'luxurious': HIGH} 81 | 82 | indicators_indep_bigram = {'low cost': CHEAP, 83 | 'high class': HIGH} 84 | 85 | indicators_priced = {'low': CHEAP, 86 | 'reasonably': CHEAP, 87 | 'moderately': MODERATE, 88 | 'high': HIGH, 89 | 'highly': HIGH} 90 | 91 | indicators_range = {'low': CHEAP, 92 | 'moderate': MODERATE, 93 | 'average': MODERATE, 94 | 'ordinary': MODERATE, 95 | 'middle': MODERATE, 96 | 'high': HIGH} 97 | 98 | 
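    # Descriptive note on the matching below: the checks go from the most specific cue to the
    # least specific one: standalone adjectives (e.g. 'cheap', 'fancy'), then two-word phrases
    # (e.g. 'low cost'), and finally modifiers directly preceding 'priced', 'price' or 'prices'
    # (e.g. 'reasonably priced', 'high prices'). The first match wins and is mapped to one of
    # the price-level buckets ('1'-'4') defined above.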
# search for single-word occurrences in the indicator list 99 | for token in input_tokens: 100 | if token in indicators_indep: 101 | return indicators_indep[token] 102 | 103 | # search for bigram occurrences in the category list 104 | for key, val in indicators_indep_bigram.items(): 105 | if key in user_input: 106 | return val 107 | 108 | idx = -1 109 | try: 110 | idx = input_tokens.index('priced') 111 | if idx > 0: 112 | prev_token = input_tokens[idx - 1] 113 | if prev_token in indicators_priced: 114 | return indicators_priced[prev_token] 115 | except ValueError: 116 | try: 117 | idx = input_tokens.index('price') 118 | except ValueError: 119 | try: 120 | idx = input_tokens.index('prices') 121 | except ValueError: 122 | pass 123 | 124 | if idx > 0: 125 | prev_token = input_tokens[idx - 1] 126 | if prev_token in indicators_range: 127 | return indicators_range[prev_token] 128 | 129 | return None 130 | 131 | 132 | def extract_area(user_input, input_tokens): 133 | indicators_area = ['downtown', 'city center', 'city centre', 'center of', 'centre of', 'middle of'] 134 | 135 | area = None 136 | 137 | for ind in indicators_area: 138 | if ind in user_input: 139 | area = 'downtown' 140 | break 141 | 142 | return area 143 | 144 | 145 | def extract_family_friendly(user_input, input_tokens): 146 | indicators = ['family', 'families', 'child', 'children', 'kid', 'kids'] 147 | 148 | for ind in indicators: 149 | if ind in user_input: 150 | return True 151 | 152 | return False 153 | 154 | 155 | # TODO: implement 156 | def extract_near(user_input): 157 | indicators = ['near', 'near to', 'close to', 'next to', 'neighborhood of', 'vicinity of'] 158 | 159 | return None 160 | 161 | 162 | def identify_slots(user_input, named_entities): 163 | attributes = {} 164 | 165 | user_input = user_input.lower() 166 | input_tokens = word_tokenize(user_input) 167 | 168 | city = extract_city(user_input, input_tokens, named_entities) 169 | if city: 170 | attributes['city'] = city 171 | 172 | eat_type = extract_eat_type(user_input) 173 | if eat_type: 174 | attributes['eatType'] = eat_type 175 | 176 | categories = extract_categories(user_input, input_tokens) 177 | if categories: 178 | attributes['categories'] = categories 179 | 180 | prices = extract_price_range(user_input, input_tokens) 181 | if prices: 182 | attributes['prices'] = prices 183 | 184 | family_friendly = extract_family_friendly(user_input, input_tokens) 185 | if family_friendly: 186 | attributes['familyFriendly'] = family_friendly 187 | 188 | area = extract_area(user_input, input_tokens) 189 | if area: 190 | attributes['area'] = area 191 | 192 | return attributes 193 | 194 | 195 | # ---- MAIN ---- 196 | 197 | def main(): 198 | user_input = 'Is there a family-friendly bar in downtown santa cruz that serves reasonably priced burgers?' 199 | gnode_entities = [('VisualArtwork', 282.797767, 'restaurant in'), ('City', 2522.766114, 'Santa Cruz')] 200 | print(identify_slots(user_input, gnode_entities)) 201 | 202 | 203 | if __name__ == '__main__': 204 | main() 205 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Paper Results 2 | 3 | This README describes how to reproduce the results in the paper: 4 | 5 | ["Have Your Text and Use It Too! 
End-to-End Neural Data-to-Text Generation with Semantic Fidelity" by Hamza Harkous, Isabel Groves and Amir Saffari.](https://www.aclweb.org/anthology/2020.coling-main.218/) 6 | 7 | To cite: 8 | ```bibtex 9 | @inproceedings{harkous-etal-2020-text, 10 | title = "Have Your Text and Use It Too! End-to-End Neural Data-to-Text Generation with Semantic Fidelity", 11 | author = "Harkous, Hamza and 12 | Groves, Isabel and 13 | Saffari, Amir", 14 | booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", 15 | month = dec, 16 | year = "2020", 17 | address = "Barcelona, Spain (Online)", 18 | publisher = "International Committee on Computational Linguistics", 19 | url = "https://www.aclweb.org/anthology/2020.coling-main.218", 20 | doi = "10.18653/v1/2020.coling-main.218", 21 | pages = "2410--2424" 22 | } 23 | ``` 24 | 25 | ## Setup 26 | 27 | We assume you have run the setup script from the [main README](../README.md#environment-creation) file. 28 | 29 | 30 | All of the scripts below should be run from the current folder `paper/`: 31 | 32 | ```bash 33 | cd paper 34 | ``` 35 | 36 | ## Update the config 37 | 38 | 39 | The first step is to update the configuration file [config.sh](config.sh) to point it to the correct directories and files. 40 | The main change that must be done is fixing the folders for the LDC2017T10 and the ViGGO dataset. These datasets are available from https://catalog.ldc.upenn.edu/LDC2017T10 and https://nlds.soe.ucsc.edu/viggo respectively. 41 | 42 | 43 | You can leave the rest of the default parameters as they are. 44 | 45 | 46 | ## Retrieve the data 47 | The next step is to download the rest of the datasets from their respective sources. 48 | 49 | 50 | ```bash 51 | bash retrieve.sh 52 | ``` 53 | 54 | ## Preprocess the Data 55 | Now we are ready to preprocess the data to the suitable format required by DataTuner. 56 | 57 | ```bash 58 | bash preprocess.sh 59 | ``` 60 | 61 | The preprocessed dataset will be present in the folder `./data/` 62 | 63 | 64 | ## Train the Data-to-Text Language Model 65 | We are now able to run the Data-to-Text language model fine-tuning. 66 | 67 | As described in the paper, there are two system variants we can train: 68 | - `DataTuner_No_FC`: represents DataTuner without the fidelity classifier 69 | - `DataTuner_No_FC_No_FS`: represents DataTuner with neither fidelity classifier nor fine-grained state embeddings 70 | 71 | The third system variant `DataTuner_FC` reuses the trained `DataTuner_No_FC` and adds the fidelity classifier during the evaluation stage. Hence, the results for that system variant are the ones produced in the section [Run the Trained Classifiers on Generated Data](#run-the-trained-classifiers-on-generated-Data) 72 | 73 | Arguments for the script (in order): 74 | - `DATASET`: dataset name (from `e2e`, `viggo`, `ldc`, and `webnlg`) 75 | - `SYSTEM`: system id (from `DataTuner_No_FC`, `DataTuner_No_FC_No_FS`) 76 | - `OUTPUT_FOLDER`: folder where the model will be written 77 | - `NUM_PARALLEL`: number of processes to run in parallel (usually the number of GPUs) 78 | 79 | Notice that the script overwrites the existing $OUTPUT_FOLDER if it is not empty. 80 | 81 | ```bash 82 | export dataset=e2e # or another dataset 83 | export system=DataTuner_No_FC # or DataTuner_No_FC_No_FS 84 | bash train_lm.sh $dataset $system ~/trained_lms/${dataset}_${system} 4 85 | ``` 86 | 87 | When using distributed training, it might be the case that the training terminates without all the processes exiting. 
In that case, it is safe to stop the processes with the command below (but beware of stopping other processes that happen to have the same script name): 88 | 89 | ```bash 90 | pkill -9 -f train.py 91 | ``` 92 | 93 | ## Evaluate the Data-to-Text Language Model 94 | Next, we can evaluate the Data-to-Text language model. 95 | Arguments for the script (in order): 96 | 97 | - `TEST_FILE`: file containing test data. 98 | - `MODEL`: folder where the model was written (referred to as `OUTPUT_FOLDER` during training) 99 | - `NUM_GPUS`: number of GPUs over which we should distribute the evaluation 100 | - `PER_GPU`: number of processes to run per GPU (typically a value of 2 is safe for a 16 GB GPU) 101 | - `MAX_DATA`: if passed and more than 0, this sets a maximum on the number of data items to evaluate (used for debugging) 102 | 103 | ```bash 104 | export dataset=e2e # or another dataset 105 | export system=DataTuner_No_FC # or DataTuner_No_FC_No_FS 106 | bash evaluate_lm.sh ../data/$dataset/test.json ~/trained_lms/${dataset}_${system} 4 2 0 107 | bash evaluate_lm.sh ../data/$dataset/test.json ~/trained_lms/${dataset}_${system} 4 2 4 108 | ``` 109 | 110 | A new folder containing the generated data will be created; its path is printed at the end of the evaluation. 111 | 112 | ## Train the Semantic Fidelity Classifiers 113 | 114 | To train the semantic fidelity classifiers, run the following scripts for each of the datasets. 115 | Arguments for the script (in order): 116 | 117 | - `TRAINING_DATA_FOLDER`: folder containing the `train.tsv`, `validation.tsv`, and `test.tsv` files. 118 | - `OUTPUT_FOLDER`: folder where the model will be written 119 | - `TRAINING_ARGS`: json file with the training arguments to use from a pretrained model 120 | - `NUM_PARALLEL`: number of processes to run in parallel (usually the number of GPUs) 121 | 122 | ```bash 123 | export dataset=e2e # or another dataset 124 | bash train_classifier.sh ../data/${dataset}_consistency/ ~/trained_classifiers/${dataset} ./classifier_training_args/$dataset/${dataset}_model_training_args.json 4 125 | ``` 126 | 127 | ## Run the Trained Classifiers on Generated Data 128 | 129 | Once you have trained a classifier and generated the outputs from the Data-to-Text language model, you can run the fidelity classifier on a generated file to classify and rerank the beam outputs. This produces the output of the system variant `DataTuner_FC`, which combines the fidelity classifier with the fine-grained state embeddings. 130 | Arguments for the script (in order): 131 | 132 | - `TRAINING_DATA_FOLDER`: folder containing the `labels.txt` file for the fidelity classifier. 133 | - `GENERATED_DATA_FOLDER`: folder containing the `generated.json` file. 134 | - `MODEL_FOLDER`: folder where the model was written 135 | - `DATA_KEY`: key for the data field in the `generated.json` file. 136 | - `TEXT_KEY`: key for the text field in the `generated.json` file. 
137 | 138 | ```bash 139 | # for example: 140 | export GENERATED_DATA_FOLDER=~/trained_lms/e2e_DataTuner_No_FC_No_FS/2020-01-03_15-29-52/ 141 | 142 | bash eval_with_classifier.sh ../data/ldc_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/ldc linearized_amr answer_text 143 | bash eval_with_classifier.sh ../data/webnlg_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/webnlg modifiedtripleset text 144 | bash eval_with_classifier.sh ../data/e2e_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/e2e new_mr ref 145 | bash eval_with_classifier.sh ../data/viggo_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/viggo new_mr ref 146 | ``` 147 | 148 | You will get a file `classified.json` within the evaluation folder. You will also get `classified_wrong.json` in the same folder, containing the items where the fidelity classifier did not find any accurate output in the beam. 149 | 150 | -------------------------------------------------------------------------------- /paper/experiments/webnlg/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | from pathlib import Path 5 | from xml.etree import ElementTree as ET 6 | 7 | from fire import Fire 8 | 9 | from datatuner.classification.distractors import (get_distractors, 10 | write_classification_data) 11 | from datatuner.lm.special_token_generator import generate_from_json 12 | from datatuner.lm.utils import fix_text_in_dir 13 | from webnlg_utils import camel_case_split, cleanup 14 | 15 | random.seed(42) 16 | 17 | seen_categories = [ 18 | "Airport.xml", 19 | "Astronaut.xml", 20 | "Building.xml", 21 | "City.xml", 22 | "ComicsCharacter.xml", 23 | "Food.xml", 24 | "Monument.xml", 25 | "SportsTeam.xml", 26 | "University.xml", 27 | "WrittenWork.xml", 28 | ] 29 | 30 | unseen_categories = ["Athlete.xml", "Artist.xml", "MeanOfTransportation.xml", "CelestialBody.xml", "Politician.xml"] 31 | 32 | 33 | class Triple: 34 | 35 | def __init__(self, s, p, o): 36 | self.s = s #subject 37 | self.o = o #object 38 | self.p = p #predicate 39 | 40 | 41 | def process_tripleset(s): 42 | """Format the triples set in our target format""" 43 | s = cleanup(s) 44 | key = "" 45 | s = s[len(key) : -len(key) - 1] 46 | subject, predicate, obj = s.split("|") 47 | subject, obj = cleanup(subject), cleanup(obj) 48 | predicate = camel_case_split(predicate) 49 | return { 50 | "text": f" {subject} {predicate} {obj}", 51 | "dict": {"subject": subject, "predicate": predicate, "object": obj}, 52 | } 53 | 54 | 55 | def get_nearby_text(entries, e): 56 | i = 1 57 | random_sentence = None 58 | while random_sentence is None: 59 | for j in range(2): 60 | try: 61 | if j == 0: 62 | entry = entries[e + i] 63 | else: 64 | entry = entries[e - i] 65 | 66 | random_sentence = entry.findall("lex")[0].find("text").text 67 | if random_sentence: 68 | break 69 | 70 | except: 71 | pass 72 | i += 1 73 | return random_sentence 74 | 75 | 76 | def parse(in_file, classification_data, num_candidates=5, max_per_operation=2): 77 | """Parse the given file and update `classification_data` with the parsed data""" 78 | 79 | tree = ET.parse(in_file) 80 | root = tree.getroot() 81 | 82 | entries = list(root.find("entries")) 83 | items = [] 84 | for e, entry in enumerate(entries): 85 | 86 | tripletsets = list(entry.find("modifiedtripleset").findall("mtriple")) + list( 87 | entry.find("modifiedtripleset").findall("otriple") 88 | ) 89 | tripletsets = [process_tripleset(x) for x in tripletsets] 90 | 91 | 
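        # Canonicalize the triple set: each triple is linearized as text and the list is sorted,
        # so the same set of triples always serializes to the same " ; "-joined string below.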
modifiedtripleset = [x["text"] for x in tripletsets] 92 | modifiedtripleset.sort() 93 | 94 | mtripleset = entry.find("modifiedtripleset") 95 | modtripleset = [] 96 | raw_tripleset = "" 97 | for mtriple in mtripleset: 98 | e1, pred, e2 = mtriple.text.split(" | ") 99 | raw_tripleset += mtriple.text + " ||| " 100 | 101 | modtripleset.append(Triple(cleanup(e1), pred, cleanup(e2))) 102 | 103 | all_lex = entry.findall("lex") 104 | for lex in all_lex: 105 | 106 | sortedtripleset = "" 107 | for sent in lex.find("sortedtripleset").findall("sentence"): 108 | for x in sent.findall("striple"): 109 | sortedtripleset += process_tripleset(x)["text"] + ", " 110 | 111 | references = cleanup(lex.find("references")) 112 | template = cleanup(lex.find("template")) 113 | 114 | try: 115 | text = lex.find("text").text 116 | if not text: 117 | print("empty text") 118 | text = "" 119 | continue 120 | except: 121 | print("exception text") 122 | text = "" 123 | continue 124 | 125 | try: 126 | template = lex.find("template").text 127 | if not template: 128 | print("empty template") 129 | template = "" 130 | continue 131 | except: 132 | print("exception template") 133 | template = "" 134 | continue 135 | 136 | # preprocess distractors 137 | subjects = [x["dict"]["subject"] for x in tripletsets] 138 | objects = [x["dict"]["object"] for x in tripletsets] 139 | predicates = [x["dict"]["predicate"] for x in tripletsets] 140 | 141 | swapping_candidates = [subjects + objects] 142 | cutting_candidates = [subjects + objects] 143 | 144 | random_text = get_nearby_text(entries, e) 145 | 146 | tripletset_str = " ; ".join(modifiedtripleset) 147 | 148 | distractors, classification_items = get_distractors( 149 | tripletset_str, 150 | text, 151 | swapping_candidates, 152 | cutting_candidates, 153 | random_text, 154 | num_candidates=num_candidates, 155 | max_per_operation=max_per_operation, 156 | ) 157 | 158 | classification_data.extend(classification_items) 159 | 160 | item = { 161 | "raw_modifiedtripleset": raw_tripleset, 162 | "modifiedtripleset": " ; ".join(modifiedtripleset), 163 | "sortedtripleset": sortedtripleset, 164 | "references": references, 165 | "template": template, 166 | "text": distractors + [text], 167 | "num_triples": Path(in_file).parent.name, 168 | "category": Path(in_file).name, 169 | "category_type": "seen" if Path(in_file).name in seen_categories else "unseen", 170 | } 171 | items.append(item) 172 | 173 | return items 174 | 175 | 176 | def run_parser(set_path, classification_data): 177 | """Get the entry set for the give path """ 178 | entryset = [] 179 | dirtriples = filter(lambda item: not str(item).startswith("."), os.listdir(set_path)) 180 | dirtriples = sorted(list(dirtriples)) 181 | for dirtriple in dirtriples: 182 | fcategories = filter(lambda item: not str(item).startswith("."), os.listdir(os.path.join(set_path, dirtriple))) 183 | fcategories = sorted(list(fcategories)) 184 | for fcategory in fcategories: 185 | entryset.extend(list(parse(os.path.join(set_path, dirtriple, fcategory), classification_data))) 186 | 187 | return entryset 188 | 189 | 190 | def run( 191 | in_folder="./tmp/webnlg/data/v1.4/en/", 192 | out_folder="datatuner/data/webnlg", 193 | classification_dir="datatuner/data/webnlg_consistency", 194 | output_classification_data=True, 195 | ): 196 | """Run the webnlg data formatting task""" 197 | out_folder = Path(out_folder) 198 | in_folder = Path(in_folder) 199 | classification_dir = Path(classification_dir) 200 | out_folder.mkdir(exist_ok=True, parents=True) 201 | 
classification_dir.mkdir(exist_ok=True, parents=True) 202 | splits = {"train": "train", "dev": "validation", "test": "test"} 203 | 204 | for split in splits: 205 | data_path = in_folder / split 206 | classification_data = [] 207 | entryset = run_parser(data_path, classification_data) 208 | json.dump(entryset, open(out_folder / (splits[split] + ".json"), "w"), indent=2) 209 | if output_classification_data: 210 | write_classification_data(classification_data, classification_dir, splits[split]) 211 | 212 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"modifiedtripleset": "amr"}) 213 | fix_text_in_dir(out_folder) 214 | 215 | 216 | if __name__ == "__main__": 217 | Fire(run) 218 | -------------------------------------------------------------------------------- /src/datatuner/lm/metrics.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import difflib 3 | import json 4 | import logging 5 | import os 6 | import sys 7 | from collections import OrderedDict 8 | from itertools import groupby 9 | from pathlib import Path 10 | from subprocess import PIPE, Popen 11 | from tempfile import mkdtemp 12 | 13 | import mlflow 14 | import numpy as np 15 | from datatuner.ops.mlflow import get_artifact 16 | from datatuner.utils import flatten 17 | from fire import Fire 18 | 19 | logger = logging.getLogger(__file__) 20 | 21 | THIS_DIR = Path(os.path.dirname(os.path.realpath(__file__))) 22 | 23 | E2E_METRICS_FOLDER = THIS_DIR / "../../../paper/tmp/e2e-metrics" 24 | PYTHON_BIN = sys.executable 25 | 26 | 27 | def get_str_diff(case_a, case_b): 28 | """Get the string difference between two strings""" 29 | return ("").join([li[-1] for li in difflib.ndiff(case_a, case_b) if li[0] != " " and li[-1] not in [" ", "'", ","]]) 30 | 31 | 32 | def almostmatch(original, current, all_outputs, final): 33 | """Computes match average while allowing a difference in articles. 
The metric is computed for the given 34 | keys across the list of dictionaries `all_outputs` 35 | """ 36 | lst = [ 37 | int(x[original] == x[current] or get_str_diff(x[original], x[current]) in ["the", "a", "an"]) 38 | for x in all_outputs 39 | ] 40 | return {"value": np.mean(lst), "count": len(all_outputs)} 41 | 42 | 43 | def match(original, current, all_outputs, final): 44 | """Computes exact match average across the values of the given keys in the list of dictionaries `all_outputs`""" 45 | 46 | def postprocess(x): 47 | return x[current][0] if type(x[current]) == list else x[current] 48 | 49 | lst = [int(str(x[original]).lower() == str(postprocess(x).lower())) for x in all_outputs] 50 | return {"value": np.mean(lst), "count": len(all_outputs)} 51 | 52 | 53 | def bleu(original, current, all_outputs, final, case_insensitive=True, all_keys=None): 54 | """Computes bleu score for the values of the given keys in the list of dictionaries `all_outputs`""" 55 | if len(all_outputs) == 0: 56 | return {"value": 0, "count": 0} 57 | 58 | from sacrebleu import corpus_bleu 59 | 60 | def process(s): 61 | return s.lower() if case_insensitive else s 62 | 63 | # group by all the other keys 64 | all_outputs = copy.deepcopy(all_outputs) 65 | if all_keys is None: 66 | keys = all_outputs[0].keys() 67 | else: 68 | keys = all_keys 69 | print(keys) 70 | 71 | other_keys = list(set([key for key in keys if key not in [original, current]])) 72 | 73 | group = {} 74 | max_refs = 1 75 | for item in all_outputs: 76 | # other inputs concatenated 77 | search_key = str([item[x] for x in other_keys if x in item]) 78 | if type(item[current]) == list: 79 | item[current] = item[current][0] 80 | 81 | current_val = process(item[current]) 82 | original_val = process(item[original]) 83 | 84 | if search_key in group: 85 | group[search_key]["references"].append(original_val) 86 | group[search_key]["prediction"] = current_val 87 | if len(group[search_key]["references"]) > max_refs: 88 | max_refs = len(group[search_key]["references"]) 89 | else: 90 | group[search_key] = {"references": [original_val], "prediction": current_val} 91 | 92 | all_predictions = [] 93 | all_references = [[] for i in range(max_refs)] 94 | 95 | for item in group.values(): 96 | all_predictions.append(item["prediction"]) 97 | for i in range(max_refs): 98 | try: 99 | all_references[i].append(item["references"][i]) 100 | except: 101 | all_references[i].append("") 102 | 103 | e2e_metrics = {} 104 | if final: 105 | e2e_metrics = get_e2e_metrics(all_predictions, all_references) 106 | e2e_metrics.update({"value": corpus_bleu(all_predictions, all_references).score, "count": len(all_predictions)}) 107 | return e2e_metrics 108 | 109 | 110 | def get_e2e_metrics(all_predictions, all_references): 111 | tempdir = Path(mkdtemp()) 112 | human = tempdir / "human_refs.txt" 113 | system = tempdir / "system.txt" 114 | with open(human, "w") as h: 115 | with open(system, "w") as s: 116 | for i, x in enumerate(all_predictions): 117 | s.write(x + "\n") 118 | for j in range(len(all_references)): 119 | v = all_references[j][i] 120 | if v.strip(): 121 | h.write(v + "\n") 122 | h.write("\n") 123 | print(E2E_METRICS_FOLDER / "measure_scores.py") 124 | p = Popen( 125 | [ 126 | PYTHON_BIN, 127 | E2E_METRICS_FOLDER / "measure_scores.py", 128 | f"{human}", 129 | f"{system}", 130 | ], 131 | stdin=PIPE, 132 | stdout=PIPE, 133 | stderr=PIPE, 134 | ) 135 | output, err = p.communicate() 136 | stats = output.decode("utf-8").split("\n") 137 | stats = [x for x in stats if x not in ["", "==============", 
"SCORES:"]] 138 | stats_dict = {} 139 | for item in stats: 140 | key, value = item.split(": ") 141 | value = float(value) 142 | if key in ["BLEU", "METEOR", "ROUGE_L"]: 143 | value *= 100 144 | if key == "BLEU": 145 | key = "e2e_BLEU" 146 | stats_dict[key] = value 147 | 148 | return stats_dict 149 | 150 | 151 | def round_dict(d): 152 | """Round values in a dictionary""" 153 | items = [(k, round(v * 100.0, 2)) for k, v in d.items()] 154 | return dict(sorted(items, key=lambda t: t[1])) 155 | 156 | 157 | def group_by_field(all_outputs, field): 158 | """group a list of dictionaries by the given field value""" 159 | all_outputs.sort(key=lambda k: k[field]) 160 | return groupby(all_outputs, key=lambda k: k[field]) 161 | 162 | 163 | def compute_metric(metric, original, current, all_outputs, final): 164 | """compute the result for the given metric""" 165 | try: 166 | # get the function name from the "metrics.py" file 167 | func = metrics[metric] 168 | return func(original, current, all_outputs, final) 169 | except: 170 | logger.info(f"Unable to compute the metric {metric}") 171 | raise 172 | 173 | 174 | def aggregate_metrics(all_outputs, fields, metrics_fields, output_to_metrics, final=False): 175 | """Combine the stats array into a value for a given metric""" 176 | 177 | out_metrics = {} 178 | for field in fields: 179 | original = "original_" + field 180 | current = field + " " * len("original_") 181 | 182 | out_metrics[field] = {} 183 | for metric in output_to_metrics[field]: 184 | # first we compute the aggregated metric 185 | out_metrics[field][metric] = {} 186 | out_metrics[field][metric]["total"] = compute_metric(metric, original, current, all_outputs, final) 187 | logger.info(f"{field},{metric},{out_metrics[field][metric]['total']}") 188 | # We then split the metrics computation per metric field. 189 | # We do this by taking all the inputs so far. Although this involves repetition, this is more generalizable 190 | # to cases where the metric is corpus-wide (e.g. BLEU). 
191 | for metric_field in metrics_fields: 192 | grouped_items = group_by_field(all_outputs, metric_field) 193 | out_metrics[field][metric][metric_field] = [] 194 | for metric_field_value, field_outputs in grouped_items: 195 | out_metrics[field][metric][metric_field].append( 196 | (metric_field_value, compute_metric(metric, original, current, list(field_outputs), False)) 197 | ) 198 | out_metrics[field][metric][metric_field].sort(key=lambda k: k[1]["value"]) 199 | out_metrics[field][metric][metric_field] = OrderedDict(out_metrics[field][metric][metric_field]) 200 | return out_metrics 201 | 202 | 203 | def compute_metrics_from_run(field, filename=None, run_id=None, eval_folder=None, metrics=None): 204 | if run_id is not None: 205 | assert eval_folder is not None 206 | filename = get_artifact(run_id, f"evaluation/{eval_folder}/generated.json") 207 | 208 | filename = Path(filename) 209 | all_outputs = json.load(open(filename, "r")) 210 | output_to_metrics = {} 211 | if metrics is None: 212 | metrics = ["bleu"] 213 | output_to_metrics[field] = metrics 214 | stats = aggregate_metrics(all_outputs, [field], [], output_to_metrics, final=True) 215 | print(json.dumps(stats, indent=2)) 216 | out_folder = filename.parent 217 | (out_folder / f"stats_{filename.stem}.json").write_text(json.dumps(stats, indent=2)) 218 | 219 | if run_id is not None: 220 | mlflow.start_run(run_id) 221 | flattened_stats = flatten(stats) 222 | flattened_stats = {k: flattened_stats[k] for k in flattened_stats if k.count("-") <= 3} 223 | 224 | mlflow.log_metrics(flattened_stats) 225 | 226 | 227 | metrics = {"match": match, "bleu": bleu, "almostmatch": almostmatch} 228 | 229 | if __name__ == "__main__": 230 | Fire(compute_metrics_from_run) 231 | -------------------------------------------------------------------------------- /src/datatuner/classification/distractors.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import re 4 | from copy import deepcopy 5 | from itertools import chain 6 | from pathlib import Path 7 | 8 | import nltk 9 | import pandas as pd 10 | 11 | random.seed(42) 12 | 13 | 14 | def get_distractors(data, text, swapping_candidates, cutting_candidates, random_text, num_candidates=5, 15 | max_per_operation=5): 16 | """Get the distractors for the given inputs""" 17 | distractors_dict = {} 18 | 19 | for cands in swapping_candidates: 20 | distractors_dict["value_error"] = swap_entities(cands, text, max_outputs=max_per_operation) 21 | 22 | for cands in cutting_candidates: 23 | distractors_dict["value_error"].extend(cut_entities(cands, text, max_outputs=max_per_operation)) 24 | 25 | distractors_dict["value_error"].extend(add_negation_errors(text, max_outputs=int(math.ceil(max_per_operation / 2)))) 26 | distractors_dict["omission"] = add_omission(text, max_outputs=max_per_operation) 27 | if "," in text: 28 | distractors_dict["omission"].extend(add_phrase_omission(text, max_outputs=1 + max_per_operation)) 29 | 30 | distractors_dict["repetition"] = add_repetition(text, max_outputs=1 + max_per_operation) 31 | distractors_dict["hallucination"] = add_repetition( 32 | text, random_text=random_text, replace=True, max_outputs=max_per_operation 33 | ) + add_repetition(text, random_text=random_text, max_outputs=max_per_operation) 34 | 35 | distractors = set(chain(*distractors_dict.values())) 36 | 37 | # Remove text itself if present 38 | if text in distractors: 39 | distractors.remove(text) 40 | 41 | # Shuffle and cut 42 | distractors = list(distractors) 43 
| random.shuffle(distractors) 44 | distractors = distractors[:num_candidates] 45 | 46 | # If no distractors found, add placeholders 47 | if len(distractors) == 0: 48 | distractors = ["placeholder"] * num_candidates 49 | # Pad to get to the right number of candidates 50 | if len(distractors) < num_candidates: 51 | ratio = int(math.ceil(num_candidates / len(distractors))) 52 | distractors = (distractors * ratio)[:num_candidates] 53 | 54 | classification_items = [ 55 | {"text": value, "data": data, "label": key} for key in distractors_dict for value in 56 | distractors_dict[key] 57 | ] + [{"text": text, "data": data, "label": "accurate"}] 58 | 59 | # Add negation 60 | replacements = {"[ no ]": "[ yes ]", "[ yes ]": "[ no ]"} 61 | for cand in replacements: 62 | if cand in data: 63 | negated_data = data.replace(cand, replacements[cand], 1) 64 | classification_items.extend([{"text": text, "data": negated_data, "label": "value_error"}]) 65 | 66 | random.shuffle(classification_items) 67 | classification_items = classification_items[:num_candidates] 68 | return distractors, classification_items 69 | 70 | 71 | def add_negation_errors(original_text, max_outputs=5): 72 | outputs = [] 73 | current_text = original_text 74 | blacklisted = ["not", "n't"] 75 | for x in blacklisted: 76 | if x + " " in current_text: 77 | new_text = current_text.replace(x, "", 1) 78 | new_text = new_text.replace("  ", " ") 79 | outputs.append(new_text) 80 | current_text = new_text 81 | 82 | return outputs[:max_outputs] 83 | 84 | 85 | def cut_entities(entity_list, original_text, max_outputs=5): 86 | """Remove part of the entity""" 87 | output = [] 88 | entity_list = deepcopy(entity_list)[:max_outputs] 89 | for entity in entity_list: 90 | rand_ind = random.randint(0, len(entity) - 1) 91 | cut_entity = entity[:rand_ind].strip() 92 | if entity in original_text: 93 | output.append(original_text.replace(entity, cut_entity)) 94 | 95 | return output 96 | 97 | 98 | def swap_entities(entity_list, original_text, max_outputs=5): 99 | """Swap an entity from the `entity_list` with another from the list if present in the text""" 100 | entity_set = set(entity_list) 101 | output = [] 102 | 103 | entity_list = deepcopy(entity_list)[:max_outputs] 104 | 105 | random.shuffle(entity_list) 106 | for entity in entity_list: 107 | passed_entities = deepcopy(entity_set) 108 | passed_entities.remove(entity) 109 | passed_entities = list(passed_entities) 110 | if len(passed_entities) > 0: 111 | rand_entity = random.choice(passed_entities) 112 | if entity in original_text: 113 | text = original_text.replace(entity, rand_entity) 114 | output.append(text) 115 | 116 | return output 117 | 118 | 119 | def swap_pronouns(original_text): 120 | """Swap pronoun with a different one""" 121 | lower = ["he", "she", "it", "they"] 122 | upper = ["He", "She", "It", "They"] 123 | 124 | # find if a pronoun is there 125 | tokens = set(re.findall(r"[\w']+", original_text.lower())) 126 | pronoun_i = -1 127 | for i, p in enumerate(lower): 128 | if p in tokens: 129 | pronoun_i = i 130 | break 131 | 132 | text = original_text 133 | # if a pronoun is found, we replace its occurrences with a random other pronoun 134 | if pronoun_i >= 0: 135 | 136 | # get a random other pronoun 137 | candidates = set(list(range(len(lower)))) 138 | candidates.remove(pronoun_i) 139 | other_pronoun_i = random.choice(list(candidates)) 140 | 141 | for pronouns in [lower, upper]: 142 | pronoun = pronouns[pronoun_i] 143 | other_pronoun = pronouns[other_pronoun_i] 144 | text = 
re.sub(r"\b{}\b".format(pronoun), other_pronoun, text) 145 | return [text] 146 | 147 | return [] 148 | 149 | 150 | def add_phrase_omission(text, max_outputs=5): 151 | indices = [i for i, x in enumerate(text) if x == ","] 152 | output = [] 153 | random.shuffle(indices) 154 | end_strs = [".", ","] 155 | random.shuffle(end_strs) 156 | for end_str in end_strs: 157 | for i in indices: 158 | try: 159 | if len(output) >= max_outputs: 160 | break 161 | # until the index before comma + index from next dot if any 162 | output.append(text[:i] + text[text.index(end_str, i + 1):]) 163 | 164 | except: 165 | pass 166 | 167 | return output 168 | 169 | 170 | def add_omission(text, max_outputs=5): 171 | """Remove the shortest sentence from the text""" 172 | sentences = nltk.sent_tokenize(text) 173 | output = [] 174 | if len(sentences) > 1: 175 | for omit_ind in range(min(max_outputs, len(sentences))): 176 | # sort by increasing length; goal is to remove shortest for subtle omissions 177 | sentences = sorted(sentences, key=lambda x: len(x)) 178 | removed_sentence = sentences[omit_ind] 179 | output.append(text.replace(removed_sentence, "").strip()) 180 | return output 181 | 182 | 183 | def add_repetition(text, random_text=None, replace=False, max_outputs=5): 184 | """Repeat the shortest sentence int the text""" 185 | sentences = nltk.sent_tokenize(text) 186 | 187 | assert not (random_text is None and replace) 188 | 189 | if random_text is None: 190 | sorted_sentences = sorted(sentences, key=lambda x: len(x)) 191 | repeat_ind = 0 192 | random_sentence = sorted_sentences[repeat_ind] 193 | else: 194 | random_sentences = nltk.sent_tokenize(random_text) 195 | random_sentence = sorted(random_sentences, key=lambda x: len(x))[0] 196 | 197 | indices = list(range(min(max_outputs, len(sentences)))) 198 | random.shuffle(indices) 199 | 200 | outputs = [] 201 | for insert_at_ind in indices: 202 | if replace: 203 | sentences[insert_at_ind] = random_sentence 204 | else: 205 | sentences.insert(insert_at_ind, random_sentence) 206 | 207 | outputs.append(" ".join(sentences).strip()) 208 | return outputs[:max_outputs] 209 | 210 | 211 | def write_classification_data(classification_data, classification_dir, split): 212 | classification_dir = Path(classification_dir) 213 | classification_dir.mkdir(parents=True, exist_ok=True) 214 | 215 | random.shuffle(classification_data) 216 | max_data = 100000 if "train" in split else 10000 217 | classification_data = classification_data[:max_data] 218 | print(split) 219 | print(f"Length of classification_data: {len(classification_data)}") 220 | for x in classification_data: 221 | x["text"] = x["text"].replace("\n", " ").replace("\r", " ") 222 | 223 | df = pd.DataFrame(classification_data) 224 | 225 | print(f"Original classes distribution:") 226 | print(f"{df.label.value_counts()}") 227 | if "train" in split: 228 | max_size = df["label"].value_counts().max() 229 | 230 | lst = [df] 231 | for class_index, group in df.groupby("label"): 232 | sizes = [max_size, 3 * len(group), max_size - len(group)] 233 | size = min(sizes) 234 | lst.append(group.sample(size, replace=True)) 235 | 236 | df_new = pd.concat(lst) 237 | df_new = df_new.sample(frac=min(1, max_data / len(df_new))).reset_index(drop=True) 238 | print(f"New classes distribution:") 239 | print(f"{df_new.label.value_counts()}") 240 | else: 241 | df_new = df.sample(frac=min(1, max_data / len(df))).reset_index(drop=True) 242 | 243 | labels = list(df_new.label.unique()) 244 | df_new.to_csv(classification_dir / (split + ".tsv"), sep="|", index=False, 
columns=["label", "data", "text"]) 245 | 246 | labels_file = classification_dir / "labels.txt" 247 | labels_file.write_text("\n".join(sorted(labels))) 248 | print("") 249 | -------------------------------------------------------------------------------- /src/external/webnlg_webnlg_baseline/webnlg_baseline_input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import re 4 | import json 5 | import sys 6 | import getopt 7 | from collections import defaultdict 8 | from external.shimorina_webnlg_baseline.benchmark_reader import Benchmark 9 | 10 | 11 | def select_files(topdir, category='', size=(1, 8)): 12 | """ 13 | Collect all xml files from a benchmark directory. 14 | :param topdir: directory with benchmark 15 | :param category: specify DBPedia category to retrieve texts for a specific category (default: retrieve all) 16 | :param size: specify size to retrieve texts of specific size (default: retrieve all) 17 | :return: list of tuples (full path, filename) 18 | """ 19 | finaldirs = [topdir+'/'+str(item)+'triples' for item in range(size[0], size[1])] 20 | finalfiles = [] 21 | for item in finaldirs: 22 | finalfiles += [(item, filename) for filename in os.listdir(item)] 23 | if category: 24 | finalfiles = [] 25 | for item in finaldirs: 26 | finalfiles += [(item, filename) for filename in os.listdir(item) if category in filename] 27 | return finalfiles 28 | 29 | 30 | def delexicalisation(out_src, out_trg, category, properties_objects): 31 | """ 32 | Perform delexicalisation. 33 | :param out_src: source string 34 | :param out_trg: target string 35 | :param category: DBPedia category 36 | :param properties_objects: dictionary mapping properties to objects 37 | :return: delexicalised strings of the source and target; dictionary containing mappings of the replacements made 38 | """ 39 | with open('delex_dict.json') as data_file: 40 | data = json.load(data_file) 41 | # replace all occurrences of Alan_Bean to ASTRONAUT in input 42 | delex_subj = data[category] 43 | delex_src = out_src 44 | delex_trg = out_trg 45 | # for each instance, we save the mappings between nondelex and delex 46 | replcments = {} 47 | for subject in delex_subj: 48 | clean_subj = ' '.join(re.split('(\W)', subject.replace('_', ' '))) 49 | if clean_subj in out_src: 50 | delex_src = out_src.replace(clean_subj + ' ', category.upper() + ' ') 51 | replcments[category.upper()] = ' '.join(clean_subj.split()) # remove redundant spaces 52 | if clean_subj in out_trg: 53 | delex_trg = out_trg.replace(clean_subj + ' ', category.upper() + ' ') 54 | replcments[category.upper()] = ' '.join(clean_subj.split()) 55 | 56 | # replace all occurrences of objects by PROPERTY in input 57 | for pro, obj in sorted(properties_objects.items()): 58 | obj_clean = ' '.join(re.split('(\W)', obj.replace('_', ' ').replace('"', ''))) 59 | if obj_clean in delex_src: 60 | delex_src = delex_src.replace(obj_clean + ' ', pro.upper() + ' ') 61 | replcments[pro.upper()] = ' '.join(obj_clean.split()) # remove redundant spaces 62 | if obj_clean in delex_trg: 63 | delex_trg = delex_trg.replace(obj_clean + ' ', pro.upper() + ' ') 64 | replcments[pro.upper()] = ' '.join(obj_clean.split()) 65 | 66 | # possible enhancement for delexicalisation: 67 | # do delex triple by triple 68 | # now building | location | New_York_City New_York_City | isPartOf | New_York 69 | # is converted to 70 | # BUILDING location ISPARTOF City ISPARTOF City isPartOf ISPARTOF 71 | return delex_src, delex_trg, replcments 72 | 73 | 74 | 
def create_source_target(b, options, dataset, delex=True): 75 | """ 76 | Write target and source files, and reference files for BLEU. 77 | :param b: instance of Benchmark class 78 | :param options: string "delex" or "notdelex" to label files 79 | :param dataset: dataset part: train, dev, test 80 | :param delex: boolean; perform delexicalisation or not 81 | :return: if delex True, return list of replacement dictionaries for each example 82 | """ 83 | source_out = [] 84 | target_out = [] 85 | rplc_list = [] # store the dict of replacements for each example 86 | for entr in b.entries: 87 | tripleset = entr.modifiedtripleset 88 | lexics = entr.lexs 89 | category = entr.category 90 | for lex in lexics: 91 | triples = '' 92 | properties_objects = {} 93 | for triple in tripleset.triples: 94 | triples += triple.s + ' ' + triple.p + ' ' + triple.o + ' ' 95 | properties_objects[triple.p] = triple.o 96 | triples = triples.replace('_', ' ').replace('"', '') 97 | # separate punct signs from text 98 | out_src = ' '.join(re.split('(\W)', triples)) 99 | out_trg = ' '.join(re.split('(\W)', lex.lex)) 100 | if delex: 101 | out_src, out_trg, rplc_dict = delexicalisation(out_src, out_trg, category, properties_objects) 102 | rplc_list.append(rplc_dict) 103 | # delete white spaces 104 | source_out.append(' '.join(out_src.split())) 105 | target_out.append(' '.join(out_trg.split())) 106 | 107 | # shuffle two lists in the same way 108 | random.seed(10) 109 | if delex: 110 | corpus = list(zip(source_out, target_out, rplc_list)) 111 | random.shuffle(corpus) 112 | source_out, target_out, rplc_list = zip(*corpus) 113 | else: 114 | corpus = list(zip(source_out, target_out)) 115 | random.shuffle(corpus) 116 | source_out, target_out = zip(*corpus) 117 | 118 | with open(dataset + '-webnlg-' + options + '.triple', 'w+') as f: 119 | f.write('\n'.join(source_out)) 120 | with open(dataset + '-webnlg-' + options + '.lex', 'w+') as f: 121 | f.write('\n'.join(target_out)) 122 | 123 | # create separate files with references for multi-bleu.pl for dev set 124 | scr_refs = defaultdict(list) 125 | if dataset == 'dev' and not delex: 126 | for src, trg in zip(source_out, target_out): 127 | scr_refs[src].append(trg) 128 | # length of the value with max elements 129 | max_refs = sorted(scr_refs.values(), key=len)[-1] 130 | keys = [key for (key, value) in sorted(scr_refs.items())] 131 | values = [value for (key, value) in sorted(scr_refs.items())] 132 | # write the source file not delex 133 | with open(options + '-source.triple', 'w+') as f: 134 | f.write('\n'.join(keys)) 135 | # write references files 136 | for j in range(0, len(max_refs)): 137 | with open(options + '-reference' + str(j) + '.lex', 'w+') as f: 138 | out = '' 139 | for ref in values: 140 | try: 141 | out += ref[j] + '\n' 142 | except: 143 | out += '\n' 144 | f.write(out) 145 | 146 | return rplc_list 147 | 148 | 149 | def relexicalise(predfile, rplc_list): 150 | """ 151 | Take a file from seq2seq output and write a relexicalised version of it. 
152 | :param rplc_list: list of dictionaries of replacements for each example (UPPER:not delex item) 153 | :return: list of predicted sentences 154 | """ 155 | relex_predictions = [] 156 | with open(predfile, 'r') as f: 157 | predictions = [line for line in f] 158 | for i, pred in enumerate(predictions): 159 | # replace each item in the corresponding example 160 | rplc_dict = rplc_list[i] 161 | relex_pred = pred 162 | for key in sorted(rplc_dict): 163 | relex_pred = relex_pred.replace(key + ' ', rplc_dict[key] + ' ') 164 | relex_predictions.append(relex_pred) 165 | # with open('relexicalised_predictions_full.txt', 'w+') as f: 166 | # f.write(''.join(relex_predictions)) 167 | 168 | # create a mapping between not delex triples and relexicalised sents 169 | with open('dev-webnlg-all-notdelex.triple', 'r') as f: 170 | dev_sources = [line.strip() for line in f] 171 | src_gens = {} 172 | for src, gen in zip(dev_sources, relex_predictions): 173 | src_gens[src] = gen # need only one lex, because they are the same for a given triple 174 | 175 | # write generated sents to a file in the same order as triples are written in the source file 176 | with open('all-notdelex-source.triple', 'r') as f: 177 | triples = [line.strip() for line in f] 178 | with open('relexicalised_predictions.txt', 'w+') as f: 179 | for triple in triples: 180 | f.write(src_gens[triple]) 181 | 182 | return relex_predictions 183 | 184 | 185 | def input_files(path, filepath=None, relex=False): 186 | """ 187 | Read the corpus, write train and dev files. 188 | :param path: directory with the WebNLG benchmark 189 | :param filepath: path to the prediction file with sentences (for relexicalisation) 190 | :param relex: boolean; do relexicalisation or not 191 | :return: 192 | """ 193 | parts = ['train', 'dev'] 194 | options = ['all-delex', 'all-notdelex'] # generate files with/without delexicalisation 195 | for part in parts: 196 | for option in options: 197 | files = select_files(path + part, size=(1, 8)) 198 | b = Benchmark() 199 | b.fill_benchmark(files) 200 | if option == 'all-delex': 201 | rplc_list = create_source_target(b, option, part, delex=True) 202 | print('Total of {} files processed in {} with {} mode'.format(len(files), part, option)) 203 | elif option == 'all-notdelex': 204 | rplc_list = create_source_target(b, option, part, delex=False) 205 | print('Total of {} files processed in {} with {} mode'.format(len(files), part, option)) 206 | if relex and part == 'dev' and option == 'all-delex': 207 | relexicalise(filepath, rplc_list) 208 | print('Files necessary for training/evaluating are written on disc.') 209 | 210 | 211 | def main(argv): 212 | usage = 'usage:\npython3 webnlg_baseline_input.py -i ' \ 213 | '\ndata-directory is the directory where you unzipped the archive with data' 214 | try: 215 | opts, args = getopt.getopt(argv, 'i:', ['inputdir=']) 216 | except getopt.GetoptError: 217 | print(usage) 218 | sys.exit(2) 219 | input_data = False 220 | for opt, arg in opts: 221 | if opt in ('-i', '--inputdir'): 222 | inputdir = arg 223 | input_data = True 224 | else: 225 | print(usage) 226 | sys.exit() 227 | if not input_data: 228 | print(usage) 229 | sys.exit(2) 230 | print('Input directory is ', inputdir) 231 | input_files(inputdir) 232 | 233 | 234 | if __name__ == "__main__": 235 | main(sys.argv[1:]) 236 | -------------------------------------------------------------------------------- /src/datatuner/classification/classify_generated.py: -------------------------------------------------------------------------------- 1 | 
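# Classify and rerank texts generated by the language model. The module-level functions
# below are exposed through python-fire (see the Fire() call at the bottom of this file),
# so a typical invocation is assumed to look roughly like
#   python classify_generated.py generate --in_file=<...>/generated.json --dataset=viggo
# `generate` writes a test.tsv for the consistency classifier, shells out to
# run_classifier.py, and then calls rerank_and_eval, which reorders each example's
# candidates by the predicted semantic-fidelity label (most accurate label first).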
import json 2 | import logging 3 | import os 4 | import sys 5 | from collections import Counter 6 | from pathlib import Path 7 | from shutil import copyfile 8 | from subprocess import run 9 | 10 | import numpy as np 11 | import pandas as pd 12 | from datatuner.classification.consistency_classifier import dataset_fields 13 | from datatuner.lm.metrics import bleu 14 | from fire import Fire 15 | from scipy import stats 16 | from tqdm import tqdm 17 | 18 | logger = logging.getLogger(__name__) 19 | THIS_DIR = Path(os.path.dirname(os.path.realpath(__file__))) 20 | PACKAGE_LOCATION = f"{THIS_DIR}/../../../../" 21 | 22 | 23 | def generate( 24 | in_file, 25 | dataset=None, 26 | out_folder=None, 27 | model_folder=None, 28 | model_type="roberta", 29 | model_name="roberta-large", 30 | python_location=sys.executable, 31 | classifier_script=THIS_DIR / "run_classifier.py", 32 | correct_label="accurate", 33 | text_key=None, 34 | data_key=None, 35 | ): 36 | """Classify data generated from the language model""" 37 | 38 | in_file = Path(in_file) 39 | data_folder = in_file.parent 40 | data = json.load(open(in_file)) 41 | basic_texts = [] 42 | if text_key is None: 43 | text_key = dataset_fields[dataset]["text"] 44 | else: 45 | text_key = text_key.strip() + (" " * len("original_")) 46 | 47 | if data_key is None: 48 | data_key = dataset_fields[dataset]["data"] 49 | 50 | # Prepare data for the classifier 51 | for item in data: 52 | if type(item[text_key]) == list: 53 | for x in item[text_key]: 54 | basic_texts.append( 55 | {"text": x.replace("\n", " "), "data": item[data_key].replace("\n", ";"), "label": correct_label} 56 | ) 57 | elif type(item[data_key]) == list: 58 | for x in item[data_key]: 59 | basic_texts.append( 60 | {"text": item[text_key].replace("\n", " "), "data": x.replace("\n", ";"), "label": correct_label} 61 | ) 62 | 63 | df = pd.DataFrame(basic_texts) 64 | 65 | df.to_csv(data_folder / "test.tsv", sep="|", index=False, columns=["label", "data", "text"]) 66 | 67 | if model_folder is None: 68 | model_folder = f"{PACKAGE_LOCATION}/{dataset}_consistency_roberta-large_lower" 69 | model_folder = Path(model_folder) 70 | 71 | # Run the classifier command 72 | command = ( 73 | f"{python_location} {classifier_script} --task_name mnli --data_dir {data_folder} --stats_dir {data_folder} " 74 | f" --model_name {model_name} --output_dir {model_folder} --model_type {model_type} --do_eval" 75 | f" --overwrite_cache --per_gpu_eval_batch_size 32 --do_lower_case" 76 | ) 77 | 78 | print(command) 79 | run(command, shell=True) 80 | 81 | rerank_and_eval( 82 | in_file, 83 | dataset, 84 | model_folder=model_folder, 85 | out_folder=out_folder, 86 | correct_label=correct_label, 87 | text_key=text_key, 88 | data_key=data_key, 89 | ) 90 | 91 | 92 | def get_stats(data, dataset): 93 | """Get stats about the dataset""" 94 | if dataset == "webnlg": 95 | return json.dumps( 96 | { 97 | "num_triples": Counter(x["num_triples"] for x in data), 98 | "category": Counter(x["category"] for x in data), 99 | "category_type": Counter(x["category_type"] for x in data), 100 | }, 101 | indent=2, 102 | ) 103 | else: 104 | return "" 105 | 106 | 107 | def rerank_and_eval( 108 | in_file, 109 | dataset, 110 | model_folder=None, 111 | out_folder=None, 112 | nbest=100, 113 | correct_label="accurate", 114 | text_key=None, 115 | data_key=None, 116 | ): 117 | """Compute the metrics based on the generated and classified data before and after reranking""" 118 | 119 | in_file = Path(in_file) 120 | if model_folder is None: 121 | model_folder = 
f"{PACKAGE_LOCATION}/{dataset}_consistency_roberta-large_lower" 122 | 123 | model_folder = Path(model_folder) 124 | if text_key is None: 125 | text_key = dataset_fields[dataset]["text"] 126 | if data_key is None: 127 | data_key = dataset_fields[dataset]["data"] 128 | original_keys = [data_key] 129 | data = json.load(open(in_file)) 130 | if out_folder is None: 131 | out_folder = in_file.parent 132 | out_folder = Path(out_folder) 133 | out_folder.mkdir(parents=True, exist_ok=True) 134 | 135 | # This file is produced by the `generate` function above. It will have a dictionary. 136 | # `preds` is mapped to the list of labels predicted 137 | # `preds_prob` is mapped to the list of probabilities corresponding to the labels 138 | results = json.load(open(out_folder / "results.json", "r")) 139 | 140 | # Labels in the order we want to prioritize (correct first, then less severe errors) 141 | sorted_labels = ["accurate", "value_error", "repetition", "omission", "hallucination", "pronoun_error"] 142 | 143 | k = 0 144 | assert len(data) > 0 145 | 146 | for item in tqdm(data): 147 | cand_len = len(item[text_key]) 148 | indices = list(range(cand_len))[:nbest] 149 | item["pred_prob"] = results["preds_prob"][k: k + cand_len][:nbest] 150 | item["pred"] = results["preds"][k: k + cand_len][:nbest] 151 | 152 | current_labels = [sorted_labels.index(x) for x in item["pred"]] 153 | reranked = [ 154 | x 155 | for x in sorted( 156 | list(zip(current_labels, item["pred_prob"], indices, item[text_key][:nbest])), 157 | key=lambda x: (x[0], x[2]), 158 | ) 159 | ] 160 | item["reranked"] = [x[3] for x in reranked] 161 | item["reranked_pred_prob"] = [x[1] for x in reranked] 162 | item["reranked_pred"] = [sorted_labels[x[0]] for x in reranked] 163 | 164 | k += cand_len 165 | 166 | correct_data = [x for x in data if x["pred"] and x["pred"][0] == correct_label and x["pred_prob"][0]] 167 | wrong_data = [x for x in data if (not x["pred"]) or x["pred"][0] != correct_label] 168 | 169 | print("Evaluating") 170 | 171 | out_stats = "" 172 | 173 | try: 174 | out_stats += f"all data: {get_stats(data, dataset)}\n" 175 | 176 | original_key = f"original_{text_key.strip()}" 177 | 178 | # Get stats before reranking 179 | out_stats += f"correct data: {get_stats(correct_data, dataset)}\n" 180 | out_stats += f"wrong data: {get_stats(wrong_data, dataset)}\n" 181 | 182 | out_stats += f"data, text: {bleu(original_key, text_key, data, True, case_insensitive=True, all_keys=original_keys)}\n" 183 | bleu_correct = bleu(original_key, text_key, correct_data, False, case_insensitive=True, all_keys=original_keys) 184 | out_stats += f"correct_data, text: {bleu_correct}\n" 185 | bleu_wrong = bleu(original_key, text_key, wrong_data, False, case_insensitive=True, all_keys=original_keys) 186 | out_stats += f"wrong_data, text: {bleu_wrong}\n" 187 | out_stats += f"percent correct: {len(correct_data) / len(data) * 100}\n" 188 | 189 | r, p = stats.pointbiserialr( 190 | [0] * bleu_correct["count"] + [1] * bleu_wrong["count"], 191 | [bleu_correct["value"]] * bleu_correct["count"] + [bleu_wrong["value"]] * bleu_wrong["count"] 192 | ) 193 | out_stats += f"r: {r}, p-value: {p}\n" 194 | except: 195 | print("Not computing stats for before reranking") 196 | 197 | # Get stats after reranking 198 | correct_data = [x for x in data if x["reranked_pred"] and x["reranked_pred"][0] == correct_label] 199 | wrong_data = [x for x in data if (not x["reranked_pred"]) or x["reranked_pred"][0] != correct_label] 200 | 201 | try: 202 | out_stats += f"correct data: 
{get_stats(correct_data, dataset)}\n" 203 | out_stats += f"wrong data: {get_stats(wrong_data, dataset)}\n" 204 | 205 | out_stats += ( 206 | f"data, reranked: " 207 | f'{bleu(original_key, "reranked", data, True, case_insensitive=True, all_keys=original_keys)}\n' 208 | ) 209 | bleu_correct = bleu( 210 | original_key, "reranked", correct_data, False, case_insensitive=True, all_keys=original_keys 211 | ) 212 | out_stats += f"correct_data, reranked: {bleu_correct}\n" 213 | bleu_wrong = bleu(original_key, "reranked", wrong_data, False, case_insensitive=True, all_keys=original_keys) 214 | out_stats += f"wrong_data, reranked: {bleu_wrong}\n" 215 | out_stats += f"percent correct: {len(correct_data) / len(data) * 100}\n" 216 | 217 | r, p = stats.pointbiserialr( 218 | [0] * bleu_correct["count"] + [1] * bleu_wrong["count"], 219 | [bleu_correct["value"]] * bleu_correct["count"] + [bleu_wrong["value"]] * bleu_wrong["count"], 220 | ) 221 | 222 | out_stats += f"r: {r}, p-value: {p}\n" 223 | 224 | except: 225 | print("Not computing stats for after reranking") 226 | 227 | json.dump(data, open(out_folder / "classified.json", "w"), indent=2) 228 | json.dump(correct_data, open(out_folder / "classified_correct.json", "w"), indent=2) 229 | json.dump(wrong_data, open(out_folder / "classified_wrong.json", "w"), indent=2) 230 | print(out_stats) 231 | (out_folder / "stats.txt").write_text(out_stats) 232 | for item in data: 233 | item[text_key] = item["reranked"] 234 | item["pred_prob"] = item["reranked_pred_prob"] 235 | item["pred"] = item["reranked_pred"] 236 | del item["reranked"] 237 | del item["reranked_pred_prob"] 238 | del item["reranked_pred"] 239 | json.dump(data, open(out_folder / "reranked.json", "w"), indent=2) 240 | 241 | 242 | systems = ["systemFcPost", "systemNoFc", "systemNoFcNoFs"] 243 | datasets = ["ldc", "webnlg", "e2e", "viggo"] 244 | 245 | 246 | def get_semantic_stats(data, folder, system, dataset): 247 | sfc_correct = [item["sfc_correct"] for item in data] 248 | 249 | results = {"sfc_correct": np.mean(sfc_correct)} 250 | if dataset != "ldc": 251 | ser_correct = [item["ser_correct"] for item in data] 252 | ser = [item["ser"] for item in data] 253 | results.update( 254 | { 255 | "ser_correct": np.mean(ser_correct), 256 | "ser": np.mean(ser), 257 | "both_correct": np.mean(np.prod([sfc_correct, ser_correct], axis=0)), 258 | "at_least_one_correct": np.mean(np.logical_or(sfc_correct, ser_correct)), 259 | "sfc_correct_ser_wrong": np.mean( 260 | [int(sfc_correct[i] == 1 and ser_correct[i] == 0) for i in range(len(data))] 261 | ), 262 | "sfc_wrong_ser_correct": np.mean( 263 | [int(sfc_correct[i] == 0 and ser_correct[i] == 1) for i in range(len(data))] 264 | ), 265 | } 266 | ) 267 | 268 | print(results) 269 | out_file = folder / (f"results_{system}.json") 270 | print(f"written to {out_file}") 271 | json.dump(results, open(out_file, "w"), indent=2) 272 | return data 273 | 274 | 275 | if __name__ == "__main__": 276 | Fire() 277 | -------------------------------------------------------------------------------- /src/external/ufal_dsg_tgen/data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Helper data structures 6 | """ 7 | 8 | from builtins import zip 9 | from builtins import str 10 | from builtins import object 11 | import re 12 | 13 | 14 | class DAI(object): 15 | """Simple representation of a single dialogue act item.""" 16 | 17 | __slots__ = ['da_type', 'slot', 'value'] 18 | 19 | def __init__(self, 
da_type, slot=None, value=None): 20 | self.da_type = da_type 21 | self.slot = slot 22 | self.value = value 23 | 24 | def __str__(self): 25 | if self.slot is None: 26 | return self.da_type + '()' 27 | if self.value is None: 28 | return self.da_type + '(' + self.slot + ')' 29 | quote = '\'' if (' ' in self.value or ':' in self.value) else '' 30 | return self.da_type + '(' + self.slot + '=' + quote + self.value + quote + ')' 31 | 32 | def __bytes__(self): 33 | return str(self).encode('ascii', errors='replace') 34 | 35 | def __repr__(self): 36 | return 'DAI.parse("' + str(self) + '")' 37 | 38 | def __hash__(self): 39 | return hash(repr(self)) 40 | 41 | def __eq__(self, other): 42 | return (self.da_type == other.da_type and 43 | self.slot == other.slot and 44 | self.value == other.value) 45 | 46 | def __lt__(self, other): 47 | return (self.da_type < other.da_type or 48 | (self.da_type == other.da_type and self.slot < other.slot) or 49 | (self.da_type == other.da_type and self.slot == other.slot and 50 | self.value < other.value)) 51 | 52 | def __le__(self, other): 53 | return (self.da_type < other.da_type or 54 | (self.da_type == other.da_type and self.slot < other.slot) or 55 | (self.da_type == other.da_type and self.slot == other.slot and 56 | self.value <= other.value)) 57 | 58 | def __ne__(self, other): 59 | return not self == other 60 | 61 | def __gt__(self, other): 62 | return not self <= other 63 | 64 | def __ge__(self, other): 65 | return not self < other 66 | 67 | @staticmethod 68 | def parse(dai_text): 69 | da_type, svp = dai_text[:-1].split('(', 1) 70 | 71 | if not svp: # no slot + value (e.g. 'hello()') 72 | return DAI(da_type) 73 | 74 | if '=' not in svp: # no value (e.g. 'request(to_stop)') 75 | return DAI(da_type, svp) 76 | 77 | slot, value = svp.split('=', 1) 78 | if value.endswith('"#'): # remove special '#' characters in Bagel data (TODO treat right) 79 | value = value[:-1] 80 | if value[0] in ['"', '\'']: # remove quotes 81 | value = value[1:-1] 82 | return DAI(da_type, slot, value) 83 | 84 | 85 | class DA(object): 86 | """Dialogue act -- a list of DAIs with a few special functions for parsing etc..""" 87 | 88 | def __init__(self): 89 | self.dais = [] 90 | 91 | def __getitem__(self, idx): 92 | return self.dais[idx] 93 | 94 | def __setitem__(self, idx, value): 95 | self.dais[idx] = value 96 | 97 | def append(self, value): 98 | self.dais.append(value) 99 | 100 | def __str__(self): 101 | return '&'.join([str(dai) for dai in self.dais]) 102 | 103 | def __bytes__(self): 104 | return str(self).encode('ascii', errors='xmlcharrefreplace') 105 | 106 | def __repr__(self): 107 | return 'DA.parse("' + str(self) + '")' 108 | 109 | def __hash__(self): 110 | return hash(repr(self)) 111 | 112 | def __len__(self): 113 | return len(self.dais) 114 | 115 | def __eq__(self, other): 116 | if not isinstance(other, DA): 117 | return NotImplemented 118 | for self_dai, other_dai in zip(self.dais, other.dais): 119 | if self_dai != other_dai: 120 | return False 121 | return True 122 | 123 | def __ne__(self, other): 124 | return not self == other 125 | 126 | def sort(self): 127 | self.dais.sort() 128 | 129 | @staticmethod 130 | def parse(da_text): 131 | """Parse a DA string into DAIs (DA types, slots, and values).""" 132 | da = DA() 133 | for dai_text in da_text[:-1].split(')&'): 134 | da.append(DAI.parse(dai_text + ')')) 135 | return da 136 | 137 | class TagQuotes(object): 138 | """A helper class for numbering the occurrences of quoted things in the text.""" 139 | def __init__(self): 140 | self.counter 
= 0 141 | 142 | def __call__(self, match): 143 | self.counter += 1 144 | return 'XXXQUOT%d' % self.counter 145 | 146 | @staticmethod 147 | def _protect_quotes(text): 148 | """Find and replace quoted parts of the sentence by tags.""" 149 | tag_pattern = '"[^"]*"|\'[^\']*\'' 150 | tags = re.findall(tag_pattern, text) 151 | sent = re.sub(tag_pattern, DA.TagQuotes(), text) 152 | return sent, tags 153 | 154 | @staticmethod 155 | def parse_cambridge_da(da_text): 156 | """Parse a Cambridge-style DA string a DA object.""" 157 | da = DA() 158 | da_text, quoted = DA._protect_quotes(da_text.strip()) 159 | quoted_num = 1 160 | 161 | for dai_text in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', da_text): 162 | da_type, svps_text = dai_text.groups() 163 | 164 | if not svps_text: # no slots/values (e.g. 'hello()') 165 | da.append(DAI(da_type, None, None)) 166 | continue 167 | 168 | # we have some slots/values – split them into DAI 169 | svps = re.findall('([^,;=\'"]+(?:=(?:[^"\',;]+))?)(?:[,;]|[\'"]$|$)', svps_text) 170 | for svp in svps: 171 | 172 | if '=' not in svp: # no value, e.g. '?request(near)' 173 | da.append(DAI(da_type, svp, None)) 174 | continue 175 | 176 | # we have a value 177 | slot, value = svp.split('=', 1) 178 | if 'XXXQUOT%d' % quoted_num in value: # get back the quoted value 179 | value = re.sub('XXXQUOT%d' % quoted_num, quoted.pop(0), value, count=1) 180 | quoted_num += 1 181 | if re.match(r'^\'.*\'$', value) or re.match('^".*"$', value): 182 | value = value[1:-1] 183 | assert not re.match(r'^\'', value) and not re.match(r'\'$', value) 184 | assert not re.match(r'^"', value) and not re.match(r'"$', value) 185 | 186 | da.append(DAI(da_type, slot, value)) 187 | 188 | return da 189 | 190 | @staticmethod 191 | def parse_diligent_da(da_text): 192 | """Parse a Diligent-style flat MR (E2E NLG dataset) string into a DA object.""" 193 | da = DA() 194 | 195 | for dai_text in re.finditer(r'([a-z_A-Z]+)\[([^\]]*)\]', da_text): 196 | slot, value = dai_text.groups() 197 | slot = re.sub(r'([A-Z])', r'_\1', slot).lower() 198 | da.append(DAI('inform', slot, value if value else None)) 199 | 200 | return da 201 | 202 | @staticmethod 203 | def parse_dict(da_dict, assume_da_type='inform'): 204 | """Parse an attribute-value dict, assuming the given DA type for all resulting DAIs.""" 205 | da = DA() 206 | for slot, values in da_dict.items(): 207 | for value in values.keys(): 208 | da.append(DAI(assume_da_type, slot, value)) 209 | da.sort() 210 | return da 211 | 212 | def value_for_slot(self, slot): 213 | """Return the value for the given slot (None if unset or not present at all). 214 | Uses the first occurrence of this slot if found.""" 215 | for dai in self.dais: 216 | if dai.slot == slot: 217 | return dai.value 218 | return None 219 | 220 | def has_value(self, value): 221 | """If the DA contains the given value, return the corresponding slot; return None 222 | otherwise. Abstracts away from "and" and "or" values (returns True for both coordination 223 | members).""" 224 | for dai in self.dais: 225 | if dai.value == value: 226 | return dai.slot 227 | if (dai.value is not None and 228 | value not in [None, '?'] and 229 | (re.match(r'.* (and|or) ' + value + r'$', dai.value) or 230 | re.match(r'^' + value + r' (and|or) ', dai.value))): 231 | return dai.slot 232 | return None 233 | 234 | def set_value_for_slot(self, slot, value): 235 | """Replace the value of the given slot. Has no effect if the slot is not present 236 | in the DA. 
Will only replace the 1st occurrence of the slot.""" 237 | for dai in self.dais: 238 | if dai.slot == slot: 239 | dai.value = value 240 | break 241 | 242 | def get_delexicalized(self, delex_slots): 243 | """Return a delexicalized copy o fthe current DA (delexicalize slots that are in 244 | the given parameter). 245 | 246 | @param delex_slots: a set of names of slots to be delexicalized 247 | @return: a new DA() object with delexicalized values 248 | """ 249 | ret = DA() 250 | for dai in self: 251 | ret_dai = DAI(dai.da_type, dai.slot, 252 | 'X-' + dai.slot 253 | if (dai.slot in delex_slots and 254 | dai.value not in ['none', None, 'dont_care']) 255 | else dai.value) 256 | ret.append(ret_dai) 257 | return ret 258 | 259 | def to_human_string(self): 260 | """Return a string that is supposedly more human-readable than the standard DA form.""" 261 | out = '' 262 | cur_dat = None 263 | for dai in self: 264 | if dai.da_type != cur_dat: 265 | out += ('; ' if out else '') + dai.da_type.upper() 266 | cur_dat = dai.da_type 267 | if dai.slot: 268 | out += ': ' 269 | elif dai.slot: 270 | out += ', ' 271 | if dai.slot: 272 | out += dai.slot 273 | if dai.value: 274 | out += ' = ' + dai.value 275 | return out 276 | 277 | def to_cambridge_da_string(self): 278 | """Convert to Cambridge-style DA string (opposite of parse_cambridge_da).""" 279 | out = '' 280 | cur_dat = None 281 | for dai in self: 282 | if dai.da_type != cur_dat: 283 | out += (')&' if out else '') + dai.da_type + '(' 284 | cur_dat = dai.da_type 285 | elif dai.slot: 286 | out += ',' 287 | if dai.slot: 288 | out += dai.slot 289 | if dai.value: 290 | quote = '\'' if (' ' in dai.value or ':' in dai.value) else '' 291 | out += '=' + quote + dai.value + quote 292 | out += ')' if out else '' 293 | return out 294 | 295 | def to_diligent_da_string(self): 296 | """Convert to Diligent E2E dataset flat MR string (opposite of parse_diligent_da). 297 | Note that all DA type information is lost.""" 298 | # return slot names to camel case 299 | return ', '.join([re.sub(r'_([a-z])', lambda pat: pat.group(1).upper(), dai.slot) 300 | + '[' + dai.value + ']' for dai in self]) 301 | 302 | 303 | class Abst(object): 304 | """Simple representation of a single abstraction/delexicalization instruction.""" 305 | 306 | __slots__ = ['slot', 'value', 'surface_form', 'start', 'end'] 307 | 308 | def __init__(self, slot=None, value=None, surface_form=None, start=None, end=None): 309 | self.slot = slot 310 | self.value = value 311 | self.surface_form = surface_form 312 | self.start = start 313 | self.end = end 314 | if self.start is not None and self.end is None: 315 | self.end = self.start + 1 316 | 317 | def __str__(self): 318 | """Create string representation of the abstraction instruction, in the following format: 319 | slot="value":"surface_form":start-end. 
Surface form is omitted if None, quotes are omitted 320 | if not needed.""" 321 | # prepare quoting 322 | quote_value = '"' if ' ' in self.value or ':' in self.value else '' 323 | if self.surface_form is not None: 324 | quote_sf = '"' if ' ' in self.surface_form or ':' in self.surface_form else '' 325 | # create output 326 | out = self.slot + '=' + quote_value + self.value + quote_value + ':' 327 | if self.surface_form is not None: 328 | out += quote_sf + self.surface_form + quote_sf + ':' 329 | out += str(self.start) + '-' + str(self.end) 330 | return out 331 | 332 | def __bytes__(self): 333 | return str(self).encode('ascii', errors='xmlcharrefreplace') 334 | 335 | def __repr__(self): 336 | return 'Abst.parse("' + str(self) + '")' 337 | 338 | @staticmethod 339 | def parse(abst_str): 340 | """Create the abstraction instruction from a string representation, in the following 341 | format: slot="value":"surface_form":start-end. Here, surface form is optional and value 342 | and surface form do not need to be enquoted if they do not contain colons or spaces. 343 | @param abst_str: string representation of the abstraction instruction 344 | @return: Abst object representing the abstraction instruction 345 | """ 346 | slot, rest = abst_str.split('=', 1) 347 | if rest.startswith('"'): 348 | value, rest = re.split(r'(?