├── data └── .gitkeep ├── src ├── datatuner │ ├── __init__.py │ ├── lm │ │ ├── __init__.py │ │ ├── converters.py │ │ ├── launch_tokenizer.py │ │ ├── process_json.py │ │ ├── custom_gpt2.py │ │ ├── special_token_generator.py │ │ ├── custom_tokenizer.py │ │ ├── reranker.py │ │ ├── utils.py │ │ ├── model_loader.py │ │ ├── novograd.py │ │ ├── cross_entropy.py │ │ └── metrics.py │ ├── ops │ │ └── mlflow.py │ ├── classification │ │ ├── consistency_classifier.py │ │ ├── consistency_processor.py │ │ ├── distractors.py │ │ └── classify_generated.py │ └── utils.py └── external │ ├── __init__.py │ ├── ufal_dsg_tgen │ ├── __init__.py │ └── data.py │ ├── jjuraska_slug2slug │ ├── __init__.py │ ├── slot_aligner │ │ ├── alignment │ │ │ ├── numeric_slot.py │ │ │ ├── utils.py │ │ │ ├── list_slot.py │ │ │ ├── scalar_slot.py │ │ │ ├── categorical_slots.py │ │ │ ├── alternatives.json │ │ │ └── boolean_slot.py │ │ └── slot_extraction.py │ ├── config.py │ └── slug2slug_ser.py │ ├── shimorina_inlg_2018 │ ├── __init__.py │ └── webnlg_slot_error_rate.py │ ├── tuetschek_e2e_cleaning │ └── __init__.py │ ├── webnlg_webnlg_baseline │ ├── __init__.py │ ├── benchmark_reader.py │ └── webnlg_baseline_input.py │ ├── ukplab_emnlp2019_dualgraph │ ├── preprocess_LDC2017T10.sh │ ├── split_amr.py │ └── gen_LDC2017T10.sh │ └── README.md ├── paper ├── experiments │ ├── e2e │ │ ├── __init__.py │ │ └── preprocess.py │ ├── ldc │ │ ├── __init__.py │ │ └── preprocess.py │ ├── viggo │ │ ├── __init__.py │ │ └── preprocess.py │ ├── webnlg │ │ ├── __init__.py │ │ ├── webnlg_utils.py │ │ └── preprocess.py │ └── mturk │ │ ├── README.md │ │ └── text_stats.py ├── evaluate_lm_simple.sh ├── train_classifier.sh ├── eval_with_classifier.sh ├── task_configs │ ├── viggo.json │ ├── e2e.json │ ├── ldc.json │ ├── e2e_cg.json │ ├── viggo_cg.json │ ├── ldc_cg.json │ ├── webnlg.json │ └── webnlg_cg.json ├── lm_training_args │ ├── ldc │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json │ ├── viggo │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json │ ├── webnlg │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json │ └── e2e │ │ ├── DataTuner_No_FC_model_training_args.json │ │ └── DataTuner_No_FC_No_FS_model_training_args.json ├── train_lm.sh ├── config.sh ├── preprocess.sh ├── classifier_training_args │ ├── e2e │ │ └── e2e_model_training_args.json │ ├── ldc │ │ └── ldc_model_training_args.json │ ├── webnlg │ │ └── webnlg_model_training_args.json │ └── viggo │ │ └── viggo_model_training_args.json ├── evaluate_lm.sh ├── retrieve.sh └── README.md ├── CODE_OF_CONDUCT.md ├── setup.py ├── environment.yml ├── setup.sh ├── CONTRIBUTING.md └── README.md /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatuner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatuner/lm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
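A note on the layout above: the Python sources live under src/, with the core package in src/datatuner and vendored third-party code under src/external, while paper/ holds the shell scripts, task configs, and training-argument files used to reproduce the experiments. Once the package is installed in development mode (setup.sh later in this dump runs python setup.py develop), modules are imported by these package paths, exactly as the repository's own files do. A minimal illustrative usage follows, assuming the finetune conda environment from environment.yml is active; the MR string passed to clean_mrl is a made-up example, not taken from any dataset.

from datatuner.lm.converters import clean_mrl    # defined in src/datatuner/lm/converters.py
from external.jjuraska_slug2slug import config    # defined in src/external/jjuraska_slug2slug/config.py

# clean_mrl spaces out parentheses and drops underscores/dots from a meaning representation
print(clean_mrl("inform(name[Example_Game])"))
print(config.SLOT_ALIGNER_ALTERNATIVES)           # path to the slot aligner's alternatives.json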
/paper/experiments/e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/experiments/ldc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/experiments/viggo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/experiments/webnlg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/ufal_dsg_tgen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/shimorina_inlg_2018/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/tuetschek_e2e_cleaning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/external/webnlg_webnlg_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/datatuner/lm/converters.py: -------------------------------------------------------------------------------- 1 | # Converter functions available to apply to text fields 2 | 3 | def clean_mrl(mrl): 4 | return mrl.replace("_", " ").replace(".", " ").replace("(", " ( ").replace(")", " ) ") 5 | 6 | 7 | converters = {"clean_mrl": clean_mrl} 8 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="datatuner", 5 | version="1.0", 6 | description="Natural Language Generation Library", 7 | packages=find_packages(), 8 | package_dir={"": "src"}, 9 | package_data={}, 10 | install_requires=[], 11 | extras_require={}, 12 | zip_safe=False, 13 | tests_require=[], 14 | ) 15 | -------------------------------------------------------------------------------- /paper/evaluate_lm_simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | 4 | TEST_FILE=$1 5 | MODEL=$2 6 | 7 | echo "Evaluating $TEST_FILE with the model in $MODEL" 8 | 9 | $python ../src/datatuner/lm/evaluate.py \ 10 | --filename $TEST_FILE \ 11 | --no_sample \ 12 | --model_checkpoint $MODEL \ 13 | --nbest 5 \ 14 | --beam_width 5 \ 15 | --per_step_predictions 5 \ 16 | --averaging default \ 17 | --beam_alpha 0.75 \ 18 | --model_type gpt2 -------------------------------------------------------------------------------- /src/datatuner/ops/mlflow.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | 3 | 4 | def get_artifact(run_id, path): 5 | client = mlflow.tracking.MlflowClient() 6 | return client.download_artifacts(run_id, path) 7 | 8 | 9 | def get_finished_models(experiments): 10 | client = mlflow.tracking.MlflowClient() 11 | runs = client.search_runs(experiments, filter_string="metrics.finished=1") 12 | run_ids = [x.info.run_id for x in runs] 13 | return run_ids 14 | -------------------------------------------------------------------------------- /src/datatuner/lm/launch_tokenizer.py: -------------------------------------------------------------------------------- 1 | from datatuner.lm.model_loader import load_pretrained_tokenizer 2 | from fire import Fire 3 | 4 | 5 | def launch(model_checkpoint, model_type="gpt2"): 6 | tokenizer = load_pretrained_tokenizer(model_checkpoint, model_type) 7 | 8 | while True: 9 | tokenized = tokenizer.tokenize(input("text >>> ")) 10 | 11 | print(tokenized) 12 | print(tokenizer.convert_tokens_to_ids(tokenized)) 13 | 14 | 15 | if __name__ == "__main__": 16 | Fire(launch) 17 | -------------------------------------------------------------------------------- /paper/train_classifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | TRAINING_DATA_FOLDER=$1 4 | OUTPUT_FOLDER=$2 5 | TRAINING_ARGS=$3 6 | NUM_PARALLEL=$4 7 | 8 | if [ -z "$NUM_PARALLEL" ]; then 9 | NUM_PARALLEL=1 10 | fi 11 | 12 | mkdir -p $OUTPUT_FOLDER 13 | 14 | echo "Training the classifier and writing the trained model to $OUTPUT_FOLDER" 15 | 16 | $python -m torch.distributed.launch --nproc_per_node=$NUM_PARALLEL ../src/datatuner/classification/run_classifier.py \ 17 | --data_dir $TRAINING_DATA_FOLDER \ 18 | --output_dir $OUTPUT_FOLDER \ 19 | --retrain_base $TRAINING_ARGS -------------------------------------------------------------------------------- /paper/eval_with_classifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # example: bash eval_with_classifier.sh ./data/consistency/viggo eval_results ~/trained_classifiers/viggo/ amrl text 3 | source ./config.sh 4 | 5 | TRAINING_DATA_FOLDER=$1 6 | GENERATED_DATA_FOLDER=$2 7 | MODEL_FOLDER=$3 8 
| DATA_KEY=$4 9 | TEXT_KEY=$5 10 | 11 | cp $TRAINING_DATA_FOLDER/labels.txt $GENERATED_DATA_FOLDER/labels.txt 12 | 13 | $python ../src/datatuner/classification/classify_generated.py generate \ 14 | --in_file $GENERATED_DATA_FOLDER/generated.json \ 15 | --model_folder $MODEL_FOLDER \ 16 | --data_key $DATA_KEY \ 17 | --text_key $TEXT_KEY -------------------------------------------------------------------------------- /paper/experiments/webnlg/webnlg_utils.py: -------------------------------------------------------------------------------- 1 | from xml.etree import ElementTree 2 | 3 | 4 | def camel_case_split(s): 5 | words = [[s[0]]] 6 | 7 | for c in s[1:]: 8 | if words[-1][-1].islower() and c.isupper(): 9 | words.append(list(c)) 10 | else: 11 | words[-1].append(c) 12 | 13 | return " ".join(["".join(word).lower() for word in words]) 14 | 15 | 16 | def cleanup(s): 17 | if type(s) != str: 18 | s = ElementTree.tostring(s, encoding="unicode") 19 | s = s.replace("\t", " ").replace("\n", " ").replace("_", " ") 20 | s = " ".join(s.split(" ")).strip() 21 | return s 22 | -------------------------------------------------------------------------------- /paper/task_configs/viggo.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "viggo", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /paper/task_configs/e2e.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "e2e_dataset", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /paper/task_configs/ldc.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ldc", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "linearized_amr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "answer_text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /paper/lm_training_args/ldc/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/ldc.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 3, 6 | "valid_batch_size": 4, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 20, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | 
"ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": 350 26 | } -------------------------------------------------------------------------------- /paper/task_configs/e2e_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "e2e_dataset_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "token_typing": "coarse_grained" 30 | } -------------------------------------------------------------------------------- /paper/task_configs/viggo_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "viggo_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "new_mr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "ref", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "token_typing": "coarse_grained" 30 | } -------------------------------------------------------------------------------- /paper/lm_training_args/viggo/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/viggo.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/webnlg/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/webnlg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 2, 12 | "n_epochs": 15, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": false, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 3, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": 200 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/e2e/DataTuner_No_FC_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/e2e_dataset_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 10, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/ldc/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/ldc_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 3, 6 | "valid_batch_size": 4, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 20, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 0, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": 350 26 | } -------------------------------------------------------------------------------- /paper/task_configs/ldc_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ldc_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "linearized_amr", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "answer_text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "token_typing": "coarse_grained" 30 | } -------------------------------------------------------------------------------- /paper/lm_training_args/viggo/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/viggo_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": -1, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/webnlg/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/webnlg_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 8, 6 | "valid_batch_size": 1, 7 | 
"gradient_accumulation_steps": 8, 8 | "lr": 0.01, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 2, 12 | "n_epochs": 15, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": -1, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "novograd", 25 | "max_block_size": 200 26 | } -------------------------------------------------------------------------------- /paper/lm_training_args/e2e/DataTuner_No_FC_No_FS_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_cache": "./dataset_cache", 3 | "task_config": "./task_configs/e2e_dataset_cg.json", 4 | "model_checkpoint": "gpt2-medium", 5 | "train_batch_size": 10, 6 | "valid_batch_size": 1, 7 | "gradient_accumulation_steps": 8, 8 | "lr": 0.0001, 9 | "adam_epsilon": 1e-06, 10 | "max_norm": 1.0, 11 | "patience": 1, 12 | "n_epochs": 10, 13 | "max_data": 0, 14 | "val_max_data": 0, 15 | "freeze": false, 16 | "smoothing": 0.0, 17 | "ignore_cache": true, 18 | "device": "cuda", 19 | "fp16": "", 20 | "local_rank": 1, 21 | "warmup_steps": 0, 22 | "multitask": false, 23 | "scheduler": "piecewiselinear", 24 | "optimizer": "adamw", 25 | "max_block_size": null 26 | } -------------------------------------------------------------------------------- /paper/train_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | 4 | DATASET=$1 5 | SYSTEM=$2 6 | OUTPUT_FOLDER=$3 7 | NUM_PARALLEL=$4 8 | 9 | if [ -z "$NUM_PARALLEL" ]; then 10 | NUM_PARALLEL=1 11 | fi 12 | 13 | SUFFIX="" 14 | if [[ "$SYSTEM" = "DataTuner_No_FC_No_FS" ]]; then 15 | SUFFIX="_cg" 16 | fi 17 | 18 | echo "Training the model for the dataset $DATASET and writing the trained model to $OUTPUT_FOLDER" 19 | 20 | $python -m torch.distributed.launch --nproc_per_node=$NUM_PARALLEL ../src/datatuner/lm/train.py \ 21 | --retrain_base ./lm_training_args/$DATASET/${SYSTEM}_model_training_args.json \ 22 | --logdir $OUTPUT_FOLDER \ 23 | --dataset_path ../data/$DATASET \ 24 | --task_config ./task_configs/${DATASET}${SUFFIX}.json \ 25 | --ignore_cache \ 26 | --overwrite_output_dir -------------------------------------------------------------------------------- /src/external/ukplab_emnlp2019_dualgraph/preprocess_LDC2017T10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo "./preprocess_LDC2017T10.sh " 5 | exit 2 6 | fi 7 | 8 | ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | 10 | bash ${ROOT_DIR}/process_amr/gen_LDC2017T10.sh ${1} 11 | 12 | python ${ROOT_DIR}/process_amr/generate_input_opennmt.py -i ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/ 13 | 14 | mkdir -p ${ROOT_DIR}/data/ldc2017t10 15 | mv ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/dev-* ${ROOT_DIR}/data/ldc2017t10 16 | mv ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/test-* ${ROOT_DIR}/data/ldc2017t10 17 | mv ${ROOT_DIR}/process_amr/data/amr_ldc2017t10/train-* ${ROOT_DIR}/data/ldc2017t10 18 | 19 | rm -rf data/ldc2017t10.* -------------------------------------------------------------------------------- /src/external/README.md: -------------------------------------------------------------------------------- 1 | # External Packages 2 | 3 | This directory contains subsections of other repositories who provide 
tokenization and evaluation scripts for the data sets used. 4 | The required code is included here for ease of reproducibility. 5 | 6 | See also [NOTICE.txt](../../NOTICE.txt). 7 | 8 | ### Sources 9 | 10 | **/jjuraska_slug2slug** from https://github.com/jjuraska/slug2slug 11 | 12 | 13 | **/shimorina_inlg_2018** from https://gitlab.com/shimorina/inlg-2018 14 | 15 | **/webnlg_webnlg_baseline** from https://gitlab.com/webnlg/webnlg-baseline 16 | 17 | **/tuetschek_e2e_cleaning** from https://github.com/tuetschek/e2e-cleaning 18 | 19 | **/ufal_dsg_tgen** from https://github.com/UFAL-DSG/tgen 20 | 21 | **/ukplab_emnlp2019_dualgraph** from https://github.com/UKPLab/emnlp2019-dualgraph 22 | -------------------------------------------------------------------------------- /paper/experiments/mturk/README.md: -------------------------------------------------------------------------------- 1 | # Preparing MTurk Data 2 | 3 | This directory contains scripts used for sampling the system generated outputs for human annotation, and the results of these annotations. 4 | 5 | ## Generate fluency data for annotation 6 | 7 | `python experiments/mturk/prepare_mturk.py prepare ~/system_outputs/ ~/mturk_fluency/ fluency` 8 | 9 | 10 | ## Generate fidelity data for annotation 11 | `python experiments/mturk/prepare_mturk.py prepare ~/system_outputs/ ~/mturk_fidelity/ fidelity` 12 | 13 | 14 | ## Score the fluency annotations 15 | `python experiments/mturk/prepare_mturk.py score ~/system_outputs_test/ ~/mturk_fluency/ fluency ./experiments/mturk/` 16 | 17 | ## Score the fidelity annotations 18 | `python experiments/mturk/prepare_mturk.py score ~/system_outputs_test/ ~/mturk_fidelity/ fidelity ./experiments/mturk/` -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: finetune 2 | channels: 3 | - pytorch 4 | - powerai 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - fairseq 9 | - spacy=2.0.12 10 | - fire 11 | - pip 12 | - tqdm 13 | - python=3.7.3 14 | - cudatoolkit=9.2 15 | - pytorch>=1.1.0 16 | - tensorboardx=1.8 17 | - tensorflow=1.13.1 18 | - ftfy=5.5.1 19 | - mxnet 20 | - nltk 21 | - ipdb 22 | - ipython 23 | - isort 24 | - scikit-learn<=0.21.3 25 | - python-annoy 26 | - mlflow=1.3 27 | - scipy 28 | - pyspark 29 | - tabulate=0.8.6 30 | - pip: 31 | - sacrebleu 32 | - scikit-posthocs 33 | - pytorch-ignite==0.2.1 34 | - transformers==2.3.0 35 | - sentence-transformers==0.2.5 36 | - pandas_ml 37 | - streamlit==0.52.2 38 | - matplotlib 39 | - pandas==0.24.2 40 | - sentencepiece==0.1.91 41 | - textstat 42 | - mlxtend -------------------------------------------------------------------------------- /paper/task_configs/webnlg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webnlg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "modifiedtripleset", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "extra_fields": [ 30 | "category_type", 31 | "category", 32 | "num_triples" 33 | ], 34 | "metrics_fields": [ 35 | "category_type", 36 | "category", 37 | "num_triples" 38 | ] 39 | } 40 | 41 | -------------------------------------------------------------------------------- 
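The task configs above (viggo.json, e2e.json, ldc.json and webnlg.json, along with the _cg variants that follow) share one schema: data_shape lists, in order, the segments of a linearized training example. Segments of type "special" are marker tokens (their "id" values show up blank in this dump), segments of type "text" name the field to read from each JSON record, "learn": true appears to mark the segment the language model is trained to generate, and "metrics" lists how that segment is evaluated. The sketch below reads such a config and linearizes one record; it is illustrative only, not DataTuner's actual data loader, and the marker strings and the sample record are placeholders.

import json

def linearize(task_config_path, record, markers=("<data>", "<text>")):
    # Walk the data_shape and build the flat string fed to the LM.
    shape = json.load(open(task_config_path))["data_shape"]
    marker_iter = iter(markers)
    parts, target_field = [], None
    for seg in shape:
        if seg["type"] == "special":
            # the real special tokens are blank in this dump, so substitute placeholders
            parts.append(seg["id"] or next(marker_iter))
        else:  # type "text": pull the named field from the record
            parts.append(str(record[seg["id"]]))
            if seg.get("learn"):
                target_field = seg["id"]  # the segment the model learns to produce
    return " ".join(parts), target_field

# Example with the field names declared in webnlg.json (sample values invented):
record = {"modifiedtripleset": "Alan_Bean | occupation | Test_pilot",
          "text": "Alan Bean served as a test pilot."}
# linearize("./task_configs/webnlg.json", record)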
/paper/task_configs/webnlg_cg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webnlg_cg", 3 | "data_shape": [ 4 | { 5 | "id": "", 6 | "type": "special", 7 | "learn": false 8 | }, 9 | { 10 | "id": "modifiedtripleset", 11 | "type": "text", 12 | "learn": false 13 | }, 14 | { 15 | "id": "", 16 | "type": "special", 17 | "learn": false 18 | }, 19 | { 20 | "id": "text", 21 | "type": "text", 22 | "learn": true, 23 | "metrics": [ 24 | "match", 25 | "bleu" 26 | ] 27 | } 28 | ], 29 | "extra_fields": [ 30 | "category_type", 31 | "category", 32 | "num_triples" 33 | ], 34 | "metrics_fields": [ 35 | "category_type", 36 | "category", 37 | "num_triples" 38 | ], 39 | "token_typing": "coarse_grained" 40 | } 41 | 42 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/numeric_slot.py: -------------------------------------------------------------------------------- 1 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list 2 | 3 | 4 | def align_numeric_slot_with_unit(text, text_tok, slot, value): 5 | value_number = value.split(' ')[0] 6 | try: 7 | float(value_number) 8 | except ValueError: 9 | return -1 10 | 11 | _, pos = find_first_in_list(value_number, text_tok) 12 | 13 | return pos 14 | 15 | 16 | def align_year_slot(text, text_tok, slot, value): 17 | try: 18 | int(value) 19 | except ValueError: 20 | return -1 21 | 22 | year_alternatives = [value] 23 | if len(value) == 4: 24 | year_alternatives.append('\'' + value[-2:]) 25 | year_alternatives.append(value[-2:]) 26 | 27 | for val in year_alternatives: 28 | if len(val) > 2: 29 | pos = text.find(val) 30 | else: 31 | _, pos = find_first_in_list(val, text_tok) 32 | 33 | if pos >= 0: 34 | return pos 35 | 36 | return -1 37 | -------------------------------------------------------------------------------- /paper/config.sh: -------------------------------------------------------------------------------- 1 | echo "reading configuration variables" 2 | 3 | # Folders of data that cannot be automatically downloaded 4 | # Change the two directories below to the correct ones in your case 5 | # Download it from https://catalog.ldc.upenn.edu/LDC2017T10" 6 | LDC2017_DATA_LOCATION=~/Downloads/abstract_meaning_representation_amr_2.0 7 | # Download it from https://nlds.soe.ucsc.edu/viggo" 8 | VIGGO_DATA_LOCATION=~/Downloads/viggo-v1/ 9 | 10 | python=~/miniconda3/envs/finetune/bin/python 11 | 12 | LM_MODELS_DIR=~/trained_lms 13 | CLASSIFIER_MODELS_DIR=~/trained_classifiers 14 | REPO_FOLDER=datatuner 15 | TMP_DATA_FOLDER=./tmp 16 | DATA_FOLDER=../data 17 | 18 | PAPER_FOLDER_PATTERN=$REPO_FOLDER/paper 19 | 20 | # Check if you're running in the correct folder 21 | assert_run_dir() { 22 | # params: current_dir 23 | if [[ "$PWD" != *$1 ]]; then 24 | echo "You should run this script from the folder '$1'. 
Exiting" 25 | exit 26 | fi 27 | } 28 | 29 | newline() { 30 | printf "\n" 31 | } 32 | 33 | assert_run_dir $PAPER_FOLDER_PATTERN 34 | 35 | -------------------------------------------------------------------------------- /src/external/ukplab_emnlp2019_dualgraph/split_amr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | 5 | INPUT = sys.argv[1] 6 | OUT_SURF = sys.argv[2] 7 | OUT_GRAPH = sys.argv[3] 8 | 9 | with open(INPUT) as f: 10 | lines = f.readlines() 11 | 12 | with open(OUT_SURF, "w") as surf, open(OUT_GRAPH, "w") as graph: 13 | amr_mode = False 14 | amr_tokens = [] 15 | for line in lines: 16 | if line.startswith("#"): 17 | if amr_mode: 18 | amr_mode = False 19 | amr = " ".join(amr_tokens) 20 | graph.write(amr + "\n") 21 | amr_tokens = [] 22 | tokens = line.split() 23 | if tokens[1] == "::snt": 24 | sent = " ".join(tokens[2:]) 25 | surf.write(sent + "\n") 26 | elif line.strip() == "": 27 | continue 28 | else: 29 | amr_mode = True 30 | amr_tokens.append(line.strip()) 31 | if amr_mode: 32 | amr_mode = False 33 | amr = " ".join(amr_tokens) 34 | graph.write(amr + "\n") 35 | amr_tokens = [] 36 | -------------------------------------------------------------------------------- /paper/preprocess.sh: -------------------------------------------------------------------------------- 1 | source ./config.sh 2 | 3 | assert_run_dir $PAPER_FOLDER_PATTERN 4 | 5 | echo "Running the data formatting for the LDC dataset" 6 | echo $TMP_DATA_FOLDER 7 | python experiments/ldc/preprocess.py --in_folder $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/data/amr_ldc2017t10 --out_folder $DATA_FOLDER/ldc/ --classification_dir $DATA_FOLDER/ldc_consistency 8 | 9 | newline 10 | 11 | echo "Running the data formatting for the WebNLG dataset" 12 | python experiments/webnlg/preprocess.py --in_folder $TMP_DATA_FOLDER/webnlg/data/v1.4/en/ --out_folder $DATA_FOLDER/webnlg --classification_dir $DATA_FOLDER/webnlg_consistency 13 | 14 | newline 15 | 16 | echo "Running the data formatting for the ViGGO dataset" 17 | python experiments/viggo/preprocess.py --in_folder $VIGGO_DATA_LOCATION --out_folder $DATA_FOLDER/viggo --classification_dir $DATA_FOLDER/viggo_consistency 18 | 19 | newline 20 | 21 | echo "Running the data formatting for the Cleaned E2E dataset" 22 | python experiments/e2e/preprocess.py --in_folder $TMP_DATA_FOLDER/e2e-cleaning/cleaned-data/ --out_folder $DATA_FOLDER/e2e --classification_dir $DATA_FOLDER/e2e_consistency 23 | 24 | newline 25 | echo "Finished preprocessing the training data" -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from external.jjuraska_slug2slug import config 4 | 5 | 6 | def find_first_in_list(val, lst): 7 | idx = -1 8 | pos = -1 9 | 10 | for i, elem in enumerate(lst): 11 | if val == elem: 12 | idx = i 13 | 14 | if idx >= 0: 15 | # Calculate approximate character position of the matched value 16 | punct_cnt = lst[:idx].count('.') + lst[:idx].count(',') 17 | pos = len(' '.join(lst[:idx])) + 1 - punct_cnt 18 | 19 | return idx, pos 20 | 21 | 22 | def find_all_in_list(val, lst): 23 | indexes = [] 24 | positions = [] 25 | 26 | for i, elem in enumerate(lst): 27 | if val == elem: 28 | indexes.append(i) 29 | 30 | # Calculate approximate character position of the matched value 31 | punct_cnt = lst[:i].count('.') + lst[:i].count(',') 
32 | positions.append(len(' '.join(lst[:i])) + 1 - punct_cnt) 33 | 34 | return indexes, positions 35 | 36 | 37 | def get_slot_value_alternatives(slot): 38 | with open(config.SLOT_ALIGNER_ALTERNATIVES, 'r') as f_alternatives: 39 | alternatives_dict = json.load(f_alternatives) 40 | 41 | return alternatives_dict.get(slot, {}) 42 | -------------------------------------------------------------------------------- /paper/classifier_training_args/e2e/e2e_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/e2e_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | "task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 200, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": false, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 8, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 500, 25 | "logging_steps": 50, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "train_batch_size": 8 45 | } -------------------------------------------------------------------------------- /paper/classifier_training_args/ldc/ldc_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/ldc_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | "task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 500, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": false, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 3, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 500, 25 | "logging_steps": 50, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "train_batch_size": 3 45 | } -------------------------------------------------------------------------------- /paper/classifier_training_args/webnlg/webnlg_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/webnlg_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | 
"task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 300, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": true, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 6, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 500, 25 | "logging_steps": 500, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "train_batch_size": 6 45 | } -------------------------------------------------------------------------------- /paper/classifier_training_args/viggo/viggo_model_training_args.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "./data/viggo_consistency/", 3 | "model_type": "roberta", 4 | "model_name_or_path": "roberta-large", 5 | "task_name": "mnli", 6 | "config_name": "", 7 | "tokenizer_name": "", 8 | "cache_dir": "", 9 | "max_seq_length": 200, 10 | "filter_long_seq": false, 11 | "do_train": true, 12 | "do_eval": true, 13 | "evaluate_during_training": false, 14 | "do_lower_case": true, 15 | "per_gpu_train_batch_size": 8, 16 | "per_gpu_eval_batch_size": 1, 17 | "gradient_accumulation_steps": 1, 18 | "learning_rate": 5e-05, 19 | "weight_decay": 0.0, 20 | "adam_epsilon": 1e-08, 21 | "max_grad_norm": 1.0, 22 | "num_train_epochs": 3.0, 23 | "max_steps": -1, 24 | "warmup_steps": 200, 25 | "logging_steps": 50, 26 | "save_steps": 1000, 27 | "eval_all_checkpoints": false, 28 | "no_cuda": false, 29 | "overwrite_output_dir": true, 30 | "overwrite_cache": true, 31 | "seed": 42, 32 | "tpu": false, 33 | "tpu_ip_address": "", 34 | "tpu_name": "", 35 | "xrt_tpu_config": "", 36 | "fp16": false, 37 | "fp16_opt_level": "O1", 38 | "local_rank": 0, 39 | "server_ip": "", 40 | "server_port": "", 41 | "n_gpu": 1, 42 | "device": "cuda", 43 | "output_mode": "classification", 44 | "passed_examples": false, 45 | "train_batch_size": 8 46 | } -------------------------------------------------------------------------------- /src/datatuner/lm/process_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from fire import Fire 5 | 6 | 7 | def split_list(data, n): 8 | vals_per_item = [0 for _ in range(n)] 9 | for ix, _ in enumerate(data): 10 | vals_per_item[ix % n] += 1 11 | ix = 0 12 | new_list = [] 13 | subset = [] 14 | for _, d in enumerate(data): 15 | if len(subset) < vals_per_item[ix]: 16 | subset.append(d) 17 | if len(subset) == vals_per_item[ix]: 18 | new_list.append(subset) 19 | ix += 1 20 | subset = [] 21 | return new_list 22 | 23 | 24 | def split(filename, out_folder, splits): 25 | j = json.load(open(filename)) 26 | out_folder = Path(out_folder) 27 | out_folder.mkdir(parents=True, exist_ok=True) 28 | 29 | chunks = split_list(j, splits) 30 | for i, chunk in enumerate(chunks): 31 | json.dump(chunk, open(out_folder / f"chunk_{i}.json", "w"), 
indent=2) 32 | 33 | 34 | def combine(base_folder_name, splits): 35 | output_data = [] 36 | for i in range(splits): 37 | folder = f"{base_folder_name}/chunks/chunk_{i}" 38 | folder = Path(folder) 39 | output_data.extend(json.load(open(folder / "generated.json"))) 40 | 41 | base_folder_name = Path(base_folder_name) 42 | base_folder_name.mkdir(parents=True, exist_ok=True) 43 | json.dump(output_data, open(base_folder_name / "generated.json", "w"), indent=2) 44 | 45 | 46 | if __name__ == "__main__": 47 | Fire() 48 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | CONDA_SH_FILE=$1 2 | source $CONDA_SH_FILE 3 | 4 | 5 | # Confirm external dependencies with user 6 | EXTERNAL_DEPS_MSG="""The scripts provided herein will retrieve several third-party libraries, 7 | environments, and/or other software packages at install-time or build-time (“External Dependencies”) 8 | from third-party sources. There are terms and conditions that you need to agree to 9 | abide by if you choose to install the External Dependencies. If you do not agree 10 | with every term and condition associated with the External Dependencies, 11 | enter “QUIT” in the command line when prompted by the script.""" 12 | 13 | confirm_external_dependencies() { 14 | echo 15 | echo $EXTERNAL_DEPS_MSG 16 | while true; do 17 | read -p "Do you want to PROCEED or QUIT? " yn 18 | case $yn in 19 | PROCEED) 20 | echo "Proceeding" 21 | break 22 | ;; 23 | QUIT) 24 | echo "Quitting" 25 | exit 26 | ;; 27 | esac 28 | done 29 | 30 | } 31 | 32 | confirm_external_dependencies 33 | 34 | echo "Creating the environment" 35 | conda env create --file environment.yml 36 | conda activate finetune 37 | 38 | printf "\n" 39 | 40 | echo "Downloading the spacy dependenices" 41 | python -m spacy download en_core_web_sm 42 | 43 | echo "Downloading the NLTK dependenices" 44 | python -m nltk.downloader punkt 45 | 46 | echo "Installing the code in development mode" 47 | 48 | printf "\n" 49 | 50 | python setup.py develop 51 | 52 | printf "\n" 53 | 54 | echo "Finished setup" -------------------------------------------------------------------------------- /src/datatuner/lm/custom_gpt2.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel 2 | 3 | from datatuner.lm.cross_entropy import CrossEntropyLoss 4 | 5 | 6 | def custom_gpt2_with_smoothing(smoothing=0.0): 7 | class GPT2LMHeadModelCustom(GPT2LMHeadModel): 8 | def forward( 9 | self, 10 | input_ids, 11 | past=None, 12 | attention_mask=None, 13 | token_type_ids=None, 14 | position_ids=None, 15 | head_mask=None, 16 | labels=None, 17 | ): 18 | 19 | transformer_outputs = self.transformer( 20 | input_ids, past=past, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask 21 | ) 22 | 23 | hidden_states = transformer_outputs[0] 24 | 25 | lm_logits = self.lm_head(hidden_states) 26 | 27 | outputs = (lm_logits,) + transformer_outputs[1:] 28 | if labels is not None: 29 | # Shift so that tokens < n predict n 30 | shift_logits = lm_logits[..., :-1, :].contiguous() 31 | shift_labels = labels[..., 1:].contiguous() 32 | # Flatten the tokens 33 | loss_fct = CrossEntropyLoss(ignore_index=-1, smooth_eps=smoothing, reduction="mean") 34 | 35 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) 36 | 37 | outputs = (loss,) + outputs 38 | 39 | return outputs # (loss), lm_logits, presents, (all 
hidden_states), (attentions) 40 | 41 | return GPT2LMHeadModelCustom 42 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | # Directory paths 5 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 6 | DATA_DIR = os.path.join(ROOT_DIR, 'data') 7 | EVAL_DIR = os.path.join(ROOT_DIR, 'eval') 8 | METRICS_DIR = os.path.join(ROOT_DIR, 'metrics') 9 | MODEL_DIR = os.path.join(ROOT_DIR, 'model') 10 | PREDICTIONS_DIR = os.path.join(ROOT_DIR, 'predictions') 11 | PREDICTIONS_BATCH_DIR = os.path.join(PREDICTIONS_DIR, 'batch') 12 | PREDICTIONS_BATCH_LEX_DIR = os.path.join(PREDICTIONS_DIR, 'batch_lex') 13 | PREDICTIONS_BATCH_EVENT_DIR = os.path.join(PREDICTIONS_DIR, 'batch_event') 14 | SLOT_ALIGNER_DIR = os.path.join(ROOT_DIR, 'slot_aligner') 15 | SLOT_ALIGNER_ALTERNATIVES = os.path.join(SLOT_ALIGNER_DIR, 'alignment', 'alternatives.json') 16 | T2T_DIR = os.path.join(ROOT_DIR, 't2t') 17 | TOOLS_DIR = os.path.join(ROOT_DIR, 'tools') 18 | TTEST_DIR = os.path.join(ROOT_DIR, 'ttest') 19 | TTEST_DATA_DIR = os.path.join(ROOT_DIR, 'ttest', 'data') 20 | TTEST_SCORES_DIR = os.path.join(ROOT_DIR, 'ttest', 'scores') 21 | 22 | # Dataset paths 23 | E2E_DATA_DIR = os.path.join(DATA_DIR, 'rest_e2e') 24 | TV_DATA_DIR = os.path.join(DATA_DIR, 'tv') 25 | LAPTOP_DATA_DIR = os.path.join(DATA_DIR, 'laptop') 26 | HOTEL_DATA_DIR = os.path.join(DATA_DIR, 'hotel') 27 | VIDEO_GAME_DATA_DIR = os.path.join(DATA_DIR, 'video_game') 28 | 29 | # Script paths 30 | METRICS_SCRIPT_PATH = os.path.join(METRICS_DIR, 'measure_scores.py') 31 | 32 | # Constants 33 | COMMA_PLACEHOLDER = ' __comma__' 34 | DELEX_PREFIX = '__slot_' # Important to use special symbols that do not get tokenized (such as '_') 35 | DELEX_SUFFIX = '__' 36 | EMPH_TOKEN = '__emph__' 37 | CONTRAST_TOKEN = '__contrast__' 38 | CONCESSION_TOKEN = '__concession__' 39 | -------------------------------------------------------------------------------- /src/datatuner/classification/consistency_classifier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from datatuner.classification.run_classifier import evaluate, main 5 | from transformers.data.processors.utils import InputExample 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | dataset_fields = { 10 | "webnlg": {"text": "text", "data": "modifiedtripleset", "original_data": "raw_modifiedtripleset"}, 11 | "ldc": {"text": "answer_text", "data": "linearized_amr", "original_data": "raw_amr"}, 12 | "viggo": {"text": "ref", "data": "new_mr", "original_data": "mr"}, 13 | "e2e": {"text": "ref", "data": "new_mr", "original_data": "mr"}, 14 | } 15 | 16 | 17 | def get_data_fields(): 18 | out = [] 19 | for x in dataset_fields: 20 | out.append(dataset_fields[x]["data"]) 21 | return out 22 | 23 | 24 | class ConsistencyClassifier: 25 | def __init__(self, args_dict): 26 | self.args_dict = args_dict 27 | sys.argv = [sys.argv[0]] 28 | _, self.model, self.tokenizer, self.args = main(args_dict) 29 | self.cache = {} 30 | 31 | def evaluate(self, items, set_type="test"): 32 | examples = [] 33 | for (i, item) in enumerate(items): 34 | guid = "%s-%s" % (set_type, str(i)) 35 | text_a = item["data"] 36 | text_b = item["text"] 37 | if self.args_dict["do_lower_case"]: 38 | text_a = text_a.lower() 39 | text_b = text_b.lower() 40 | label = "accurate" 41 | examples.append(InputExample(guid=guid, 
text_a=text_a, text_b=text_b, label=label)) 42 | 43 | self.args.examples = examples 44 | 45 | results = evaluate(self.args, self.model, self.tokenizer, prefix="") 46 | return results 47 | -------------------------------------------------------------------------------- /paper/evaluate_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./config.sh 3 | 4 | TEST_FILE=$1 5 | MODEL=$2 6 | NUM_GPUS=$3 7 | PER_GPU=$4 8 | MAX_DATA=$5 9 | SPLITS=$((NUM_GPUS * PER_GPU)) 10 | echo "SPLITS: " $SPLITS 11 | 12 | if [ -z "$MAX_DATA" ]; then 13 | MAX_DATA=0 14 | fi 15 | 16 | echo "MAX_DATA": $MAX_DATA 17 | 18 | CHUNKED_DATA_FOLDER=$(mktemp -d) 19 | echo "Chunked data outputted to the folder $CHUNKED_DATA_FOLDER" 20 | 21 | CODE_DIR=../src/datatuner/lm/ 22 | # Split data into chunks 23 | $python $CODE_DIR/process_json.py split $TEST_FILE $CHUNKED_DATA_FOLDER $SPLITS 24 | 25 | COMMON_ARGUMENTS="--model_checkpoint $MODEL \ 26 | --no_sample \ 27 | --beam_width 5 \ 28 | --nbest 5 \ 29 | --per_step_predictions 5 \ 30 | --model_type gpt2" 31 | 32 | pids= 33 | MAX_SPLITS=$((SPLITS - 1)) 34 | RESULTS_FOLDER=$MODEL/$(date +'%Y-%m-%d_%H-%M-%S') 35 | mkdir -p $RESULTS_FOLDER 36 | 37 | # Evaluate each chunk 38 | for ((i=0; i<=MAX_SPLITS; i++)); do 39 | echo "Chunk $i" 40 | CUDA_VISIBLE_DEVICES=$(($i % $NUM_GPUS)) $python $CODE_DIR/evaluate.py \ 41 | --filename $CHUNKED_DATA_FOLDER/chunk_$i.json \ 42 | --out_folder ${RESULTS_FOLDER}/chunks/chunk_$i \ 43 | --max_data $MAX_DATA \ 44 | ""$COMMON_ARGUMENTS"" & 45 | pids+=" $!" 46 | done 47 | wait $pids || { echo "there were errors" >&2; rm -rf ${RESULTS_FOLDER}; exit 1; } 48 | 49 | # Combine results from all chunks 50 | $python $CODE_DIR/process_json.py combine $RESULTS_FOLDER $SPLITS 51 | GLOBAL_MAX_DATA=$((SPLITS * MAX_DATA)) 52 | echo "GLOBAL_MAX_DATA": $GLOBAL_MAX_DATA 53 | CUDA_VISIBLE_DEVICES=0 54 | $python $CODE_DIR/evaluate.py \ 55 | --filename $TEST_FILE \ 56 | --out_folder ${RESULTS_FOLDER} \ 57 | --max_data $GLOBAL_MAX_DATA \ 58 | ""$COMMON_ARGUMENTS"" 59 | 60 | echo "removing intermediary results from ${RESULTS_FOLDER}/chunks" 61 | rm -rf ${RESULTS_FOLDER}/"chunks" 62 | 63 | echo "Final results available in" $RESULTS_FOLDER -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/list_slot.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | 3 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import get_slot_value_alternatives 4 | from external.jjuraska_slug2slug.slot_aligner.alignment.categorical_slots import find_value_alternative 5 | 6 | 7 | def align_list_slot(text, text_tok, slot, value, match_all=True, mode='exact_match', item_sep=', '): 8 | """ 9 | MR := slot[value] 10 | value := item || item; item;... 11 | item := tok || tok tok... 
12 | """ 13 | leftmost_pos = -1 14 | 15 | # TODO: load alternatives only once 16 | alternatives = get_slot_value_alternatives(slot) 17 | 18 | # Split the slot value into individual items 19 | items = [item.strip() for item in value.split(item_sep)] 20 | 21 | # Search for all individual items exhaustively 22 | for item in items: 23 | pos = find_value_alternative(text, text_tok, item, alternatives, mode=mode) 24 | 25 | if match_all and pos < 0: 26 | return -1 27 | 28 | if leftmost_pos < 0 or 0 <= pos < leftmost_pos: 29 | leftmost_pos = pos 30 | 31 | return leftmost_pos 32 | 33 | 34 | def align_list_with_conjunctions_slot(text, text_tok, slot, value, match_all=True): 35 | separators = [',', 'and', 'with'] 36 | 37 | value_tok = word_tokenize(value) 38 | value_items = [] 39 | end_of_prev_item = -1 40 | leftmost_pos = -1 41 | 42 | # Split the value into items 43 | for i, tok in enumerate(value_tok): 44 | if tok in separators and i > end_of_prev_item + 1: 45 | item = ' '.join(value_tok[end_of_prev_item + 1:i]) 46 | value_items.append(item) 47 | end_of_prev_item = i 48 | 49 | if end_of_prev_item < len(value_tok) - 1: 50 | item = ' '.join(value_tok[end_of_prev_item + 1:]) 51 | value_items.append(item) 52 | 53 | for item in value_items: 54 | pos = text.find(item) 55 | if 0 <= pos < leftmost_pos or leftmost_pos == -1: 56 | leftmost_pos = pos 57 | if match_all and pos < 0: 58 | return -1 59 | 60 | if leftmost_pos < 0: 61 | return -1 62 | 63 | return leftmost_pos 64 | -------------------------------------------------------------------------------- /src/datatuner/lm/special_token_generator.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | from pathlib import Path 4 | 5 | from datatuner.utils import bracket_contents 6 | from fire import Fire 7 | from tqdm import tqdm 8 | 9 | 10 | def get_custom_tags(s): 11 | """Get tags starting with a token and ending with another in the string""" 12 | return bracket_contents(s, opening="<", ending=">") 13 | 14 | 15 | fn_map = { 16 | "question_sig": [get_custom_tags], 17 | "amr": [get_custom_tags], 18 | } 19 | 20 | 21 | def generate_from_item(item, fields, all_tokens): 22 | for field_name in fields: 23 | if field_name in item: 24 | tokens = list(itertools.chain(*[fn(item[field_name]) for fn in fn_map[fields[field_name]]])) 25 | all_tokens.update(tokens) 26 | 27 | 28 | def generate_from_json(data_folder, outfile, fields={"mrl": "mrl"}): 29 | """Generate the special tokens from the given folder with files train.json, validation.json, and test.json 30 | The used field is defined by the key in the `fields` dictionary and the method used is defined based 31 | on that field. 32 | """ 33 | 34 | data_folder = Path(data_folder) 35 | all_tokens = set() 36 | 37 | for split in ["test", "train", "validation"]: 38 | try: 39 | data = json.load(open(data_folder / (split + ".json"), "r")) 40 | for item in data: 41 | generate_from_item(item, fields, all_tokens) 42 | except: 43 | print(f"file absent: {split}") 44 | 45 | Path(outfile).write_text("\n".join(all_tokens)) 46 | 47 | 48 | def generate_from_jsonl(data_file, outfile, fields={"mrl": "mrl"}, max_items=0): 49 | """Generate the special tokens from the given jsonl file. 50 | The used field is defined by the key in the `fields` dictionary and the method used is defined based 51 | on that field. 
52 | """ 53 | 54 | all_tokens = set() 55 | i = 0 56 | with open(data_file, "r") as f: 57 | for line in tqdm(f): 58 | item = json.loads(line.rstrip()) 59 | generate_from_item(item, fields, all_tokens) 60 | i += 1 61 | if max_items > 0 and i >= max_items: 62 | break 63 | Path(outfile).write_text("\n".join(all_tokens)) 64 | 65 | 66 | if __name__ == "__main__": 67 | Fire() 68 | -------------------------------------------------------------------------------- /paper/experiments/mturk/text_stats.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import Counter 3 | from pathlib import Path 4 | 5 | import textstat 6 | from fire import Fire 7 | 8 | from datatuner.classification.consistency_classifier import dataset_fields 9 | 10 | 11 | def count_words(text, casing="any"): 12 | """Counts word frequency using Counter from collections""" 13 | if casing == "any": 14 | text = text.lower() 15 | 16 | skips = [".", ", ", ":", ";", "'", '"'] 17 | for ch in skips: 18 | text = text.replace(ch, "") 19 | words = text.split(" ") 20 | if casing == "lower": 21 | words = [x for x in words if x and x[0].islower()] 22 | elif casing == "upper": 23 | words = [x for x in words if x and x[0].isupper()] 24 | word_counts = Counter(words) 25 | return word_counts 26 | 27 | 28 | def calculate_stats(data_folder): 29 | """Calculate stat of test.json file in a folder""" 30 | data_folder = Path(data_folder) 31 | for dataset in dataset_fields: 32 | print(f"loading {dataset}") 33 | field = dataset_fields[dataset]["text"].strip() 34 | sentences = [] 35 | for item in json.load(open(data_folder / dataset / "test.json")): 36 | sentences.append(item[field][-1] if type(item[field]) == list else item[field]) 37 | 38 | text = " ".join(sentences) 39 | lex_count = textstat.lexicon_count(text) 40 | print(lex_count) 41 | unique_words = count_words(text) 42 | print(f"all unique {len(unique_words)}") 43 | 44 | lower_unique_words = count_words(text, casing="lower") 45 | print(f"lowercase unique {len(lower_unique_words)}") 46 | 47 | upper_unique_words = count_words(text, casing="upper") 48 | print(f"uppercase unique {len(upper_unique_words)}") 49 | 50 | print(f"ratio {len(upper_unique_words) / len(unique_words)}") 51 | 52 | text_standard = textstat.text_standard(text, float_output=True) 53 | print(f"text_standard: {text_standard}") 54 | 55 | dale_chall_readability_score = textstat.dale_chall_readability_score(text) 56 | print(f"dale_chall_readability_score: {dale_chall_readability_score}") 57 | 58 | flesch_kincaid_grade = textstat.flesch_kincaid_grade(text) 59 | print(f"flesch_kincaid_grade: {flesch_kincaid_grade}") 60 | 61 | 62 | if __name__ == "__main__": 63 | Fire(calculate_stats) 64 | -------------------------------------------------------------------------------- /src/external/ukplab_emnlp2019_dualgraph/gen_LDC2017T10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | mkdir -p ${ROOT_DIR}/data 6 | REPO_DIR=${ROOT_DIR}/data/ 7 | 8 | DATA_DIR=${1} 9 | mkdir -p ${REPO_DIR}/tmp_amr 10 | PREPROC_DIR=${REPO_DIR}/tmp_amr 11 | ORIG_AMR_DIR=${DATA_DIR}/data/amrs/split 12 | mkdir -p ${REPO_DIR}/amr_ldc2017t10 13 | FINAL_AMR_DIR=${REPO_DIR}/amr_ldc2017t10 14 | 15 | 16 | mkdir -p ${PREPROC_DIR}/train 17 | mkdir -p ${PREPROC_DIR}/dev 18 | mkdir -p ${PREPROC_DIR}/test 19 | 20 | mkdir -p ${FINAL_AMR_DIR}/train 21 | mkdir -p ${FINAL_AMR_DIR}/dev 22 
| mkdir -p ${FINAL_AMR_DIR}/test 23 | 24 | 25 | cat ${ORIG_AMR_DIR}/training/amr-* > ${PREPROC_DIR}/train/raw_amrs.txt 26 | cat ${ORIG_AMR_DIR}/dev/amr-* > ${PREPROC_DIR}/dev/raw_amrs.txt 27 | # cat ${ORIG_AMR_DIR}/test/amr-* > ${PREPROC_DIR}/test/*_raw_amrs.txt 28 | # cat ${ORIG_AMR_DIR}/test/1_amr-release-2.0-alignments-test-proxy.txt ${ORIG_AMR_DIR}/test/2_amr-release-2.0-alignments-test-dfa.txt ${ORIG_AMR_DIR}/test/3_amr-release-2.0-alignments-test-bolt.txt ${ORIG_AMR_DIR}/test/4_amr-release-2.0-alignments-test-consensus.txt ${ORIG_AMR_DIR}/test/5_amr-release-2.0-alignments-test-xinhua.txt > ${PREPROC_DIR}/test/raw_amrs.txt 29 | cat ${ORIG_AMR_DIR}/test/1_amr-release-2.0-amrs-test-proxy.txt ${ORIG_AMR_DIR}/test/2_amr-release-2.0-amrs-test-dfa.txt ${ORIG_AMR_DIR}/test/3_amr-release-2.0-amrs-test-bolt.txt ${ORIG_AMR_DIR}/test/4_amr-release-2.0-amrs-test-xinhua.txt ${ORIG_AMR_DIR}/test/5_amr-release-2.0-amrs-test-consensus.txt> ${PREPROC_DIR}/test/raw_amrs.txt 30 | 31 | 32 | for SPLIT in test dev train ; do 33 | echo "processing $SPLIT..." 34 | # get the surface and the graphs separately 35 | python ${ROOT_DIR}/split_amr.py ${PREPROC_DIR}/${SPLIT}/raw_amrs.txt ${PREPROC_DIR}/${SPLIT}/surface.txt ${PREPROC_DIR}/${SPLIT}/graphs.txt 36 | 37 | python ${ROOT_DIR}/preproc_amr.py ${PREPROC_DIR}/${SPLIT}/graphs.txt ${PREPROC_DIR}/${SPLIT}/surface.txt ${FINAL_AMR_DIR}/${SPLIT}/nodes.pp.txt ${FINAL_AMR_DIR}/${SPLIT}/surface.pp.txt --mode LIN --triples-output ${FINAL_AMR_DIR}/${SPLIT}/triples.pp.txt 38 | # python ${ROOT_DIR}/preproc_amr.py ${PREPROC_DIR}/${SPLIT}/graphs.txt ${PREPROC_DIR}/${SPLIT}/surface.txt ${FINAL_AMR_DIR}/${SPLIT}/nodes.pp.txt ${FINAL_AMR_DIR}/${SPLIT}/surface.pp.txt --mode LINE_GRAPH --triples-output ${FINAL_AMR_DIR}/${SPLIT}/triples.pp.txt 39 | echo "done." 
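    # Descriptive note (added): each ${FINAL_AMR_DIR}/${SPLIT} directory now holds three aligned outputs of preproc_amr.py:
    # linearized graphs (nodes.pp.txt), surface texts (surface.pp.txt) and triples (triples.pp.txt).
    # The commented-out invocation above is the alternative LINE_GRAPH linearization; only one of the two modes should be active per run.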
40 | done -------------------------------------------------------------------------------- /src/datatuner/classification/consistency_processor.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | from transformers.data.processors.utils import DataProcessor, InputExample 7 | 8 | 9 | class ConsistencyProcessor(DataProcessor): 10 | """Processor for the Consistency Classification data set.""" 11 | 12 | def __init__(self, do_lower_case): 13 | self.do_lower_case = do_lower_case 14 | super(DataProcessor, self).__init__() 15 | 16 | def get_train_examples(self, data_dir): 17 | """See base class.""" 18 | return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 19 | 20 | @classmethod 21 | def _read_tsv(cls, input_file, quotechar=None): 22 | """Reads a tab separated value file.""" 23 | with open(input_file, "r", encoding="utf-8-sig") as f: 24 | reader = csv.reader(f, delimiter="|", quotechar=quotechar) 25 | lines = [] 26 | for line in reader: 27 | if sys.version_info[0] == 2: 28 | line = list(unicode(cell, "utf-8") for cell in line) 29 | lines.append(line) 30 | return lines 31 | 32 | def get_dev_examples(self, data_dir): 33 | """See base class.""" 34 | return self._create_examples(self._read_tsv(os.path.join(data_dir, "validation.tsv")), "dev") 35 | 36 | def get_test_examples(self, data_dir): 37 | """See base class.""" 38 | return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 39 | 40 | def get_labels(self, data_dir): 41 | """See base class.""" 42 | labels = (Path(data_dir) / "labels.txt").read_text().split("\n") 43 | labels = [x for x in labels if x] 44 | return labels 45 | 46 | def _create_examples(self, lines, set_type): 47 | """Creates examples for the training and dev sets.""" 48 | # order: ["label","data","text"] 49 | examples = [] 50 | for (i, line) in enumerate(lines): 51 | if i == 0: 52 | continue 53 | guid = "%s-%s" % (set_type, str(i)) 54 | text_a = line[1] 55 | text_b = line[2] 56 | if self.do_lower_case: 57 | text_a = text_a.lower() 58 | text_b = text_b.lower() 59 | label = line[0] 60 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 61 | return examples 62 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slug2slug_ser.py: -------------------------------------------------------------------------------- 1 | from external.jjuraska_slug2slug.slot_aligner.data_analysis import score_slot_realizations 2 | from tempfile import mkdtemp 3 | import pandas as pd 4 | from pathlib import Path 5 | from fire import Fire 6 | import json 7 | from datatuner.classification.consistency_classifier import dataset_fields 8 | import numpy as np 9 | 10 | 11 | def compute_ser(datafile, scored_file, mr_field, text_field): 12 | dataset = "viggo" if "viggo" in str(datafile) else "e2e" 13 | if mr_field is None: 14 | mr_field = dataset_fields[dataset]["original_data"] 15 | if text_field is None: 16 | text_field = dataset_fields[dataset]["text"] 17 | 18 | data = json.load(open(datafile)) 19 | if dataset == "viggo": 20 | subfolder = "video_game" 21 | elif dataset == "e2e": 22 | subfolder = "rest_e2e" 23 | tempdir = Path(mkdtemp()) / subfolder 24 | tempdir.mkdir(parents=True, exist_ok=True) 25 | new_items = [] 26 | for item in data: 27 | new_item = {} 28 | new_item[mr_field] = item[mr_field] 29 | text = item[text_field] 30 | if type(text) == 
list: 31 | text = text[-1] 32 | 33 | new_item[text_field] = text 34 | new_items.append(new_item) 35 | df = pd.DataFrame(new_items) 36 | 37 | out_file = tempdir / "test.csv" 38 | df.to_csv(out_file, index=False) 39 | 40 | score_slot_realizations(tempdir, "test.csv") 41 | err_df = pd.read_csv(tempdir / ("test [errors].csv")) 42 | 43 | assert len(err_df) == len(df) 44 | err_data = err_df.to_dict(orient="records") 45 | percent_correct_list = [] 46 | for err_item, item in zip(err_data, data): 47 | 48 | item["errors"] = err_item["errors"] 49 | if ( 50 | type(err_item["incorrect slots"]) == float 51 | and "nan" in str(err_item["incorrect slots"]).lower() 52 | ): 53 | err_item["incorrect slots"] = "?" 54 | 55 | else: 56 | item["incorrect_slots"] = ( 57 | err_item["incorrect slots"] if err_item["errors"] > 0 else "" 58 | ) 59 | 60 | item["ser_correct"] = int(item["errors"] == 0) 61 | 62 | item["ser"] = item["errors"] / err_item["mr"].count("[") 63 | 64 | 65 | percent_correct_list.append(item["ser_correct"]) 66 | 67 | datafile = Path(datafile) 68 | print(f"written to {scored_file}") 69 | json.dump(data, open(scored_file, "w"), indent=2) 70 | 71 | 72 | if __name__ == "__main__": 73 | # python ser.py data/e2e_dataset/test.json --dataset e2e 74 | Fire(compute_ser) 75 | -------------------------------------------------------------------------------- /src/datatuner/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | from pathlib import Path 4 | from time import gmtime, strftime 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def bracket_contents(string, level=None, opening="[", ending="]"): 11 | """Generate brackets' contents as strings""" 12 | stack = [] 13 | result = [] 14 | for i, c in enumerate(string): 15 | if c == opening: 16 | stack.append(i) 17 | elif c == ending and stack: 18 | start = stack.pop() 19 | result.append((len(stack), f"{opening}{string[start + 1: i]}{ending}")) 20 | 21 | if level is not None: 22 | result = [x for x in result if x[0] == level] 23 | 24 | return [x[1] for x in result] 25 | 26 | 27 | def uniquify_in_order(seq): 28 | """Get unique sequence from given sequence while preserving order""" 29 | seen = set() 30 | seen_add = seen.add 31 | return [x for x in seq if not (x in seen or seen_add(x))] 32 | 33 | 34 | def str_part_matches_array(s, arr): 35 | return any(s in x for x in arr) 36 | 37 | 38 | def str_start_matches_array(s, arr): 39 | return any(x.startswith(s) for x in arr) 40 | 41 | 42 | def arr_part_matches_string(s, arr): 43 | """True if some item in the array arr is a substring of s""" 44 | return any(x in s for x in arr) 45 | 46 | 47 | def ewm_mean(iterable, alpha=0.9): 48 | if len(iterable) > 0: 49 | df = pd.DataFrame({"B": iterable}) 50 | av = df.ewm(alpha=alpha).mean().B.iloc[-1] 51 | 52 | return av 53 | 54 | 55 | def geo_mean(iterable): 56 | a = np.array(iterable) 57 | return a.prod() ** (1.0 / len(a)) 58 | 59 | 60 | def newest_file(folder_path, pattern): 61 | folder_path = Path(folder_path) 62 | list_of_paths = folder_path.glob(pattern) 63 | latest_path = max(list_of_paths, key=lambda p: p.stat().st_ctime) 64 | return latest_path 65 | 66 | 67 | def flatten(d, parent_key="", sep="-"): 68 | items = [] 69 | for k, v in d.items(): 70 | new_key = k + sep + parent_key if parent_key else k 71 | if isinstance(v, collections.MutableMapping): 72 | items.extend(flatten(v, new_key, sep=sep).items()) 73 | else: 74 | items.append((new_key, v)) 75 | return dict(items) 76 | 77 | 78 | 
def get_curr_time(): 79 | return strftime("%Y-%m-%d_%H-%M-%S", gmtime()) 80 | 81 | 82 | def dedup_consecutive_data(our_data, key): 83 | dedup_our_data = [] 84 | cache = {} 85 | for i, item in enumerate(our_data): 86 | if item[key].replace(" ", "") in cache: 87 | continue 88 | else: 89 | dedup_our_data.append(item) 90 | cache[item[key].replace(" ", "")] = True 91 | 92 | return dedup_our_data 93 | 94 | 95 | def read_lines_from_file(file): 96 | file = Path(file) 97 | texts = file.read_text().split("\n") 98 | texts = [x for x in texts if x.strip()] 99 | return texts 100 | 101 | 102 | def is_empty_or_absent_dir(dir_name): 103 | return not os.path.exists(dir_name) or not os.listdir(dir_name) 104 | -------------------------------------------------------------------------------- /paper/experiments/ldc/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from copy import deepcopy 4 | from pathlib import Path 5 | 6 | from datatuner.classification.distractors import (get_distractors, 7 | write_classification_data) 8 | from datatuner.lm.special_token_generator import generate_from_json 9 | from datatuner.lm.utils import fix_text_in_dir 10 | from datatuner.utils import bracket_contents 11 | from fire import Fire 12 | 13 | random.seed(42) 14 | 15 | 16 | def get_entities(amr): 17 | options = bracket_contents(amr, opening="(", ending=")") 18 | options = [option.strip("() ") for option in options if option.count("(") == 1 and option.count("<") == 0] 19 | options = [option for option in options if option[0].isupper()] 20 | return options 21 | 22 | 23 | def preprocess(in_folder, out_folder, classification_dir, num_candidates=10, max_per_operation=10): 24 | """Linearize the data already processed into surface texts and AMRs into our format""" 25 | 26 | splits = {"test": "test", "dev": "validation", "train": "train"} 27 | 28 | in_folder = Path(in_folder) 29 | out_folder = Path(out_folder) 30 | out_folder.mkdir(parents=True, exist_ok=True) 31 | 32 | for split in splits: 33 | amrs = (in_folder / split / "nodes.pp.txt").read_text().split("\n") 34 | surfaces = (in_folder / split / "surface.pp.txt").read_text().split("\n") 35 | raw_amrs = (in_folder / ".." 
/ "tmp_amr" / split / "graphs.txt").read_text().split("\n") 36 | items = [ 37 | {"linearized_amr": amr, "answer_text": surface, "raw_amr": raw_amr} 38 | for amr, surface, raw_amr in zip(amrs, surfaces, raw_amrs) 39 | if amr and surface 40 | ] 41 | 42 | classification_data = [] 43 | original_items = deepcopy(items) 44 | for item in items: 45 | 46 | entities = get_entities(item["linearized_amr"]) 47 | 48 | swapping_candidates = [entities] 49 | cutting_candidates = [entities] 50 | 51 | rand_item = None 52 | while rand_item is None or rand_item == item: 53 | rand_item = random.choice(original_items) 54 | 55 | random_text = rand_item["answer_text"] 56 | 57 | distractors, classification_items = get_distractors( 58 | item["linearized_amr"], 59 | item["answer_text"], 60 | swapping_candidates, 61 | cutting_candidates, 62 | random_text, 63 | num_candidates=num_candidates, 64 | max_per_operation=max_per_operation, 65 | ) 66 | classification_data.extend(classification_items) 67 | 68 | item["answer_text"] = distractors + [item["answer_text"]] 69 | 70 | json.dump(items, open(out_folder / (splits[split] + ".json"), "w"), indent=2) 71 | write_classification_data(classification_data, classification_dir, splits[split].replace(".json", "")) 72 | 73 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"linearized_amr": "amr"}) 74 | fix_text_in_dir(out_folder) 75 | 76 | 77 | if __name__ == "__main__": 78 | Fire(preprocess) 79 | -------------------------------------------------------------------------------- /src/datatuner/lm/custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | from datatuner.lm.special_token_generator import get_custom_tags 2 | 3 | 4 | def tokenize(self, text, **kwargs): 5 | """ Converts a string in a sequence of tokens (string), using the tokenizer. 6 | Split in words for word-based vocabulary or sub-words for sub-word-based 7 | vocabularies (BPE/SentencePieces/WordPieces). 8 | 9 | Take care of added tokens. 10 | """ 11 | 12 | def split_on_token(tok, text): 13 | result = [] 14 | split_text = text.split(tok) 15 | for i, sub_text in enumerate(split_text): 16 | sub_text = sub_text.strip() 17 | if i == 0 and not sub_text: 18 | result += [tok] 19 | elif i == len(split_text) - 1: 20 | if sub_text: 21 | result += [sub_text] 22 | else: 23 | pass 24 | else: 25 | if sub_text: 26 | result += [sub_text] 27 | result += [tok] 28 | return result 29 | 30 | def split_on_tokens(tok_list, text): 31 | if not text: 32 | return [] 33 | if not tok_list: 34 | return self._tokenize(text, **kwargs) 35 | 36 | tokenized_text = [] 37 | text_list = [text] 38 | for tok in tok_list: 39 | tokenized_text = [] 40 | for sub_text in text_list: 41 | if sub_text not in self.added_tokens_encoder and sub_text not in self.all_special_tokens: 42 | tokenized_text += split_on_token(tok, sub_text) 43 | else: 44 | tokenized_text += [sub_text] 45 | text_list = tokenized_text 46 | 47 | return sum( 48 | ( 49 | self._tokenize(token, **kwargs) 50 | if token not in self.added_tokens_encoder and token not in self.all_special_tokens 51 | else [token] 52 | for token in tokenized_text 53 | ), 54 | [], 55 | ) 56 | 57 | def get_special_tokens(s): 58 | candidates = get_custom_tags(s) 59 | return [cand for cand in candidates if cand in self.added_tokens_encoder.keys()] 60 | 61 | # The below becomes very slow when we scale to thousands of special tokens (e.g. 
many node types/predicates) 62 | # self.added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens 63 | 64 | all_added = list(self.added_tokens_encoder.keys()) 65 | 66 | TOO_LARGE_NUM_TOKENS_THRESHOLD = 10 67 | # If we have a large number of special tokens, our current hack is to use task specific regexes to decide 68 | # candidates from the sentence first, and then match these candidates with the special tokens in the encoder. 69 | # That way we reduce the number of special tokens per iteration to a handful instead of thousands. 70 | if len(all_added) > TOO_LARGE_NUM_TOKENS_THRESHOLD: 71 | current_added_tokens = get_special_tokens(text) 72 | else: 73 | # Otherwise, we simply take all the added tokens, as is the original case in the library 74 | current_added_tokens = all_added 75 | 76 | added_tokens = current_added_tokens + self.all_special_tokens 77 | 78 | tokenized_text = split_on_tokens(added_tokens, text) 79 | return tokenized_text 80 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 
41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /src/datatuner/lm/reranker.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from datatuner.lm.data_loader import get_inputs 6 | from datatuner.lm.model_loader import load_pretrained 7 | from datatuner.lm.utils import custom_deep_copy, load_task_config, should_ignore_in_score 8 | from datatuner.utils import geo_mean 9 | 10 | 11 | class Reranker: 12 | def __init__(self, model_folder, device, is_local=True): 13 | with torch.no_grad(): 14 | self.model, self.tokenizer = load_pretrained(model_folder, model_type="gpt2") 15 | self.model_folder = Path(model_folder) 16 | try: 17 | self.task_config = load_task_config(model_folder / "task_config.json") 18 | except: 19 | self.task_config = None 20 | self.device = device 21 | self.is_local = is_local 22 | self.model.to(self.device) 23 | self.model.eval() 24 | 25 | self.NEWLINE = [198] 26 | self.SPACE = [220] 27 | 28 | def remove_unsupported_tokens(self, ids): 29 | new_ids = [] 30 | for j in ids: 31 | try: 32 | self.tokenizer.decode(j) 33 | new_ids.append(j) 34 | except: 35 | new_ids.append(self.SPACE[0]) 36 | return new_ids 37 | 38 | def create_input(self, input_ids, item): 39 | if not self.is_local: 40 | assert item is not None 41 | item = custom_deep_copy(item) 42 | item.update({"answer_text": input_ids}) 43 | input_ids, token_type_ids = get_inputs(item, self.device, self.tokenizer, self.task_config) 44 | context_len = len(input_ids[0]) - len(input_ids) 45 | 46 | else: 47 | if "linearized_amr" in item: 48 | context = [] 49 | context_len = 0 50 | 51 | input_ids = self.tokenizer.encode( 52 | self.tokenizer.decode(self.remove_unsupported_tokens(context + input_ids)) 53 | ) 54 | 55 | input_ids = torch.tensor(input_ids, device=self.device).unsqueeze(0) 56 | token_type_ids = None 57 | 58 | return input_ids, token_type_ids, context_len 59 | 60 | def score(self, input_ids, item): 61 | 62 | input_ids, token_type_ids, context_len = self.create_input(input_ids, item) 63 | 64 | model_outputs = self.model(input_ids, token_type_ids=token_type_ids) 65 | probs = F.softmax(model_outputs[0][0], dim=1) 66 | x = [] 67 | for i in range(context_len, len(input_ids[0])): 68 | next_token_id = input_ids[0][i].item() 69 | next_token_str = 
self.tokenizer.decode(next_token_id) 70 | prefix = input_ids[0][context_len:i] 71 | next_prob = probs[i - 1][next_token_id].item() 72 | if ( 73 | not should_ignore_in_score(prefix, self.tokenizer, next_token_str, next_token_id, next_prob) 74 | and input_ids[0][i] != self.SPACE[0] 75 | ): 76 | x.append(next_prob) 77 | score = geo_mean(x) 78 | return score 79 | 80 | def rerank(self, nbest_items, item): 81 | with torch.no_grad(): 82 | scores = [] 83 | 84 | for input_ids in nbest_items: 85 | scores.append(-self.score(input_ids, item)) 86 | 87 | nbest_items = [x for _, _, x in sorted(zip(scores, list(range(0, len(nbest_items))), nbest_items))] 88 | 89 | return nbest_items 90 | -------------------------------------------------------------------------------- /paper/experiments/viggo/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from copy import deepcopy 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | from datatuner.classification.distractors import (get_distractors, 8 | write_classification_data) 9 | from datatuner.lm.special_token_generator import generate_from_json 10 | from datatuner.lm.utils import fix_text_in_dir 11 | from fire import Fire 12 | 13 | random.seed(42) 14 | 15 | 16 | def parse_mr(mr): 17 | i = mr.index("(") 18 | intro, params_str = mr[:i], mr[i + 1: -1] 19 | j = 0 20 | scope = "key" 21 | current_key = "" 22 | current_val = "" 23 | keys = [] 24 | values = [] 25 | while j < len(params_str): 26 | next_char = params_str[j] 27 | if scope == "key": 28 | if next_char == "[": 29 | scope = "value" 30 | keys.append(current_key) 31 | current_key = "" 32 | else: 33 | current_key += next_char 34 | 35 | elif scope == "value": 36 | if next_char == "]": 37 | scope = "between" 38 | values.append(current_val) 39 | current_val = "" 40 | else: 41 | current_val += next_char 42 | 43 | elif scope == "between": 44 | if next_char == " ": 45 | scope = "key" 46 | 47 | j += 1 48 | assert len(keys) == len(values) 49 | return {"keys": keys, "values": values, "intro": intro} 50 | 51 | 52 | def preprocess(in_folder, out_folder, classification_dir): 53 | in_folder = Path(in_folder) 54 | out_folder = Path(out_folder) 55 | 56 | out_folder.mkdir(parents=True, exist_ok=True) 57 | 58 | splits = {"viggo-test.csv": "test.json", "viggo-train.csv": "train.json", "viggo-valid.csv": "validation.json"} 59 | for split in splits: 60 | df = pd.read_csv(in_folder / split) 61 | data = df.to_dict(orient="records") 62 | original_data = deepcopy(data) 63 | classification_data = [] 64 | 65 | for item in data: 66 | mr = item["mr"] 67 | parsed = parse_mr(mr) 68 | new_params = [ 69 | f"<{key}> {key.replace('_', ' ')}: [ {value} ]" for key, value in zip(parsed["keys"], parsed["values"]) 70 | ] 71 | new_mr = f"<{parsed['intro']}> {parsed['intro'].replace('_', ' ')} ( {', '.join(new_params)}> )" 72 | 73 | item["new_mr"] = new_mr 74 | 75 | valid_values = [x for x in parsed["values"] if x] 76 | swapping_candidates = [valid_values] 77 | cutting_candidates = [valid_values] 78 | 79 | rand_item = None 80 | while rand_item is None or rand_item == item: 81 | rand_item = random.choice(original_data) 82 | random_text = rand_item["ref"] 83 | 84 | distractors, classification_items = get_distractors( 85 | new_mr, 86 | item["ref"], 87 | swapping_candidates, 88 | cutting_candidates, 89 | random_text, 90 | num_candidates=10, 91 | max_per_operation=10, 92 | ) 93 | classification_data.extend(classification_items) 94 | 95 | item["ref"] = distractors + 
[item["ref"]] 96 | 97 | json.dump(data, open(out_folder / (splits[split]), "w"), indent=2) 98 | write_classification_data(classification_data, classification_dir, splits[split].replace(".json", "")) 99 | 100 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"new_mr": "amr"}) 101 | fix_text_in_dir(out_folder) 102 | 103 | 104 | if __name__ == "__main__": 105 | Fire(preprocess) 106 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/scalar_slot.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list, find_all_in_list, get_slot_value_alternatives 4 | 5 | 6 | DIST_IDX_THRESH = 10 7 | DIST_POS_THRESH = 30 8 | 9 | 10 | def align_scalar_slot(text, text_tok, slot, value, slot_mapping=None, value_mapping=None, slot_stem_only=False): 11 | slot_stem_indexes = [] 12 | slot_stem_positions = [] 13 | leftmost_pos = -1 14 | 15 | text = re.sub(r'\'', '', text) 16 | 17 | # Get the words that possibly realize the slot 18 | slot_stems = __get_scalar_slot_stems(slot) 19 | 20 | if slot_mapping is not None: 21 | slot = slot_mapping 22 | alternatives = get_slot_value_alternatives(slot) 23 | 24 | # Search for all possible slot realizations 25 | for slot_stem in slot_stems: 26 | if len(slot_stem) == 1 and not slot_stem.isalnum(): 27 | # Exception for single-letter special-character slot stems 28 | slot_stem_pos = [m.start() for m in re.finditer(slot_stem, text)] 29 | elif len(slot_stem) > 4 or ' ' in slot_stem: 30 | slot_stem_pos = [m.start() for m in re.finditer(slot_stem, text)] 31 | else: 32 | slot_stem_idx, slot_stem_pos = find_all_in_list(slot_stem, text_tok) 33 | if len(slot_stem_idx) > 0: 34 | slot_stem_indexes.extend(slot_stem_idx) 35 | 36 | if len(slot_stem_pos) > 0: 37 | slot_stem_positions.extend(slot_stem_pos) 38 | 39 | slot_stem_positions.sort() 40 | slot_stem_indexes.sort() 41 | 42 | # If it's only required that the slot stem is matched, don't search for the value 43 | if slot_stem_only and len(slot_stem_positions) > 0: 44 | return slot_stem_positions[0] 45 | 46 | # Get the value's alternative realizations 47 | value_alternatives = [value] 48 | if value_mapping is not None: 49 | value = value_mapping[value] 50 | value_alternatives.append(value) 51 | if value in alternatives: 52 | value_alternatives += alternatives[value] 53 | 54 | # Search for all possible value equivalents 55 | for val in value_alternatives: 56 | if len(val) > 4 or ' ' in val: 57 | # Search for multi-word values in the string representation 58 | val_positions = [m.start() for m in re.finditer(val, text)] 59 | for pos in val_positions: 60 | # Remember the leftmost value position as a fallback in case there is no nearby slot stem mention 61 | if pos < leftmost_pos or leftmost_pos == -1: 62 | leftmost_pos = pos 63 | 64 | # Find a slot stem mention within a certain distance from the value realization 65 | if len(slot_stem_positions) > 0: 66 | for slot_stem_pos in slot_stem_positions: 67 | if abs(pos - slot_stem_pos) < DIST_POS_THRESH: 68 | return pos 69 | else: 70 | # Search for single-word values in the tokenized representation 71 | val_indexes, val_positions = find_all_in_list(val, text_tok) 72 | for i, idx in enumerate(val_indexes): 73 | # Remember the leftmost value position as a fallback in case there is no nearby slot stem mention 74 | if val_positions[i] < leftmost_pos or leftmost_pos 
== -1: 75 | leftmost_pos = val_positions[i] 76 | 77 | # Find a slot stem mention within a certain distance from the value realization 78 | if len(slot_stem_indexes) > 0: 79 | for slot_stem_idx in slot_stem_indexes: 80 | if abs(idx - slot_stem_idx) < DIST_IDX_THRESH: 81 | return val_positions[i] 82 | 83 | return leftmost_pos 84 | 85 | 86 | def __get_scalar_slot_stems(slot): 87 | slot_stems = { 88 | 'esrb': ['esrb'], 89 | 'rating': ['rating', 'ratings', 'rated', 'rate', 'review', 'reviews'], 90 | 'customerrating': ['customer', 'rating', 'ratings', 'rated', 'rate', 'review', 'reviews', 'star', 'stars'], 91 | 'pricerange': ['price', 'pricing', 'cost', 'costs', 'dollars', 'pounds', 'euros', '\$', '£', '€'] 92 | } 93 | 94 | return slot_stems.get(slot, []) 95 | -------------------------------------------------------------------------------- /paper/experiments/e2e/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import random 4 | from collections import OrderedDict 5 | from copy import deepcopy 6 | from pathlib import Path 7 | 8 | import pandas as pd 9 | from datatuner.classification.distractors import (get_distractors, 10 | write_classification_data) 11 | from datatuner.lm.special_token_generator import generate_from_json 12 | from datatuner.lm.utils import fix_text_in_dir 13 | from fire import Fire 14 | 15 | 16 | def parse_mr(mr): 17 | params_str = mr 18 | j = 0 19 | scope = "key" 20 | current_key = "" 21 | current_val = "" 22 | keys = [] 23 | values = [] 24 | while j < len(params_str): 25 | next_char = params_str[j] 26 | if scope == "key": 27 | if next_char == "[": 28 | scope = "value" 29 | keys.append(current_key) 30 | current_key = "" 31 | else: 32 | current_key += next_char 33 | 34 | elif scope == "value": 35 | if next_char == "]": 36 | scope = "between" 37 | 38 | values.append(current_val) 39 | current_val = "" 40 | else: 41 | current_val += next_char 42 | elif scope == "between": 43 | scope = "key" 44 | j += 2 45 | continue 46 | 47 | j += 1 48 | 49 | assert len(keys) == len(values) 50 | return {"keys": keys, "values": values} 51 | 52 | 53 | def preprocess(in_folder, out_folder, classification_dir): 54 | in_folder = Path(in_folder) 55 | splits = {"train-fixed.no-ol": "train", "devel-fixed.no-ol": "validation", "test-fixed": "test"} 56 | 57 | for split in splits: 58 | classification_data = [] 59 | df = pd.read_csv(in_folder / (split + ".csv")) 60 | out_folder = Path(out_folder) 61 | out_folder.mkdir(parents=True, exist_ok=True) 62 | data = df.to_dict(orient="records") 63 | original_data = deepcopy(data) 64 | 65 | new_data = OrderedDict() 66 | print(len(data)) 67 | for item in data: 68 | key = item["mr"] 69 | if key in new_data: 70 | new_data[key].append(item) 71 | else: 72 | new_data[key] = [item] 73 | 74 | out_data = [] 75 | for mr_key in new_data: 76 | 77 | for item in new_data[mr_key]: 78 | 79 | mr = item["mr"] 80 | parsed = parse_mr(mr) 81 | new_params = [f"<{key}> {key} = [ {value} ]" for key, value in zip(parsed["keys"], parsed["values"])] 82 | new_mr = " ; ".join(new_params) 83 | item["new_mr"] = new_mr 84 | out_data.append(item) 85 | 86 | valid_values = [x for x in parsed["values"] if x] 87 | swapping_candidates = [valid_values] 88 | cutting_candidates = [valid_values] 89 | 90 | rand_item = None 91 | while rand_item is None or rand_item == item: 92 | rand_item = random.choice(original_data) 93 | random_text = rand_item["ref"] 94 | 95 | distractors, classification_items = get_distractors( 96 | new_mr, 
97 | item["ref"], 98 | swapping_candidates, 99 | cutting_candidates, 100 | random_text, 101 | num_candidates=1, 102 | max_per_operation=1, 103 | ) 104 | 105 | classification_data.extend(classification_items) 106 | 107 | print(f"written for {split}") 108 | json.dump(out_data, open(out_folder / (splits[split] + ".json"), "w"), indent=2) 109 | classification_data = random.sample(classification_data, int(math.ceil(0.7 * len(classification_data)))) 110 | write_classification_data(classification_data, classification_dir, splits[split].replace(".json", "")) 111 | 112 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"new_mr": "amr"}) 113 | fix_text_in_dir(out_folder) 114 | 115 | 116 | if __name__ == "__main__": 117 | Fire(preprocess) 118 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/categorical_slots.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.corpus import wordnet 3 | 4 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list, get_slot_value_alternatives 5 | 6 | 7 | def align_categorical_slot(text, text_tok, slot, value, mode='exact_match'): 8 | # TODO: load alternatives only once 9 | alternatives = get_slot_value_alternatives(slot) 10 | 11 | pos = find_value_alternative(text, text_tok, value, alternatives, mode=mode) 12 | 13 | return pos 14 | 15 | 16 | def find_value_alternative(text, text_tok, value, alternatives, mode): 17 | leftmost_pos = -1 18 | 19 | # Parse the item into tokens according to the selected mode 20 | if mode == 'first_word': 21 | value_alternatives = [value.split(' ')[0]] # Single-element list 22 | elif mode == 'any_word': 23 | value_alternatives = value.split(' ') # List of elements 24 | elif mode == 'all_words': 25 | value_alternatives = [value.split(' ')] # List of single-element lists 26 | else: 27 | value_alternatives = [value] # Single-element list 28 | 29 | # Merge the tokens with the item's alternatives 30 | if value in alternatives: 31 | value_alternatives += alternatives[value] 32 | 33 | # Iterate over individual tokens of the item 34 | for value_alt in value_alternatives: 35 | # If the item is composed of a single token, convert it to a single-element list 36 | if not isinstance(value_alt, list): 37 | value_alt = [value_alt] 38 | 39 | # Keep track of the positions of all the item's tokens 40 | positions = [] 41 | for tok in value_alt: 42 | if len(tok) > 4 or ' ' in tok: 43 | # Search for long and multi-word values in the string representation 44 | pos = text.find(tok) 45 | else: 46 | # Search for short single-word values in the tokenized representation 47 | _, pos = find_first_in_list(tok, text_tok) 48 | positions.append(pos) 49 | 50 | # If all tokens of one of the value's alternatives are matched, record the match and break 51 | if all([p >= 0 for p in positions]): 52 | leftmost_pos = min(positions) 53 | break 54 | 55 | return leftmost_pos 56 | 57 | 58 | # TODO @food has 24 failures which are acceptable to remove the slot 59 | def foodSlot(text, text_tok, value): 60 | value = value.lower() 61 | 62 | pos = text.find(value) 63 | if pos >= 0: 64 | return pos 65 | elif value == 'english': 66 | return text.find('british') 67 | elif value == 'fast food': 68 | return text.find('american style') 69 | else: 70 | text_tok = word_tokenize(text) 71 | for token in text_tok: 72 | # FIXME warning this will be slow on start up 73 | 
synsets = wordnet.synsets(token, pos='n') 74 | synset_ctr = 0 75 | 76 | for synset in synsets: 77 | synset_ctr += 1 78 | hypernyms = synset.hypernyms() 79 | 80 | # If none of the first 3 meanings of the word has "food" as hypernym, then we do not want to 81 | # identify the word as food-related (e.g. "center" has its 14th meaning associated with "food", 82 | # or "green" has its 7th meaning accociated with "food"). 83 | while synset_ctr <= 3 and len(hypernyms) > 0: 84 | lemmas = [l.name() for l in hypernyms[0].lemmas()] 85 | 86 | if 'food' in lemmas: 87 | # DEBUG PRINT 88 | # print(token) 89 | 90 | return text.find(token) 91 | # Skip false positives (e.g. "a" in the meaning of "vitamin A" has "food" as a hypernym, 92 | # or "coffee" in "coffee shop" has "food" as a hypernym). There are still false positives 93 | # triggered by proper nouns containing a food term, such as "Burger King" or "The Golden Curry". 94 | elif 'vitamin' in lemmas: 95 | break 96 | elif 'beverage' in lemmas: 97 | break 98 | 99 | # Follow the hypernyms recursively up to the root 100 | hypernyms = hypernyms[0].hypernyms() 101 | 102 | return pos 103 | -------------------------------------------------------------------------------- /src/datatuner/lm/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | from pathlib import Path 5 | 6 | import ftfy 7 | import torch 8 | 9 | logger = logging.getLogger(__file__) 10 | 11 | 12 | def average_distributed_scalar(scalar, args): 13 | """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ 14 | if args.local_rank == -1: 15 | return scalar 16 | scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size() 17 | torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) 18 | return scalar_t.item() 19 | 20 | 21 | def load_task_config(filename): 22 | """Load the task configuration from file""" 23 | task_config = json.load(open(filename, "r")) 24 | return task_config 25 | 26 | 27 | def is_middle_token(tokenizer, token_str, prefix): 28 | try: 29 | tokenizer_name = str(type(tokenizer)) 30 | 31 | if len(prefix) == 0: 32 | return False 33 | 34 | prev_token_str = tokenizer.decode(prefix[-1]) 35 | 36 | # If the previous token is not alphanumeric, it's not a middle token 37 | if not prev_token_str[-1].isalnum(): 38 | return False 39 | 40 | # The prev and current tokens should be of same type. 41 | if not ( 42 | (prev_token_str[-1].isalpha() and token_str[0].isalpha()) 43 | or (prev_token_str[-1].isdigit() and token_str[0].isdigit()) 44 | ): 45 | return False 46 | 47 | if "GPT2" in tokenizer_name: 48 | return not (token_str[0] in [" ", "\u0120"]) 49 | elif "OpenAIGPT" in tokenizer_name: 50 | return not prefix[-1].endswith("") 51 | else: 52 | raise Exception("non-supported tokenizer") 53 | except: 54 | return False 55 | 56 | 57 | def is_added_token(tokenizer, token_id): 58 | return token_id >= len(tokenizer.decoder) 59 | 60 | 61 | def should_stop_further_beam_exploration(prefix, tokenizer, next_token_str, next_token_id, next_prob, prob_thresh=0.9): 62 | """We stop exploring the beam further if the current string is a word continuation as we don't expect better 63 | continuations to appear. 64 | Example 1: if we get "Who is the res" and the next token is "ponsible", we stop exploring. 65 | Example 2: if we get "The airport code is 12" and the next token is "4", we stop exploring. 
66 | Example 3: if we get "The airport code is twenty" and the next token is ".", we stop exploring. 67 | Example 4: if we get "The airport code is 123" and the next token is ".", we stop exploring. 68 | """ 69 | return ( 70 | # The token is a middle token 71 | is_middle_token(tokenizer, next_token_str, prefix) 72 | # is not a special token 73 | and not is_added_token(tokenizer, next_token_id) 74 | and next_prob > prob_thresh 75 | ) 76 | 77 | 78 | def should_ignore_in_score(prefix, tokenizer, next_token_str, next_token_id, next_prob, prob_thresh=0.9): 79 | return ( 80 | # Probability is high enough 81 | # next_prob > prob_thresh 82 | # The token is a middle token 83 | is_middle_token(tokenizer, next_token_str, prefix) 84 | # is alphanumeric (avoid punctuations) 85 | and next_token_str.strip()[0].isalnum() 86 | # is not a special token 87 | and not is_added_token(tokenizer, next_token_id) 88 | and next_prob > prob_thresh 89 | ) 90 | 91 | 92 | def custom_deep_copy(d): 93 | if type(d) == dict: 94 | new_d = {} 95 | for key in d: 96 | try: 97 | new_d[key] = torch.clone(d[key]) 98 | except: 99 | new_d[key] = copy.deepcopy(d[key]) 100 | return new_d 101 | else: 102 | try: 103 | return torch.clone(d) 104 | except: 105 | return copy.deepcopy(d) 106 | 107 | 108 | def fix_text_in_dir(directory): 109 | """Fix text encoding with ftfy for the data splits withdirectory""" 110 | directory = Path(directory) 111 | for split in ["train.json", "validation.json", "test.json"]: 112 | data = json.load(open(directory / split)) 113 | for item in data: 114 | for k in item: 115 | if type(item[k]) == str: 116 | item[k] = ftfy.fix_text(item[k]) 117 | elif type(item[k]) == list: 118 | item[k] = [ftfy.fix_text(x) for x in item[k]] 119 | json.dump(data, open(directory / split, "w"), indent=2) 120 | -------------------------------------------------------------------------------- /paper/retrieve.sh: -------------------------------------------------------------------------------- 1 | source ./config.sh 2 | 3 | # Function Definitions 4 | 5 | # confirm external dependencies with user 6 | EXTERNAL_DEPS_MSG="""The scripts provided herein will retrieve several third-party libraries, 7 | environments, and/or other software packages at install-time or build-time (“External Dependencies”) 8 | from third-party sources. There are terms and conditions that you need to agree to 9 | abide by if you choose to install the External Dependencies. If you do not agree 10 | with every term and condition associated with the External Dependencies, 11 | enter “QUIT” in the command line when prompted by the script.""" 12 | 13 | confirm_external_dependencies() { 14 | echo 15 | echo $EXTERNAL_DEPS_MSG 16 | while true; do 17 | read -p "Do you want to PROCEED or QUIT? " yn 18 | case $yn in 19 | PROCEED) 20 | echo "Proceeding" 21 | break 22 | ;; 23 | QUIT) 24 | echo "Quitting" 25 | exit 26 | ;; 27 | esac 28 | done 29 | } 30 | 31 | # clone a specific repository commit to the give folder 32 | clone_repo_commit() { 33 | # params: repo_url, commit, folder 34 | git clone $1 $3 35 | cd $3 36 | git checkout $2 --quiet 37 | cd - 38 | } 39 | 40 | # check if directory exists and exit with a special message if not 41 | assert_dir_exists() { 42 | # params: directory, message on failure 43 | if [ ! -d $1 ]; then 44 | echo "Error: $1 does not exist." 
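        # Descriptive note (added): $2 is the caller-supplied hint (e.g. the dataset download URL) printed before exiting.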
45 | echo $2 46 | exit 47 | fi 48 | } 49 | 50 | 51 | ############################################################################################# 52 | 53 | confirm_external_dependencies 54 | 55 | # Check that the LDC2017T10 and ViGGO datasets have been manually downloaded and placed in the correct locations. 56 | assert_dir_exists $LDC2017_DATA_LOCATION "The folder $LDC2017_DATA_LOCATION should contain the LDC2017T10 dataset. Download it from https://catalog.ldc.upenn.edu/LDC2017T10" 57 | newline 58 | assert_dir_exists $VIGGO_DATA_LOCATION "The folder $VIGGO_DATA_LOCATION should contain the ViGGO dataset. Download it from https://nlds.soe.ucsc.edu/viggo" 59 | newline 60 | 61 | # Ask the user if the temporary folder exists so that we don't remove and retrieve the data again. 62 | if [ -d $TMP_DATA_FOLDER ]; then 63 | echo "Directory $TMP_DATA_FOLDER exists. Are you sure you want to delete the data and retrieve it again?" 64 | 65 | select yn in "Yes" "No"; do 66 | case $yn in 67 | Yes) 68 | echo "Alright. Continuing!" 69 | break 70 | ;; 71 | No) 72 | echo "Exiting" 73 | exit 74 | ;; 75 | esac 76 | done 77 | else 78 | echo "Directory $TMP_DATA_FOLDER does not exist." 79 | fi 80 | 81 | newline 82 | echo "Creating the folder $TMP_DATA_FOLDER for placing the data there." 83 | rm -rf ./tmp 84 | mkdir -p ./tmp 85 | 86 | newline 87 | 88 | MAIN_DIR=`pwd` 89 | ############################################################################################# 90 | 91 | echo "Processing LDC2017T10 dataset" 92 | 93 | echo "Getting the repository for data preprocessing" 94 | clone_repo_commit https://github.com/UKPLab/emnlp2019-dualgraph.git 0c58fb7f3ad3b9da3b92b2d2841558807fc79fd0 $TMP_DATA_FOLDER/emnlp2019-dualgraph 95 | 96 | echo "Copying the changes needed" 97 | cp ../src/external/ukplab_emnlp2019_dualgraph/split_amr.py $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/split_amr.py 98 | cp ../src/external/ukplab_emnlp2019_dualgraph/gen_LDC2017T10.sh $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/gen_LDC2017T10.sh 99 | cp ../src/external/ukplab_emnlp2019_dualgraph/preproc_amr.py $TMP_DATA_FOLDER/emnlp2019-dualgraph/process_amr/preproc_amr.py 100 | cp ../src/external/ukplab_emnlp2019_dualgraph/preprocess_LDC2017T10.sh $TMP_DATA_FOLDER/emnlp2019-dualgraph/preprocess_LDC2017T10.sh 101 | 102 | echo "Running the initial preprocessing" 103 | bash $TMP_DATA_FOLDER/emnlp2019-dualgraph/preprocess_LDC2017T10.sh $LDC2017_DATA_LOCATION ~/ 104 | 105 | ############################################################################################# 106 | 107 | newline 108 | cd $MAIN_DIR 109 | 110 | # WebNLG dataset 111 | echo "Processing WebNLG dataset" 112 | 113 | echo "Retrieving WebNLG data" 114 | clone_repo_commit https://github.com/ThiagoCF05/webnlg.git 12ca34880b225ebd1eb9db07c64e8dd76f7e5784 $TMP_DATA_FOLDER/webnlg 115 | 116 | ############################################################################################# 117 | 118 | newline 119 | cd $MAIN_DIR 120 | 121 | # Cleaned E2E 122 | echo "Processing Cleaned E2E dataset" 123 | clone_repo_commit https://github.com/tuetschek/e2e-cleaning.git c6f634ba16aec89f5ec5462e9c62fb3e8c5c5d16 $TMP_DATA_FOLDER/e2e-cleaning 124 | 125 | ############################################################################################# 126 | 127 | cd $MAIN_DIR 128 | 129 | # E2E Metrics 130 | echo "Getting E2E metrics repository" 131 | clone_repo_commit https://github.com/tuetschek/e2e-metrics.git dca5d301a97f7264b0827fb5589c0cc51008b5d7 $TMP_DATA_FOLDER/e2e-metrics 132 | 133 | 134 | 
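# Descriptive note (added): at this point $TMP_DATA_FOLDER should contain the emnlp2019-dualgraph, webnlg, e2e-cleaning and e2e-metrics checkouts retrieved above.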
newline 135 | echo "Successfully retrieved the data from their sources" 136 | -------------------------------------------------------------------------------- /src/external/shimorina_inlg_2018/webnlg_slot_error_rate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import re 4 | import sys 5 | 6 | from external.shimorina_webnlg_baseline.benchmark_reader import Benchmark 7 | from external.shimorina_webnlg_baseline.webnlg_baseline_input import select_files 8 | from nltk.tokenize import wordpunct_tokenize 9 | import json 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 12 | 13 | 14 | def substring_match(x, values): 15 | return any(x in v for v in values) 16 | 17 | 18 | verbose = False 19 | 20 | 21 | def calculate_ser(mr, pred): 22 | values = clean_mr(mr) 23 | total_n_slots = len(values) 24 | missing = 0 25 | hallucinated = 0 26 | for value in values: 27 | if value not in pred.lower(): 28 | if verbose: 29 | print("\n") 30 | print("Missing:", value) 31 | print(mr) 32 | print(value) 33 | print(pred) 34 | missing += 1 35 | # delete s and o that are present in MR 36 | # account for the case where the item is "texas" and the values have an entry: "abilene, texas". This is not hallucinated. 37 | all_subj_obj_not_pres = [ 38 | item for item in ENTITIES if not substring_match(item, values) 39 | ] 40 | # all_subj_obj_not_pres = [item for item in ENTITIES if item not in values] 41 | 42 | for entity in all_subj_obj_not_pres: 43 | if entity in pred.lower().split(): 44 | hallucinated += 1 45 | if verbose: 46 | print("\n") 47 | print("Hallucination:") 48 | print(mr) 49 | print(entity) 50 | print(pred) 51 | # print('COUNTS: Missing', missing, 'Hallucinated', hallucinated, 'Denominator', total_n_slots) 52 | ser = (missing + hallucinated) / total_n_slots 53 | return ser 54 | 55 | 56 | def clean_mr(mr): 57 | # (19255)_1994_VK8 | density | 2.0(gramPerCubicCentimetres) | | | 58 | # extract all subjects and objects and clean them 59 | subj_obj = [] 60 | triples = mr.strip().split("|||") # the last one is empty 61 | triples = [triple for triple in triples if triple] # delete empty triples 62 | for triple in triples: 63 | s, p, o = triple.split(" | ") 64 | s = s.lower().replace("_", " ") 65 | o = o.lower().replace("_", " ") 66 | # separate punct signs from text 67 | s = " ".join(re.split(r"(\W)", s)) 68 | o = " ".join(re.split(r"(\W)", o)) 69 | # Drop quotes 70 | s = s.replace('"', "") 71 | o = o.replace('"', "") 72 | # delete white spaces 73 | subj_obj.append(" ".join(s.split())) 74 | subj_obj.append(" ".join(o.split())) 75 | return subj_obj 76 | 77 | 78 | def get_all_subj_obj(): 79 | # read all the webnlg corpus 80 | # extract all subjects and objects 81 | base_path = "/paper/tmp/webnlg/data/v1.4/en/" 82 | path_train = base_path + "train" 83 | path_dev = base_path + "dev" 84 | path_test = base_path + "test" 85 | b = Benchmark() 86 | files_train = select_files(path_train) 87 | files_dev = select_files(path_dev) 88 | files_test = select_files(path_test) 89 | b.fill_benchmark(files_train + files_dev + files_test) 90 | subjs, objs = b.subjects_objects() 91 | # clean subj and obj 92 | subjs_cleaned = [] 93 | for subj in list(subjs): 94 | subjs_cleaned.append(clean(subj)) 95 | objs_cleaned = [] 96 | for obj in list(objs): 97 | objs_cleaned.append(clean(obj)) 98 | return subjs_cleaned, objs_cleaned 99 | 100 | 101 | def clean(entity): 102 | entity = entity.lower().replace("_", " ") 103 | # separate punct 
signs from text 104 | entity = " ".join(re.split(r"(\W)", entity)) 105 | entity = " ".join(entity.split()) # delete whitespaces 106 | return entity 107 | 108 | 109 | def get_all_entities_in_corpus(): 110 | # get all cleaned s and o from the whole corpus 111 | all_subj_cleaned, all_obj_cleaned = get_all_subj_obj() 112 | entities = list(set(all_subj_cleaned + all_obj_cleaned)) 113 | # delete all numbers from entities 114 | for i, entity in enumerate(entities): 115 | try: 116 | float(entity.replace(" ", "")) 117 | del entities[i] 118 | except ValueError: 119 | pass 120 | return entities 121 | 122 | 123 | ENTITIES = get_all_entities_in_corpus() 124 | 125 | 126 | def compute_ser(datafile, outfile, mr_field, text_field): 127 | df = pd.read_json(datafile, orient="records") 128 | 129 | df["ser"] = df.apply( 130 | lambda x: calculate_ser( 131 | x[mr_field], " ".join(wordpunct_tokenize(x[text_field][0])) 132 | ), 133 | axis=1, 134 | ) 135 | 136 | df["ser_correct"] = df["ser"].apply(lambda x: 0 if x > 0 else 1) 137 | 138 | results = {} 139 | results["mean_ser"] = round(df["ser"].mean(), 4) 140 | results["percent_correct_ser"] = round(len(df[df["ser"] == 0]) / len(df) * 100, 4) 141 | print(json.dumps(results, indent=2)) 142 | 143 | data_dict = df.to_dict(orient="records") 144 | json.dump(data_dict, open(outfile, "w"), indent=2) 145 | 146 | return results 147 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/alternatives.json: -------------------------------------------------------------------------------- 1 | { 2 | "genres": { 3 | "action adventure": [ 4 | [ 5 | "action", 6 | "adventur" 7 | ] 8 | ], 9 | "adventure": [ 10 | "adventur" 11 | ], 12 | "driving racing": [ 13 | "driving", 14 | "drive", 15 | "racing", 16 | "race" 17 | ], 18 | "fighting": [ 19 | "fight" 20 | ], 21 | "mmorpg": [ 22 | "massive" 23 | ], 24 | "platformer": [ 25 | "platforming" 26 | ], 27 | "real time strategy": [ 28 | "real time", 29 | "rts" 30 | ], 31 | "role playing": [ 32 | "roleplaying", 33 | "role play", 34 | "rpg", 35 | "rpgs" 36 | ], 37 | "shooter": [ 38 | "shoot", 39 | "fps" 40 | ], 41 | "simulation": [ 42 | "simulat", 43 | "sim" 44 | ], 45 | "strategy": [ 46 | "strateg" 47 | ], 48 | "tactical": [ 49 | "tactic" 50 | ], 51 | "trivia board game": [ 52 | "trivia", 53 | "board" 54 | ], 55 | "turn based strategy": [ 56 | "turn based" 57 | ], 58 | "vehicular combat": [ 59 | [ 60 | "vehic", 61 | "combat" 62 | ] 63 | ] 64 | }, 65 | "playerperspective": { 66 | "first person": [ 67 | "fps" 68 | ], 69 | "bird view": [ 70 | "top down" 71 | ] 72 | }, 73 | "rating": { 74 | "excellent": [ 75 | "5 out of", 76 | "5 star", 77 | "adore", 78 | "amazing", 79 | "attract", 80 | "awesome", 81 | "best", 82 | "fantastic", 83 | "favorite", 84 | "five", 85 | "great", 86 | "high", 87 | "highly", 88 | "love", 89 | "loved", 90 | "loving", 91 | "quality", 92 | "special", 93 | "superb", 94 | "top", 95 | "unique" 96 | ], 97 | "good": [ 98 | "acclaim", 99 | "cool", 100 | "enjoy", 101 | "fun", 102 | "like", 103 | "liked", 104 | "positive", 105 | "solid", 106 | "well" 107 | ], 108 | "average": [ 109 | "3 out of", 110 | "3 star", 111 | "all right", 112 | "alright", 113 | "decent", 114 | "kinda", 115 | "kind of", 116 | "lukewarm", 117 | "mediocre", 118 | "meh", 119 | "middle", 120 | "middling", 121 | "mixed", 122 | "moderate", 123 | "ok", 124 | "okay", 125 | "ordinary", 126 | "so so", 127 | "three", 128 | "unimpress" 129 | ], 130 | "poor": [ 131 | "1 out of", 132 | "1 
star", 133 | "avoid", 134 | "bad", 135 | "badly", 136 | "boring", 137 | "detest", 138 | "disappoint", 139 | "dislike", 140 | "dull", 141 | "hate", 142 | "hated", 143 | "hating", 144 | "lackluster", 145 | "lacking", 146 | "loathe", 147 | "low", 148 | "lowly", 149 | "negative", 150 | "one", 151 | "poorly", 152 | "underwhelm", 153 | "wrong" 154 | ] 155 | }, 156 | "esrb": { 157 | "e ( for everyone )": [ 158 | "e rated", 159 | "rated e", 160 | "e rating", 161 | "rating e", 162 | "everyone", 163 | "all" 164 | ], 165 | "e 10+ ( for everyone 10 and older )": [ 166 | "e10+", 167 | "e 10+", 168 | "e 10 plus", 169 | "everyone 10", 170 | "everyone above", 171 | "everyone over", 172 | "everyone older" 173 | ], 174 | "t ( for teen )": [ 175 | "t rated", 176 | "rated t", 177 | "t rating", 178 | "rating t", 179 | "teen", 180 | "teens", 181 | "teenagers" 182 | ], 183 | "m ( for mature )": [ 184 | "m rated", 185 | "rated m", 186 | "m rating", 187 | "rating m", 188 | "mature", 189 | "adult" 190 | ] 191 | }, 192 | "pricerange": { 193 | "cheap": [ 194 | "less than \u00a320", 195 | "inexpensive", 196 | "affordable", 197 | "low", 198 | "lower", 199 | "budget", 200 | "bargain" 201 | ], 202 | "moderate": [ 203 | "\u00a320-25", 204 | "average", 205 | "reasonable" 206 | ], 207 | "high": [ 208 | "more than \u00a330", 209 | "expensive", 210 | "costly", 211 | "pricey", 212 | "high", 213 | "higher" 214 | ], 215 | "less than \u00a320": [ 216 | "cheap", 217 | "inexpensive", 218 | "affordable", 219 | "low", 220 | "budget" 221 | ], 222 | "\u00a320-25": [ 223 | "moderate", 224 | "average", 225 | "reasonable" 226 | ], 227 | "more than \u00a330": [ 228 | "high", 229 | "expensive", 230 | "costly", 231 | "pricey", 232 | "high" 233 | ] 234 | }, 235 | "area": { 236 | "city centre": [ 237 | "center", 238 | "centre", 239 | "downtown", 240 | [ 241 | "middle", 242 | "city" 243 | ], 244 | [ 245 | "middle", 246 | "town" 247 | ] 248 | ], 249 | "riverside": [ 250 | "river" 251 | ] 252 | }, 253 | "type": { 254 | "television": [ 255 | "tv" 256 | ] 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /src/external/webnlg_webnlg_baseline/benchmark_reader.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as Et 2 | from collections import defaultdict 3 | 4 | 5 | class Triple: 6 | 7 | def __init__(self, s, p, o): 8 | self.s = s 9 | self.o = o 10 | self.p = p 11 | 12 | 13 | class Tripleset: 14 | 15 | def __init__(self): 16 | self.triples = [] 17 | 18 | def fill_tripleset(self, t): 19 | for xml_triple in t: 20 | s, p, o = xml_triple.text.split(' | ') 21 | triple = Triple(s, p, o) 22 | self.triples.append(triple) 23 | 24 | 25 | class Lexicalisation: 26 | 27 | def __init__(self, lex, comment, lid): 28 | self.lex = lex 29 | self.comment = comment 30 | self.id = lid 31 | 32 | 33 | class Entry: 34 | 35 | def __init__(self, category, size, eid): 36 | self.originaltripleset = [] 37 | self.modifiedtripleset = Tripleset() 38 | self.lexs = [] 39 | self.category = category 40 | self.size = size 41 | self.id = eid 42 | 43 | def fill_originaltriple(self, xml_t): 44 | otripleset = Tripleset() 45 | self.originaltripleset.append(otripleset) # multiple originaltriplesets for one entry 46 | otripleset.fill_tripleset(xml_t) 47 | 48 | def fill_modifiedtriple(self, xml_t): 49 | self.modifiedtripleset.fill_tripleset(xml_t) 50 | 51 | def create_lex(self, xml_lex): 52 | comment = xml_lex.attrib['comment'] 53 | lid = xml_lex.attrib['lid'] 54 | lex = 
Lexicalisation(xml_lex.text, comment, lid) 55 | self.lexs.append(lex) 56 | 57 | def count_lexs(self): 58 | return len(self.lexs) 59 | 60 | 61 | class Benchmark: 62 | 63 | def __init__(self): 64 | self.entries = [] 65 | 66 | def fill_benchmark(self, fileslist): 67 | for file in fileslist: 68 | tree = Et.parse(file[0] + '/' + file[1]) 69 | root = tree.getroot() 70 | for xml_entry in root.iter('entry'): 71 | # ignore triples with no lexicalisations 72 | lexfound = False 73 | for child in xml_entry: 74 | if child.tag == "lex": 75 | lexfound = True 76 | break 77 | if lexfound is False: 78 | continue 79 | 80 | entry_id = xml_entry.attrib['eid'] 81 | category = xml_entry.attrib['category'] 82 | size = xml_entry.attrib['size'] 83 | entry = Entry(category, size, entry_id) 84 | for child in xml_entry: 85 | if child.tag == 'originaltripleset': 86 | entry.fill_originaltriple(child) 87 | elif child.tag == 'modifiedtripleset': 88 | entry.fill_modifiedtriple(child) 89 | elif child.tag == 'lex': 90 | entry.create_lex(child) 91 | self.entries.append(entry) 92 | 93 | def total_lexcount(self): 94 | count = [entry.count_lexs() for entry in self.entries] 95 | return sum(count) 96 | 97 | def unique_p(self): 98 | properties = [triple.p for entry in self.entries for triple in entry.modifiedtripleset.triples] 99 | return len(set(properties)) 100 | 101 | def entry_count(self, size=None, cat=None): 102 | """ 103 | calculate the number of entries in benchmark 104 | :param size: size (should be string) 105 | :param cat: category 106 | :return: entry count 107 | """ 108 | if not size and cat: 109 | entries = [entry for entry in self.entries if entry.category == cat] 110 | elif not cat and size: 111 | entries = [entry for entry in self.entries if entry.size == size] 112 | elif not size and not cat: 113 | return len(self.entries) 114 | else: 115 | entries = [entry for entry in self.entries if entry.category == cat and entry.size == size] 116 | return len(entries) 117 | 118 | def lexcount_size_category(self, size='', cat=''): 119 | count = [entry.count_lexs() for entry in self.entries if entry.category == cat and entry.size == size] 120 | return len(count) 121 | 122 | def property_map(self): 123 | mprop_oprop = defaultdict(set) 124 | for entry in self.entries: 125 | for tripleset in entry.originaltripleset: 126 | for i, triple in enumerate(tripleset.triples): 127 | mprop_oprop[entry.modifiedtripleset.triples[i].p].add(triple.p) 128 | return mprop_oprop 129 | 130 | def subjects_objects(self): 131 | subjects = [] 132 | objects = [] 133 | for entry in self.entries: 134 | for tripleset in entry.originaltripleset: 135 | for triple in tripleset.triples: 136 | if triple.o not in objects: 137 | objects.append(triple.o) 138 | if triple.s not in subjects: 139 | subjects.append(triple.o) 140 | return [subjects, objects] -------------------------------------------------------------------------------- /src/datatuner/lm/model_loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | import mlflow 6 | from datatuner.lm import custom_tokenizer 7 | from datatuner.lm.custom_gpt2 import custom_gpt2_with_smoothing 8 | from datatuner.lm.data_loader import PAD_TOKEN 9 | from transformers import ( 10 | GPT2DoubleHeadsModel, 11 | GPT2LMHeadModel, 12 | GPT2Tokenizer, 13 | OpenAIGPTDoubleHeadsModel, 14 | OpenAIGPTLMHeadModel, 15 | OpenAIGPTTokenizer, 16 | GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, 17 | ) 18 | from transformers.tokenization_utils 
import PreTrainedTokenizer 19 | 20 | logger = logging.getLogger(__file__) 21 | 22 | 23 | def load_pretrained_tokenizer(model_checkpoint, model_type): 24 | """Load pretrained tokenizer""" 25 | 26 | tokenizer_class = OpenAIGPTTokenizer if "openai-gpt" in model_type else GPT2Tokenizer 27 | tokenizer = tokenizer_class.from_pretrained(model_checkpoint) 28 | PreTrainedTokenizer.tokenize = custom_tokenizer.tokenize 29 | return tokenizer 30 | 31 | 32 | def load_training_args(run_id): 33 | client = mlflow.tracking.MlflowClient() 34 | training_args_file = client.download_artifacts(run_id, "training/model_training_args.json") 35 | model_training_args = json.load(open(training_args_file)) 36 | return model_training_args 37 | 38 | 39 | def get_model_directory(model_checkpoint=None): 40 | """Get the model directory; if `model_checkpoint` is a folder, it is returned; 41 | if it is a shortcut name for a Hugging Face model, the name is returned for handling downstream; 42 | if it's a run_id, the folder is obtained from mlflow.""" 43 | is_local = True 44 | if Path(model_checkpoint).exists(): 45 | return Path(model_checkpoint), is_local 46 | elif model_checkpoint in GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys(): 47 | is_local = False 48 | return model_checkpoint, is_local 49 | else: 50 | client = mlflow.tracking.MlflowClient() 51 | run_info = client.get_run(model_checkpoint) 52 | is_local = False 53 | return Path(run_info.info.artifact_uri) / "training", is_local 54 | 55 | 56 | def read_special_tokens(task_config=None, special_tokens_file=None, dataset_path=None): 57 | """Read special tokens from file and from the task configuration""" 58 | tokens = [] 59 | # If no special tokens file is explicitly passed, we try finding a special_tokens.txt file in the model directory 60 | if special_tokens_file is None: 61 | if dataset_path is not None: 62 | special_tokens_file = Path(dataset_path) / "special_tokens.txt" 63 | 64 | # Add any special tokens indicated in the file 65 | if special_tokens_file is not None and special_tokens_file.exists(): 66 | tokens += [x for x in special_tokens_file.read_text().split("\n") if x.strip()] 67 | logger.info(f"read {len(tokens)} special tokens from {special_tokens_file}") 68 | 69 | if task_config is not None: 70 | # add any special tokens defined in the tokenization 71 | for item in task_config["data_shape"]: 72 | if item["type"] == "special": 73 | tokens += [item["id"]] 74 | 75 | if "extra_special_tokens" in task_config: 76 | tokens.extend(task_config["extra_special_tokens"]) 77 | 78 | # Add basic eos and padding tokens 79 | tokens += [PAD_TOKEN, ""] 80 | 81 | return tokens 82 | 83 | 84 | def load_pretrained( 85 | model_directory, 86 | model_type=None, 87 | smoothing=0.0, 88 | output_attentions=True, 89 | output_hidden_states=True, 90 | multitask=False, 91 | special_tokens_file=None, 92 | task_config=None, 93 | dataset_path=None, 94 | **kwargs, 95 | ): 96 | """Load pretrained model""" 97 | print("Get pretrained model and tokenizer") 98 | model_directory = str(model_directory) 99 | 100 | try: 101 | model_training_args = json.load(open(Path(model_directory) / "model_training_args.json")) 102 | if "gpt2" in model_training_args["model_directory"]: 103 | model_type = "gpt2" 104 | elif "openai-gpt" in model_training_args["model_directory"]: 105 | model_type = "openai-gpt" 106 | 107 | multitask = model_training_args["multitask"] 108 | except: 109 | pass 110 | 111 | if model_type is None: 112 | model_type = model_directory 113 | 114 | tokenizer = load_pretrained_tokenizer(model_directory, 
model_type) 115 | 116 | if smoothing > 0: 117 | model_class = custom_gpt2_with_smoothing(smoothing=smoothing) 118 | 119 | elif "gpt2" in model_type: 120 | if multitask: 121 | model_class = GPT2DoubleHeadsModel 122 | else: 123 | model_class = GPT2LMHeadModel 124 | elif "openai-gpt" in model_type: 125 | if multitask: 126 | model_class = OpenAIGPTDoubleHeadsModel 127 | else: 128 | model_class = OpenAIGPTLMHeadModel 129 | else: 130 | raise ValueError( 131 | "Invalid model type; make sure to pass the actual model_type if your checkpoint name or model name does not have the model type in them" 132 | ) 133 | 134 | model = model_class.from_pretrained( 135 | model_directory, output_attentions=output_attentions, output_hidden_states=output_hidden_states, **kwargs 136 | ) 137 | 138 | return model, tokenizer 139 | -------------------------------------------------------------------------------- /src/datatuner/lm/novograd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | import torch 17 | from torch.optim import Optimizer 18 | 19 | 20 | class Novograd(Optimizer): 21 | """ 22 | Implements Novograd algorithm. 23 | Args: 24 | params (iterable): iterable of parameters to optimize or dicts defining 25 | parameter groups 26 | lr (float, optional): learning rate (default: 1e-3) 27 | betas (Tuple[float, float], optional): coefficients used for computing 28 | running averages of gradient and its square (default: (0.95, 0)) 29 | eps (float, optional): term added to the denominator to improve 30 | numerical stability (default: 1e-8) 31 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 32 | grad_averaging: gradient averaging 33 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 34 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 35 | (default: False) 36 | """ 37 | 38 | def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, weight_decay=0, grad_averaging=False, amsgrad=False): 39 | if not 0.0 <= lr: 40 | raise ValueError("Invalid learning rate: {}".format(lr)) 41 | if not 0.0 <= eps: 42 | raise ValueError("Invalid epsilon value: {}".format(eps)) 43 | if not 0.0 <= betas[0] < 1.0: 44 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 45 | if not 0.0 <= betas[1] < 1.0: 46 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 47 | defaults = dict( 48 | lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging, amsgrad=amsgrad 49 | ) 50 | 51 | super(Novograd, self).__init__(params, defaults) 52 | 53 | def __setstate__(self, state): 54 | super(Novograd, self).__setstate__(state) 55 | for group in self.param_groups: 56 | group.setdefault("amsgrad", False) 57 | 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 
60 | Arguments: 61 | closure (callable, optional): A closure that reevaluates the model 62 | and returns the loss. 63 | """ 64 | loss = None 65 | if closure is not None: 66 | loss = closure() 67 | 68 | for group in self.param_groups: 69 | for p in group["params"]: 70 | if p.grad is None: 71 | continue 72 | grad = p.grad.data 73 | if grad.is_sparse: 74 | raise RuntimeError("Sparse gradients are not supported.") 75 | amsgrad = group["amsgrad"] 76 | 77 | state = self.state[p] 78 | 79 | # State initialization 80 | if len(state) == 0: 81 | state["step"] = 0 82 | # Exponential moving average of gradient values 83 | state["exp_avg"] = torch.zeros_like(p.data) 84 | # Exponential moving average of squared gradient values 85 | state["exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device) 86 | if amsgrad: 87 | # Maintains max of all exp. moving avg. of sq. grad. values 88 | state["max_exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device) 89 | 90 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 91 | if amsgrad: 92 | max_exp_avg_sq = state["max_exp_avg_sq"] 93 | beta1, beta2 = group["betas"] 94 | 95 | state["step"] += 1 96 | 97 | norm = torch.sum(torch.pow(grad, 2)) 98 | 99 | if exp_avg_sq == 0: 100 | exp_avg_sq.copy_(norm) 101 | else: 102 | exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) 103 | 104 | if amsgrad: 105 | # Maintains the maximum of all 2nd moment running avg. till now 106 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 107 | # Use the max. for normalizing running avg. of gradient 108 | denom = max_exp_avg_sq.sqrt().add_(group["eps"]) 109 | else: 110 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 111 | 112 | grad.div_(denom) 113 | if group["weight_decay"] != 0: 114 | grad.add_(group["weight_decay"], p.data) 115 | if group["grad_averaging"]: 116 | grad.mul_(1 - beta1) 117 | exp_avg.mul_(beta1).add_(grad) 118 | 119 | p.data.add_(-group["lr"], exp_avg) 120 | 121 | return loss 122 | -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/alignment/boolean_slot.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from external.jjuraska_slug2slug.slot_aligner.alignment.utils import find_first_in_list, find_all_in_list 4 | 5 | 6 | NEG_IDX_FALSE_PRE_THRESH = 10 7 | NEG_POS_FALSE_PRE_THRESH = 30 8 | NEG_IDX_TRUE_PRE_THRESH = 5 9 | NEG_POS_TRUE_PRE_THRESH = 15 10 | NEG_IDX_POST_THRESH = 10 11 | NEG_POS_POST_THRESH = 30 12 | 13 | negation_cues_pre = [ 14 | 'no', 'not', 'non', 'none', 'neither', 'nor', 'never', 'n\'t', 'cannot', 15 | 'excluded', 'lack', 'lacks', 'lacking', 'unavailable', 'without', 'zero', 16 | 'everything but' 17 | ] 18 | negation_cues_post = [ 19 | 'not', 'nor', 'never', 'n\'t', 'cannot', 20 | 'excluded', 'unavailable' 21 | ] 22 | contrast_cues = [ 23 | 'but', 'however', 'although', 'though', 'nevertheless' 24 | ] 25 | 26 | 27 | def align_boolean_slot(text, text_tok, slot, value, true_val='yes', false_val='no'): 28 | pos = -1 29 | text = re.sub(r'\'', '', text) 30 | 31 | # Get the words that possibly realize the slot 32 | slot_stems = __get_boolean_slot_stems(slot) 33 | 34 | # Search for all possible slot realizations 35 | for slot_stem in slot_stems: 36 | idx, pos = find_first_in_list(slot_stem, text_tok) 37 | if pos >= 0: 38 | if value == true_val: 39 | # Match an instance of the slot stem without a preceding negation 40 | if not __find_negation(text, text_tok, idx, pos, expected_true=True, after=False): 41 | return pos 42 | 
else: 43 | # Match an instance of the slot stem with a preceding or a following negation 44 | if __find_negation(text, text_tok, idx, pos, expected_true=False, after=True): 45 | return pos 46 | 47 | # If no match found and the value ~ False, search for alternative expressions of the opposite 48 | if pos < 0 and value == false_val: 49 | slot_antonyms = __get_boolean_slot_antonyms(slot) 50 | for slot_antonym in slot_antonyms: 51 | if ' ' in slot_antonym: 52 | pos = text.find(slot_antonym) 53 | else: 54 | _, pos = find_first_in_list(slot_antonym, text_tok) 55 | 56 | if pos >= 0: 57 | return pos 58 | 59 | return -1 60 | 61 | 62 | def __find_negation(text, text_tok, idx, pos, expected_true=False, after=False): 63 | # Set the thresholds depending on the expected boolean value of the slot 64 | if expected_true: 65 | idx_pre_thresh = NEG_IDX_TRUE_PRE_THRESH 66 | pos_pre_thresh = NEG_POS_TRUE_PRE_THRESH 67 | else: 68 | idx_pre_thresh = NEG_IDX_FALSE_PRE_THRESH 69 | pos_pre_thresh = NEG_POS_FALSE_PRE_THRESH 70 | 71 | for negation in negation_cues_pre: 72 | if ' ' in negation: 73 | neg_pos = text.find(negation) 74 | if neg_pos >= 0: 75 | if 0 < (pos - neg_pos - text[neg_pos:pos].count(',')) <= pos_pre_thresh: 76 | # Look for a contrast cue between the negation and the slot realization 77 | neg_text_segment = text[neg_pos + len(negation):pos] 78 | if __has_contrast_after_negation(neg_text_segment): 79 | return False 80 | else: 81 | return True 82 | else: 83 | neg_idxs, _ = find_all_in_list(negation, text_tok) 84 | for neg_idx in neg_idxs: 85 | if 0 < (idx - neg_idx - text_tok[neg_idx + 1:idx].count(',')) <= idx_pre_thresh: 86 | # Look for a contrast cue between the negation and the slot realization 87 | neg_text_segment = text_tok[neg_idx + 1:idx] 88 | if __has_contrast_after_negation_tok(neg_text_segment): 89 | return False 90 | else: 91 | return True 92 | 93 | if after: 94 | for negation in negation_cues_post: 95 | if ' ' in negation: 96 | neg_pos = text.find(negation) 97 | if neg_pos >= 0: 98 | if 0 < (neg_pos - pos) < NEG_POS_POST_THRESH: 99 | return True 100 | else: 101 | neg_idxs, _ = find_all_in_list(negation, text_tok) 102 | for neg_idx in neg_idxs: 103 | if 0 < (neg_idx - idx) < NEG_IDX_POST_THRESH: 104 | return True 105 | 106 | return False 107 | 108 | 109 | def __has_contrast_after_negation(text): 110 | for contr_tok in contrast_cues: 111 | if text.find(contr_tok) >= 0: 112 | return True 113 | 114 | return False 115 | 116 | 117 | def __has_contrast_after_negation_tok(text_tok): 118 | for contr_tok in contrast_cues: 119 | if contr_tok in text_tok: 120 | return True 121 | 122 | return False 123 | 124 | 125 | def __get_boolean_slot_stems(slot): 126 | slot_stems = { 127 | 'familyfriendly': ['family', 'families', 'kid', 'kids', 'child', 'children'], 128 | 'hasusbport': ['usb'], 129 | 'isforbusinesscomputing': ['business'], 130 | 'hasmultiplayer': ['multiplayer', 'friends', 'others'], 131 | 'availableonsteam': ['steam'], 132 | 'haslinuxrelease': ['linux'], 133 | 'hasmacrelease': ['mac'] 134 | } 135 | 136 | return slot_stems.get(slot, []) 137 | 138 | 139 | def __get_boolean_slot_antonyms(slot): 140 | slot_antonyms = { 141 | 'familyfriendly': ['adult', 'adults'], 142 | 'isforbusinesscomputing': ['personal', 'general', 'home', 'nonbusiness'], 143 | 'hasmultiplayer': ['single player'] 144 | } 145 | 146 | return slot_antonyms.get(slot, []) 147 | -------------------------------------------------------------------------------- /src/datatuner/lm/cross_entropy.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def _is_long(x): 7 | if hasattr(x, "data"): 8 | x = x.data 9 | return isinstance(x, torch.LongTensor) or isinstance(x, torch.cuda.LongTensor) 10 | 11 | 12 | def onehot(indexes, N=None, ignore_index=None): 13 | """ 14 | Creates a one-representation of indexes with N possible entries 15 | if N is not specified, it will suit the maximum index appearing. 16 | indexes is a long-tensor of indexes 17 | ignore_index will be zero in onehot representation 18 | """ 19 | if N is None: 20 | N = indexes.max() + 1 21 | sz = list(indexes.size()) 22 | output = indexes.new().byte().resize_(*sz, N).zero_() 23 | output.scatter_(-1, indexes.unsqueeze(-1), 1) 24 | if ignore_index is not None: 25 | output.masked_fill_(indexes.eq(ignore_index).unsqueeze(-1), 0) 26 | return output 27 | 28 | 29 | def cross_entropy( 30 | inputs, 31 | target, 32 | weight=None, 33 | ignore_index=-100, 34 | reduction="mean", 35 | smooth_eps=None, 36 | smooth_dist=None, 37 | from_logits=True, 38 | ): 39 | """cross entropy loss, with support for target distributions and label smoothing https://arxiv.org/abs/1512.00567""" 40 | smooth_eps = smooth_eps or 0 41 | 42 | # ordinary log-liklihood - use cross_entropy from nn 43 | if _is_long(target) and smooth_eps == 0: 44 | if from_logits: 45 | return F.cross_entropy(inputs, target, weight, ignore_index=ignore_index, reduction=reduction) 46 | else: 47 | return F.nll_loss(inputs, target, weight, ignore_index=ignore_index, reduction=reduction) 48 | 49 | if from_logits: 50 | # log-softmax of inputs 51 | lsm = F.log_softmax(inputs, dim=-1) 52 | else: 53 | lsm = inputs 54 | 55 | masked_indices = None 56 | num_classes = inputs.size(-1) 57 | 58 | if _is_long(target): 59 | masked_indices = target.eq(ignore_index) 60 | target[masked_indices] = 0 61 | 62 | if smooth_eps > 0 and smooth_dist is not None: 63 | if _is_long(target): 64 | target = onehot(target, num_classes).type_as(inputs) 65 | if smooth_dist.dim() < target.dim(): 66 | smooth_dist = smooth_dist.unsqueeze(0) 67 | target.lerp_(smooth_dist, smooth_eps) 68 | 69 | if weight is not None: 70 | lsm = lsm * weight.unsqueeze(0) 71 | 72 | if _is_long(target): 73 | eps_sum = smooth_eps / num_classes 74 | eps_nll = 1.0 - eps_sum - smooth_eps 75 | likelihood = lsm.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1) 76 | loss = -(eps_nll * likelihood + eps_sum * lsm.sum(-1)) 77 | else: 78 | loss = -(target * lsm).sum(-1) 79 | 80 | if masked_indices is not None: 81 | loss.masked_fill_(masked_indices, 0) 82 | 83 | if reduction == "sum": 84 | loss = loss.sum() 85 | elif reduction == "mean": 86 | if masked_indices is None: 87 | loss = loss.mean() 88 | else: 89 | loss = loss.sum() / float(loss.size(0) - masked_indices.sum()) 90 | 91 | return loss 92 | 93 | 94 | class CrossEntropyLoss(nn.CrossEntropyLoss): 95 | """CrossEntropyLoss - with ability to recieve distrbution as targets, and optional label smoothing""" 96 | 97 | def __init__( 98 | self, weight=None, ignore_index=-100, reduction="mean", smooth_eps=None, smooth_dist=None, from_logits=True 99 | ): 100 | super(CrossEntropyLoss, self).__init__(weight=weight, ignore_index=ignore_index, reduction=reduction) 101 | self.smooth_eps = smooth_eps 102 | self.smooth_dist = smooth_dist 103 | self.from_logits = from_logits 104 | 105 | def forward(self, input, target, smooth_dist=None): 106 | if smooth_dist is None: 107 | smooth_dist = 
self.smooth_dist 108 | return cross_entropy( 109 | input, 110 | target, 111 | weight=self.weight, 112 | ignore_index=self.ignore_index, 113 | reduction=self.reduction, 114 | smooth_eps=self.smooth_eps, 115 | smooth_dist=smooth_dist, 116 | from_logits=self.from_logits, 117 | ) 118 | 119 | 120 | def binary_cross_entropy(inputs, target, weight=None, reduction="mean", smooth_eps=None, from_logits=False): 121 | """cross entropy loss, with support for label smoothing https://arxiv.org/abs/1512.00567""" 122 | smooth_eps = smooth_eps or 0 123 | if smooth_eps > 0: 124 | target = target.float() 125 | target.add_(smooth_eps).div_(2.0) 126 | if from_logits: 127 | return F.binary_cross_entropy_with_logits(inputs, target, weight=weight, reduction=reduction) 128 | else: 129 | return F.binary_cross_entropy(inputs, target, weight=weight, reduction=reduction) 130 | 131 | 132 | def binary_cross_entropy_with_logits(inputs, target, weight=None, reduction="mean", smooth_eps=None, from_logits=True): 133 | return binary_cross_entropy(inputs, target, weight, reduction, smooth_eps, from_logits) 134 | 135 | 136 | class BCELoss(nn.BCELoss): 137 | def __init__( 138 | self, weight=None, size_average=None, reduce=None, reduction="mean", smooth_eps=None, from_logits=False 139 | ): 140 | super(BCELoss, self).__init__(weight, size_average, reduce, reduction) 141 | self.smooth_eps = smooth_eps 142 | self.from_logits = from_logits 143 | 144 | def forward(self, input, target): 145 | return binary_cross_entropy( 146 | input, 147 | target, 148 | weight=self.weight, 149 | reduction=self.reduction, 150 | smooth_eps=self.smooth_eps, 151 | from_logits=self.from_logits, 152 | ) 153 | 154 | 155 | class BCEWithLogitsLoss(BCELoss): 156 | def __init__( 157 | self, weight=None, size_average=None, reduce=None, reduction="mean", smooth_eps=None, from_logits=True 158 | ): 159 | super(BCEWithLogitsLoss, self).__init__( 160 | weight, size_average, reduce, reduction, smooth_eps=smooth_eps, from_logits=from_logits 161 | ) 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataTuner 2 | 3 | You have just found the DataTuner. 4 | This repository provides tools for fine-tuning language models for a task. 5 | 6 | * See [LICENSE.txt](LICENSE.txt) for license details. 7 | 8 | * See [NOTICE.txt](NOTICE.txt) for details of third party code included in or downloaded by this code. 9 | 10 | * See [/paper/README.md](paper/README.md) for details about reproducing the results reported in the paper 11 | ["Have Your Text and Use It Too! End-to-End Neural Data-to-Text Generation with Semantic Fidelity" by Hamza Harkous, Isabel Groves and Amir Saffari.](https://www.aclweb.org/anthology/2020.coling-main.218/) 12 | 13 | 14 | 15 | ## Installation 16 | 17 | ### Environment Creation 18 | 19 | Assuming you have an existing `conda` setup, you can setup the environment with the following script. 
In order to activate the conda environment within the bash script, you need the location of the `conda.sh` file: 20 | 21 | ```bash 22 | bash setup.sh ~/miniconda3/etc/profile.d/conda.sh 23 | ``` 24 | 25 | You can update your existing environment: 26 | 27 | ```bash 28 | conda env update -f=environment.yml 29 | ``` 30 | 31 | To start development, activate your environment: 32 | 33 | ```bash 34 | conda activate finetune 35 | ``` 36 | 37 | Alternatively, you can always use the python binary with the absolute path, e.g.: `~/miniconda3/envs/finetune/bin/python`. 38 | 39 | ## Data 40 | 41 | For any task you want to fine-tune on, you need the data to be a json file containing a list of json objects, one per data point. For example: 42 | 43 | ```json 44 | [ 45 | { 46 | "question": "question text 1", 47 | "query": "query 1" 48 | }, 49 | { 50 | "question": "question text 2", 51 | "query": "query 2 with [SpecialToken example]" 52 | } 53 | ] 54 | ``` 55 | 56 | The library assumes that you have placed your data in a single directory with three files: ``train.json``, ``validation.json``, and ``test.json``. 57 | 58 | ## Configuration 59 | 60 | Now that we have the data in shape, we need to create a new task configuration file that specifies how we want the data to be formatted and what fields should be considered. You can create new config files in the folder ``src/datatuner/lm/task_configs``. 61 | 62 | A typical config file would look as follows: 63 | 64 | 65 | ```json 66 | { 67 | "name": "dataset_name", 68 | "data_shape": [ 69 | { 70 | "id": "", 71 | "type": "special", 72 | "learn": false 73 | }, 74 | { 75 | "id": "question", 76 | "type": "text", 77 | "learn": false 78 | }, 79 | { 80 | "id": "", 81 | "type": "special", 82 | "learn": false 83 | }, 84 | { 85 | "id": "query", 86 | "type": "text", 87 | "learn": true, 88 | "metrics": [ 89 | "match" 90 | ] 91 | } 92 | ], 93 | "extra_special_tokens": ["[SpecialToken"], 94 | "extra_fields": [] 95 | } 96 | ``` 97 | 98 | For each item in the data shape: 99 | 100 | - ``type`` (required): ``special`` if special token, ``text`` if normal text. 101 | - ``id`` (required): the special token ID if type is ``special``; the key for the text in the json data if type is ``text`` 102 | - ``learn`` (required): whether to allow the model to learn this part of the text. If false, the model masks that part during fine-tuning. 103 | - ``metrics`` (optional): the list of metrics that the model should compute upon evaluation. Each metric should have a corresponding function with the same name in ``metrics.py``. 104 | - ``converter`` (optional): the name of the converter function in ``converters.py`` to apply on that text field after reading the text from the file. 105 | 106 | The value of `extra_special_tokens` is a list of special tokens to be added to the vocabulary. 107 | Alternatively (especially if the list is too long or is generated automatically), you can create a text file with one special token per line and pass that as an argument during training via the `--special_tokens_file` argument. 108 | 109 | 110 | The value of `extra_fields` is a list of additional fields to include from the input `json` files to output during evaluation, aside from the main fields used as inputs/outputs. 111 | 112 | ## Training 113 | 114 | The training script `train.py` can be used in single GPU or multi GPU settings. 
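
Before launching it, it can be worth sanity-checking that the dataset files line up with the task configuration, i.e. that every `text` field listed in `data_shape` exists in each record of `train.json`, `validation.json`, and `test.json`. The snippet below is a minimal, illustrative check and is not part of the library; the `check_dataset` helper and the paths are placeholders to adapt to your own dataset and config:

```python
import json
from pathlib import Path


def check_dataset(dataset_dir, task_config_path):
    """Illustrative helper (not part of DataTuner): verify that every text field
    referenced in the task config exists in each record of every split."""
    config = json.loads(Path(task_config_path).read_text())
    text_fields = [item["id"] for item in config["data_shape"] if item["type"] == "text"]
    for split in ("train", "validation", "test"):
        records = json.loads((Path(dataset_dir) / f"{split}.json").read_text())
        for i, record in enumerate(records):
            missing = [field for field in text_fields if field not in record]
            if missing:
                raise ValueError(f"{split}.json, record {i}: missing fields {missing}")
        print(f"{split}.json: {len(records)} records OK")


check_dataset("../../../data/my_dataset", "./task_configs/my_task_config.json")
```

The training commands below can then be run as usual.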
115 | 116 | ```bash 117 | cd src/datatuner/lm 118 | 119 | # single gpu 120 | python train.py --model_checkpoint ~/data/openai-gpt/ --dataset_path ../../../data/my_dataset/ --task_config ./task_configs/my_task_config.json --n_epoch 3 --lr 1e-5 121 | 122 | # multi gpu 123 | python -m torch.distributed.launch --nproc_per_node=4 train.py --model_checkpoint ~/data/openai-gpt/ --dataset_path ../../../data/my_dataset/ --task_config ./task_configs/my_task_config.json --n_epoch 3 --lr 1e-5 124 | ``` 125 | 126 | 127 | ## Evaluating the Model 128 | 129 | You can run the following to evaluate the model on any test set. The data format is the same as the training data. Note that you currently have to specify the ``model_type`` parameter matching the model you're loading: 130 | 131 | ```bash 132 | cd src/datatuner/lm 133 | 134 | python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/2020-01-01_01-01-01 --filename ../../../data/my_dataset/test.json --max_length 200 --model_type gpt --top_k 1 135 | 136 | # or if you just want to evaluate the latest model you trained 137 | RUN=$(ls -t ./runs | head -1) && python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/$RUN --filename ../../../data/my_dataset/test.json --max_length 200 --model_type gpt --top_k 1 138 | 139 | # or if you want to use the latest intermediate checkpoint while the model is training: 140 | RUN=$(ls -t ./runs | head -1) && CHECKPOINT=$(ls -t ./runs/$RUN/checkpoint* | head -1) && cp $CHECKPOINT runs/$RUN/pytorch_model.bin 141 | ``` 142 | 143 | During evaluation, the outputs that do not exactly match the expected outputs will be printed. Also, 144 | the metrics will be printed (a dictionary keyed by the output field and the metric name). At the end of evaluation, you will find all the generated outputs in a file ending in `_generated.json` inside the corresponding subfolder of `eval_results/`. 145 | 146 | 147 | 148 | ## Interacting with the Model 149 | 150 | You can also interact with the models. The client will ask you to input the required fields, and it will generate the fields it has learnt. 
151 | 152 | ```bash 153 | cd src/datatuner/lm 154 | 155 | python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/2020-01-01_01-01-01 --max_length 200 --model_type gpt --top_k 1 --input 156 | 157 | # or if you just want to evaluate the latest model you trained 158 | RUN=$(ls -t ./runs | head -1) && python ./evaluate.py --task_config ./task_configs/my_task_config.json --model_checkpoint runs/$RUN --max_length 200 --model_type gpt --top_k 1 --input 159 | ``` -------------------------------------------------------------------------------- /src/external/jjuraska_slug2slug/slot_aligner/slot_extraction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from nltk.tokenize import word_tokenize 4 | 5 | from external.jjuraska_slug2slug import config 6 | 7 | 8 | def extract_city(user_input, input_tokens, named_entities): 9 | city = None 10 | 11 | for ne in named_entities: 12 | if ne[0] == 'City': 13 | city = ne[2] 14 | break 15 | 16 | return city 17 | 18 | 19 | def extract_eat_type(user_input): 20 | bar_synonyms = ['bar', 'bistro', 'brasserie', 'inn', 'tavern'] 21 | coffee_shop_synonyms = ['café', 'cafe', 'coffee shop', 'coffeehouse', 'teahouse'] 22 | restaurant_synonyms = ['cafeteria', 'canteen', 'chophouse', 'coffee shop', 'diner', 'donut shop', 'drive-in', 23 | 'eatery', 'eating place', 'fast-food place', 'joint', 'pizzeria', 'place to eat', 24 | 'restaurant', 'steakhouse'] 25 | 26 | if any(x in user_input for x in bar_synonyms): 27 | return 'bar' 28 | elif any(x in user_input for x in coffee_shop_synonyms): 29 | return 'coffee shop' 30 | elif any(x in user_input for x in restaurant_synonyms): 31 | return 'restaurant' 32 | else: 33 | return None 34 | 35 | 36 | def extract_categories(user_input, input_tokens): 37 | # file_categories_restaurants = 'dialogue/dialogue_modules/slug2slug/data/yelp/categories_restaurants.json' 38 | file_categories_restaurants = os.path.join(config.DATA_DIR, 'yelp', 'categories_restaurants.json') 39 | 40 | with open(file_categories_restaurants, 'r') as f_categories: 41 | categories = json.load(f_categories) 42 | 43 | for i, token in enumerate(input_tokens): 44 | # search for single-word occurrences in the category list 45 | if token in categories: 46 | return {'title': token, 47 | 'ids': categories[token]} 48 | 49 | # search for bigram occurrences in the category list 50 | if i > 0: 51 | key = ' '.join(input_tokens[i-1:i+1]) 52 | if key in categories: 53 | return {'title': key, 54 | 'ids': categories[key]} 55 | 56 | return {'title': None, 57 | 'ids': []} 58 | 59 | 60 | def extract_price_range(user_input, input_tokens): 61 | CHEAP = ['1', '2'] 62 | MODERATE = ['2', '3'] 63 | HIGH = ['3', '4'] 64 | 65 | indicators_indep = {'cheap': CHEAP, 66 | 'inexpensive': CHEAP, 67 | 'affordable': CHEAP, 68 | 'modest': CHEAP, 69 | 'budget': CHEAP, 70 | 'economic': CHEAP, 71 | 'economical': CHEAP, 72 | 'expensive': HIGH, 73 | 'costly': HIGH, 74 | 'fancy': HIGH, 75 | 'posh': HIGH, 76 | 'stylish': HIGH, 77 | 'elegant': HIGH, 78 | 'extravagant': HIGH, 79 | 'luxury': HIGH, 80 | 'luxurious': HIGH} 81 | 82 | indicators_indep_bigram = {'low cost': CHEAP, 83 | 'high class': HIGH} 84 | 85 | indicators_priced = {'low': CHEAP, 86 | 'reasonably': CHEAP, 87 | 'moderately': MODERATE, 88 | 'high': HIGH, 89 | 'highly': HIGH} 90 | 91 | indicators_range = {'low': CHEAP, 92 | 'moderate': MODERATE, 93 | 'average': MODERATE, 94 | 'ordinary': MODERATE, 95 | 'middle': MODERATE, 96 | 'high': HIGH} 97 | 98 | 
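    # Descriptive note on the matching below: the checks go from the most specific cue to the
    # least specific one: standalone adjectives (e.g. 'cheap', 'fancy'), then two-word phrases
    # (e.g. 'low cost'), and finally modifiers directly preceding 'priced', 'price' or 'prices'
    # (e.g. 'reasonably priced', 'high prices'). The first match wins and is mapped to one of
    # the price-level buckets ('1'-'4') defined above.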
# search for single-word occurrences in the indicator list 99 | for token in input_tokens: 100 | if token in indicators_indep: 101 | return indicators_indep[token] 102 | 103 | # search for bigram occurrences in the category list 104 | for key, val in indicators_indep_bigram.items(): 105 | if key in user_input: 106 | return val 107 | 108 | idx = -1 109 | try: 110 | idx = input_tokens.index('priced') 111 | if idx > 0: 112 | prev_token = input_tokens[idx - 1] 113 | if prev_token in indicators_priced: 114 | return indicators_priced[prev_token] 115 | except ValueError: 116 | try: 117 | idx = input_tokens.index('price') 118 | except ValueError: 119 | try: 120 | idx = input_tokens.index('prices') 121 | except ValueError: 122 | pass 123 | 124 | if idx > 0: 125 | prev_token = input_tokens[idx - 1] 126 | if prev_token in indicators_range: 127 | return indicators_range[prev_token] 128 | 129 | return None 130 | 131 | 132 | def extract_area(user_input, input_tokens): 133 | indicators_area = ['downtown', 'city center', 'city centre', 'center of', 'centre of', 'middle of'] 134 | 135 | area = None 136 | 137 | for ind in indicators_area: 138 | if ind in user_input: 139 | area = 'downtown' 140 | break 141 | 142 | return area 143 | 144 | 145 | def extract_family_friendly(user_input, input_tokens): 146 | indicators = ['family', 'families', 'child', 'children', 'kid', 'kids'] 147 | 148 | for ind in indicators: 149 | if ind in user_input: 150 | return True 151 | 152 | return False 153 | 154 | 155 | # TODO: implement 156 | def extract_near(user_input): 157 | indicators = ['near', 'near to', 'close to', 'next to', 'neighborhood of', 'vicinity of'] 158 | 159 | return None 160 | 161 | 162 | def identify_slots(user_input, named_entities): 163 | attributes = {} 164 | 165 | user_input = user_input.lower() 166 | input_tokens = word_tokenize(user_input) 167 | 168 | city = extract_city(user_input, input_tokens, named_entities) 169 | if city: 170 | attributes['city'] = city 171 | 172 | eat_type = extract_eat_type(user_input) 173 | if eat_type: 174 | attributes['eatType'] = eat_type 175 | 176 | categories = extract_categories(user_input, input_tokens) 177 | if categories: 178 | attributes['categories'] = categories 179 | 180 | prices = extract_price_range(user_input, input_tokens) 181 | if prices: 182 | attributes['prices'] = prices 183 | 184 | family_friendly = extract_family_friendly(user_input, input_tokens) 185 | if family_friendly: 186 | attributes['familyFriendly'] = family_friendly 187 | 188 | area = extract_area(user_input, input_tokens) 189 | if area: 190 | attributes['area'] = area 191 | 192 | return attributes 193 | 194 | 195 | # ---- MAIN ---- 196 | 197 | def main(): 198 | user_input = 'Is there a family-friendly bar in downtown santa cruz that serves reasonably priced burgers?' 199 | gnode_entities = [('VisualArtwork', 282.797767, 'restaurant in'), ('City', 2522.766114, 'Santa Cruz')] 200 | print(identify_slots(user_input, gnode_entities)) 201 | 202 | 203 | if __name__ == '__main__': 204 | main() 205 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Paper Results 2 | 3 | This README describes how to reproduce the results in the paper: 4 | 5 | ["Have Your Text and Use It Too! 
End-to-End Neural Data-to-Text Generation with Semantic Fidelity" by Hamza Harkous, Isabel Groves and Amir Saffari.](https://www.aclweb.org/anthology/2020.coling-main.218/) 6 | 7 | To cite: 8 | ```bibtex 9 | @inproceedings{harkous-etal-2020-text, 10 | title = "Have Your Text and Use It Too! End-to-End Neural Data-to-Text Generation with Semantic Fidelity", 11 | author = "Harkous, Hamza and 12 | Groves, Isabel and 13 | Saffari, Amir", 14 | booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", 15 | month = dec, 16 | year = "2020", 17 | address = "Barcelona, Spain (Online)", 18 | publisher = "International Committee on Computational Linguistics", 19 | url = "https://www.aclweb.org/anthology/2020.coling-main.218", 20 | doi = "10.18653/v1/2020.coling-main.218", 21 | pages = "2410--2424" 22 | } 23 | ``` 24 | 25 | ## Setup 26 | 27 | We assume you have run the setup script from the [main README](../README.md#environment-creation) file. 28 | 29 | 30 | All of the scripts below should be run from the current folder `paper/`: 31 | 32 | ```bash 33 | cd paper 34 | ``` 35 | 36 | ## Update the config 37 | 38 | 39 | The first step is to update the configuration file [config.sh](config.sh) to point it to the correct directories and files. 40 | The main change that must be done is fixing the folders for the LDC2017T10 and the ViGGO dataset. These datasets are available from https://catalog.ldc.upenn.edu/LDC2017T10 and https://nlds.soe.ucsc.edu/viggo respectively. 41 | 42 | 43 | You can leave the rest of the default parameters as they are. 44 | 45 | 46 | ## Retrieve the data 47 | The next step is to download the rest of the datasets from their respective sources. 48 | 49 | 50 | ```bash 51 | bash retrieve.sh 52 | ``` 53 | 54 | ## Preprocess the Data 55 | Now we are ready to preprocess the data to the suitable format required by DataTuner. 56 | 57 | ```bash 58 | bash preprocess.sh 59 | ``` 60 | 61 | The preprocessed dataset will be present in the folder `./data/` 62 | 63 | 64 | ## Train the Data-to-Text Language Model 65 | We are now able to run the Data-to-Text language model fine-tuning. 66 | 67 | As described in the paper, there are two system variants we can train: 68 | - `DataTuner_No_FC`: represents DataTuner without the fidelity classifier 69 | - `DataTuner_No_FC_No_FS`: represents DataTuner with neither fidelity classifier nor fine-grained state embeddings 70 | 71 | The third system variant `DataTuner_FC` reuses the trained `DataTuner_No_FC` and adds the fidelity classifier during the evaluation stage. Hence, the results for that system variant are the ones produced in the section [Run the Trained Classifiers on Generated Data](#run-the-trained-classifiers-on-generated-Data) 72 | 73 | Arguments for the script (in order): 74 | - `DATASET`: dataset name (from `e2e`, `viggo`, `ldc`, and `webnlg`) 75 | - `SYSTEM`: system id (from `DataTuner_No_FC`, `DataTuner_No_FC_No_FS`) 76 | - `OUTPUT_FOLDER`: folder where the model will be written 77 | - `NUM_PARALLEL`: number of processes to run in parallel (usually the number of GPUs) 78 | 79 | Notice that the script overwrites the existing $OUTPUT_FOLDER if it is not empty. 80 | 81 | ```bash 82 | export dataset=e2e # or another dataset 83 | export system=DataTuner_No_FC # or DataTuner_No_FC_No_FS 84 | bash train_lm.sh $dataset $system ~/trained_lms/${dataset}_${system} 4 85 | ``` 86 | 87 | When using distributed training, it might be the case that the training terminates without all the processes exiting. 
In that case, it is safe to stop the processes with the command below (but beware of stopping other processes that happen to have the same script name): 88 | 89 | ```bash 90 | pkill -9 -f train.py 91 | ``` 92 | 93 | ## Evaluate the Data-to-Text Language Model 94 | Next, we can evaluate the Data-to-Text language model. 95 | Arguments for the script (in order): 96 | 97 | - `TEST_FILE`: file containing test data. 98 | - `MODEL`: folder where the model was written (referred to as `OUTPUT_FOLDER` during training) 99 | - `NUM_GPUS`: number of GPUs over which we should distribute the evaluation 100 | - `PER_GPU`: number of processes to run per GPU (typically a value of 2 is safe for a 16 GB GPU) 101 | - `MAX_DATA`: if passed and more than 0, this sets a maximum on the number of data items to evaluate (used for debugging) 102 | 103 | ```bash 104 | export dataset=e2e # or another dataset 105 | export system=DataTuner_No_FC # or DataTuner_No_FC_No_FS 106 | bash evaluate_lm.sh ../data/$dataset/test.json ~/trained_lms/${dataset}_${system} 4 2 0 107 | bash evaluate_lm.sh ../data/$dataset/test.json ~/trained_lms/${dataset}_${system} 4 2 4 108 | ``` 109 | 110 | A new folder containing the generated data will be created; its path is printed at the end of the evaluation. 111 | 112 | ## Train the Semantic Fidelity Classifiers 113 | 114 | To train the semantic fidelity classifiers, run the following scripts for each of the datasets. 115 | Arguments for the script (in order): 116 | 117 | - `TRAINING_DATA_FOLDER`: folder containing the `train.tsv`, `validation.tsv`, and `test.tsv` files. 118 | - `OUTPUT_FOLDER`: folder where the model will be written 119 | - `TRAINING_ARGS`: json file with the training arguments to use from a pretrained model 120 | - `NUM_PARALLEL`: number of processes to run in parallel (usually the number of GPUs) 121 | 122 | ```bash 123 | export dataset=e2e # or another dataset 124 | bash train_classifier.sh ../data/${dataset}_consistency/ ~/trained_classifiers/${dataset} ./classifier_training_args/$dataset/${dataset}_model_training_args.json 4 125 | ``` 126 | 127 | ## Run the Trained Classifiers on Generated Data 128 | 129 | Once you have trained a classifier and generated the outputs from the Data-to-Text language model, you can run the fidelity classifier on a generated file to classify and rerank the beam outputs. This produces the output of the system variant `DataTuner_FC`, which combines the fidelity classifier with the fine-grained state embeddings. 130 | Arguments for the script (in order): 131 | 132 | - `TRAINING_DATA_FOLDER`: folder containing the `labels.txt` file for the fidelity classifier. 133 | - `GENERATED_DATA_FOLDER`: folder containing the `generated.json` file. 134 | - `MODEL_FOLDER`: folder where the model was written 135 | - `DATA_KEY`: key for the data field in the `generated.json` file. 136 | - `TEXT_KEY`: key for the text field in the `generated.json` file. 
137 | 138 | ```bash 139 | # for example: 140 | export GENERATED_DATA_FOLDER=~/trained_lms/e2e_DataTuner_No_FC_No_FS/2020-01-03_15-29-52/ 141 | 142 | bash eval_with_classifier.sh ../data/ldc_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/ldc linearized_amr answer_text 143 | bash eval_with_classifier.sh ../data/webnlg_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/webnlg modifiedtripleset text 144 | bash eval_with_classifier.sh ../data/e2e_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/e2e new_mr ref 145 | bash eval_with_classifier.sh ../data/viggo_consistency $GENERATED_DATA_FOLDER ~/trained_classifiers/viggo new_mr ref 146 | ``` 147 | 148 | You will get a file `classified.json` within the evaluation folder. You will also get `classified_wrong.json` in the same folder, containing the items where the fidelity classifier did not find any accurate output in the beam. 149 | 150 | -------------------------------------------------------------------------------- /paper/experiments/webnlg/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | from pathlib import Path 5 | from xml.etree import ElementTree as ET 6 | 7 | from fire import Fire 8 | 9 | from datatuner.classification.distractors import (get_distractors, 10 | write_classification_data) 11 | from datatuner.lm.special_token_generator import generate_from_json 12 | from datatuner.lm.utils import fix_text_in_dir 13 | from webnlg_utils import camel_case_split, cleanup 14 | 15 | random.seed(42) 16 | 17 | seen_categories = [ 18 | "Airport.xml", 19 | "Astronaut.xml", 20 | "Building.xml", 21 | "City.xml", 22 | "ComicsCharacter.xml", 23 | "Food.xml", 24 | "Monument.xml", 25 | "SportsTeam.xml", 26 | "University.xml", 27 | "WrittenWork.xml", 28 | ] 29 | 30 | unseen_categories = ["Athlete.xml", "Artist.xml", "MeanOfTransportation.xml", "CelestialBody.xml", "Politician.xml"] 31 | 32 | 33 | class Triple: 34 | 35 | def __init__(self, s, p, o): 36 | self.s = s #subject 37 | self.o = o #object 38 | self.p = p #predicate 39 | 40 | 41 | def process_tripleset(s): 42 | """Format the triples set in our target format""" 43 | s = cleanup(s) 44 | key = "" 45 | s = s[len(key) : -len(key) - 1] 46 | subject, predicate, obj = s.split("|") 47 | subject, obj = cleanup(subject), cleanup(obj) 48 | predicate = camel_case_split(predicate) 49 | return { 50 | "text": f" {subject} {predicate} {obj}", 51 | "dict": {"subject": subject, "predicate": predicate, "object": obj}, 52 | } 53 | 54 | 55 | def get_nearby_text(entries, e): 56 | i = 1 57 | random_sentence = None 58 | while random_sentence is None: 59 | for j in range(2): 60 | try: 61 | if j == 0: 62 | entry = entries[e + i] 63 | else: 64 | entry = entries[e - i] 65 | 66 | random_sentence = entry.findall("lex")[0].find("text").text 67 | if random_sentence: 68 | break 69 | 70 | except: 71 | pass 72 | i += 1 73 | return random_sentence 74 | 75 | 76 | def parse(in_file, classification_data, num_candidates=5, max_per_operation=2): 77 | """Parse the given file and update `classification_data` with the parsed data""" 78 | 79 | tree = ET.parse(in_file) 80 | root = tree.getroot() 81 | 82 | entries = list(root.find("entries")) 83 | items = [] 84 | for e, entry in enumerate(entries): 85 | 86 | tripletsets = list(entry.find("modifiedtripleset").findall("mtriple")) + list( 87 | entry.find("modifiedtripleset").findall("otriple") 88 | ) 89 | tripletsets = [process_tripleset(x) for x in tripletsets] 90 | 91 | 
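        # Canonicalize the triple set: each triple is linearized as text and the list is sorted,
        # so the same set of triples always serializes to the same " ; "-joined string below.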
modifiedtripleset = [x["text"] for x in tripletsets] 92 | modifiedtripleset.sort() 93 | 94 | mtripleset = entry.find("modifiedtripleset") 95 | modtripleset = [] 96 | raw_tripleset = "" 97 | for mtriple in mtripleset: 98 | e1, pred, e2 = mtriple.text.split(" | ") 99 | raw_tripleset += mtriple.text + " ||| " 100 | 101 | modtripleset.append(Triple(cleanup(e1), pred, cleanup(e2))) 102 | 103 | all_lex = entry.findall("lex") 104 | for lex in all_lex: 105 | 106 | sortedtripleset = "" 107 | for sent in lex.find("sortedtripleset").findall("sentence"): 108 | for x in sent.findall("striple"): 109 | sortedtripleset += process_tripleset(x)["text"] + ", " 110 | 111 | references = cleanup(lex.find("references")) 112 | template = cleanup(lex.find("template")) 113 | 114 | try: 115 | text = lex.find("text").text 116 | if not text: 117 | print("empty text") 118 | text = "" 119 | continue 120 | except: 121 | print("exception text") 122 | text = "" 123 | continue 124 | 125 | try: 126 | template = lex.find("template").text 127 | if not template: 128 | print("empty template") 129 | template = "" 130 | continue 131 | except: 132 | print("exception template") 133 | template = "" 134 | continue 135 | 136 | # preprocess distractors 137 | subjects = [x["dict"]["subject"] for x in tripletsets] 138 | objects = [x["dict"]["object"] for x in tripletsets] 139 | predicates = [x["dict"]["predicate"] for x in tripletsets] 140 | 141 | swapping_candidates = [subjects + objects] 142 | cutting_candidates = [subjects + objects] 143 | 144 | random_text = get_nearby_text(entries, e) 145 | 146 | tripletset_str = " ; ".join(modifiedtripleset) 147 | 148 | distractors, classification_items = get_distractors( 149 | tripletset_str, 150 | text, 151 | swapping_candidates, 152 | cutting_candidates, 153 | random_text, 154 | num_candidates=num_candidates, 155 | max_per_operation=max_per_operation, 156 | ) 157 | 158 | classification_data.extend(classification_items) 159 | 160 | item = { 161 | "raw_modifiedtripleset": raw_tripleset, 162 | "modifiedtripleset": " ; ".join(modifiedtripleset), 163 | "sortedtripleset": sortedtripleset, 164 | "references": references, 165 | "template": template, 166 | "text": distractors + [text], 167 | "num_triples": Path(in_file).parent.name, 168 | "category": Path(in_file).name, 169 | "category_type": "seen" if Path(in_file).name in seen_categories else "unseen", 170 | } 171 | items.append(item) 172 | 173 | return items 174 | 175 | 176 | def run_parser(set_path, classification_data): 177 | """Get the entry set for the give path """ 178 | entryset = [] 179 | dirtriples = filter(lambda item: not str(item).startswith("."), os.listdir(set_path)) 180 | dirtriples = sorted(list(dirtriples)) 181 | for dirtriple in dirtriples: 182 | fcategories = filter(lambda item: not str(item).startswith("."), os.listdir(os.path.join(set_path, dirtriple))) 183 | fcategories = sorted(list(fcategories)) 184 | for fcategory in fcategories: 185 | entryset.extend(list(parse(os.path.join(set_path, dirtriple, fcategory), classification_data))) 186 | 187 | return entryset 188 | 189 | 190 | def run( 191 | in_folder="./tmp/webnlg/data/v1.4/en/", 192 | out_folder="datatuner/data/webnlg", 193 | classification_dir="datatuner/data/webnlg_consistency", 194 | output_classification_data=True, 195 | ): 196 | """Run the webnlg data formatting task""" 197 | out_folder = Path(out_folder) 198 | in_folder = Path(in_folder) 199 | classification_dir = Path(classification_dir) 200 | out_folder.mkdir(exist_ok=True, parents=True) 201 | 
classification_dir.mkdir(exist_ok=True, parents=True) 202 | splits = {"train": "train", "dev": "validation", "test": "test"} 203 | 204 | for split in splits: 205 | data_path = in_folder / split 206 | classification_data = [] 207 | entryset = run_parser(data_path, classification_data) 208 | json.dump(entryset, open(out_folder / (splits[split] + ".json"), "w"), indent=2) 209 | if output_classification_data: 210 | write_classification_data(classification_data, classification_dir, splits[split]) 211 | 212 | generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"modifiedtripleset": "amr"}) 213 | fix_text_in_dir(out_folder) 214 | 215 | 216 | if __name__ == "__main__": 217 | Fire(run) 218 | -------------------------------------------------------------------------------- /src/datatuner/lm/metrics.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import difflib 3 | import json 4 | import logging 5 | import os 6 | import sys 7 | from collections import OrderedDict 8 | from itertools import groupby 9 | from pathlib import Path 10 | from subprocess import PIPE, Popen 11 | from tempfile import mkdtemp 12 | 13 | import mlflow 14 | import numpy as np 15 | from datatuner.ops.mlflow import get_artifact 16 | from datatuner.utils import flatten 17 | from fire import Fire 18 | 19 | logger = logging.getLogger(__file__) 20 | 21 | THIS_DIR = Path(os.path.dirname(os.path.realpath(__file__))) 22 | 23 | E2E_METRICS_FOLDER = THIS_DIR / "../../../paper/tmp/e2e-metrics" 24 | PYTHON_BIN = sys.executable 25 | 26 | 27 | def get_str_diff(case_a, case_b): 28 | """Get the string difference between two strings""" 29 | return ("").join([li[-1] for li in difflib.ndiff(case_a, case_b) if li[0] != " " and li[-1] not in [" ", "'", ","]]) 30 | 31 | 32 | def almostmatch(original, current, all_outputs, final): 33 | """Computes match average while allowing a difference in articles. 
The metric is computed for the given 34 | keys across the list of dictionaries `all_outputs` 35 | """ 36 | lst = [ 37 | int(x[original] == x[current] or get_str_diff(x[original], x[current]) in ["the", "a", "an"]) 38 | for x in all_outputs 39 | ] 40 | return {"value": np.mean(lst), "count": len(all_outputs)} 41 | 42 | 43 | def match(original, current, all_outputs, final): 44 | """Computes exact match average across the values of the given keys in the list of dictionaries `all_outputs`""" 45 | 46 | def postprocess(x): 47 | return x[current][0] if type(x[current]) == list else x[current] 48 | 49 | lst = [int(str(x[original]).lower() == str(postprocess(x).lower())) for x in all_outputs] 50 | return {"value": np.mean(lst), "count": len(all_outputs)} 51 | 52 | 53 | def bleu(original, current, all_outputs, final, case_insensitive=True, all_keys=None): 54 | """Computes bleu score for the values of the given keys in the list of dictionaries `all_outputs`""" 55 | if len(all_outputs) == 0: 56 | return {"value": 0, "count": 0} 57 | 58 | from sacrebleu import corpus_bleu 59 | 60 | def process(s): 61 | return s.lower() if case_insensitive else s 62 | 63 | # group by all the other keys 64 | all_outputs = copy.deepcopy(all_outputs) 65 | if all_keys is None: 66 | keys = all_outputs[0].keys() 67 | else: 68 | keys = all_keys 69 | print(keys) 70 | 71 | other_keys = list(set([key for key in keys if key not in [original, current]])) 72 | 73 | group = {} 74 | max_refs = 1 75 | for item in all_outputs: 76 | # other inputs concatenated 77 | search_key = str([item[x] for x in other_keys if x in item]) 78 | if type(item[current]) == list: 79 | item[current] = item[current][0] 80 | 81 | current_val = process(item[current]) 82 | original_val = process(item[original]) 83 | 84 | if search_key in group: 85 | group[search_key]["references"].append(original_val) 86 | group[search_key]["prediction"] = current_val 87 | if len(group[search_key]["references"]) > max_refs: 88 | max_refs = len(group[search_key]["references"]) 89 | else: 90 | group[search_key] = {"references": [original_val], "prediction": current_val} 91 | 92 | all_predictions = [] 93 | all_references = [[] for i in range(max_refs)] 94 | 95 | for item in group.values(): 96 | all_predictions.append(item["prediction"]) 97 | for i in range(max_refs): 98 | try: 99 | all_references[i].append(item["references"][i]) 100 | except: 101 | all_references[i].append("") 102 | 103 | e2e_metrics = {} 104 | if final: 105 | e2e_metrics = get_e2e_metrics(all_predictions, all_references) 106 | e2e_metrics.update({"value": corpus_bleu(all_predictions, all_references).score, "count": len(all_predictions)}) 107 | return e2e_metrics 108 | 109 | 110 | def get_e2e_metrics(all_predictions, all_references): 111 | tempdir = Path(mkdtemp()) 112 | human = tempdir / "human_refs.txt" 113 | system = tempdir / "system.txt" 114 | with open(human, "w") as h: 115 | with open(system, "w") as s: 116 | for i, x in enumerate(all_predictions): 117 | s.write(x + "\n") 118 | for j in range(len(all_references)): 119 | v = all_references[j][i] 120 | if v.strip(): 121 | h.write(v + "\n") 122 | h.write("\n") 123 | print(E2E_METRICS_FOLDER / "measure_scores.py") 124 | p = Popen( 125 | [ 126 | PYTHON_BIN, 127 | E2E_METRICS_FOLDER / "measure_scores.py", 128 | f"{human}", 129 | f"{system}", 130 | ], 131 | stdin=PIPE, 132 | stdout=PIPE, 133 | stderr=PIPE, 134 | ) 135 | output, err = p.communicate() 136 | stats = output.decode("utf-8").split("\n") 137 | stats = [x for x in stats if x not in ["", "==============", 
"SCORES:"]] 138 | stats_dict = {} 139 | for item in stats: 140 | key, value = item.split(": ") 141 | value = float(value) 142 | if key in ["BLEU", "METEOR", "ROUGE_L"]: 143 | value *= 100 144 | if key == "BLEU": 145 | key = "e2e_BLEU" 146 | stats_dict[key] = value 147 | 148 | return stats_dict 149 | 150 | 151 | def round_dict(d): 152 | """Round values in a dictionary""" 153 | items = [(k, round(v * 100.0, 2)) for k, v in d.items()] 154 | return dict(sorted(items, key=lambda t: t[1])) 155 | 156 | 157 | def group_by_field(all_outputs, field): 158 | """group a list of dictionaries by the given field value""" 159 | all_outputs.sort(key=lambda k: k[field]) 160 | return groupby(all_outputs, key=lambda k: k[field]) 161 | 162 | 163 | def compute_metric(metric, original, current, all_outputs, final): 164 | """compute the result for the given metric""" 165 | try: 166 | # get the function name from the "metrics.py" file 167 | func = metrics[metric] 168 | return func(original, current, all_outputs, final) 169 | except: 170 | logger.info(f"Unable to compute the metric {metric}") 171 | raise 172 | 173 | 174 | def aggregate_metrics(all_outputs, fields, metrics_fields, output_to_metrics, final=False): 175 | """Combine the stats array into a value for a given metric""" 176 | 177 | out_metrics = {} 178 | for field in fields: 179 | original = "original_" + field 180 | current = field + " " * len("original_") 181 | 182 | out_metrics[field] = {} 183 | for metric in output_to_metrics[field]: 184 | # first we compute the aggregated metric 185 | out_metrics[field][metric] = {} 186 | out_metrics[field][metric]["total"] = compute_metric(metric, original, current, all_outputs, final) 187 | logger.info(f"{field},{metric},{out_metrics[field][metric]['total']}") 188 | # We then split the metrics computation per metric field. 189 | # We do this by taking all the inputs so far. Although this involves repetition, this is more generalizable 190 | # to cases where the metric is corpus-wide (e.g. BLEU). 
191 | for metric_field in metrics_fields: 192 | grouped_items = group_by_field(all_outputs, metric_field) 193 | out_metrics[field][metric][metric_field] = [] 194 | for metric_field_value, field_outputs in grouped_items: 195 | out_metrics[field][metric][metric_field].append( 196 | (metric_field_value, compute_metric(metric, original, current, list(field_outputs), False)) 197 | ) 198 | out_metrics[field][metric][metric_field].sort(key=lambda k: k[1]["value"]) 199 | out_metrics[field][metric][metric_field] = OrderedDict(out_metrics[field][metric][metric_field]) 200 | return out_metrics 201 | 202 | 203 | def compute_metrics_from_run(field, filename=None, run_id=None, eval_folder=None, metrics=None): 204 | if run_id is not None: 205 | assert eval_folder is not None 206 | filename = get_artifact(run_id, f"evaluation/{eval_folder}/generated.json") 207 | 208 | filename = Path(filename) 209 | all_outputs = json.load(open(filename, "r")) 210 | output_to_metrics = {} 211 | if metrics is None: 212 | metrics = ["bleu"] 213 | output_to_metrics[field] = metrics 214 | stats = aggregate_metrics(all_outputs, [field], [], output_to_metrics, final=True) 215 | print(json.dumps(stats, indent=2)) 216 | out_folder = filename.parent 217 | (out_folder / f"stats_{filename.stem}.json").write_text(json.dumps(stats, indent=2)) 218 | 219 | if run_id is not None: 220 | mlflow.start_run(run_id) 221 | flattened_stats = flatten(stats) 222 | flattened_stats = {k: flattened_stats[k] for k in flattened_stats if k.count("-") <= 3} 223 | 224 | mlflow.log_metrics(flattened_stats) 225 | 226 | 227 | metrics = {"match": match, "bleu": bleu, "almostmatch": almostmatch} 228 | 229 | if __name__ == "__main__": 230 | Fire(compute_metrics_from_run) 231 | -------------------------------------------------------------------------------- /src/datatuner/classification/distractors.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import re 4 | from copy import deepcopy 5 | from itertools import chain 6 | from pathlib import Path 7 | 8 | import nltk 9 | import pandas as pd 10 | 11 | random.seed(42) 12 | 13 | 14 | def get_distractors(data, text, swapping_candidates, cutting_candidates, random_text, num_candidates=5, 15 | max_per_operation=5): 16 | """Get the distractors for the given inputs""" 17 | distractors_dict = {} 18 | 19 | for cands in swapping_candidates: 20 | distractors_dict["value_error"] = swap_entities(cands, text, max_outputs=max_per_operation) 21 | 22 | for cands in cutting_candidates: 23 | distractors_dict["value_error"].extend(cut_entities(cands, text, max_outputs=max_per_operation)) 24 | 25 | distractors_dict["value_error"].extend(add_negation_errors(text, max_outputs=int(math.ceil(max_per_operation / 2)))) 26 | distractors_dict["omission"] = add_omission(text, max_outputs=max_per_operation) 27 | if "," in text: 28 | distractors_dict["omission"].extend(add_phrase_omission(text, max_outputs=1 + max_per_operation)) 29 | 30 | distractors_dict["repetition"] = add_repetition(text, max_outputs=1 + max_per_operation) 31 | distractors_dict["hallucination"] = add_repetition( 32 | text, random_text=random_text, replace=True, max_outputs=max_per_operation 33 | ) + add_repetition(text, random_text=random_text, max_outputs=max_per_operation) 34 | 35 | distractors = set(chain(*distractors_dict.values())) 36 | 37 | # Remove text itself if present 38 | if text in distractors: 39 | distractors.remove(text) 40 | 41 | # Shuffle and cut 42 | distractors = list(distractors) 43 
| random.shuffle(distractors) 44 | distractors = distractors[:num_candidates] 45 | 46 | # If no distractors found, add placeholders 47 | if len(distractors) == 0: 48 | distractors = ["placeholder"] * num_candidates 49 | # Pad to get to the right number of candidates 50 | if len(distractors) < num_candidates: 51 | ratio = int(math.ceil(num_candidates / len(distractors))) 52 | distractors = (distractors * ratio)[:num_candidates] 53 | 54 | classification_items = [ 55 | {"text": value, "data": data, "label": key} for key in distractors_dict for value in 56 | distractors_dict[key] 57 | ] + [{"text": text, "data": data, "label": "accurate"}] 58 | 59 | # Add negation 60 | replacements = {"[ no ]": "[ yes ]", "[ yes ]": "[ no ]"} 61 | for cand in replacements: 62 | if cand in data: 63 | negated_data = data.replace(cand, replacements[cand], 1) 64 | classification_items.extend([{"text": text, "data": negated_data, "label": "value_error"}]) 65 | 66 | random.shuffle(classification_items) 67 | classification_items = classification_items[:num_candidates] 68 | return distractors, classification_items 69 | 70 | 71 | def add_negation_errors(original_text, max_outputs=5): 72 | outputs = [] 73 | current_text = original_text 74 | blacklisted = ["not", "n't"] 75 | for x in blacklisted: 76 | if x + " " in current_text: 77 | new_text = current_text.replace(x, "", 1) 78 | new_text = new_text.replace("  ", " ") 79 | outputs.append(new_text) 80 | current_text = new_text 81 | 82 | return outputs[:max_outputs] 83 | 84 | 85 | def cut_entities(entity_list, original_text, max_outputs=5): 86 | """Remove part of the entity""" 87 | output = [] 88 | entity_list = deepcopy(entity_list)[:max_outputs] 89 | for entity in entity_list: 90 | rand_ind = random.randint(0, len(entity) - 1) 91 | cut_entity = entity[:rand_ind].strip() 92 | if entity in original_text: 93 | output.append(original_text.replace(entity, cut_entity)) 94 | 95 | return output 96 | 97 | 98 | def swap_entities(entity_list, original_text, max_outputs=5): 99 | """Swap an entity from the `entity_list` with another from the list if present in the text""" 100 | entity_set = set(entity_list) 101 | output = [] 102 | 103 | entity_list = deepcopy(entity_list)[:max_outputs] 104 | 105 | random.shuffle(entity_list) 106 | for entity in entity_list: 107 | passed_entities = deepcopy(entity_set) 108 | passed_entities.remove(entity) 109 | passed_entities = list(passed_entities) 110 | if len(passed_entities) > 0: 111 | rand_entity = random.choice(passed_entities) 112 | if entity in original_text: 113 | text = original_text.replace(entity, rand_entity) 114 | output.append(text) 115 | 116 | return output 117 | 118 | 119 | def swap_pronouns(original_text): 120 | """Swap pronoun with a different one""" 121 | lower = ["he", "she", "it", "they"] 122 | upper = ["He", "She", "It", "They"] 123 | 124 | # find if a pronoun is there 125 | tokens = set(re.findall(r"[\w']+", original_text.lower())) 126 | pronoun_i = -1 127 | for i, p in enumerate(lower): 128 | if p in tokens: 129 | pronoun_i = i 130 | break 131 | 132 | text = original_text 133 | # if a pronoun is found, we replace its occurrences with a random other pronoun 134 | if pronoun_i >= 0: 135 | 136 | # get a random other pronoun 137 | candidates = set(list(range(len(lower)))) 138 | candidates.remove(pronoun_i) 139 | other_pronoun_i = random.choice(list(candidates)) 140 | 141 | for pronouns in [lower, upper]: 142 | pronoun = pronouns[pronoun_i] 143 | other_pronoun = pronouns[other_pronoun_i] 144 | text = 
re.sub(r"\b{}\b".format(pronoun), other_pronoun, text) 145 | return [text] 146 | 147 | return [] 148 | 149 | 150 | def add_phrase_omission(text, max_outputs=5): 151 | indices = [i for i, x in enumerate(text) if x == ","] 152 | output = [] 153 | random.shuffle(indices) 154 | end_strs = [".", ","] 155 | random.shuffle(end_strs) 156 | for end_str in end_strs: 157 | for i in indices: 158 | try: 159 | if len(output) >= max_outputs: 160 | break 161 | # until the index before comma + index from next dot if any 162 | output.append(text[:i] + text[text.index(end_str, i + 1):]) 163 | 164 | except: 165 | pass 166 | 167 | return output 168 | 169 | 170 | def add_omission(text, max_outputs=5): 171 | """Remove the shortest sentence from the text""" 172 | sentences = nltk.sent_tokenize(text) 173 | output = [] 174 | if len(sentences) > 1: 175 | for omit_ind in range(min(max_outputs, len(sentences))): 176 | # sort by increasing length; goal is to remove shortest for subtle omissions 177 | sentences = sorted(sentences, key=lambda x: len(x)) 178 | removed_sentence = sentences[omit_ind] 179 | output.append(text.replace(removed_sentence, "").strip()) 180 | return output 181 | 182 | 183 | def add_repetition(text, random_text=None, replace=False, max_outputs=5): 184 | """Repeat the shortest sentence int the text""" 185 | sentences = nltk.sent_tokenize(text) 186 | 187 | assert not (random_text is None and replace) 188 | 189 | if random_text is None: 190 | sorted_sentences = sorted(sentences, key=lambda x: len(x)) 191 | repeat_ind = 0 192 | random_sentence = sorted_sentences[repeat_ind] 193 | else: 194 | random_sentences = nltk.sent_tokenize(random_text) 195 | random_sentence = sorted(random_sentences, key=lambda x: len(x))[0] 196 | 197 | indices = list(range(min(max_outputs, len(sentences)))) 198 | random.shuffle(indices) 199 | 200 | outputs = [] 201 | for insert_at_ind in indices: 202 | if replace: 203 | sentences[insert_at_ind] = random_sentence 204 | else: 205 | sentences.insert(insert_at_ind, random_sentence) 206 | 207 | outputs.append(" ".join(sentences).strip()) 208 | return outputs[:max_outputs] 209 | 210 | 211 | def write_classification_data(classification_data, classification_dir, split): 212 | classification_dir = Path(classification_dir) 213 | classification_dir.mkdir(parents=True, exist_ok=True) 214 | 215 | random.shuffle(classification_data) 216 | max_data = 100000 if "train" in split else 10000 217 | classification_data = classification_data[:max_data] 218 | print(split) 219 | print(f"Length of classification_data: {len(classification_data)}") 220 | for x in classification_data: 221 | x["text"] = x["text"].replace("\n", " ").replace("\r", " ") 222 | 223 | df = pd.DataFrame(classification_data) 224 | 225 | print(f"Original classes distribution:") 226 | print(f"{df.label.value_counts()}") 227 | if "train" in split: 228 | max_size = df["label"].value_counts().max() 229 | 230 | lst = [df] 231 | for class_index, group in df.groupby("label"): 232 | sizes = [max_size, 3 * len(group), max_size - len(group)] 233 | size = min(sizes) 234 | lst.append(group.sample(size, replace=True)) 235 | 236 | df_new = pd.concat(lst) 237 | df_new = df_new.sample(frac=min(1, max_data / len(df_new))).reset_index(drop=True) 238 | print(f"New classes distribution:") 239 | print(f"{df_new.label.value_counts()}") 240 | else: 241 | df_new = df.sample(frac=min(1, max_data / len(df))).reset_index(drop=True) 242 | 243 | labels = list(df_new.label.unique()) 244 | df_new.to_csv(classification_dir / (split + ".tsv"), sep="|", index=False, 
columns=["label", "data", "text"]) 245 | 246 | labels_file = classification_dir / "labels.txt" 247 | labels_file.write_text("\n".join(sorted(labels))) 248 | print("") 249 | -------------------------------------------------------------------------------- /src/external/webnlg_webnlg_baseline/webnlg_baseline_input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import re 4 | import json 5 | import sys 6 | import getopt 7 | from collections import defaultdict 8 | from external.shimorina_webnlg_baseline.benchmark_reader import Benchmark 9 | 10 | 11 | def select_files(topdir, category='', size=(1, 8)): 12 | """ 13 | Collect all xml files from a benchmark directory. 14 | :param topdir: directory with benchmark 15 | :param category: specify DBPedia category to retrieve texts for a specific category (default: retrieve all) 16 | :param size: specify size to retrieve texts of specific size (default: retrieve all) 17 | :return: list of tuples (full path, filename) 18 | """ 19 | finaldirs = [topdir+'/'+str(item)+'triples' for item in range(size[0], size[1])] 20 | finalfiles = [] 21 | for item in finaldirs: 22 | finalfiles += [(item, filename) for filename in os.listdir(item)] 23 | if category: 24 | finalfiles = [] 25 | for item in finaldirs: 26 | finalfiles += [(item, filename) for filename in os.listdir(item) if category in filename] 27 | return finalfiles 28 | 29 | 30 | def delexicalisation(out_src, out_trg, category, properties_objects): 31 | """ 32 | Perform delexicalisation. 33 | :param out_src: source string 34 | :param out_trg: target string 35 | :param category: DBPedia category 36 | :param properties_objects: dictionary mapping properties to objects 37 | :return: delexicalised strings of the source and target; dictionary containing mappings of the replacements made 38 | """ 39 | with open('delex_dict.json') as data_file: 40 | data = json.load(data_file) 41 | # replace all occurrences of Alan_Bean to ASTRONAUT in input 42 | delex_subj = data[category] 43 | delex_src = out_src 44 | delex_trg = out_trg 45 | # for each instance, we save the mappings between nondelex and delex 46 | replcments = {} 47 | for subject in delex_subj: 48 | clean_subj = ' '.join(re.split('(\W)', subject.replace('_', ' '))) 49 | if clean_subj in out_src: 50 | delex_src = out_src.replace(clean_subj + ' ', category.upper() + ' ') 51 | replcments[category.upper()] = ' '.join(clean_subj.split()) # remove redundant spaces 52 | if clean_subj in out_trg: 53 | delex_trg = out_trg.replace(clean_subj + ' ', category.upper() + ' ') 54 | replcments[category.upper()] = ' '.join(clean_subj.split()) 55 | 56 | # replace all occurrences of objects by PROPERTY in input 57 | for pro, obj in sorted(properties_objects.items()): 58 | obj_clean = ' '.join(re.split('(\W)', obj.replace('_', ' ').replace('"', ''))) 59 | if obj_clean in delex_src: 60 | delex_src = delex_src.replace(obj_clean + ' ', pro.upper() + ' ') 61 | replcments[pro.upper()] = ' '.join(obj_clean.split()) # remove redundant spaces 62 | if obj_clean in delex_trg: 63 | delex_trg = delex_trg.replace(obj_clean + ' ', pro.upper() + ' ') 64 | replcments[pro.upper()] = ' '.join(obj_clean.split()) 65 | 66 | # possible enhancement for delexicalisation: 67 | # do delex triple by triple 68 | # now building | location | New_York_City New_York_City | isPartOf | New_York 69 | # is converted to 70 | # BUILDING location ISPARTOF City ISPARTOF City isPartOf ISPARTOF 71 | return delex_src, delex_trg, replcments 72 | 73 | 74 | 
def create_source_target(b, options, dataset, delex=True): 75 | """ 76 | Write target and source files, and reference files for BLEU. 77 | :param b: instance of Benchmark class 78 | :param options: string "delex" or "notdelex" to label files 79 | :param dataset: dataset part: train, dev, test 80 | :param delex: boolean; perform delexicalisation or not 81 | :return: if delex True, return list of replacement dictionaries for each example 82 | """ 83 | source_out = [] 84 | target_out = [] 85 | rplc_list = [] # store the dict of replacements for each example 86 | for entr in b.entries: 87 | tripleset = entr.modifiedtripleset 88 | lexics = entr.lexs 89 | category = entr.category 90 | for lex in lexics: 91 | triples = '' 92 | properties_objects = {} 93 | for triple in tripleset.triples: 94 | triples += triple.s + ' ' + triple.p + ' ' + triple.o + ' ' 95 | properties_objects[triple.p] = triple.o 96 | triples = triples.replace('_', ' ').replace('"', '') 97 | # separate punct signs from text 98 | out_src = ' '.join(re.split('(\W)', triples)) 99 | out_trg = ' '.join(re.split('(\W)', lex.lex)) 100 | if delex: 101 | out_src, out_trg, rplc_dict = delexicalisation(out_src, out_trg, category, properties_objects) 102 | rplc_list.append(rplc_dict) 103 | # delete white spaces 104 | source_out.append(' '.join(out_src.split())) 105 | target_out.append(' '.join(out_trg.split())) 106 | 107 | # shuffle two lists in the same way 108 | random.seed(10) 109 | if delex: 110 | corpus = list(zip(source_out, target_out, rplc_list)) 111 | random.shuffle(corpus) 112 | source_out, target_out, rplc_list = zip(*corpus) 113 | else: 114 | corpus = list(zip(source_out, target_out)) 115 | random.shuffle(corpus) 116 | source_out, target_out = zip(*corpus) 117 | 118 | with open(dataset + '-webnlg-' + options + '.triple', 'w+') as f: 119 | f.write('\n'.join(source_out)) 120 | with open(dataset + '-webnlg-' + options + '.lex', 'w+') as f: 121 | f.write('\n'.join(target_out)) 122 | 123 | # create separate files with references for multi-bleu.pl for dev set 124 | scr_refs = defaultdict(list) 125 | if dataset == 'dev' and not delex: 126 | for src, trg in zip(source_out, target_out): 127 | scr_refs[src].append(trg) 128 | # length of the value with max elements 129 | max_refs = sorted(scr_refs.values(), key=len)[-1] 130 | keys = [key for (key, value) in sorted(scr_refs.items())] 131 | values = [value for (key, value) in sorted(scr_refs.items())] 132 | # write the source file not delex 133 | with open(options + '-source.triple', 'w+') as f: 134 | f.write('\n'.join(keys)) 135 | # write references files 136 | for j in range(0, len(max_refs)): 137 | with open(options + '-reference' + str(j) + '.lex', 'w+') as f: 138 | out = '' 139 | for ref in values: 140 | try: 141 | out += ref[j] + '\n' 142 | except: 143 | out += '\n' 144 | f.write(out) 145 | 146 | return rplc_list 147 | 148 | 149 | def relexicalise(predfile, rplc_list): 150 | """ 151 | Take a file from seq2seq output and write a relexicalised version of it. 
152 | :param rplc_list: list of dictionaries of replacements for each example (UPPER:not delex item) 153 | :return: list of predicted sentences 154 | """ 155 | relex_predictions = [] 156 | with open(predfile, 'r') as f: 157 | predictions = [line for line in f] 158 | for i, pred in enumerate(predictions): 159 | # replace each item in the corresponding example 160 | rplc_dict = rplc_list[i] 161 | relex_pred = pred 162 | for key in sorted(rplc_dict): 163 | relex_pred = relex_pred.replace(key + ' ', rplc_dict[key] + ' ') 164 | relex_predictions.append(relex_pred) 165 | # with open('relexicalised_predictions_full.txt', 'w+') as f: 166 | # f.write(''.join(relex_predictions)) 167 | 168 | # create a mapping between not delex triples and relexicalised sents 169 | with open('dev-webnlg-all-notdelex.triple', 'r') as f: 170 | dev_sources = [line.strip() for line in f] 171 | src_gens = {} 172 | for src, gen in zip(dev_sources, relex_predictions): 173 | src_gens[src] = gen # need only one lex, because they are the same for a given triple 174 | 175 | # write generated sents to a file in the same order as triples are written in the source file 176 | with open('all-notdelex-source.triple', 'r') as f: 177 | triples = [line.strip() for line in f] 178 | with open('relexicalised_predictions.txt', 'w+') as f: 179 | for triple in triples: 180 | f.write(src_gens[triple]) 181 | 182 | return relex_predictions 183 | 184 | 185 | def input_files(path, filepath=None, relex=False): 186 | """ 187 | Read the corpus, write train and dev files. 188 | :param path: directory with the WebNLG benchmark 189 | :param filepath: path to the prediction file with sentences (for relexicalisation) 190 | :param relex: boolean; do relexicalisation or not 191 | :return: 192 | """ 193 | parts = ['train', 'dev'] 194 | options = ['all-delex', 'all-notdelex'] # generate files with/without delexicalisation 195 | for part in parts: 196 | for option in options: 197 | files = select_files(path + part, size=(1, 8)) 198 | b = Benchmark() 199 | b.fill_benchmark(files) 200 | if option == 'all-delex': 201 | rplc_list = create_source_target(b, option, part, delex=True) 202 | print('Total of {} files processed in {} with {} mode'.format(len(files), part, option)) 203 | elif option == 'all-notdelex': 204 | rplc_list = create_source_target(b, option, part, delex=False) 205 | print('Total of {} files processed in {} with {} mode'.format(len(files), part, option)) 206 | if relex and part == 'dev' and option == 'all-delex': 207 | relexicalise(filepath, rplc_list) 208 | print('Files necessary for training/evaluating are written on disc.') 209 | 210 | 211 | def main(argv): 212 | usage = 'usage:\npython3 webnlg_baseline_input.py -i ' \ 213 | '\ndata-directory is the directory where you unzipped the archive with data' 214 | try: 215 | opts, args = getopt.getopt(argv, 'i:', ['inputdir=']) 216 | except getopt.GetoptError: 217 | print(usage) 218 | sys.exit(2) 219 | input_data = False 220 | for opt, arg in opts: 221 | if opt in ('-i', '--inputdir'): 222 | inputdir = arg 223 | input_data = True 224 | else: 225 | print(usage) 226 | sys.exit() 227 | if not input_data: 228 | print(usage) 229 | sys.exit(2) 230 | print('Input directory is ', inputdir) 231 | input_files(inputdir) 232 | 233 | 234 | if __name__ == "__main__": 235 | main(sys.argv[1:]) 236 | -------------------------------------------------------------------------------- /src/datatuner/classification/classify_generated.py: -------------------------------------------------------------------------------- 1 | 
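# Classify and rerank texts generated by the language model. The module-level functions
# below are exposed through python-fire (see the Fire() call at the bottom of this file),
# so a typical invocation is assumed to look roughly like
#   python classify_generated.py generate --in_file=<...>/generated.json --dataset=viggo
# `generate` writes a test.tsv for the consistency classifier, shells out to
# run_classifier.py, and then calls rerank_and_eval, which reorders each example's
# candidates by the predicted semantic-fidelity label (most accurate label first).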
import json 2 | import logging 3 | import os 4 | import sys 5 | from collections import Counter 6 | from pathlib import Path 7 | from shutil import copyfile 8 | from subprocess import run 9 | 10 | import numpy as np 11 | import pandas as pd 12 | from datatuner.classification.consistency_classifier import dataset_fields 13 | from datatuner.lm.metrics import bleu 14 | from fire import Fire 15 | from scipy import stats 16 | from tqdm import tqdm 17 | 18 | logger = logging.getLogger(__name__) 19 | THIS_DIR = Path(os.path.dirname(os.path.realpath(__file__))) 20 | PACKAGE_LOCATION = f"{THIS_DIR}/../../../../" 21 | 22 | 23 | def generate( 24 | in_file, 25 | dataset=None, 26 | out_folder=None, 27 | model_folder=None, 28 | model_type="roberta", 29 | model_name="roberta-large", 30 | python_location=sys.executable, 31 | classifier_script=THIS_DIR / "run_classifier.py", 32 | correct_label="accurate", 33 | text_key=None, 34 | data_key=None, 35 | ): 36 | """Classify data generated from the language model""" 37 | 38 | in_file = Path(in_file) 39 | data_folder = in_file.parent 40 | data = json.load(open(in_file)) 41 | basic_texts = [] 42 | if text_key is None: 43 | text_key = dataset_fields[dataset]["text"] 44 | else: 45 | text_key = text_key.strip() + (" " * len("original_")) 46 | 47 | if data_key is None: 48 | data_key = dataset_fields[dataset]["data"] 49 | 50 | # Prepare data for the classifier 51 | for item in data: 52 | if type(item[text_key]) == list: 53 | for x in item[text_key]: 54 | basic_texts.append( 55 | {"text": x.replace("\n", " "), "data": item[data_key].replace("\n", ";"), "label": correct_label} 56 | ) 57 | elif type(item[data_key]) == list: 58 | for x in item[data_key]: 59 | basic_texts.append( 60 | {"text": item[text_key].replace("\n", " "), "data": x.replace("\n", ";"), "label": correct_label} 61 | ) 62 | 63 | df = pd.DataFrame(basic_texts) 64 | 65 | df.to_csv(data_folder / "test.tsv", sep="|", index=False, columns=["label", "data", "text"]) 66 | 67 | if model_folder is None: 68 | model_folder = f"{PACKAGE_LOCATION}/{dataset}_consistency_roberta-large_lower" 69 | model_folder = Path(model_folder) 70 | 71 | # Run the classifier command 72 | command = ( 73 | f"{python_location} {classifier_script} --task_name mnli --data_dir {data_folder} --stats_dir {data_folder} " 74 | f" --model_name {model_name} --output_dir {model_folder} --model_type {model_type} --do_eval" 75 | f" --overwrite_cache --per_gpu_eval_batch_size 32 --do_lower_case" 76 | ) 77 | 78 | print(command) 79 | run(command, shell=True) 80 | 81 | rerank_and_eval( 82 | in_file, 83 | dataset, 84 | model_folder=model_folder, 85 | out_folder=out_folder, 86 | correct_label=correct_label, 87 | text_key=text_key, 88 | data_key=data_key, 89 | ) 90 | 91 | 92 | def get_stats(data, dataset): 93 | """Get stats about the dataset""" 94 | if dataset == "webnlg": 95 | return json.dumps( 96 | { 97 | "num_triples": Counter(x["num_triples"] for x in data), 98 | "category": Counter(x["category"] for x in data), 99 | "category_type": Counter(x["category_type"] for x in data), 100 | }, 101 | indent=2, 102 | ) 103 | else: 104 | return "" 105 | 106 | 107 | def rerank_and_eval( 108 | in_file, 109 | dataset, 110 | model_folder=None, 111 | out_folder=None, 112 | nbest=100, 113 | correct_label="accurate", 114 | text_key=None, 115 | data_key=None, 116 | ): 117 | """Compute the metrics based on the generated and classified data before and after reranking""" 118 | 119 | in_file = Path(in_file) 120 | if model_folder is None: 121 | model_folder = 
f"{PACKAGE_LOCATION}/{dataset}_consistency_roberta-large_lower" 122 | 123 | model_folder = Path(model_folder) 124 | if text_key is None: 125 | text_key = dataset_fields[dataset]["text"] 126 | if data_key is None: 127 | data_key = dataset_fields[dataset]["data"] 128 | original_keys = [data_key] 129 | data = json.load(open(in_file)) 130 | if out_folder is None: 131 | out_folder = in_file.parent 132 | out_folder = Path(out_folder) 133 | out_folder.mkdir(parents=True, exist_ok=True) 134 | 135 | # This file is produced by the `generate` function above. It will have a dictionary. 136 | # `preds` is mapped to the list of labels predicted 137 | # `preds_prob` is mapped to the list of probabilities corresponding to the labels 138 | results = json.load(open(out_folder / "results.json", "r")) 139 | 140 | # Labels in the order we want to prioritize (correct first, then less severe errors) 141 | sorted_labels = ["accurate", "value_error", "repetition", "omission", "hallucination", "pronoun_error"] 142 | 143 | k = 0 144 | assert len(data) > 0 145 | 146 | for item in tqdm(data): 147 | cand_len = len(item[text_key]) 148 | indices = list(range(cand_len))[:nbest] 149 | item["pred_prob"] = results["preds_prob"][k: k + cand_len][:nbest] 150 | item["pred"] = results["preds"][k: k + cand_len][:nbest] 151 | 152 | current_labels = [sorted_labels.index(x) for x in item["pred"]] 153 | reranked = [ 154 | x 155 | for x in sorted( 156 | list(zip(current_labels, item["pred_prob"], indices, item[text_key][:nbest])), 157 | key=lambda x: (x[0], x[2]), 158 | ) 159 | ] 160 | item["reranked"] = [x[3] for x in reranked] 161 | item["reranked_pred_prob"] = [x[1] for x in reranked] 162 | item["reranked_pred"] = [sorted_labels[x[0]] for x in reranked] 163 | 164 | k += cand_len 165 | 166 | correct_data = [x for x in data if x["pred"] and x["pred"][0] == correct_label and x["pred_prob"][0]] 167 | wrong_data = [x for x in data if (not x["pred"]) or x["pred"][0] != correct_label] 168 | 169 | print("Evaluating") 170 | 171 | out_stats = "" 172 | 173 | try: 174 | out_stats += f"all data: {get_stats(data, dataset)}\n" 175 | 176 | original_key = f"original_{text_key.strip()}" 177 | 178 | # Get stats before reranking 179 | out_stats += f"correct data: {get_stats(correct_data, dataset)}\n" 180 | out_stats += f"wrong data: {get_stats(wrong_data, dataset)}\n" 181 | 182 | out_stats += f"data, text: {bleu(original_key, text_key, data, True, case_insensitive=True, all_keys=original_keys)}\n" 183 | bleu_correct = bleu(original_key, text_key, correct_data, False, case_insensitive=True, all_keys=original_keys) 184 | out_stats += f"correct_data, text: {bleu_correct}\n" 185 | bleu_wrong = bleu(original_key, text_key, wrong_data, False, case_insensitive=True, all_keys=original_keys) 186 | out_stats += f"wrong_data, text: {bleu_wrong}\n" 187 | out_stats += f"percent correct: {len(correct_data) / len(data) * 100}\n" 188 | 189 | r, p = stats.pointbiserialr( 190 | [0] * bleu_correct["count"] + [1] * bleu_wrong["count"], 191 | [bleu_correct["value"]] * bleu_correct["count"] + [bleu_wrong["value"]] * bleu_wrong["count"] 192 | ) 193 | out_stats += f"r: {r}, p-value: {p}\n" 194 | except: 195 | print("Not computing stats for before reranking") 196 | 197 | # Get stats after reranking 198 | correct_data = [x for x in data if x["reranked_pred"] and x["reranked_pred"][0] == correct_label] 199 | wrong_data = [x for x in data if (not x["reranked_pred"]) or x["reranked_pred"][0] != correct_label] 200 | 201 | try: 202 | out_stats += f"correct data: 
{get_stats(correct_data, dataset)}\n" 203 | out_stats += f"wrong data: {get_stats(wrong_data, dataset)}\n" 204 | 205 | out_stats += ( 206 | f"data, reranked: " 207 | f'{bleu(original_key, "reranked", data, True, case_insensitive=True, all_keys=original_keys)}\n' 208 | ) 209 | bleu_correct = bleu( 210 | original_key, "reranked", correct_data, False, case_insensitive=True, all_keys=original_keys 211 | ) 212 | out_stats += f"correct_data, reranked: {bleu_correct}\n" 213 | bleu_wrong = bleu(original_key, "reranked", wrong_data, False, case_insensitive=True, all_keys=original_keys) 214 | out_stats += f"wrong_data, reranked: {bleu_wrong}\n" 215 | out_stats += f"percent correct: {len(correct_data) / len(data) * 100}\n" 216 | 217 | r, p = stats.pointbiserialr( 218 | [0] * bleu_correct["count"] + [1] * bleu_wrong["count"], 219 | [bleu_correct["value"]] * bleu_correct["count"] + [bleu_wrong["value"]] * bleu_wrong["count"], 220 | ) 221 | 222 | out_stats += f"r: {r}, p-value: {p}\n" 223 | 224 | except: 225 | print("Not computing stats for after reranking") 226 | 227 | json.dump(data, open(out_folder / "classified.json", "w"), indent=2) 228 | json.dump(correct_data, open(out_folder / "classified_correct.json", "w"), indent=2) 229 | json.dump(wrong_data, open(out_folder / "classified_wrong.json", "w"), indent=2) 230 | print(out_stats) 231 | (out_folder / "stats.txt").write_text(out_stats) 232 | for item in data: 233 | item[text_key] = item["reranked"] 234 | item["pred_prob"] = item["reranked_pred_prob"] 235 | item["pred"] = item["reranked_pred"] 236 | del item["reranked"] 237 | del item["reranked_pred_prob"] 238 | del item["reranked_pred"] 239 | json.dump(data, open(out_folder / "reranked.json", "w"), indent=2) 240 | 241 | 242 | systems = ["systemFcPost", "systemNoFc", "systemNoFcNoFs"] 243 | datasets = ["ldc", "webnlg", "e2e", "viggo"] 244 | 245 | 246 | def get_semantic_stats(data, folder, system, dataset): 247 | sfc_correct = [item["sfc_correct"] for item in data] 248 | 249 | results = {"sfc_correct": np.mean(sfc_correct)} 250 | if dataset != "ldc": 251 | ser_correct = [item["ser_correct"] for item in data] 252 | ser = [item["ser"] for item in data] 253 | results.update( 254 | { 255 | "ser_correct": np.mean(ser_correct), 256 | "ser": np.mean(ser), 257 | "both_correct": np.mean(np.prod([sfc_correct, ser_correct], axis=0)), 258 | "at_least_one_correct": np.mean(np.logical_or(sfc_correct, ser_correct)), 259 | "sfc_correct_ser_wrong": np.mean( 260 | [int(sfc_correct[i] == 1 and ser_correct[i] == 0) for i in range(len(data))] 261 | ), 262 | "sfc_wrong_ser_correct": np.mean( 263 | [int(sfc_correct[i] == 0 and ser_correct[i] == 1) for i in range(len(data))] 264 | ), 265 | } 266 | ) 267 | 268 | print(results) 269 | out_file = folder / (f"results_{system}.json") 270 | print(f"written to {out_file}") 271 | json.dump(results, open(out_file, "w"), indent=2) 272 | return data 273 | 274 | 275 | if __name__ == "__main__": 276 | Fire() 277 | -------------------------------------------------------------------------------- /src/external/ufal_dsg_tgen/data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Helper data structures 6 | """ 7 | 8 | from builtins import zip 9 | from builtins import str 10 | from builtins import object 11 | import re 12 | 13 | 14 | class DAI(object): 15 | """Simple representation of a single dialogue act item.""" 16 | 17 | __slots__ = ['da_type', 'slot', 'value'] 18 | 19 | def __init__(self, 
da_type, slot=None, value=None): 20 | self.da_type = da_type 21 | self.slot = slot 22 | self.value = value 23 | 24 | def __str__(self): 25 | if self.slot is None: 26 | return self.da_type + '()' 27 | if self.value is None: 28 | return self.da_type + '(' + self.slot + ')' 29 | quote = '\'' if (' ' in self.value or ':' in self.value) else '' 30 | return self.da_type + '(' + self.slot + '=' + quote + self.value + quote + ')' 31 | 32 | def __bytes__(self): 33 | return str(self).encode('ascii', errors='replace') 34 | 35 | def __repr__(self): 36 | return 'DAI.parse("' + str(self) + '")' 37 | 38 | def __hash__(self): 39 | return hash(repr(self)) 40 | 41 | def __eq__(self, other): 42 | return (self.da_type == other.da_type and 43 | self.slot == other.slot and 44 | self.value == other.value) 45 | 46 | def __lt__(self, other): 47 | return (self.da_type < other.da_type or 48 | (self.da_type == other.da_type and self.slot < other.slot) or 49 | (self.da_type == other.da_type and self.slot == other.slot and 50 | self.value < other.value)) 51 | 52 | def __le__(self, other): 53 | return (self.da_type < other.da_type or 54 | (self.da_type == other.da_type and self.slot < other.slot) or 55 | (self.da_type == other.da_type and self.slot == other.slot and 56 | self.value <= other.value)) 57 | 58 | def __ne__(self, other): 59 | return not self == other 60 | 61 | def __gt__(self, other): 62 | return not self <= other 63 | 64 | def __ge__(self, other): 65 | return not self < other 66 | 67 | @staticmethod 68 | def parse(dai_text): 69 | da_type, svp = dai_text[:-1].split('(', 1) 70 | 71 | if not svp: # no slot + value (e.g. 'hello()') 72 | return DAI(da_type) 73 | 74 | if '=' not in svp: # no value (e.g. 'request(to_stop)') 75 | return DAI(da_type, svp) 76 | 77 | slot, value = svp.split('=', 1) 78 | if value.endswith('"#'): # remove special '#' characters in Bagel data (TODO treat right) 79 | value = value[:-1] 80 | if value[0] in ['"', '\'']: # remove quotes 81 | value = value[1:-1] 82 | return DAI(da_type, slot, value) 83 | 84 | 85 | class DA(object): 86 | """Dialogue act -- a list of DAIs with a few special functions for parsing etc..""" 87 | 88 | def __init__(self): 89 | self.dais = [] 90 | 91 | def __getitem__(self, idx): 92 | return self.dais[idx] 93 | 94 | def __setitem__(self, idx, value): 95 | self.dais[idx] = value 96 | 97 | def append(self, value): 98 | self.dais.append(value) 99 | 100 | def __str__(self): 101 | return '&'.join([str(dai) for dai in self.dais]) 102 | 103 | def __bytes__(self): 104 | return str(self).encode('ascii', errors='xmlcharrefreplace') 105 | 106 | def __repr__(self): 107 | return 'DA.parse("' + str(self) + '")' 108 | 109 | def __hash__(self): 110 | return hash(repr(self)) 111 | 112 | def __len__(self): 113 | return len(self.dais) 114 | 115 | def __eq__(self, other): 116 | if not isinstance(other, DA): 117 | return NotImplemented 118 | for self_dai, other_dai in zip(self.dais, other.dais): 119 | if self_dai != other_dai: 120 | return False 121 | return True 122 | 123 | def __ne__(self, other): 124 | return not self == other 125 | 126 | def sort(self): 127 | self.dais.sort() 128 | 129 | @staticmethod 130 | def parse(da_text): 131 | """Parse a DA string into DAIs (DA types, slots, and values).""" 132 | da = DA() 133 | for dai_text in da_text[:-1].split(')&'): 134 | da.append(DAI.parse(dai_text + ')')) 135 | return da 136 | 137 | class TagQuotes(object): 138 | """A helper class for numbering the occurrences of quoted things in the text.""" 139 | def __init__(self): 140 | self.counter 
= 0 141 | 142 | def __call__(self, match): 143 | self.counter += 1 144 | return 'XXXQUOT%d' % self.counter 145 | 146 | @staticmethod 147 | def _protect_quotes(text): 148 | """Find and replace quoted parts of the sentence by tags.""" 149 | tag_pattern = '"[^"]*"|\'[^\']*\'' 150 | tags = re.findall(tag_pattern, text) 151 | sent = re.sub(tag_pattern, DA.TagQuotes(), text) 152 | return sent, tags 153 | 154 | @staticmethod 155 | def parse_cambridge_da(da_text): 156 | """Parse a Cambridge-style DA string a DA object.""" 157 | da = DA() 158 | da_text, quoted = DA._protect_quotes(da_text.strip()) 159 | quoted_num = 1 160 | 161 | for dai_text in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', da_text): 162 | da_type, svps_text = dai_text.groups() 163 | 164 | if not svps_text: # no slots/values (e.g. 'hello()') 165 | da.append(DAI(da_type, None, None)) 166 | continue 167 | 168 | # we have some slots/values – split them into DAI 169 | svps = re.findall('([^,;=\'"]+(?:=(?:[^"\',;]+))?)(?:[,;]|[\'"]$|$)', svps_text) 170 | for svp in svps: 171 | 172 | if '=' not in svp: # no value, e.g. '?request(near)' 173 | da.append(DAI(da_type, svp, None)) 174 | continue 175 | 176 | # we have a value 177 | slot, value = svp.split('=', 1) 178 | if 'XXXQUOT%d' % quoted_num in value: # get back the quoted value 179 | value = re.sub('XXXQUOT%d' % quoted_num, quoted.pop(0), value, count=1) 180 | quoted_num += 1 181 | if re.match(r'^\'.*\'$', value) or re.match('^".*"$', value): 182 | value = value[1:-1] 183 | assert not re.match(r'^\'', value) and not re.match(r'\'$', value) 184 | assert not re.match(r'^"', value) and not re.match(r'"$', value) 185 | 186 | da.append(DAI(da_type, slot, value)) 187 | 188 | return da 189 | 190 | @staticmethod 191 | def parse_diligent_da(da_text): 192 | """Parse a Diligent-style flat MR (E2E NLG dataset) string into a DA object.""" 193 | da = DA() 194 | 195 | for dai_text in re.finditer(r'([a-z_A-Z]+)\[([^\]]*)\]', da_text): 196 | slot, value = dai_text.groups() 197 | slot = re.sub(r'([A-Z])', r'_\1', slot).lower() 198 | da.append(DAI('inform', slot, value if value else None)) 199 | 200 | return da 201 | 202 | @staticmethod 203 | def parse_dict(da_dict, assume_da_type='inform'): 204 | """Parse an attribute-value dict, assuming the given DA type for all resulting DAIs.""" 205 | da = DA() 206 | for slot, values in da_dict.items(): 207 | for value in values.keys(): 208 | da.append(DAI(assume_da_type, slot, value)) 209 | da.sort() 210 | return da 211 | 212 | def value_for_slot(self, slot): 213 | """Return the value for the given slot (None if unset or not present at all). 214 | Uses the first occurrence of this slot if found.""" 215 | for dai in self.dais: 216 | if dai.slot == slot: 217 | return dai.value 218 | return None 219 | 220 | def has_value(self, value): 221 | """If the DA contains the given value, return the corresponding slot; return None 222 | otherwise. Abstracts away from "and" and "or" values (returns True for both coordination 223 | members).""" 224 | for dai in self.dais: 225 | if dai.value == value: 226 | return dai.slot 227 | if (dai.value is not None and 228 | value not in [None, '?'] and 229 | (re.match(r'.* (and|or) ' + value + r'$', dai.value) or 230 | re.match(r'^' + value + r' (and|or) ', dai.value))): 231 | return dai.slot 232 | return None 233 | 234 | def set_value_for_slot(self, slot, value): 235 | """Replace the value of the given slot. Has no effect if the slot is not present 236 | in the DA. 
Will only replace the 1st occurrence of the slot.""" 237 | for dai in self.dais: 238 | if dai.slot == slot: 239 | dai.value = value 240 | break 241 | 242 | def get_delexicalized(self, delex_slots): 243 | """Return a delexicalized copy o fthe current DA (delexicalize slots that are in 244 | the given parameter). 245 | 246 | @param delex_slots: a set of names of slots to be delexicalized 247 | @return: a new DA() object with delexicalized values 248 | """ 249 | ret = DA() 250 | for dai in self: 251 | ret_dai = DAI(dai.da_type, dai.slot, 252 | 'X-' + dai.slot 253 | if (dai.slot in delex_slots and 254 | dai.value not in ['none', None, 'dont_care']) 255 | else dai.value) 256 | ret.append(ret_dai) 257 | return ret 258 | 259 | def to_human_string(self): 260 | """Return a string that is supposedly more human-readable than the standard DA form.""" 261 | out = '' 262 | cur_dat = None 263 | for dai in self: 264 | if dai.da_type != cur_dat: 265 | out += ('; ' if out else '') + dai.da_type.upper() 266 | cur_dat = dai.da_type 267 | if dai.slot: 268 | out += ': ' 269 | elif dai.slot: 270 | out += ', ' 271 | if dai.slot: 272 | out += dai.slot 273 | if dai.value: 274 | out += ' = ' + dai.value 275 | return out 276 | 277 | def to_cambridge_da_string(self): 278 | """Convert to Cambridge-style DA string (opposite of parse_cambridge_da).""" 279 | out = '' 280 | cur_dat = None 281 | for dai in self: 282 | if dai.da_type != cur_dat: 283 | out += (')&' if out else '') + dai.da_type + '(' 284 | cur_dat = dai.da_type 285 | elif dai.slot: 286 | out += ',' 287 | if dai.slot: 288 | out += dai.slot 289 | if dai.value: 290 | quote = '\'' if (' ' in dai.value or ':' in dai.value) else '' 291 | out += '=' + quote + dai.value + quote 292 | out += ')' if out else '' 293 | return out 294 | 295 | def to_diligent_da_string(self): 296 | """Convert to Diligent E2E dataset flat MR string (opposite of parse_diligent_da). 297 | Note that all DA type information is lost.""" 298 | # return slot names to camel case 299 | return ', '.join([re.sub(r'_([a-z])', lambda pat: pat.group(1).upper(), dai.slot) 300 | + '[' + dai.value + ']' for dai in self]) 301 | 302 | 303 | class Abst(object): 304 | """Simple representation of a single abstraction/delexicalization instruction.""" 305 | 306 | __slots__ = ['slot', 'value', 'surface_form', 'start', 'end'] 307 | 308 | def __init__(self, slot=None, value=None, surface_form=None, start=None, end=None): 309 | self.slot = slot 310 | self.value = value 311 | self.surface_form = surface_form 312 | self.start = start 313 | self.end = end 314 | if self.start is not None and self.end is None: 315 | self.end = self.start + 1 316 | 317 | def __str__(self): 318 | """Create string representation of the abstraction instruction, in the following format: 319 | slot="value":"surface_form":start-end. 
Surface form is omitted if None, quotes are omitted 320 | if not needed.""" 321 | # prepare quoting 322 | quote_value = '"' if ' ' in self.value or ':' in self.value else '' 323 | if self.surface_form is not None: 324 | quote_sf = '"' if ' ' in self.surface_form or ':' in self.surface_form else '' 325 | # create output 326 | out = self.slot + '=' + quote_value + self.value + quote_value + ':' 327 | if self.surface_form is not None: 328 | out += quote_sf + self.surface_form + quote_sf + ':' 329 | out += str(self.start) + '-' + str(self.end) 330 | return out 331 | 332 | def __bytes__(self): 333 | return str(self).encode('ascii', errors='xmlcharrefreplace') 334 | 335 | def __repr__(self): 336 | return 'Abst.parse("' + str(self) + '")' 337 | 338 | @staticmethod 339 | def parse(abst_str): 340 | """Create the abstraction instruction from a string representation, in the following 341 | format: slot="value":"surface_form":start-end. Here, surface form is optional and value 342 | and surface form do not need to be enquoted if they do not contain colons or spaces. 343 | @param abst_str: string representation of the abstraction instruction 344 | @return: Abst object representing the abstraction instruction 345 | """ 346 | slot, rest = abst_str.split('=', 1) 347 | if rest.startswith('"'): 348 | value, rest = re.split(r'(?