├── AP_sampling
│ ├── .README.md.swn
│ ├── README.md
│ ├── bin
│ │ ├── .finetune_ALM.sh.swp
│ │ ├── collect_top_prob.sh
│ │ ├── collect_top_prob_Qwen_4b.sh
│ │ ├── collect_top_prob_opt.sh
│ │ ├── continue_story_prompt_loop.sh
│ │ ├── continue_wiki_prompt_loop_eval.sh
│ │ └── finetune_ALM.sh
│ ├── imgs
│ │ ├── APD_first_figure.png
│ │ └── Results.png
│ ├── requirements.txt
│ └── src
│   ├── .collect_top_prob.py.swp
│   ├── .ipynb_checkpoints
│   │ └── entropy_prediction_visualization-checkpoint.ipynb
│   ├── .model_mlp_logit.py.swo
│   ├── .prepare_gpt2_id_corpus_from_raw.py.swp
│   ├── QA
│   │ ├── __pycache__
│   │ │ ├── online_utils.cpython-310.pyc
│   │ │ └── online_utils.cpython-38.pyc
│   │ ├── analyze_results.py
│   │ ├── analyze_results_online_all.py
│   │ ├── dataset_preparation
│   │ │ ├── prepare_arc_dataset.py
│   │ │ ├── prepare_commonqa_dataset.py
│   │ │ ├── prepare_lambda_dataset.py
│   │ │ ├── prepare_multirc_dataset.py
│   │ │ ├── prepare_qasc_dataset.py
│   │ │ ├── prepare_socialiqa_dataset.py
│   │ │ └── prepare_squad_dataset.py
│   │ ├── online_utils.py
│   │ ├── test_neg_dataset.py
│   │ ├── test_neg_dataset_online_all.py
│   │ ├── test_squad_dataset.py
│   │ └── test_squad_dataset_online_all.py
│   ├── __pycache__
│   │ ├── configuration_openelm.cpython-38.pyc
│   │ ├── data_utils.cpython-310.pyc
│   │ ├── data_utils.cpython-38.pyc
│   │ ├── data_utils.cpython-39.pyc
│   │ ├── model_mlp_logit.cpython-310.pyc
│   │ ├── model_mlp_logit.cpython-37.pyc
│   │ ├── model_mlp_logit.cpython-38.pyc
│   │ ├── model_mlp_logit.cpython-39.pyc
│   │ ├── modeling_openelm.cpython-38.pyc
│   │ ├── train_logits_prediction_model.cpython-310.pyc
│   │ └── train_logits_prediction_model.cpython-38.pyc
│   ├── collect_top_prob.py
│   ├── data_utils.py
│   ├── entropy_prediction_visualization.ipynb
│   ├── example_APD_REAL.py
│   ├── factual_gen
│   │ ├── __pycache__
│   │ │ ├── sampling_method.cpython-310.pyc
│   │ │ ├── sampling_method.cpython-37.pyc
│   │ │ ├── sampling_method.cpython-38.pyc
│   │ │ └── sampling_method.cpython-39.pyc
│   │ ├── gen_fp.py
│   │ ├── prepare_wiki_MTurk.py
│   │ ├── print_all_results.py
│   │ └── sampling_method.py
│   ├── model_mlp_logit.py
│   ├── story_gen
│   │ ├── collect_res_gpt_eval.py
│   │ ├── comp_collect_GPT_results.py
│   │ ├── comp_prompt_GPT.py
│   │ ├── eval_gen.py
│   │ ├── prepare_story_prompt.py
│   │ └── print_story_results.py
│   └── train_logits_prediction_model.py
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── FactualityPrompt
│ ├── Dockerfile
│ ├── LICENSE
│ ├── README.md
│ ├── bin
│ │ ├── eval.sh
│ │ ├── eval_loop.sh
│ │ └── eval_story_div_loop.sh
│ ├── fever_athene
│ │ ├── .env
│ │ ├── .gitignore
│ │ ├── Dockerfile
│ │ ├── LICENSE.txt
│ │ ├── NOTICE.txt
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── conf
│ │ │ └── config_no_attention_nodrop_glove_only.json
│ │ ├── predict.sh
│ │ ├── requirements.txt
│ │ ├── server.sh
│ │ ├── src
│ │ │ ├── __init__.py
│ │ │ ├── athene
│ │ │ │ ├── __init__.py
│ │ │ │ ├── retrieval
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── document
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── doc_retrieval.py
│ │ │ │ │ │ ├── doc_retrieval_np_sub.py
│ │ │ │ │ │ └── docment_retrieval.py
│ │ │ │ │ ├── score
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── score.py
│ │ │ │ │ └── sentences
│ │ │ │ │   ├── __init__.py
│ │ │ │ │   ├── data_processing
│ │ │ │ │   │ ├── __init__.py
│ │ │ │ │   │ ├── data.py
│ │ │ │ │   │ ├── elmo_data.py
│ │ │ │ │   │ └── sentence_loader.py
│ │ │ │ │   ├── deep_models
│ │ │ │ │   │ ├── ESIM.py
│ │ │ │ │   │ ├── ESIMandELMO.py
│ │ │ │ │   │ └── __init__.py
│ │ │ │ │   ├── ensemble.py
│ │ │ │ │   └── sentence_retrieval.py
│ │ │ │ ├── rte
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── deep_models
│ │ │ │ │ │ ├── BaseDeepModel.py
│ │ │ │ │ │ ├── BiLSTM.py
│ │ │ │ │ │ ├── ESIM_for_ensemble.py
│ │ │ │ │ │ ├── ESIM_for_ensemble_glove_only_no_attention.py
│ │ │ │ │ │ ├── LSTM.py
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── convert_use.py
│ │ │ │ │ │ └── copy_graph.py
│ │ │ │ │ └── utils
│ │ │ │ │   ├── __init__.py
│ │ │ │ │   ├── customized_votingclassifier.py
│ │ │ │ │   ├── data_reader.py
│ │ │ │ │   ├── dataset.py
│ │ │ │ │   ├── estimator_definitions.py
│ │ │ │ │   ├── fill_gold_sentences.py
│ │ │ │ │   ├── score.py
│ │ │ │ │   └── text_processing.py
│ │ │ │ ├── system.py
│ │ │ │ └── utils
│ │ │ │   ├── __init__.py
│ │ │ │   └── config.py
│ │ │ ├── common
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataset
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── block.py
│ │ │ │ │ ├── corpus.py
│ │ │ │ │ ├── data_set.py
│ │ │ │ │ ├── formatter.py
│ │ │ │ │ ├── label_schema.py
│ │ │ │ │ ├── persistence
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── engine.py
│ │ │ │ │ │ ├── page.py
│ │ │ │ │ │ └── session.py
│ │ │ │ │ ├── reader.py
│ │ │ │ │ ├── reverse_index.py
│ │ │ │ │ └── s3
│ │ │ │ │   ├── __init__.py
│ │ │ │ │   ├── index.py
│ │ │ │ │   └── iterator.py
│ │ │ │ ├── framework
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── task.py
│ │ │ │ ├── training
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── batcher.py
│ │ │ │ │ ├── early_stopping.py
│ │ │ │ │ ├── options.py
│ │ │ │ │ └── run.py
│ │ │ │ └── util
│ │ │ │   ├── __init__.py
│ │ │ │   ├── array.py
│ │ │ │   ├── log_helper.py
│ │ │ │   └── random.py
│ │ │ ├── rename.py
│ │ │ ├── retrieval
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bidaf.py
│ │ │ │ ├── fever_doc_db.py
│ │ │ │ ├── filter_lists.py
│ │ │ │ ├── filter_uninformative.py
│ │ │ │ ├── reader.py
│ │ │ │ ├── sent_features.py
│ │ │ │ ├── sentence.py
│ │ │ │ └── snopes_doc_db.py
│ │ │ └── scripts
│ │ │   ├── __init__.py
│ │ │   ├── athene
│ │ │   │ ├── __init__.py
│ │ │   │ ├── export_current_config_to_json.py
│ │ │   │ ├── pipeline.py
│ │ │   │ ├── replace_noise_dataset.py
│ │ │   │ ├── replace_noise_dataset_with_scores.py
│ │ │   │ ├── rte.py
│ │ │   │ ├── rte_fasttext.py
│ │ │   │ └── sort_submission.py
│ │ │   ├── build_db.py
│ │ │   ├── build_db_kilt.py
│ │ │   ├── prepare_submission.py
│ │ │   └── score.py
│ │ └── tests
│ │   └── test_load_models.py
│ ├── prompts
│ │ ├── fever_factual_1000_final.jsonl
│ │ ├── fever_factual_100_final.jsonl
│ │ ├── fever_factual_final.jsonl
│ │ ├── fever_factual_test7k_final.jsonl
│ │ ├── fever_nonfactual_1000_final.jsonl
│ │ ├── fever_nonfactual_100_final.jsonl
│ │ ├── fever_nonfactual_final.jsonl
│ │ └── fever_nonfactual_test7k_final.jsonl
│ ├── setup.sh
│ └── src
│   ├── __init__.py
│   ├── __pycache__
│   │ ├── __init__.cpython-310.pyc
│   │ ├── __init__.cpython-38.pyc
│   │ ├── __init__.cpython-39.pyc
│   │ ├── claim_handling.cpython-38.pyc
│   │ ├── const.cpython-310.pyc
│   │ ├── const.cpython-38.pyc
│   │ ├── const.cpython-39.pyc
│   │ ├── factuality_metric.cpython-38.pyc
│   │ ├── metric.cpython-38.pyc
│   │ ├── retriever.cpython-38.pyc
│   │ └── retriever.cpython-39.pyc
│   ├── claim_handling.py
│   ├── const.py
│   ├── distinct_n.py
│   ├── distinct_n_each_gen.py
│   ├── evaluate_v3_final.py
│   ├── evaluate_v3_final_all.py
│   ├── factuality_metric.py
│   ├── preprocess_data_megatron_lm.py
│   ├── repetition.py
│   └── retriever.py
├── LICENSE
├── README.md
├── REAL_sampling
│ ├── .DS_Store
│ ├── README.md
│ ├── bin
│ │ ├── continue_wiki_prompt_loop.sh
│ │ └── train_THF_model.sh
│ ├── imgs
│ │ └── REAL_second_figure.png
│ ├── requirements.txt
│ └── src
│   ├── __pycache__
│   │ ├── data_utils.cpython-310.pyc
│   │ ├── data_utils.cpython-37.pyc
│   │ ├── data_utils.cpython-38.pyc
│   │ ├── model.cpython-310.pyc
│   │ ├── model.cpython-37.pyc
│   │ ├── model.cpython-38.pyc
│   │ ├── model.cpython-39.pyc
│   │ ├── train_entropy_prediction_model.cpython-37.pyc
│   │ └── train_entropy_prediction_model.cpython-38.pyc
│   ├── analyze_datasets
│   │ ├── __pycache__
│   │ │ ├── utils.cpython-37.pyc
│   │ │ └── utils.cpython-38.pyc
│   │ ├── feature_clf_Hades.py
│   │ ├── feature_clf_Halu.py
│   │ ├── feature_clf_all.py
│   │ ├── feature_clf_factor.py
│   │ ├── feature_clf_state.py
│   │ └── utils.py
│   ├── collect_avg_RE.py
│   ├── collect_gt_entropy.py
│   ├── collect_gt_perplexity.py
│   ├── colorize.html
│   ├── data_utils.py
│   ├── entropy_prediction_visualization.ipynb
│   ├── entropy_visualization.ipynb
│   ├── example.py
│   ├── factual_gen
│   │ ├── .gen_fp.py.swp
│   │ ├── __pycache__
│   │ │ ├── sampling_method.cpython-310.pyc
│   │ │ ├── sampling_method.cpython-38.pyc
│   │ │ └── sampling_method.cpython-39.pyc
│   │ ├── collect_GPT_results.py
│   │ ├── collect_res_gpt_eval.py
│   │ ├── comp_collect_GPT_results.py
│   │ ├── comp_prompt_GPT.py
│   │ ├── gen_fp.py
│   │ ├── prepare_story_MTurk.py
│   │ ├── prepare_story_prompt.py
│   │ ├── prepare_wiki_MTurk.py
│   │ ├── print_all_results.py
│   │ ├── prompt_GPT.py
│   │ └── sampling_method.py
│   ├── model.py
│   ├── prepare_id_corpus_from_raw.py
│   ├── process_hallucination_dataset
│   │ ├── Hades
│   │ │ ├── data_loader.py
│   │ │ └── utils.py
│   │ ├── __pycache__
│   │ │ ├── compute_ent_features.cpython-310.pyc
│   │ │ ├── compute_ent_features.cpython-37.pyc
│   │ │ ├── compute_ent_features.cpython-38.pyc
│   │ │ ├── data_classes.cpython-310.pyc
│   │ │ ├── data_classes.cpython-37.pyc
│   │ │ └── data_classes.cpython-38.pyc
│   │ ├── compute_ent_features.py
│   │ ├── concat_category_csv.py
│   │ ├── convert_humor_dataset.py
│   │ ├── data_classes.py
│   │ ├── get_entropy_HaDes_span.py
│   │ ├── get_entropy_Halu.py
│   │ ├── get_entropy_all.py
│   │ ├── get_entropy_factor.py
│   │ ├── get_entropy_state.py
│   │ ├── split_csv_datasets.py
│   │ ├── split_data.sh
│   │ └── unify_Halu_datasets_format.py
│   ├── simple_exp.ipynb
│   └── train_entropy_prediction_model.py
└── THIRD_PARTY_LICENSES
/AP_sampling/.README.md.swn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/.README.md.swn
--------------------------------------------------------------------------------
/AP_sampling/README.md:
--------------------------------------------------------------------------------
1 | # Explaining and Improving Contrastive Decoding by Extrapolating the Probabilities of a Huge and Hypothetical LM
2 |
3 |

4 |
5 | ## Introduction
6 |
7 | To overcome the limitations of contrastive decoding (CD), we propose a new unsupervised decoding method called **A**symptotic **P**robability **D**ecoding (APD). APD explicitly extrapolates the probability curves from LMs of different sizes to infer the asymptotic probabilities of an infinitely large LM, without incurring more inference cost than CD. On FactualityPrompts, an open-ended text generation benchmark, sampling with APD significantly boosts factuality in comparison to CD sampling and its variants, and achieves state-of-the-art results for Pythia 6.9B and OPT 6.7B. Furthermore, on five commonsense QA datasets, APD is often significantly better than CD and achieves an effect similar to using a larger LLM. For example, the perplexity of APD on top of Pythia 6.9B is even lower than the perplexity of Pythia 12B on CommonsenseQA and LAMBADA.
8 |
9 |
10 | ## Computational Environment
11 |
12 | You can reproduce our Python environment using
13 | ```
14 | conda create --name <env_name> --file requirements.txt
15 | ```
16 | Most of the code can also be run with older versions of Hugging Face Transformers (e.g., the versions in REAL_sampling/requirements.txt), except for running the Qwen LLM.
17 |
18 | ## How to run APD
19 |
20 | To learn how to use APD and/or REAL sampling with Hugging Face Transformers, please see the following example code (a condensed sketch is also included at the end of this README):
21 |
22 | ```
23 | ./src/example_APD_REAL.py
24 | ```
25 |
26 | ### Run FactualityPrompts
27 |
28 | To evaluate the generation results, first follow ../FactualityPrompt/README.md to download the data and change ../FactualityPrompt/src/const.py accordingly; then run the script below.
29 |
30 | If you have more than 7 GPUs on your machine, you can just run the following file to generate the continuations.
31 | ```
32 | ./bin/continue_wiki_prompt_loop_eval.sh
33 | ```
34 |
35 | ### Run Question Answering Datasets
36 |
37 | Step 1: Run the dataset download scripts in src/QA/dataset_preparation (for ARC, we concatenate the Easy and Challenge JSON outputs).
38 |
39 | Step 2: Test APD models on the datasets. For datasets with only positive answers (e.g., LAMBADA, SQuAD, and MultiRC), use src/QA/test_squad_dataset.py. For datasets with negative answers (e.g., QASC, ARC, SocialIQA, and CommonsenseQA), use src/QA/test_neg_dataset.py. If you also want to test the on-the-fly APD baseline, use test_squad_dataset_online_all.py and test_neg_dataset_online_all.py instead. Remember to change the paths in each file accordingly.
40 |
41 | Step 3: Run analyze_results.py or analyze_results_online_all.py to collect the results. For datasets that have negative answers and an accuracy metric, set have_acc to 1.
42 |
43 |
44 | ## How to Train ALM' (in order to use APD)
45 |
46 | Put your text file into "data/raw/".
47 |
48 | Change the INPUT_FILE, data_folder_name, and OUTPUT_MODEL_FOLDER in bin/finetune_ALM.sh and run it (assuming you have more than 7 GPUs on your machine).
49 |
50 | Notice that our current implementation first saves the probabilities and logits of the top tokens from the various LLMs into a cache, which takes a lot of disk space.
51 | Loading these probabilities also requires a lot of CPU memory. For example, after processing ~270M of Wikipedia text using 5 OPT models, we store 70GB of tensors plus a 52GB dataset cache, and our server has around 750GB of CPU memory.
52 |
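53 | ## Minimal APD Example
54 | 
55 | The following is a condensed sketch of src/example_APD_REAL.py (see that file for the other sampling options); the ALM' checkpoint path below is a placeholder that should point to a model trained with bin/finetune_ALM.sh.
56 | 
57 | ```
58 | import sys
59 | sys.path.append('./src/factual_gen/')
60 | from sampling_method import APTopPALogitsWarper, LogitsProcessorList
61 | from transformers import AutoModelForCausalLM, AutoTokenizer
62 | import torch
63 | 
64 | device = torch.device("cuda:0")
65 | LM_gen = 'EleutherAI/pythia-6.9b-deduped'
66 | # Placeholder: point this to your own fine-tuned ALM' checkpoint
67 | ALM_path = 'models/prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4'
68 | 
69 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024)
70 | tokenizer.pad_token = tokenizer.eos_token
71 | model = AutoModelForCausalLM.from_pretrained(LM_gen).eval().to(device)
72 | 
73 | # APD as a logits processor: extrapolate the asymptotic probabilities of an
74 | # infinitely large LM and sample from their top-p set
75 | logits_processor = LogitsProcessorList()
76 | logits_processor.append(APTopPALogitsWarper(top_p=0.6, student_model_name=ALM_path,
77 |     device=device, use_alpha=False, temperature=1, top_k=20, use_log_softmax=True))
78 | 
79 | input_ids = tokenizer(" I like to go hiking.", return_tensors="pt").input_ids.to(device)
80 | output = model.generate(input_ids=input_ids, pad_token_id=tokenizer.eos_token_id,
81 |     logits_processor=logits_processor, do_sample=True)
82 | print(tokenizer.decode(output[0, input_ids.size(-1):], skip_special_tokens=True))
83 | ```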
--------------------------------------------------------------------------------
/AP_sampling/bin/.finetune_ALM.sh.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/bin/.finetune_ALM.sh.swp
--------------------------------------------------------------------------------
/AP_sampling/bin/collect_top_prob.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #top_k=10
3 | bptt=1024
4 |
5 | #data_folder_name="wiki2021_1e4_Pythia"
6 | #data_folder_name="ROC_gen_1000_p095_Pythia"
7 | #data_folder_name="news_gen_1000_p095_Pythia"
8 | #data_folder_name="wp_gen_1000_p095_Pythia"
9 | #data_folder_name="wiki2021_1e6_Pythia"
10 | data_folder_name="wiki2021_5e6_Pythia"
11 | #data_folder_name="ROC_spring_Pythia"
12 | #data_folder_name="wikinews_Pythia"
13 | #data_folder_name="wp_5000_Pythia"
14 | #data_folder_name="wp_20000_Pythia"
15 | #data_folder_name="wiki2021_1e5_Pythia"
16 |
17 | #top_k="10"
18 | #sampling_methods="10_20"
19 |
20 | top_k="20,5,10"
21 | sampling_methods="0_20,20_100,100_inf"
22 | #top_k="20,20,20"
23 | #sampling_methods="0_20,20_100,100_inf"
24 |
25 | top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped"
26 | output_folder="data/processed/$data_folder_name/prob_tensor_${bptt}_ext2"
27 | #output_folder="data/processed/$data_folder_name/prob_tensor_${bptt}_ext3"
28 | #input_folder_name="../true_entropy/data/processed/$data_folder_name"
29 | input_folder_name="data/processed/$data_folder_name"
30 |
31 | declare -a bsz_arr=(2 4 4 8 12 16)
32 | declare -a model_arr=("EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1.4b-deduped" "EleutherAI/pythia-1b-deduped" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-160m-deduped" "EleutherAI/pythia-70m-deduped" )
33 |
34 | model_name="EleutherAI/pythia-6.9b-deduped"
35 | batch_size=1
36 | cuda_init=0
37 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
38 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt
39 |
40 | pids=()
41 |
42 | for i in "${!model_arr[@]}";
43 | do
44 | model_name=${model_arr[$i]}
45 | batch_size=${bsz_arr[$i]}
46 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
47 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt &
48 | pids+=($!)
49 | done
50 | echo "${pids[@]}"
51 |
52 |
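53 | # Note: the 6.9B reference model runs first in the foreground on GPU 0; the six
54 | # smaller Pythia models are then launched in parallel, one per GPU (cuda_idx
55 | # 0-5). Unlike bin/finetune_ALM.sh, this script does not `wait` on the
56 | # background jobs, so the shell returns before the collection finishes.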
--------------------------------------------------------------------------------
/AP_sampling/bin/collect_top_prob_Qwen_4b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #bptt=1024
3 | bptt=128
4 |
5 | #data_folder_name="ROC_gen_1000_p095_OPT"
6 | #data_folder_name="news_gen_1000_p095_OPT"
7 | #data_folder_name="wp_gen_1000_p095_OPT"
8 | #data_folder_name="openwebtext_2017_18_1e5_OPT"
9 | #data_folder_name="wiki2021_1e6_OPT"
10 | data_folder_name="wiki2021_1e6_Qwen"
11 | #data_folder_name="wiki2021_5e6_OPT"
12 | #data_folder_name="ROC_spring_OPT"
13 | #data_folder_name="wikinews_OPT"
14 | #data_folder_name="wp_5000_OPT"
15 | #data_folder_name="wp_20000_OPT"
16 | #data_folder_name="wiki2021_1e5_OPT"
17 |
18 | #top_k="10"
19 | #sampling_methods="10_20"
20 | top_k="20,5,10"
21 | sampling_methods="0_20,20_100,100_inf"
22 |
23 | #top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped"
24 | #top_w_idx_model_name="facebook/opt-6.7b"
25 | top_w_idx_model_name="Qwen/Qwen1.5-4b"
26 | #top_w_idx_model_name="Qwen/Qwen1.5-4b-Chat"
27 | #output_folder="data/processed/$data_folder_name/prob_opt_tensor_$bptt"
28 | output_folder="data/processed/$data_folder_name/prob_Qwen_4b_tensor_${bptt}_new"
29 | #output_folder="data/processed/$data_folder_name/prob_Qwen_4b-Chat_tensor_${bptt}_new"
30 | #input_folder_name="../true_entropy/data/processed/$data_folder_name"
31 | input_folder_name="data/processed/$data_folder_name"
32 |
33 | declare -a bsz_arr=(4 8)
34 | declare -a model_arr=("Qwen/Qwen1.5-1.8b" "Qwen/Qwen1.5-0.5b" )
35 | #declare -a model_arr=("Qwen/Qwen1.5-1.8b-Chat" "Qwen/Qwen1.5-0.5b-Chat" )
36 |
37 | model_name="Qwen/Qwen1.5-4b"
38 | #model_name="Qwen/Qwen1.5-4b-Chat"
39 | batch_size=2
40 | cuda_init=0
41 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
42 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt
43 |
44 | pids=()
45 |
46 | for i in "${!model_arr[@]}";
47 | do
48 | model_name=${model_arr[$i]}
49 | batch_size=${bsz_arr[$i]}
50 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
51 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt &
52 | pids+=($!)
53 | done
54 | echo "${pids[@]}"
55 |
56 |
--------------------------------------------------------------------------------
/AP_sampling/bin/collect_top_prob_opt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | bptt=1024
3 |
4 | #data_folder_name="ROC_gen_1000_p095_OPT"
5 | #data_folder_name="news_gen_1000_p095_OPT"
6 | #data_folder_name="wp_gen_1000_p095_OPT"
7 | #data_folder_name="openwebtext_2017_18_1e5_OPT"
8 | #data_folder_name="wiki2021_1e6_OPT"
9 | data_folder_name="wiki2021_5e6_OPT"
10 | #data_folder_name="ROC_spring_OPT"
11 | #data_folder_name="wikinews_OPT"
12 | #data_folder_name="wp_5000_OPT"
13 | #data_folder_name="wp_20000_OPT"
14 | #data_folder_name="wiki2021_1e5_OPT"
15 |
16 | #top_k="10"
17 | #sampling_methods="10_20"
18 | top_k="20,5,10"
19 | sampling_methods="0_20,20_100,100_inf"
20 |
21 | #top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped"
22 | top_w_idx_model_name="facebook/opt-6.7b"
23 | #output_folder="data/processed/$data_folder_name/prob_opt_tensor_$bptt"
24 | output_folder="data/processed/$data_folder_name/prob_opt_tensor_${bptt}_new"
25 | #input_folder_name="../true_entropy/data/processed/$data_folder_name"
26 | input_folder_name="data/processed/$data_folder_name"
27 |
28 | declare -a bsz_arr=(2 4 8 16)
29 | declare -a model_arr=("facebook/opt-2.7b" "facebook/opt-1.3b" "facebook/opt-350m" "facebook/opt-125m" )
30 |
31 | model_name="facebook/opt-6.7b"
32 | batch_size=1
33 | cuda_init=0
34 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
35 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt
36 |
37 | pids=()
38 |
39 | for i in "${!model_arr[@]}";
40 | do
41 | model_name=${model_arr[$i]}
42 | batch_size=${bsz_arr[$i]}
43 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
44 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt &
45 | pids+=($!)
46 | done
47 | echo "${pids[@]}"
48 |
49 |
--------------------------------------------------------------------------------
/AP_sampling/bin/finetune_ALM.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | bptt=1024
4 |
5 | #INPUT_FILE="data/raw/wiki2021_text_only_1e4"
6 | #data_folder_name="wiki2021_1e4_Pythia"
7 | #OUTPUT_MODEL_FOLDER="models/prob_wiki_ext2_1e4_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4"
8 | INPUT_FILE="data/raw/wiki2021_text_only_1e6"
9 | data_folder_name="wiki2021_1e6_Pythia"
10 | OUTPUT_MODEL_FOLDER="models/prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4"
11 |
12 | PROC_FOLDER="data/processed/$data_folder_name"
13 | output_folder="data/processed/$data_folder_name/prob_tensor_${bptt}"
14 | TOKENIZER="EleutherAI/pythia-70m-deduped"
15 |
16 | top_k="20,5,5"
17 | sampling_methods="0_20,20_100,100_inf"
18 |
19 | top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped"
20 |
21 |
22 | declare -a bsz_arr=(2 4 4 8 12 16)
23 | declare -a model_arr=("EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1.4b-deduped" "EleutherAI/pythia-1b-deduped" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-160m-deduped" "EleutherAI/pythia-70m-deduped" )
24 |
25 | model_name="EleutherAI/pythia-6.9b-deduped"
26 | batch_size=1
27 | cuda_init=0
28 |
29 | echo "python ../REAL_sampling/src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER"
30 | python ../REAL_sampling/src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER
31 |
32 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
33 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt
34 |
35 | pids=()
36 |
37 | for i in "${!model_arr[@]}";
38 | do
39 | model_name=${model_arr[$i]}
40 | batch_size=${bsz_arr[$i]}
41 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt"
42 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt &
43 | pids+=($!)
44 | done
45 | echo "${pids[@]}"
46 | wait "${pids[@]}"
47 |
48 |
49 | echo "python src/train_logits_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $output_folder/train --validation_label_folder $output_folder/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 500 --num_train_epochs 5 --learning_rate 1e-4 --logit_reg_w 0.8 --file_suffix _${sampling_methods}_k_${top_k}_bptt_${bptt}.pt"
50 | python src/train_logits_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $output_folder/train --validation_label_folder $output_folder/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 500 --num_train_epochs 5 --learning_rate 1e-4 --logit_reg_w 0.8 --file_suffix _${sampling_methods}_k_${top_k}_bptt_${bptt}.pt
51 |
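52 | # Pipeline summary: (1) prepare_id_corpus_from_raw.py tokenizes the raw text
53 | # file into id tensors; (2) collect_top_prob.py caches the top-token
54 | # probabilities/logits of the 6.9B model and the six smaller Pythia models
55 | # (launched in parallel, one GPU each); (3) train_logits_prediction_model.py
56 | # fine-tunes the 70M model (ALM') on the cached probabilities.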
--------------------------------------------------------------------------------
/AP_sampling/imgs/APD_first_figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/imgs/APD_first_figure.png
--------------------------------------------------------------------------------
/AP_sampling/imgs/Results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/imgs/Results.png
--------------------------------------------------------------------------------
/AP_sampling/src/.collect_top_prob.py.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/.collect_top_prob.py.swp
--------------------------------------------------------------------------------
/AP_sampling/src/.model_mlp_logit.py.swo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/.model_mlp_logit.py.swo
--------------------------------------------------------------------------------
/AP_sampling/src/.prepare_gpt2_id_corpus_from_raw.py.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/.prepare_gpt2_id_corpus_from_raw.py.swp
--------------------------------------------------------------------------------
/AP_sampling/src/QA/__pycache__/online_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/QA/__pycache__/online_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/QA/__pycache__/online_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/QA/__pycache__/online_utils.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_arc_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 |
6 | #dataset_config = {'name': 'allenai/ai2_arc', 'subset': 'ARC-Challenge', 'train_name': 'train' }
7 | dataset_config = {'name': 'allenai/ai2_arc', 'subset': 'ARC-Easy', 'train_name': 'train' }
8 |
9 | #validation
10 | #output_f_name = './outputs/arc/arc_challenge_train.json'
11 | #output_f_name = './outputs/arc/arc_easy_train.json'
12 | #output_f_name = './outputs/arc/arc_neg_challenge_train.json'
13 | output_f_name = './outputs/arc/arc_neg_easy_train.json'
14 |
15 | dataset = datasets.load_dataset(dataset_config['name'], dataset_config['subset'] )
16 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
17 |
18 | print(len(df_train))
19 |
20 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
21 | example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
22 |
23 | with open(output_f_name, 'w') as f_out:
24 | for index, row in df_train.iterrows():
25 | #ans_dict = ast.literal_eval(row['answers'])
26 | #ans_dict = json.loads(row['answers'])
27 | ans = row['choices']['text'][ row['choices']['label'].index(row['answerKey']) ]
28 | #print(ans_str)
29 | #ans_dict = json.loads(ans_str)
30 | q = row['question']
31 | all_ans_raw = row['choices']['text'].copy()
32 | all_ans_raw.remove(ans)
33 | all_ans = [ans] + all_ans_raw
34 | all_ans = [' ' + x for x in all_ans]
35 | assert len(row['choices']['text']) == len(all_ans)
36 | #q = q[0].upper() + q[1:]
37 | prompt = example + ' Question: ' + q + '\n Answer:'
38 | f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
39 |
40 |
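41 | # Example of one (hypothetical) output line: the gold answer comes first in
42 | # `all_ans`, followed by the remaining choices in their original order, and
43 | # every answer string is prefixed with a space to match the prompt format:
44 | #   {"id": 0, "question": "...", "prompt": "...", "answer": " bird",
45 | #    "all_ans": [" bird", " fish", " snake", " dog"]}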
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_commonqa_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 |
6 | #dataset_config = {'name': 'tau/commonsense_qa', 'train_name': 'validation' }
7 | dataset_config = {'name': 'tau/commonsense_qa', 'train_name': 'train' }
8 |
9 | #validation
10 | #output_f_name = './outputs/commonqa/commonqa_val.json'
11 | #output_f_name = './outputs/commonqa/commonqa_train.json'
12 | output_f_name = './outputs/commonqa/commonqa_neg_train.json'
13 |
14 | dataset = datasets.load_dataset(dataset_config['name'] )
15 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
16 |
17 | including_passage = False
18 |
19 | print(len(df_train))
20 |
21 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
22 | example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
23 |
24 | with open(output_f_name, 'w') as f_out:
25 | for index, row in df_train.iterrows():
26 | #ans_dict = ast.literal_eval(row['answers'])
27 | #ans_dict = json.loads(row['answers'])
28 | ans = row['choices']['text'][ row['choices']['label'].index(row['answerKey']) ]
29 | #print(ans_str)
30 | #ans_dict = json.loads(ans_str)
31 | q = row['question']
32 | all_ans_raw = row['choices']['text'].copy()
33 | all_ans_raw.remove(ans)
34 | all_ans = [ans] + all_ans_raw
35 | all_ans = [' ' + x for x in all_ans]
36 | assert len(row['choices']['text']) == len(all_ans)
37 | #q = q[0].upper() + q[1:]
38 | prompt = example + ' Question: ' + q + '\n Answer:'
39 | f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
40 |
41 |
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_lambda_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 |
5 | dataset_config = {'name': 'EleutherAI/lambada_openai', 'subset': 'en', 'train_name': 'test' }
6 |
7 | #validation
8 | output_f_name = './outputs/lambda/openai_test.json'
9 |
10 | dataset = datasets.load_dataset(dataset_config['name'], dataset_config['subset'] )
11 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
12 |
13 | print(len(df_train))
14 |
15 | with open(output_f_name, 'w') as f_out:
16 | for index, row in df_train.iterrows():
17 | text = row['text']
18 | text_split = text.split(' ')
19 | prompt = ' '.join(text_split[:-1])
20 | ans = text_split[-1]
21 | f_out.write(json.dumps({'id': index, 'prompt': prompt, 'answer': ' ' + ans }) + '\n')
22 |
23 |
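24 | # Each LAMBADA example is split on the final whitespace-delimited word, so a
25 | # (hypothetical) output line looks like:
26 | #   {"id": 0, "prompt": "... she slowly opened the", "answer": " door"}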
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_multirc_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | #input_file = 'outputs/multirc/train_456-fixedIds.json'
4 | input_file = 'outputs/multirc/dev_83-fixedIds.json'
5 |
6 | with open(input_file) as f_in:
7 | input_dict = json.load(f_in)
8 |
9 |
10 | #output_f_name = './outputs/multirc/multirc_train.json'
11 | output_f_name = './outputs/multirc/multirc_dev.json'
12 |
13 | example = ' Passage: Sent 1: John likes to go hiking, and his wife likes to cook.\n Sent 2: His wife likes to cook.\n Here is a question: Who likes to cook?\n The answer is his wife\n\n'
14 |
15 | with open(output_f_name, 'w') as f_out:
16 | for idx, data in enumerate(input_dict['data']):
17 | passage = data['paragraph']['text'].replace('<b>', ' ').replace('</b>', '').replace('<br>', '\n').replace('  ', ' ')
18 | for qa in data['paragraph']['questions']:
19 | q = qa['question']
20 | for a in qa['answers']:
21 | if a['isAnswer']:
22 | a_text = a['text'].replace('  ', ' ')
23 | prompt = example + ' Passage:' + passage + '\n Here is a question: ' + q + '\n The answer is'
24 | f_out.write(json.dumps({'id': idx, 'passage': passage, 'question': q, 'prompt': prompt, 'answer': ' ' + a_text }) + '\n')
25 |
26 |
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_qasc_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 |
6 | dataset_config = {'name': 'allenai/qasc', 'train_name': 'train' }
7 |
8 | #validation
9 | #output_f_name = './outputs/qasc/qasc_train.json'
10 | #output_f_name = './outputs/qasc/qasc_fact_early_train.json'
11 | output_f_name = './outputs/qasc/qasc_neg_train.json'
12 | #output_f_name = './outputs/qasc/qasc_neg_fact_train.json'
13 |
14 | dataset = datasets.load_dataset(dataset_config['name'] )
15 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
16 |
17 | #including_passage = True
18 | including_passage = False
19 |
20 | print(len(df_train))
21 |
22 | #include_facts = True
23 | include_facts = False
24 |
25 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
26 |
27 | if include_facts:
28 | example = ' Question: Which kind of animals can fly?\n Fact 1: a bird is an animal.\n Fact 2: birds can fly.\n Answer: bird.\n\n'
29 | #example = ' Fact 1: a bird is an animal.\n Fact 2: birds can fly.\n Question: Which kind of animals can fly?\n Answer: bird.\n\n'
30 | else:
31 | example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
32 |
33 | with open(output_f_name, 'w') as f_out:
34 | for index, row in df_train.iterrows():
35 | #ans_dict = ast.literal_eval(row['answers'])
36 | #ans_dict = json.loads(row['answers'])
37 | ans = row['choices']['text'][ row['choices']['label'].index(row['answerKey']) ]
38 | #print(ans_str)
39 | #ans_dict = json.loads(ans_str)
40 | q = row['question']
41 | all_ans_raw = row['choices']['text'].copy()
42 | all_ans_raw.remove(ans)
43 | all_ans = [ans] + all_ans_raw
44 | all_ans = [' ' + x for x in all_ans]
45 | assert len(row['choices']['text']) == len(all_ans)
46 | #q = q[0].upper() + q[1:]
47 | if include_facts:
48 | #prompt = example + ' Fact 1: ' + row['fact1'] + '\n Fact 2: ' + row['fact2'] + '\n Question: ' + q + '\n Answer:'
49 | prompt = example + ' Question: ' + q + '\n Fact 1: ' + row['fact1'] + '\n Fact 2: ' + row['fact2'] + '\n Answer:'
50 | f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
51 | else:
52 | prompt = example + ' Question: ' + q + '\n Answer:'
53 | f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
54 |
55 |
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_socialiqa_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 |
6 | dataset_config = {'name': 'allenai/social_i_qa', 'train_name': 'train' }
7 | #dataset_config = {'name': 'allenai/social_i_qa', 'train_name': 'validation' }
8 |
9 | #validation
10 | #output_f_name = './outputs/socialiqa/socialiqa_val.json'
11 | #output_f_name = './outputs/socialiqa/socialiqa_train.json'
12 | output_f_name = './outputs/socialiqa/socialiqa_neg_train.json'
13 |
14 | dataset = datasets.load_dataset(dataset_config['name'] )
15 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
16 |
17 | print(len(df_train))
18 |
19 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
20 | #example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
21 | example = ' Passage: John likes to go hiking, and his wife likes to cook.\n Question: Who likes to cook?\n Answer: his wife\n\n'
22 |
23 | label_dict = {'1': 'A', '2': 'B', '3': 'C'}
24 |
25 | with open(output_f_name, 'w') as f_out:
26 | for index, row in df_train.iterrows():
27 | #ans_dict = ast.literal_eval(row['answers'])
28 | #ans_dict = json.loads(row['answers'])
29 | #print(row)
30 | ans = row[ 'answer' + label_dict[ row['label'] ] ]
31 | all_ans_raw = [row['answer' + x] for x in ['A','B','C']]
32 | all_ans_raw.remove(ans)
33 | all_ans = [ans] + all_ans_raw
34 | all_ans = [' ' + x for x in all_ans]
35 | assert 3 == len(all_ans)
36 | #print(ans_str)
37 | #ans_dict = json.loads(ans_str)
38 | prompt = example + ' Passage: ' + row['context'] + '\n Question: ' + row['question'] + '\n Answer:'
39 | f_out.write(json.dumps({'id': index, 'question': row['question'], 'context': row['context'], 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
40 |
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_squad_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 |
5 | #dataset_config = {'name': 'rajpurkar/squad', 'train_name': 'train' }
6 | dataset_config = {'name': 'rajpurkar/squad', 'train_name': 'validation' }
7 |
8 | #validation
9 | #output_f_name = './outputs/squad/squad_train.json'
10 | #output_f_name = './outputs/squad/squad_val.json'
11 | output_f_name = './outputs/squad/squad_val_no_pass.json'
12 |
13 | dataset = datasets.load_dataset(dataset_config['name'] )
14 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
15 |
16 | including_passage = False
17 |
18 | print(len(df_train))
19 |
20 | if including_passage:
21 | example = ' Passage: John likes to go hiking, and his wife likes to cook.\n Here is a question: Who likes to cook?\n The answer is his wife\n\n'
22 | else:
23 | example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
24 |
25 | with open(output_f_name, 'w') as f_out:
26 | for index, row in df_train.iterrows():
27 | ans_dict = row['answers']
28 | #print(ans_str)
29 | #ans_dict = json.loads(ans_str)
30 | for ans in ans_dict['text']:
31 | if including_passage:
32 | prompt = example + ' Passage: ' + row['context'] + '\n Here is a question: ' + row['question'] + '\n The answer is'
33 | f_out.write(json.dumps({'id': row['id'], 'passage': row['context'], 'question': row['question'], 'prompt': prompt, 'answer': ' ' + ans }) + '\n')
34 | else:
35 | prompt = example + ' Here is a question: ' + row['question'] + '\n The answer is'
36 | f_out.write(json.dumps({'id': row['id'], 'question': row['question'], 'prompt': prompt, 'answer': ' ' + ans }) + '\n')
37 |
38 |
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/configuration_openelm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/configuration_openelm.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/data_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/data_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/data_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/data_utils.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/data_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/data_utils.cpython-39.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/model_mlp_logit.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-310.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/model_mlp_logit.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-37.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/model_mlp_logit.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/model_mlp_logit.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-39.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/modeling_openelm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/modeling_openelm.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-310.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/data_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | class SeqDataset(torch.utils.data.Dataset):
4 | def __init__(self, w_ind_gpt2_tensor, bptt, device):
5 | self.w_ind_gpt2 = w_ind_gpt2_tensor
6 | self.seq_len = bptt
7 | self.output_device = device
8 |
9 | def __len__(self):
10 | return int( self.w_ind_gpt2.size(0) /self.seq_len )
11 |
12 | def __getitem__(self, idx):
13 | feature = self.w_ind_gpt2[idx*self.seq_len:(idx+1)*self.seq_len].to(dtype = torch.long, device = self.output_device)
14 | return feature
15 |
16 | def create_data_loader(f_in, bsz, bptt, device, dataset_class, shuffle = True):
17 | w_ind_gpt2_tensor = torch.load(f_in, map_location='cpu')
18 | cut_tok_num = w_ind_gpt2_tensor.size(0) % bptt
19 | if cut_tok_num > 0:
20 | w_ind_gpt2_tensor = w_ind_gpt2_tensor[:-cut_tok_num]
21 | dataset = dataset_class(w_ind_gpt2_tensor, bptt, device)
22 | use_cuda = False
23 | if device.type == 'cuda':
24 | use_cuda = True
25 | return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=False)
26 | #return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=True)
27 |
28 |
29 | def load_corpus(data_path, train_bsz, eval_bsz, bptt, device, tensor_folder = "tensors_all", skip_training = False, shuffle_train=True, shuffle_val = False, load_val = True, load_testing = True):
30 | train_corpus_name = data_path + "/" + tensor_folder + "/train.pt"
31 | val_org_corpus_name = data_path +"/" + tensor_folder + "/val_org.pt"
32 | test_org_corpus_name = data_path +"/" + tensor_folder + "/test_org.pt"
33 |
34 | dataloader_train = []
35 | dataloader_val = []
36 | dataloader_test = []
37 |
38 | dataset_class = SeqDataset
39 |
40 | if load_val:
41 | with open(val_org_corpus_name,'rb') as f_in:
42 | dataloader_val = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val)
43 |
44 | if load_testing:
45 | with open(test_org_corpus_name,'rb') as f_in:
46 | dataloader_test = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val)
47 |
48 | if not skip_training:
49 | with open(train_corpus_name,'rb') as f_in:
50 | dataloader_train = create_data_loader(f_in, train_bsz, bptt, device, dataset_class, shuffle = shuffle_train)
51 |
52 | return dataloader_train, dataloader_val, dataloader_test
53 |
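54 | # Usage sketch (hypothetical paths; `bptt` and the batch sizes mirror the
55 | # bin/*.sh scripts, and the *.pt tensors are the ones produced by
56 | # ../REAL_sampling/src/prepare_id_corpus_from_raw.py):
57 | #
58 | #   device = torch.device('cuda:0')
59 | #   dataloader_train, dataloader_val, dataloader_test = load_corpus(
60 | #       'data/processed/wiki2021_1e6_Pythia', train_bsz=8, eval_bsz=8,
61 | #       bptt=1024, device=device)
62 | #   for batch in dataloader_val:
63 | #       ...  # batch is a LongTensor of token ids with shape (bsz, bptt)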
--------------------------------------------------------------------------------
/AP_sampling/src/example_APD_REAL.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('./src/factual_gen/')
3 | from sampling_method import FETopPLogitsWarper, APTopPALogitsWarper, LogitsProcessorList
4 | from transformers import AutoModelForCausalLM, AutoTokenizer
5 | import torch
6 |
7 | sampling = 'APD + REAL'
8 | #sampling = 'APD'
9 | #sampling = 'REAL'
10 | #sampling = 'REAL + CD'
11 |
12 | LLM = 'Pythia'
13 | #LLM = 'OPT'
14 |
15 | final_entropy_model_path = 'models/OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3'
16 | fine_tuned_ALM_model_path = 'models/prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4'
17 | APD_p_value = 0.6
18 | decay_temperature = 2
19 | window_size = 40
20 | device = torch.device("cuda:0")
21 |
22 | if LLM == 'Pythia':
23 | LM_gen = 'EleutherAI/pythia-6.9b-deduped'
24 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024)
25 | tokenizer_ent = tokenizer
26 | else:
27 | LM_gen = 'facebook/opt-6.7b'
28 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024)
29 | tokenizer_ent = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m-deduped', padding_side='left', model_max_length=1024)
30 |
31 | tokenizer.pad_token = tokenizer.eos_token
32 | tokenizer_ent.pad_token = tokenizer_ent.eos_token
33 |
34 | model = AutoModelForCausalLM.from_pretrained(LM_gen)
35 | model.eval()
36 | model.to(device)
37 |
38 | if sampling == 'APD + REAL':
39 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = fine_tuned_ALM_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, student_model_name=final_entropy_model_path, use_CD_alpha= False, use_AP=True, device=device)
40 | elif sampling == 'APD':
41 | logits_processor_i = APTopPALogitsWarper(top_p = APD_p_value, student_model_name=fine_tuned_ALM_model_path, device=device, use_alpha=False, temperature = 1, top_k=20, use_log_softmax=True)
42 | elif sampling == 'REAL':
43 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, device=device)
44 | else:
45 | if LLM == 'Pythia':
46 | student_model_name = 'EleutherAI/pythia-70m-deduped'
47 | else:
48 | student_model_name = 'facebook/opt-125m'
49 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, student_model_name=student_model_name, use_CD_alpha= False, device=device)
50 |
51 |
52 | logits_processor = LogitsProcessorList()
53 | logits_processor.append(logits_processor_i)
54 |
55 | input_prompt = " I like to go hiking."
56 | input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
57 |
58 | output_sequences = model.generate(input_ids=input_ids.to(device), pad_token_id=tokenizer.eos_token_id, logits_processor=logits_processor, do_sample=True )
59 | input_len = input_ids.size(-1)
60 | output_con = output_sequences[0,input_len:]
61 | output_text = tokenizer.decode(output_con, skip_special_tokens=True)
62 | print("Input: ", input_prompt)
63 | print("Output: ", output_text)
64 |
--------------------------------------------------------------------------------
/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-37.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc
--------------------------------------------------------------------------------
/AP_sampling/src/factual_gen/prepare_wiki_MTurk.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | from nltk.tokenize import sent_tokenize
4 | import random
5 |
6 | sample_numbers = 1000
7 |
8 | repo_dir = '/mnt/efs/Haw-Shiuan/true_entropy/'
9 |
10 | input_file_dict = {'AP': repo_dir + "outputs/factual_gen/factual_test7k_6.9b_AP_toppk20_log_p0.8_dt_1.0_prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4/factual_test7k_6.9b_AP_toppk20_p0.8_gen_seed1.jsonl",
11 | 'Top-p': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_topp_p1.0_temp_1.0/factual_test7k_6.9b_topp_p1.0_gen_seed1.jsonl',
12 | 'CD': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_CD_toppk20_dt_0.5_log_p0.8_pythia-70m-deduped/factual_test7k_6.9b_CD_toppk20_p0.8_gen_seed1.jsonl',
13 | 'REAL+CD': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_fe_CD_topp_exp_1_win_40_dt_4.0_p0.5_OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3/factual_test7k_6.9b_fe_CD_topp_p0.5_gen_seed1.jsonl',
14 | 'REAL+AP': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_fe_AP_topp_exp_1_win_40_dt_4.0_ait1.0_prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4/factual_test7k_6.9b_fe_AP_topp_p1.0_gen_seed1.jsonl'
15 | }
16 |
17 | output_csv = 'outputs/MTurk/wiki/gen_1000.csv'
18 |
19 | method_list = list(input_file_dict.keys())
20 |
21 | def load_gen(input_file):
22 | id_list = []
23 | context_list = []
24 | gen_list = []
25 | with open(input_file) as f_in:
26 | for line in f_in:
27 | gen_obj = json.loads(line.strip())
28 | context = gen_obj['prompt'].strip()
29 | id_res = int(gen_obj['id'])
30 |
31 | text = gen_obj['text'].strip()
32 | sents = sent_tokenize(text)
33 | gen = sents[0].replace('\n',' ')
34 |
35 | id_list.append(id_res)
36 | context_list.append(context)
37 | gen_list.append(gen)
38 | if len(id_list) >= sample_numbers:
39 | break
40 | return id_list, context_list, gen_list
41 |
42 | prev_id_list = None
43 |
44 | all_res_dict = {}
45 |
46 | for method_name in input_file_dict:
47 | file_name = input_file_dict[method_name]
48 | print(file_name)
49 | id_list, context_list, gen_list = load_gen(file_name)
50 | print(method_name, sum([len(gen) for gen in gen_list ]) / sample_numbers )
51 | if prev_id_list is None:
52 | prev_id_list = id_list
53 | all_res_dict['id'] = id_list
54 | all_res_dict['context'] = context_list
55 | else:
56 | for i in range(len(id_list)):
57 | assert id_list[i] == prev_id_list[i]
58 | prev_id_list = id_list
59 | all_res_dict['gen_'+method_name] = gen_list
60 |
61 | df = pd.DataFrame(all_res_dict)
62 | print(df)
63 |
64 | num_method = len(method_list)
65 |
66 | output_dict = {'id': [], 'context': []}
67 | for i in range(num_method):
68 | output_dict['gen_'+str(i+1)] = []
69 | output_dict['method_'+str(i+1)] = []
70 |
71 | #drop_idx = []
72 |
73 | for index, row in df.iterrows():
74 | gen_list = []
75 |
76 | for method_name in method_list:
77 | gen_list.append(row['gen_'+method_name])
78 | if any([len(gen)<10 or 'External links' in gen for gen in gen_list]) or len(gen_list) != len(set(gen_list)):
79 | #drop_idx.append(index)
80 | continue
81 | output_dict['id'].append(row['id'])
82 | output_dict['context'].append(row['context'])
83 | idx_rnd = list(range(num_method))
84 | random.shuffle(idx_rnd)
85 | for i, idx in enumerate(idx_rnd):
86 | output_dict['gen_'+str(i+1)].append(gen_list[idx])
87 | output_dict['method_'+str(i+1)].append(method_list[idx])
88 |
89 | df = pd.DataFrame(output_dict).set_index('id')
90 | #df = df.drop(drop_idx)
91 |
92 |
93 | print(df)
94 | df.to_csv(output_csv)
95 |
--------------------------------------------------------------------------------
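A note on the expected input: each generation file above is JSONL with one
object per line carrying 'id', 'prompt', and 'text' fields; the script keeps
only the first sentence of each continuation and shuffles the method order per
row so annotators cannot infer which decoder produced which column. A minimal
sketch of the format, using hypothetical toy records rather than real model
outputs:

    import json

    # Hypothetical toy records mirroring the fields prepare_wiki_MTurk.py reads:
    # 'id' aligns rows across methods, 'prompt' is the shared context, and
    # 'text' is the method-specific continuation.
    records = [
        {"id": 0, "prompt": "Marie Curie was", "text": "a physicist and chemist. She won two Nobel Prizes."},
        {"id": 1, "prompt": "The Amazon River", "text": "flows through South America. It is very long."},
    ]
    with open("toy_gen_seed1.jsonl", "w") as f_out:
        for rec in records:
            f_out.write(json.dumps(rec) + "\n")

--------------------------------------------------------------------------------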
/AP_sampling/src/story_gen/comp_collect_GPT_results.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | import json
5 |
6 |
7 | input_folder = 'outputs/story/GPT_exp/test_GPT3.5_responses_500/'
8 | #input_folder = 'outputs/wp/GPT_exp/test_GPT3.5_responses_500/'
9 |
10 |
11 | result_dict = {'file_name': [], 'avg_win_rate_F': [], 'avg_win_rate_C': [], 'avg_win_rate_L': [], 'avg_win_rate_O': [] }
12 |
13 | all_bad_idx = []
14 |
15 | for result_file in os.listdir(input_folder):
16 | file_path = input_folder+result_file
17 | if not os.path.isfile(file_path):
18 | continue
19 | with open(file_path) as f_in:
20 | all_inputs = json.load(f_in)
21 | bad_idx_list = []
22 | if len(all_inputs) == 5:
23 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
24 | all_bad_idx = all_bad_idx + bad_idx_list
25 |
26 | all_bad_idx_set = set(all_bad_idx)
27 |
28 | print(all_bad_idx_set)
29 |
30 | #for result_file in input_file_list:
31 | for result_file in os.listdir(input_folder):
32 | file_path = input_folder+result_file
33 | if not os.path.isfile(file_path):
34 | continue
35 | with open(file_path) as f_in:
36 | all_inputs = json.load(f_in)
37 | if len(all_inputs) == 4:
38 | pred_method_name, base_method_name, system_prompt1, all_list = all_inputs
39 | elif len(all_inputs) == 5:
40 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
41 | id_list, context_list_pred, gen_list_pred, gen_list_base, ref_list, prompt_list, first_res_list, response_list, parse_win_list = zip(*all_list)
42 | avg_win_rate_F = []
43 | avg_win_rate_C = []
44 | avg_win_rate_L = []
45 | avg_win_rate_O = []
46 | for i in range(len(id_list)):
47 | if i in all_bad_idx_set:
48 | continue
49 | avg_win_rate_F.append(int(parse_win_list[i]['F'] == 'pred'))
50 | avg_win_rate_C.append(int(parse_win_list[i]['C'] == 'pred'))
51 | avg_win_rate_L.append(int(parse_win_list[i]['L'] == 'pred'))
52 | avg_win_rate_O.append(int(parse_win_list[i]['O'] == 'pred'))
53 |
54 | result_dict['file_name'].append(result_file.replace('start2_1000',''))
55 | result_dict['avg_win_rate_F'].append(np.mean(avg_win_rate_F))
56 | result_dict['avg_win_rate_C'].append(np.mean(avg_win_rate_C))
57 | result_dict['avg_win_rate_L'].append(np.mean(avg_win_rate_L))
58 | result_dict['avg_win_rate_O'].append(np.mean(avg_win_rate_O))
59 |
60 | df = pd.DataFrame.from_dict(result_dict)
61 |
62 | #pd.set_option('display.max_columns', None)
63 | pd.options.display.max_colwidth = 150
64 |
65 | df_sort = df.set_index('file_name').sort_values(by=['file_name'])
66 |
67 | #print(df_sort[ ['avg_win_rate_O', 'avg_diff_score_O']])
68 | print(df_sort)
69 | #print(df['file_name'])
70 |
--------------------------------------------------------------------------------
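A note on the file layout: each result file consumed above is a JSON list with
four or five top-level elements, and every row of its final element zips into
the nine parallel per-example fields unpacked in the script; 'parse_win_list'
maps the criteria keys 'F', 'C', 'L', and 'O' to whichever side won. A minimal
sketch of a five-element file, with hypothetical values:

    import json

    # Hypothetical example of the 5-element layout the script expects:
    # [pred_method_name, base_method_name, system_prompt, bad_idx_list, all_list]
    row = ["id0", "ctx", "gen_pred", "gen_base", "ref", "prompt", "first_res",
           "raw_response", {"F": "pred", "C": "base", "L": "pred", "O": "pred"}]
    payload = ["REAL+AP", "Top-p", "You are a judge...", [], [row]]
    with open("toy_result.json", "w") as f_out:
        json.dump(payload, f_out)

--------------------------------------------------------------------------------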
/AP_sampling/src/story_gen/prepare_story_prompt.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
4 | input_stories = "/mnt/efs/Haw-Shiuan/entailment_tree/datasets/ROCStories__winter2017.csv"
5 | num_stories = 100
6 | #num_stories = 1000
7 | shot_num = 3
8 | prompt_sent_num = 2
9 | output_prompt_file = "/mnt/efs/Haw-Shiuan/AP_sampling/outputs/story/prompt_start2_b1_{}.jsonl".format(num_stories)
10 |
11 | delimiter = '---'
12 | num_story_line = 5
13 |
14 | df = pd.read_csv(input_stories)
15 | df_sampled_stories = df.sample(n=num_stories, replace=False)
16 | df_rest = df.drop(df_sampled_stories.index)
17 |
18 | def prepare_id(row, prompt_sent_num, init_sent_idx=0):
19 | id_q = ''
20 | for i in range(prompt_sent_num):
21 | id_q += row['sentence'+str(init_sent_idx+ i+1)] + ' '
22 | return id_q[:-1]
23 |
24 | def str_story(row_examples, i, delimiter):
25 | story_str = 'Story {}:\n'.format(i+1)
26 | for i in range(num_story_line):
27 | story_str += row_examples['sentence'+str(i+1)] + ' '
28 | story_str += '\n' + delimiter + '\n'
29 | return story_str
30 |
31 | output_list = []
32 | for index, row in df_sampled_stories.iterrows():
33 | out_dict = {}
34 | id_q = prepare_id(row, prompt_sent_num)
35 | out_dict['id'] = id_q
36 | df_examples = df_rest.sample(n=shot_num, replace=False)
37 | prompt_str = ' Here are {} stories. Each story has five sentences.\n\n'.format(shot_num+1)
38 | for i, (index, row_examples) in enumerate(df_examples.iterrows()):
39 | prompt_str += str_story(row_examples, i, delimiter)
40 |
41 | out_dict['prompt'] = prompt_str + 'Story {}:\n'.format(shot_num+1) + id_q
42 | out_dict['ref'] = prepare_id(row, 5 - prompt_sent_num, prompt_sent_num )
43 | output_list.append(out_dict)
44 |
45 | with open(output_prompt_file, 'w') as f_out:
46 | for out_dict in output_list:
47 | f_out.write(json.dumps(out_dict) + '\n' )
48 |
49 |
--------------------------------------------------------------------------------
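Each output line is a JSON object with 'id' (the first sentences of the
held-out story), 'prompt' (the few-shot context ending with those sentences),
and 'ref' (the remaining sentences). A minimal sketch of reading the prompts
back, using a hypothetical relative path:

    import json

    with open("prompt_start2_b1_100.jsonl") as f_in:
        for line in f_in:
            rec = json.loads(line)
            print(rec["id"])   # first two sentences of the story
            print(rec["ref"])  # remaining three sentences (the reference)
            break

--------------------------------------------------------------------------------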
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/FactualityPrompt/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:20.12-py3
2 | # TODO: need to update this starting docker to something public!
3 |
4 | RUN apt-get update && apt-get install -y python pip
5 |
6 | RUN pip install thefuzz
7 | RUN pip install spacy
8 | RUN pip install bitarray
9 | RUN pip install datasets
10 | RUN pip install sentence-transformers==2.2.0
11 | RUN pip install Cython==0.29.15
12 | RUN pip install numpy==1.19.1
13 | RUN pip install benepar==0.1.3
14 | # RUN pip install torch==1.5.0
15 | RUN pip install fairseq==0.9.0
16 | RUN pip install nltk==3.5
17 | RUN pip install spacy==2.3.2
18 | # RUN pip install tensorflow==1.15.0
19 | RUN pip install transformers==3.4.0
20 | RUN pip install tensorflow
21 |
22 |
23 | # download the spaCy English model
24 | RUN python -m spacy download en_core_web_sm
25 |
26 | # download NLTK and benepar resources at build time; the classic
27 | # Dockerfile parser has no heredoc support, so use python -c instead
28 | RUN python -c "import nltk, benepar; nltk.download('stopwords'); nltk.download('punkt'); benepar.download('benepar_en2')"
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/.env:
--------------------------------------------------------------------------------
1 | PYTHONPATH=src
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .vscode/
3 | .DS_Store
4 | __pycache__
5 | .ipynb_checkpoints/
6 | /data/
7 | /features/
8 | /logs/
9 | /models/
10 | /model/
11 | /src/model/
12 | /logdir/
13 | *.log
14 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
2 |
3 | ENV NVIDIA_VISIBLE_DEVICES all
4 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
5 |
6 | RUN apt-get update
7 | RUN apt-get install -y --no-install-recommends --allow-unauthenticated \
8 | zip \
9 | gzip \
10 | make \
11 | automake \
12 | gcc \
13 | build-essential \
14 | g++ \
15 | cpp \
16 | libc6-dev \
17 | man-db \
18 | autoconf \
19 | pkg-config \
20 | unzip \
21 | libffi-dev \
22 | software-properties-common \
23 | wget \
24 | git
25 |
26 | ENV HOME "/root"
27 |
28 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
29 | RUN bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda && rm Miniconda3-latest-Linux-x86_64.sh
30 | ENV PATH "$PATH:$HOME/miniconda/bin"
31 | ENV LANG C.UTF-8
32 |
33 | RUN mkdir /fever
34 | WORKDIR /fever
35 | RUN mkdir -p data/fever
36 | RUN mkdir -p data/fasttext
37 |
38 | RUN wget -nv https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip && unzip wiki.en.zip -d data/fasttext && rm wiki.en.zip
39 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/claim_verification_esim.ckpt.zip
40 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/sentence_retrieval_ensemble.ckpt.zip
41 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/document_retrieval_datasets.zip
42 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/claim_verification_esim_glove_fasttext.ckpt.zip
43 |
44 | RUN mkdir -p model/no_attention_glove/rte_checkpoints/
45 | RUN mkdir -p model/esim_0/rte_checkpoints/
46 | RUN mkdir -p model/esim_0/sentence_retrieval_ensemble/
47 | RUN unzip claim_verification_esim.ckpt.zip -d model/no_attention_glove/rte_checkpoints/
48 | RUN unzip claim_verification_esim_glove_fasttext.ckpt.zip -d model/esim_0/rte_checkpoints/
49 | RUN unzip sentence_retrieval_ensemble.ckpt.zip -d model/esim_0/sentence_retrieval_ensemble/
50 | RUN unzip document_retrieval_datasets.zip -d data/fever/
51 |
52 | RUN wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
53 | RUN unzip glove.6B.zip -d data/glove && rm glove.6B.zip
54 | RUN gzip data/glove/*.txt
55 |
56 | RUN rm *.zip
57 |
58 | RUN conda install python=3.6
59 | RUN conda install Cython=0.28.5
60 | ADD requirements.txt /fever/
61 | RUN pip install -r requirements.txt
62 | RUN conda uninstall -y Cython
63 | RUN pip uninstall -y pyfasttext
64 | RUN pip install --force --upgrade cysignals==1.7.2
65 | RUN pip install --force --upgrade pyfasttext
66 | RUN conda install tensorflow=1.9.0 tensorflow-gpu=1.9.0
67 |
68 | RUN python -c "import nltk; nltk.download('punkt')"
69 |
70 | ADD src src
71 | ADD server.sh .
72 | ADD predict.sh .
73 | ENV PYTHONPATH /fever/src
74 | ENV PYTHONUNBUFFERED 1
75 | CMD bash
76 | #CMD python -m athene.system --db-path /local/fever-common/data/fever/fever.db --words-cache model/sentence --sentence-model model/esim_0/sentence_retrieval_ensemble
77 | #CMD python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model1 --add_prefix=model_0/
78 | CMD ["waitress-serve", "--host=0.0.0.0","--port=5000", "--call", "athene.system:web"]
79 | #CMD ["bash", "./server.sh"]
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/NOTICE.txt:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------------------------
2 | Copyright 2019
3 | Ubiquitous Knowledge Processing (UKP) Lab
4 | Technische Universität Darmstadt
5 |
6 | -------------------------------------------------------------------------------
7 | Third party legal information
8 |
9 |
10 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/conf/config_no_attention_nodrop_glove_only.json:
--------------------------------------------------------------------------------
1 | {
2 | "BASE_DIR": ".",
3 | "SUBMISSION_FILE_NAME": "predictions.jsonl",
4 | "model_name": "no_attention_glove",
5 | "glove_path": "./data/glove/glove.6B.300d.txt.gz",
6 | "fasttext_path": "./data/fasttext/wiki.en.bin",
7 | "model_folder": "model/no_attention_glove",
8 | "ckpt_folder": "model/no_attention_glove/rte_checkpoints",
9 | "db_path": "./data/fever/fever.db",
10 | "dataset_folder": "./data/fever",
11 | "training_set_file": "./data/fever/train.p7.s5.jsonl",
12 | "dev_set_file": "./data/fever/dev.p7.s5.jsonl",
13 | "test_set_file": "./data/fever/test.p7.s5.jsonl",
14 | "submission_folder": "submission_no_attention_glove",
15 | "submission_file": "submission_no_attention_glove/predictions.jsonl",
16 | "estimator_name": "esim_glove_no_attention",
17 | "pickle_name": "esim.p",
18 | "esim_hyper_param": {
19 | "num_neurons": [
20 | 250,
21 | 180,
22 | 180,
23 | 900,
24 | 550
25 | ],
26 | "lr": 0.002,
27 | "dropout": 0,
28 | "batch_size": 64,
29 | "pos_weight": [
30 | 0.408658712,
31 | 1.942468514,
32 | 1.540587559
33 | ],
34 | "max_checks_no_progress": 5,
35 | "trainable": false,
36 | "show_progress": 1,
37 | "n_outputs": 3,
38 | "lstm_layers": 1,
39 | "optimizer": "adam",
40 | "num_epoch": 100,
41 | "activation": "relu",
42 | "initializer": "he"
43 | },
44 | "max_sentences": 5,
45 | "max_sentence_size": 50,
46 | "n_jobs_ensemble": 3,
47 | "seed": 55,
48 | "name": "claim_verification_esim"
49 | }
50 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/predict.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m athene.system --in-file $1 --out-file $2
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython
2 | cysignals==1.7.2
3 | thinc==7.0.4
4 | fever-drqa
5 | fever-scorer
6 | spacy
7 | allennlp
8 | wikipedia
9 | pyfasttext
10 | gensim
11 | tensorflow
12 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model1 --add_prefix=model_0/
4 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model2 --add_prefix=model_1/
5 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model3 --add_prefix=model_2/
6 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model4 --add_prefix=model_3/
7 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model5 --add_prefix=model_4/
8 |
9 | waitress-serve --host=0.0.0.0 --port=5000 --call athene.system:web
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/retrieval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/retrieval/document/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/document/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/retrieval/score/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/score/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/data_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/data_processing/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/deep_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/deep_models/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/rte/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/deep_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/rte/deep_models/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/deep_models/convert_use.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_hub as hub
3 |
4 | from copy_graph import copy_op_to_graph, copy_variable_to_graph, get_copied_op
5 |
6 |
7 | def main():
8 |
9 | tf.logging.set_verbosity(tf.logging.ERROR)
10 |
11 | g1 = tf.Graph()
12 | g2 = tf.Graph()
13 |
14 | scope = 'finetune'
15 |
16 | with g1.as_default():
17 | embed = hub.Module(
18 | "https://tfhub.dev/google/universal-sentence-encoder/1", trainable=True)
19 | sess = tf.Session(graph=g1)
20 | with sess.as_default():
21 | # copy all variables
22 | variables = []
23 | for variable in tf.global_variables():
24 | new_variable = copy_variable_to_graph(
25 | variable, g2, True, scope)
26 | variables.append(new_variable)
27 | # copy all ops
28 | for op in g1.get_operations():
29 | copy_op_to_graph(op, g2, variables, scope)
30 | # copy table initialization
31 | copy_op_to_graph(tf.tables_initializer(), g2, variables, scope)
32 |
33 | tf.reset_default_graph()
34 |
35 | with g2.as_default():
36 | sess = tf.Session(graph=g2)
37 | with sess.as_default():
38 |
39 | sess.run(tf.global_variables_initializer())
40 | sess.run(tf.get_default_graph().get_operation_by_name(
41 | 'finetune/init_all_tables'))
42 |
43 | in_tensor = tf.get_default_graph().get_tensor_by_name(
44 | scope + '/module/fed_input_values:0')
45 | ou_tensor = tf.get_default_graph().get_tensor_by_name(
46 | scope + '/module/Encoder_en/hidden_layers/l2_normalize:0')
47 |
48 | for v in tf.trainable_variables():
49 | print(v.name, v)
50 |
51 | save_path = 'model/USE.ckpt'
52 | saver = tf.train.Saver()
53 | saver.save(sess, save_path)
54 |
55 |
56 | if __name__ == '__main__':
57 | main()
58 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/rte/utils/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/utils/customized_votingclassifier.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import numpy as np
4 |
5 |
6 | class CustomizedVotingClassifier:
7 | def __init__(self, prediction_path_list, voting):
8 | self.prediction_path_list = prediction_path_list
9 | self.voting = voting
10 |
11 | def fit(self, X, y):
12 | raise NotImplementedError(
13 | "This voting classifier is only used for combining existing models, not for training!")
14 |
15 | def _raw_probas(self):
16 | _probas = []
17 | for prediction_path in self.prediction_path_list:
18 | with open(prediction_path, 'rb') as f:
19 | _probas.append(pickle.load(f))
20 | return np.asarray(_probas)
21 |
22 | def predict_proba(self, X):
23 | # average over models (samples * classes), then return the arg-maxed class indices (not probabilities)
24 | _avg_probas = np.average(self._raw_probas(), axis=0)
25 | return np.argmax(_avg_probas, axis=1)
26 |
27 | def predict(self, X):
28 | if self.voting == 'soft':
29 | return self.predict_proba(X)
30 | else:
31 | # models * samples * classes
32 | _raw_probas = self._raw_probas()
33 | # models * samples
34 | _predictions_per_model = np.argmax(_raw_probas, axis=2)
35 | return np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=_predictions_per_model.T)
36 |
--------------------------------------------------------------------------------
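A usage note: the classifier combines pre-computed, pickled probability arrays
rather than live models, and (as noted above) predict_proba returns arg-maxed
class indices rather than probabilities. A minimal sketch with hypothetical
pickle files, assuming src/ is on PYTHONPATH:

    import pickle
    import numpy as np
    from athene.rte.utils.customized_votingclassifier import CustomizedVotingClassifier

    # Hypothetical: persist two models' (samples x classes) probabilities the
    # way the classifier expects to find them on disk.
    for path, probas in [("m1.p", [[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]]),
                         ("m2.p", [[0.6, 0.3, 0.1], [0.2, 0.7, 0.1]])]:
        with open(path, "wb") as f:
            pickle.dump(np.asarray(probas), f)

    clf = CustomizedVotingClassifier(["m1.p", "m2.p"], voting="hard")
    print(clf.predict(None))  # X is ignored; majority vote over pickles -> [0 1]

--------------------------------------------------------------------------------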
/FactualityPrompt/fever_athene/src/athene/rte/utils/dataset.py:
--------------------------------------------------------------------------------
1 | from csv import DictReader
2 |
3 |
4 | class DataSet:
5 | """
6 | Define class for Fake News Challenge data
7 | """
8 |
9 | def __init__(self, file_stances, file_bodies):
10 |
11 | # Load data
12 | self.instances = self.read(file_stances)
13 | bodies = self.read(file_bodies)
14 | self.heads = {}
15 | self.bodies = {}
16 |
17 | # Process instances
18 | for instance in self.instances:
19 | if instance['Claim'] not in self.heads:
20 | head_id = len(self.heads)
21 | self.heads[instance['Claim']] = head_id
22 | instance['Body ID'] = int(instance['Body ID'])
23 |
24 | # Process bodies
25 | for body in bodies:
26 | self.bodies[int(body['Body ID'])] = body['Snippets']
27 |
28 | def read(self, filename):
29 |
30 | """
31 | Read Fake News Challenge data from CSV file
32 | Args:
33 | filename: str, filename + extension
34 | Returns:
35 | rows: list, of dict per instance
36 | """
37 |
38 | # Initialise
39 | rows = []
40 |
41 | # Process file
42 | with open(filename, "r", encoding='utf-8') as table:
43 | r = DictReader(table)
44 | for line in r:
45 | rows.append(line)
46 |
47 | return rows
48 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/utils/estimator_definitions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def get_estimator(scorer_type, save_folder=None):
5 | if scorer_type == 'esim':
6 | # submitted model, glove + fasttext, with attention
7 | from os import path
8 | from athene.rte.deep_models.ESIM_for_ensemble import ESIM
9 | from athene.utils.config import Config
10 | pos_weight = np.asarray(Config.esim_hyper_param['pos_weight'], np.float32)
11 | clf = ESIM(random_state=Config.seed, tensorboard_logdir="logdir/", learning_rate=Config.esim_hyper_param['lr'],
12 | max_check_without_progress=Config.esim_hyper_param['max_checks_no_progress'],
13 | activation=Config.esim_hyper_param['activation'],
14 | initializer=Config.esim_hyper_param['initializer'],
15 | lstm_layers=Config.esim_hyper_param['lstm_layers'],
16 | optimizer=Config.esim_hyper_param['optimizer'],
17 | trainable=Config.esim_hyper_param['trainable'],
18 | batch_size=Config.esim_hyper_param['batch_size'],
19 | dropout_rate=Config.esim_hyper_param['dropout'],
20 | num_neurons=Config.esim_hyper_param['num_neurons'], pos_weight=pos_weight,
21 | ckpt_path=path.join(save_folder, Config.name + '.ckpt'), name=Config.name)
22 |
23 | if scorer_type == 'esim_glove_no_attention':
24 | # glove, no attention
25 | from os import path
26 | from athene.rte.deep_models.ESIM_for_ensemble_glove_only_no_attention import ESIM
27 | from athene.utils.config import Config
28 | pos_weight = np.asarray(Config.esim_hyper_param['pos_weight'], np.float32)
29 | clf = ESIM(random_state=Config.seed, tensorboard_logdir="logdir/", learning_rate=Config.esim_hyper_param['lr'],
30 | max_check_without_progress=Config.esim_hyper_param['max_checks_no_progress'],
31 | activation=Config.esim_hyper_param['activation'],
32 | initializer=Config.esim_hyper_param['initializer'],
33 | lstm_layers=Config.esim_hyper_param['lstm_layers'],
34 | optimizer=Config.esim_hyper_param['optimizer'],
35 | trainable=Config.esim_hyper_param['trainable'],
36 | batch_size=Config.esim_hyper_param['batch_size'],
37 | dropout_rate=Config.esim_hyper_param['dropout'],
38 | num_neurons=Config.esim_hyper_param['num_neurons'], pos_weight=pos_weight,
39 | ckpt_path=path.join(save_folder, Config.name + '.ckpt'), name=Config.name)
40 | return clf
41 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/utils/fill_gold_sentences.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | from tqdm import tqdm
4 | from common.dataset.reader import JSONLineReader
5 | from common.util.log_helper import LogHelper
6 |
7 |
8 | def _sent_to_str(sent):
9 | return sent[-2] + "$$$" + str(sent[-1])
10 |
11 |
12 | def _replace_sent_with_str(sent, string):
13 | segments = string.split(r"$$$")
14 | if len(segments) != 2:
15 | raise Exception("Illegal string: " + string)
16 | sent[-2] = segments[0]
17 | sent[-1] = int(segments[1])
18 | return sent
19 |
20 |
21 | def _build_new_sent_with_str(string, num_of_segments=2):
22 | if num_of_segments == 2:
23 | sent = ["", -1]
24 | elif num_of_segments == 4:
25 | sent = [-1, -1, "", -1]
26 | else:
27 | raise Exception("Illegal num_of_segments: " + str(num_of_segments))
28 | return _replace_sent_with_str(sent, string)
29 |
30 |
31 | def _sents_from_evidences(evidences):
32 | sents = set()
33 | for evidence in evidences:
34 | for s in evidence:
35 | sent = _sent_to_str(s)
36 | sents.add(sent)
37 | return sents
38 |
39 |
40 | def _fill_pred_sents_with_gold(pred_sents, gold_sents, max_sent):
41 | selected_sents = pred_sents[:max_sent]
42 | neg_indices = []
43 | for i, selected in enumerate(selected_sents):
44 | key_selected = _sent_to_str(selected)
45 | if key_selected in gold_sents:
46 | gold_sents.remove(key_selected)
47 | else:
48 | neg_indices.append(i)
49 | if len(gold_sents) == 0:
50 | return selected_sents
51 | if len(selected_sents) <= max_sent:
52 | for _ in range(max_sent - len(selected_sents)):
53 | selected_sents.append(_build_new_sent_with_str(gold_sents.pop()))
54 | if len(gold_sents) == 0:
55 | return selected_sents
56 | if len(neg_indices) > 0:
57 | neg_indices = reversed(neg_indices)
58 | for i in neg_indices:
59 | sent = selected_sents[i]
60 | selected_sents[i] = _replace_sent_with_str(sent, gold_sents.pop())
61 | if len(gold_sents) == 0:
62 | return selected_sents
63 | if len(gold_sents) > 0:
64 | logger.warn(str(len(gold_sents)) +
65 | " gold sentences cannot be filled into prediction")
66 | return selected_sents
67 |
68 |
69 | if __name__ == '__main__':
70 | LogHelper.setup()
71 | logger = LogHelper.get_logger('fill_gold_sentences')
72 | parser = argparse.ArgumentParser()
73 | parser.add_argument(
74 | '--input', help='/path/to/input/file', required=True)
75 | parser.add_argument(
76 | '--output', help='/path/to/output/file', required=True)
77 | parser.add_argument(
78 | '--max-sent', type=int, help='Maximal number of sentences per claim', default=10)
79 | args = parser.parse_args()
80 | jlr = JSONLineReader()
81 | data = jlr.read(args.input)
82 | with open(args.output, "w+") as output_file:
83 | for data in tqdm(data):
84 | if data['verifiable'] != 'NOT VERIFIABLE':
85 | pred_sents = data['predicted_sentences']
86 | gold_evidences = data['evidence']
87 | gold_sents = _sents_from_evidences(gold_evidences)
88 | filled_pred_sents = _fill_pred_sents_with_gold(
89 | pred_sents, gold_sents, args.max_sent)
90 | data['predicted_sentences'] = filled_pred_sents
91 | output_file.write(json.dumps(data) + "\n")
92 |
--------------------------------------------------------------------------------
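A sketch of the core helpers on a hypothetical FEVER-style record (assumes
src/ is on PYTHONPATH; gold evidence sentences are [annotation_id,
evidence_id, page, line] and predictions are [page, line] pairs):

    from athene.rte.utils.fill_gold_sentences import (
        _sents_from_evidences, _fill_pred_sents_with_gold)

    gold = _sents_from_evidences([[[101, 201, "Paris", 0]]])
    pred = [["Paris", 0], ["Berlin", 3]]
    # The gold sentence is already among the predictions, so nothing is filled.
    print(_fill_pred_sents_with_gold(pred, gold, max_sent=2))
    # -> [['Paris', 0], ['Berlin', 3]]

--------------------------------------------------------------------------------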
/FactualityPrompt/fever_athene/src/athene/rte/utils/score.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import recall_score, precision_score, confusion_matrix, accuracy_score, f1_score
2 |
3 |
4 | def print_metrics(gold_labels, predictions, logger=None, average='macro'):
5 | info = logger.info if logger is not None else print
6 | info("Accuracy: " + str(accuracy_score(gold_labels, predictions)) + "\tRecall: " + str(
7 | recall_score(gold_labels, predictions, average=average)) + "\tPrecision: " + str(
8 | precision_score(gold_labels, predictions, average=average)) + "\tF1 " + average + ": " + str(
9 | f1_score(gold_labels, predictions, average=average)))
10 | info("Confusion Matrix:")
11 | info(confusion_matrix(gold_labels, predictions))
12 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/athene/rte/utils/text_processing.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import os
3 | import pickle
4 | import re
5 |
6 | import nltk
7 | import numpy as np
8 |
9 | from common.util.log_helper import LogHelper
10 |
11 | # import torch
12 | np.random.seed(55)
13 |
14 |
15 | def vocab_map(vocab):
16 | voc_dict = {}
17 | for i, v in enumerate(vocab):
18 | voc_dict[v] = i
19 | # else:
20 | # voc_dict['UNK'] = i
21 | return voc_dict
22 |
23 |
24 | def tokenize(sequence):
25 | tokens = [token.replace("``", '').replace("''", '').replace('"', '') for token in nltk.word_tokenize(sequence) if
26 | token != " "]
27 | # return tokens
28 | return tokens
29 |
30 |
31 | def clean_text(text):
32 | text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
33 | text = re.sub(r'\<a href', ' ', text)
34 | text = re.sub(r'&amp;', '', text)
35 | text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
36 | text = re.sub(r'<br />', ' ', text)
37 | text = re.sub(r'\'', ' ', text)
38 | text = text.replace("...", " ")
39 | return text
40 |
41 |
42 | def load_whole_glove(glove_file):
43 | logger = LogHelper.get_logger("load_whole_glove")
44 | is_gz = os.path.splitext(glove_file)[1] == '.gz'
45 |
46 | # Getting embedding dimension
47 | def _get_dim(_file):
48 | line = _file.readline()
49 | return len(line.strip().split(' ')) - 1
50 |
51 | if is_gz:
52 | with gzip.open(glove_file, 'rt') as file0:
53 | emb_dim = _get_dim(file0)
54 | else:
55 | with open(glove_file, 'r', encoding='utf-8') as file0:
56 | emb_dim = _get_dim(file0)
57 |
58 | # First row of embedding matrix is 0 for zero padding
59 | vocab = ['[PAD]']
60 | embed = [[0.0] * emb_dim]
61 | vocab.append('UNK')
62 | embed.append([1.0] * emb_dim)
63 |
64 | def _read_glove_file(_vocab, _embed, _file):
65 | for line in _file:
66 | items = line.replace('\r', '').replace('\n', '').split(' ')
67 | if len(items) < 10:
68 | logger.debug("exceptional line: {}".format(line))
69 | continue
70 | word = items[0]
71 | _vocab.append(word)
72 | vec = [float(i) for i in items[1:]]
73 | _embed.append(vec)
74 | return _vocab, _embed
75 |
76 | # Reading embedding matrix
77 | if is_gz:
78 | with gzip.open(glove_file, 'rt') as file:
79 | vocab, embed = _read_glove_file(vocab, embed, file)
80 | else:
81 | with open(glove_file, 'r', encoding='utf-8') as file:
82 | vocab, embed = _read_glove_file(vocab, embed, file)
83 | logger.info('Loaded GloVe!')
84 | return vocab, embed
85 | # if __name__=="__main__":
86 | #
87 | # text ="I don\'t think this is right..."
88 | # text =clean_text(text)
89 | # print(text)
90 |
--------------------------------------------------------------------------------
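A minimal sketch of the tokenization helpers (assumes src/ is on PYTHONPATH
and the nltk 'punkt' data is downloaded):

    from athene.rte.utils.text_processing import tokenize, vocab_map

    toks = tokenize("The cat sat on the mat .")
    print(toks)                                # nltk word tokens
    print(vocab_map(["[PAD]", "UNK"] + toks))  # word -> embedding row index

--------------------------------------------------------------------------------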
/FactualityPrompt/fever_athene/src/athene/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/utils/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/dataset/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/block.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | from common.util.log_helper import LogHelper
5 |
6 |
7 | class Block(object):
8 | def __init__(self,block,name,path):
9 | self.logger = LogHelper.get_logger(Block.__name__)
10 | self.volume = block
11 | self.path = path
12 | self.name = name
13 | self.data = None
14 |
15 | def save(self,name,data):
16 | self.data[name] = data
17 |
18 | def write(self):
19 | self.logger.info("Write block {0}".format(self.volume))
20 | with open((os.path.join(self.path, self.name + "-" + str(self.volume) + ".p")), "wb+") as f:
21 | pickle.dump(self.data, f)
22 |
23 | with open((os.path.join(self.path, self.name + "-" + str(self.volume) + ".p.idx")), "wb+") as f:
24 | pickle.dump(set(self.data.keys()), f)
25 |
26 | self.data = dict()
27 |
28 | def close(self):
29 | self.write()
30 |
31 | def __exit__(self, exc_type, exc_val, exc_tb):
32 | self.close()
33 |
34 | def __enter__(self):
35 | return self
36 |
37 | def __getitem__(self, item):
38 | return self.data[item]
39 |
40 | def list(self):
41 | return self.data.keys()
42 |
43 | def load(self):
44 | with open((os.path.join(self.path, self.name + "-" + str(self.volume) + ".p")), "rb") as f:
45 | self.data = pickle.load(f)
46 | self.logger.info("Loaded {0} articles".format(len(self.data)))
47 |
48 | def __iter__(self):
49 | if self.data is None:
50 | self.logger.info("Load block {0}".format(self.volume))
51 | self.load()
52 | return iter(self.data)
53 |
54 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/corpus.py:
--------------------------------------------------------------------------------
1 | from common.dataset.block import Block
2 | from common.util.log_helper import LogHelper
3 |
4 |
5 | class Corpus:
6 |
7 | def __init__(self,name,path,blocks,preprocessing=None):
8 | self.logger = LogHelper.get_logger(Corpus.__name__)
9 | self.name = name
10 | self.path = path
11 | self.blocks = blocks
12 | self.active_block_iter = None
13 | self.active_block = None
14 | self.active_block_number = None
15 | self.preprocessing = preprocessing
16 |
17 | def __iter__(self):
18 | self.active_block_iter = None
19 | self.active_block = None
20 | self.active_block_number = None
21 | return self
22 |
23 |
24 | def next_block(self):
25 | if self.active_block_number is None:
26 | self.active_block_number = 0
27 | else:
28 | self.active_block_number += 1
29 |
30 | self.logger.info("Trying to load block {0}".format(self.active_block_number))
31 | if self.active_block_number >= self.blocks:
32 | self.logger.info("No more blocks")
33 | raise StopIteration
34 |
35 | self.active_block = Block(self.active_block_number, self.name,self.path)
36 | self.active_block_iter = iter(self.active_block)
37 |
38 | def __next__(self):
39 | # Check if we have started with a block
40 | if self.active_block_iter is None:
41 | self.next_block()
42 |
43 | # Get the next item from this block
44 | try:
45 | n = next(self.active_block_iter)
46 |
47 | except StopIteration:
48 | # If the block is exhausted, try and get next from the next block
49 | try:
50 | self.next_block()
51 | n = next(self.active_block_iter)
52 | except StopIteration as e:
53 | # If we're out of blocks, reset and stop iteration
54 | self.active_block_iter = None
55 | self.active_block = None
56 | self.active_block_number = None
57 | raise e
58 |
59 | return n, self.preprocessing(self.active_block[n])
60 |
61 | def __getitem__(self, item):
62 | return self.active_block[item]
63 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/data_set.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | class DataSet():
5 | def __init__(self,file,reader,formatter):
6 | self.reader = reader
7 | self.file = file
8 | self.formatter = formatter
9 | self.data = []
10 |
11 |
12 | def read(self):
13 | if os.getenv("DEBUG","").lower() in ["1","y","yes","t"]:
14 | self.data.extend(filter(lambda record: record is not None, self.formatter.format(self.reader.read(self.file)[:10])))
15 | else:
16 | self.data.extend(filter(lambda record: record is not None, self.formatter.format(self.reader.read(self.file))))
17 |
18 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/formatter.py:
--------------------------------------------------------------------------------
1 | class Formatter():
2 | def __init__(self,label_schema):
3 | self.label_schema = label_schema
4 |
5 | def format(self,lines):
6 | formatted = []
7 | for line in lines:
8 | fl = self.format_line(line)
9 | if fl is not None:
10 | if isinstance(fl,list):
11 | formatted.extend(fl)
12 | else:
13 | formatted.append(fl)
14 |
15 | return formatted
16 |
17 | def format_line(self,line):
18 | pass
19 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/label_schema.py:
--------------------------------------------------------------------------------
1 |
2 | class LabelSchema:
3 | def __init__(self,labels):
4 | self.labels = {self.preprocess(val):idx for idx,val in enumerate(labels)}
5 | self.idx = {idx:self.preprocess(val) for idx,val in enumerate(labels)}
6 |
7 | def get_id(self,label):
8 | if self.preprocess(label) in self.labels:
9 | return self.labels[self.preprocess(label)]
10 | return None
11 |
12 | def preprocess(self,item):
13 | return item.lower()
14 |
15 |
16 |
17 | class SNLILabelSchema(LabelSchema):
18 | def __init__(self):
19 | super(SNLILabelSchema, self).__init__(["neither","contradiction","entailment"])
20 |
21 |
--------------------------------------------------------------------------------
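A minimal usage sketch (assumes src/ is on PYTHONPATH); labels are lower-cased
and mapped to indices in construction order:

    from common.dataset.label_schema import SNLILabelSchema

    schema = SNLILabelSchema()          # neither=0, contradiction=1, entailment=2
    print(schema.get_id("Entailment"))  # 2 -- lookups are case-insensitive
    print(schema.get_id("unknown"))     # None for labels outside the schema
    print(schema.idx[0])                # 'neither'

--------------------------------------------------------------------------------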
/FactualityPrompt/fever_athene/src/common/dataset/persistence/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/dataset/persistence/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/persistence/engine.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 |
3 |
4 | def get_engine(file):
5 | return create_engine('sqlite:///data/fever/{0}.db'.format(file), echo=False)
6 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/persistence/page.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.ext.declarative import declarative_base
2 | from sqlalchemy import Column, Integer, String, Text
3 |
4 | Base = declarative_base()
5 |
6 |
7 | class Page(Base):
8 | __tablename__ = "page"
9 | id = Column(Integer, primary_key=True)
10 | name = Column(String)
11 | doc = Column(Text)
12 | raw = Column(Text)
13 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/persistence/session.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.ext.declarative import declarative_base
2 | from sqlalchemy.orm import sessionmaker
3 |
4 | from common.dataset.persistence.page import Page
5 |
6 | def get_session(engine):
7 | Base = declarative_base()
8 | Session = sessionmaker(bind=engine)
9 |
10 | session = Session()
11 | if not engine.dialect.has_table(engine, Page.__tablename__):
12 | Page.__table__.create(bind=engine,checkfirst=True)
13 | return session
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/reader.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import json
3 |
4 |
5 | class Reader:
6 | def __init__(self,encoding="utf-8"):
7 | self.enc = encoding
8 |
9 | def read(self,file):
10 | with open(file,"r",encoding = self.enc) as f:
11 | return self.process(f)
12 |
13 | def process(self,f):
14 | pass
15 |
16 |
17 | class CSVReader(Reader):
18 | def process(self,fp):
19 | r = csv.DictReader(fp)
20 | return [line for line in r]
21 |
22 | class JSONReader(Reader):
23 | def process(self,fp):
24 | return json.load(fp)
25 |
26 |
27 | class JSONLineReader(Reader):
28 | def process(self,fp):
29 | data = []
30 | for line in fp.readlines():
31 | data.append(json.loads(line.strip()))
32 | return data
33 |
34 |
--------------------------------------------------------------------------------
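The three readers share Reader.read and differ only in process. A minimal
round trip through JSONLineReader with a hypothetical temp file (assumes src/
is on PYTHONPATH):

    import json, os, tempfile
    from common.dataset.reader import JSONLineReader

    path = os.path.join(tempfile.gettempdir(), "toy.jsonl")
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps({"claim": "x", "label": "SUPPORTS"}) + "\n")
    print(JSONLineReader().read(path))  # [{'claim': 'x', 'label': 'SUPPORTS'}]

--------------------------------------------------------------------------------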
/FactualityPrompt/fever_athene/src/common/dataset/reverse_index.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import pickle
4 | from tqdm import tqdm
5 |
6 |
7 | class ReverseIndex:
8 | def __init__(self,docs, preprocessing):
9 | self.lookup = defaultdict(set)
10 | self.preprocess = preprocessing
11 |
12 | if docs is not None:
13 | for title,words in tqdm(docs):
14 | self.add(title,self.preprocess(words))
15 |
16 | def add(self,title,words):
17 | for word in words:
18 | self.lookup[word].add(title)
19 |
20 | def docs(self,phrase):
21 | ret = []
22 | for word in self.preprocess(phrase):
23 | ret.extend(self.lookup[word])
24 | return ret
25 |
26 | def save(self,file):
27 | with open(file,"wb+") as f:
28 | pickle.dump(self.lookup,f)
29 |
30 | def load(self,file):
31 | with open(file,"rb") as f:
32 | self.lookup = pickle.load(f)
--------------------------------------------------------------------------------
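A minimal sketch (assumes src/ is on PYTHONPATH): the index maps each
preprocessed word to the set of titles containing it, and docs() concatenates
the per-word hits, so duplicates are possible:

    from common.dataset.reverse_index import ReverseIndex

    # Hypothetical preprocessing: lower-cased whitespace tokens.
    tokenize = lambda text: text.lower().split()
    idx = ReverseIndex([("Paris", "capital of france"),
                        ("Berlin", "capital of germany")], tokenize)
    print(idx.docs("france capital"))  # titles matching either query token

--------------------------------------------------------------------------------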
/FactualityPrompt/fever_athene/src/common/dataset/s3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/dataset/s3/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/s3/index.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | from common.util.log_helper import LogHelper
4 |
5 |
6 | class Indexer:
7 | def __init__(self,file):
8 | self.pages = []
9 | self.file = file
10 | self.logger = LogHelper.get_logger(__name__)
11 | self.logger.info("Indexing Pages")
12 |
13 | def index_page(self,key):
14 | self.logger.debug("Index Page: {0}".format(key))
15 | self.pages.append(key)
16 |
17 | def load(self):
18 | self.pages.extend(pickle.load(self.file))
19 |
20 | def get_block(self,block,num_blocks=50):
21 | return self.pages[block*len(self.pages)//num_blocks:(block+1)*len(self.pages)//num_blocks]
22 |
23 | def __enter__(self):
24 | return self
25 |
26 | def __exit__(self, exc_type, exc_val, exc_tb):
27 | self.logger.info("Saving index")
28 | pickle.dump(self.pages,self.file)
29 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/dataset/s3/iterator.py:
--------------------------------------------------------------------------------
1 |
2 | # Based off -
3 | # https://stackoverflow.com/questions/31918960/boto3-to-download-all-files-from-a-s3-bucket
4 |
5 |
6 | def s3_iterator(client, resource, root, dir, bucket, action):
7 | paginator = client.get_paginator('list_objects')
8 |
9 | for result in paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=dir):
10 | if result.get('CommonPrefixes') is not None:
11 | for subdir in result.get('CommonPrefixes'):
12 | s3_iterator(client, resource, root, subdir.get('Prefix'), bucket, action)
13 | if result.get('Contents') is not None:
14 | for file in result.get('Contents'):
15 | action(file.get("Key").replace(root,""))
16 |
17 |
18 | #print(file.get('Key').replace(dist,""))
19 |
20 | #obj = client.get_object(Bucket=bucket, Key=file.get("Key"))
21 | #writer.save(file.get("Key").replace(dist,""), obj["Body"].read().decode("utf-8"))
22 |
23 | #resource.meta.client.download_file(bucket, file.get('Key'), local + os.sep + clean(file.get('Key')))
24 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/framework/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/framework/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/framework/task.py:
--------------------------------------------------------------------------------
1 |
2 | class Task:
3 | def score(self,data,labels):
4 | self.do_scoring(data,labels)
5 |
6 |     def do_scoring(self, data, labels):
7 | raise NotImplementedError("Not Implemented Here")
8 |
9 |
10 | class IRTask(Task):
11 | def do_scoring(self,data,labels):
12 | pass
13 |
14 |
15 |
16 |
17 |
18 |
19 | class InferenceTask(Task):
20 | pass
21 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/training/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/training/batcher.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from scipy.sparse import coo_matrix
5 | from torch.autograd import Variable
6 |
7 | from common.training.options import gpu
8 |
9 |
10 | class Batcher():
11 |
12 | def __init__(self,data,size):
13 | self.data = data
14 | self.size = size
15 | self.pointer = 0
16 |
17 | if isinstance(self.data,coo_matrix):
18 | self.data = self.data.tocsr()
19 |
20 | def __next__(self):
21 | if self.pointer == splen(self.data):
22 | self.pointer = 0
23 | raise StopIteration
24 |
25 | next = min(splen(self.data),self.pointer+self.size)
26 | to_return = self.data[self.pointer : next]
27 |
28 | start,end = self.pointer,next
29 |
30 | self.pointer = next
31 |
32 |
33 | return to_return, splen(to_return), start, end
34 |
35 | def __iter__(self):
36 | return self
37 |
38 | def splen(data):
39 | try:
40 | return data.shape[0]
41 | except:
42 | return len(data)
43 |
44 |
45 |
46 | def prepare_with_labels(data,labels):
47 | data = data.todense()
48 | v = torch.FloatTensor(np.array(data))
49 | if gpu():
50 | return Variable(v.cuda()), Variable(torch.LongTensor(labels).cuda())
51 | return Variable(v), Variable(torch.LongTensor(labels))
52 |
53 |
54 | def prepare(data):
55 | data = data.todense()
56 | v = torch.FloatTensor(np.array(data))
57 | if gpu():
58 | return Variable(v.cuda())
59 | return Variable(v)
--------------------------------------------------------------------------------
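A minimal sketch of the batching loop (assumes src/ is on PYTHONPATH): Batcher
slices dense arrays, lists, or scipy sparse matrices into fixed-size windows
and resets its pointer once exhausted:

    import numpy as np
    from common.training.batcher import Batcher

    batcher = Batcher(np.arange(10).reshape(5, 2), 2)
    for batch, size, start, end in batcher:
        print(start, end, size)  # (0, 2, 2), (2, 4, 2), (4, 5, 1)

--------------------------------------------------------------------------------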
/FactualityPrompt/fever_athene/src/common/training/early_stopping.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from common.util.log_helper import LogHelper
4 |
5 |
6 | class EarlyStopping():
7 | def __init__(self,name,patience=8):
8 | self.patience = patience
9 | self.best_model = None
10 | self.best_score = None
11 |
12 | self.best_epoch = 0
13 | self.epoch = 0
14 |
15 | self.name = name
16 | self.logger = LogHelper.get_logger(EarlyStopping.__name__)
17 |
18 | def __call__(self, model, acc):
19 | self.epoch += 1
20 |
21 | if self.best_score is None:
22 | self.best_score = acc
23 |
24 | if acc >= self.best_score:
25 | torch.save(model.state_dict(),"models/{0}.best.save".format(self.name))
26 | self.best_score = acc
27 | self.best_epoch = self.epoch
28 | self.logger.info("Saving best weights from round {0}".format(self.epoch))
29 | return False
30 |
31 | elif self.epoch > self.best_epoch+self.patience:
32 | self.logger.info("Early stopping: Terminate")
33 | return True
34 |
35 | self.logger.info("Early stopping: Worse Round")
36 | return False
37 |
38 | def set_best_state(self,model):
39 | self.logger.info("Loading weights from round {0}".format(self.best_epoch))
40 | model.load_state_dict(torch.load("models/{0}.best.save".format(self.name)))
41 |
--------------------------------------------------------------------------------
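A minimal runnable sketch (assumes src/ is on PYTHONPATH and a writable
models/ directory, which EarlyStopping hard-codes); the fake accuracy curve
peaks at the second epoch and the stopper restores those weights:

    import os
    import torch
    from common.util.log_helper import LogHelper
    from common.training.early_stopping import EarlyStopping

    LogHelper.setup()  # EarlyStopping obtains its logger from LogHelper
    os.makedirs("models", exist_ok=True)
    model = torch.nn.Linear(4, 2)
    stopper = EarlyStopping("demo", patience=2)
    for acc in [0.5, 0.6, 0.59, 0.58, 0.57]:
        if stopper(model, acc):
            break                  # no improvement for `patience` epochs
    stopper.set_best_state(model)  # reload the weights saved at acc=0.6

--------------------------------------------------------------------------------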
/FactualityPrompt/fever_athene/src/common/training/options.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 |
5 |
6 | def is_gpu():
7 | return os.getenv("GPU","no").lower() in ["1",1,"yes","true","t"]
8 |
9 | def gpu():
10 | if is_gpu():
11 | torch.cuda.set_device(int(os.getenv("CUDA_DEVICE", 0)))
12 | return True
13 | return False
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/training/run.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from sklearn.utils import shuffle
6 |
7 | from tqdm import tqdm
8 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
9 | from common.training.batcher import Batcher, prepare, prepare_with_labels
10 | from common.util.random import SimpleRandom
11 |
12 |
13 | def evaluate(model,data,labels,batch_size):
14 | predicted = predict(model,data,batch_size)
15 | return accuracy_score(labels,predicted.data.numpy().reshape(-1))
16 |
17 | def predict(model, data, batch_size):
18 | batcher = Batcher(data, batch_size)
19 |
20 | predicted = []
21 | for batch, size, start, end in batcher:
22 | d = prepare(batch)
23 | model.eval()
24 | logits = model(d).cpu()
25 |
26 | predicted.extend(torch.max(logits, 1)[1])
27 | return torch.stack(predicted)
28 |
29 | def train(model, fs, batch_size, lr, epochs,dev=None, clip=None, early_stopping=None,name=None):
30 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
31 |
32 | data, labels = fs
33 | if dev is not None:
34 | dev_data,dev_labels = dev
35 |
36 | for epoch in tqdm(range(epochs)):
37 | epoch_loss = 0
38 | epoch_data = 0
39 |
40 | data, labels = shuffle(data, labels)  # sklearn's shuffle is not in-place; keep the returned copies
41 |
42 | batcher = Batcher(data, batch_size)
43 |
44 | for batch, size, start, end in batcher:
45 | d,gold = prepare_with_labels(batch,labels[start:end])
46 |
47 | model.train()
48 | optimizer.zero_grad()
49 | logits = model(d)
50 |
51 | loss = F.cross_entropy(logits, gold)
52 | loss.backward()
53 |
54 | epoch_loss += loss.cpu()
55 | epoch_data += size
56 |
57 | if clip is not None:
58 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
59 | optimizer.step()
60 |
61 | print("Average epoch loss: {0}".format((epoch_loss/epoch_data).data.numpy()))
62 |
63 | #print("Epoch Train Accuracy {0}".format(evaluate(model, data, labels, batch_size)))
64 | if dev is not None:
65 | acc = evaluate(model,dev_data,dev_labels,batch_size)
66 | print("Epoch Dev Accuracy {0}".format(acc))
67 |
68 | if early_stopping is not None and early_stopping(model,acc):
69 | break
70 |
71 | if dev is not None and early_stopping is not None:
72 | early_stopping.set_best_state(model)
73 |
74 |
75 |
76 | def print_evaluation(model,data,ls,log=None):
77 | features,actual = data
78 | predictions = predict(model, features, 500).data.numpy().reshape(-1).tolist()
79 |
80 | labels = [ls.idx[i] for i, _ in enumerate(ls.idx)]
81 |
82 | actual = [labels[i] for i in actual]
83 | predictions = [labels[i] for i in predictions]
84 |
85 | print(accuracy_score(actual, predictions))
86 | print(classification_report(actual, predictions))
87 | print(confusion_matrix(actual, predictions))
88 |
89 | data = zip(actual,predictions)
90 | if log is not None:
91 | f = open(log, "w+")
92 | for a,p in data:
93 | f.write(json.dumps({"actual": a, "predicted": p}) + "\n")
94 | f.close()
95 |
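Putting the pieces together, a hypothetical end-to-end call: `model` is any torch module over sparse bag-of-words features, `train_X`/`dev_X`/`test_X` are scipy sparse matrices with integer label lists `train_y`/`dev_y`/`test_y`, and `label_schema` is a label-schema object exposing `.idx` as used by `print_evaluation`. All of these names are stand-ins, not repository objects.

```
# Hypothetical wiring of train/print_evaluation; all data names are stand-ins.
from common.training.early_stopping import EarlyStopping
from common.training.run import train, print_evaluation

stopper = EarlyStopping("mlp", patience=8)
train(model, (train_X, train_y), batch_size=64, lr=1e-3, epochs=50,
      dev=(dev_X, dev_y), clip=5.0, early_stopping=stopper)

# After training, early stopping has restored the best dev checkpoint.
print_evaluation(model, (test_X, test_y), label_schema, log="predictions.jsonl")
```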
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/util/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/util/array.py:
--------------------------------------------------------------------------------
1 | def flatten(l):
2 | return [item for sublist in l for item in sublist]
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/util/log_helper.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | class LogHelper():
5 | handler = None
6 |
7 | @staticmethod
8 | def setup():
9 | FORMAT = '[%(levelname)s] %(asctime)s - %(name)s - %(message)s'
10 | LogHelper.handler = logging.StreamHandler()
11 | LogHelper.handler.setLevel(logging.DEBUG)
12 | LogHelper.handler.setFormatter(logging.Formatter(FORMAT))
13 |
14 | LogHelper.get_logger(LogHelper.__name__).info("Log Helper set up")
15 |
16 | @staticmethod
17 | def get_logger(name, level=logging.DEBUG):
18 | l = logging.getLogger(name)
19 | if len(l.handlers) == 0:
20 | l.setLevel(level)
21 | l.addHandler(LogHelper.handler)
22 | return l
23 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/common/util/random.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import numpy as np
4 | import torch
5 |
6 | from common.training.options import gpu
7 |
8 |
9 | class SimpleRandom():
10 | instance = None
11 |
12 | def __init__(self,seed):
13 | self.seed = seed
14 | self.random = random.Random(seed)
15 |
16 | def next_rand(self,a,b):
17 | return self.random.randint(a,b)
18 |
19 | @staticmethod
20 | def get_instance():
21 | if SimpleRandom.instance is None:
22 | SimpleRandom.instance = SimpleRandom(SimpleRandom.get_seed())
23 | return SimpleRandom.instance
24 |
25 | @staticmethod
26 | def get_seed():
27 | return int(os.getenv("RANDOM_SEED", 12459))
28 |
29 | @staticmethod
30 | def set_seeds():
31 |
32 | torch.manual_seed(SimpleRandom.get_seed())
33 | if gpu():
34 | torch.cuda.manual_seed_all(SimpleRandom.get_seed())
35 | np.random.seed(SimpleRandom.get_seed())
36 | random.seed(SimpleRandom.get_seed())
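A short reproducibility sketch: every RNG is derived from the `RANDOM_SEED` environment variable (default 12459), so seeding a run only needs:

```
# Sketch: one environment variable seeds torch, numpy, and Python's random.
import os
os.environ["RANDOM_SEED"] = "42"

from common.util.random import SimpleRandom

SimpleRandom.set_seeds()           # seeds torch (and CUDA when GPU=yes), numpy, random
rng = SimpleRandom.get_instance()  # singleton built from the same seed
print(rng.next_rand(0, 9))         # identical across runs with the same seed
```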
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/rename.py:
--------------------------------------------------------------------------------
1 | import sys, getopt
2 |
3 | import tensorflow as tf
4 |
5 | usage_str = 'python tensorflow_rename_variables.py --checkpoint_dir=path/to/dir/ ' \
6 | '--replace_from=substr --replace_to=substr --add_prefix=abc --dry_run'
7 |
8 |
9 | def rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run):
10 | checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
11 | with tf.Session() as sess:
12 | for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir):
13 | # Load the variable
14 | print(var_name)
15 | var = tf.contrib.framework.load_variable(checkpoint_dir, var_name)
16 |
17 | # Set the new name
18 | new_name = var_name
19 | if None not in [replace_from, replace_to]:
20 | new_name = new_name.replace(replace_from, replace_to)
21 | if add_prefix:
22 | new_name = add_prefix + new_name
23 |
24 | if dry_run:
25 | print('%s would be renamed to %s.' % (var_name, new_name))
26 | else:
27 | print('Renaming %s to %s.' % (var_name, new_name))
28 | # Rename the variable
29 | var = tf.Variable(var, name=new_name)
30 |
31 | if not dry_run:
32 | # Save the variables
33 | saver = tf.train.Saver()
34 | sess.run(tf.global_variables_initializer())
35 | saver.save(sess, checkpoint.model_checkpoint_path)
36 |
37 |
38 | def main(argv):
39 | checkpoint_dir = None
40 | replace_from = None
41 | replace_to = None
42 | add_prefix = None
43 | dry_run = False
44 |
45 | try:
46 | opts, args = getopt.getopt(argv, 'h', ['help', 'checkpoint_dir=', 'replace_from=',
47 | 'replace_to=', 'add_prefix=', 'dry_run'])
48 | except getopt.GetoptError:
49 | print(usage_str)
50 | sys.exit(2)
51 | for opt, arg in opts:
52 | if opt in ('-h', '--help'):
53 | print(usage_str)
54 | sys.exit()
55 | elif opt == '--checkpoint_dir':
56 | checkpoint_dir = arg
57 | elif opt == '--replace_from':
58 | replace_from = arg
59 | elif opt == '--replace_to':
60 | replace_to = arg
61 | elif opt == '--add_prefix':
62 | add_prefix = arg
63 | elif opt == '--dry_run':
64 | dry_run = True
65 |
66 | if not checkpoint_dir:
67 | print('Please specify a checkpoint_dir. Usage:')
68 | print(usage_str)
69 | sys.exit(2)
70 |
71 | rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run)
72 |
73 |
74 | if __name__ == '__main__':
75 | main(sys.argv[1:])
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/retrieval/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/fever_doc_db.py:
--------------------------------------------------------------------------------
1 | # from ast import Param
2 | from drqa.retriever import DocDB, utils
3 |
4 |
5 | class FeverDocDB(DocDB):
6 |
7 | def __init__(self,path=None):
8 | super().__init__(path)
9 |
10 | def get_doc_lines(self, doc_id):
11 | """Fetch the raw text of the doc for 'doc_id'."""
12 | cursor = self.connection.cursor()
13 | cursor.execute(
14 | "SELECT lines FROM documents WHERE id = ?",
15 | (utils.normalize(doc_id),)
16 | )
17 | result = cursor.fetchone()
18 | cursor.close()
19 | return result if result is None else result[0]
20 |
21 | def get_non_empty_doc_ids(self):
22 | """Fetch all ids of docs stored in the db."""
23 | cursor = self.connection.cursor()
24 | # cursor.execute("SELECT id FROM documents WHERE length(trim(text)) > 0")
25 | cursor.execute("SELECT id FROM documents WHERE length(trim(lines)) > 0")
26 | results = [r[0] for r in cursor.fetchall()]
27 | cursor.close()
28 | return results
29 |
30 |
31 |
32 | def main():
33 | print("hi?")
34 | db = FeverDocDB(path = "/gpfs/fs1/projects/gpu_adlr/datasets/nayeonl/db/fever.db")
35 | # lines = db.get_doc_lines("Lorelai_Gilmore")
36 | lines = db.get_doc_lines("Goalkeeper_(association_football)")
37 | print(lines)
38 |
39 |
40 |
41 | # db = FeverDocDB(path = "/gpfs/fs1/projects/gpu_adlr/datasets/nayeonl/db/kilt_db.db")
42 | # lines = db.get_doc_lines('Michael Jordan')
43 | # print(lines)
44 |
45 |
46 | if __name__ == '__main__':
47 | main()
48 |
49 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/filter_lists.py:
--------------------------------------------------------------------------------
1 | def uninformative(title):
2 | return title.lower().startswith('list_of_') \
3 | or title.lower().startswith("lists_of_") \
4 | or title.lower().startswith('index_of_.') \
5 | or title.lower().startswith('outline_of_')
6 |
7 | def preprocess(doc):
8 | return None if uninformative(doc['id']) else doc
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/filter_uninformative.py:
--------------------------------------------------------------------------------
1 | def uninformative(title):
2 | return '-LRB-disambiguation-RRB-' in title.lower() \
3 | or '-LRB-disambiguation_page-RRB-' in title.lower() \
4 | or title.lower().startswith('list_of_') \
5 | or title.lower().startswith("lists_of_") \
6 | or title.lower().startswith('index_of_.') \
7 | or title.lower().startswith('outline_of_')
8 |
9 | def preprocess(doc):
10 | return None if uninformative(doc['id']) else doc
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/sent_features.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
2 | from sklearn.metrics.pairwise import cosine_similarity
3 |
4 | from scipy.sparse import hstack
5 |
6 | import numpy as np
7 |
8 | from rte.riedel.fever_features import TermFrequencyFeatureFunction
9 |
10 |
11 | class SentenceTermFrequencyFeatureFunction(TermFrequencyFeatureFunction):
12 |
13 | def __init__(self,doc_db,lim_unigram=5000,naming=None):
14 | super().__init__(doc_db,lim_unigram,naming=naming)
15 | self.ename = "sentences"
16 |
17 | def bodies(self,data):
18 | return set([datum[self.ename] for datum in data])
19 |
20 | def texts(self,data):
21 | return set([datum[self.ename] for datum in data])
22 |
23 | def body_id(self,data):
24 | return [datum[self.ename] for datum in data]
25 |
26 |
27 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/sentence.py:
--------------------------------------------------------------------------------
1 | from common.util.array import flatten
2 | from rte.riedel.data import FeverFormatter, preprocess, FEVERLabelSchema
3 |
4 |
5 | class FEVERSentenceFormatter(FeverFormatter):
6 | def format_line(self,line):
7 | annotation = line["label"]
8 | if annotation is None:
9 | annotation = line["verifiable"]
10 |
11 | pages = []
12 | if 'evidence' in line:
13 | pages = [[(ev[2],ev[3]) for ev in annotation if ev[2] is not None] for annotation in line["evidence"]]
14 |
15 | return {"claim":self.tokenize(line["claim"]), "evidence": pages, "label":self.label_schema.get_id(annotation),"label_text":annotation}
16 |
17 |
18 | class FEVERSentenceTextFormatter(FeverFormatter):
19 | def __init__(self,idx, db,ls):
20 | super().__init__(idx, ls)
21 | self.db = db
22 |
23 | def get_doc_line(self,doc,line):
24 | lines = self.db.get_doc_lines(doc)
25 | return lines.split("\n")[line].split("\t")[1]
26 |
27 | def format_line(self,line):
28 | annotation = line["label"]
29 | if annotation is None:
30 | annotation = line["verifiable"]
31 |
32 | newpages = []
33 | docs = []
34 | if 'evidence' in line:
35 | pages = set(flatten([[(ev[2],ev[3]) for ev in annotation if ev[2] is not None] for annotation in line["evidence"]]))
36 | docs = set(flatten([[ev[2] for ev in annotation if ev[2] is not None] for annotation in line["evidence"]]))
37 |
38 | for page in pages:
39 | newpages.append((page[0],page[1],self.get_doc_line(page[0],page[1])))
40 |
41 | return {"claim":self.tokenize(line["claim"]), "docs": docs, "evidence": newpages, "label":self.label_schema.get_id(annotation),"label_text":annotation}
42 |
43 |
44 |
45 | class FEVERSentenceRelatednessFormatter(FeverFormatter):
46 |
47 | def __init__(self,idx, db,ls):
48 | super().__init__(idx, ls)
49 | self.label_schema = ls
50 | self.ols = FEVERLabelSchema()
51 | self.db = db
52 |
53 | def format_line(self,line):
54 | annotation = line["label"]
55 | if annotation is None:
56 | annotation = line["verifiable"]
57 |
58 | if self.ols.get_id(annotation) != self.ols.get_id("not enough info"):
59 | annotation = "related"
60 | else:
61 | annotation = "unrelated"
62 |
63 | evidence_texts = []
64 | claim = self.tokenize(line['claim']).strip()
65 | for page in set([ev[2] for ev in line['evidence']]):
66 | evidences = set([ev[3] for ev in line['evidence'] if ev[2] == page])  # ev[2] is the page field used to build the page set above
67 | lines = self.db.get_doc_lines(page)
68 | if any(ev<0 for ev in evidences):
69 | evidence_texts = [""]
70 | else:
71 | evidence_texts = [lines.split("\n")[line].split("\t")[1].split() for line in evidences]
72 |
73 | return {"claim":claim, "sentences": evidence_texts, "label":self.label_schema.get_id(annotation),"label_text":annotation}
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/retrieval/snopes_doc_db.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class SnopesDocDB(object):
5 | def __init__(self, db_path: str):
6 | self.path = db_path
7 | with open(self.path) as f:
8 | self.db_dict = json.load(f)
9 |
10 | def get_path(self):  # renamed: the `path` attribute set in __init__ shadowed a method named `path`
11 | return self.path
12 |
13 | def get_doc_ids(self):
14 | results = list(self.db_dict.keys())
15 | return results
16 |
17 | def get_doc_text(self, doc_id):
18 | return self.get_doc_lines(doc_id)
19 |
20 | def get_doc_lines(self, doc_id):
21 | if doc_id not in self.db_dict:
22 | return None
23 | lines = [str(num) + '\t' + line for num, line in enumerate(self.db_dict[doc_id])]
24 | return '\n'.join(lines)
25 |
26 | def get_non_empty_doc_ids(self):
27 | return [result for result in self.get_doc_ids() if len(self.db_dict[result]) > 0]
28 |
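For reference, the JSON file backing this class is expected to map each document id to a list of sentence strings. A tiny sketch of creating and querying one (the file name is illustrative, and `SnopesDocDB` is the class defined above):

```
# Illustrative SnopesDocDB input format: {doc_id: [sentence, sentence, ...]}.
import json

with open("snopes_db.json", "w") as f:  # hypothetical path
    json.dump({"some_page": ["First sentence.", "Second sentence."]}, f)

db = SnopesDocDB("snopes_db.json")
print(db.get_doc_lines("some_page"))
# -> "0\tFirst sentence.\n1\tSecond sentence."
```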
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/scripts/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/athene/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/scripts/athene/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/athene/export_current_config_to_json.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from athene.utils.config import Config
4 |
5 | if __name__ == '__main__':
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('output', help='/path/to/file/to/save/config')
8 | args = parser.parse_args()
9 | Config.save_config(args.output)
10 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/athene/replace_noise_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import random
4 |
5 | from tqdm import tqdm
6 |
7 | from common.dataset.reader import JSONLineReader
8 | from common.util.log_helper import LogHelper
9 | from drqa.retriever.utils import normalize
10 |
11 |
12 | def predicted_evidence_to_list(pred_evidences):
13 | evidences = []
14 | for e in pred_evidences:
15 | evidences.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1])))
16 | return evidences
17 |
18 |
19 | def gold_evidence_to_list(gold_evidences):
20 | evidences = []
21 | for e_set in gold_evidences:
22 | evidence_set = []
23 | for e in e_set:
24 | evidence_set.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1])))
25 | evidences.append(evidence_set)
26 | return evidences
27 |
28 |
29 | def is_gold_evidence_predicted(_line):
30 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence'])
31 | _all_gold_evidences = gold_evidence_to_list(_line['evidence'])
32 | return any(all(e in _all_predicted_evidences for e in e_set) for e_set in _all_gold_evidences)
33 |
34 |
35 | def random_fill_gold_evidence(_line):
36 | _all_gold_evidences = gold_evidence_to_list(_line['evidence'])
37 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence'])
38 | e_set = random.sample(_all_gold_evidences, 1)[0]
39 | logger.debug("fill with evidence set: " + str(e_set))
40 | for e in e_set:
41 | e_segments = e.split('§§§')
42 | if e not in _all_predicted_evidences:
43 | _line['predicted_evidence'] = [[e_segments[0], int(e_segments[1])]] + _line['predicted_evidence']
44 | _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence]
45 | return _line
46 |
47 |
48 | if __name__ == '__main__':
49 | parser = argparse.ArgumentParser()
50 | parser.add_argument('input', help='/path/to/input/file')
51 | parser.add_argument('output', help='/path/to/output/file')
52 | parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5)
53 | args = parser.parse_args()
54 | LogHelper.setup()
55 | logger = LogHelper.get_logger("replace_noise_dataset")
56 | random.seed(55)
57 | jlr = JSONLineReader()
58 | lines = jlr.read(args.input)
59 | counter = 0
60 | with open(args.output, 'w') as f:
61 | for i, line in tqdm(enumerate(lines)):
62 | if not line['label'] == 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line):
63 | counter += 1
64 | logger.info("line " + str(i + 1) + " should be filled")
65 | line = random_fill_gold_evidence(line)
66 | f.write(json.dumps(line) + '\n')
67 | logger.info(str(counter) + " samples filled with gold evidence")
68 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/athene/replace_noise_dataset_with_scores.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import random
4 |
5 | from tqdm import tqdm
6 |
7 | from common.dataset.reader import JSONLineReader
8 | from common.util.log_helper import LogHelper
9 | from drqa.retriever.utils import normalize
10 |
11 |
12 | def predicted_evidence_to_list(pred_evidences):
13 | evidences = []
14 | for e in pred_evidences:
15 | evidences.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1])))
16 | return evidences
17 |
18 |
19 | def gold_evidence_to_list(gold_evidences):
20 | evidences = []
21 | for e_set in gold_evidences:
22 | evidence_set = []
23 | for e in e_set:
24 | evidence_set.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1])))
25 | evidences.append(evidence_set)
26 | return evidences
27 |
28 |
29 | def is_gold_evidence_predicted(_line):
30 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence'])
31 | _all_gold_evidences = gold_evidence_to_list(_line['evidence'])
32 | return any(all(e in _all_predicted_evidences for e in e_set) for e_set in _all_gold_evidences)
33 |
34 |
35 | def random_fill_gold_evidence(_line):
36 | _all_gold_evidences = gold_evidence_to_list(_line['evidence'])
37 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence'])
38 | e_set = random.sample(_all_gold_evidences, 1)[0]
39 | logger.debug("fill with evidence set: " + str(e_set))
40 | for e in e_set:
41 | e_segments = e.split('§§§')
42 | if e not in _all_predicted_evidences:
43 | _line['predicted_evidence'] = [[e_segments[0], int(e_segments[1])]] + _line['predicted_evidence']
44 | _line['scores'] = [1.0] + _line['scores']
45 | _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence]
46 | _line['scores'] = _line['scores'][:args.max_evidence]
47 | return _line
48 |
49 |
50 | if __name__ == '__main__':
51 | parser = argparse.ArgumentParser()
52 | parser.add_argument('input', help='/path/to/input/file')
53 | parser.add_argument('output', help='/path/to/output/file')
54 | parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5)
55 | args = parser.parse_args()
56 | LogHelper.setup()
57 | logger = LogHelper.get_logger("replace_noise_dataset")
58 | random.seed(55)
59 | jlr = JSONLineReader()
60 | lines = jlr.read(args.input)
61 | counter = 0
62 | with open(args.output, 'w') as f:
63 | for i, line in tqdm(enumerate(lines)):
64 | if not line['label'] == 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line):
65 | counter += 1
66 | logger.info("line " + str(i + 1) + " should be filled")
67 | line = random_fill_gold_evidence(line)
68 | f.write(json.dumps(line) + '\n')
69 | logger.info(str(counter) + " samples filled with gold evidence")
70 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/athene/sort_submission.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 |
5 | from common.dataset.reader import JSONLineReader
6 |
7 | if __name__ == '__main__':
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--submission', help='/path/to/submission/file', required=True)
10 | parser.add_argument('--data', help='/path/to/data/file', required=True)
11 | parser.add_argument('--output', help='/path/to/output/file', required=True)
12 | args = parser.parse_args()
13 | jlr = JSONLineReader()
14 | submission_lines = jlr.read(args.submission)
15 | data_lines = jlr.read(args.data)
16 | assert len(submission_lines) == len(data_lines), "lengths of submission and data set are different!"
17 | submission_dict = {}
18 | for line in submission_lines:
19 | submission_dict[line['id']] = line
20 | assert len(submission_dict) == len(submission_lines), "lines in submission are not unique!"
21 | sorted_lines = []
22 | for d in data_lines:
23 | sorted_lines.append(submission_dict[d['id']])
24 | assert len(sorted_lines) == len(data_lines), "some claims from data set are missing in submission!"
25 | os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
26 | with open(args.output, 'w') as f:
27 | for l in sorted_lines:
28 | f.write(json.dumps(l) + '\n')
29 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/prepare_submission.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import sys
4 | from fever.scorer import fever_score
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--predicted_labels",type=str)
8 |
9 | parser.add_argument("--predicted_evidence",type=str)
10 | parser.add_argument("--out_file",type=str)
11 |
12 | args = parser.parse_args()
13 |
14 | predicted_labels =[]
15 | predicted_evidence = []
16 | actual = []
17 |
18 | with open(args.predicted_labels,"r") as predictions_file:
19 | for line in predictions_file:
20 | predicted_labels.append(json.loads(line)["predicted"])
21 |
22 |
23 | with open(args.predicted_evidence,"r") as predictions_file:
24 | for line in predictions_file:
25 | predicted_evidence.append(json.loads(line)["predicted_sentences"])
26 |
27 | predictions = []
28 | for ev,label in zip(predicted_evidence,predicted_labels):
29 | predictions.append({"predicted_evidence":ev,"predicted_label":label})
30 |
31 | with open(args.out_file,"w+") as f:
32 | for line in predictions:
33 | f.write(json.dumps(line)+"\n")
34 |
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/src/scripts/score.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import sys
4 | from fever.scorer import fever_score
5 | from prettytable import PrettyTable
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--predicted_labels",type=str)
9 |
10 | parser.add_argument("--predicted_evidence",type=str)
11 | parser.add_argument("--actual",type=str)
12 |
13 | args = parser.parse_args()
14 |
15 | predicted_labels =[]
16 | predicted_evidence = []
17 | actual = []
18 |
19 | with open(args.predicted_labels,"r") as predictions_file:
20 | for line in predictions_file:
21 | if "predicted_label" in line:
22 | predicted_labels.append(json.loads(line)["predicted_label"])
23 | else:
24 | predicted_labels.append(json.loads(line)["predicted"])
25 |
26 |
27 | with open(args.predicted_evidence,"r") as predictions_file:
28 | for line in predictions_file:
29 | if "predicted_evidence" in line:
30 | predicted_evidence.append(json.loads(line)["predicted_evidence"])
31 | else:
32 | predicted_evidence.append(json.loads(line)["predicted_sentences"])
33 |
34 | with open(args.actual, "r") as actual_file:
35 | for line in actual_file:
36 | actual.append(json.loads(line))
37 |
38 | predictions = []
39 | for ev,label in zip(predicted_evidence,predicted_labels):
40 | predictions.append({"predicted_evidence":ev,"predicted_label":label})
41 |
42 | score,acc,precision,recall,f1 = fever_score(predictions,actual)
43 |
44 | tab = PrettyTable()
45 | tab.field_names = ["FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall", "Evidence F1"]
46 | tab.add_row((round(score,4),round(acc,4),round(precision,4),round(recall,4),round(f1,4)))
47 |
48 | print(tab)
--------------------------------------------------------------------------------
/FactualityPrompt/fever_athene/tests/test_load_models.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import numpy as np
4 | from athene.retrieval.sentences.deep_models.ESIM import ESIM as ESIMretrieval
5 | from athene.rte.deep_models.ESIM_for_ensemble_glove_only_no_attention import ESIM as ESIMrte
6 | from athene.rte.utils.text_processing import load_whole_glove
7 | from common.util.log_helper import LogHelper
8 |
9 | LogHelper.setup()
10 |
11 |
12 | def test_load_retrieval_model():
13 | dummy_embeddings = np.zeros((1, 300), dtype=np.float32)
14 | estimator = ESIMretrieval(
15 | h_max_length=20, s_max_length=60, learning_rate=0.001, batch_size=256, num_epoch=20,
16 | model_store_dir=None,
17 | embedding=dummy_embeddings,
18 | word_dict=None, dropout_rate=0.2, random_state=88, num_units=128,
19 | share_rnn=True
20 | )
21 | # estimator.restore_model("../models/retrieval/best_model.ckpt")
22 | estimator.restore_model("../models/retrieval/sentence_selection_esim.ckpt")
23 |
24 |
25 | def test_load_rte_model():
26 | dummy_embeddings = np.zeros((1, 300), dtype=np.float32)
27 | estimator = ESIMrte(name='esim_verify',
28 | activation='relu',
29 | batch_size=64,
30 | lstm_layers=1,
31 | n_outputs=3,
32 | num_neurons=[250, 180, 900, 550, 180],
33 | show_progress=1, embedding=dummy_embeddings
34 | )
35 | # estimator.restore_model("../models/rte/esim1.ckpt")
36 | estimator.restore_model("../models/rte/claim_verification_esim.ckpt")
37 |
38 |
39 | @pytest.mark.skip(reason="Loading GloVe takes around 10 mins.")
40 | def test_load_rte_model_2():
41 | vocab, embeddings = load_whole_glove("../../resources/embeddings/glove/glove.6B.300d.txt")
42 | estimator = ESIMrte(name='esim_verify',
43 | activation='relu',
44 | batch_size=64,
45 | lstm_layers=1,
46 | n_outputs=3,
47 | num_neurons=[250, 180, 900, 550, 180],
48 | show_progress=1, embedding=embeddings, vocab_size=len(vocab)
49 | )
50 | estimator.restore_model("../models/rte/claim_verification_esim.ckpt")
51 |
52 |
53 | if __name__ == "__main__":
54 | pytest.main([__file__])
55 |
--------------------------------------------------------------------------------
/FactualityPrompt/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | # bash script
5 | python -m spacy download en_core_web_sm
6 |
7 | # pip install -r requirements.txt
8 |
9 |
10 | pip install fever-drqa
11 |
12 | pip install hydra-core
13 | # pip uninstall sacrebleu; pip install sacrebleu==1.5.1
14 |
15 | pip install tensorflow
16 | pip install torch==1.5.0
17 | pip install torchvision==0.7.0
18 |
19 |
20 | # for SentenceTransformer retriever
21 | pip install torch==1.6.0
22 | pip install -U sentence-transformers # (tokenizer==0.11.6, transformers==4.17.0)
23 |
24 | # python in bash
25 | python - << EOF
26 | import nltk
27 | import benepar
28 | nltk.download('stopwords')
29 | nltk.download('punkt')
30 | benepar.download('benepar_en2')
31 | EOF
--------------------------------------------------------------------------------
/FactualityPrompt/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__init__.py
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/claim_handling.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/claim_handling.cpython-38.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/const.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/const.cpython-310.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/const.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/const.cpython-38.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/const.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/const.cpython-39.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/factuality_metric.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/factuality_metric.cpython-38.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/metric.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/metric.cpython-38.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/retriever.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/retriever.cpython-38.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/__pycache__/retriever.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/retriever.cpython-39.pyc
--------------------------------------------------------------------------------
/FactualityPrompt/src/const.py:
--------------------------------------------------------------------------------
1 | DATA_DIR = "/mnt/efs/Haw-Shiuan/true_entropy/FactualityPrompt/data" # absolute path to data directory
2 | GEN_DIR = "/mnt/efs/Haw-Shiuan/llm-aymptotic-decoding/REAL_sampling/outputs/factual_gen" # absolute path to generations save directory
3 | HOME_DIR = "/mnt/efs/Haw-Shiuan/llm-aymptotic-decoding/FactualityPrompt" # absolute path to this project directory
4 |
--------------------------------------------------------------------------------
/FactualityPrompt/src/factuality_metric.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from fairseq.data.data_utils import collate_tokens
3 | import numpy as np
4 | import re
5 |
6 | NLI_MODEL = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
7 | NLI_MODEL.eval()
8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9 | softmax = torch.nn.Softmax(dim=1)
10 | NLI_MODEL.to(device)
11 |
12 |
13 | '''
14 | Returns ([[contradiction, neutral, entailment]], argmax)
15 | '''
16 | def nli_metric_batch(batch_of_pairs):
17 | # batch_of_pairs = [
18 | # ['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'],
19 | # ['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'],
20 | # ['potatoes are awesome.', 'I like to run.'],
21 | # ['Mars is very far from earth.', 'Mars is very close.'],
22 | # ]
23 |
24 | encoded_tokens = [NLI_MODEL.encode(pair[0], pair[1]) for pair in batch_of_pairs]
25 | encoded_tokens = [tokens[:min(len(tokens), 512)] for tokens in encoded_tokens] # truncate any long sequence to the 512-token limit
26 | batch = collate_tokens(
27 | encoded_tokens, pad_idx=1
28 | )
29 |
30 | logprobs = NLI_MODEL.predict('mnli', batch)
31 | logits = softmax(logprobs)
32 | labels = logits.argmax(dim=1) # logprobs.argmax(dim=1)
33 |
34 | return logits.tolist(), labels.tolist()
35 |
36 |
37 |
38 | def nli_metric(premise, hypothesis):
39 |
40 | # Encode a pair of sentences and make a prediction
41 | # tokens = NLI_MODEL.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')
42 | tokens = NLI_MODEL.encode(premise, hypothesis)
43 |
44 | seq_len = min(len(tokens), 512)
45 |
46 | logits = NLI_MODEL.predict('mnli', tokens[:seq_len])
47 | logits = softmax(logits)
48 | label = logits.argmax() # 0: contradiction
49 |
50 | return logits.tolist(), label.tolist()
51 |
52 |
53 | # ('As much as', 'CARDINAL')
54 | # ('About 20', 'CARDINAL')
55 | # ('67', 'CARDINAL'),
56 | # ('14,000 meters', 'QUANTITY') vs ('1.4 kilometers', 'QUANTITY')
57 |
58 | def ner_metric(named_entities, prompt_wiki_candidates):
59 |
60 | wiki_text = " ".join(prompt_wiki_candidates).lower()
61 |
62 | # TODO: improve the NE matching here;
63 | # handle DATE, TIME, etc. better! They appear a lot but are handled poorly
64 |
65 | existing_correct_ne = []
66 | for ent in named_entities:
67 | ent_text = ent[0].lower()
68 | if 'the ' in ent_text:
69 | ent_text = ent_text.replace('the ', "")
70 |
71 | if ent_text in wiki_text:
72 | existing_correct_ne.append(ent)
73 | elif any([bool(word in wiki_text) for word in ent_text.split(" ") if ent[1] == 'PERSON']):
74 | # handle shorter forms of same NE: Exists "Marcus Morgan Bentley", but NE is "Marcus Bentley" or "Bentley"
75 | existing_correct_ne.append(ent)
76 | elif ent[1] == 'DATE':
77 | date_str = re.sub(r"[,.;@#?!&$]+\ *", " ", ent_text)
78 | date_str = date_str.replace("st", "")
79 | date_str = date_str.replace("nd", "")
80 | date_str = date_str.replace("th", "")
81 | date_str = date_str.replace("of", "")
82 | date_tokens = date_str.split(" ")
83 |
84 | if all([bool(token in wiki_text) for token in date_tokens]):
85 | existing_correct_ne.append(ent)
86 |
87 |
88 |
89 | correct_ratio = len(existing_correct_ne) / len(named_entities) if named_entities else 0.0  # guard against an empty entity list
90 |
91 | return correct_ratio
92 |
93 |
94 | def ie_metric(claims, evidences):
95 | raise NotImplementedError
96 |
97 |
98 |
99 | if __name__ == '__main__':
100 |
101 | print("Hi")
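A short usage sketch for the NLI metric above. Pairs are `[premise, hypothesis]`, and per the docstring the label indices are 0 = contradiction, 1 = neutral, 2 = entailment.

```
# Sketch: batch-scoring claim/evidence pairs with the RoBERTa-MNLI model above.
pairs = [
    ["Paris is the capital of France.", "France's capital is Paris."],
    ["Paris is the capital of France.", "Paris is a small village."],
]
probs, labels = nli_metric_batch(pairs)
names = ["contradiction", "neutral", "entailment"]
for (_, hypothesis), label in zip(pairs, labels):
    print(hypothesis, "->", names[label])  # expect entailment, then contradiction
```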
--------------------------------------------------------------------------------
/FactualityPrompt/src/repetition.py:
--------------------------------------------------------------------------------
1 | '''
2 | This code is adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/repetition.py by Ari Holtzman.
3 | '''
4 | import argparse
5 | import json
6 | import os
7 |
8 | from transformers import GPT2Tokenizer
9 |
10 | from src.const import DATA_DIR, HOME_DIR, GEN_DIR
11 |
12 |
13 | def parse_args() -> argparse.Namespace:
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument("file", type=str)
16 | parser.add_argument("--eval_dir", type=str, default = '')
17 | parser.add_argument("--numbers-only", action="store_true")
18 | parser.add_argument("--output", action="store_true")
19 | parser.add_argument("--final", action="store_true")
20 | parser.add_argument('--num_eval_sent', type=int, default=1)
21 |
22 | return parser.parse_args()
23 |
24 |
25 | def main():
26 | args = parse_args()
27 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large", do_lower_case=True)
28 | SEP = tokenizer.encode(tokenizer.bos_token)[0]
29 |
30 | objs = []
31 | max_n = 90
32 |
33 | if len(args.eval_dir) > 0:
34 | args.file = "{}/{}".format(args.eval_dir, args.file)
35 | else:
36 | args.file = "{}/{}".format(GEN_DIR, args.file)
37 | with open(args.file, 'r') as fin:
38 | for l in fin:
39 | objs.append(json.loads(l.strip()))
40 |
41 | n_repeated_examples = 0
42 | repeated_times_sum = 0
43 |
44 | nn = len(objs)
45 | for idx, obj in enumerate(objs):
46 | #print(obj)
47 | gen = obj['text']
48 | if len(gen) == 0:
49 | continue
50 |
51 | if "WikiNamePrefix" in args.file:
52 | wikiPrefix = obj['prompt'].split(". ")[-1].strip()
53 | gen = gen.replace(wikiPrefix, " ")
54 |
55 | if gen.endswith(tokenizer.bos_token): # gen is a string here, so compare against the separator text, not the token id
56 | gen = gen[:-len(tokenizer.bos_token)]
57 | rev_gen = list(reversed(gen))
58 | last_n_repeats = [0] * max_n
59 |
60 | for n in range(1, max_n + 1):
61 | n_repeat = 1
62 | while len(rev_gen[n*n_repeat:n*(n_repeat+1)]) == n and \
63 | rev_gen[n*n_repeat:n*(n_repeat+1)] == rev_gen[:n]:
64 | n_repeat += 1
65 | last_n_repeats[n - 1] = n_repeat
66 | max_repeated_n = max(range(max_n), key=lambda x: last_n_repeats[x])
67 |
68 | if last_n_repeats[max_repeated_n] > 1 and (max_repeated_n+1 >= 3 or last_n_repeats[max_repeated_n] > 50):
69 | obj['repetition'] = {
70 | 'repeated_phrase': list(reversed(rev_gen[:max_repeated_n + 1])),
71 | 'repeated_times': last_n_repeats[max_repeated_n],
72 | 'repeated_phrase_length': max_repeated_n + 1,
73 | }
74 | n_repeated_examples += 1
75 |
76 | repeated_times_sum += last_n_repeats[max_repeated_n]
77 |
78 | else:
79 | obj['repetition'] = None
80 |
81 | # if not args.numbers_only:
82 | # print("filename\tnumber of repeating examples")
83 | # print(f"{os.path.basename(args.file)},{n_repeated_examples},{repeated_times_sum/nn}")
84 | print(f"{n_repeated_examples},{repeated_times_sum/nn}")
85 |
86 | if args.num_eval_sent == 1:
87 | score_folder_name = "scores"
88 | else:
89 | score_folder_name = "scores_s"+str(args.num_eval_sent)
90 | output_folder = os.path.dirname(args.file) + '/' + score_folder_name
91 | if not os.path.exists(output_folder):
92 | os.makedirs(output_folder)
93 | if args.output:
94 | output_filename = os.path.join(os.path.dirname(args.file), score_folder_name, "repetition_" + os.path.basename(args.file))
95 | with open(output_filename, 'w+') as fout:
96 | for obj in objs:
97 | print(json.dumps(obj), file=fout)
98 |
99 | if args.final:
100 | gen_path = output_folder + '/' + os.path.basename(args.file)
101 | res_path = gen_path.replace(".jsonl", "_results.jsonl")
102 | with open(res_path, 'a') as outfile:
103 | res_obj = {
104 | "repetition": n_repeated_examples,
105 | "repetition_ratio": n_repeated_examples / nn
106 | }
107 | json.dump(res_obj, outfile)
108 | outfile.write("\n")
109 |
110 |
111 | if __name__ == '__main__':
112 | main()
113 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Extrapolating an Infinite LLM♾🤖
2 |
3 | ## Introduction
4 |
5 | Suppose you have a series of LLMs of different sizes trained on the same data, and you want to increase the factuality and diversity of the text sampled from your largest LLM. Then consider using our proposed REAL sampling and/or APD sampling. In FactualityPrompt, we show that APD + REAL sampling outperforms 13 state-of-the-art sampling methods. Our baselines include typical ([Meister et al., 2022](https://arxiv.org/abs/2202.00666)), eta ([Hewitt et al., 2022](https://arxiv.org/pdf/2210.15191)), EDT ([Zhang et al., 2024](https://arxiv.org/abs/2403.14541)), adaptive ([Zhu et al., 2024](https://arxiv.org/abs/2402.18223)), mirostat ([Basu et al., 2021](https://arxiv.org/abs/2007.14966)), EAD w/o ELI ([Arora et al., 2023](https://arxiv.org/abs/2302.06784)), factual ([Lee et al., 2022](https://arxiv.org/abs/2206.04624)), top-p ([Holtzman et al., 2020](https://arxiv.org/pdf/1904.09751)), top-k ([Fan et al., 2018](https://arxiv.org/pdf/1805.04833)), and temperature sampling, as well as contrastive search ([Su and Collier, 2022](https://arxiv.org/pdf/2210.14140)), contrastive decoding (CD) ([Li et al., 2022](https://arxiv.org/pdf/2210.15097)), and DoLa ([Chuang et al., 2023](https://arxiv.org/pdf/2309.03883)). We show that APD + REAL sampling makes Pythia 6.9B simultaneously achieve the factuality of greedy decoding and the diversity of top-p sampling with p=0.5.
6 |
7 | 
8 |
9 | ## Usage
10 |
11 | To run our code, please follow the instructions in the README.md of each folder.
12 |
13 | We first wrote the REAL sampling code in the REAL_sampling folder and then revised it for APD sampling in the AP_sampling folder; as a result, AP_sampling also includes the inference code for REAL sampling. We also slightly modified the code of FactualityPrompt (https://github.com/nayeon7lee/FactualityPrompt) to make it easier to run.
14 |
15 | ## Computational Resources
16 |
17 | Our code assumes that your machine has 8 GPUs and that each GPU has 32GB of memory. If you have fewer GPUs, or your GPUs have less memory, you can try reducing the sizes of your generation models.
18 |
19 | ## Questions
20 |
21 | If you have any questions or find any bugs, please send an email to Haw-Shiuan Chang (hschang@cs.umass.edu).
22 |
23 | ## Security
24 |
25 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
26 |
27 | ## License
28 |
29 | This library is licensed under the [Creative Commons Attribution-NonCommercial 4.0 International](https://creativecommons.org/licenses/by-nc/4.0/) License.
30 |
31 | ## Citation
32 |
33 | If you use our code for the THF model or REAL sampling in your work, please consider citing https://arxiv.org/abs/2406.07735 .
34 | ```
35 | @misc{chang2024realsamplingboostingfactuality,
36 | title={REAL Sampling: Boosting Factuality and Diversity of Open-Ended Generation via Asymptotic Entropy},
37 | author={Haw-Shiuan Chang and Nanyun Peng and Mohit Bansal and Anil Ramakrishna and Tagyoung Chung},
38 | year={2024},
39 | eprint={2406.07735},
40 | archivePrefix={arXiv},
41 | primaryClass={cs.CL},
42 | url={https://arxiv.org/abs/2406.07735},
43 | }
44 | ```
45 |
46 | If you use our code for APD sampling in your work, please consider citing https://arxiv.org/abs/2411.01610 (see the example BibTeX entry below).
47 | ```
48 | @inproceedings{chang2024explaining,
49 | title={Explaining and Improving Contrastive Decoding by Extrapolating the Probabilities of a Huge and Hypothetical LM},
50 | author={Chang, Haw-Shiuan and Peng, Nanyun and Bansal, Mohit and Ramakrishna, Anil and Chung, Tagyoung},
51 | booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
52 | year={2024},
53 | }
54 | ```
55 |
56 | If you use FactualityPrompt, please cite their paper (https://arxiv.org/abs/2206.04624).
57 |
--------------------------------------------------------------------------------
/REAL_sampling/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/.DS_Store
--------------------------------------------------------------------------------
/REAL_sampling/README.md:
--------------------------------------------------------------------------------
1 | # REAL Sampling: Boosting Factuality and Diversity of Open-Ended Generation by Extrapolating the Entropy of an Infinitely Large LM
2 |
3 | 
4 |
5 | ## Introduction
6 |
7 | REAL (**R**esidual **E**ntropy from **A**symptotic **L**ine) sampling is a decoding method that achieves better factuality and diversity than nucleus sampling by predicting an adaptive threshold of p. Specifically, REAL sampling predicts the step-wise likelihood that an LLM will hallucinate, and lowers the p threshold when the LLM is likely to hallucinate. Otherwise, REAL sampling raises the p threshold to boost diversity. To predict the step-wise hallucination likelihood without supervision, we construct a Token-level Hallucination Forecasting (THF) model that predicts the asymptotic entropy (i.e., inherent uncertainty) of the next token by extrapolating the next-token entropies from a series of LLMs of different sizes. If an LLM's entropy is higher than the asymptotic entropy (i.e., the LLM is more uncertain than it should be), the THF model predicts a high hallucination hazard, which leads to a lower p threshold in REAL sampling. On the FactualityPrompts benchmark, we demonstrate that REAL sampling based on a 70M THF model can substantially improve the factuality and diversity of 7B LLMs simultaneously, as judged by both retrieval-based metrics and human evaluation.
8 |
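The exact thresholding rule is defined in the paper and in src/. Purely to illustrate the mechanism described above, here is a conceptual sketch in which the gap between the LLM's entropy and the THF model's asymptotic-entropy prediction shrinks the nucleus; the exponential decay and constants are made up for illustration and are not the method's actual formula.

```
import math

def conceptual_real_threshold(llm_entropy, thf_asymptotic_entropy, base_p=0.9):
    # Hazard: how much more uncertain the LLM is than the THF model predicts
    # it inherently should be; a positive gap signals likely hallucination.
    hazard = max(0.0, llm_entropy - thf_asymptotic_entropy)
    # Illustrative only: decay p toward a floor as the hazard grows, and let it
    # rise toward 1.0 (more diversity) when the LLM is no more uncertain than
    # its asymptotic entropy. See the paper for REAL sampling's actual rule.
    return (1.0 - base_p) + base_p * math.exp(-hazard)
```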
9 | ## Computational Environment
10 |
11 | You can reproduce our Python environment using
12 | ```
13 | conda create --name <env_name> --file requirements.txt
14 | ```
15 | ## How to run REAL sampling
16 |
17 | To learn how to use REAL sampling with Hugging Face models, please see the following example code:
18 |
19 | ```
20 | ./src/example.py
21 | ```
22 |
23 | ### Run FactualityPrompts
24 |
25 | First, follow ../FactualityPrompt/README.md to download the data and update ../FactualityPrompt/src/const.py accordingly.
26 |
27 | If you have more than 7 GPUs in your machine, you can then run the following script to generate the continuations.
28 | ```
29 | ./bin/continue_wiki_prompt_loop.sh
30 | ```
31 |
32 | To evaluate the generation results, run the following script.
33 | ```
34 | ../FactualityPrompt/bin/eval_loop.sh
35 | ```
36 |
37 |
38 | ## How to Train THF
39 |
40 |
41 | Put your text file into "data/raw/".
42 |
43 | Change the INPUT_FILE in bin/train_THF_model.sh and run it (assuming you have more than 7 GPUs in your machine).
44 |
45 |
46 | ## How to use THF to produce unsupervised features for hallucination detection tasks
47 |
48 | Please check src/process_hallucination_dataset/get_entropy_all.py and analyze_datasets/feature_clf_all.py
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/REAL_sampling/bin/continue_wiki_prompt_loop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | prompt_folder='../FactualityPrompt/prompts/'
4 | model_name='EleutherAI/pythia-6.9b-deduped'
5 | #model_name='openlm-research/open_llama_7b_v2'
6 | #model_name='facebook/opt-6.7b'
7 |
8 | #export CUDA_LAUNCH_BLOCKING=1
9 |
10 | dataset_suffix='_test7k'
11 |
12 | temperature='1'
13 |
14 | METHOD_ARR=( 'fe_topp' 'topp' )
15 | MODEL_ARR=( 'OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' '' )
16 | SUBMETHOD_ARR=( 'exp_1_win' 'a' )
17 | P_ARR=( '1.0' '0;0.3;0.5;0.6;0.7;1' )
18 | DT_ARR=( '0.5;0.7;1.0;2.0;3.0;4.0' '1.0' )
19 |
20 | #METHOD_ARR=( 'topp' 'eta' 'typical' 'decay_period' 'topk' )
21 | #MODEL_ARR=( '' '' '' '' '' )
22 | #SUBMETHOD_ARR=( 'a' 'a' 'a' 'a' 'a' )
23 | #P_ARR=( '1.0;0.8;0.7;0.6;0.5;0.4;0.3' '0.1;0.3;0.8' '2;0.9;0.5;0.3' '0.9' '10;5;3;2;1' )
24 | #DT_ARR=( '1' '1' '1' '0.95;0.9;0.7;0.5;0.3;0.1' '1' )
25 |
26 | #METHOD_ARR=( 'CD' )
27 | #MODEL_ARR=( 'EleutherAI/pythia-70m-deduped' )
28 | #SUBMETHOD_ARR=( 'a' )
29 | #P_ARR=( '0.6;0.4;0.2;0.25;0.1;0.05' )
30 | #DT_ARR=( '1.0' )
31 |
32 | #METHOD_ARR=( 'fe_CD_topp' 'fe_topp_period' 'topp' )
33 | #MODEL_ARR=( 'OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' 'OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' '')
34 | #SUBMETHOD_ARR=( 'exp_1_win' 'exp_1_win' 'a' )
35 | #P_ARR=( '1.0' '0.9' '1.0' )
36 | #DT_ARR=( '4.0;0.7;1.5' '5.0' '0.1;0.3;0.7;0.9' )
37 |
38 | #METHOD_ARR=( 'CD' )
39 | #MODEL_ARR=( 'facebook/opt-125m' )
40 | #SUBMETHOD_ARR=( 'a' )
41 | #P_ARR=( '0.1;0.05' )
42 | #DT_ARR=( '0.25' )
43 |
44 | #METHOD_ARR=( 'CS' )
45 | #MODEL_ARR=( '' )
46 | #SUBMETHOD_ARR=( 'a' )
47 | #P_ARR=( '5.0' )
48 | #DT_ARR=( '0.6' )
49 | #P_ARR=( '5.0;10' )
50 | #DT_ARR=( '0.4;0.6' )
51 |
52 | init_existing_seeds=0
53 | repeat_times=4
54 |
55 | dataset_names=("fever_factual${dataset_suffix}_final.jsonl" "fever_nonfactual${dataset_suffix}_final.jsonl")
56 |
57 | #input_datasets=($(for x in "${dataset_names[@]}"; do printf "$x%.0s " {1..${repeat_times}}; done))
58 | #for v in ${dataset_names[@]}; do for i in $(seq 1 $repeat_times); do echo $v; done; done
59 |
60 | END=$(($init_existing_seeds + $repeat_times - 1))
61 | input_datasets=($(for v in ${dataset_names[@]}; do for i in $(seq 1 $repeat_times); do echo $v; done; done))
62 | existing_seeds_arr=($(seq $init_existing_seeds $END))
63 | existing_seeds_arr=("${existing_seeds_arr[@]}" "${existing_seeds_arr[@]}")
64 | echo ${input_datasets[@]}
65 | echo ${existing_seeds_arr[@]}
66 |
67 | for j in "${!METHOD_ARR[@]}"; do
68 | MODEL=${MODEL_ARR[$j]}
69 | sample_method=${METHOD_ARR[$j]}
70 | sample_sub_method=${SUBMETHOD_ARR[$j]}
71 | top_p_all=${P_ARR[$j]}
72 | decay_temperature_all=${DT_ARR[$j]}
73 |
74 | final_entropy_model_path="models/$MODEL"
75 | batch_size=8
76 | if [[ $MODEL == *"410"* ]]; then
77 | batch_size=4
78 | fi
79 | if [[ $MODEL == *"_1b_"* ]]; then
80 | batch_size=2
81 | fi
82 | if [[ $MODEL == *"EleutherAI"* ]]; then
83 | batch_size=4
84 | fi
85 | if [[ $sample_method == *"fe_CD"* ]]; then
86 | batch_size=4
87 | fi
88 | if [[ $sample_method == *"CS"* ]]; then
89 | batch_size=1
90 | fi
91 | IFS=";" read -r -a top_p_list <<< "${top_p_all}"
92 | IFS=";" read -r -a decay_temperature_list <<< "${decay_temperature_all}"
93 | for top_p in "${top_p_list[@]}"; do
94 | for decay_temperature in "${decay_temperature_list[@]}"; do
95 | pids=()
96 | for i in "${!input_datasets[@]}";
97 | do
98 | dataset_name=${input_datasets[$i]}
99 | num_existing_seeds=${existing_seeds_arr[$i]}
100 | echo "python src/factual_gen/gen_fp.py --model_name=$model_name --input_file_name ${prompt_folder}/$dataset_name --cuda_idx $i --p $top_p --num_existing_seeds $num_existing_seeds --sample_method $sample_method --final_entropy_model_path $final_entropy_model_path --batch_size $batch_size --decay_temperature $decay_temperature --temperature $temperature --sample_sub_method $sample_sub_method"
101 | #sleep 1 &
102 | python src/factual_gen/gen_fp.py --model_name=$model_name --input_file_name ${prompt_folder}/$dataset_name --cuda_idx $i --p $top_p --num_existing_seeds $num_existing_seeds --sample_method $sample_method --final_entropy_model_path $final_entropy_model_path --batch_size $batch_size --decay_temperature $decay_temperature --temperature $temperature --sample_sub_method $sample_sub_method &
103 | pids+=($!)
104 | done
105 | echo "${pids[@]}"
106 | wait "${pids[@]}"
107 | done
108 | done
109 | done
110 |
--------------------------------------------------------------------------------
/REAL_sampling/bin/train_THF_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #INPUT_FILE="data/raw/wiki2021_text_only_1e4"
4 | #PROC_FOLDER="data/processed/wiki_1e4_Pythia_temp/"
5 | #TOKENIZER="EleutherAI/pythia-70m-deduped"
6 | #OUTPUT_MODEL_FOLDER="models/wiki_1e4_70M_bsz_128_exp_pred_last_a10_e3"
7 |
8 | INPUT_FILE="data/raw/OWT_wiki_1e7"
9 | PROC_FOLDER="data/processed/OWT_wiki_1e7_Pythia/"
10 | TOKENIZER="EleutherAI/pythia-70m-deduped"
11 | OUTPUT_MODEL_FOLDER="models/OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3"
12 |
13 | echo "python src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER"
14 | python src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER
15 |
16 | declare -a bsz_arr=(1 2 4 4 8 12 16)
17 | declare -a model_arr=("EleutherAI/pythia-6.9b-deduped" "EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1.4b-deduped" "EleutherAI/pythia-1b-deduped" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-160m-deduped" "EleutherAI/pythia-70m-deduped" )
18 |
19 | pids=()
20 | for i in "${!model_arr[@]}";
21 | do
22 | model_name=${model_arr[$i]}
23 | batch_size=${bsz_arr[$i]}
24 | echo "python src/collect_gt_entropy.py --model_name=$model_name --input_folder_name $PROC_FOLDER --cuda_idx $i --batch_size $batch_size"
25 | python src/collect_gt_entropy.py --model_name=$model_name --input_folder_name $PROC_FOLDER --cuda_idx $i --batch_size $batch_size &
26 | pids+=($!)
27 | done
28 | echo "${pids[@]}"
29 | wait "${pids[@]}"
30 |
31 | echo "python src/train_entropy_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $PROC_FOLDER/entropy_tensor_1024/train --validation_label_folder $PROC_FOLDER/entropy_tensor_1024/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 5000 --num_train_epochs 3"
32 | python src/train_entropy_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $PROC_FOLDER/entropy_tensor_1024/train --validation_label_folder $PROC_FOLDER/entropy_tensor_1024/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 5000 --num_train_epochs 3
33 |
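Note: the three stages above hand data to each other purely through the file system. A minimal sanity-check sketch (assuming the default paths above and the filename pattern used by src/collect_gt_entropy.py, where --bptt defaults to 1024) that verifies every intermediate artifact exists before the final training step:

import os

PROC_FOLDER = "data/processed/OWT_wiki_1e7_Pythia"  # matches the script above
MODELS = ["EleutherAI/pythia-%s-deduped" % s
          for s in ("6.9b", "2.8b", "1.4b", "1b", "410m", "160m", "70m")]

expected = ["tensors_all/train.pt", "tensors_all/val_org.pt"]
for split in ("train", "val"):
    for m in MODELS:
        # filename pattern from collect_gt_entropy.py
        expected.append("entropy_tensor_1024/%s/ent_%s_bptt_1024.pt" % (split, m.replace("/", "_")))

for rel in expected:
    path = os.path.join(PROC_FOLDER, rel)
    print(("OK      " if os.path.exists(path) else "MISSING ") + path)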
--------------------------------------------------------------------------------
/REAL_sampling/imgs/REAL_second_figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/imgs/REAL_second_figure.png
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/data_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/data_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/data_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/data_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/data_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/data_utils.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-310.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/model.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-39.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/analyze_datasets/utils.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from torch.nn import functional as F
4 | import codecs
5 | import json
6 | import spacy
7 | from sklearn.metrics import classification_report, accuracy_score, hamming_loss, \
8 | f1_score, precision_score, recall_score, average_precision_score, roc_auc_score, confusion_matrix, \
9 | brier_score_loss
10 | import numpy as np
11 |
12 |
13 | def binary_eval(predy, testy, predy_pro, verbose=True, return_f1=False, predscore=None):
14 | acc = accuracy_score(testy, predy)
15 | f1 = f1_score(testy, predy, average=None)
16 | precision = precision_score(testy, predy, average=None)
17 | recall = recall_score(testy, predy, average=None)
18 |
19 | average_precision = average_precision_score(testy, predy_pro)
20 | epsilon = 1e-8
21 | bss = roc_auc = None  # guard: keeps the return_f1 branch from raising NameError when predscore is None
22 | htn, hfp, hfn, htp = confusion_matrix(testy, predy).ravel()
23 | hsensi = htp / (htp + hfn + epsilon)
24 | hspec = htn / (hfp + htn + epsilon)
25 | gmean = np.sqrt(hsensi*hspec)
26 |
27 |
28 | info = "Acc : {}\nf1 : {}\nprecision : {}\nrecall : {}\nG-mean : {}\nAP : {}".format(acc,
29 | " ".join([str(x) for x in f1]), " ".join([str(x) for x in precision]),
30 | " ".join([str(x) for x in recall]), gmean, average_precision)
31 |
32 | if predscore is not None:
33 | bss = brier_score_loss(testy, predscore)
34 | roc_auc = roc_auc_score(testy, predscore)
35 | info += "\nbss : {}\nROC-AUC : {}".format(bss, roc_auc)
36 |
37 | if verbose:
38 | print(info)
39 |
40 | if return_f1:
41 | return acc, f1, precision, recall, gmean, bss, roc_auc, info
42 | else:
43 | #return acc, info
44 | return average_precision, info
45 |
46 |
47 | def subsets(nums):
48 | """
49 | :type nums: List[int]
50 | :rtype: List[List[int]]
51 | """
52 | ans = []
53 | def dfs(curpos, tmp):
54 | if tmp:
55 | ans.append(tmp[:])
56 | for i in range(curpos, len(nums)):
57 | tmp.append(nums[i])
58 | dfs(i+1, tmp)
59 | tmp.pop(-1)
60 | dfs(0, [])
61 | return ans
62 |
63 |
64 | def sent_ner_bounds(sen, nlp=None):
65 | if nlp is None:
66 | nlp = spacy.load('en_core_web_sm')  # the 'en' shorthand was removed in spaCy v3
67 | tokens, tags = [], []
68 | print(sen)
69 | for doc in nlp.pipe([sen]):
70 | for token in doc:
71 | tags.append(token.ent_iob_)
72 | tokens.append(str(token))
73 |
74 | rep_pos = []
75 | vis = [False for _ in range(len(tags))]
76 | for idx, tag in enumerate(tags):
77 | if tag == 'O':
78 | rep_pos.append([idx, idx])
79 | vis[idx] = True
80 | elif tag == 'B':
81 | end = idx
82 | for j in range(idx+1, len(tags)):
83 | if tags[j] == 'I':
84 | end = j
85 | else:
86 | break
87 | rep_pos.append([idx, end])
88 | elif tag == 'I':
89 | continue
90 |
91 | return ' '.join(tokens), rep_pos
92 |
93 |
94 | def remove_marked_sen(sen, start_id, end_id):
95 | tokens = sen if type(sen) == list else sen.strip().split()
96 | if tokens[start_id].startswith("===") and tokens[end_id].endswith("==="):
97 | tokens[start_id] = tokens[start_id][3:]
98 | tokens[end_id] = tokens[end_id][:-3]
99 | return tokens
100 |
101 |
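A toy invocation of binary_eval, for orientation (the values are illustrative and the import path is hypothetical; it depends on where this file sits on sys.path):

import numpy as np
from utils import binary_eval  # hypothetical import; run from this file's directory

testy     = np.array([0, 1, 1, 0, 1, 0])              # gold labels
predy     = np.array([0, 1, 0, 0, 1, 1])              # hard predictions
predy_pro = np.array([0.2, 0.9, 0.4, 0.1, 0.8, 0.6])  # positive-class scores

# default return_f1=False -> returns (average_precision, info) and prints the report
ap, info = binary_eval(predy, testy, predy_pro, verbose=True)
print("AP:", ap)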
--------------------------------------------------------------------------------
/REAL_sampling/src/collect_gt_entropy.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import numpy as np
4 | import os
5 | import argparse
6 | from data_utils import load_corpus
7 | from tqdm import tqdm
8 |
9 | def word_ent(model, input_ids):
10 | #top_k_val = 5
11 | assert model is not None
12 | input_ids = input_ids.to(model.device)
13 | outputs = model(input_ids, labels=input_ids)
14 | loss, logits = outputs[:2]
15 | probs = logits.softmax(dim=-1)
16 | ent = - (probs * (1e-23+probs).log() ).sum(dim=-1) # per-position Shannon entropy in nats; 1e-23 avoids log(0)
17 | #top_val, top_idx= torch.topk(probs.squeeze(), k=top_k_val, dim=-1)
18 | #top_idx = top_idx.tolist()
19 | #print(top_idx)
20 | #top_tok = [tokenizer.convert_ids_to_tokens(top_idx[i]) for i in range(len(top_idx))]
21 | #return ent.cpu(), top_tok, top_val.cpu()
22 | return ent.cpu()
23 |
24 | def str2bool(v):
25 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
26 | return True
27 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
28 | return False
29 | else:
30 | raise argparse.ArgumentTypeError('Boolean value expected.')
31 |
32 | def parse_args():
33 | parser = argparse.ArgumentParser()
34 | parser.add_argument("--model_name", type=str, required=True)
35 | parser.add_argument("--input_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
36 | #parser.add_argument("--output_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
37 | #parser.add_argument("--output_tensor_folder", type=str, default = 'entropy_tensor')
38 | parser.add_argument("--output_tensor_folder", type=str, default = 'entropy_tensor_1024')
39 | parser.add_argument("--tensor_folder", type=str, default = 'tensors_all')
40 | parser.add_argument("--do_train", type=str2bool, nargs='?', default=True)
41 | parser.add_argument("--do_val", type=str2bool, nargs='?', default=True)
42 | parser.add_argument("--batch_size", type=int, default=8)
43 | #parser.add_argument("--eval_batch_size", type=int, default=16)
44 | #parser.add_argument("--bptt", type=int, default=256)
45 | parser.add_argument("--bptt", type=int, default=1024)
46 | parser.add_argument("--cuda_idx", type=int, default=0)
47 |
48 | args = parser.parse_args()
49 | return args
50 |
51 | #model_name = 'EleutherAI/pythia-70m-deduped'
52 |
53 | def compute_ent(args, model, model_name, dataloader, save_folder_name):
54 | output_entropy = []
55 | with torch.no_grad():
56 | #for i_batch, sample_batched in enumerate(dataloader_train):
57 | for sample_batched in tqdm(dataloader):
58 | entropy_tensor = word_ent( model, sample_batched )
59 | output_entropy.append(entropy_tensor)
60 |
61 | output_tensor = torch.cat(output_entropy, dim=0)
62 | print(model_name)
63 | print(args.cuda_idx)
64 | print(output_tensor)
65 | print(output_tensor.size())
66 | del output_entropy
67 | output_dir = args.input_folder_name + '/' + args.output_tensor_folder + '/' + save_folder_name
68 | if not os.path.exists(output_dir):
69 | os.makedirs(output_dir)
70 | output_file_name = output_dir + '/ent_' + model_name.replace('/','_') + '_bptt_' + str(args.bptt) + '.pt'
71 | torch.save(output_tensor, output_file_name)
72 |
73 |
74 |
75 | def main(args):
76 | model_name = args.model_name
77 | #tokenizer = AutoTokenizer.from_pretrained(model_name)
78 | #model = AutoModelWithLMHead.from_pretrained(model_name)
79 | model = AutoModelForCausalLM.from_pretrained(model_name)
80 | #device = torch.cuda.device(args.cuda_idx)
81 | device = torch.device("cuda:"+str(args.cuda_idx))
82 | model.eval()
83 | model.to(device)
84 |
85 | print(args.do_train)
86 | print(args.do_val)
87 | skip_training = False
88 | #dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = False, load_testing=False)
89 | dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = True, load_testing=False)
90 |
91 | if args.do_train:
92 | compute_ent(args, model, model_name, dataloader_train, 'train')
93 | if args.do_val:
94 | compute_ent(args, model, model_name, dataloader_val, 'val')
95 |
96 | if __name__ == "__main__":
97 | args = parse_args()
98 | main(args)
99 |
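The core quantity collected here is the per-position Shannon entropy of the next-token distribution. A tiny self-contained check of the same formula used in word_ent, with toy logits instead of a model:

import torch

# one sequence, two positions, vocabulary of 4 symbols
logits = torch.tensor([[[4.0, 0.0, 0.0, 0.0],    # peaked distribution -> low entropy
                        [1.0, 1.0, 1.0, 1.0]]])  # uniform distribution -> ln(4) nats
probs = logits.softmax(dim=-1)
ent = -(probs * (1e-23 + probs).log()).sum(dim=-1)  # same expression as word_ent above
print(ent)  # ~[[0.26, 1.386]]; ln(4) ≈ 1.386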
--------------------------------------------------------------------------------
/REAL_sampling/src/collect_gt_perplexity.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import numpy as np
4 | import os
5 | import argparse
6 | from data_utils import load_corpus
7 | from tqdm import tqdm
8 |
9 | loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
10 |
11 | def word_ent(model, input_ids): # name kept from the entropy script; here it returns per-token NLL (cross-entropy), not entropy
12 | #top_k_val = 5
13 | assert model is not None
14 | input_ids = input_ids.to(model.device)
15 | outputs = model(input_ids, labels=input_ids)
16 | loss, logits = outputs[:2]
17 | #probs = logits.softmax(dim=-1)
18 | #ent = - (probs * (1e-23+probs).log() ).sum(dim=-1)
19 | #return ent.cpu()
20 |
21 | labels = input_ids
22 | # we are doing next-token prediction; shift prediction scores and input ids by one
23 | shift_logits = logits[:, :-1, :].contiguous()
24 | shift_labels = labels[:, 1:].contiguous()
25 | bsz, seq_len_minus_one = shift_labels.size()
26 | lm_per = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).view(bsz, seq_len_minus_one)
27 | lm_per = torch.cat( (lm_per, torch.zeros( (bsz,1), device = model.device ) ), dim=1 ) # pad a trailing zero so the output matches the input length
28 | return lm_per.cpu()
29 |
30 | def str2bool(v):
31 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
32 | return True
33 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
34 | return False
35 | else:
36 | raise argparse.ArgumentTypeError('Boolean value expected.')
37 |
38 | def parse_args():
39 | parser = argparse.ArgumentParser()
40 | parser.add_argument("--model_name", type=str, required=True)
41 | parser.add_argument("--input_folder_name", type=str, required=True, help="e.g., data/processed/openwebtext17-18_1e6_Pythia")
42 | #parser.add_argument("--output_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
43 | parser.add_argument("--output_tensor_folder", type=str, default = 'perplexity_tensor')
44 | parser.add_argument("--tensor_folder", type=str, default = 'tensors_all')
45 | parser.add_argument("--do_train", type=str2bool, nargs='?', default=True)
46 | parser.add_argument("--do_val", type=str2bool, nargs='?', default=True)
47 | parser.add_argument("--batch_size", type=int, default=8)
48 | #parser.add_argument("--eval_batch_size", type=int, default=16)
49 | parser.add_argument("--bptt", type=int, default=256)
50 | parser.add_argument("--cuda_idx", type=int, default=0)
51 |
52 | args = parser.parse_args()
53 | return args
54 |
55 | #model_name = 'EleutherAI/pythia-70m-deduped'
56 |
57 | def compute_ent(args, model, model_name, dataloader, save_folder_name):
58 | output_entropy = []
59 | with torch.no_grad():
60 | #for i_batch, sample_batched in enumerate(dataloader_train):
61 | for sample_batched in tqdm(dataloader):
62 | entropy_tensor = word_ent( model, sample_batched )
63 | output_entropy.append(entropy_tensor)
64 |
65 | output_tensor = torch.cat(output_entropy, dim=0)
66 | print(model_name)
67 | print(args.cuda_idx)
68 | print(output_tensor)
69 | print(output_tensor.size())
70 | del output_entropy
71 | output_dir = args.input_folder_name + '/' + args.output_tensor_folder + '/' + save_folder_name
72 | if not os.path.exists(output_dir):
73 | os.makedirs(output_dir)
74 | output_file_name = output_dir + '/per_' + model_name.replace('/','_') + '_bptt_' + str(args.bptt) + '.pt'
75 | torch.save(output_tensor, output_file_name)
76 |
77 |
78 |
79 | def main(args):
80 | model_name = args.model_name
81 | #tokenizer = AutoTokenizer.from_pretrained(model_name)
82 | #model = AutoModelWithLMHead.from_pretrained(model_name)
83 | model = AutoModelForCausalLM.from_pretrained(model_name)
84 | #device = torch.cuda.device(args.cuda_idx)
85 | device = torch.device("cuda:"+str(args.cuda_idx))
86 | model.eval()
87 | model.to(device)
88 |
89 | print(args.do_train)
90 | print(args.do_val)
91 | skip_training = False
92 | #dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = False, load_testing=False)
93 | dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = True, load_testing=False)
94 |
95 | if args.do_train:
96 | compute_ent(args, model, model_name, dataloader_train, 'train')
97 | if args.do_val:
98 | compute_ent(args, model, model_name, dataloader_val, 'val')
99 |
100 | if __name__ == "__main__":
101 | args = parse_args()
102 | main(args)
103 |
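Unlike collect_gt_entropy.py, the quantity saved here is the per-token negative log-likelihood. The shift-and-pad alignment is the part that is easy to get wrong; a minimal sketch of just that step, with random tensors:

import torch

loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
bsz, seq_len, vocab = 2, 5, 11
logits = torch.randn(bsz, seq_len, vocab)
input_ids = torch.randint(vocab, (bsz, seq_len))

# position t predicts token t+1, so drop the last logit and the first label
shift_logits = logits[:, :-1, :].contiguous()
shift_labels = input_ids[:, 1:].contiguous()
lm_per = loss_fct(shift_logits.view(-1, vocab), shift_labels.view(-1)).view(bsz, seq_len - 1)

# pad a trailing zero so the output has the same length as the input
lm_per = torch.cat((lm_per, torch.zeros((bsz, 1))), dim=1)
print(lm_per.shape)   # torch.Size([2, 5])
print(lm_per[:, -1])  # tensor([0., 0.]) -- the padding column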
--------------------------------------------------------------------------------
/REAL_sampling/src/colorize.html:
--------------------------------------------------------------------------------
1 | <!-- colorized token visualization (inline span markup apparently stripped in this dump): The quick brown fox jumps over the lazy dog -->
--------------------------------------------------------------------------------
/REAL_sampling/src/data_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | class SeqDataset(torch.utils.data.Dataset):
4 | def __init__(self, w_ind_gpt2_tensor, bptt, device):
5 | self.w_ind_gpt2 = w_ind_gpt2_tensor
6 | self.seq_len = bptt
7 | self.output_device = device
8 |
9 | def __len__(self):
10 | return int( self.w_ind_gpt2.size(0) /self.seq_len )
11 |
12 | def __getitem__(self, idx):
13 | feature = self.w_ind_gpt2[idx*self.seq_len:(idx+1)*self.seq_len].to(dtype = torch.long, device = self.output_device)
14 | return feature
15 |
16 | def create_data_loader(f_in, bsz, bptt, device, dataset_class, shuffle = True):
17 | w_ind_gpt2_tensor = torch.load(f_in, map_location='cpu')
18 | cut_tok_num = w_ind_gpt2_tensor.size(0) % bptt
19 | if cut_tok_num > 0:
20 | w_ind_gpt2_tensor = w_ind_gpt2_tensor[:-cut_tok_num]
21 | dataset = dataset_class(w_ind_gpt2_tensor, bptt, device)
22 | use_cuda = False
23 | if device.type == 'cuda':
24 | use_cuda = True
25 | return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=False) # __getitem__ already places tensors on the target device; CUDA tensors cannot be pinned
26 | #return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=True)
27 |
28 |
29 | def load_corpus(data_path, train_bsz, eval_bsz, bptt, device, tensor_folder = "tensors_all", skip_training = False, shuffle_train=True, shuffle_val = False, load_val = True, load_testing = True):
30 | train_corpus_name = data_path + "/" + tensor_folder + "/train.pt"
31 | val_org_corpus_name = data_path +"/" + tensor_folder + "/val_org.pt"
32 | test_org_corpus_name = data_path +"/" + tensor_folder + "/test_org.pt"
33 |
34 | dataloader_train = []
35 | dataloader_val = []
36 | dataloader_test = []
37 |
38 | dataset_class = SeqDataset
39 |
40 | if load_val:
41 | with open(val_org_corpus_name,'rb') as f_in:
42 | dataloader_val = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val)
43 |
44 | if load_testing:
45 | with open(test_org_corpus_name,'rb') as f_in:
46 | dataloader_test = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val)
47 |
48 | if not skip_training:
49 | with open(train_corpus_name,'rb') as f_in:
50 | dataloader_train = create_data_loader(f_in, train_bsz, bptt, device, dataset_class, shuffle = shuffle_train)
51 |
52 | return dataloader_train, dataloader_val, dataloader_test
53 |
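A minimal sketch of how SeqDataset chops a flat token stream (the format written by src/prepare_id_corpus_from_raw.py) into fixed-length bptt windows; the import path is hypothetical and assumes running from src/:

import torch
from data_utils import SeqDataset  # hypothetical import; run from src/

tokens = torch.arange(10_000, dtype=torch.int32)  # stands in for a loaded train.pt
dataset = SeqDataset(tokens, bptt=1024, device=torch.device('cpu'))
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=False)

batch = next(iter(loader))
print(len(dataset))   # 9 -- the trailing 784 tokens are dropped
print(batch.shape)    # torch.Size([4, 1024])
print(batch.dtype)    # torch.int64 -- __getitem__ casts to long for the LM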
--------------------------------------------------------------------------------
/REAL_sampling/src/example.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('./src/factual_gen/')
3 | from sampling_method import FETopPLogitsWarper, LogitsProcessorList
4 | from transformers import AutoModelForCausalLM, AutoTokenizer
5 | import torch
6 |
7 | sampling = 'REAL'
8 | #sampling = 'REAL + CD'
9 |
10 | LLM = 'Pythia'
11 | #LLM = 'OPT'
12 |
13 | final_entropy_model_path = 'models/OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3'
14 | decay_temperature = 2
15 | window_size = 40
16 | device = torch.device("cuda:0")
17 |
18 | if LLM == 'Pythia':
19 | LM_gen = 'EleutherAI/pythia-6.9b-deduped'
20 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024)
21 | tokenizer_ent = tokenizer
22 | else:
23 | LM_gen = 'facebook/opt-6.7b'
24 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024)
25 | tokenizer_ent = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m-deduped', padding_side='left', model_max_length=1024)
26 |
27 | tokenizer.pad_token = tokenizer.eos_token
28 | tokenizer_ent.pad_token = tokenizer_ent.eos_token
29 |
30 | model = AutoModelForCausalLM.from_pretrained(LM_gen)
31 | model.eval()
32 | model.to(device)
33 |
34 | if sampling == 'REAL':
35 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, device=device)
36 | else:
37 | if LLM == 'Pythia':
38 | student_model_name = 'EleutherAI/pythia-70m-deduped'
39 | else:
40 | student_model_name = 'facebook/opt-125m'
41 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, student_model_name=student_model_name, use_CD_alpha= False, device=device)
42 |
43 | logits_processor = LogitsProcessorList()
44 | logits_processor.append(logits_processor_i)
45 |
46 | input_prompt = " I like to go hiking."
47 | input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
48 |
49 | output_sequences = model.generate(input_ids=input_ids.to(device), pad_token_id=tokenizer.eos_token_id, logits_processor=logits_processor, do_sample=True )
50 | input_len = input_ids.size(-1)
51 | output_con = output_sequences[0,input_len:]
52 | output_text = tokenizer.decode(output_con, skip_special_tokens=True)
53 | print("Input: ", input_prompt)
54 | print("Output: ", output_text)
55 |
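A possible follow-up to the script above (the seeding and max_new_tokens are assumptions, not part of the original; both are standard torch/transformers calls): seed the sampler for reproducibility and request a longer continuation.

torch.manual_seed(0)  # make the sampled continuation reproducible
output_sequences = model.generate(
    input_ids=input_ids.to(device),
    pad_token_id=tokenizer.eos_token_id,
    logits_processor=logits_processor,
    do_sample=True,
    max_new_tokens=64,  # the default generation length is short; ask for more tokens
)
print(tokenizer.decode(output_sequences[0, input_ids.size(-1):], skip_special_tokens=True))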
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/.gen_fp.py.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/.gen_fp.py.swp
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/collect_GPT_results.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | import json
5 |
6 | input_folder = 'outputs/GPT_exp/old/GPT3.5_responses_500/'
7 |
8 | result_dict = {'file_name': [], 'avg_win_rate_F': [], 'avg_win_rate_C': [], 'avg_win_rate_L': [], 'avg_win_rate_O': [], 'avg_score_F': [], 'avg_score_C': [], 'avg_score_L': [], 'avg_score_O': [], 'avg_b_score_F': [], 'avg_b_score_C': [], 'avg_b_score_L': [], 'avg_b_score_O': [], 'avg_diff_score_F': [], 'avg_diff_score_C': [], 'avg_diff_score_L': [], 'avg_diff_score_O': []}
9 |
10 | all_bad_idx = []
11 |
12 | for result_file in os.listdir(input_folder):
13 | file_path = input_folder+result_file
14 | if not os.path.isfile(file_path):
15 | continue
16 | with open(file_path) as f_in:
17 | all_inputs = json.load(f_in)
18 | bad_idx_list = []
19 | if len(all_inputs) == 5:
20 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
21 | all_bad_idx = all_bad_idx + bad_idx_list
22 |
23 | all_bad_idx_set = set(all_bad_idx)
24 |
25 | print(all_bad_idx_set)
26 |
27 | #for result_file in input_file_list:
28 | for result_file in os.listdir(input_folder):
29 | file_path = input_folder+result_file
30 | if not os.path.isfile(file_path):
31 | continue
32 | with open(file_path) as f_in:
33 | all_inputs = json.load(f_in)
34 | if len(all_inputs) == 4:
35 | pred_method_name, base_method_name, system_prompt1, all_list = all_inputs
36 | elif len(all_inputs) == 5:
37 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
38 | id_list, context_list_pred, gen_list_pred, gen_list_base, ref_list, prompt_list, first_res_list, response_list, parse_win_list, parse_score_pred_list, parse_score_base_list = zip(*all_list)
39 | avg_win_rate_F = []
40 | avg_win_rate_C = []
41 | avg_win_rate_L = []
42 | avg_win_rate_O = []
43 | avg_score_F = []
44 | avg_score_C = []
45 | avg_score_L = []
46 | avg_score_O = []
47 | avg_b_score_F = []
48 | avg_b_score_C = []
49 | avg_b_score_L = []
50 | avg_b_score_O = []
51 | for i in range(len(id_list)):
52 | if i in all_bad_idx_set:
53 | continue
54 | avg_win_rate_F.append(int(parse_win_list[i]['F'] == 'pred'))
55 | avg_win_rate_C.append(int(parse_win_list[i]['C'] == 'pred'))
56 | avg_win_rate_L.append(int(parse_win_list[i]['L'] == 'pred'))
57 | avg_win_rate_O.append(int(parse_win_list[i]['O'] == 'pred'))
58 | avg_score_F.append(float(parse_score_pred_list[i]['F'] ))
59 | avg_score_C.append(float(parse_score_pred_list[i]['C'] ))
60 | avg_score_L.append(float(parse_score_pred_list[i]['L'] ))
61 | avg_score_O.append(float(parse_score_pred_list[i]['O'] ))
62 | avg_b_score_F.append(float(parse_score_base_list[i]['F'] ))
63 | avg_b_score_C.append(float(parse_score_base_list[i]['C'] ))
64 | avg_b_score_L.append(float(parse_score_base_list[i]['L'] ))
65 | avg_b_score_O.append(float(parse_score_base_list[i]['O'] ))
66 |
67 | result_dict['file_name'].append(result_file)
68 | result_dict['avg_win_rate_F'].append(np.mean(avg_win_rate_F))
69 | result_dict['avg_win_rate_C'].append(np.mean(avg_win_rate_C))
70 | result_dict['avg_win_rate_L'].append(np.mean(avg_win_rate_L))
71 | result_dict['avg_win_rate_O'].append(np.mean(avg_win_rate_O))
72 | result_dict['avg_score_F'].append(np.mean(avg_score_F))
73 | result_dict['avg_score_C'].append(np.mean(avg_score_C))
74 | result_dict['avg_score_L'].append(np.mean(avg_score_L))
75 | result_dict['avg_score_O'].append(np.mean(avg_score_O))
76 | result_dict['avg_b_score_F'].append(np.mean(avg_b_score_F))
77 | result_dict['avg_b_score_C'].append(np.mean(avg_b_score_C))
78 | result_dict['avg_b_score_L'].append(np.mean(avg_b_score_L))
79 | result_dict['avg_b_score_O'].append(np.mean(avg_b_score_O))
80 | result_dict['avg_diff_score_F'].append(np.mean(avg_score_F) - np.mean(avg_b_score_F))
81 | result_dict['avg_diff_score_C'].append(np.mean(avg_score_C) - np.mean(avg_b_score_C))
82 | result_dict['avg_diff_score_L'].append(np.mean(avg_score_L) - np.mean(avg_b_score_L))
83 | result_dict['avg_diff_score_O'].append(np.mean(avg_score_O) - np.mean(avg_b_score_O))
84 |
85 | df = pd.DataFrame.from_dict(result_dict)
86 |
87 | #pd.set_option('display.max_columns', None)
88 | pd.options.display.max_colwidth = 150
89 |
90 | df_sort = df.set_index('file_name').sort_values(by=['file_name'])
91 |
92 | #print(df_sort[ ['avg_win_rate_O', 'avg_diff_score_O']])
93 | print(df_sort)
94 | #print(df['file_name'])
95 |
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/comp_collect_GPT_results.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | import json
5 |
6 |
7 | input_folder = 'outputs/GPT_exp/comp_GPT3.5_responses_500/'
8 | #input_folder = 'outputs/wp/GPT_exp/test_GPT3.5_responses_500/'
9 |
10 |
11 | result_dict = {'file_name': [], 'avg_win_rate_F': [], 'avg_win_rate_C': [], 'avg_win_rate_L': [], 'avg_win_rate_O': [] }
12 |
13 | all_bad_idx = []
14 |
15 | for result_file in os.listdir(input_folder):
16 | file_path = input_folder+result_file
17 | if not os.path.isfile(file_path):
18 | continue
19 | with open(file_path) as f_in:
20 | all_inputs = json.load(f_in)
21 | bad_idx_list = []
22 | if len(all_inputs) == 5:
23 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
24 | all_bad_idx = all_bad_idx + bad_idx_list
25 |
26 | all_bad_idx_set = set(all_bad_idx)
27 |
28 | print(all_bad_idx_set)
29 |
30 | #for result_file in input_file_list:
31 | for result_file in os.listdir(input_folder):
32 | file_path = input_folder+result_file
33 | if not os.path.isfile(file_path):
34 | continue
35 | with open(file_path) as f_in:
36 | all_inputs = json.load(f_in)
37 | if len(all_inputs) == 4:
38 | pred_method_name, base_method_name, system_prompt1, all_list = all_inputs
39 | elif len(all_inputs) == 5:
40 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
41 | id_list, context_list_pred, gen_list_pred, gen_list_base, ref_list, prompt_list, first_res_list, response_list, parse_win_list = zip(*all_list)
42 | avg_win_rate_F = []
43 | avg_win_rate_C = []
44 | avg_win_rate_L = []
45 | avg_win_rate_O = []
46 | for i in range(len(id_list)):
47 | if i in all_bad_idx_set:
48 | continue
49 | avg_win_rate_F.append(int(parse_win_list[i]['F'] == 'pred'))
50 | avg_win_rate_C.append(int(parse_win_list[i]['C'] == 'pred'))
51 | avg_win_rate_L.append(int(parse_win_list[i]['L'] == 'pred'))
52 | avg_win_rate_O.append(int(parse_win_list[i]['O'] == 'pred'))
53 |
54 | result_dict['file_name'].append(result_file)
55 | result_dict['avg_win_rate_F'].append(np.mean(avg_win_rate_F))
56 | result_dict['avg_win_rate_C'].append(np.mean(avg_win_rate_C))
57 | result_dict['avg_win_rate_L'].append(np.mean(avg_win_rate_L))
58 | result_dict['avg_win_rate_O'].append(np.mean(avg_win_rate_O))
59 |
60 | df = pd.DataFrame.from_dict(result_dict)
61 |
62 | #pd.set_option('display.max_columns', None)
63 | pd.options.display.max_colwidth = 150
64 |
65 | df_sort = df.set_index('file_name').sort_values(by=['file_name'])
66 |
67 | #print(df_sort[ ['avg_win_rate_O', 'avg_diff_score_O']])
68 | print(df_sort)
69 | #print(df['file_name'])
70 |
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/prepare_story_prompt.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
4 | input_stories = "/mnt/efs/Haw-Shiuan/entailment_tree/datasets/ROCStories__spring2016.csv"
5 | num_stories = 1000
6 | shot_num = 3
7 | prompt_sent_num = 2
8 | output_prompt_file = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/MTurk/story/prompt_start2_b2_{}.jsonl".format(num_stories)
9 |
10 | delimiter = '---'
11 | num_story_line = 5
12 |
13 | df = pd.read_csv(input_stories)
14 | df_sampled_stories = df.sample(n=num_stories, replace=False)
15 | df_rest = df.drop(df_sampled_stories.index)
16 |
17 | def prepare_id(row, prompt_sent_num):
18 | id_q = ''
19 | for i in range(prompt_sent_num):
20 | id_q += row['sentence'+str(i+1)] + ' '
21 | return id_q[:-1]
22 |
23 | def str_story(row_examples, i, delimiter):
24 | story_str = 'Story {}:\n'.format(i+1)
25 | for j in range(num_story_line):
26 | story_str += row_examples['sentence'+str(j+1)] + ' '
27 | story_str += '\n' + delimiter + '\n'
28 | return story_str
29 |
30 | output_list = []
31 | for index, row in df_sampled_stories.iterrows():
32 | out_dict = {}
33 | id_q = prepare_id(row, prompt_sent_num)
34 | out_dict['id'] = id_q
35 | df_examples = df_rest.sample(n=shot_num, replace=False)
36 | prompt_str = ' Here are {} stories. Each story has five sentences.\n\n'.format(shot_num+1)
37 | for i, (index, row_examples) in enumerate(df_examples.iterrows()):
38 | prompt_str += str_story(row_examples, i, delimiter)
39 |
40 | out_dict['prompt'] = prompt_str + 'Story {}:\n'.format(shot_num+1) + id_q
41 | output_list.append(out_dict)
42 |
43 | with open(output_prompt_file, 'w') as f_out:
44 | for out_dict in output_list:
45 | f_out.write(json.dumps(out_dict) + '\n' )
46 |
47 |
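Each emitted jsonl line is a dict with an 'id' (the first prompt_sent_num sentences of the held-out story) and a 'prompt' (three example stories followed by the lead-in of the fourth). A quick way to inspect the output, reusing output_prompt_file from the script above:

import json

with open(output_prompt_file) as f_in:
    first = json.loads(next(f_in))
print(first['id'])             # the two-sentence story opening used as the key
print(first['prompt'][-300:])  # ends with "Story 4:" plus the same two sentences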
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/prepare_wiki_MTurk.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | from nltk.tokenize import sent_tokenize
4 | import random
5 |
6 | sample_numbers = 1000
7 |
8 | input_file_dict = {'Ours': "outputs/factual_gen/factual_test7k_6.9b_fe_topp_exp_1_win_40_dt_2.0_p1.0_OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3/factual_test7k_6.9b_fe_topp_p1.0_gen_seed1.jsonl",
9 | 'Top-p': 'outputs/factual_gen/factual_test7k_6.9b_topp_p0.6_temp_1.0/factual_test7k_6.9b_topp_p0.6_gen_seed1.jsonl',
10 | 'CD': 'outputs/factual_gen/factual_test7k_6.9b_CD_dt_1.0_p0.3_pythia-70m-deduped/factual_test7k_6.9b_CD_p0.3_gen_seed1.jsonl',
11 | 'Ours+CD': 'outputs/factual_gen/factual_test7k_6.9b_fe_CD_topp_exp_1_win_40_dt_1.5_p1.0_OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3/factual_test7k_6.9b_fe_CD_topp_p1.0_gen_seed1.jsonl'
12 | }
13 |
14 | output_csv = 'outputs/MTurk/wiki/gen_1000.csv'
15 |
16 | method_list = list(input_file_dict.keys())
17 |
18 | def load_gen(input_file):
19 | id_list = []
20 | context_list = []
21 | gen_list = []
22 | with open(input_file) as f_in:
23 | for line in f_in:
24 | gen_obj = json.loads(line.strip())
25 | context = gen_obj['prompt'].strip()
26 | id_res = int(gen_obj['id'])
27 |
28 | text = gen_obj['text'].strip()
29 | sents = sent_tokenize(text)
30 | gen = sents[0].replace('\n',' ')
31 |
32 | id_list.append(id_res)
33 | context_list.append(context)
34 | gen_list.append(gen)
35 | if len(id_list) >= sample_numbers:
36 | break
37 | return id_list, context_list, gen_list
38 |
39 | prev_id_list = None
40 |
41 | all_res_dict = {}
42 |
43 | for method_name in input_file_dict:
44 | file_name = input_file_dict[method_name]
45 | print(file_name)
46 | id_list, context_list, gen_list = load_gen(file_name)
47 | print(method_name, sum([len(gen) for gen in gen_list ]) / sample_numbers )
48 | if prev_id_list is None:
49 | prev_id_list = id_list
50 | all_res_dict['id'] = id_list
51 | all_res_dict['context'] = context_list
52 | else:
53 | for i in range(len(id_list)):
54 | assert id_list[i] == prev_id_list[i]
55 | prev_id_list = id_list
56 | all_res_dict['gen_'+method_name] = gen_list
57 |
58 | df = pd.DataFrame(all_res_dict)
59 | print(df)
60 |
61 | num_method = len(method_list)
62 |
63 | output_dict = {'id': [], 'context': []}
64 | for i in range(num_method):
65 | output_dict['gen_'+str(i+1)] = []
66 | output_dict['method_'+str(i+1)] = []
67 |
68 | #drop_idx = []
69 |
70 | for index, row in df.iterrows():
71 | gen_list = []
72 |
73 | for method_name in method_list:
74 | gen_list.append(row['gen_'+method_name])
75 | if any([len(gen)<10 or 'External links' in gen for gen in gen_list]) or len(gen_list) != len(set(gen_list)):
76 | #drop_idx.append(index)
77 | continue
78 | output_dict['id'].append(row['id'])
79 | output_dict['context'].append(row['context'])
80 | idx_rnd = list(range(num_method))
81 | random.shuffle(idx_rnd)
82 | for i, idx in enumerate(idx_rnd):
83 | output_dict['gen_'+str(i+1)].append(gen_list[idx])
84 | output_dict['method_'+str(i+1)].append(method_list[idx])
85 |
86 | df = pd.DataFrame(output_dict).set_index('id')
87 | #df = df.drop(drop_idx)
88 |
89 |
90 | print(df)
91 | df.to_csv(output_csv)
92 |
--------------------------------------------------------------------------------
/REAL_sampling/src/prepare_id_corpus_from_raw.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer
2 | import torch
3 | import random
4 | import sys
5 | import os
6 | import argparse
7 |
8 | import logging
9 | logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR)
10 |
11 |
12 | def parse_args():
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("--model_name", type=str, default = 'EleutherAI/pythia-70m-deduped')
15 | parser.add_argument("--input_file", type=str, required=True, help="e.g., data/raw/OWT_wiki_1e7")
16 | parser.add_argument("--output_dir", type=str, default = './data/processed/OWT_wiki_1e7_Pythia/tensors_all/')
17 | parser.add_argument("--training_ratio", type=float, default=0.96)
18 | parser.add_argument("--val_ratio", type=float, default=0.02)
19 |
20 | args = parser.parse_args()
21 | return args
22 |
23 | args = parse_args()
24 |
25 | input_file = args.input_file
26 | output_dir = args.output_dir
27 | model_name = args.model_name
28 | training_ratio = args.training_ratio
29 | val_ratio = args.val_ratio
30 |
31 | output_train_file = output_dir + "train.pt"
32 | output_val_file = output_dir + "val_org.pt"
33 | output_test_file = output_dir + "test_org.pt"
34 |
35 | if not os.path.exists(output_dir):
36 | os.makedirs(output_dir)
37 |
38 | max_line_num = 100000000000000
39 | #max_line_num = 100000
40 | #max_line_num = 10000000
41 | #max_line_num = 20000000
42 | #max_line_num = 2000000
43 |
44 | #max_sent_len = 256
45 |
46 | output_arr = []
47 |
48 | tokenizer = AutoTokenizer.from_pretrained(model_name)
49 |
50 | i=0
51 | with open(input_file, encoding='latin-1') as f_in:
52 | for line in f_in:
53 | raw_text = line
54 | i+=1
55 | #indexed_tokens = tokenizer.encode(raw_text, add_prefix_space=True)
56 | indexed_tokens = tokenizer.encode(raw_text)
57 | output_arr.append(indexed_tokens)
58 | if i % 100000 == 0:
59 | print(i)
60 | sys.stdout.flush()
61 | if i > max_line_num:
62 | break
63 |
64 | #idx_shuffled = list(range(len(output_arr)))
65 | #random.shuffle(idx_shuffled)
66 | training_size = int(len(output_arr)*training_ratio)
67 | val_size = int(len(output_arr)*val_ratio)
68 |
69 | def save_to_tensor(output_arr, output_file_name):
70 | data_size = len(output_arr)
71 | len_sum = 0
72 | for sent in output_arr:
73 | sent_len = len(sent)
74 | len_sum += sent_len
75 | #output_tensor = torch.zeros((len_sum),dtype = torch.uint16)
76 | output_tensor = torch.zeros((len_sum),dtype = torch.int32)
77 |
78 | current_start = 0
79 | for i in range(data_size):
80 | sent = output_arr[i]
81 | #output_tensor[current_start:current_start+len(sent)] = torch.tensor(sent,dtype = torch.uint16)
82 | output_tensor[current_start:current_start+len(sent)] = torch.tensor(sent,dtype = torch.int32)
83 | current_start += len(sent)
84 |
85 | torch.save(output_tensor, output_file_name)
86 |
87 | save_to_tensor(output_arr[:training_size], output_train_file)
88 | save_to_tensor(output_arr[training_size:training_size+val_size], output_val_file)
89 | save_to_tensor(output_arr[training_size+val_size:], output_test_file)
90 |
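The three output files are flat int32 token streams with no document boundaries, which is exactly what SeqDataset in src/data_utils.py expects. A quick round-trip check, assuming the default paths above:

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m-deduped')
stream = torch.load('./data/processed/OWT_wiki_1e7_Pythia/tensors_all/train.pt',
                    map_location='cpu')
print(stream.dtype, stream.size())             # torch.int32, one flat 1-D stream
print(tokenizer.decode(stream[:64].tolist()))  # first lines of the raw corpus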
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/Hades/utils.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from torch.nn import functional as F
4 | import codecs
5 | import json
6 | import spacy
7 | from sklearn.metrics import classification_report, accuracy_score, hamming_loss, \
8 | f1_score, precision_score, recall_score, average_precision_score, roc_auc_score, confusion_matrix, \
9 | brier_score_loss
10 | import numpy as np
11 |
12 |
13 | def binary_eval(predy, testy, predy_pro, verbose=True, return_f1=False, predscore=None):
14 | acc = accuracy_score(testy, predy)
15 | f1 = f1_score(testy, predy, average=None)
16 | precision = precision_score(testy, predy, average=None)
17 | recall = recall_score(testy, predy, average=None)
18 |
19 | average_precision = average_precision_score(testy, predy_pro)
20 | epsilon = 1e-8
21 | bss = roc_auc = None  # guard: keeps the return_f1 branch from raising NameError when predscore is None
22 | htn, hfp, hfn, htp = confusion_matrix(testy, predy).ravel()
23 | hsensi = htp / (htp + hfn + epsilon)
24 | hspec = htn / (hfp + htn + epsilon)
25 | gmean = np.sqrt(hsensi*hspec)
26 |
27 |
28 | info = "Acc : {}\nf1 : {}\nprecision : {}\nrecall : {}\nG-mean : {}\nAP : {}".format(acc,
29 | " ".join([str(x) for x in f1]), " ".join([str(x) for x in precision]),
30 | " ".join([str(x) for x in recall]), gmean, average_precision)
31 |
32 | if predscore is not None:
33 | bss = brier_score_loss(testy, predscore)
34 | roc_auc = roc_auc_score(testy, predscore)
35 | info += "\nbss : {}\nROC-AUC : {}".format(bss, roc_auc)
36 |
37 | if verbose:
38 | print(info)
39 |
40 | if return_f1:
41 | return acc, f1, precision, recall, gmean, bss, roc_auc, info
42 | else:
43 | #return acc, info
44 | return average_precision, info
45 |
46 |
47 | def subsets(nums):
48 | """
49 | :type nums: List[int]
50 | :rtype: List[List[int]]
51 | """
52 | ans = []
53 | def dfs(curpos, tmp):
54 | if tmp:
55 | ans.append(tmp[:])
56 | for i in range(curpos, len(nums)):
57 | tmp.append(nums[i])
58 | dfs(i+1, tmp)
59 | tmp.pop(-1)
60 | dfs(0, [])
61 | return ans
62 |
63 |
64 | def sent_ner_bounds(sen, nlp=None):
65 | if nlp is None:
66 | nlp = spacy.load('en_core_web_sm')  # the 'en' shorthand was removed in spaCy v3
67 | tokens, tags = [], []
68 | print(sen)
69 | for doc in nlp.pipe([sen]):
70 | for token in doc:
71 | tags.append(token.ent_iob_)
72 | tokens.append(str(token))
73 |
74 | rep_pos = []
75 | vis = [False for _ in range(len(tags))]
76 | for idx, tag in enumerate(tags):
77 | if tag == 'O':
78 | rep_pos.append([idx, idx])
79 | vis[idx] = True
80 | elif tag == 'B':
81 | end = idx
82 | for j in range(idx+1, len(tags)):
83 | if tags[j] == 'I':
84 | end = j
85 | else:
86 | break
87 | rep_pos.append([idx, end])
88 | elif tag == 'I':
89 | continue
90 |
91 | return ' '.join(tokens), rep_pos
92 |
93 |
94 | def remove_marked_sen(sen, start_id, end_id):
95 | tokens = sen if type(sen) == list else sen.strip().split()
96 | if tokens[start_id].startswith("===") and tokens[end_id].endswith("==="):
97 | tokens[start_id] = tokens[start_id][3:]
98 | tokens[end_id] = tokens[end_id][:-3]
99 | return tokens
100 |
101 |
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-310.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-310.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/concat_category_csv.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | folder_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/state/'
4 |
5 | #file_suffix = '_val'
6 | file_suffix = '_train'
7 |
8 | output_file_name = 'all'
9 |
10 | input_cat_list = ['animals_true_false',
11 | 'capitals_true_false',
12 | 'cities_true_false',
13 | 'companies_true_false',
14 | 'conj_neg_companies_true_false',
15 | 'conj_neg_facts_true_false',
16 | 'elements_true_false',
17 | 'facts_true_false',
18 | 'generated_true_false',
19 | 'inventions_true_false',
20 | 'neg_companies_true_false',
21 | 'neg_facts_true_false']
22 |
23 | df_all = None
24 |
25 | for cat in input_cat_list:
26 | df_cat = pd.read_csv(folder_path + cat + file_suffix+'.csv')
27 | df_cat['category'] = cat
28 | df_all = pd.concat([df_all,df_cat])
29 |
30 | df_all.to_csv(folder_path+output_file_name+file_suffix+'.csv')
31 |
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/convert_humor_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from transformers import AutoTokenizer
4 |
5 | #input_path = "/mnt/efs/Haw-Shiuan/rJokesData/data/dev.tsv"
6 | input_path = "/mnt/efs/Haw-Shiuan/rJokesData/data/test.tsv"
7 |
8 | #output_path = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/humor/all_128_train.csv"
9 | output_path = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/humor/all_128_val.csv"
10 |
11 | cut_end = True
12 |
13 | if cut_end:
14 | #max_token_num = 2048
15 | #max_token_num = 1024
16 | max_token_num = 128
17 | small_model_name = 'EleutherAI/pythia-70m-deduped'
18 | tokenizer = AutoTokenizer.from_pretrained(small_model_name, truncation_side='left')
19 |
20 | label_reg_arr = []
21 | label_arr = []
22 | text_arr = []
23 | cat_arr = []
24 |
25 | def preprocessing_text(text):
26 | text_tok = tokenizer.tokenize(text)
27 | num_cut = len(text_tok) - max_token_num
28 | if num_cut > 0:
29 | print('cut ', num_cut)
30 | doc_trunc = tokenizer.convert_tokens_to_string( text_tok[:-(num_cut+10)] ) + ' ...' # drop 10 extra tokens as a safety margin, then mark the truncation with an ellipsis
31 | return doc_trunc, len(text_tok)
32 | else:
33 | return text, len(text_tok)
34 |
35 | with open(input_path) as f_in:
36 | for line in f_in:
37 | #print(line.strip().split('\t',1))
38 | label_reg, text = line.strip().split('\t',1)
39 | label_reg = int(label_reg)
40 | text, org_len = preprocessing_text(text)
41 | if org_len < 2:
42 | print('skip too short example')
43 | continue
44 | text_arr.append(text)
45 | label_reg_arr.append(label_reg)
46 | if label_reg > 1:
47 | label = 1
48 | else:
49 | label = 0
50 | label_arr.append(label)
51 | cat_arr.append('rJoke')
52 |
53 | print('positive ratio', sum(label_arr) / float( len(label_arr) ) )
54 |
55 | df = pd.DataFrame({'statement': text_arr, 'label': label_arr, 'label_reg': label_reg_arr, 'category': cat_arr})
56 |
57 | df.to_csv(output_path)
58 |
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/split_csv_datasets.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 |
4 | input_folder = "/mnt/efs/Haw-Shiuan/factor/data/"
5 | output_folder = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/factor/"
6 |
7 | #input_folder = "/mnt/efs/Haw-Shiuan/Probes/datasets/"
8 | #output_folder = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/state/"
9 |
10 | training_ratio = 0.5
11 | val_ratio = 0.5
12 | #test_ratio = 0.1
13 |
14 | assert training_ratio + val_ratio == 1
15 | #assert training_ratio + val_ratio + test_ratio == 1
16 |
17 | for input_file in os.listdir(input_folder):
18 | #print(input_file)
19 | input_path = input_folder + input_file
20 | if not os.path.isfile(input_path):
21 | continue
22 |
23 | #input_name = os.path.basename(input_file)
24 | output_path = output_folder + input_file.replace('.csv', '_{}.csv')
25 |
26 | df = pd.read_csv(input_path)
27 |
28 | training_size = int( len(df) * training_ratio )
29 | val_size = int( len(df) * val_ratio )
30 |
31 | df_part = df.sample(n = training_size)
32 | df_part.to_csv(output_path.format('train'), index = False)
33 |
34 | df = df.drop(df_part.index)
35 | df_part = df.sample(n = val_size)
36 | df_part.to_csv(output_path.format('val'), index = False)
37 |
38 | #df_part = df.drop(df_part.index)
39 | #df_part.to_csv(output_path.format('test'), index = False)
40 |
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/split_data.sh:
--------------------------------------------------------------------------------
1 | input_folder="/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/"
2 |
3 | #input_file="${input_folder}summarization_data_2048.json"
4 | #mid_file="${input_folder}summarization_data_2048_rnd.json"
5 | #output_prefix="${input_folder}summarization_data_2048_"
6 | input_file="${input_folder}summarization_data_1024.json"
7 | mid_file="${input_folder}summarization_data_1024_rnd.json"
8 | output_prefix="${input_folder}summarization_data_1024_"
9 |
10 | #input_file="${input_folder}qa_data.json"
11 | #mid_file="${input_folder}qa_data_rnd.json"
12 | #output_prefix="${input_folder}qa_data_"
13 | #input_file="${input_folder}qa_data_knowledge.json"
14 | #mid_file="${input_folder}qa_data_knowledge_rnd.json"
15 | #output_prefix="${input_folder}qa_data_knowledge_"
16 |
17 | #input_file="${input_folder}dialogue_data.json"
18 | #mid_file="${input_folder}dialogue_data_rnd.json"
19 | #output_prefix="${input_folder}dialogue_data_"
20 | #input_file="${input_folder}dialogue_data_knowledge.json"
21 | #mid_file="${input_folder}dialogue_data_knowledge_rnd.json"
22 | #output_prefix="${input_folder}dialogue_data_knowledge_"
23 |
24 | sort -R $input_file > $mid_file
25 |
26 | num_files=10
27 | #num_files=5
28 | total_lines=$(wc -l <${mid_file})
29 | ((lines_per_file = (total_lines + num_files - 1) / num_files))
30 | split -l ${lines_per_file} ${mid_file}
31 |
32 | cat xaa xab xac xad xae xaf xag xah > ${output_prefix}train.json
33 | mv xai ${output_prefix}val.json
34 | mv xaj ${output_prefix}test.json
35 | rm xa*
36 |
37 | #cat xac xad xae > ${output_prefix}train.json
38 | #mv xaa ${output_prefix}val.json
39 | #mv xab ${output_prefix}test.json
40 | #rm xa*
41 |
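The same shuffle-and-split (8/1/1 over ten equal chunks) can be done without the xa* temporaries; a Python sketch under the same file layout (paths reused from the script above; this equivalent is not part of the original):

import random

src = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data_1024.json"
with open(src) as f_in:
    lines = f_in.read().splitlines()
random.shuffle(lines)

n = len(lines)
cut1, cut2 = 8 * n // 10, 9 * n // 10  # 8 chunks train, 1 val, 1 test
for name, part in (("train", lines[:cut1]), ("val", lines[cut1:cut2]), ("test", lines[cut2:])):
    with open(src.replace(".json", "_%s.json" % name), "w") as f_out:
        f_out.write("\n".join(part) + "\n")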
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/unify_Halu_datasets_format.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from transformers import AutoTokenizer
4 |
5 | #input_path = '/mnt/efs/Haw-Shiuan/HaluEval/data/dialogue_data.json'
6 | #input_path = '/mnt/efs/Haw-Shiuan/HaluEval/data/qa_data.json'
7 | input_path = '/mnt/efs/Haw-Shiuan/HaluEval/data/summarization_data.json'
8 |
9 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/dialogue_data.json'
10 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/qa_data.json'
11 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data.json'
12 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data_2048.json'
13 | output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data_1024.json'
14 |
15 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/dialogue_data_knowledge.json'
16 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/qa_data_knowledge.json'
17 |
18 | #include_knowledge = True
19 | include_knowledge = False
20 |
21 | cut_end = True
22 |
23 | if cut_end:
24 | #max_token_num = 2048
25 | max_token_num = 1024
26 | small_model_name = 'EleutherAI/pythia-70m-deduped'
27 | tokenizer = AutoTokenizer.from_pretrained(small_model_name, truncation_side='left')
28 |
29 | prepend_space = True
30 | if prepend_space:
31 | space_prefix = ' '
32 | space_suffix = ''
33 | else:
34 | space_prefix = ''
35 | space_suffix = ' '
36 |
37 |
38 | output_list = []
39 | with open(input_path, 'r', encoding='utf-8') as f:
40 | for line in f:
41 | sample = json.loads(line)
42 | #pos_output_dict = {'factual': 1}#{'context': '', 'text': '', 'factual': ''}
43 | #neg_output_dict = {'factual': 0}
44 | output_dict = {}
45 | if "dialogue_history" in sample:
46 | output_dict['text_pos'] = space_prefix + sample['right_response']
47 | output_dict['text_neg'] = space_prefix + sample['hallucinated_response']
48 |
49 | context_raw = space_prefix + sample["dialogue_history"] + '[Assistant]:' + space_suffix
50 | if include_knowledge:
51 | context = space_prefix + sample['knowledge'] + '.' + space_suffix + context_raw
52 | else:
53 | context = context_raw
54 | elif "question" in sample:
55 | output_dict['text_pos'] = space_prefix + sample['right_answer']
56 | output_dict['text_neg'] = space_prefix + sample['hallucinated_answer']
57 |
58 | context_raw = space_prefix + 'Question: ' + sample["question"] + '. Answer:' + space_suffix
59 | if include_knowledge:
60 | context = space_prefix + sample['knowledge'] + space_suffix + context_raw
61 | else:
62 | context = context_raw
63 | elif "document" in sample:
64 | output_dict['text_pos'] = space_prefix + sample['right_summary']
65 | output_dict['text_neg'] = space_prefix + sample['hallucinated_summary']
66 |
67 | context = space_prefix + 'Document: ' + sample["document"] + ' Summary:' + space_suffix
68 | if cut_end:
69 | context_tok = tokenizer.tokenize(context)
70 | pos_tok = tokenizer.tokenize(output_dict['text_pos'])
71 | neg_tok = tokenizer.tokenize(output_dict['text_neg'])
72 | num_cut = len(context_tok) + max(len(pos_tok), len(neg_tok)) - max_token_num
73 | if num_cut > 0:
74 | print('cut ', num_cut)
75 | doc_tok = tokenizer.tokenize( sample["document"] )
76 | doc_trunc = tokenizer.convert_tokens_to_string( doc_tok[:-(num_cut+10)] ) + '...'
77 | context = space_prefix + 'Document: ' + doc_trunc + ' Summary:' + space_suffix
78 |
79 | output_dict['context'] = context
80 | output_list.append(output_dict)
81 |
82 | with open(output_path, 'w') as f_out:
83 | for output_dict in output_list:
84 | f_out.write(json.dumps(output_dict)+'\n')
85 |
--------------------------------------------------------------------------------
/REAL_sampling/src/simple_exp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "95aae402",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from mosestokenizer import *\n",
11 | "detokenizer = MosesDetokenizer('en')\n",
12 | "\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "id": "fe042317",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "text = \"christopher atkinson ( c . 1738 \\u2013 23 april 1819 ) , known as christopher atkinson savile or saville from about 1798 , was an english merchant and politician . born in yorkshire , he moved to ===dorset=== and married the niece of a wealthy merchant , entering that business himself . he was elected at the 1780 general election as one of the two members of parliament ( mps ) for dorset west . however he was expelled from the house of commons on the second sitting , after being convicted of receiving a bribe , and sentenced to detention in the pillory . he was granted a royal warrant in 1791 , and returned to parliament for the area in 1796 , retaining the seat until he stood down at the 1802 general election . he had changed his name to atkinson atkinson some time after 1793 . he then purchased extensive estates in westerleigh , in hampshire , which gave him control of both one of okehampton ' s two parliamentary seats . he returned himself as an okehampton mp at the 1818 general election , and held the seat until his death in 1819 unmarried , aged over 72 .\""
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "id": "5ca41243",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "christopher atkinson (c. 1738 – 23 april 1819), known as christopher atkinson savile or saville from about 1798, was an english merchant and politician. born in yorkshire, he moved to ===dorset=== and married the niece of a wealthy merchant, entering that business himself. he was elected at the 1780 general election as one of the two members of parliament (mps) for dorset west. however he was expelled from the house of commons on the second sitting, after being convicted of receiving a bribe, and sentenced to detention in the pillory. he was granted a royal warrant in 1791, and returned to parliament for the area in 1796, retaining the seat until he stood down at the 1802 general election. he had changed his name to atkinson atkinson some time after 1793. he then purchased extensive estates in westerleigh, in hampshire, which gave him control of both one of okehampton 's two parliamentary seats. he returned himself as an okehampton mp at the 1818 general election, and held the seat until his death in 1819 unmarried, aged over 72.\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "print(detokenizer(text.split()))\n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 1,
46 | "id": "d71ff0fb",
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "['my', 'Ġson', 'Ġis', 'Ġje', 'rem', 'y']\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "from transformers import AutoTokenizer\n",
59 | "small_model_name = 'EleutherAI/pythia-70m-deduped'\n",
60 | "tokenizer = AutoTokenizer.from_pretrained(small_model_name)\n",
61 | "print(tokenizer.tokenize(\"my son is jeremy\"))\n"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "id": "53ef61f6",
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "[619, 3347, 310, 5139, 2013, 90]\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "print(tokenizer.encode(\" my son is jeremy\"))"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "d557aa3c",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": []
89 | }
90 | ],
91 | "metadata": {
92 | "kernelspec": {
93 | "display_name": "Python 3 (ipykernel)",
94 | "language": "python",
95 | "name": "python3"
96 | },
97 | "language_info": {
98 | "codemirror_mode": {
99 | "name": "ipython",
100 | "version": 3
101 | },
102 | "file_extension": ".py",
103 | "mimetype": "text/x-python",
104 | "name": "python",
105 | "nbconvert_exporter": "python",
106 | "pygments_lexer": "ipython3",
107 | "version": "3.7.3"
108 | }
109 | },
110 | "nbformat": 4,
111 | "nbformat_minor": 5
112 | }
113 |
--------------------------------------------------------------------------------
/THIRD_PARTY_LICENSES:
--------------------------------------------------------------------------------
1 | We also include the following external repositories, with minor modifications, in our release:
2 |
3 | 1. FactualityPrompt: Apache 2.0 License
4 | 2. HaDes: MIT License
5 |
--------------------------------------------------------------------------------