├── AP_sampling
│   ├── .README.md.swn
│   ├── README.md
│   ├── bin
│   │   ├── .finetune_ALM.sh.swp
│   │   ├── collect_top_prob.sh
│   │   ├── collect_top_prob_Qwen_4b.sh
│   │   ├── collect_top_prob_opt.sh
│   │   ├── continue_story_prompt_loop.sh
│   │   ├── continue_wiki_prompt_loop_eval.sh
│   │   └── finetune_ALM.sh
│   ├── imgs
│   │   ├── APD_first_figure.png
│   │   └── Results.png
│   ├── requirements.txt
│   └── src
│       ├── .collect_top_prob.py.swp
│       ├── .ipynb_checkpoints
│       │   └── entropy_prediction_visualization-checkpoint.ipynb
│       ├── .model_mlp_logit.py.swo
│       ├── .prepare_gpt2_id_corpus_from_raw.py.swp
│       ├── QA
│       │   ├── __pycache__
│       │   │   ├── online_utils.cpython-310.pyc
│       │   │   └── online_utils.cpython-38.pyc
│       │   ├── analyze_results.py
│       │   ├── analyze_results_online_all.py
│       │   ├── dataset_preparation
│       │   │   ├── prepare_arc_dataset.py
│       │   │   ├── prepare_commonqa_dataset.py
│       │   │   ├── prepare_lambda_dataset.py
│       │   │   ├── prepare_multirc_dataset.py
│       │   │   ├── prepare_qasc_dataset.py
│       │   │   ├── prepare_socialiqa_dataset.py
│       │   │   └── prepare_squad_dataset.py
│       │   ├── online_utils.py
│       │   ├── test_neg_dataset.py
│       │   ├── test_neg_dataset_online_all.py
│       │   ├── test_squad_dataset.py
│       │   └── test_squad_dataset_online_all.py
│       ├── __pycache__
│       │   ├── configuration_openelm.cpython-38.pyc
│       │   ├── data_utils.cpython-310.pyc
│       │   ├── data_utils.cpython-38.pyc
│       │   ├── data_utils.cpython-39.pyc
│       │   ├── model_mlp_logit.cpython-310.pyc
│       │   ├── model_mlp_logit.cpython-37.pyc
│       │   ├── model_mlp_logit.cpython-38.pyc
│       │   ├── model_mlp_logit.cpython-39.pyc
│       │   ├── modeling_openelm.cpython-38.pyc
│       │   ├── train_logits_prediction_model.cpython-310.pyc
│       │   └── train_logits_prediction_model.cpython-38.pyc
│       ├── collect_top_prob.py
│       ├── data_utils.py
│       ├── entropy_prediction_visualization.ipynb
│       ├── example_APD_REAL.py
│       ├── factual_gen
│       │   ├── __pycache__
│       │   │   ├── sampling_method.cpython-310.pyc
│       │   │   ├── sampling_method.cpython-37.pyc
│       │   │   ├── sampling_method.cpython-38.pyc
│       │   │   └── sampling_method.cpython-39.pyc
│       │   ├── gen_fp.py
│       │   ├── prepare_wiki_MTurk.py
│       │   ├── print_all_results.py
│       │   └── sampling_method.py
│       ├── model_mlp_logit.py
│       ├── story_gen
│       │   ├── collect_res_gpt_eval.py
│       │   ├── comp_collect_GPT_results.py
│       │   ├── comp_prompt_GPT.py
│       │   ├── eval_gen.py
│       │   ├── prepare_story_prompt.py
│       │   └── print_story_results.py
│       └── train_logits_prediction_model.py
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── FactualityPrompt
│   ├── Dockerfile
│   ├── LICENSE
│   ├── README.md
│   ├── bin
│   │   ├── eval.sh
│   │   ├── eval_loop.sh
│   │   └── eval_story_div_loop.sh
│   ├── fever_athene
│   │   ├── .env
│   │   ├── .gitignore
│   │   ├── Dockerfile
│   │   ├── LICENSE.txt
│   │   ├── NOTICE.txt
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── conf
│   │   │   └── config_no_attention_nodrop_glove_only.json
│   │   ├── predict.sh
│   │   ├── requirements.txt
│   │   ├── server.sh
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── athene
│   │   │   │   ├── __init__.py
│   │   │   │   ├── retrieval
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── document
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── doc_retrieval.py
│   │   │   │   │   │   ├── doc_retrieval_np_sub.py
│   │   │   │   │   │   └── docment_retrieval.py
│   │   │   │   │   ├── score
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── score.py
│   │   │   │   │   └── sentences
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── data_processing
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── data.py
│   │   │   │   │       │   ├── elmo_data.py
│   │   │   │   │       │   └── sentence_loader.py
│   │   │   │   │       ├── deep_models
│   │   │   │   │       │   ├── ESIM.py
│   │   │   │   │       │   ├── ESIMandELMO.py
│   │   │   │   │       │   └── __init__.py
│   │   │   │   │       ├── ensemble.py
│   │   │   │   │       └── sentence_retrieval.py
│   │   │   │   ├── rte
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── deep_models
│   │   │   │   │   │   ├── BaseDeepModel.py
│   │   │   │   │   │   ├── BiLSTM.py
│   │   │   │   │   │   ├── ESIM_for_ensemble.py
│   │   │   │   │   │   ├── ESIM_for_ensemble_glove_only_no_attention.py
│   │   │   │   │   │   ├── LSTM.py
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── convert_use.py
│   │   │   │   │   │   └── copy_graph.py
│   │   │   │   │   └── utils
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── customized_votingclassifier.py
│   │   │   │   │       ├── data_reader.py
│   │   │   │   │       ├── dataset.py
│   │   │   │   │       ├── estimator_definitions.py
│   │   │   │   │       ├── fill_gold_sentences.py
│   │   │   │   │       ├── score.py
│   │   │   │   │       └── text_processing.py
│   │   │   │   ├── system.py
│   │   │   │   └── utils
│   │   │   │       ├── __init__.py
│   │   │   │       └── config.py
│   │   │   ├── common
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dataset
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── block.py
│   │   │   │   │   ├── corpus.py
│   │   │   │   │   ├── data_set.py
│   │   │   │   │   ├── formatter.py
│   │   │   │   │   ├── label_schema.py
│   │   │   │   │   ├── persistence
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── engine.py
│   │   │   │   │   │   ├── page.py
│   │   │   │   │   │   └── session.py
│   │   │   │   │   ├── reader.py
│   │   │   │   │   ├── reverse_index.py
│   │   │   │   │   └── s3
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── index.py
│   │   │   │   │       └── iterator.py
│   │   │   │   ├── framework
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── task.py
│   │   │   │   ├── training
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── batcher.py
│   │   │   │   │   ├── early_stopping.py
│   │   │   │   │   ├── options.py
│   │   │   │   │   └── run.py
│   │   │   │   └── util
│   │   │   │       ├── __init__.py
│   │   │   │       ├── array.py
│   │   │   │       ├── log_helper.py
│   │   │   │       └── random.py
│   │   │   ├── rename.py
│   │   │   ├── retrieval
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bidaf.py
│   │   │   │   ├── fever_doc_db.py
│   │   │   │   ├── filter_lists.py
│   │   │   │   ├── filter_uninformative.py
│   │   │   │   ├── reader.py
│   │   │   │   ├── sent_features.py
│   │   │   │   ├── sentence.py
│   │   │   │   └── snopes_doc_db.py
│   │   │   └── scripts
│   │   │       ├── __init__.py
│   │   │       ├── athene
│   │   │       │   ├── __init__.py
│   │   │       │   ├── export_current_config_to_json.py
│   │   │       │   ├── pipeline.py
│   │   │       │   ├── replace_noise_dataset.py
│   │   │       │   ├── replace_noise_dataset_with_scores.py
│   │   │       │   ├── rte.py
│   │   │       │   ├── rte_fasttext.py
│   │   │       │   └── sort_submission.py
│   │   │       ├── build_db.py
│   │   │       ├── build_db_kilt.py
│   │   │       ├── prepare_submission.py
│   │   │       └── score.py
│   │   └── tests
│   │       └── test_load_models.py
│   ├── prompts
│   │   ├── fever_factual_1000_final.jsonl
│   │   ├── fever_factual_100_final.jsonl
│   │   ├── fever_factual_final.jsonl
│   │   ├── fever_factual_test7k_final.jsonl
│   │   ├── fever_nonfactual_1000_final.jsonl
│   │   ├── fever_nonfactual_100_final.jsonl
│   │   ├── fever_nonfactual_final.jsonl
│   │   └── fever_nonfactual_test7k_final.jsonl
│   ├── setup.sh
│   └── src
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-310.pyc
│       │   ├── __init__.cpython-38.pyc
│       │   ├── __init__.cpython-39.pyc
│       │   ├── claim_handling.cpython-38.pyc
│       │   ├── const.cpython-310.pyc
│       │   ├── const.cpython-38.pyc
│       │   ├── const.cpython-39.pyc
│       │   ├── factuality_metric.cpython-38.pyc
│       │   ├── metric.cpython-38.pyc
│       │   ├── retriever.cpython-38.pyc
│       │   └── retriever.cpython-39.pyc
│       ├── claim_handling.py
│       ├── const.py
│       ├── distinct_n.py
│       ├── distinct_n_each_gen.py
│       ├── evaluate_v3_final.py
│       ├── evaluate_v3_final_all.py
│       ├── factuality_metric.py
│       ├── preprocess_data_megatron_lm.py
│       ├── repetition.py
│       └── retriever.py
├── LICENSE
├── README.md
├── REAL_sampling
│   ├── .DS_Store
│   ├── README.md
│   ├── bin
│   │   ├── continue_wiki_prompt_loop.sh
│   │   └── train_THF_model.sh
│   ├── imgs
│   │   └── REAL_second_figure.png
│   ├── requirements.txt
│   └── src
│       ├── __pycache__
│       │   ├── data_utils.cpython-310.pyc
│       │   ├── data_utils.cpython-37.pyc
│       │   ├── data_utils.cpython-38.pyc
│       │   ├── model.cpython-310.pyc
│       │   ├── model.cpython-37.pyc
│       │   ├── model.cpython-38.pyc
│       │   ├── model.cpython-39.pyc
│       │   ├── train_entropy_prediction_model.cpython-37.pyc
│       │   └── train_entropy_prediction_model.cpython-38.pyc
│       ├── analyze_datasets
│       │   ├── __pycache__
│       │   │   ├── utils.cpython-37.pyc
│       │   │   └── utils.cpython-38.pyc
│       │   ├── feature_clf_Hades.py
│       │   ├── feature_clf_Halu.py
│       │   ├── feature_clf_all.py
│       │   ├── feature_clf_factor.py
│       │   ├── feature_clf_state.py
│       │   └── utils.py
│       ├── collect_avg_RE.py
│       ├── collect_gt_entropy.py
│       ├── collect_gt_perplexity.py
│       ├── colorize.html
│       ├── data_utils.py
│       ├── entropy_prediction_visualization.ipynb
│       ├── entropy_visualization.ipynb
│       ├── example.py
│       ├── factual_gen
│       │   ├── .gen_fp.py.swp
│       │   ├── __pycache__
│       │   │   ├── sampling_method.cpython-310.pyc
│       │   │   ├── sampling_method.cpython-38.pyc
│       │   │   └── sampling_method.cpython-39.pyc
│       │   ├── collect_GPT_results.py
│       │   ├── collect_res_gpt_eval.py
│       │   ├── comp_collect_GPT_results.py
│       │   ├── comp_prompt_GPT.py
│       │   ├── gen_fp.py
│       │   ├── prepare_story_MTurk.py
│       │   ├── prepare_story_prompt.py
│       │   ├── prepare_wiki_MTurk.py
│       │   ├── print_all_results.py
│       │   ├── prompt_GPT.py
│       │   └── sampling_method.py
│       ├── model.py
│       ├── prepare_id_corpus_from_raw.py
│       ├── process_hallucination_dataset
│       │   ├── Hades
│       │   │   ├── data_loader.py
│       │   │   └── utils.py
│       │   ├── __pycache__
│       │   │   ├── compute_ent_features.cpython-310.pyc
│       │   │   ├── compute_ent_features.cpython-37.pyc
│       │   │   ├── compute_ent_features.cpython-38.pyc
│       │   │   ├── data_classes.cpython-310.pyc
│       │   │   ├── data_classes.cpython-37.pyc
│       │   │   └── data_classes.cpython-38.pyc
│       │   ├── compute_ent_features.py
│       │   ├── concat_category_csv.py
│       │   ├── convert_humor_dataset.py
│       │   ├── data_classes.py
│       │   ├── get_entropy_HaDes_span.py
│       │   ├── get_entropy_Halu.py
│       │   ├── get_entropy_all.py
│       │   ├── get_entropy_factor.py
│       │   ├── get_entropy_state.py
│       │   ├── split_csv_datasets.py
│       │   ├── split_data.sh
│       │   └── unify_Halu_datasets_format.py
│       ├── simple_exp.ipynb
│       └── train_entropy_prediction_model.py
└── THIRD_PARTY_LICENSES

--------------------------------------------------------------------------------
/AP_sampling/.README.md.swn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/.README.md.swn

--------------------------------------------------------------------------------
/AP_sampling/README.md:
--------------------------------------------------------------------------------
1 | # Explaining and Improving Contrastive Decoding by Extrapolating the Probabilities of a Huge and Hypothetical LM
2 | 
3 | <img src="./imgs/APD_first_figure.png">
4 | 
5 | ## Introduction
6 | 
7 | To overcome the limitations of contrastive decoding (CD), we propose a new unsupervised decoding method called **A**symptotic **P**robability **D**ecoding (APD). APD explicitly extrapolates the probability curves from LMs of different sizes to infer the asymptotic probabilities of an infinitely large LM, without incurring more inference cost than CD. On FactualityPrompts, an open-ended text generation benchmark, sampling with APD significantly boosts factuality compared to CD sampling and its variants, and achieves state-of-the-art results for Pythia 6.9B and OPT 6.7B. Furthermore, on five commonsense QA datasets, APD is often significantly better than CD and achieves an effect similar to using a larger LLM. For example, the perplexity of APD on top of Pythia 6.9B is even lower than the perplexity of Pythia 12B on CommonsenseQA and LAMBADA.
8 | 
9 | 
10 | ## Computational Environment
11 | 
12 | You can reproduce our Python environment using
13 | ```
14 | conda create --name <env_name> --file requirements.txt
15 | ```
16 | Most of the code can also be run with older versions of Hugging Face Transformers (e.g., the versions in REAL_sampling/requirements.txt), except for running the Qwen LLM.
17 | 
18 | ## How to run APD
19 | 
20 | To learn how to use APD and/or REAL sampling with Hugging Face Transformers, please see the following example code:
21 | 
22 | ```
23 | ./src/example_APD_REAL.py
24 | ```
25 | 
26 | ### Run FactualityPrompts
27 | 
28 | To evaluate the generation results, first follow ../FactualityPrompt/README.md to download the data and update ../FactualityPrompt/src/const.py accordingly; then run the script below.
29 | 
30 | If you have more than 7 GPUs in your machine, you can just run the following script to generate the continuations.
31 | ```
32 | ./bin/continue_wiki_prompt_loop_eval.sh
33 | ```
34 | 
35 | ### Run Question Answering Datasets
36 | 
37 | Step 1: Run the dataset download scripts in src/QA/dataset_preparation (for ARC, we concatenate the easy and challenge JSON outputs).
38 | 
39 | Step 2: Test the APD models on the datasets. For datasets with only positive answers (e.g., LAMBADA, SQuAD, and MultiRC), use src/QA/test_squad_dataset.py. For datasets with negative answers (e.g., QASC, ARC, SocialIQA, and CommonsenseQA), use src/QA/test_neg_dataset.py. If you also want to test the APD-on-the-fly baseline, use test_squad_dataset_online_all.py and test_neg_dataset_online_all.py instead. Remember to change the paths in each file accordingly.
40 | 
41 | Step 3: Run analyze_results.py or analyze_results_online_all.py to collect the results. For datasets that have negative answers and an accuracy metric, set have_acc to 1.
42 | 
43 | 
44 | ## How to Train ALM' (in order to use APD)
45 | 
46 | Put your text file into "data/raw/".
47 | 
48 | Change INPUT_FILE, data_folder_name, and OUTPUT_MODEL_FOLDER in bin/finetune_ALM.sh and run it (assuming you have more than 7 GPUs in your machine).
49 | 
50 | Notice that our current implementation first saves many probabilities and logits of the top tokens from LLMs of various sizes into a cache, which takes a lot of disk space.
51 | We also need a lot of CPU memory to load these probabilities. For example, after processing ~270M of Wikipedia text with 5 OPT models, we store a 70GB tensor cache and a 52GB dataset cache, and our server has around 750GB of CPU memory.
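
## Minimal APD example

For quick reference, here is a condensed sketch of the plain-APD setting in ./src/example_APD_REAL.py. It assumes you already have a fine-tuned ALM' checkpoint; the folder name below is the OUTPUT_MODEL_FOLDER used in bin/finetune_ALM.sh, so substitute your own path if it differs.

```
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

sys.path.append('./src/factual_gen/')
from sampling_method import APTopPALogitsWarper, LogitsProcessorList

device = torch.device("cuda:0")
LM_gen = 'EleutherAI/pythia-6.9b-deduped'
# Fine-tuned ALM' checkpoint (assumed to be the OUTPUT_MODEL_FOLDER of bin/finetune_ALM.sh)
ALM_path = 'models/prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4'

tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(LM_gen).eval().to(device)

# APD is implemented as a logits warper: at every decoding step, it extrapolates the
# asymptotic probabilities using the ALM' and then truncates the distribution with top-p.
logits_processor = LogitsProcessorList()
logits_processor.append(APTopPALogitsWarper(top_p=0.6, student_model_name=ALM_path,
                                            device=device, use_alpha=False, temperature=1,
                                            top_k=20, use_log_softmax=True))

input_ids = tokenizer(" I like to go hiking.", return_tensors="pt").input_ids.to(device)
outputs = model.generate(input_ids=input_ids, pad_token_id=tokenizer.eos_token_id,
                         logits_processor=logits_processor, do_sample=True)
print(tokenizer.decode(outputs[0, input_ids.size(-1):], skip_special_tokens=True))
```

See ./src/example_APD_REAL.py for the APD + REAL, REAL-only, and REAL + CD variants, as well as the corresponding OPT setup.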
52 | -------------------------------------------------------------------------------- /AP_sampling/bin/.finetune_ALM.sh.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/bin/.finetune_ALM.sh.swp -------------------------------------------------------------------------------- /AP_sampling/bin/collect_top_prob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #top_k=10 3 | bptt=1024 4 | 5 | #data_folder_name="wiki2021_1e4_Pythia" 6 | #data_folder_name="ROC_gen_1000_p095_Pythia" 7 | #data_folder_name="news_gen_1000_p095_Pythia" 8 | #data_folder_name="wp_gen_1000_p095_Pythia" 9 | #data_folder_name="wiki2021_1e6_Pythia" 10 | data_folder_name="wiki2021_5e6_Pythia" 11 | #data_folder_name="ROC_spring_Pythia" 12 | #data_folder_name="wikinews_Pythia" 13 | #data_folder_name="wp_5000_Pythia" 14 | #data_folder_name="wp_20000_Pythia" 15 | #data_folder_name="wiki2021_1e5_Pythia" 16 | 17 | #top_k="10" 18 | #sampling_methods="10_20" 19 | 20 | top_k="20,5,10" 21 | sampling_methods="0_20,20_100,100_inf" 22 | #top_k="20,20,20" 23 | #sampling_methods="0_20,20_100,100_inf" 24 | 25 | top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped" 26 | output_folder="data/processed/$data_folder_name/prob_tensor_${bptt}_ext2" 27 | #output_folder="data/processed/$data_folder_name/prob_tensor_${bptt}_ext3" 28 | #input_folder_name="../true_entropy/data/processed/$data_folder_name" 29 | input_folder_name="data/processed/$data_folder_name" 30 | 31 | declare -a bsz_arr=(2 4 4 8 12 16) 32 | declare -a model_arr=("EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1.4b-deduped" "EleutherAI/pythia-1b-deduped" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-160m-deduped" "EleutherAI/pythia-70m-deduped" ) 33 | 34 | model_name="EleutherAI/pythia-6.9b-deduped" 35 | batch_size=1 36 | cuda_init=0 37 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 38 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt 39 | 40 | pids=() 41 | 42 | for i in "${!model_arr[@]}"; 43 | do 44 | model_name=${model_arr[$i]} 45 | batch_size=${bsz_arr[$i]} 46 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 47 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt & 48 | pids+=($!) 
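    # Note: the smaller Pythia models run in parallel in the background (one GPU each); this script only echoes their PIDs and does not wait on them, unlike bin/finetune_ALM.sh.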
49 | done 50 | echo "${pids[@]}" 51 | 52 | -------------------------------------------------------------------------------- /AP_sampling/bin/collect_top_prob_Qwen_4b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bptt=1024 3 | bptt=128 4 | 5 | #data_folder_name="ROC_gen_1000_p095_OPT" 6 | #data_folder_name="news_gen_1000_p095_OPT" 7 | #data_folder_name="wp_gen_1000_p095_OPT" 8 | #data_folder_name="openwebtext_2017_18_1e5_OPT" 9 | #data_folder_name="wiki2021_1e6_OPT" 10 | data_folder_name="wiki2021_1e6_Qwen" 11 | #data_folder_name="wiki2021_5e6_OPT" 12 | #data_folder_name="ROC_spring_OPT" 13 | #data_folder_name="wikinews_OPT" 14 | #data_folder_name="wp_5000_OPT" 15 | #data_folder_name="wp_20000_OPT" 16 | #data_folder_name="wiki2021_1e5_OPT" 17 | 18 | #top_k="10" 19 | #sampling_methods="10_20" 20 | top_k="20,5,10" 21 | sampling_methods="0_20,20_100,100_inf" 22 | 23 | #top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped" 24 | #top_w_idx_model_name="facebook/opt-6.7b" 25 | top_w_idx_model_name="Qwen/Qwen1.5-4b" 26 | #top_w_idx_model_name="Qwen/Qwen1.5-4b-Chat" 27 | #output_folder="data/processed/$data_folder_name/prob_opt_tensor_$bptt" 28 | output_folder="data/processed/$data_folder_name/prob_Qwen_4b_tensor_${bptt}_new" 29 | #output_folder="data/processed/$data_folder_name/prob_Qwen_4b-Chat_tensor_${bptt}_new" 30 | #input_folder_name="../true_entropy/data/processed/$data_folder_name" 31 | input_folder_name="data/processed/$data_folder_name" 32 | 33 | declare -a bsz_arr=(4 8) 34 | declare -a model_arr=("Qwen/Qwen1.5-1.8b" "Qwen/Qwen1.5-0.5b" ) 35 | #declare -a model_arr=("Qwen/Qwen1.5-1.8b-Chat" "Qwen/Qwen1.5-0.5b-Chat" ) 36 | 37 | model_name="Qwen/Qwen1.5-4b" 38 | #model_name="Qwen/Qwen1.5-4b-Chat" 39 | batch_size=2 40 | cuda_init=0 41 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 42 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt 43 | 44 | pids=() 45 | 46 | for i in "${!model_arr[@]}"; 47 | do 48 | model_name=${model_arr[$i]} 49 | batch_size=${bsz_arr[$i]} 50 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 51 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt & 52 | pids+=($!) 
53 | done 54 | echo "${pids[@]}" 55 | 56 | -------------------------------------------------------------------------------- /AP_sampling/bin/collect_top_prob_opt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bptt=1024 3 | 4 | #data_folder_name="ROC_gen_1000_p095_OPT" 5 | #data_folder_name="news_gen_1000_p095_OPT" 6 | #data_folder_name="wp_gen_1000_p095_OPT" 7 | #data_folder_name="openwebtext_2017_18_1e5_OPT" 8 | #data_folder_name="wiki2021_1e6_OPT" 9 | data_folder_name="wiki2021_5e6_OPT" 10 | #data_folder_name="ROC_spring_OPT" 11 | #data_folder_name="wikinews_OPT" 12 | #data_folder_name="wp_5000_OPT" 13 | #data_folder_name="wp_20000_OPT" 14 | #data_folder_name="wiki2021_1e5_OPT" 15 | 16 | #top_k="10" 17 | #sampling_methods="10_20" 18 | top_k="20,5,10" 19 | sampling_methods="0_20,20_100,100_inf" 20 | 21 | #top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped" 22 | top_w_idx_model_name="facebook/opt-6.7b" 23 | #output_folder="data/processed/$data_folder_name/prob_opt_tensor_$bptt" 24 | output_folder="data/processed/$data_folder_name/prob_opt_tensor_${bptt}_new" 25 | #input_folder_name="../true_entropy/data/processed/$data_folder_name" 26 | input_folder_name="data/processed/$data_folder_name" 27 | 28 | declare -a bsz_arr=(2 4 8 16) 29 | declare -a model_arr=("facebook/opt-2.7b" "facebook/opt-1.3b" "facebook/opt-350m" "facebook/opt-125m" ) 30 | 31 | model_name="facebook/opt-6.7b" 32 | batch_size=1 33 | cuda_init=0 34 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 35 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt 36 | 37 | pids=() 38 | 39 | for i in "${!model_arr[@]}"; 40 | do 41 | model_name=${model_arr[$i]} 42 | batch_size=${bsz_arr[$i]} 43 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 44 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $input_folder_name --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt & 45 | pids+=($!) 
46 | done 47 | echo "${pids[@]}" 48 | 49 | -------------------------------------------------------------------------------- /AP_sampling/bin/finetune_ALM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bptt=1024 4 | 5 | #INPUT_FILE="data/raw/wiki2021_text_only_1e4" 6 | #data_folder_name="wiki2021_1e4_Pythia" 7 | #OUTPUT_MODEL_FOLDER="models/prob_wiki_ext2_1e4_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4" 8 | INPUT_FILE="data/raw/wiki2021_text_only_1e6" 9 | data_folder_name="wiki2021_1e6_Pythia" 10 | OUTPUT_MODEL_FOLDER="models/prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4" 11 | 12 | PROC_FOLDER="data/processed/$data_folder_name" 13 | output_folder="data/processed/$data_folder_name/prob_tensor_${bptt}" 14 | TOKENIZER="EleutherAI/pythia-70m-deduped" 15 | 16 | top_k="20,5,5" 17 | sampling_methods="0_20,20_100,100_inf" 18 | 19 | top_w_idx_model_name="EleutherAI/pythia-6.9b-deduped" 20 | 21 | 22 | declare -a bsz_arr=(2 4 4 8 12 16) 23 | declare -a model_arr=("EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1.4b-deduped" "EleutherAI/pythia-1b-deduped" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-160m-deduped" "EleutherAI/pythia-70m-deduped" ) 24 | 25 | model_name="EleutherAI/pythia-6.9b-deduped" 26 | batch_size=1 27 | cuda_init=0 28 | 29 | echo "python ../REAL_sampling/src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER" 30 | python ../REAL_sampling/src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER 31 | 32 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 33 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $cuda_init --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt 34 | 35 | pids=() 36 | 37 | for i in "${!model_arr[@]}"; 38 | do 39 | model_name=${model_arr[$i]} 40 | batch_size=${bsz_arr[$i]} 41 | echo "python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt" 42 | python src/collect_top_prob.py --model_name=$model_name --top_w_idx_model_name=$top_w_idx_model_name --input_folder_name $PROC_FOLDER --output_folder $output_folder --cuda_idx $i --batch_size $batch_size --top_k $top_k --sampling_methods $sampling_methods --bptt $bptt & 43 | pids+=($!) 
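    # The "wait" below blocks on these PIDs, so ALM' training starts only after every probability-collection job has finished.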
44 | done 45 | echo "${pids[@]}" 46 | wait "${pids[@]}" 47 | 48 | 49 | echo "python src/train_logits_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $output_folder/train --validation_label_folder $output_folder/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 500 --num_train_epochs 5 --learning_rate 1e-4 --logit_reg_w 0.8 --file_suffix _${sampling_methods}_k_${top_k}_bptt_${bptt}.pt" 50 | python src/train_logits_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $output_folder/train --validation_label_folder $output_folder/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 500 --num_train_epochs 5 --learning_rate 1e-4 --logit_reg_w 0.8 --file_suffix _${sampling_methods}_k_${top_k}_bptt_${bptt}.pt 51 | -------------------------------------------------------------------------------- /AP_sampling/imgs/APD_first_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/imgs/APD_first_figure.png -------------------------------------------------------------------------------- /AP_sampling/imgs/Results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/imgs/Results.png -------------------------------------------------------------------------------- /AP_sampling/src/.collect_top_prob.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/.collect_top_prob.py.swp -------------------------------------------------------------------------------- /AP_sampling/src/.model_mlp_logit.py.swo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/.model_mlp_logit.py.swo -------------------------------------------------------------------------------- /AP_sampling/src/.prepare_gpt2_id_corpus_from_raw.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/.prepare_gpt2_id_corpus_from_raw.py.swp -------------------------------------------------------------------------------- /AP_sampling/src/QA/__pycache__/online_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/QA/__pycache__/online_utils.cpython-310.pyc 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/__pycache__/online_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/QA/__pycache__/online_utils.cpython-38.pyc

--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_arc_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 | 
6 | #dataset_config = {'name': 'allenai/ai2_arc', 'subset': 'ARC-Challenge', 'train_name': 'train' }
7 | dataset_config = {'name': 'allenai/ai2_arc', 'subset': 'ARC-Easy', 'train_name': 'train' }
8 | 
9 | #validation
10 | #output_f_name = './outputs/arc/arc_challenge_train.json'
11 | #output_f_name = './outputs/arc/arc_easy_train.json'
12 | #output_f_name = './outputs/arc/arc_neg_challenge_train.json'
13 | output_f_name = './outputs/arc/arc_neg_easy_train.json'
14 | 
15 | dataset = datasets.load_dataset(dataset_config['name'], dataset_config['subset'] )
16 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
17 | 
18 | print(len(df_train))
19 | 
20 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
21 | example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
22 | 
23 | with open(output_f_name, 'w') as f_out:
24 |     for index, row in df_train.iterrows():
25 |         #ans_dict = ast.literal_eval(row['answers'])
26 |         #ans_dict = json.loads(row['answers'])
27 |         ans = row['choices']['text'][ row['choices']['label'].index(row['answerKey']) ]
28 |         #print(ans_str)
29 |         #ans_dict = json.loads(ans_str)
30 |         q = row['question']
31 |         all_ans_raw = row['choices']['text'].copy()
32 |         all_ans_raw.remove(ans)
33 |         all_ans = [ans] + all_ans_raw
34 |         all_ans = [' ' + x for x in all_ans]
35 |         assert len(row['choices']['text']) == len(all_ans)
36 |         #q = q[0].upper() + q[1:]
37 |         prompt = example + ' Question: ' + q + '\n Answer:'
38 |         f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
39 | 
40 | 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_commonqa_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 | 
6 | #dataset_config = {'name': 'tau/commonsense_qa', 'train_name': 'validation' }
7 | dataset_config = {'name': 'tau/commonsense_qa', 'train_name': 'train' }
8 | 
9 | #validation
10 | #output_f_name = './outputs/commonqa/commonqa_val.json'
11 | #output_f_name = './outputs/commonqa/commonqa_train.json'
12 | output_f_name = './outputs/commonqa/commonqa_neg_train.json'
13 | 
14 | dataset = datasets.load_dataset(dataset_config['name'] )
15 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
16 | 
17 | including_passage = False
18 | 
19 | print(len(df_train))
20 | 
21 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
22 | example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
23 | 
24 | with open(output_f_name, 'w') as f_out:
25 |     for index, row in df_train.iterrows():
26 |         #ans_dict = ast.literal_eval(row['answers'])
27 |         #ans_dict = json.loads(row['answers'])
28 |         ans = row['choices']['text'][ row['choices']['label'].index(row['answerKey']) ]
29 |         #print(ans_str)
30 |         #ans_dict = json.loads(ans_str)
31 |         q = row['question']
32 |         all_ans_raw = row['choices']['text'].copy()
33 |         all_ans_raw.remove(ans)
34 |         all_ans = [ans] + all_ans_raw
35 |         all_ans = [' ' + x for x in all_ans]
36 |         assert len(row['choices']['text']) == len(all_ans)
37 |         #q = q[0].upper() + q[1:]
38 |         prompt = example + ' Question: ' + q + '\n Answer:'
39 |         f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
40 | 
41 | 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_lambda_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | 
5 | dataset_config = {'name': 'EleutherAI/lambada_openai', 'subset': 'en', 'train_name': 'test' }
6 | 
7 | #validation
8 | output_f_name = './outputs/lambda/openai_test.json'
9 | 
10 | dataset = datasets.load_dataset(dataset_config['name'], dataset_config['subset'] )
11 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
12 | 
13 | print(len(df_train))
14 | 
15 | with open(output_f_name, 'w') as f_out:
16 |     for index, row in df_train.iterrows():
17 |         text = row['text']
18 |         text_split = text.split(' ')
19 |         prompt = ' '.join(text_split[:-1])
20 |         ans = text_split[-1]
21 |         f_out.write(json.dumps({'id': index, 'prompt': prompt, 'answer': ' ' + ans }) + '\n')
22 | 
23 | 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_multirc_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | #input_file = 'outputs/multirc/train_456-fixedIds.json'
4 | input_file = 'outputs/multirc/dev_83-fixedIds.json'
5 | 
6 | with open(input_file) as f_in:
7 |     input_dict = json.load(f_in)
8 | 
9 | 
10 | #output_f_name = './outputs/multirc/multirc_train.json'
11 | output_f_name = './outputs/multirc/multirc_dev.json'
12 | 
13 | example = ' Passage: Sent 1: John likes to go hiking, and his wife likes to cook.\n Sent 2: His wife likes to cook.\n Here is a question: Who likes to cook?\n The answer is his wife\n\n'
14 | 
15 | with open(output_f_name, 'w') as f_out:
16 |     for idx, data in enumerate(input_dict['data']):
17 |         passage = data['paragraph']['text'].replace('<b>', ' ').replace('</b>', '').replace('<br>', '\n').replace('  ', ' ')  # strip MultiRC's <b>/<br> markup and collapse double spaces
18 |         for qa in data['paragraph']['questions']:
19 |             q = qa['question']
20 |             for a in qa['answers']:
21 |                 if a['isAnswer']:
22 |                     a_text = a['text'].replace('  ', ' ')
23 |                     prompt = example + ' Passage:' + passage + '\n Here is a question: ' + q + '\n The answer is'
24 |                     f_out.write(json.dumps({'id': idx, 'passage': passage, 'question': q, 'prompt': prompt, 'answer': ' ' + a_text }) + '\n')
25 | 
26 | 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_qasc_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 | 
6 | dataset_config = {'name': 'allenai/qasc', 'train_name': 'train' }
7 | 
8 | #validation
9 | #output_f_name = './outputs/qasc/qasc_train.json'
10 | #output_f_name = './outputs/qasc/qasc_fact_early_train.json'
11 | output_f_name = './outputs/qasc/qasc_neg_train.json'
12 | #output_f_name = './outputs/qasc/qasc_neg_fact_train.json'
13 | 
14 | dataset = datasets.load_dataset(dataset_config['name'] )
15 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
16 | 
17 | #including_passage = True
18 | including_passage = False
19 | 
20 | print(len(df_train))
21 | 
22 | #include_facts = True
23 | include_facts = False
24 | 
25 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
26 | 
27 | if include_facts:
28 |     example = ' Question: Which kind of animals can fly?\n Fact 1: a bird is an animal.\n Fact 2: birds can fly.\n Answer: bird.\n\n'
29 |     #example = ' Fact 1: a bird is an animal.\n Fact 2: birds can fly.\n Question: Which kind of animals can fly?\n Answer: bird.\n\n'
30 | else:
31 |     example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
32 | 
33 | with open(output_f_name, 'w') as f_out:
34 |     for index, row in df_train.iterrows():
35 |         #ans_dict = ast.literal_eval(row['answers'])
36 |         #ans_dict = json.loads(row['answers'])
37 |         ans = row['choices']['text'][ row['choices']['label'].index(row['answerKey']) ]
38 |         #print(ans_str)
39 |         #ans_dict = json.loads(ans_str)
40 |         q = row['question']
41 |         all_ans_raw = row['choices']['text'].copy()
42 |         all_ans_raw.remove(ans)
43 |         all_ans = [ans] + all_ans_raw
44 |         all_ans = [' ' + x for x in all_ans]
45 |         assert len(row['choices']['text']) == len(all_ans)
46 |         #q = q[0].upper() + q[1:]
47 |         if include_facts:
48 |             #prompt = example + ' Fact 1: ' + row['fact1'] + '\n Fact 2: ' + row['fact2'] + '\n Question: ' + q + '\n Answer:'
49 |             prompt = example + ' Question: ' + q + '\n Fact 1: ' + row['fact1'] + '\n Fact 2: ' + row['fact2'] + '\n Answer:'
50 |             f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
51 |         else:
52 |             prompt = example + ' Question: ' + q + '\n Answer:'
53 |             f_out.write(json.dumps({'id': index, 'question': q, 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
54 | 
55 | 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_socialiqa_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | import ast
5 | 
6 | dataset_config = {'name': 'allenai/social_i_qa', 'train_name': 'train' }
7 | #dataset_config = {'name': 'allenai/social_i_qa', 'train_name': 'validation' }
8 | 
9 | #validation
10 | #output_f_name = './outputs/socialiqa/socialiqa_val.json'
11 | #output_f_name = './outputs/socialiqa/socialiqa_train.json'
12 | output_f_name = './outputs/socialiqa/socialiqa_neg_train.json'
13 | 
14 | dataset = datasets.load_dataset(dataset_config['name'] )
15 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
16 | 
17 | print(len(df_train))
18 | 
19 | #example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
20 | #example = ' Question: Which kind of animals can fly?\n Answer: bird.\n\n'
21 | example = ' Passage: John likes to go hiking, and his wife likes to cook.\n Question: Who likes to cook?\n Answer: his wife\n\n'
22 | 
23 | label_dict = {'1': 'A', '2': 'B', '3': 'C'}
24 | 
25 | with open(output_f_name, 'w') as f_out:
26 |     for index, row in df_train.iterrows():
27 |         #ans_dict = ast.literal_eval(row['answers'])
28 |         #ans_dict = json.loads(row['answers'])
29 |         #print(row)
30 |         ans = row[ 'answer' + label_dict[ row['label'] ] ]
31 |         all_ans_raw = [row['answer' + x] for x in ['A','B','C']]
32 |         all_ans_raw.remove(ans)
33 |         all_ans = [ans] + all_ans_raw
34 |         all_ans = [' ' + x for x in all_ans]
35 |         assert 3 == len(all_ans)
36 |         #print(ans_str)
37 |         #ans_dict = json.loads(ans_str)
38 |         prompt = example + ' Passage: ' + row['context'] + '\n Question: ' + row['question'] + '\n Answer:'
39 |         f_out.write(json.dumps({'id': index, 'question': row['question'], 'context': row['context'], 'prompt': prompt, 'answer': ' ' + ans, 'all_ans': all_ans }) + '\n')
40 | 
--------------------------------------------------------------------------------
/AP_sampling/src/QA/dataset_preparation/prepare_squad_dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 | import json
4 | 
5 | #dataset_config = {'name': 'rajpurkar/squad', 'train_name': 'train' }
6 | dataset_config = {'name': 'rajpurkar/squad', 'train_name': 'validation' }
7 | 
8 | #validation
9 | #output_f_name = './outputs/squad/squad_train.json'
10 | #output_f_name = './outputs/squad/squad_val.json'
11 | output_f_name = './outputs/squad/squad_val_no_pass.json'
12 | 
13 | dataset = datasets.load_dataset(dataset_config['name'] )
14 | df_train = pd.DataFrame( dataset[ dataset_config['train_name'] ] )
15 | 
16 | including_passage = False
17 | 
18 | print(len(df_train))
19 | 
20 | if including_passage:
21 |     example = ' Passage: John likes to go hiking, and his wife likes to cook.\n Here is a question: Who likes to cook?\n The answer is his wife\n\n'
22 | else:
23 |     example = ' Here is a question: What is the birthplace of Barack Obama?\n The answer is Honolulu, Hawaii.\n\n'
24 | 
25 | with open(output_f_name, 'w') as f_out:
26 |     for index, row in df_train.iterrows():
27 |         ans_dict = row['answers']
28 |         #print(ans_str)
29 |         #ans_dict = json.loads(ans_str)
30 |         for ans in ans_dict['text']:
31 |             if including_passage:
32 |                 prompt = example + ' Passage: ' + row['context'] + '\n Here is a question: ' + row['question'] + '\n The answer is'
33 |                 f_out.write(json.dumps({'id': row['id'], 'passage': row['context'], 'question': row['question'], 'prompt': prompt, 'answer': ' ' + ans }) + '\n')
34 |             else:
35 |                 prompt = example + ' Here is a question: ' + row['question'] + '\n The answer is'
36 |                 f_out.write(json.dumps({'id': row['id'], 'question': row['question'], 'prompt': prompt, 'answer': ' ' + ans }) + '\n')
37 | 
38 | 
--------------------------------------------------------------------------------
/AP_sampling/src/__pycache__/configuration_openelm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/configuration_openelm.cpython-38.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/data_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/data_utils.cpython-38.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/data_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/data_utils.cpython-39.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/model_mlp_logit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-310.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/model_mlp_logit.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-37.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/model_mlp_logit.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-38.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/model_mlp_logit.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/model_mlp_logit.cpython-39.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/modeling_openelm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/modeling_openelm.cpython-38.pyc -------------------------------------------------------------------------------- 
/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-310.pyc -------------------------------------------------------------------------------- /AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/__pycache__/train_logits_prediction_model.cpython-38.pyc -------------------------------------------------------------------------------- /AP_sampling/src/data_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SeqDataset(torch.utils.data.Dataset): 4 | def __init__(self, w_ind_gpt2_tensor, bptt, device): 5 | self.w_ind_gpt2 = w_ind_gpt2_tensor 6 | self.seq_len = bptt 7 | self.output_device = device 8 | 9 | def __len__(self): 10 | return int( self.w_ind_gpt2.size(0) /self.seq_len ) 11 | 12 | def __getitem__(self, idx): 13 | feature = self.w_ind_gpt2[idx*self.seq_len:(idx+1)*self.seq_len].to(dtype = torch.long, device = self.output_device) 14 | return feature 15 | 16 | def create_data_loader(f_in, bsz, bptt, device, dataset_class, shuffle = True): 17 | w_ind_gpt2_tensor = torch.load(f_in, map_location='cpu') 18 | cut_tok_num = w_ind_gpt2_tensor.size(0) % bptt 19 | if cut_tok_num > 0: 20 | w_ind_gpt2_tensor = w_ind_gpt2_tensor[:-cut_tok_num] 21 | dataset = dataset_class(w_ind_gpt2_tensor, bptt, device) 22 | use_cuda = False 23 | if device.type == 'cuda': 24 | use_cuda = True 25 | return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=False) 26 | #return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=True) 27 | 28 | 29 | def load_corpus(data_path, train_bsz, eval_bsz, bptt, device, tensor_folder = "tensors_all", skip_training = False, shuffle_train=True, shuffle_val = False, load_val = True, load_testing = True): 30 | train_corpus_name = data_path + "/" + tensor_folder + "/train.pt" 31 | val_org_corpus_name = data_path +"/" + tensor_folder + "/val_org.pt" 32 | test_org_corpus_name = data_path +"/" + tensor_folder + "/test_org.pt" 33 | 34 | dataloader_train = [] 35 | dataloader_val = [] 36 | dataloader_test = [] 37 | 38 | dataset_class = SeqDataset 39 | 40 | if load_val: 41 | with open(val_org_corpus_name,'rb') as f_in: 42 | dataloader_val = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val) 43 | 44 | if load_testing: 45 | with open(test_org_corpus_name,'rb') as f_in: 46 | dataloader_test = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val) 47 | 48 | if not skip_training: 49 | with open(train_corpus_name,'rb') as f_in: 50 | dataloader_train = create_data_loader(f_in, train_bsz, bptt, device, dataset_class, shuffle = shuffle_train) 51 | 52 | return dataloader_train, dataloader_val, dataloader_test 53 | -------------------------------------------------------------------------------- /AP_sampling/src/example_APD_REAL.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 
sys.path.append('./src/factual_gen/') 3 | from sampling_method import FETopPLogitsWarper, APTopPALogitsWarper, LogitsProcessorList 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import torch 6 | 7 | sampling = 'APD + REAL' 8 | #sampling = 'APD' 9 | #sampling = 'REAL' 10 | #sampling = 'REAL + CD' 11 | 12 | LLM = 'Pythia' 13 | #LLM = 'OPT' 14 | 15 | final_entropy_model_path = 'models/OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' 16 | fine_tuned_ALM_model_path = 'models/prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4' 17 | APD_p_value = 0.6 18 | decay_temperature = 2 19 | window_size = 40 20 | device = torch.device("cuda:0") 21 | 22 | if LLM == 'Pythia': 23 | LM_gen = 'EleutherAI/pythia-6.9b-deduped' 24 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024) 25 | tokenizer_ent = tokenizer 26 | else: 27 | LM_gen = 'facebook/opt-6.7b' 28 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024) 29 | tokenizer_ent = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m-deduped', padding_side='left', model_max_length=1024) 30 | 31 | tokenizer.pad_token = tokenizer.eos_token 32 | tokenizer_ent.pad_token = tokenizer_ent.eos_token 33 | 34 | model = AutoModelForCausalLM.from_pretrained(LM_gen) 35 | model.eval() 36 | model.to(device) 37 | 38 | if sampling == 'APD + REAL': 39 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = fine_tuned_ALM_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, student_model_name=final_entropy_model_path, use_CD_alpha= False, use_AP=True, device=device) 40 | elif sampling == 'APD': 41 | logits_processor_i = APTopPALogitsWarper(top_p = APD_p_value, student_model_name=fine_tuned_ALM_model_path, device=device, use_alpha=False, temperature = 1, top_k=20, use_log_softmax=True) 42 | elif sampling == 'REAL': 43 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, device=device) 44 | else: 45 | if LLM == 'Pythia': 46 | student_model_name = 'EleutherAI/pythia-70m-deduped' 47 | else: 48 | student_model_name = 'facebook/opt-125m' 49 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, student_model_name=student_model_name, use_CD_alpha= False, device=device) 50 | 51 | 52 | logits_processor = LogitsProcessorList() 53 | logits_processor.append(logits_processor_i) 54 | 55 | input_prompt = " I like to go hiking." 
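# Note: generate() applies the logits processor at every decoding step, so the chosen warper reshapes each next-token distribution before sampling.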
56 | input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids 57 | 58 | output_sequences = model.generate(input_ids=input_ids.to(device), pad_token_id=tokenizer.eos_token_id, logits_processor=logits_processor, do_sample=True ) 59 | input_len = input_ids.size(-1) 60 | output_con = output_sequences[0,input_len:] 61 | output_text = tokenizer.decode(output_con, skip_special_tokens=True) 62 | print("Input: ", input_prompt) 63 | print("Output: ", output_text) 64 | -------------------------------------------------------------------------------- /AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc -------------------------------------------------------------------------------- /AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-37.pyc -------------------------------------------------------------------------------- /AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc -------------------------------------------------------------------------------- /AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/AP_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc -------------------------------------------------------------------------------- /AP_sampling/src/factual_gen/prepare_wiki_MTurk.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | from nltk.tokenize import sent_tokenize 4 | import random 5 | 6 | sample_numbers = 1000 7 | 8 | repo_dir = '/mnt/efs/Haw-Shiuan/true_entropy/' 9 | 10 | input_file_dict = {'AP': repo_dir + "outputs/factual_gen/factual_test7k_6.9b_AP_toppk20_log_p0.8_dt_1.0_prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4/factual_test7k_6.9b_AP_toppk20_p0.8_gen_seed1.jsonl", 11 | 'Top-p': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_topp_p1.0_temp_1.0/factual_test7k_6.9b_topp_p1.0_gen_seed1.jsonl', 12 | 'CD': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_CD_toppk20_dt_0.5_log_p0.8_pythia-70m-deduped/factual_test7k_6.9b_CD_toppk20_p0.8_gen_seed1.jsonl', 13 | 'REAL+CD': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_fe_CD_topp_exp_1_win_40_dt_4.0_p0.5_OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3/factual_test7k_6.9b_fe_CD_topp_p0.5_gen_seed1.jsonl', 14 | 'REAL+AP': repo_dir + 'outputs/factual_gen/factual_test7k_6.9b_fe_AP_topp_exp_1_win_40_dt_4.0_ait1.0_prob_wiki_ext2_1e6_70M_bsz_64_e5_only_top_last_w_10_l1_reg_w_08_logit_exp_decay_lr-4/factual_test7k_6.9b_fe_AP_topp_p1.0_gen_seed1.jsonl' 15 
| } 16 | 17 | output_csv = 'outputs/MTurk/wiki/gen_1000.csv' 18 | 19 | method_list = list(input_file_dict.keys()) 20 | 21 | def load_gen(input_file): 22 | id_list = [] 23 | context_list = [] 24 | gen_list = [] 25 | with open(input_file) as f_in: 26 | for line in f_in: 27 | gen_obj = json.loads(line.strip()) 28 | context = gen_obj['prompt'].strip() 29 | id_res = int(gen_obj['id']) 30 | 31 | text = gen_obj['text'].strip() 32 | sents = sent_tokenize(text) 33 | gen = sents[0].replace('\n',' ') 34 | 35 | id_list.append(id_res) 36 | context_list.append(context) 37 | gen_list.append(gen) 38 | if len(id_list) >= sample_numbers: 39 | break 40 | return id_list, context_list, gen_list 41 | 42 | prev_id_list = None 43 | 44 | all_res_dict = {} 45 | 46 | for method_name in input_file_dict: 47 | file_name = input_file_dict[method_name] 48 | print(file_name) 49 | id_list, context_list, gen_list = load_gen(file_name) 50 | print(method_name, sum([len(gen) for gen in gen_list ]) / sample_numbers ) 51 | if prev_id_list is None: 52 | prev_id_list = id_list 53 | all_res_dict['id'] = id_list 54 | all_res_dict['context'] = context_list 55 | else: 56 | for i in range(len(id_list)): 57 | assert id_list[i] == prev_id_list[i] 58 | prev_id_list = id_list 59 | all_res_dict['gen_'+method_name] = gen_list 60 | 61 | df = pd.DataFrame(all_res_dict) 62 | print(df) 63 | 64 | num_method = len(method_list) 65 | 66 | output_dict = {'id': [], 'context': []} 67 | for i in range(num_method): 68 | output_dict['gen_'+str(i+1)] = [] 69 | output_dict['method_'+str(i+1)] = [] 70 | 71 | #drop_idx = [] 72 | 73 | for index, row in df.iterrows(): 74 | gen_list = [] 75 | 76 | for method_name in method_list: 77 | gen_list.append(row['gen_'+method_name]) 78 | if any([len(gen)<10 or 'External links' in gen for gen in gen_list]) or len(gen_list) != len(set(gen_list)): 79 | #drop_idx.append(index) 80 | continue 81 | output_dict['id'].append(row['id']) 82 | output_dict['context'].append(row['context']) 83 | idx_rnd = list(range(num_method)) 84 | random.shuffle(idx_rnd) 85 | for i, idx in enumerate(idx_rnd): 86 | output_dict['gen_'+str(i+1)].append(gen_list[idx]) 87 | output_dict['method_'+str(i+1)].append(method_list[idx]) 88 | 89 | df = pd.DataFrame(output_dict).set_index('id') 90 | #df = df.drop(drop_idx) 91 | 92 | 93 | print(df) 94 | df.to_csv(output_csv) 95 | -------------------------------------------------------------------------------- /AP_sampling/src/story_gen/comp_collect_GPT_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import json 5 | 6 | 7 | input_folder = 'outputs/story/GPT_exp/test_GPT3.5_responses_500/' 8 | #input_folder = 'outputs/wp/GPT_exp/test_GPT3.5_responses_500/' 9 | 10 | 11 | result_dict = {'file_name': [], 'avg_win_rate_F': [], 'avg_win_rate_C': [], 'avg_win_rate_L': [], 'avg_win_rate_O': [] } 12 | 13 | all_bad_idx = [] 14 | 15 | for result_file in os.listdir(input_folder): 16 | file_path = input_folder+result_file 17 | if not os.path.isfile(file_path): 18 | continue 19 | with open(file_path) as f_in: 20 | all_inputs = json.load(f_in) 21 | bad_idx_list = [] 22 | if len(all_inputs) == 5: 23 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs 24 | all_bad_idx = all_bad_idx + bad_idx_list 25 | 26 | all_bad_idx_set = set(all_bad_idx) 27 | 28 | print(all_bad_idx_set) 29 | 30 | #for result_file in input_file_list: 31 | for result_file in os.listdir(input_folder): 32 | file_path = 
input_folder+result_file 33 | if not os.path.isfile(file_path): 34 | continue 35 | with open(file_path) as f_in: 36 | all_inputs = json.load(f_in) 37 | if len(all_inputs) == 4: 38 | pred_method_name, base_method_name, system_prompt1, all_list = all_inputs 39 | elif len(all_inputs) == 5: 40 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs 41 | id_list, context_list_pred, gen_list_pred, gen_list_base, ref_list, prompt_list, first_res_list, response_list, parse_win_list = zip(*all_list) 42 | avg_win_rate_F = [] 43 | avg_win_rate_C = [] 44 | avg_win_rate_L = [] 45 | avg_win_rate_O = [] 46 | for i in range(len(id_list)): 47 | if i in all_bad_idx_set: 48 | continue 49 | avg_win_rate_F.append(int(parse_win_list[i]['F'] == 'pred')) 50 | avg_win_rate_C.append(int(parse_win_list[i]['C'] == 'pred')) 51 | avg_win_rate_L.append(int(parse_win_list[i]['L'] == 'pred')) 52 | avg_win_rate_O.append(int(parse_win_list[i]['O'] == 'pred')) 53 | 54 | result_dict['file_name'].append(result_file.replace('start2_1000','')) 55 | result_dict['avg_win_rate_F'].append(np.mean(avg_win_rate_F)) 56 | result_dict['avg_win_rate_C'].append(np.mean(avg_win_rate_C)) 57 | result_dict['avg_win_rate_L'].append(np.mean(avg_win_rate_L)) 58 | result_dict['avg_win_rate_O'].append(np.mean(avg_win_rate_O)) 59 | 60 | df = pd.DataFrame.from_dict(result_dict) 61 | 62 | #pd.set_option('display.max_columns', None) 63 | pd.options.display.max_colwidth = 150 64 | 65 | df_sort = df.set_index('file_name').sort_values(by=['file_name']) 66 | 67 | #print(df_sort[ ['avg_win_rate_O', 'avg_diff_score_O']]) 68 | print(df_sort) 69 | #print(df['file_name']) 70 | -------------------------------------------------------------------------------- /AP_sampling/src/story_gen/prepare_story_prompt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | input_stories = "/mnt/efs/Haw-Shiuan/entailment_tree/datasets/ROCStories__winter2017.csv" 5 | num_stories = 100 6 | #num_stories = 1000 7 | shot_num = 3 8 | prompt_sent_num = 2 9 | output_prompt_file = "/mnt/efs/Haw-Shiuan/AP_sampling/outputs/story/prompt_start2_b1_{}.jsonl".format(num_stories) 10 | 11 | delimiter = '---' 12 | num_story_line = 5 13 | 14 | df = pd.read_csv(input_stories) 15 | df_sampled_stories = df.sample(n=num_stories, replace=False) 16 | df_rest = df.drop(df_sampled_stories.index) 17 | 18 | def prepare_id(row, prompt_sent_num, init_sent_idx=0): 19 | id_q = '' 20 | for i in range(prompt_sent_num): 21 | id_q += row['sentence'+str(init_sent_idx+ i+1)] + ' ' 22 | return id_q[:-1] 23 | 24 | def str_story(row_examples, i, delimiter): 25 | story_str = 'Story {}:\n'.format(i+1) 26 | for i in range(num_story_line): 27 | story_str += row_examples['sentence'+str(i+1)] + ' ' 28 | story_str += '\n' + delimiter + '\n' 29 | return story_str 30 | 31 | output_list = [] 32 | for index, row in df_sampled_stories.iterrows(): 33 | out_dict = {} 34 | id_q = prepare_id(row, prompt_sent_num) 35 | out_dict['id'] = id_q 36 | df_examples = df_rest.sample(n=shot_num, replace=False) 37 | prompt_str = ' Here are {} stories. 
Each story has five sentences.\n\n'.format(shot_num+1) 38 | for i, (index, row_examples) in enumerate(df_examples.iterrows()): 39 | prompt_str += str_story(row_examples, i, delimiter) 40 | 41 | out_dict['prompt'] = prompt_str + 'Story {}:\n'.format(shot_num+1) + id_q 42 | out_dict['ref'] = prepare_id(row, 5 - prompt_sent_num, prompt_sent_num ) 43 | output_list.append(out_dict) 44 | 45 | with open(output_prompt_file, 'w') as f_out: 46 | for out_dict in output_list: 47 | f_out.write(json.dumps(out_dict) + '\n' ) 48 | 49 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 
41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /FactualityPrompt/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:20.12-py3 2 | # TODO: need to update this starting docker to something public! 3 | 4 | RUN apt-get update && apt-get install -y python pip 5 | 6 | RUN pip install thefuzz 7 | RUN pip install spacy 8 | RUN pip install bitarray 9 | RUN pip install datasets 10 | RUN pip install sentence-transformers==2.2.0 11 | RUN pip install Cython==0.29.15 12 | RUN pip install numpy==1.19.1 13 | RUN pip install benepar==0.1.3 14 | # RUN pip install torch==1.5.0 15 | RUN pip install fairseq==0.9.0 16 | RUN pip install nltk==3.5 17 | RUN pip install spacy==2.3.2 18 | # RUN pip install tensorflow==1.15.0 19 | RUN pip install transformers==3.4.0 20 | RUN pip install tensorflow 21 | 22 | 23 | # bash script 24 | RUN python -m spacy download en_core_web_sm 25 | 26 | # python in bash 27 | RUN python - << EOF 28 | import nltk 29 | import benepar 30 | nltk.download('stopwords') 31 | nltk.download('punkt') 32 | benepar.download('benepar_en2') 33 | EOF -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=src -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .vscode/ 3 | .DS_Store 4 | __pycache__ 5 | .ipynb_checkpoints/ 6 | /data/ 7 | /features/ 8 | /logs/ 9 | /models/ 10 | /model/ 11 | /src/model/ 12 | /logdir/ 13 | *.log 14 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | 3 | ENV NVIDIA_VISIBLE_DEVICES all 4 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 5 | 6 | RUN apt-get update 7 | RUN apt-get install -y --no-install-recommends --allow-unauthenticated \ 8 | zip \ 9 | gzip \ 10 | make \ 11 | automake \ 12 | gcc \ 13 | build-essential \ 14 | g++ \ 15 | cpp \ 16 | libc6-dev \ 17 | man-db \ 18 | autoconf \ 19 | pkg-config \ 20 | unzip \ 21 | 
libffi-dev \ 22 | software-properties-common \ 23 | wget \ 24 | git 25 | 26 | ENV HOME "/root" 27 | 28 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 29 | RUN bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda && rm Miniconda3-latest-Linux-x86_64.sh 30 | ENV PATH "$PATH:$HOME/miniconda/bin" 31 | ENV LANG C.UTF-8 32 | 33 | RUN mkdir /fever 34 | WORKDIR /fever 35 | RUN mkdir -p data/fever 36 | RUN mkdir -p data/fasttext 37 | 38 | RUN wget -nv https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip && unzip wiki.en.zip -d data/fasttext && rm wiki.en.zip 39 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/claim_verification_esim.ckpt.zip 40 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/sentence_retrieval_ensemble.ckpt.zip 41 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/document_retrieval_datasets.zip 42 | RUN wget https://public.ukp.informatik.tu-darmstadt.de/fever-2018-team-athene/claim_verification_esim_glove_fasttext.ckpt.zip 43 | 44 | RUN mkdir -p model/no_attention_glove/rte_checkpoints/ 45 | RUN mkdir -p model/esim_0/rte_checkpoints/ 46 | RUN mkdir -p model/esim_0/sentence_retrieval_ensemble/ 47 | RUN unzip claim_verification_esim.ckpt.zip -d model/no_attention_glove/rte_checkpoints/ 48 | RUN unzip claim_verification_esim_glove_fasttext.ckpt.zip -d model/esim_0/rte_checkpoints/ 49 | RUN unzip sentence_retrieval_ensemble.ckpt.zip -d model/esim_0/sentence_retrieval_ensemble/ 50 | RUN unzip document_retrieval_datasets.zip -d data/fever/ 51 | 52 | RUN wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip 53 | RUN unzip glove.6B.zip -d data/glove && rm glove.6B.zip 54 | RUN gzip data/glove/*.txt 55 | 56 | RUN rm *.zip 57 | 58 | RUN conda install python=3.6 59 | RUN conda install Cython=0.28.5 60 | ADD requirements.txt /fever/ 61 | RUN pip install -r requirements.txt 62 | RUN conda uninstall -y Cython 63 | RUN pip uninstall -y pyfasttext 64 | RUN pip install --force --upgrade cysignals==1.7.2 65 | RUN pip install --force --upgrade pyfasttext 66 | RUN conda install tensorflow=1.9.0 tensorflow-gpu=1.9.0 67 | 68 | RUN python -c "import nltk; nltk.download('punkt')" 69 | 70 | ADD src src 71 | ADD server.sh . 72 | ADD predict.sh . 
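# Usage sketch (an assumption, not part of the original Dockerfile): with the
# active CMD below, the Athene pipeline is served over HTTP by waitress, so a
# typical hypothetical invocation of the built image would be
#   docker build -t fever-athene . && docker run -p 5000:5000 fever-athene
# after which predictions can be requested on port 5000.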
73 | ENV PYTHONPATH /fever/src 74 | ENV PYTHONUNBUFFERED 1 75 | CMD bash 76 | #CMD python -m athene.system --db-path /local/fever-common/data/fever/fever.db --words-cache model/sentence --sentence-model model/esim_0/sentence_retrieval_ensemble 77 | #CMD python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model1 --add_prefix=model_0/ 78 | CMD ["waitress-serve", "--host=0.0.0.0","--port=5000", "--call", "athene.system:web"] 79 | #CMD ["bash", "./server.sh"] -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2019 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | 6 | ------------------------------------------------------------------------------- 7 | Third party legal information 8 | 9 | 10 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/conf/config_no_attention_nodrop_glove_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "BASE_DIR": ".", 3 | "SUBMISSION_FILE_NAME": "predictions.jsonl", 4 | "model_name": "no_attention_glove", 5 | "glove_path": "./data/glove/glove.6B.300d.txt.gz", 6 | "fasttext_path": "./data/fasttext/wiki.en.bin", 7 | "model_folder": "model/no_attention_glove", 8 | "ckpt_folder": "model/no_attention_glove/rte_checkpoints", 9 | "db_path": "./data/fever/fever.db", 10 | "dataset_folder": "./data/fever", 11 | "training_set_file": "./data/fever/train.p7.s5.jsonl", 12 | "dev_set_file": "./data/fever/dev.p7.s5.jsonl", 13 | "test_set_file": "./data/fever/test.p7.s5.jsonl", 14 | "submission_folder": "submission_no_attention_glove", 15 | "submission_file": "submission_no_attention_glove/predictions.jsonl", 16 | "estimator_name": "esim_glove_no_attention", 17 | "pickle_name": "esim.p", 18 | "esim_hyper_param": { 19 | "num_neurons": [ 20 | 250, 21 | 180, 22 | 180, 23 | 900, 24 | 550 25 | ], 26 | "lr": 0.002, 27 | "dropout": 0, 28 | "batch_size": 64, 29 | "pos_weight": [ 30 | 0.408658712, 31 | 1.942468514, 32 | 1.540587559 33 | ], 34 | "max_checks_no_progress": 5, 35 | "trainable": false, 36 | "show_progress": 1, 37 | "n_outputs": 3, 38 | "lstm_layers": 1, 39 | "optimizer": "adam", 40 | "num_epoch": 100, 41 | "activation": "relu", 42 | "initializer": "he" 43 | }, 44 | "max_sentences": 5, 45 | "max_sentence_size": 50, 46 | "n_jobs_ensemble": 3, 47 | "seed": 55, 48 | "name": "claim_verification_esim" 49 | } 50 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m athene.system --in-file $1 --out-file $2 -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | 
cysignals==1.7.2 3 | thinc==7.0.4 4 | fever-drqa 5 | fever-scorer 6 | spacy 7 | allennlp 8 | wikipedia 9 | pyfasttext 10 | gensim 11 | tensorflow 12 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model1 --add_prefix=model_0/ 4 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model2 --add_prefix=model_1/ 5 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model3 --add_prefix=model_2/ 6 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model4 --add_prefix=model_3/ 7 | python src/rename.py --checkpoint_dir=model/esim_0/sentence_retrieval_ensemble/model5 --add_prefix=model_4/ 8 | 9 | waitress-serve --host=0.0.0.0 --port=5000 --call athene.system:web -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/retrieval/document/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/document/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/retrieval/score/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/score/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/retrieval/sentences/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/__init__.py -------------------------------------------------------------------------------- 
/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/data_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/data_processing/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/retrieval/sentences/deep_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/retrieval/sentences/deep_models/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/rte/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/deep_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/rte/deep_models/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/deep_models/convert_use.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_hub as hub 3 | 4 | from copy_graph import copy_op_to_graph, copy_variable_to_graph, get_copied_op 5 | 6 | 7 | def main(): 8 | 9 | tf.logging.set_verbosity(tf.logging.ERROR) 10 | 11 | g1 = tf.Graph() 12 | g2 = tf.Graph() 13 | 14 | scope = 'finetune' 15 | 16 | with g1.as_default(): 17 | embed = hub.Module( 18 | "https://tfhub.dev/google/universal-sentence-encoder/1", trainable=True) 19 | sess = tf.Session(graph=g1) 20 | with sess.as_default(): 21 | # copy all variables 22 | variables = [] 23 | for variable in tf.global_variables(): 24 | new_variable = copy_variable_to_graph( 25 | variable, g2, True, scope) 26 | variables.append(new_variable) 27 | # copy all ops 28 | for op in g1.get_operations(): 29 | copy_op_to_graph(op, g2, variables, scope) 30 | # copy table initialization 31 | copy_op_to_graph(tf.tables_initializer(), g2, variables, scope) 32 | 33 | tf.reset_default_graph() 34 | 35 | with g2.as_default(): 36 | sess = tf.Session(graph=g2) 37 | with sess.as_default(): 38 | 39 | sess.run(tf.global_variables_initializer()) 40 | sess.run(tf.get_default_graph().get_operation_by_name( 41 | 'finetune/init_all_tables')) 42 | 43 | in_tensor = tf.get_default_graph().get_tensor_by_name( 44 | scope + '/module/fed_input_values:0') 45 | ou_tensor = tf.get_default_graph().get_tensor_by_name( 46 | scope + '/module/Encoder_en/hidden_layers/l2_normalize:0') 47 | 48 | for v in tf.trainable_variables(): 49 | print(v.name, v) 50 | 51 | save_path = 'model/USE.ckpt' 52 | saver = tf.train.Saver() 53 | saver.save(sess, save_path) 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | 
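# Usage sketch (an assumption, not part of the original script): once this
# conversion has produced model/USE.ckpt, the fine-tunable encoder could be
# restored in a session over g2 without re-downloading the TF-Hub module, e.g.
#   saver = tf.train.Saver()
#   saver.restore(sess, 'model/USE.ckpt')
#   vecs = sess.run(ou_tensor, feed_dict={in_tensor: ['a claim to embed']})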
-------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/rte/utils/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/customized_votingclassifier.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | 6 | class CustomizedVotingClassifier: 7 | def __init__(self, prediction_path_list, voting): 8 | self.prediction_path_list = prediction_path_list 9 | self.voting = voting 10 | 11 | def fit(self, X, y): 12 | raise NotImplementedError( 13 | "This voting classifier is only used for combining existing models, not for training!") 14 | 15 | def _raw_probas(self): 16 | _probas = [] 17 | for prediction_path in self.prediction_path_list: 18 | with open(prediction_path, 'rb') as f: 19 | _probas.append(pickle.load(f)) 20 | return np.asarray(_probas) 21 | 22 | def predict_proba(self, X): 23 | # samples * classes 24 | _avg_probas = np.average(self._raw_probas(), axis=0) 25 | return np.argmax(_avg_probas, axis=1) 26 | 27 | def predict(self, X): 28 | if self.voting == 'soft': 29 | return self.predict_proba(X) 30 | else: 31 | # models * samples * classes 32 | _raw_probas = self._raw_probas() 33 | # models * samples 34 | _predictions_per_model = np.argmax(_raw_probas, axis=2) 35 | return np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=_predictions_per_model.T) 36 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/dataset.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | 3 | 4 | class DataSet: 5 | """ 6 | Define class for Fake News Challenge data 7 | """ 8 | 9 | def __init__(self, file_stances, file_bodies): 10 | 11 | # Load data 12 | self.instances = self.read(file_stances) 13 | bodies = self.read(file_bodies) 14 | self.heads = {} 15 | self.bodies = {} 16 | 17 | # Process instances 18 | for instance in self.instances: 19 | if instance['Claim'] not in self.heads: 20 | head_id = len(self.heads) 21 | self.heads[instance['Claim']] = head_id 22 | instance['Body ID'] = int(instance['Body ID']) 23 | 24 | # Process bodies 25 | for body in bodies: 26 | self.bodies[int(body['Body ID'])] = body['Snippets'] 27 | 28 | def read(self, filename): 29 | 30 | """ 31 | Read Fake News Challenge data from CSV file 32 | Args: 33 | filename: str, filename + extension 34 | Returns: 35 | rows: list, of dict per instance 36 | """ 37 | 38 | # Initialise 39 | rows = [] 40 | 41 | # Process file 42 | with open(filename, "r", encoding='utf-8') as table: 43 | r = DictReader(table) 44 | for line in r: 45 | rows.append(line) 46 | 47 | return rows 48 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/estimator_definitions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_estimator(scorer_type, save_folder=None): 5 | if scorer_type == 'esim': 6 | # submitted model, glove + fasttext, with attention 7 | from os import path 8 
| from athene.rte.deep_models.ESIM_for_ensemble import ESIM 9 | from athene.utils.config import Config 10 | pos_weight = np.asarray(Config.esim_hyper_param['pos_weight'], np.float32) 11 | clf = ESIM(random_state=Config.seed, tensorboard_logdir="logdir/", learning_rate=Config.esim_hyper_param['lr'], 12 | max_check_without_progress=Config.esim_hyper_param['max_checks_no_progress'], 13 | activation=Config.esim_hyper_param['activation'], 14 | initializer=Config.esim_hyper_param['initializer'], 15 | lstm_layers=Config.esim_hyper_param['lstm_layers'], 16 | optimizer=Config.esim_hyper_param['optimizer'], 17 | trainable=Config.esim_hyper_param['trainable'], 18 | batch_size=Config.esim_hyper_param['batch_size'], 19 | dropout_rate=Config.esim_hyper_param['dropout'], 20 | num_neurons=Config.esim_hyper_param['num_neurons'], pos_weight=pos_weight, 21 | ckpt_path=path.join(save_folder, Config.name + '.ckpt'), name=Config.name) 22 | 23 | if scorer_type == 'esim_glove_no_attention': 24 | # glove, no attention 25 | from os import path 26 | from athene.rte.deep_models.ESIM_for_ensemble_glove_only_no_attention import ESIM 27 | from athene.utils.config import Config 28 | pos_weight = np.asarray(Config.esim_hyper_param['pos_weight'], np.float32) 29 | clf = ESIM(random_state=Config.seed, tensorboard_logdir="logdir/", learning_rate=Config.esim_hyper_param['lr'], 30 | max_check_without_progress=Config.esim_hyper_param['max_checks_no_progress'], 31 | activation=Config.esim_hyper_param['activation'], 32 | initializer=Config.esim_hyper_param['initializer'], 33 | lstm_layers=Config.esim_hyper_param['lstm_layers'], 34 | optimizer=Config.esim_hyper_param['optimizer'], 35 | trainable=Config.esim_hyper_param['trainable'], 36 | batch_size=Config.esim_hyper_param['batch_size'], 37 | dropout_rate=Config.esim_hyper_param['dropout'], 38 | num_neurons=Config.esim_hyper_param['num_neurons'], pos_weight=pos_weight, 39 | ckpt_path=path.join(save_folder, Config.name + '.ckpt'), name=Config.name) 40 | return clf 41 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/fill_gold_sentences.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from tqdm import tqdm 4 | from common.dataset.reader import JSONLineReader 5 | from common.util.log_helper import LogHelper 6 | 7 | 8 | def _sent_to_str(sent): 9 | return sent[-2] + "$$$" + str(sent[-1]) 10 | 11 | 12 | def _replace_sent_with_str(sent, string): 13 | segments = string.split(r"$$$") 14 | if len(segments) != 2: 15 | raise Exception("Illegal string: " + string) 16 | sent[-2] = segments[0] 17 | sent[-1] = int(segments[1]) 18 | return sent 19 | 20 | 21 | def _build_new_sent_with_str(string, num_of_segments=2): 22 | if num_of_segments == 2: 23 | sent = ["", -1] 24 | elif num_of_segments == 4: 25 | sent = [-1, -1, "", -1] 26 | else: 27 | raise Exception("Illegal num_of_segments: " + str(num_of_segments)) 28 | return _replace_sent_with_str(sent, string) 29 | 30 | 31 | def _sents_from_evidences(evidences): 32 | sents = set() 33 | for evidence in evidences: 34 | for s in evidence: 35 | sent = _sent_to_str(s) 36 | sents.add(sent) 37 | return sents 38 | 39 | 40 | def _fill_pred_sents_with_gold(pred_sents, gold_sents, max_sent): 41 | selected_sents = pred_sents[:max_sent] 42 | neg_indices = [] 43 | for i, selected in enumerate(selected_sents): 44 | key_selected = _sent_to_str(selected) 45 | if key_selected in gold_sents: 46 | 
gold_sents.remove(key_selected) 47 | else: 48 | neg_indices.append(i) 49 | if len(gold_sents) == 0: 50 | return selected_sents 51 | if len(selected_sents) <= max_sent: 52 | for _ in range(max_sent - len(selected_sents)): 53 | selected_sents.append(_build_new_sent_with_str(gold_sents.pop())) 54 | if len(gold_sents) == 0: 55 | return selected_sents 56 | if len(neg_indices) > 0: 57 | neg_indices = reversed(neg_indices) 58 | for i in neg_indices: 59 | sent = selected_sents[i] 60 | selected_sents[i] = _replace_sent_with_str(sent, gold_sents.pop()) 61 | if len(gold_sents) == 0: 62 | return selected_sents 63 | if len(gold_sents) > 0: 64 | logger.warn(str(len(gold_sents)) + 65 | " gold sentences cannot be filled into prediction") 66 | return selected_sents 67 | 68 | 69 | if __name__ == '__main__': 70 | LogHelper.setup() 71 | logger = LogHelper.get_logger('fill_gold_sentences') 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument( 74 | '--input', help='/path/to/input/file', required=True) 75 | parser.add_argument( 76 | '--output', help='/path/to/output/file', required=True) 77 | parser.add_argument( 78 | '--max-sent', type=int, help='Maximal number of sentences per claim', default=10) 79 | args = parser.parse_args() 80 | jlr = JSONLineReader() 81 | data = jlr.read(args.input) 82 | with open(args.output, "w+") as output_file: 83 | for data in tqdm(data): 84 | if data['verifiable'] != 'NOT VERIFIABLE': 85 | pred_sents = data['predicted_sentences'] 86 | gold_evidences = data['evidence'] 87 | gold_sents = _sents_from_evidences(gold_evidences) 88 | filled_pred_sents = _fill_pred_sents_with_gold( 89 | pred_sents, gold_sents, args.max_sent) 90 | data['predicted_sentences'] = filled_pred_sents 91 | output_file.write(json.dumps(data) + "\n") 92 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/score.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import recall_score, precision_score, confusion_matrix, accuracy_score, f1_score 2 | 3 | 4 | def print_metrics(gold_labels, predictions, logger=None, average='macro'): 5 | info = logger.info if logger is not None else print 6 | info("Accuracy: " + str(accuracy_score(gold_labels, predictions)) + "\tRecall: " + str( 7 | recall_score(gold_labels, predictions, average=average)) + "\tPrecision: " + str( 8 | precision_score(gold_labels, predictions, average=average)) + "\tF1 " + average + ": " + str( 9 | f1_score(gold_labels, predictions, average=average))) 10 | info("Confusion Matrix:") 11 | info(confusion_matrix(gold_labels, predictions)) 12 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/rte/utils/text_processing.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import pickle 4 | import re 5 | 6 | import nltk 7 | import numpy as np 8 | 9 | from common.util.log_helper import LogHelper 10 | 11 | # import torch 12 | np.random.seed(55) 13 | 14 | 15 | def vocab_map(vocab): 16 | voc_dict = {} 17 | for i, v in enumerate(vocab): 18 | voc_dict[v] = i 19 | # else: 20 | # voc_dict['UNK'] = i 21 | return voc_dict 22 | 23 | 24 | def tokenize(sequence): 25 | tokens = [token.replace("``", '').replace("''", '').replace('"', '') for token in nltk.word_tokenize(sequence) if 26 | token != " "] 27 | # return tokens 28 | return tokens 29 | 30 | 31 | def clean_text(text): 32 | text = 
re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) 33 | text = re.sub(r'\<a href', ' ', text) 34 | text = re.sub(r'&amp;', '', text) 35 | text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text) 36 | text = re.sub(r'<br />', ' ', text) 37 | text = re.sub(r'\'', ' ', text) 38 | text = text.replace("...", " ") 39 | return text 40 | 41 | 42 | def load_whole_glove(glove_file): 43 | logger = LogHelper.get_logger("load_whole_glove") 44 | is_gz = os.path.splitext(glove_file)[1] == '.gz' 45 | 46 | # Getting embedding dimension 47 | def _get_dim(_file): 48 | line = _file.readline() 49 | return len(line.strip().split(' ')) - 1 50 | 51 | if is_gz: 52 | with gzip.open(glove_file, 'rt') as file0: 53 | emb_dim = _get_dim(file0) 54 | else: 55 | with open(glove_file, 'r', encoding='utf-8') as file0: 56 | emb_dim = _get_dim(file0) 57 | 58 | # First row of embedding matrix is 0 for zero padding 59 | vocab = ['[PAD]'] 60 | embed = [[0.0] * emb_dim] 61 | vocab.append('UNK') 62 | embed.append([1.0] * emb_dim) 63 | 64 | def _read_glove_file(_vocab, _embed, _file): 65 | for line in _file: 66 | items = line.replace('\r', '').replace('\n', '').split(' ') 67 | if len(items) < 10: 68 | logger.debug("exceptional line: {}".format(line)) 69 | continue 70 | word = items[0] 71 | _vocab.append(word) 72 | vec = [float(i) for i in items[1:]] 73 | _embed.append(vec) 74 | return _vocab, _embed 75 | 76 | # Reading embedding matrix 77 | if is_gz: 78 | with gzip.open(glove_file, 'rt') as file: 79 | vocab, embed = _read_glove_file(vocab, embed, file) 80 | else: 81 | with open(glove_file, 'r', encoding='utf-8') as file: 82 | vocab, embed = _read_glove_file(vocab, embed, file) 83 | logger.info('Loaded GloVe!') 84 | return vocab, embed 85 | # if __name__=="__main__": 86 | # 87 | # text ="I don\'t think this is right..." 88 | # text =clean_text(text) 89 | # print(text) 90 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/athene/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/athene/utils/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/dataset/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/block.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from common.util.log_helper import LogHelper 5 | 6 | 7 | class Block(object): 8 | def __init__(self,block,name,path): 9 | self.logger = LogHelper.get_logger(Block.__name__) 10 | self.volume = block 11 | self.path = path 12 | self.name = name 13 | self.data = None 14 | 15 | def save(self,name,data): 16 | self.data[name] = data 17 | 18 | def write(self): 19 | self.logger.info("Write block {0}".format(self.volume)) 20 | with 
open((os.path.join(self.path, self.name + "-" + str(self.volume) + ".p")), "wb+") as f: 21 | pickle.dump(self.data, f) 22 | 23 | with open((os.path.join(self.path, self.name + "-" + str(self.volume) + ".p.idx")), "wb+") as f: 24 | pickle.dump(set(self.data.keys()), f) 25 | 26 | self.data = dict() 27 | 28 | def close(self): 29 | self.write() 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | self.close() 33 | 34 | def __enter__(self): 35 | return self 36 | 37 | def __getitem__(self, item): 38 | return self.data[item] 39 | 40 | def list(self): 41 | return self.data.keys() 42 | 43 | def load(self): 44 | with open((os.path.join(self.path, self.name + "-" + str(self.volume) + ".p")), "rb") as f: 45 | self.data = pickle.load(f) 46 | self.logger.info("Loaded {0} articles".format(len(self.data))) 47 | 48 | def __iter__(self): 49 | if self.data is None: 50 | self.logger.info("Load block {0}".format(self.volume)) 51 | self.load() 52 | return iter(self.data) 53 | 54 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/corpus.py: -------------------------------------------------------------------------------- 1 | from common.dataset.block import Block 2 | from common.util.log_helper import LogHelper 3 | 4 | 5 | class Corpus: 6 | 7 | def __init__(self,name,path,blocks,preprocessing=None): 8 | self.logger = LogHelper.get_logger(Corpus.__name__) 9 | self.name = name 10 | self.path = path 11 | self.blocks = blocks 12 | self.active_block_iter = None 13 | self.active_block = None 14 | self.active_block_number = None 15 | self.preprocessing = preprocessing 16 | 17 | def __iter__(self): 18 | self.active_block_iter = None 19 | self.active_block = None 20 | self.active_block_number = None 21 | return self 22 | 23 | 24 | def next_block(self): 25 | if self.active_block_number is None: 26 | self.active_block_number = 0 27 | else: 28 | self.active_block_number += 1 29 | 30 | self.logger.info("Trying to load block {0}".format(self.active_block_number)) 31 | if self.active_block_number >= self.blocks: 32 | self.logger.info("No more blocks") 33 | raise StopIteration 34 | 35 | self.active_block = Block(self.active_block_number, self.name,self.path) 36 | self.active_block_iter = iter(self.active_block) 37 | 38 | def __next__(self): 39 | # Check if we have started with a block 40 | if self.active_block_iter is None: 41 | self.next_block() 42 | 43 | # Get the next item from this block 44 | try: 45 | n = next(self.active_block_iter) 46 | 47 | except StopIteration: 48 | # If the block is exhausted, try and get next from the next block 49 | try: 50 | self.next_block() 51 | n = next(self.active_block_iter) 52 | except StopIteration as e: 53 | # If we're out of blocks, reset and stop iteration 54 | self.active_block_iter = None 55 | self.active_block = None 56 | self.active_block_number = None 57 | raise e 58 | 59 | return n, self.preprocessing(self.active_block[n]) 60 | 61 | def __getitem__(self, item): 62 | return self.active_block[item] 63 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/data_set.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class DataSet(): 5 | def __init__(self,file,reader,formatter): 6 | self.reader = reader 7 | self.file = file 8 | self.formatter = formatter 9 | self.data = [] 10 | 11 | 12 | def read(self): 13 | if os.getenv("DEBUG","").lower() in ["1","y","yes","t"]: 
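# When the DEBUG environment variable is truthy, only the first 10 records
# of the file are formatted, which keeps smoke tests fast; otherwise the
# whole file is read and formatted (see the else branch below).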
14 | self.data.extend(filter(lambda record: record is not None, self.formatter.format(self.reader.read(self.file)[:10]))) 15 | else: 16 | self.data.extend(filter(lambda record: record is not None, self.formatter.format(self.reader.read(self.file)))) 17 | 18 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/formatter.py: -------------------------------------------------------------------------------- 1 | class Formatter(): 2 | def __init__(self,label_schema): 3 | self.label_schema = label_schema 4 | 5 | def format(self,lines): 6 | formatted = [] 7 | for line in lines: 8 | fl = self.format_line(line) 9 | if fl is not None: 10 | if isinstance(fl,list): 11 | formatted.extend(fl) 12 | else: 13 | formatted.append(fl) 14 | 15 | return formatted 16 | 17 | def format_line(self,line): 18 | pass 19 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/label_schema.py: -------------------------------------------------------------------------------- 1 | 2 | class LabelSchema: 3 | def __init__(self,labels): 4 | self.labels = {self.preprocess(val):idx for idx,val in enumerate(labels)} 5 | self.idx = {idx:self.preprocess(val) for idx,val in enumerate(labels)} 6 | 7 | def get_id(self,label): 8 | if self.preprocess(label) in self.labels: 9 | return self.labels[self.preprocess(label)] 10 | return None 11 | 12 | def preprocess(self,item): 13 | return item.lower() 14 | 15 | 16 | 17 | class SNLILabelSchema(LabelSchema): 18 | def __init__(self): 19 | super(SNLILabelSchema, self).__init__(["neither","contradiction","entailment"]) 20 | 21 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/persistence/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/dataset/persistence/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/persistence/engine.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | 3 | 4 | def get_engine(file): 5 | return create_engine('sqlite:///data/fever/{0}.db'.format(file), echo=False) 6 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/persistence/page.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.ext.declarative import declarative_base 2 | from sqlalchemy import Column, Integer, String, Text 3 | 4 | Base = declarative_base() 5 | 6 | 7 | class Page(Base): 8 | __tablename__ = "page" 9 | id = Column(Integer, primary_key=True) 10 | name = Column(String) 11 | doc = Column(Text) 12 | raw = Column(Text) 13 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/persistence/session.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.ext.declarative import declarative_base 2 | from sqlalchemy.orm import sessionmaker 3 | 4 | from common.dataset.persistence.page import Page 5 | 6 | def get_session(engine): 7 | Base = declarative_base() 8 | 
Session = sessionmaker(bind=engine) 9 | 10 | session = Session() 11 | if not engine.dialect.has_table(engine, Page.__tablename__): 12 | Page.__table__.create(bind=engine,checkfirst=True) 13 | return session -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/reader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | 4 | 5 | class Reader: 6 | def __init__(self,encoding="utf-8"): 7 | self.enc = encoding 8 | 9 | def read(self,file): 10 | with open(file,"r",encoding = self.enc) as f: 11 | return self.process(f) 12 | 13 | def process(self,f): 14 | pass 15 | 16 | 17 | class CSVReader(Reader): 18 | def process(self,fp): 19 | r = csv.DictReader(fp) 20 | return [line for line in r] 21 | 22 | class JSONReader(Reader): 23 | def process(self,fp): 24 | return json.load(fp) 25 | 26 | 27 | class JSONLineReader(Reader): 28 | def process(self,fp): 29 | data = [] 30 | for line in fp.readlines(): 31 | data.append(json.loads(line.strip())) 32 | return data 33 | 34 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/reverse_index.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import pickle 4 | from tqdm import tqdm 5 | 6 | 7 | class ReverseIndex: 8 | def __init__(self,docs, preprocessing): 9 | self.lookup = defaultdict(set) 10 | self.preprocess = preprocessing 11 | 12 | if docs is not None: 13 | for title,words in tqdm(docs): 14 | self.add(title,self.preprocess(words)) 15 | 16 | def add(self,title,words): 17 | for word in words: 18 | self.lookup[word].add(title) 19 | 20 | def docs(self,phrase): 21 | ret = [] 22 | for word in self.preprocess(phrase): 23 | ret.extend(self.lookup[word]) 24 | return ret 25 | 26 | def save(self,file): 27 | with open(file,"wb+") as f: 28 | pickle.dump(self.lookup,f) 29 | 30 | def load(self,file): 31 | with open(file,"rb") as f: 32 | self.lookup = pickle.load(f) -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/s3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/dataset/s3/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/s3/index.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from common.util.log_helper import LogHelper 4 | 5 | 6 | class Indexer: 7 | def __init__(self,file): 8 | self.pages = [] 9 | self.file = file 10 | self.logger = LogHelper.get_logger(__name__) 11 | self.logger.info("Indexing Pages") 12 | 13 | def index_page(self,key): 14 | self.logger.debug("Index Page: {0}".format(key)) 15 | self.pages.append(key) 16 | 17 | def load(self): 18 | self.pages.extend(pickle.load(self.file)) 19 | 20 | def get_block(self,block,num_blocks=50): 21 | return self.pages[block*len(self.pages)//num_blocks:(block+1)*len(self.pages)//num_blocks] 22 | 23 | def __enter__(self): 24 | return self 25 | 26 | def __exit__(self, exc_type, exc_val, exc_tb): 27 | self.logger.info("Saving index") 28 | pickle.dump(self.pages,self.file) 29 | 
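# Usage sketch (an assumption, not part of the original code): Indexer is
# written to be used as a context manager around an open binary file, so the
# accumulated page keys are pickled back to that file on exit, e.g.
#   with open('data/pages.idx', 'wb') as f:
#       with Indexer(f) as indexer:
#           indexer.index_page('Some_Page_Key')
# get_block(b, num_blocks) then slices the saved key list into equal shards.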
-------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/dataset/s3/iterator.py: -------------------------------------------------------------------------------- 1 | 2 | # Based off - 3 | # https://stackoverflow.com/questions/31918960/boto3-to-download-all-files-from-a-s3-bucket 4 | 5 | 6 | def s3_iterator(client, resource, root, dir, bucket, action): 7 | paginator = client.get_paginator('list_objects') 8 | 9 | for result in paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=dir): 10 | if result.get('CommonPrefixes') is not None: 11 | for subdir in result.get('CommonPrefixes'): 12 | s3_iterator(client, resource, root, subdir.get('Prefix'), bucket, action) 13 | if result.get('Contents') is not None: 14 | for file in result.get('Contents'): 15 | action(file.get("Key").replace(root,"")) 16 | 17 | 18 | #print(file.get('Key').replace(dist,"")) 19 | 20 | #obj = client.get_object(Bucket=bucket, Key=file.get("Key")) 21 | #writer.save(file.get("Key").replace(dist,""), obj["Body"].read().decode("utf-8")) 22 | 23 | #resource.meta.client.download_file(bucket, file.get('Key'), local + os.sep + clean(file.get('Key'))) 24 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/framework/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/framework/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/framework/task.py: -------------------------------------------------------------------------------- 1 | 2 | class Task: 3 | def score(self,data,labels): 4 | self.do_scoring(data,labels) 5 | 6 | def do_scoring(self): 7 | raise NotImplementedError("Not Implemented Here") 8 | 9 | 10 | class IRTask(Task): 11 | def do_scoring(self,data,labels): 12 | pass 13 | 14 | 15 | 16 | 17 | 18 | 19 | class InferenceTask(Task): 20 | pass 21 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/training/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/training/batcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from scipy.sparse import coo_matrix 5 | from torch.autograd import Variable 6 | 7 | from common.training.options import gpu 8 | 9 | 10 | class Batcher(): 11 | 12 | def __init__(self,data,size): 13 | self.data = data 14 | self.size = size 15 | self.pointer = 0 16 | 17 | if isinstance(self.data,coo_matrix): 18 | self.data = self.data.tocsr() 19 | 20 | def __next__(self): 21 | if self.pointer == splen(self.data): 22 | self.pointer = 0 23 | raise StopIteration 24 | 25 | next = min(splen(self.data),self.pointer+self.size) 26 | to_return = self.data[self.pointer : next] 27 | 28 | start,end = self.pointer,next 29 | 30 | self.pointer = next 31 | 32 | 33 | return to_return, splen(to_return), start, end 34 | 35 | def 
__iter__(self): 36 | return self 37 | 38 | def splen(data): 39 | try: 40 | return data.shape[0] 41 | except: 42 | return len(data) 43 | 44 | 45 | 46 | def prepare_with_labels(data,labels): 47 | data = data.todense() 48 | v = torch.FloatTensor(np.array(data)) 49 | if gpu(): 50 | return Variable(v.cuda()), Variable(torch.LongTensor(labels).cuda()) 51 | return Variable(v), Variable(torch.LongTensor(labels)) 52 | 53 | 54 | def prepare(data): 55 | data = data.todense() 56 | v = torch.FloatTensor(np.array(data)) 57 | if gpu(): 58 | return Variable(v.cuda()) 59 | return Variable(v) -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/training/early_stopping.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from common.util.log_helper import LogHelper 4 | 5 | 6 | class EarlyStopping(): 7 | def __init__(self,name,patience=8): 8 | self.patience = patience 9 | self.best_model = None 10 | self.best_score = None 11 | 12 | self.best_epoch = 0 13 | self.epoch = 0 14 | 15 | self.name = name 16 | self.logger = LogHelper.get_logger(EarlyStopping.__name__) 17 | 18 | def __call__(self, model, acc): 19 | self.epoch += 1 20 | 21 | if self.best_score is None: 22 | self.best_score = acc 23 | 24 | if acc >= self.best_score: 25 | torch.save(model.state_dict(),"models/{0}.best.save".format(self.name)) 26 | self.best_score = acc 27 | self.best_epoch = self.epoch 28 | self.logger.info("Saving best weights from round {0}".format(self.epoch)) 29 | return False 30 | 31 | elif self.epoch > self.best_epoch+self.patience: 32 | self.logger.info("Early stopping: Terminate") 33 | return True 34 | 35 | self.logger.info("Early stopping: Worse Round") 36 | return False 37 | 38 | def set_best_state(self,model): 39 | self.logger.info("Loading weights from round {0}".format(self.best_epoch)) 40 | model.load_state_dict(torch.load("models/{0}.best.save".format(self.name))) 41 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/training/options.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def is_gpu(): 7 | return os.getenv("GPU","no").lower() in ["1",1,"yes","true","t"] 8 | 9 | def gpu(): 10 | if is_gpu(): 11 | torch.cuda.set_device(int(os.getenv("CUDA_DEVICE", 0))) 12 | return True 13 | return False -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/training/run.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from sklearn.utils import shuffle 6 | 7 | from tqdm import tqdm 8 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 9 | from common.training.batcher import Batcher, prepare, prepare_with_labels 10 | from common.util.random import SimpleRandom 11 | 12 | 13 | def evaluate(model,data,labels,batch_size): 14 | predicted = predict(model,data,batch_size) 15 | return accuracy_score(labels,predicted.data.numpy().reshape(-1)) 16 | 17 | def predict(model, data, batch_size): 18 | batcher = Batcher(data, batch_size) 19 | 20 | predicted = [] 21 | for batch, size, start, end in batcher: 22 | d = prepare(batch) 23 | model.eval() 24 | logits = model(d).cpu() 25 | 26 | predicted.extend(torch.max(logits, 1)[1]) 27 | return torch.stack(predicted) 
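# predict() above switches the model to eval mode, scores one Batcher batch at
# a time on the CPU, and stacks the argmax class indices; evaluate() compares
# that stack against the gold labels with sklearn's accuracy_score.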
28 | 29 | def train(model, fs, batch_size, lr, epochs,dev=None, clip=None, early_stopping=None,name=None): 30 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4) 31 | 32 | data, labels = fs 33 | if dev is not None: 34 | dev_data,dev_labels = dev 35 | 36 | for epoch in tqdm(range(epochs)): 37 | epoch_loss = 0 38 | epoch_data = 0 39 | 40 | data, labels = shuffle(data, labels)  # sklearn's shuffle returns copies, so reassign 41 | 42 | batcher = Batcher(data, batch_size) 43 | 44 | for batch, size, start, end in batcher: 45 | d,gold = prepare_with_labels(batch,labels[start:end]) 46 | 47 | model.train() 48 | optimizer.zero_grad() 49 | logits = model(d) 50 | 51 | loss = F.cross_entropy(logits, gold) 52 | loss.backward() 53 | 54 | epoch_loss += loss.cpu() 55 | epoch_data += size 56 | 57 | if clip is not None: 58 | torch.nn.utils.clip_grad_norm(model.parameters(), clip) 59 | optimizer.step() 60 | 61 | print("Average epoch loss: {0}".format((epoch_loss/epoch_data).data.numpy())) 62 | 63 | #print("Epoch Train Accuracy {0}".format(evaluate(model, data, labels, batch_size))) 64 | if dev is not None: 65 | acc = evaluate(model,dev_data,dev_labels,batch_size) 66 | print("Epoch Dev Accuracy {0}".format(acc)) 67 | 68 | if early_stopping is not None and early_stopping(model,acc): 69 | break 70 | 71 | if dev is not None and early_stopping is not None: 72 | early_stopping.set_best_state(model) 73 | 74 | 75 | 76 | def print_evaluation(model,data,ls,log=None): 77 | features,actual = data 78 | predictions = predict(model, features, 500).data.numpy().reshape(-1).tolist() 79 | 80 | labels = [ls.idx[i] for i, _ in enumerate(ls.idx)] 81 | 82 | actual = [labels[i] for i in actual] 83 | predictions = [labels[i] for i in predictions] 84 | 85 | print(accuracy_score(actual, predictions)) 86 | print(classification_report(actual, predictions)) 87 | print(confusion_matrix(actual, predictions)) 88 | 89 | data = zip(actual,predictions) 90 | if log is not None: 91 | f = open(log, "w+") 92 | for a,p in data: 93 | f.write(json.dumps({"actual": a, "predicted": p}) + "\n") 94 | f.close() 95 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/common/util/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/util/array.py: -------------------------------------------------------------------------------- 1 | def flatten(l): 2 | return [item for sublist in l for item in sublist] -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/util/log_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class LogHelper(): 5 | handler = None 6 | 7 | @staticmethod 8 | def setup(): 9 | FORMAT = '[%(levelname)s] %(asctime)s - %(name)s - %(message)s' 10 | LogHelper.handler = logging.StreamHandler() 11 | LogHelper.handler.setLevel(logging.DEBUG) 12 | LogHelper.handler.setFormatter(logging.Formatter(FORMAT)) 13 | 14 | LogHelper.get_logger(LogHelper.__name__).info("Log Helper set up") 15 | 16 | @staticmethod 17 | def get_logger(name, level=logging.DEBUG): 18 | l = logging.getLogger(name) 19 | if len(l.handlers) == 0: 20 | l.setLevel(level) 21 | 
l.addHandler(LogHelper.handler) 22 | return l 23 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/common/util/random.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | 6 | from common.training.options import gpu 7 | 8 | 9 | class SimpleRandom(): 10 | instance = None 11 | 12 | def __init__(self,seed): 13 | self.seed = seed 14 | self.random = random.Random(seed) 15 | 16 | def next_rand(self,a,b): 17 | return self.random.randint(a,b) 18 | 19 | @staticmethod 20 | def get_instance(): 21 | if SimpleRandom.instance is None: 22 | SimpleRandom.instance = SimpleRandom(SimpleRandom.get_seed()) 23 | return SimpleRandom.instance 24 | 25 | @staticmethod 26 | def get_seed(): 27 | return int(os.getenv("RANDOM_SEED", 12459)) 28 | 29 | @staticmethod 30 | def set_seeds(): 31 | 32 | torch.manual_seed(SimpleRandom.get_seed()) 33 | if gpu(): 34 | torch.cuda.manual_seed_all(SimpleRandom.get_seed()) 35 | np.random.seed(SimpleRandom.get_seed()) 36 | random.seed(SimpleRandom.get_seed()) -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/rename.py: -------------------------------------------------------------------------------- 1 | import sys, getopt 2 | 3 | import tensorflow as tf 4 | 5 | usage_str = 'python tensorflow_rename_variables.py --checkpoint_dir=path/to/dir/ ' \ 6 | '--replace_from=substr --replace_to=substr --add_prefix=abc --dry_run' 7 | 8 | 9 | def rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run): 10 | checkpoint = tf.train.get_checkpoint_state(checkpoint_dir) 11 | with tf.Session() as sess: 12 | for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir): 13 | # Load the variable 14 | print(var_name) 15 | var = tf.contrib.framework.load_variable(checkpoint_dir, var_name) 16 | 17 | # Set the new name 18 | new_name = var_name 19 | if None not in [replace_from, replace_to]: 20 | new_name = new_name.replace(replace_from, replace_to) 21 | if add_prefix: 22 | new_name = add_prefix + new_name 23 | 24 | if dry_run: 25 | print('%s would be renamed to %s.' % (var_name, new_name)) 26 | else: 27 | print('Renaming %s to %s.' % (var_name, new_name)) 28 | # Rename the variable 29 | var = tf.Variable(var, name=new_name) 30 | 31 | if not dry_run: 32 | # Save the variables 33 | saver = tf.train.Saver() 34 | sess.run(tf.global_variables_initializer()) 35 | saver.save(sess, checkpoint.model_checkpoint_path) 36 | 37 | 38 | def main(argv): 39 | checkpoint_dir = None 40 | replace_from = None 41 | replace_to = None 42 | add_prefix = None 43 | dry_run = False 44 | 45 | try: 46 | opts, args = getopt.getopt(argv, 'h', ['help=', 'checkpoint_dir=', 'replace_from=', 47 | 'replace_to=', 'add_prefix=', 'dry_run']) 48 | except getopt.GetoptError: 49 | print(usage_str) 50 | sys.exit(2) 51 | for opt, arg in opts: 52 | if opt in ('-h', '--help'): 53 | print(usage_str) 54 | sys.exit() 55 | elif opt == '--checkpoint_dir': 56 | checkpoint_dir = arg 57 | elif opt == '--replace_from': 58 | replace_from = arg 59 | elif opt == '--replace_to': 60 | replace_to = arg 61 | elif opt == '--add_prefix': 62 | add_prefix = arg 63 | elif opt == '--dry_run': 64 | dry_run = True 65 | 66 | if not checkpoint_dir: 67 | print('Please specify a checkpoint_dir. 
Usage:') 68 | print(usage_str) 69 | sys.exit(2) 70 | 71 | rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run) 72 | 73 | 74 | if __name__ == '__main__': 75 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/retrieval/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/fever_doc_db.py: -------------------------------------------------------------------------------- 1 | # from ast import Param 2 | from drqa.retriever import DocDB, utils 3 | 4 | 5 | class FeverDocDB(DocDB): 6 | 7 | def __init__(self,path=None): 8 | super().__init__(path) 9 | 10 | def get_doc_lines(self, doc_id): 11 | """Fetch the raw text of the doc for 'doc_id'.""" 12 | cursor = self.connection.cursor() 13 | cursor.execute( 14 | "SELECT lines FROM documents WHERE id = ?", 15 | (utils.normalize(doc_id),) 16 | ) 17 | result = cursor.fetchone() 18 | cursor.close() 19 | return result if result is None else result[0] 20 | 21 | def get_non_empty_doc_ids(self): 22 | """Fetch all ids of docs stored in the db.""" 23 | cursor = self.connection.cursor() 24 | # cursor.execute("SELECT id FROM documents WHERE length(trim(text)) > 0") 25 | cursor.execute("SELECT id FROM documents WHERE length(trim(lines)) > 0") 26 | results = [r[0] for r in cursor.fetchall()] 27 | cursor.close() 28 | return results 29 | 30 | 31 | 32 | def main(): 33 | print("hi?") 34 | db = FeverDocDB(path = "/gpfs/fs1/projects/gpu_adlr/datasets/nayeonl/db/fever.db") 35 | # lines = db.get_doc_lines("Lorelai_Gilmore") 36 | lines = db.get_doc_lines("Goalkeeper_(association_football)") 37 | print(lines) 38 | 39 | 40 | 41 | # db = FeverDocDB(path = "/gpfs/fs1/projects/gpu_adlr/datasets/nayeonl/db/kilt_db.db") 42 | # lines = db.get_doc_lines('Michael Jordan') 43 | # print(lines) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | 49 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/filter_lists.py: -------------------------------------------------------------------------------- 1 | def uninformative(title): 2 | return title.lower().startswith('list_of_') \ 3 | or title.lower().startswith("lists_of_") \ 4 | or title.lower().startswith('index_of_.') \ 5 | or title.lower().startswith('outline_of_') 6 | 7 | def preprocess(doc): 8 | return None if uninformative(doc['id']) else doc -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/filter_uninformative.py: -------------------------------------------------------------------------------- 1 | def uninformative(title): 2 | return '-LRB-disambiguation-RRB-' in title.lower() \ 3 | or '-LRB-disambiguation_page-RRB-' in title.lower() \ 4 | or title.lower().startswith('list_of_') \ 5 | or title.lower().startswith("lists_of_") \ 6 | or title.lower().startswith('index_of_.') \ 7 | or title.lower().startswith('outline_of_') 8 | 9 | def preprocess(doc): 10 | return None if uninformative(doc['id']) else doc -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/sent_features.py: 
-------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | 4 | from scipy.sparse import hstack 5 | 6 | import numpy as np 7 | 8 | from rte.riedel.fever_features import TermFrequencyFeatureFunction 9 | 10 | 11 | class SentenceTermFrequencyFeatureFunction(TermFrequencyFeatureFunction): 12 | 13 | def __init__(self,doc_db,lim_unigram=5000,naming=None): 14 | super().__init__(doc_db,lim_unigram,naming=naming) 15 | self.ename = "sentences" 16 | 17 | def bodies(self,data): 18 | return set([datum[self.ename] for datum in data]) 19 | 20 | def texts(self,data): 21 | return set([datum[self.ename] for datum in data]) 22 | 23 | def body_id(self,data): 24 | return [datum[self.ename] for datum in data] 25 | 26 | 27 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/sentence.py: -------------------------------------------------------------------------------- 1 | from common.util.array import flatten 2 | from rte.riedel.data import FeverFormatter, preprocess, FEVERLabelSchema 3 | 4 | 5 | class FEVERSentenceFormatter(FeverFormatter): 6 | def format_line(self,line): 7 | annotation = line["label"] 8 | if annotation is None: 9 | annotation = line["verifiable"] 10 | 11 | pages = [] 12 | if 'evidence' in line: 13 | pages = [[(ev[2],ev[3]) for ev in annotation if ev[2] is not None] for annotation in line["evidence"]] 14 | 15 | return {"claim":self.tokenize(line["claim"]), "evidence": pages, "label":self.label_schema.get_id(annotation),"label_text":annotation} 16 | 17 | 18 | class FEVERSentenceTextFormatter(FeverFormatter): 19 | def __init__(self,idx, db,ls): 20 | super().__init__(idx, ls) 21 | self.db = db 22 | 23 | def get_doc_line(self,doc,line): 24 | lines = self.db.get_doc_lines(doc) 25 | return lines.split("\n")[line].split("\t")[1] 26 | 27 | def format_line(self,line): 28 | annotation = line["label"] 29 | if annotation is None: 30 | annotation = line["verifiable"] 31 | 32 | newpages = [] 33 | docs = [] 34 | if 'evidence' in line: 35 | pages = set(flatten([[(ev[2],ev[3]) for ev in annotation if ev[2] is not None] for annotation in line["evidence"]])) 36 | docs = set(flatten([[ev[2] for ev in annotation if ev[2] is not None] for annotation in line["evidence"]])) 37 | 38 | for page in pages: 39 | newpages.append((page[0],page[1],self.get_doc_line(page[0],page[1]))) 40 | 41 | return {"claim":self.tokenize(line["claim"]), "docs": docs, "evidence": newpages, "label":self.label_schema.get_id(annotation),"label_text":annotation} 42 | 43 | 44 | 45 | class FEVERSentenceRelatednessFormatter(FeverFormatter): 46 | 47 | def __init__(self,idx, db,ls): 48 | super().__init__(idx, ls) 49 | self.label_schema = ls 50 | self.ols = FEVERLabelSchema() 51 | self.db = db 52 | 53 | def format_line(self,line): 54 | annotation = line["label"] 55 | if annotation is None: 56 | annotation = line["verifiable"] 57 | 58 | if self.ols.get_id(annotation) != self.ols.get_id("not enough info"): 59 | annotation = "related" 60 | else: 61 | annotation = "unrelated" 62 | 63 | evidence_texts = [] 64 | claim = self.tokenize(line['claim']).strip() 65 | for page in set([ev[2] for ev in line['evidence']]): 66 | evidences = set([ev[3] for ev in line['evidence'] if ev[1] == page]) 67 | lines = self.db.get_doc_lines(page) 68 | if any(ev<0 for ev in evidences): 69 | evidence_texts = [""] 70 | else: 71 
| evidence_texts = [lines.split("\n")[line].split("\t")[1].split() for line in evidences] 72 | 73 | return {"claim":claim, "sentences": evidence_texts, "label":self.label_schema.get_id(annotation),"label_text":annotation} 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/retrieval/snopes_doc_db.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SnopesDocDB(object): 5 | def __init__(self, db_path: str): 6 | self.path = db_path 7 | with open(self.path) as f: 8 | self.db_dict = json.load(f) 9 | 10 | def path(self): 11 | return self.path 12 | 13 | def get_doc_ids(self): 14 | results = list(self.db_dict.keys()) 15 | return results 16 | 17 | def get_doc_text(self, doc_id): 18 | return self.get_doc_lines(doc_id) 19 | 20 | def get_doc_lines(self, doc_id): 21 | if doc_id not in self.db_dict: 22 | return None 23 | lines = [str(num) + '\t' + line for num, line in enumerate(self.db_dict[doc_id])] 24 | return '\n'.join(lines) 25 | 26 | def get_non_empty_doc_ids(self): 27 | return [result for result in self.get_doc_ids() if len(self.db_dict[result]) > 0] 28 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/scripts/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/athene/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/fever_athene/src/scripts/athene/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/athene/export_current_config_to_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from athene.utils.config import Config 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('output', help='/path/to/file/to/save/config') 8 | args = parser.parse_args() 9 | Config.save_config(args.output) 10 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/athene/replace_noise_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | 5 | from tqdm import tqdm 6 | 7 | from common.dataset.reader import JSONLineReader 8 | from common.util.log_helper import LogHelper 9 | from drqa.retriever.utils import normalize 10 | 11 | 12 | def predicted_evidence_to_list(pred_evidences): 13 | evidences = [] 14 | for e in pred_evidences: 15 | evidences.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1]))) 16 | return evidences 17 | 18 | 19 | def gold_evidence_to_list(gold_evidences): 20 | evidences = [] 21 | for e_set in gold_evidences: 22 | evidence_set = [] 23 | for e in e_set: 24 | evidence_set.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1]))) 25 | evidences.append(evidence_set) 26 | return evidences 27 | 28 | 29 | def 
is_gold_evidence_predicted(_line): 30 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence']) 31 | _all_gold_evidences = gold_evidence_to_list(_line['evidence']) 32 | return any(all(e in _all_predicted_evidences for e in e_set) for e_set in _all_gold_evidences) 33 | 34 | 35 | def random_fill_gold_evidence(_line): 36 | _all_gold_evidences = gold_evidence_to_list(_line['evidence']) 37 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence']) 38 | e_set = random.sample(_all_gold_evidences, 1)[0] 39 | logger.debug("fill with evidence set: " + str(e_set)) 40 | for e in e_set: 41 | e_segments = e.split('§§§') 42 | if e not in _all_predicted_evidences: 43 | _line['predicted_evidence'] = [[e_segments[0], int(e_segments[1])]] + _line['predicted_evidence'] 44 | _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence] 45 | return _line 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument('input', help='/path/to/input/file') 51 | parser.add_argument('output', help='/path/to/output/file') 52 | parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5) 53 | args = parser.parse_args() 54 | LogHelper.setup() 55 | logger = LogHelper.get_logger("replace_noise_dataset") 56 | random.seed(55) 57 | jlr = JSONLineReader() 58 | lines = jlr.read(args.input) 59 | counter = 0 60 | with open(args.output, 'w') as f: 61 | for i, line in tqdm(enumerate(lines)): 62 | if not line['label'] == 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line): 63 | counter += 1 64 | logger.info("line " + str(i + 1) + " should be filled") 65 | line = random_fill_gold_evidence(line) 66 | f.write(json.dumps(line) + '\n') 67 | logger.info(str(counter) + " samples filled with gold evidence") 68 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/athene/replace_noise_dataset_with_scores.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | 5 | from tqdm import tqdm 6 | 7 | from common.dataset.reader import JSONLineReader 8 | from common.util.log_helper import LogHelper 9 | from drqa.retriever.utils import normalize 10 | 11 | 12 | def predicted_evidence_to_list(pred_evidences): 13 | evidences = [] 14 | for e in pred_evidences: 15 | evidences.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1]))) 16 | return evidences 17 | 18 | 19 | def gold_evidence_to_list(gold_evidences): 20 | evidences = [] 21 | for e_set in gold_evidences: 22 | evidence_set = [] 23 | for e in e_set: 24 | evidence_set.append(normalize(str(e[-2])) + '§§§' + normalize(str(e[-1]))) 25 | evidences.append(evidence_set) 26 | return evidences 27 | 28 | 29 | def is_gold_evidence_predicted(_line): 30 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence']) 31 | _all_gold_evidences = gold_evidence_to_list(_line['evidence']) 32 | return any(all(e in _all_predicted_evidences for e in e_set) for e_set in _all_gold_evidences) 33 | 34 | 35 | def random_fill_gold_evidence(_line): 36 | _all_gold_evidences = gold_evidence_to_list(_line['evidence']) 37 | _all_predicted_evidences = predicted_evidence_to_list(_line['predicted_evidence']) 38 | e_set = random.sample(_all_gold_evidences, 1)[0] 39 | logger.debug("fill with evidence set: " + str(e_set)) 40 | for e in e_set: 41 | e_segments = e.split('§§§') 42 | if e not in 
_all_predicted_evidences: 43 | _line['predicted_evidence'] = [[e_segments[0], int(e_segments[1])]] + _line['predicted_evidence'] 44 | _line['scores'] = [1.0] + _line['scores'] 45 | _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence] 46 | _line['scores'] = _line['scores'][:args.max_evidence] 47 | return _line 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('input', help='/path/to/input/file') 53 | parser.add_argument('output', help='/path/to/output/file') 54 | parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5) 55 | args = parser.parse_args() 56 | LogHelper.setup() 57 | logger = LogHelper.get_logger("replace_noise_dataset") 58 | random.seed(55) 59 | jlr = JSONLineReader() 60 | lines = jlr.read(args.input) 61 | counter = 0 62 | with open(args.output, 'w') as f: 63 | for i, line in tqdm(enumerate(lines)): 64 | if not line['label'] == 'NOT ENOUGH INFO' and not is_gold_evidence_predicted(line): 65 | counter += 1 66 | logger.info("line " + str(i + 1) + " should be filled") 67 | line = random_fill_gold_evidence(line) 68 | f.write(json.dumps(line) + '\n') 69 | logger.info(str(counter) + " samples filled with gold evidence") 70 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/athene/sort_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | from common.dataset.reader import JSONLineReader 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--submission', help='/path/to/submission/file', required=True) 10 | parser.add_argument('--data', help='/path/to/data/file', required=True) 11 | parser.add_argument('--output', help='/path/to/output/file', required=True) 12 | args = parser.parse_args() 13 | jlr = JSONLineReader() 14 | submission_lines = jlr.read(args.submission) 15 | data_lines = jlr.read(args.data) 16 | assert len(submission_lines) == len(data_lines), "lengths of submission and data set are different!" 17 | submission_dict = {} 18 | for line in submission_lines: 19 | submission_dict[line['id']] = line 20 | assert len(submission_dict) == len(submission_lines), "lines in submission are not unique!" 21 | sorted_lines = [] 22 | for d in data_lines: 23 | sorted_lines.append(submission_dict[d['id']]) 24 | assert len(sorted_lines) == len(data_lines), "some claims from data set are missing in submission!" 
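# All sanity checks passed: write the submission lines back out in the order of the claims in the data file.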
25 | os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) 26 | with open(args.output, 'w') as f: 27 | for l in sorted_lines: 28 | f.write(json.dumps(l) + '\n') 29 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/prepare_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import sys 4 | from fever.scorer import fever_score 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--predicted_labels",type=str) 8 | 9 | parser.add_argument("--predicted_evidence",type=str) 10 | parser.add_argument("--out_file",type=str) 11 | 12 | args = parser.parse_args() 13 | 14 | predicted_labels =[] 15 | predicted_evidence = [] 16 | actual = [] 17 | 18 | with open(args.predicted_labels,"r") as predictions_file: 19 | for line in predictions_file: 20 | predicted_labels.append(json.loads(line)["predicted"]) 21 | 22 | 23 | with open(args.predicted_evidence,"r") as predictions_file: 24 | for line in predictions_file: 25 | predicted_evidence.append(json.loads(line)["predicted_sentences"]) 26 | 27 | predictions = [] 28 | for ev,label in zip(predicted_evidence,predicted_labels): 29 | predictions.append({"predicted_evidence":ev,"predicted_label":label}) 30 | 31 | with open(args.out_file,"w+") as f: 32 | for line in predictions: 33 | f.write(json.dumps(line)+"\n") 34 | -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/src/scripts/score.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import sys 4 | from fever.scorer import fever_score 5 | from prettytable import PrettyTable 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--predicted_labels",type=str) 9 | 10 | parser.add_argument("--predicted_evidence",type=str) 11 | parser.add_argument("--actual",type=str) 12 | 13 | args = parser.parse_args() 14 | 15 | predicted_labels =[] 16 | predicted_evidence = [] 17 | actual = [] 18 | 19 | with open(args.predicted_labels,"r") as predictions_file: 20 | for line in predictions_file: 21 | if "predicted_label" in line: 22 | predicted_labels.append(json.loads(line)["predicted_label"]) 23 | else: 24 | predicted_labels.append(json.loads(line)["predicted"]) 25 | 26 | 27 | with open(args.predicted_evidence,"r") as predictions_file: 28 | for line in predictions_file: 29 | if "predicted_evidence" in line: 30 | predicted_evidence.append(json.loads(line)["predicted_evidence"]) 31 | else: 32 | predicted_evidence.append(json.loads(line)["predicted_sentences"]) 33 | 34 | with open(args.actual, "r") as actual_file: 35 | for line in actual_file: 36 | actual.append(json.loads(line)) 37 | 38 | predictions = [] 39 | for ev,label in zip(predicted_evidence,predicted_labels): 40 | predictions.append({"predicted_evidence":ev,"predicted_label":label}) 41 | 42 | score,acc,precision,recall,f1 = fever_score(predictions,actual) 43 | 44 | tab = PrettyTable() 45 | tab.field_names = ["FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall", "Evidence F1"] 46 | tab.add_row((round(score,4),round(acc,4),round(precision,4),round(recall,4),round(f1,4))) 47 | 48 | print(tab) -------------------------------------------------------------------------------- /FactualityPrompt/fever_athene/tests/test_load_models.py: -------------------------------------------------------------------------------- 1 | import pytest 
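# Smoke tests: verify that the pretrained sentence-retrieval and claim-verification
# ESIM checkpoints can be restored, using dummy zero embeddings to keep the tests lightweight.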
2 | 3 | import numpy as np 4 | from athene.retrieval.sentences.deep_models.ESIM import ESIM as ESIMretrieval 5 | from athene.rte.deep_models.ESIM_for_ensemble_glove_only_no_attention import ESIM as ESIMrte 6 | from athene.rte.utils.text_processing import load_whole_glove 7 | from common.util.log_helper import LogHelper 8 | 9 | LogHelper.setup() 10 | 11 | 12 | def test_load_retrieval_model(): 13 | dummy_embeddings = np.zeros((1, 300), dtype=np.float32) 14 | estimator = ESIMretrieval( 15 | h_max_length=20, s_max_length=60, learning_rate=0.001, batch_size=256, num_epoch=20, 16 | model_store_dir=None, 17 | embedding=dummy_embeddings, 18 | word_dict=None, dropout_rate=0.2, random_state=88, num_units=128, 19 | share_rnn=True 20 | ) 21 | # estimator.restore_model("../models/retrieval/best_model.ckpt") 22 | estimator.restore_model("../models/retrieval/sentence_selection_esim.ckpt") 23 | 24 | 25 | def test_load_rte_model(): 26 | dummy_embeddings = np.zeros((1, 300), dtype=np.float32) 27 | estimator = ESIMrte(name='esim_verify', 28 | activation='relu', 29 | batch_size=64, 30 | lstm_layers=1, 31 | n_outputs=3, 32 | num_neurons=[250, 180, 900, 550, 180], 33 | show_progress=1, embedding=dummy_embeddings 34 | ) 35 | # estimator.restore_model("../models/rte/esim1.ckpt") 36 | estimator.restore_model("../models/rte/claim_verification_esim.ckpt") 37 | 38 | 39 | @pytest.mark.skip(reason="Loading GloVe takes around 10 mins.") 40 | def test_load_rte_model_2(): 41 | vocab, embeddings = load_whole_glove("../../resources/embeddings/glove/glove.6B.300d.txt") 42 | estimator = ESIMrte(name='esim_verify', 43 | activation='relu', 44 | batch_size=64, 45 | lstm_layers=1, 46 | n_outputs=3, 47 | num_neurons=[250, 180, 900, 550, 180], 48 | show_progress=1, embedding=embeddings, vocab_size=len(vocab) 49 | ) 50 | estimator.restore_model("../models/rte/claim_verification_esim.ckpt") 51 | 52 | 53 | if __name__ == "__main__": 54 | pytest.main([__file__]) 55 | -------------------------------------------------------------------------------- /FactualityPrompt/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # bash script 5 | python -m spacy download en_core_web_sm 6 | 7 | # pip install -r requirements.txt 8 | 9 | 10 | pip install fever-drqa 11 | 12 | pip install hydra-core 13 | # pip uninstall sacrebleu; pip install sacrebleu==1.5.1 14 | 15 | pip install tensorflow 16 | pip install torch==1.5.0 17 | pip install torchvision==0.7.0 18 | 19 | 20 | # for SentenceTransformer retriever 21 | pip install torch==1.6.0 22 | pip install -U sentence-transformers # (tokenizer==0.11.6, transformers==4.17.0) 23 | 24 | # python in bash 25 | python - << EOF 26 | import nltk 27 | import benepar 28 | nltk.download('stopwords') 29 | nltk.download('punkt') 30 | benepar.download('benepar_en2') 31 | EOF -------------------------------------------------------------------------------- /FactualityPrompt/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__init__.py -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/claim_handling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/claim_handling.cpython-38.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/const.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/const.cpython-310.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/const.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/const.cpython-38.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/const.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/const.cpython-39.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/factuality_metric.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/factuality_metric.cpython-38.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/metric.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/metric.cpython-38.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/retriever.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/retriever.cpython-38.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/__pycache__/retriever.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/FactualityPrompt/src/__pycache__/retriever.cpython-39.pyc -------------------------------------------------------------------------------- /FactualityPrompt/src/const.py: -------------------------------------------------------------------------------- 1 | DATA_DIR = "/mnt/efs/Haw-Shiuan/true_entropy/FactualityPrompt/data" # absolute path to data directory 2 | GEN_DIR = "/mnt/efs/Haw-Shiuan/llm-aymptotic-decoding/REAL_sampling/outputs/factual_gen" # absolute path to generations save directory 3 | HOME_DIR = "/mnt/efs/Haw-Shiuan/llm-aymptotic-decoding/FactualityPrompt" # absolute path to this project directory 4 | -------------------------------------------------------------------------------- /FactualityPrompt/src/factuality_metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairseq.data.data_utils import collate_tokens 3 | import numpy as np 4 | import re 5 | 6 | NLI_MODEL = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') 7 | NLI_MODEL.eval() 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | softmax = torch.nn.Softmax(dim=1) 10 | NLI_MODEL.to(device) 11 | 12 | 13 | ''' 14 | Returns ([[contradiction, neutral, entailment]], argmax) 15 | ''' 16 | def nli_metric_batch(batch_of_pairs): 17 | # batch_of_pairs = [ 18 | # ['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'], 19 | # ['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'], 20 | # ['potatoes are awesome.', 'I like to run.'], 21 | # ['Mars is very far from earth.', 'Mars is very close.'], 22 | # ] 23 | 24 | encoded_tokens = [NLI_MODEL.encode(pair[0], pair[1]) for pair in batch_of_pairs] 25 | encoded_tokens = [tokens[:min(len(tokens), 512)] for tokens in encoded_tokens] # truncate any long seq 26 | batch = collate_tokens( 27 | encoded_tokens, pad_idx=1 28 | ) 29 | 30 | logprobs = NLI_MODEL.predict('mnli', batch) 31 | logits = softmax(logprobs) 32 | labels = logits.argmax(dim=1) # logprobs.argmax(dim=1) 33 | 34 | return logits.tolist(), labels.tolist() 35 | 36 | 37 | 38 | def nli_metric(premise, hypothesis): 39 | 40 | # Encode a pair of sentences and make a prediction 41 | # tokens = NLI_MODEL.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.') 42 | tokens = NLI_MODEL.encode(premise, hypothesis) 43 | 44 | seq_len = min(len(tokens), 512) 45 | 46 | logits = NLI_MODEL.predict('mnli', tokens[:seq_len]) 47 | logits = softmax(logits) 48 | label = logits.argmax() # 0: contradiction 49 | 50 | return logits.tolist(), label.tolist() 51 | 52 | 53 | # ('As much as', 'CARDINAL') 54 | # ('About 20', 'CARDINAL') 55 | # ('67', 'CARDINAL'), 56 | # ('14,000 meters', 'QUANTITY') vs ('1.4 kilometers', 'QUANTITY') 57 | 58 | def ner_metric(named_entities, prompt_wiki_candidates): 59 | 60 | wiki_text = " ".join(prompt_wiki_candidates).lower() 61 | 62 | # TODO improve the NE match here 63 | # handle DATE, TIME, etc. better! 
appears a lot but handled poorly 64 | 65 | existing_correct_ne = [] 66 | for ent in named_entities: 67 | ent_text = ent[0].lower() 68 | if 'the ' in ent_text: 69 | ent_text = ent_text.replace('the ', "") 70 | 71 | if ent_text in wiki_text: 72 | existing_correct_ne.append(ent) 73 | elif any([bool(word in wiki_text) for word in ent_text.split(" ") if ent[1] == 'PERSON']): 74 | # handle shorter forms of same NE: Exists "Marcus Morgan Bentley", but NE is "Marcus Bentley" or "Bentley" 75 | existing_correct_ne.append(ent) 76 | elif ent[1] == 'DATE': 77 | date_str = re.sub(r"[,.;@#?!&$]+\ *", " ", ent_text) 78 | date_str = date_str.replace("st", "") 79 | date_str = date_str.replace("nd", "") 80 | date_str = date_str.replace("th", "") 81 | date_str = date_str.replace("of", "") 82 | date_tokens = date_str.split(" ") 83 | 84 | if all([bool(token in wiki_text) for token in date_tokens]): 85 | existing_correct_ne.append(ent) 86 | 87 | 88 | 89 | correct_ratio = len(existing_correct_ne)/ len(named_entities) 90 | 91 | return correct_ratio 92 | 93 | 94 | def ie_metric(claims, evidences): 95 | return NotImplementedError 96 | 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | print("Hi") -------------------------------------------------------------------------------- /FactualityPrompt/src/repetition.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/repetition.py by Ari Holtzman. 3 | ''' 4 | import argparse 5 | import json 6 | import os 7 | 8 | from transformers import GPT2Tokenizer 9 | 10 | from src.const import DATA_DIR, HOME_DIR, GEN_DIR 11 | 12 | 13 | def parse_args() -> argparse.Namespace: 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("file", type=str) 16 | parser.add_argument("--eval_dir", type=str, default = '') 17 | parser.add_argument("--numbers-only", action="store_true") 18 | parser.add_argument("--output", action="store_true") 19 | parser.add_argument("--final", action="store_true") 20 | parser.add_argument('--num_eval_sent', type=int, default=1) 21 | 22 | return parser.parse_args() 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large", do_lower_case=True) 28 | SEP = tokenizer.encode(tokenizer.bos_token)[0] 29 | 30 | objs = [] 31 | max_n = 90 32 | 33 | if len(args.eval_dir) > 0: 34 | args.file = "{}/{}".format(args.eval_dir, args.file) 35 | else: 36 | args.file = "{}/{}".format(GEN_DIR, args.file) 37 | with open(args.file, 'r') as fin: 38 | for l in fin: 39 | objs.append(json.loads(l.strip())) 40 | 41 | n_repeated_examples = 0 42 | repeated_times_sum = 0 43 | 44 | nn = len(objs) 45 | for idx, obj in enumerate(objs): 46 | #print(obj) 47 | gen = obj['text'] 48 | if len(gen) == 0: 49 | continue 50 | 51 | if "WikiNamePrefix" in args.file: 52 | wikiPrefix = obj['prompt'].split(". 
")[-1].strip() 53 | gen = gen.replace(wikiPrefix, " ") 54 | 55 | if gen[-1] == SEP: 56 | gen.pop(-1) 57 | rev_gen = list(reversed(gen)) 58 | last_n_repeats = [0] * max_n 59 | 60 | for n in range(1, max_n + 1): 61 | n_repeat = 1 62 | while len(rev_gen[n*n_repeat:n*(n_repeat+1)]) == n and \ 63 | rev_gen[n*n_repeat:n*(n_repeat+1)] == rev_gen[:n]: 64 | n_repeat += 1 65 | last_n_repeats[n - 1] = n_repeat 66 | max_repeated_n = max(range(max_n), key=lambda x: last_n_repeats[x]) 67 | 68 | if last_n_repeats[max_repeated_n] > 1 and (max_repeated_n+1 >= 3 or last_n_repeats[max_repeated_n] > 50): 69 | obj['repetition'] = { 70 | 'repeated_phrase': list(reversed(rev_gen[:max_repeated_n + 1])), 71 | 'repeated_times': last_n_repeats[max_repeated_n], 72 | 'repeated_phrase_length': max_repeated_n + 1, 73 | } 74 | n_repeated_examples += 1 75 | 76 | repeated_times_sum += last_n_repeats[max_repeated_n] 77 | 78 | else: 79 | obj['repetition'] = None 80 | 81 | # if not args.numbers_only: 82 | # print("filename\tnumber of repeating examples") 83 | # print(f"{os.path.basename(args.file)},{n_repeated_examples},{repeated_times_sum/nn}") 84 | print(f"{n_repeated_examples},{repeated_times_sum/nn}") 85 | 86 | if args.num_eval_sent == 1: 87 | score_folder_name = "scores" 88 | else: 89 | score_folder_name = "scores_s"+str(args.num_eval_sent) 90 | output_folder = os.path.dirname(args.file) + '/' + score_folder_name 91 | if not os.path.exists(output_folder): 92 | os.makedirs(output_folder) 93 | if args.output: 94 | output_filename = os.path.join(os.path.dirname(args.file), score_folder_name, "repetition_" + os.path.basename(args.file)) 95 | with open(output_filename, 'w+') as fout: 96 | for obj in objs: 97 | print(json.dumps(obj), file=fout) 98 | 99 | if args.final: 100 | gen_path = output_folder + '/' + os.path.basename(args.file) 101 | res_path = gen_path.replace(".jsonl", "_results.jsonl") 102 | with open(res_path, 'a') as outfile: 103 | res_obj = { 104 | "repetition": n_repeated_examples, 105 | "repetition_ratio": n_repeated_examples / nn 106 | } 107 | json.dump(res_obj, outfile) 108 | outfile.write("\n") 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extrapolating an Infinite LLM♾🤖 2 | 3 | ## Introduction 4 | 5 | Assuming you have a series of LLMs with different sizes that are trained on the same data and you want to increase the factuality and diversity of the text sampled from your largest LLM. Then, consider to use our proposed REAL sampling and/or APD sampling. In FactualityPrompt, we show that APD + REAL sampling outperforms 13 state-of-the-art sampling methods. 
Our baselines include typical ([Meister et al., 2022](https://arxiv.org/abs/2202.00666)), eta ([Hewitt et al., 2022](https://arxiv.org/pdf/2210.15191)), EDT ([Zhang et al., 2024](https://arxiv.org/abs/2403.14541)), adaptive ([Zhu et al., 2024](https://arxiv.org/abs/2402.18223)), mirostat ([Basu et al., 2021](https://arxiv.org/abs/2007.14966)), EAD w/o ELI ([Arora et al., 2023](https://arxiv.org/abs/2302.06784)), factual ([Lee et al., 2022](https://arxiv.org/abs/2206.04624)), top-p ([Holtzman et al., 2020](https://arxiv.org/pdf/1904.09751)), top-k ([Fan et al., 2018](https://arxiv.org/pdf/1805.04833)), and temperature sampling; contrastive search ([Su and Collier, 2022](https://arxiv.org/pdf/2210.14140)), contrastive decoding (CD) ([Li et al., 2022](https://arxiv.org/pdf/2210.15097)), and DoLa ([Chuang et al., 2023](https://arxiv.org/pdf/2309.03883)). We show that APD + REAL sampling makes Pythia 6.9B simultaneously achieve the factuality of greedy sampling and the diversity of top-p sampling with p=0.5. 6 | 7 | 

8 | 9 | ## Usage 10 | 11 | To run our code, please follow the instructions in the README.md of each folder. 12 | 13 | We first wrote the REAL sampling code in the REAL_sampling folder and then revised that code for APD sampling in the AP_sampling folder. As a result, AP_sampling also includes the inference code of REAL sampling. We also slightly modified the code of FactualityPrompt (https://github.com/nayeon7lee/FactualityPrompt) to make it easier to run. 14 | 15 | ## Computational Resources 16 | 17 | Our code assumes that your machine has 8 GPUs and that each GPU has 32 GB of memory. If you have fewer GPUs, or if your GPUs have less memory, you can reduce the sizes of the generation models. 18 | 19 | ## Questions 20 | 21 | If you have any questions or find any bugs, please send an email to Haw-Shiuan Chang (hschang@cs.umass.edu). 22 | 23 | ## Security 24 | 25 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 26 | 27 | ## License 28 | 29 | This library is licensed under the [Creative Commons Attribution-NonCommercial 4.0 International](https://creativecommons.org/licenses/by-nc/4.0/) License. 30 | 31 | ## Citation 32 | 33 | If you use our code for the THF model or REAL sampling in your work, consider citing https://arxiv.org/abs/2406.07735 . 34 | ``` 35 | @misc{chang2024realsamplingboostingfactuality, 36 | title={REAL Sampling: Boosting Factuality and Diversity of Open-Ended Generation via Asymptotic Entropy}, 37 | author={Haw-Shiuan Chang and Nanyun Peng and Mohit Bansal and Anil Ramakrishna and Tagyoung Chung}, 38 | year={2024}, 39 | eprint={2406.07735}, 40 | archivePrefix={arXiv}, 41 | primaryClass={cs.CL}, 42 | url={https://arxiv.org/abs/2406.07735}, 43 | } 44 | ``` 45 | 46 | If you use our code for APD sampling in your work, consider citing https://arxiv.org/abs/2411.01610 (see the reference and BibTeX information below). 47 | ``` 48 | @inproceedings{chang2024explaining, 49 | title={Explaining and Improving Contrastive Decoding by Extrapolating the Probabilities of a Huge and Hypothetical LM}, 50 | author={Chang, Haw-Shiuan and Peng, Nanyun and Bansal, Mohit and Ramakrishna, Anil and Chung, Tagyoung}, 51 | booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing}, 52 | year={2024}, 53 | } 54 | ``` 55 | 56 | If you use FactualityPrompt, please cite their paper (https://arxiv.org/abs/2206.04624). 57 | -------------------------------------------------------------------------------- /REAL_sampling/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/.DS_Store -------------------------------------------------------------------------------- /REAL_sampling/README.md: -------------------------------------------------------------------------------- 1 | # REAL Sampling: Boosting Factuality and Diversity of Open-Ended Generation by Extrapolating the Entropy of an Infinitely Large LM 2 | 3 | 

4 | 5 | ## Introduction 6 | 7 | REAL (**R**esidual **E**ntropy from **A**symptotic **L**ine) sampling is a decoding method that achieves better factuality and diversity than nucleus sampling by predicting an adaptive threshold of p. Specifically, REAL sampling predicts the step-wise likelihood that an LLM will hallucinate and lowers the p threshold when the LLM is likely to hallucinate; otherwise, REAL sampling raises the p threshold to boost diversity. To predict the step-wise hallucination likelihood without supervision, we construct a Token-level Hallucination Forecasting (THF) model that predicts the asymptotic entropy (i.e., inherent uncertainty) of the next token by extrapolating the next-token entropies from a series of LLMs of different sizes. If an LLM's entropy is higher than the asymptotic entropy (i.e., the LLM is more uncertain than it should be), the THF model predicts a high hallucination hazard, which leads to a lower p threshold in REAL sampling. In the FactualityPrompts benchmark, we demonstrate that REAL sampling based on a 70M THF model can substantially improve the factuality and diversity of 7B LLMs simultaneously, judged by both retrieval-based metrics and human evaluation. 8 | 9 | ## Computational Environment 10 | 11 | You can reproduce our Python environment using 12 | ``` 13 | conda create --name <env_name> --file requirements.txt 14 | ``` 15 | ## How to run REAL sampling 16 | 17 | To learn how to use REAL sampling with Hugging Face Transformers, please see the following example code: 18 | 19 | ``` 20 | ./src/example.py 21 | ``` 22 | 23 | ### Run FactualityPrompts 24 | 25 | This section explains how to generate continuations for the FactualityPrompts prompts and how to evaluate them. 26 | 27 | If you have >7 GPUs in your machine, you can just run the following file to generate the continuations. 28 | ``` 29 | ./bin/continue_wiki_prompt_loop.sh 30 | ``` 31 | 32 | To evaluate the generation results, first follow ../FactualityPrompt/README.md to download the data, change ../FactualityPrompt/src/const.py, and run the following script. 33 | ``` 34 | ../FactualityPrompt/bin/eval_loop.sh 35 | ``` 36 | 37 | 38 | ## How to Train THF 39 | 40 | 41 | Put your text file into "data/raw/". 42 | 43 | Change the INPUT_FILE in bin/train_THF_model.sh and run it (assuming your machine has more than 7 GPUs). 
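
For intuition, the sketch below shows one way the mechanism from the introduction can be wired together: compare the LLM's next-token entropy against the THF model's asymptotic-entropy prediction, and shrink the top-p threshold when the gap (the predicted hallucination hazard) is large. This is an illustrative simplification, not the repository's actual implementation (see ./src/example.py for that); the function name `real_adaptive_top_p` and the exponential mapping from the entropy gap to the threshold are assumptions made only for this example.

```
import torch

def real_adaptive_top_p(logits, asymptotic_entropy, base_p=1.0, decay_temperature=1.0):
    # Entropy of the LLM's next-token distribution.
    probs = torch.softmax(logits, dim=-1)
    model_entropy = -(probs * torch.log(probs + 1e-23)).sum(dim=-1)
    # Residual entropy: how much more uncertain the LLM is than the
    # (THF-predicted) inherent uncertainty of the next token.
    residual = torch.clamp(model_entropy - asymptotic_entropy, min=0.0)
    # A large residual signals hallucination hazard, so the nucleus shrinks;
    # a small residual keeps p close to base_p to preserve diversity.
    return base_p * torch.exp(-residual / decay_temperature)

# Hypothetical next-token logits and a THF prediction of 2.3 nats:
p = real_adaptive_top_p(torch.randn(50304), asymptotic_entropy=torch.tensor(2.3))
```

In this sketch, a larger `decay_temperature` (cf. the `--decay_temperature` flag in bin/continue_wiki_prompt_loop.sh) makes the threshold shrink more slowly as the entropy gap grows.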
44 | 45 | 46 | ## How to use THF to produce unsupervised features for hallucination detection tasks 47 | 48 | Please check src/process_hallucination_dataset/get_entropy_all.py and analyze_datasets/feature_clf_all.py 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /REAL_sampling/bin/continue_wiki_prompt_loop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prompt_folder='../FactualityPrompt/prompts/' 4 | model_name='EleutherAI/pythia-6.9b-deduped' 5 | #model_name='openlm-research/open_llama_7b_v2' 6 | #model_name='facebook/opt-6.7b' 7 | 8 | #export CUDA_LAUNCH_BLOCKING=1 9 | 10 | dataset_suffix='_test7k' 11 | 12 | temperature='1' 13 | 14 | METHOD_ARR=( 'fe_topp' 'topp' ) 15 | MODEL_ARR=( 'OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' '' ) 16 | SUBMETHOD_ARR=( 'exp_1_win' 'a' ) 17 | P_ARR=( '1.0' '0;0.3;0.5;0.6;0.7;1' ) 18 | DT_ARR=( '0.5;0.7;1.0;2.0;3.0;4.0' '1.0' ) 19 | 20 | #METHOD_ARR=( 'topp' 'eta' 'typical' 'decay_period' 'topk' ) 21 | #MODEL_ARR=( '' '' '' '' '' ) 22 | #SUBMETHOD_ARR=( 'a' 'a' 'a' 'a' 'a' ) 23 | #P_ARR=( '1.0;0.8;0.7;0.6;0.5;0.4;0.3' '0.1;0.3;0.8' '2;0.9;0.5;0.3' '0.9' '10;5;3;2;1' ) 24 | #DT_ARR=( '1' '1' '1' '0.95;0.9;0.7;0.5;0.3;0.1' '1' ) 25 | 26 | #METHOD_ARR=( 'CD' ) 27 | #MODEL_ARR=( 'EleutherAI/pythia-70m-deduped' ) 28 | #SUBMETHOD_ARR=( 'a' ) 29 | #P_ARR=( '0.6;0.4;0.2;0.25;0.1;0.05' ) 30 | #DT_ARR=( '1.0' ) 31 | 32 | #METHOD_ARR=( 'fe_CD_topp' 'fe_topp_period' 'topp' ) 33 | #MODEL_ARR=( 'OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' 'OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' '') 34 | #SUBMETHOD_ARR=( 'exp_1_win' 'exp_1_win' 'a' ) 35 | #P_ARR=( '1.0' '0.9' '1.0' ) 36 | #DT_ARR=( '4.0;0.7;1.5' '5.0' '0.1;0.3;0.7;0.9' ) 37 | 38 | #METHOD_ARR=( 'CD' ) 39 | #MODEL_ARR=( 'facebook/opt-125m' ) 40 | #SUBMETHOD_ARR=( 'a' ) 41 | #P_ARR=( '0.1;0.05' ) 42 | #DT_ARR=( '0.25' ) 43 | 44 | #METHOD_ARR=( 'CS' ) 45 | #MODEL_ARR=( '' ) 46 | #SUBMETHOD_ARR=( 'a' ) 47 | #P_ARR=( '5.0' ) 48 | #DT_ARR=( '0.6' ) 49 | #P_ARR=( '5.0;10' ) 50 | #DT_ARR=( '0.4;0.6' ) 51 | 52 | init_existing_seeds=0 53 | repeat_times=4 54 | 55 | dataset_names=("fever_factual${dataset_suffix}_final.jsonl" "fever_nonfactual${dataset_suffix}_final.jsonl") 56 | 57 | #input_datasets=($(for x in "${dataset_names[@]}"; do printf "$x%.0s " {1..${repeat_times}}; done)) 58 | #for v in ${dataset_names[@]}; do for i in $(seq 1 $repeat_times); do echo $v; done; done 59 | 60 | END=$(($init_existing_seeds + $repeat_times - 1)) 61 | input_datasets=($(for v in ${dataset_names[@]}; do for i in $(seq 1 $repeat_times); do echo $v; done; done)) 62 | existing_seeds_arr=($(seq $init_existing_seeds $END)) 63 | existing_seeds_arr=("${existing_seeds_arr[@]}" "${existing_seeds_arr[@]}") 64 | echo ${input_datasets[@]} 65 | echo ${existing_seeds_arr[@]} 66 | 67 | for j in "${!METHOD_ARR[@]}"; do 68 | MODEL=${MODEL_ARR[$j]} 69 | sample_method=${METHOD_ARR[$j]} 70 | sample_sub_method=${SUBMETHOD_ARR[$j]} 71 | top_p_all=${P_ARR[$j]} 72 | decay_temperature_all=${DT_ARR[$j]} 73 | 74 | final_entropy_model_path="models/$MODEL" 75 | batch_size=8 76 | if [[ $MODEL == *"410"* ]]; then 77 | batch_size=4 78 | fi 79 | if [[ $MODEL == *"_1b_"* ]]; then 80 | batch_size=2 81 | fi 82 | if [[ $MODEL == *"EleutherAI"* ]]; then 83 | batch_size=4 84 | fi 85 | if [[ $sample_method == *"fe_CD"* ]]; then 86 | batch_size=4 87 | fi 88 | if [[ $sample_method == *"CS"* ]]; then 89 | batch_size=1 90 | fi 91 | IFS=";" read -r -a top_p_list <<< 
"${top_p_all}" 92 | IFS=";" read -r -a decay_temperature_list <<< "${decay_temperature_all}" 93 | for top_p in "${top_p_list[@]}"; do 94 | for decay_temperature in "${decay_temperature_list[@]}"; do 95 | pids=() 96 | for i in "${!input_datasets[@]}"; 97 | do 98 | dataset_name=${input_datasets[$i]} 99 | num_existing_seeds=${existing_seeds_arr[$i]} 100 | echo "python src/factual_gen/gen_fp.py --model_name=$model_name --input_file_name ${prompt_folder}/$dataset_name --cuda_idx $i --p $top_p --num_existing_seeds $num_existing_seeds --sample_method $sample_method --final_entropy_model_path $final_entropy_model_path --batch_size $batch_size --decay_temperature $decay_temperature --temperature $temperature --sample_sub_method $sample_sub_method" 101 | #sleep 1 & 102 | python src/factual_gen/gen_fp.py --model_name=$model_name --input_file_name ${prompt_folder}/$dataset_name --cuda_idx $i --p $top_p --num_existing_seeds $num_existing_seeds --sample_method $sample_method --final_entropy_model_path $final_entropy_model_path --batch_size $batch_size --decay_temperature $decay_temperature --temperature $temperature --sample_sub_method $sample_sub_method & 103 | pids+=($!) 104 | done 105 | echo "${pids[@]}" 106 | wait "${pids[@]}" 107 | done 108 | done 109 | done 110 | -------------------------------------------------------------------------------- /REAL_sampling/bin/train_THF_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #INPUT_FILE="data/raw/wiki2021_text_only_1e4" 4 | #PROC_FOLDER="data/processed/wiki_1e4_Pythia_temp/" 5 | #TOKENIZER="EleutherAI/pythia-70m-deduped" 6 | #OUTPUT_MODEL_FOLDER="models/wiki_1e4_70M_bsz_128_exp_pred_last_a10_e3" 7 | 8 | INPUT_FILE="data/raw/OWT_wiki_1e7" 9 | PROC_FOLDER="data/processed/OWT_wiki_1e7_Pythia/" 10 | TOKENIZER="EleutherAI/pythia-70m-deduped" 11 | OUTPUT_MODEL_FOLDER="models/OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3" 12 | 13 | echo "python src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER" 14 | python src/prepare_id_corpus_from_raw.py --input_file $INPUT_FILE --output_dir $PROC_FOLDER/tensors_all/ --model_name $TOKENIZER 15 | 16 | declare -a bsz_arr=(1 2 4 4 8 12 16) 17 | declare -a model_arr=("EleutherAI/pythia-6.9b-deduped" "EleutherAI/pythia-2.8b-deduped" "EleutherAI/pythia-1.4b-deduped" "EleutherAI/pythia-1b-deduped" "EleutherAI/pythia-410m-deduped" "EleutherAI/pythia-160m-deduped" "EleutherAI/pythia-70m-deduped" ) 18 | 19 | pids=() 20 | for i in "${!model_arr[@]}"; 21 | do 22 | model_name=${model_arr[$i]} 23 | batch_size=${bsz_arr[$i]} 24 | echo "python src/collect_gt_entropy.py --model_name=$model_name --input_folder_name $PROC_FOLDER --cuda_idx $i --batch_size $batch_size" 25 | python src/collect_gt_entropy.py --model_name=$model_name --input_folder_name $PROC_FOLDER --cuda_idx $i --batch_size $batch_size & 26 | pids+=($!) 
27 | done 28 | echo "${pids[@]}" 29 | wait "${pids[@]}" 30 | 31 | echo "python src/train_entropy_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $PROC_FOLDER/entropy_tensor_1024/train --validation_label_folder $PROC_FOLDER/entropy_tensor_1024/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 5000 --num_train_epochs 3" 32 | python src/train_entropy_prediction_model.py --output_dir $OUTPUT_MODEL_FOLDER --train_text_file $PROC_FOLDER/tensors_all/train.pt --validation_text_file $PROC_FOLDER/tensors_all/val_org.pt --train_label_folder $PROC_FOLDER/entropy_tensor_1024/train --validation_label_folder $PROC_FOLDER/entropy_tensor_1024/val --model_name_or_path ${model_arr[-1]} --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --logging_steps 10 --warmup_steps 100 --eval_steps 500 --evaluation_strategy steps --save_steps 5000 --num_train_epochs 3 33 | -------------------------------------------------------------------------------- /REAL_sampling/imgs/REAL_second_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/imgs/REAL_second_figure.png -------------------------------------------------------------------------------- /REAL_sampling/src/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/__pycache__/data_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/data_utils.cpython-37.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/__pycache__/data_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/data_utils.cpython-38.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-37.pyc 
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/model.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/model.cpython-39.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/__pycache__/train_entropy_prediction_model.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/analyze_datasets/__pycache__/utils.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/analyze_datasets/utils.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from torch.nn import functional as F
4 | import codecs
5 | import json
6 | import spacy
7 | from sklearn.metrics import classification_report, accuracy_score, hamming_loss, \
8 | f1_score, precision_score, recall_score, average_precision_score, roc_auc_score, confusion_matrix, \
9 | brier_score_loss
10 | import numpy as np
11 | 
12 | 
13 | def binary_eval(predy, testy, predy_pro, verbose=True, return_f1=False, predscore=None):
14 | acc = accuracy_score(testy, predy)
15 | f1 = f1_score(testy, predy, average=None)
16 | precision = precision_score(testy, predy, average=None)
17 | recall = recall_score(testy, predy, average=None)
18 | 
19 | average_precision = average_precision_score(testy, predy_pro)
20 | epsilon = 1e-8
21 | 
22 | htn, hfp, hfn, htp = confusion_matrix(testy, predy).ravel()
23 | hsensi = htp / (htp + hfn + epsilon)
24 | hspec = htn / (hfp + htn + epsilon)
25 | gmean = np.sqrt(hsensi*hspec)
26 | 
27 | 
28 | info = "Acc : {}\nf1 : {}\nprecision : {}\nrecall : {}\nG-mean : {}\nAP : {}".format(acc,
29 | " ".join([str(x) for x in f1]), " ".join([str(x) for x in precision]),
30 | " ".join([str(x) for x in recall]), gmean, average_precision)
31 | bss = roc_auc = None  # only computed when predscore is given; keeps the return_f1 branch below from hitting an undefined name
32 | if predscore is not None:
33 | bss = brier_score_loss(testy, predscore)
34 | roc_auc = roc_auc_score(testy, predscore)
35 | info += "\nbss : {}\nROC-AUC : {}".format(bss, roc_auc)
36 | 
37 | if verbose:
38 | print(info)
39 | 
40 | if return_f1:
41 | return acc, f1, precision, recall, gmean, bss, roc_auc, info
42 | else:
43 | #return acc, info
44 | return average_precision, info
45 | 
46 | 
47 | def subsets(nums):
48 | """
49 | :type nums: List[int]
50 | :rtype: List[List[int]]
51 | """
52 | ans = []
53 | def dfs(curpos, tmp):
54 | if tmp:
55 | ans.append(tmp[:])
56 | for i in range(curpos, len(nums)):
57 | tmp.append(nums[i])
58 | dfs(i+1, tmp)
59 | tmp.pop(-1)
60 | dfs(0, [])
61 | return ans
62 | 
63 | 
64 | def sent_ner_bounds(sen, nlp=None):
65 | if nlp is None:
66 | nlp = spacy.load('en')  # spaCy 2.x shortcut name; spaCy 3+ uses 'en_core_web_sm'
67 | tokens, tags = [], []
68 | print(sen)
69 | for doc in nlp.pipe([sen]):
70 | for token in doc:
71 | tags.append(token.ent_iob_)
72 | tokens.append(str(token))
73 | 
74 | rep_pos = []
75 | vis = [False for _ in range(len(tags))]
76 | for idx, tag in enumerate(tags):
77 | if tag == 'O':
78 | rep_pos.append([idx, idx])
79 | vis[idx] = True
80 | elif tag == 'B':
81 | end = idx
82 | for j in range(idx+1, len(tags)):
83 | if tags[j] == 'I':
84 | end = j
85 | else:
86 | break
87 | rep_pos.append([idx, end])
88 | elif tag == 'I':
89 | continue
90 | 
91 | return ' '.join(tokens), rep_pos
92 | 
93 | 
94 | def remove_marked_sen(sen, start_id, end_id):
95 | tokens = sen if type(sen) == list else sen.strip().split()
96 | if tokens[start_id].startswith("===") and tokens[end_id].endswith("==="):
97 | tokens[start_id] = tokens[start_id][3:]
98 | tokens[end_id] = tokens[end_id][:-3]
99 | return tokens
100 | 
101 | 
--------------------------------------------------------------------------------
/REAL_sampling/src/collect_gt_entropy.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import numpy as np
4 | import os
5 | import argparse
6 | from data_utils import load_corpus
7 | from tqdm import tqdm
8 | 
9 | def word_ent(model, input_ids):
10 | #top_k_val = 5
11 | assert model is not None
12 | input_ids = input_ids.to(model.device)
13 | outputs = model(input_ids, labels=input_ids)
14 | loss, logits = outputs[:2]
15 | probs = logits.softmax(dim=-1)
16 | ent = - (probs * (1e-23+probs).log() ).sum(dim=-1)  # Shannon entropy of each next-token distribution; 1e-23 guards against log(0)
17 | #top_val, top_idx= torch.topk(probs.squeeze(), k=top_k_val, dim=-1)
18 | #top_idx = top_idx.tolist()
19 | #print(top_idx)
20 | #top_tok = [tokenizer.convert_ids_to_tokens(top_idx[i]) for i in range(len(top_idx))]
21 | #return ent.cpu(), top_tok, top_val.cpu()
22 | return ent.cpu()
23 | 
24 | def str2bool(v):
25 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
26 | return True
27 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
28 | return False
29 | else:
30 | raise argparse.ArgumentTypeError('Boolean value expected.')
31 | 
32 | def parse_args():
33 | parser = argparse.ArgumentParser()
34 | parser.add_argument("--model_name", type=str, required=True)
35 | parser.add_argument("--input_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
36 | #parser.add_argument("--output_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
37 | #parser.add_argument("--output_tensor_folder", type=str, default = 'entropy_tensor')
38 | parser.add_argument("--output_tensor_folder", type=str, default = 'entropy_tensor_1024')
39 | parser.add_argument("--tensor_folder", type=str, default = 'tensors_all')
40 | parser.add_argument("--do_train", type=str2bool, nargs='?', default=True)
41 | parser.add_argument("--do_val", type=str2bool, nargs='?', default=True)
42 | parser.add_argument("--batch_size", type=int, default=8)
43 | #parser.add_argument("--eval_batch_size", type=int, default=16)
44 | #parser.add_argument("--bptt", type=int, default=256)
45 | parser.add_argument("--bptt", type=int, default=1024)
46 | parser.add_argument("--cuda_idx", type=int, default=0)
47 | 
48 | args = parser.parse_args()
49 | return args
50 | 
51 | #model_name = 'EleutherAI/pythia-70m-deduped'
52 | 
53 | def compute_ent(args, model, model_name, dataloader, save_folder_name):  # run word_ent over a whole split and save the concatenated per-token entropies
54 | output_entropy = []
55 | with torch.no_grad():
56 | #for i_batch, sample_batched in enumerate(dataloader_train):
57 | for sample_batched in tqdm(dataloader):
58 | entropy_tensor = word_ent( model, sample_batched )
59 | output_entropy.append(entropy_tensor)
60 | 
61 | output_tensor = torch.cat(output_entropy, dim=0)
62 | print(model_name)
63 | print(args.cuda_idx)
64 | print(output_tensor)
65 | print(output_tensor.size())
66 | del output_entropy
67 | output_dir = args.input_folder_name + '/' + args.output_tensor_folder + '/' + save_folder_name
68 | if not os.path.exists(output_dir):
69 | os.makedirs(output_dir)
70 | output_file_name = output_dir + '/ent_' + model_name.replace('/','_') + '_bptt_' + str(args.bptt) + '.pt'
71 | torch.save(output_tensor, output_file_name)
72 | 
73 | 
74 | 
75 | def main(args):
76 | model_name = args.model_name
77 | #tokenizer = AutoTokenizer.from_pretrained(model_name)
78 | #model = AutoModelWithLMHead.from_pretrained(model_name)
79 | model = AutoModelForCausalLM.from_pretrained(model_name)
80 | #device = torch.cuda.device(args.cuda_idx)
81 | device = torch.device("cuda:"+str(args.cuda_idx))
82 | model.eval()
83 | model.to(device)
84 | 
85 | print(args.do_train)
86 | print(args.do_val)
87 | skip_training = False
88 | #dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = False, load_testing=False)
89 | dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = True, load_testing=False)
90 | 
91 | if args.do_train:
92 | compute_ent(args, model, model_name, dataloader_train, 'train')
93 | if args.do_val:
94 | compute_ent(args, model, model_name, dataloader_val, 'val')
95 | 
96 | if __name__ == "__main__":
97 | args = parse_args()
98 | main(args)
99 | 
--------------------------------------------------------------------------------
/REAL_sampling/src/collect_gt_perplexity.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import numpy as np
4 | import os
5 | import argparse
6 | from data_utils import load_corpus
7 | from tqdm import tqdm
8 | 
9 | loss_fct = torch.nn.CrossEntropyLoss(reduction='none')  # keep per-token losses instead of averaging
10 | 
11 | def word_ent(model, input_ids):  # despite the name, this returns the per-token cross-entropy (log loss), not the distribution entropy
12 | #top_k_val = 5
13 | assert model is not None
14 | input_ids = input_ids.to(model.device)
15 | outputs = model(input_ids, labels=input_ids)
16 | loss, logits = outputs[:2]
17 | #probs = logits.softmax(dim=-1)
18 | #ent = - (probs * (1e-23+probs).log() ).sum(dim=-1)
19 | #return ent.cpu()
20 | 
21 | labels = input_ids
22 | # we are doing next-token prediction; shift prediction scores and input ids by one
23 | shift_logits = logits[:, :-1, :].contiguous()
24 | shift_labels = labels[:, 1:].contiguous()
25 | bsz, seq_len_minus_one = shift_labels.size()
26 | lm_per = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).view(bsz, seq_len_minus_one)
27 | lm_per = torch.cat( (lm_per, torch.zeros( (bsz,1), device = model.device ) ), dim=1 )  # pad the final position with 0 so the output length matches the input
28 | return lm_per.cpu()
29 | 
30 | def str2bool(v):
31 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
32 | return True
33 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
34 | return False
35 | else:
36 | raise argparse.ArgumentTypeError('Boolean value expected.')
37 | 
38 | def parse_args():
39 | parser = argparse.ArgumentParser()
40 | parser.add_argument("--model_name", type=str, required=True)
41 | parser.add_argument("--input_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
42 | #parser.add_argument("--output_folder_name", type=str, required=True, default = 'data/processed/openwebtext17-18_1e6_Pythia')
43 | parser.add_argument("--output_tensor_folder", type=str, default = 'perplexity_tensor')
44 | parser.add_argument("--tensor_folder", type=str, default = 'tensors_all')
45 | parser.add_argument("--do_train", type=str2bool, nargs='?', default=True)
46 | parser.add_argument("--do_val", type=str2bool, nargs='?', default=True)
47 | parser.add_argument("--batch_size", type=int, default=8)
48 | #parser.add_argument("--eval_batch_size", type=int, default=16)
49 | parser.add_argument("--bptt", type=int, default=256)
50 | parser.add_argument("--cuda_idx", type=int, default=0)
51 | 
52 | args = parser.parse_args()
53 | return args
54 | 
55 | #model_name = 'EleutherAI/pythia-70m-deduped'
56 | 
57 | def compute_ent(args, model, model_name, dataloader, save_folder_name):
58 | output_entropy = []
59 | with torch.no_grad():
60 | #for i_batch, sample_batched in enumerate(dataloader_train):
61 | for sample_batched in tqdm(dataloader):
62 | entropy_tensor = word_ent( model, sample_batched )
63 | output_entropy.append(entropy_tensor)
64 | 
65 | output_tensor = torch.cat(output_entropy, dim=0)
66 | print(model_name)
67 | print(args.cuda_idx)
68 | print(output_tensor)
69 | print(output_tensor.size())
70 | del output_entropy
71 | output_dir = args.input_folder_name + '/' + args.output_tensor_folder + '/' + save_folder_name
72 | if not os.path.exists(output_dir):
73 | os.makedirs(output_dir)
74 | output_file_name = output_dir + '/per_' + model_name.replace('/','_') + '_bptt_' + str(args.bptt) + '.pt'
75 | torch.save(output_tensor, output_file_name)
76 | 
77 | 
78 | 
79 | def main(args):
80 | model_name = args.model_name
81 | #tokenizer = AutoTokenizer.from_pretrained(model_name)
82 | #model = AutoModelWithLMHead.from_pretrained(model_name)
83 | model = AutoModelForCausalLM.from_pretrained(model_name)
84 | #device = torch.cuda.device(args.cuda_idx)
85 | device = torch.device("cuda:"+str(args.cuda_idx))
86 | model.eval()
87 | model.to(device)
88 | 
89 | print(args.do_train)
90 | print(args.do_val)
91 | skip_training 
= False 92 | #dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = False, load_testing=False) 93 | dataloader_train, dataloader_val, dataloader_test = load_corpus(args.input_folder_name, args.batch_size, args.batch_size, args.bptt, device, args.tensor_folder, shuffle_train=False, skip_training = False, load_val = True, load_testing=False) 94 | 95 | if args.do_train: 96 | compute_ent(args, model, model_name, dataloader_train, 'train') 97 | if args.do_val: 98 | compute_ent(args, model, model_name, dataloader_val, 'val') 99 | 100 | if __name__ == "__main__": 101 | args = parse_args() 102 | main(args) 103 | -------------------------------------------------------------------------------- /REAL_sampling/src/colorize.html: -------------------------------------------------------------------------------- 1 |  The  quick  brown  fox  jumps  over  the  lazy  dog  -------------------------------------------------------------------------------- /REAL_sampling/src/data_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SeqDataset(torch.utils.data.Dataset): 4 | def __init__(self, w_ind_gpt2_tensor, bptt, device): 5 | self.w_ind_gpt2 = w_ind_gpt2_tensor 6 | self.seq_len = bptt 7 | self.output_device = device 8 | 9 | def __len__(self): 10 | return int( self.w_ind_gpt2.size(0) /self.seq_len ) 11 | 12 | def __getitem__(self, idx): 13 | feature = self.w_ind_gpt2[idx*self.seq_len:(idx+1)*self.seq_len].to(dtype = torch.long, device = self.output_device) 14 | return feature 15 | 16 | def create_data_loader(f_in, bsz, bptt, device, dataset_class, shuffle = True): 17 | w_ind_gpt2_tensor = torch.load(f_in, map_location='cpu') 18 | cut_tok_num = w_ind_gpt2_tensor.size(0) % bptt 19 | if cut_tok_num > 0: 20 | w_ind_gpt2_tensor = w_ind_gpt2_tensor[:-cut_tok_num] 21 | dataset = dataset_class(w_ind_gpt2_tensor, bptt, device) 22 | use_cuda = False 23 | if device.type == 'cuda': 24 | use_cuda = True 25 | return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=False) 26 | #return torch.utils.data.DataLoader(dataset, batch_size = bsz, shuffle = shuffle, pin_memory=not use_cuda, drop_last=True) 27 | 28 | 29 | def load_corpus(data_path, train_bsz, eval_bsz, bptt, device, tensor_folder = "tensors_all", skip_training = False, shuffle_train=True, shuffle_val = False, load_val = True, load_testing = True): 30 | train_corpus_name = data_path + "/" + tensor_folder + "/train.pt" 31 | val_org_corpus_name = data_path +"/" + tensor_folder + "/val_org.pt" 32 | test_org_corpus_name = data_path +"/" + tensor_folder + "/test_org.pt" 33 | 34 | dataloader_train = [] 35 | dataloader_val = [] 36 | dataloader_test = [] 37 | 38 | dataset_class = SeqDataset 39 | 40 | if load_val: 41 | with open(val_org_corpus_name,'rb') as f_in: 42 | dataloader_val = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val) 43 | 44 | if load_testing: 45 | with open(test_org_corpus_name,'rb') as f_in: 46 | dataloader_test = create_data_loader(f_in, eval_bsz, bptt, device, dataset_class, shuffle = shuffle_val) 47 | 48 | if not skip_training: 49 | with open(train_corpus_name,'rb') as f_in: 50 | dataloader_train = create_data_loader(f_in, train_bsz, bptt, device, dataset_class, shuffle = shuffle_train) 51 | 52 | return dataloader_train, 
dataloader_val, dataloader_test 53 | -------------------------------------------------------------------------------- /REAL_sampling/src/example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./src/factual_gen/') 3 | from sampling_method import FETopPLogitsWarper, LogitsProcessorList 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import torch 6 | 7 | sampling = 'REAL' 8 | #sampling = 'REAL + CD' 9 | 10 | LLM = 'Pythia' 11 | #LLM = 'OPT' 12 | 13 | final_entropy_model_path = 'models/OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3' 14 | decay_temperature = 2 15 | window_size = 40 16 | device = torch.device("cuda:0") 17 | 18 | if LLM == 'Pythia': 19 | LM_gen = 'EleutherAI/pythia-6.9b-deduped' 20 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024) 21 | tokenizer_ent = tokenizer 22 | else: 23 | LM_gen = 'facebook/opt-6.7b' 24 | tokenizer = AutoTokenizer.from_pretrained(LM_gen, padding_side='left', model_max_length=1024) 25 | tokenizer_ent = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m-deduped', padding_side='left', model_max_length=1024) 26 | 27 | tokenizer.pad_token = tokenizer.eos_token 28 | tokenizer_ent.pad_token = tokenizer_ent.eos_token 29 | 30 | model = AutoModelForCausalLM.from_pretrained(LM_gen) 31 | model.eval() 32 | model.to(device) 33 | 34 | if sampling == 'REAL': 35 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, device=device) 36 | else: 37 | if LLM == 'Pythia': 38 | student_model_name = 'EleutherAI/pythia-70m-deduped' 39 | else: 40 | student_model_name = 'facebook/opt-125m' 41 | logits_processor_i = FETopPLogitsWarper(top_p = 1, decay_temperature = decay_temperature, final_entropy_model_path = final_entropy_model_path, tokenizer=tokenizer, tokenizer_ent=tokenizer_ent, sample_sub_method = 'exp_1_win', window_size = window_size, student_model_name=student_model_name, use_CD_alpha= False, device=device) 42 | 43 | logits_processor = LogitsProcessorList() 44 | logits_processor.append(logits_processor_i) 45 | 46 | input_prompt = " I like to go hiking." 
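# The REAL warper installed in `logits_processor` above adjusts the next-token
# logits at every decoding step, so the plain HuggingFace generate() call below
# already performs REAL sampling. Two notes (editorial assumptions based on the
# HuggingFace defaults, not stated in this repo): generate() is called without
# max_new_tokens/max_length, so it falls back to the model's short default
# generation length; pass e.g. model.generate(..., max_new_tokens=128) for a
# longer continuation (128 is an illustrative value). The leading space in the
# prompt also matters: as the simple_exp.ipynb cells later in this dump show,
# these BPE tokenizers encode " word" and "word" differently.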
47 | input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
48 | 
49 | output_sequences = model.generate(input_ids=input_ids.to(device), pad_token_id=tokenizer.eos_token_id, logits_processor=logits_processor, do_sample=True )
50 | input_len = input_ids.size(-1)
51 | output_con = output_sequences[0,input_len:]
52 | output_text = tokenizer.decode(output_con, skip_special_tokens=True)
53 | print("Input: ", input_prompt)
54 | print("Output: ", output_text)
55 | 
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/.gen_fp.py.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/.gen_fp.py.swp
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-310.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-38.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/factual_gen/__pycache__/sampling_method.cpython-39.pyc
--------------------------------------------------------------------------------
/REAL_sampling/src/factual_gen/collect_GPT_results.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | import json
5 | 
6 | input_folder = 'outputs/GPT_exp/old/GPT3.5_responses_500/'
7 | 
8 | result_dict = {'file_name': [], 'avg_win_rate_F': [], 'avg_win_rate_C': [], 'avg_win_rate_L': [], 'avg_win_rate_O': [], 'avg_score_F': [], 'avg_score_C': [], 'avg_score_L': [], 'avg_score_O': [], 'avg_b_score_F': [], 'avg_b_score_C': [], 'avg_b_score_L': [], 'avg_b_score_O': [], 'avg_diff_score_F': [], 'avg_diff_score_C': [], 'avg_diff_score_L': [], 'avg_diff_score_O': []}
9 | 
10 | all_bad_idx = []
11 | 
12 | for result_file in os.listdir(input_folder):
13 | file_path = input_folder+result_file
14 | if not os.path.isfile(file_path):
15 | continue
16 | with open(file_path) as f_in:
17 | all_inputs = json.load(f_in)
18 | bad_idx_list = []
19 | if len(all_inputs) == 5:
20 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs
21 | all_bad_idx = all_bad_idx + bad_idx_list
22 | 
23 | all_bad_idx_set = set(all_bad_idx)
24 | 
25 | print(all_bad_idx_set)
26 | 
27 | #for result_file in input_file_list:
28 | for result_file in os.listdir(input_folder):
29 | file_path = input_folder+result_file
30 | if not 
os.path.isfile(file_path): 31 | continue 32 | with open(file_path) as f_in: 33 | all_inputs = json.load(f_in) 34 | if len(all_inputs) == 4: 35 | pred_method_name, base_method_name, system_prompt1, all_list = all_inputs 36 | elif len(all_inputs) == 5: 37 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs 38 | id_list, context_list_pred, gen_list_pred, gen_list_base, ref_list, prompt_list, first_res_list, response_list, parse_win_list, parse_score_pred_list, parse_score_base_list = zip(*all_list) 39 | avg_win_rate_F = [] 40 | avg_win_rate_C = [] 41 | avg_win_rate_L = [] 42 | avg_win_rate_O = [] 43 | avg_score_F = [] 44 | avg_score_C = [] 45 | avg_score_L = [] 46 | avg_score_O = [] 47 | avg_b_score_F = [] 48 | avg_b_score_C = [] 49 | avg_b_score_L = [] 50 | avg_b_score_O = [] 51 | for i in range(len(id_list)): 52 | if i in all_bad_idx_set: 53 | continue 54 | avg_win_rate_F.append(int(parse_win_list[i]['F'] == 'pred')) 55 | avg_win_rate_C.append(int(parse_win_list[i]['C'] == 'pred')) 56 | avg_win_rate_L.append(int(parse_win_list[i]['L'] == 'pred')) 57 | avg_win_rate_O.append(int(parse_win_list[i]['O'] == 'pred')) 58 | avg_score_F.append(float(parse_score_pred_list[i]['F'] )) 59 | avg_score_C.append(float(parse_score_pred_list[i]['C'] )) 60 | avg_score_L.append(float(parse_score_pred_list[i]['L'] )) 61 | avg_score_O.append(float(parse_score_pred_list[i]['O'] )) 62 | avg_b_score_F.append(float(parse_score_base_list[i]['F'] )) 63 | avg_b_score_C.append(float(parse_score_base_list[i]['C'] )) 64 | avg_b_score_L.append(float(parse_score_base_list[i]['L'] )) 65 | avg_b_score_O.append(float(parse_score_base_list[i]['O'] )) 66 | 67 | result_dict['file_name'].append(result_file) 68 | result_dict['avg_win_rate_F'].append(np.mean(avg_win_rate_F)) 69 | result_dict['avg_win_rate_C'].append(np.mean(avg_win_rate_C)) 70 | result_dict['avg_win_rate_L'].append(np.mean(avg_win_rate_L)) 71 | result_dict['avg_win_rate_O'].append(np.mean(avg_win_rate_O)) 72 | result_dict['avg_score_F'].append(np.mean(avg_score_F)) 73 | result_dict['avg_score_C'].append(np.mean(avg_score_C)) 74 | result_dict['avg_score_L'].append(np.mean(avg_score_L)) 75 | result_dict['avg_score_O'].append(np.mean(avg_score_O)) 76 | result_dict['avg_b_score_F'].append(np.mean(avg_b_score_F)) 77 | result_dict['avg_b_score_C'].append(np.mean(avg_b_score_C)) 78 | result_dict['avg_b_score_L'].append(np.mean(avg_b_score_L)) 79 | result_dict['avg_b_score_O'].append(np.mean(avg_b_score_O)) 80 | result_dict['avg_diff_score_F'].append(np.mean(avg_score_F) - np.mean(avg_b_score_F)) 81 | result_dict['avg_diff_score_C'].append(np.mean(avg_score_C) - np.mean(avg_b_score_C)) 82 | result_dict['avg_diff_score_L'].append(np.mean(avg_score_L) - np.mean(avg_b_score_L)) 83 | result_dict['avg_diff_score_O'].append(np.mean(avg_score_O) - np.mean(avg_b_score_O)) 84 | 85 | df = pd.DataFrame.from_dict(result_dict) 86 | 87 | #pd.set_option('display.max_columns', None) 88 | pd.options.display.max_colwidth = 150 89 | 90 | df_sort = df.set_index('file_name').sort_values(by=['file_name']) 91 | 92 | #print(df_sort[ ['avg_win_rate_O', 'avg_diff_score_O']]) 93 | print(df_sort) 94 | #print(df['file_name']) 95 | -------------------------------------------------------------------------------- /REAL_sampling/src/factual_gen/comp_collect_GPT_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import json 5 | 6 | 7 | input_folder = 
'outputs/GPT_exp/comp_GPT3.5_responses_500/' 8 | #input_folder = 'outputs/wp/GPT_exp/test_GPT3.5_responses_500/' 9 | 10 | 11 | result_dict = {'file_name': [], 'avg_win_rate_F': [], 'avg_win_rate_C': [], 'avg_win_rate_L': [], 'avg_win_rate_O': [] } 12 | 13 | all_bad_idx = [] 14 | 15 | for result_file in os.listdir(input_folder): 16 | file_path = input_folder+result_file 17 | if not os.path.isfile(file_path): 18 | continue 19 | with open(file_path) as f_in: 20 | all_inputs = json.load(f_in) 21 | bad_idx_list = [] 22 | if len(all_inputs) == 5: 23 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs 24 | all_bad_idx = all_bad_idx + bad_idx_list 25 | 26 | all_bad_idx_set = set(all_bad_idx) 27 | 28 | print(all_bad_idx_set) 29 | 30 | #for result_file in input_file_list: 31 | for result_file in os.listdir(input_folder): 32 | file_path = input_folder+result_file 33 | if not os.path.isfile(file_path): 34 | continue 35 | with open(file_path) as f_in: 36 | all_inputs = json.load(f_in) 37 | if len(all_inputs) == 4: 38 | pred_method_name, base_method_name, system_prompt1, all_list = all_inputs 39 | elif len(all_inputs) == 5: 40 | pred_method_name, base_method_name, system_prompt1, bad_idx_list, all_list = all_inputs 41 | id_list, context_list_pred, gen_list_pred, gen_list_base, ref_list, prompt_list, first_res_list, response_list, parse_win_list = zip(*all_list) 42 | avg_win_rate_F = [] 43 | avg_win_rate_C = [] 44 | avg_win_rate_L = [] 45 | avg_win_rate_O = [] 46 | for i in range(len(id_list)): 47 | if i in all_bad_idx_set: 48 | continue 49 | avg_win_rate_F.append(int(parse_win_list[i]['F'] == 'pred')) 50 | avg_win_rate_C.append(int(parse_win_list[i]['C'] == 'pred')) 51 | avg_win_rate_L.append(int(parse_win_list[i]['L'] == 'pred')) 52 | avg_win_rate_O.append(int(parse_win_list[i]['O'] == 'pred')) 53 | 54 | result_dict['file_name'].append(result_file) 55 | result_dict['avg_win_rate_F'].append(np.mean(avg_win_rate_F)) 56 | result_dict['avg_win_rate_C'].append(np.mean(avg_win_rate_C)) 57 | result_dict['avg_win_rate_L'].append(np.mean(avg_win_rate_L)) 58 | result_dict['avg_win_rate_O'].append(np.mean(avg_win_rate_O)) 59 | 60 | df = pd.DataFrame.from_dict(result_dict) 61 | 62 | #pd.set_option('display.max_columns', None) 63 | pd.options.display.max_colwidth = 150 64 | 65 | df_sort = df.set_index('file_name').sort_values(by=['file_name']) 66 | 67 | #print(df_sort[ ['avg_win_rate_O', 'avg_diff_score_O']]) 68 | print(df_sort) 69 | #print(df['file_name']) 70 | -------------------------------------------------------------------------------- /REAL_sampling/src/factual_gen/prepare_story_prompt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | input_stories = "/mnt/efs/Haw-Shiuan/entailment_tree/datasets/ROCStories__spring2016.csv" 5 | num_stories = 1000 6 | shot_num = 3 7 | prompt_sent_num = 2 8 | output_prompt_file = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/MTurk/story/prompt_start2_b2_{}.jsonl".format(num_stories) 9 | 10 | delimiter = '---' 11 | num_story_line = 5 12 | 13 | df = pd.read_csv(input_stories) 14 | df_sampled_stories = df.sample(n=num_stories, replace=False) 15 | df_rest = df.drop(df_sampled_stories.index) 16 | 17 | def prepare_id(row, prompt_sent_num): 18 | id_q = '' 19 | for i in range(prompt_sent_num): 20 | id_q += row['sentence'+str(i+1)] + ' ' 21 | return id_q[:-1] 22 | 23 | def str_story(row_examples, i, delimiter): 24 | story_str = 'Story {}:\n'.format(i+1) 25 | for i in 
range(num_story_line): 26 | story_str += row_examples['sentence'+str(i+1)] + ' ' 27 | story_str += '\n' + delimiter + '\n' 28 | return story_str 29 | 30 | output_list = [] 31 | for index, row in df_sampled_stories.iterrows(): 32 | out_dict = {} 33 | id_q = prepare_id(row, prompt_sent_num) 34 | out_dict['id'] = id_q 35 | df_examples = df_rest.sample(n=shot_num, replace=False) 36 | prompt_str = ' Here are {} stories. Each story has five sentences.\n\n'.format(shot_num+1) 37 | for i, (index, row_examples) in enumerate(df_examples.iterrows()): 38 | prompt_str += str_story(row_examples, i, delimiter) 39 | 40 | out_dict['prompt'] = prompt_str + 'Story {}:\n'.format(shot_num+1) + id_q 41 | output_list.append(out_dict) 42 | 43 | with open(output_prompt_file, 'w') as f_out: 44 | for out_dict in output_list: 45 | f_out.write(json.dumps(out_dict) + '\n' ) 46 | 47 | -------------------------------------------------------------------------------- /REAL_sampling/src/factual_gen/prepare_wiki_MTurk.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | from nltk.tokenize import sent_tokenize 4 | import random 5 | 6 | sample_numbers = 1000 7 | 8 | input_file_dict = {'Ours': "outputs/factual_gen/factual_test7k_6.9b_fe_topp_exp_1_win_40_dt_2.0_p1.0_OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3/factual_test7k_6.9b_fe_topp_p1.0_gen_seed1.jsonl", 9 | 'Top-p': 'outputs/factual_gen/factual_test7k_6.9b_topp_p0.6_temp_1.0/factual_test7k_6.9b_topp_p0.6_gen_seed1.jsonl', 10 | 'CD': 'outputs/factual_gen/factual_test7k_6.9b_CD_dt_1.0_p0.3_pythia-70m-deduped/factual_test7k_6.9b_CD_p0.3_gen_seed1.jsonl', 11 | 'Ours+CD': 'outputs/factual_gen/factual_test7k_6.9b_fe_CD_topp_exp_1_win_40_dt_1.5_p1.0_OWT_wiki_1e7_70M_bsz_128_exp_pred_last_a10_e3/factual_test7k_6.9b_fe_CD_topp_p1.0_gen_seed1.jsonl' 12 | } 13 | 14 | output_csv = 'outputs/MTurk/wiki/gen_1000.csv' 15 | 16 | method_list = list(input_file_dict.keys()) 17 | 18 | def load_gen(input_file): 19 | id_list = [] 20 | context_list = [] 21 | gen_list = [] 22 | with open(input_file) as f_in: 23 | for line in f_in: 24 | gen_obj = json.loads(line.strip()) 25 | context = gen_obj['prompt'].strip() 26 | id_res = int(gen_obj['id']) 27 | 28 | text = gen_obj['text'].strip() 29 | sents = sent_tokenize(text) 30 | gen = sents[0].replace('\n',' ') 31 | 32 | id_list.append(id_res) 33 | context_list.append(context) 34 | gen_list.append(gen) 35 | if len(id_list) >= sample_numbers: 36 | break 37 | return id_list, context_list, gen_list 38 | 39 | prev_id_list = None 40 | 41 | all_res_dict = {} 42 | 43 | for method_name in input_file_dict: 44 | file_name = input_file_dict[method_name] 45 | print(file_name) 46 | id_list, context_list, gen_list = load_gen(file_name) 47 | print(method_name, sum([len(gen) for gen in gen_list ]) / sample_numbers ) 48 | if prev_id_list is None: 49 | prev_id_list = id_list 50 | all_res_dict['id'] = id_list 51 | all_res_dict['context'] = context_list 52 | else: 53 | for i in range(len(id_list)): 54 | assert id_list[i] == prev_id_list[i] 55 | prev_id_list = id_list 56 | all_res_dict['gen_'+method_name] = gen_list 57 | 58 | df = pd.DataFrame(all_res_dict) 59 | print(df) 60 | 61 | num_method = len(method_list) 62 | 63 | output_dict = {'id': [], 'context': []} 64 | for i in range(num_method): 65 | output_dict['gen_'+str(i+1)] = [] 66 | output_dict['method_'+str(i+1)] = [] 67 | 68 | #drop_idx = [] 69 | 70 | for index, row in df.iterrows(): 71 | gen_list = [] 72 | 73 | for method_name in method_list: 
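# This inner loop collects every method's generation for the current prompt.
# Right after it, the row is skipped when any generation is degenerate (fewer
# than 10 characters or containing the Wikipedia boilerplate 'External links')
# or when two methods produced identical text; the surviving generations are
# then shuffled within the row so annotators cannot infer which column
# corresponds to which sampling method.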
74 | gen_list.append(row['gen_'+method_name]) 75 | if any([len(gen)<10 or 'External links' in gen for gen in gen_list]) or len(gen_list) != len(set(gen_list)): 76 | #drop_idx.append(index) 77 | continue 78 | output_dict['id'].append(row['id']) 79 | output_dict['context'].append(row['context']) 80 | idx_rnd = list(range(num_method)) 81 | random.shuffle(idx_rnd) 82 | for i, idx in enumerate(idx_rnd): 83 | output_dict['gen_'+str(i+1)].append(gen_list[idx]) 84 | output_dict['method_'+str(i+1)].append(method_list[idx]) 85 | 86 | df = pd.DataFrame(output_dict).set_index('id') 87 | #df = df.drop(drop_idx) 88 | 89 | 90 | print(df) 91 | df.to_csv(output_csv) 92 | -------------------------------------------------------------------------------- /REAL_sampling/src/prepare_id_corpus_from_raw.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | import torch 3 | import random 4 | import sys 5 | import os 6 | import argparse 7 | 8 | import logging 9 | logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR) 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--model_name", type=str, default = 'EleutherAI/pythia-70m-deduped') 15 | parser.add_argument("--input_file", type=str, required=True, default = 'data/raw/OWT_wiki_1e7') 16 | parser.add_argument("--output_dir", type=str, default = './data/processed/OWT_wiki_1e7_Pythia/tensors_all/') 17 | parser.add_argument("--training_ratio", type=float, default=0.96) 18 | parser.add_argument("--val_ratio", type=float, default=0.02) 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | args = parse_args() 24 | 25 | input_file = args.input_file 26 | output_dir = args.output_dir 27 | model_name = args.model_name 28 | training_ratio = args.training_ratio 29 | val_ratio = args.val_ratio 30 | 31 | output_train_file = output_dir + "train.pt" 32 | output_val_file = output_dir + "val_org.pt" 33 | output_test_file = output_dir + "test_org.pt" 34 | 35 | if not os.path.exists(output_dir): 36 | os.makedirs(output_dir) 37 | 38 | max_line_num = 100000000000000 39 | #max_line_num = 100000 40 | #max_line_num = 10000000 41 | #max_line_num = 20000000 42 | #max_line_num = 2000000 43 | 44 | #max_sent_len = 256 45 | 46 | output_arr = [] 47 | 48 | tokenizer = AutoTokenizer.from_pretrained(model_name) 49 | 50 | i=0 51 | with open(input_file, encoding='latin-1') as f_in: 52 | for line in f_in: 53 | raw_text = line 54 | i+=1 55 | #indexed_tokens = tokenizer.encode(raw_text, add_prefix_space=True) 56 | indexed_tokens = tokenizer.encode(raw_text) 57 | output_arr.append(indexed_tokens) 58 | if i % 100000 == 0: 59 | print(i) 60 | sys.stdout.flush() 61 | if i > max_line_num: 62 | break 63 | 64 | #idx_shuffled = list(range(len(output_arr))) 65 | #random.shuffle(idx_shuffled) 66 | training_size = int(len(output_arr)*training_ratio) 67 | val_size = int(len(output_arr)*val_ratio) 68 | 69 | def save_to_tensor(output_arr, output_file_name): 70 | data_size = len(output_arr) 71 | len_sum = 0 72 | for sent in output_arr: 73 | sent_len = len(sent) 74 | len_sum += sent_len 75 | #output_tensor = torch.zeros((len_sum),dtype = torch.uint16) 76 | output_tensor = torch.zeros((len_sum),dtype = torch.int32) 77 | 78 | current_start = 0 79 | for i in range(data_size): 80 | sent = output_arr[i] 81 | #output_tensor[current_start:current_start+len(sent)] = torch.tensor(sent,dtype = torch.uint16) 82 | output_tensor[current_start:current_start+len(sent)] = 
torch.tensor(sent,dtype = torch.int32)
83 | current_start += len(sent)
84 | 
85 | torch.save(output_tensor, output_file_name)
86 | 
87 | save_to_tensor(output_arr[:training_size], output_train_file)
88 | save_to_tensor(output_arr[training_size:training_size+val_size], output_val_file)
89 | save_to_tensor(output_arr[training_size+val_size:], output_test_file)
90 | 
--------------------------------------------------------------------------------
/REAL_sampling/src/process_hallucination_dataset/Hades/utils.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from torch.nn import functional as F
4 | import codecs
5 | import json
6 | import spacy
7 | from sklearn.metrics import classification_report, accuracy_score, hamming_loss, \
8 | f1_score, precision_score, recall_score, average_precision_score, roc_auc_score, confusion_matrix, \
9 | brier_score_loss
10 | import numpy as np
11 | 
12 | 
13 | def binary_eval(predy, testy, predy_pro, verbose=True, return_f1=False, predscore=None):
14 | acc = accuracy_score(testy, predy)
15 | f1 = f1_score(testy, predy, average=None)
16 | precision = precision_score(testy, predy, average=None)
17 | recall = recall_score(testy, predy, average=None)
18 | 
19 | average_precision = average_precision_score(testy, predy_pro)
20 | epsilon = 1e-8
21 | 
22 | htn, hfp, hfn, htp = confusion_matrix(testy, predy).ravel()
23 | hsensi = htp / (htp + hfn + epsilon)
24 | hspec = htn / (hfp + htn + epsilon)
25 | gmean = np.sqrt(hsensi*hspec)
26 | 
27 | 
28 | info = "Acc : {}\nf1 : {}\nprecision : {}\nrecall : {}\nG-mean : {}\nAP : {}".format(acc,
29 | " ".join([str(x) for x in f1]), " ".join([str(x) for x in precision]),
30 | " ".join([str(x) for x in recall]), gmean, average_precision)
31 | bss = roc_auc = None  # only computed when predscore is given; keeps the return_f1 branch below from hitting an undefined name
32 | if predscore is not None:
33 | bss = brier_score_loss(testy, predscore)
34 | roc_auc = roc_auc_score(testy, predscore)
35 | info += "\nbss : {}\nROC-AUC : {}".format(bss, roc_auc)
36 | 
37 | if verbose:
38 | print(info)
39 | 
40 | if return_f1:
41 | return acc, f1, precision, recall, gmean, bss, roc_auc, info
42 | else:
43 | #return acc, info
44 | return average_precision, info
45 | 
46 | 
47 | def subsets(nums):
48 | """
49 | :type nums: List[int]
50 | :rtype: List[List[int]]
51 | """
52 | ans = []
53 | def dfs(curpos, tmp):
54 | if tmp:
55 | ans.append(tmp[:])
56 | for i in range(curpos, len(nums)):
57 | tmp.append(nums[i])
58 | dfs(i+1, tmp)
59 | tmp.pop(-1)
60 | dfs(0, [])
61 | return ans
62 | 
63 | 
64 | def sent_ner_bounds(sen, nlp=None):
65 | if nlp is None:
66 | nlp = spacy.load('en')  # spaCy 2.x shortcut name; spaCy 3+ uses 'en_core_web_sm'
67 | tokens, tags = [], []
68 | print(sen)
69 | for doc in nlp.pipe([sen]):
70 | for token in doc:
71 | tags.append(token.ent_iob_)
72 | tokens.append(str(token))
73 | 
74 | rep_pos = []
75 | vis = [False for _ in range(len(tags))]
76 | for idx, tag in enumerate(tags):
77 | if tag == 'O':
78 | rep_pos.append([idx, idx])
79 | vis[idx] = True
80 | elif tag == 'B':
81 | end = idx
82 | for j in range(idx+1, len(tags)):
83 | if tags[j] == 'I':
84 | end = j
85 | else:
86 | break
87 | rep_pos.append([idx, end])
88 | elif tag == 'I':
89 | continue
90 | 
91 | return ' '.join(tokens), rep_pos
92 | 
93 | 
94 | def remove_marked_sen(sen, start_id, end_id):
95 | tokens = sen if type(sen) == list else sen.strip().split()
96 | if tokens[start_id].startswith("===") and tokens[end_id].endswith("==="):
97 | tokens[start_id] = tokens[start_id][3:]
98 | tokens[end_id] = tokens[end_id][:-3]
99 | return tokens
100 | 
101 | 
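A minimal usage sketch for the binary_eval() helper above (not part of the repository; the toy arrays and the `from utils import ...` path are assumptions for illustration):

import numpy as np
from utils import binary_eval  # assumes Hades/utils.py is on the import path

# toy ground-truth labels, hard predictions, and predicted probabilities
testy = np.array([0, 0, 1, 1, 1, 0])
predy = np.array([0, 1, 1, 1, 0, 0])
predy_pro = np.array([0.2, 0.6, 0.9, 0.7, 0.4, 0.1])

# with return_f1=False (the default) the function returns (average precision, report string);
# passing predscore additionally puts the Brier score and ROC-AUC into the report
ap, info = binary_eval(predy, testy, predy_pro, verbose=False, predscore=predy_pro)
print(ap)
print(info)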
-------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-310.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-37.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/compute_ent_features.cpython-38.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-310.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-37.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/llm-asymptotic-decoding/beff605ce1a89fd4014fbcb11301136be569f889/REAL_sampling/src/process_hallucination_dataset/__pycache__/data_classes.cpython-38.pyc -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/concat_category_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | folder_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/state/' 4 | 5 | #file_suffix = '_val' 6 | file_suffix = '_train' 7 | 8 | output_file_name = 'all' 9 | 10 | input_cat_list = ['animals_true_false', 11 | 'capitals_true_false', 12 | 'cities_true_false', 13 | 'companies_true_false', 14 | 'conj_neg_companies_true_false', 15 | 'conj_neg_facts_true_false', 16 | 'elements_true_false', 17 | 'facts_true_false', 18 | 'generated_true_false', 19 | 'inventions_true_false', 20 | 'neg_companies_true_false', 21 | 
'neg_facts_true_false'] 22 | 23 | df_all = None 24 | 25 | for cat in input_cat_list: 26 | df_cat = pd.read_csv(folder_path + cat + file_suffix+'.csv') 27 | df_cat['category'] = cat 28 | df_all = pd.concat([df_all,df_cat]) 29 | 30 | df_all.to_csv(folder_path+output_file_name+file_suffix+'.csv') 31 | -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/convert_humor_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from transformers import AutoTokenizer 4 | 5 | #input_path = "/mnt/efs/Haw-Shiuan/rJokesData/data/dev.tsv" 6 | input_path = "/mnt/efs/Haw-Shiuan/rJokesData/data/test.tsv" 7 | 8 | #output_path = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/humor/all_128_train.csv" 9 | output_path = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/humor/all_128_val.csv" 10 | 11 | cut_end = True 12 | 13 | if cut_end: 14 | #max_token_num = 2048 15 | #max_token_num = 1024 16 | max_token_num = 128 17 | small_model_name = 'EleutherAI/pythia-70m-deduped' 18 | tokenizer = AutoTokenizer.from_pretrained(small_model_name, truncation_side='left') 19 | 20 | label_reg_arr = [] 21 | label_arr = [] 22 | text_arr = [] 23 | cat_arr = [] 24 | 25 | def preprocessing_text(text): 26 | text_tok = tokenizer.tokenize(text) 27 | num_cut = len(text_tok) - max_token_num 28 | if num_cut > 0: 29 | print('cut ', num_cut) 30 | doc_trunc = tokenizer.convert_tokens_to_string( text_tok[:-(num_cut+10)] ) + ' ...' 31 | return doc_trunc, len(text_tok) 32 | else: 33 | return text, len(text_tok) 34 | 35 | with open(input_path) as f_in: 36 | for line in f_in: 37 | #print(line.strip().split('\t',1)) 38 | label_reg, text = line.strip().split('\t',1) 39 | label_reg = int(label_reg) 40 | text, org_len = preprocessing_text(text) 41 | if org_len < 2: 42 | print('skip too short example') 43 | continue 44 | text_arr.append(text) 45 | label_reg_arr.append(label_reg) 46 | if label_reg > 1: 47 | label = 1 48 | else: 49 | label = 0 50 | label_arr.append(label) 51 | cat_arr.append('rJoke') 52 | 53 | print('positive ratio', sum(label_arr) / float( len(label_arr) ) ) 54 | 55 | df = pd.DataFrame({'statement': text_arr, 'label': label_arr, 'label_reg': label_reg_arr, 'category': cat_arr}) 56 | 57 | df.to_csv(output_path) 58 | -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/split_csv_datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | 4 | input_folder = "/mnt/efs/Haw-Shiuan/factor/data/" 5 | output_folder = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/factor/" 6 | 7 | #input_folder = "/mnt/efs/Haw-Shiuan/Probes/datasets/" 8 | #output_folder = "/mnt/efs/Haw-Shiuan/true_entropy/outputs/state/" 9 | 10 | training_ratio = 0.5 11 | val_ratio = 0.5 12 | #test_ratio = 0.1 13 | 14 | assert training_ratio + val_ratio == 1 15 | #assert training_ratio + val_ratio + test_ratio == 1 16 | 17 | for input_file in os.listdir(input_folder): 18 | #print(input_file) 19 | input_path = input_folder + input_file 20 | if not os.path.isfile(input_path): 21 | continue 22 | 23 | #input_name = os.path.basename(input_file) 24 | output_path = output_folder + input_file.replace('.csv', '_{}.csv') 25 | 26 | df = pd.read_csv(input_path) 27 | 28 | training_size = int( len(df) * training_ratio ) 29 | val_size = int( len(df) * val_ratio ) 30 | 31 | df_part = df.sample(n = training_size) 32 | 
df_part.to_csv(output_path.format('train'), index = False) 33 | 34 | df = df.drop(df_part.index) 35 | df_part = df.sample(n = val_size) 36 | df_part.to_csv(output_path.format('val'), index = False) 37 | 38 | #df_part = df.drop(df_part.index) 39 | #df_part.to_csv(output_path.format('test'), index = False) 40 | -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/split_data.sh: -------------------------------------------------------------------------------- 1 | input_folder="/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/" 2 | 3 | #input_file="${input_folder}summarization_data_2048.json" 4 | #mid_file="${input_folder}summarization_data_2048_rnd.json" 5 | #output_prefix="${input_folder}summarization_data_2048_" 6 | input_file="${input_folder}summarization_data_1024.json" 7 | mid_file="${input_folder}summarization_data_1024_rnd.json" 8 | output_prefix="${input_folder}summarization_data_1024_" 9 | 10 | #input_file="${input_folder}qa_data.json" 11 | #mid_file="${input_folder}qa_data_rnd.json" 12 | #output_prefix="${input_folder}qa_data_" 13 | #input_file="${input_folder}qa_data_knowledge.json" 14 | #mid_file="${input_folder}qa_data_knowledge_rnd.json" 15 | #output_prefix="${input_folder}qa_data_knowledge_" 16 | 17 | #input_file="${input_folder}dialogue_data.json" 18 | #mid_file="${input_folder}dialogue_data_rnd.json" 19 | #output_prefix="${input_folder}dialogue_data_" 20 | #input_file="${input_folder}dialogue_data_knowledge.json" 21 | #mid_file="${input_folder}dialogue_data_knowledge_rnd.json" 22 | #output_prefix="${input_folder}dialogue_data_knowledge_" 23 | 24 | sort -R $input_file > $mid_file 25 | 26 | num_files=10 27 | #num_files=5 28 | total_lines=$(wc -l <${mid_file}) 29 | ((lines_per_file = (total_lines + num_files - 1) / num_files)) 30 | split -l ${lines_per_file} ${mid_file} 31 | 32 | cat xaa xab xac xad xae xaf xag xah > ${output_prefix}train.json 33 | mv xai ${output_prefix}val.json 34 | mv xaj ${output_prefix}test.json 35 | rm xa* 36 | 37 | #cat xac xad xae > ${output_prefix}train.json 38 | #mv xaa ${output_prefix}val.json 39 | #mv xab ${output_prefix}test.json 40 | #rm xa* 41 | -------------------------------------------------------------------------------- /REAL_sampling/src/process_hallucination_dataset/unify_Halu_datasets_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from transformers import AutoTokenizer 4 | 5 | #input_path = '/mnt/efs/Haw-Shiuan/HaluEval/data/dialogue_data.json' 6 | #input_path = '/mnt/efs/Haw-Shiuan/HaluEval/data/qa_data.json' 7 | input_path = '/mnt/efs/Haw-Shiuan/HaluEval/data/summarization_data.json' 8 | 9 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/dialogue_data.json' 10 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/qa_data.json' 11 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data.json' 12 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data_2048.json' 13 | output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/summarization_data_1024.json' 14 | 15 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/dialogue_data_knowledge.json' 16 | #output_path = '/mnt/efs/Haw-Shiuan/true_entropy/outputs/Halu/qa_data_knowledge.json' 17 | 18 | #include_knowledge = True 19 | include_knowledge = False 20 | 21 | cut_end = True 22 | 23 | if cut_end: 24 | #max_token_num = 2048 25 | max_token_num = 1024 26 | 
small_model_name = 'EleutherAI/pythia-70m-deduped' 27 | tokenizer = AutoTokenizer.from_pretrained(small_model_name, truncation_side='left') 28 | 29 | prepend_space = True 30 | if prepend_space: 31 | space_prefix = ' ' 32 | space_suffix = '' 33 | else: 34 | space_prefix = '' 35 | space_suffix = ' ' 36 | 37 | 38 | output_list = [] 39 | with open(input_path, 'r', encoding='utf-8') as f: 40 | for line in f: 41 | sample = json.loads(line) 42 | #pos_output_dict = {'factual': 1}#{'context': '', 'text': '', 'factual': ''} 43 | #neg_output_dict = {'factual': 0} 44 | output_dict = {} 45 | if "dialogue_history" in sample: 46 | output_dict['text_pos'] = space_prefix + sample['right_response'] 47 | output_dict['text_neg'] = space_prefix + sample['hallucinated_response'] 48 | 49 | context_raw = space_prefix + sample["dialogue_history"] + '[Assistant]:' + space_suffix 50 | if include_knowledge: 51 | context = space_prefix + sample['knowledge'] + '.' + space_suffix + context_raw 52 | else: 53 | context = context_raw 54 | elif "question" in sample: 55 | output_dict['text_pos'] = space_prefix + sample['right_answer'] 56 | output_dict['text_neg'] = space_prefix + sample['hallucinated_answer'] 57 | 58 | context_raw = space_prefix + 'Question: ' + sample["question"] + '. Answer:' + space_suffix 59 | if include_knowledge: 60 | context = space_prefix + sample['knowledge'] + space_suffix + context_raw 61 | else: 62 | context = context_raw 63 | elif "document" in sample: 64 | output_dict['text_pos'] = space_prefix + sample['right_summary'] 65 | output_dict['text_neg'] = space_prefix + sample['hallucinated_summary'] 66 | 67 | context = space_prefix + 'Document: ' + sample["document"] + ' Summary:' + space_suffix 68 | if cut_end: 69 | context_tok = tokenizer.tokenize(context) 70 | pos_tok = tokenizer.tokenize(output_dict['text_pos']) 71 | neg_tok = tokenizer.tokenize(output_dict['text_neg']) 72 | num_cut = len(context_tok) + max(len(pos_tok), len(neg_tok)) - max_token_num 73 | if num_cut > 0: 74 | print('cut ', num_cut) 75 | doc_tok = tokenizer.tokenize( sample["document"] ) 76 | doc_trunc = tokenizer.convert_tokens_to_string( doc_tok[:-(num_cut+10)] ) + '...' 77 | context = space_prefix + 'Document: ' + doc_trunc + ' Summary:' + space_suffix 78 | 79 | output_dict['context'] = context 80 | output_list.append(output_dict) 81 | 82 | with open(output_path, 'w') as f_out: 83 | for output_dict in output_list: 84 | f_out.write(json.dumps(output_dict)+'\n') 85 | -------------------------------------------------------------------------------- /REAL_sampling/src/simple_exp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "95aae402", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from mosestokenizer import *\n", 11 | "detokenizer = MosesDetokenizer('en')\n", 12 | "\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "fe042317", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "text = \"christopher atkinson ( c . 1738 \\u2013 23 april 1819 ) , known as christopher atkinson savile or saville from about 1798 , was an english merchant and politician . born in yorkshire , he moved to ===dorset=== and married the niece of a wealthy merchant , entering that business himself . he was elected at the 1780 general election as one of the two members of parliament ( mps ) for dorset west . 
however he was expelled from the house of commons on the second sitting , after being convicted of receiving a bribe , and sentenced to detention in the pillory . he was granted a royal warrant in 1791 , and returned to parliament for the area in 1796 , retaining the seat until he stood down at the 1802 general election . he had changed his name to atkinson atkinson some time after 1793 . he then purchased extensive estates in westerleigh , in hampshire , which gave him control of both one of okehampton ' s two parliamentary seats . he returned himself as an okehampton mp at the 1818 general election , and held the seat until his death in 1819 unmarried , aged over 72 .\"" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "id": "5ca41243", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "christopher atkinson (c. 1738 – 23 april 1819), known as christopher atkinson savile or saville from about 1798, was an english merchant and politician. born in yorkshire, he moved to ===dorset=== and married the niece of a wealthy merchant, entering that business himself. he was elected at the 1780 general election as one of the two members of parliament (mps) for dorset west. however he was expelled from the house of commons on the second sitting, after being convicted of receiving a bribe, and sentenced to detention in the pillory. he was granted a royal warrant in 1791, and returned to parliament for the area in 1796, retaining the seat until he stood down at the 1802 general election. he had changed his name to atkinson atkinson some time after 1793. he then purchased extensive estates in westerleigh, in hampshire, which gave him control of both one of okehampton 's two parliamentary seats. 
he returned himself as an okehampton mp at the 1818 general election, and held the seat until his death in 1819 unmarried, aged over 72.\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "print(detokenizer(text.split()))\n"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 1,
46 | "id": "d71ff0fb",
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "['my', 'Ġson', 'Ġis', 'Ġje', 'rem', 'y']\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "from transformers import AutoTokenizer\n",
59 | "small_model_name = 'EleutherAI/pythia-70m-deduped'\n",
60 | "tokenizer = AutoTokenizer.from_pretrained(small_model_name)\n",
61 | "print(tokenizer.tokenize(\"my son is jeremy\"))\n"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "id": "53ef61f6",
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "[619, 3347, 310, 5139, 2013, 90]\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "print(tokenizer.encode(\" my son is jeremy\"))"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "d557aa3c",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": []
89 | }
90 | ],
91 | "metadata": {
92 | "kernelspec": {
93 | "display_name": "Python 3 (ipykernel)",
94 | "language": "python",
95 | "name": "python3"
96 | },
97 | "language_info": {
98 | "codemirror_mode": {
99 | "name": "ipython",
100 | "version": 3
101 | },
102 | "file_extension": ".py",
103 | "mimetype": "text/x-python",
104 | "name": "python",
105 | "nbconvert_exporter": "python",
106 | "pygments_lexer": "ipython3",
107 | "version": "3.7.3"
108 | }
109 | },
110 | "nbformat": 4,
111 | "nbformat_minor": 5
112 | }
113 | 
--------------------------------------------------------------------------------
/THIRD_PARTY_LICENSES:
--------------------------------------------------------------------------------
1 | We also include the following external repositories with minor modifications in our release:
2 | 
3 | 1. FactualityPrompt: Apache 2.0 License
4 | 2. HaDes: MIT License
--------------------------------------------------------------------------------