├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── instructions └── main │ └── instructions.json ├── requirements.txt ├── scripts ├── generate_agent_instructions.sh ├── gpt-3.5-turbo.sh ├── llama-2-13b-chat.sh ├── llama-2-70b-chat.sh ├── llama-2-7b-chat.sh ├── replicate.py ├── run.sh ├── run_reasoning.sh ├── run_specs │ ├── agentinstruct │ │ ├── gpt-3.5-turbo-agentinstruct.conf │ │ ├── llama-2-13b-chat-agentinstruct.conf │ │ ├── llama-2-70b-chat-agentinstruct.conf │ │ ├── llama-2-7b-chat-agentinstruct.conf │ │ └── vicuna-13b-agentinstruct.conf │ ├── simple-gpt-3.5-turbo.conf │ ├── simple-llama-2-7b-chat.conf │ ├── zeroshot │ │ ├── gpt-3.5-turbo-zeroshot.conf │ │ ├── llama-2-13b-chat-zeroshot.conf │ │ ├── llama-2-70b-chat-zeroshot.conf │ │ ├── llama-2-7b-chat-zeroshot.conf │ │ └── vicuna-13b-zeroshot.conf │ └── zeroshotcot │ │ ├── gpt-3.5-turbo-zeroshotcot.conf │ │ ├── llama-2-13b-chat-zeroshotcot.conf │ │ ├── llama-2-70b-chat-zeroshotcot.conf │ │ ├── llama-2-7b-chat-zeroshotcot.conf │ │ └── vicuna-13b-zeroshotcot.conf └── vicuna-13b.sh └── src └── agentinstruct ├── agent ├── agent_instr_generation.py ├── agent_pipeline.py └── utils │ └── dataset_preprocessing.py ├── eval ├── format_results.py └── letter_eval.py └── reasoning ├── helm_updates ├── benchmark_output │ └── scenarios │ │ ├── coin │ │ └── data │ │ │ ├── test │ │ │ └── train │ │ └── letter │ │ └── data │ │ ├── test │ │ └── train ├── src │ └── helm │ │ ├── benchmark │ │ ├── __init__.py │ │ ├── adaptation │ │ │ ├── adapter_spec.py │ │ │ └── adapters │ │ │ │ ├── in_context_learning_adapter.py │ │ │ │ └── multiple_choice_joint_adapter.py │ │ ├── executor.py │ │ ├── metrics │ │ │ └── basic_metrics.py │ │ ├── run.py │ │ ├── run_expander.py │ │ ├── run_specs.py │ │ ├── scenarios │ │ │ ├── addsub_scenario.py │ │ │ ├── aqua_scenario.py │ │ │ ├── big_bench_hard_scenario.py │ │ │ ├── coin_scenario.py │ │ │ ├── commonsense_qa_scenario.py │ │ │ ├── gsm_scenario.py │ │ │ ├── letter_scenario.py │ │ │ ├── multi_arith_scenario.py │ │ │ ├── newsqa_scenario.py │ │ │ ├── singleeq_scenario.py │ │ │ └── svamp_scenario.py │ │ └── window_services │ │ │ ├── llama_2_window_service.py │ │ │ ├── llama_window_service.py │ │ │ └── window_service_factory.py │ │ ├── common │ │ ├── general.py │ │ └── request.py │ │ └── proxy │ │ ├── clients │ │ ├── auto_client.py │ │ ├── client.py │ │ ├── huggingface_tokenizer.py │ │ ├── local_client.py │ │ ├── openai_automatic_prompt_tuning.py │ │ ├── openai_client.py │ │ └── together_client.py │ │ ├── models.py │ │ └── services │ │ ├── server_service.py │ │ └── service.py └── update_helm.sh └── serve ├── README.md ├── custom_handler ├── llama-2-13b-chat-handler.py ├── llama-2-70b-chat-handler.py ├── llama-2-7b-chat-handler.py └── vicuna-13b-handler.py └── model_store ├── config.properties ├── llama-2-13b-chat-config.yaml ├── llama-2-70b-chat-config.yaml ├── llama-2-7b-chat-config.yaml ├── requirements.txt └── vicuna-13b-config.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | prod_env/ 2 | restricted/ 3 | *venv/ 4 | _latest/ 5 | benchmark_output/ 6 | __pycache__ 7 | *.egg-info 8 | .mypy_cache 9 | pip-wheel-metadata/ 10 | .DS_Store 11 | .idea 12 | .vscode 13 | *.swp 14 | .nfs* 15 | .sif -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/agentinstruct/reasoning/helm"] 2 | path = src/agentinstruct/reasoning/helm 3 | url = 
https://github.com/stanford-crfm/helm.git
4 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | backoff==2.2.1
2 | simple_slurm==0.2.6
3 | langchain==0.0.325
4 | pydantic==1.10.*
5 | pysqlite3==0.5.2
6 | pysqlite3-binary==0.5.2
7 | chromadb==0.4.15
8 | datasets==2.14.*
--------------------------------------------------------------------------------
/scripts/generate_agent_instructions.sh:
--------------------------------------------------------------------------------
1 | 
2 | if [ -d "benchmark_output/runs/$2" ]; then
3 |     echo "Directory already exists: benchmark_output/runs/$2"
4 |     exit 1
5 | fi
6 | 
7 | helm-run --conf-paths $1 --suite $2 --max-eval-instances 5 --skip-expander --dry-run
8 | python src/agentinstruct/agent/agent_pipeline.py --benchmark_output_dir benchmark_output/runs/$2
9 | rm -rf benchmark_output/runs/$2
--------------------------------------------------------------------------------
/scripts/gpt-3.5-turbo.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/gpt-3.5-turbo-agentinstruct.conf gpt-3.5-turbo-agentinstruct 1000 2
3 | python src/agentinstruct/eval/format_results.py --suite gpt-3.5-turbo-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-13b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-13b-chat-agentinstruct.conf llama-2-13b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-13b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-70b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-70b-chat-agentinstruct.conf llama-2-70b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-70b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-7b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf llama-2-7b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-7b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/replicate.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | try:
4 |     os.remove(os.path.join(os.getcwd(), 'instructions/_latest'))
5 | except:
6 |     pass
7 | os.symlink(os.path.join(os.getcwd(), f'instructions/main'), os.path.join(os.getcwd(), 'instructions/_latest'))
--------------------------------------------------------------------------------
/scripts/run.sh:
--------------------------------------------------------------------------------
1 | ./scripts/generate_agent_instructions.sh $1 $2
2 | ./scripts/run_reasoning.sh $1 $2 $3 $4 $5
3 | python src/agentinstruct/eval/format_results.py --suite $2
4 | 
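The per-model scripts above share one pattern: refresh the instructions/_latest symlink via scripts/replicate.py, run helm-run over a run-spec conf with scripts/run_reasoning.sh, then format the results. The sketch below shows how the same pipeline could be driven end-to-end through scripts/run.sh; it is a hedged example, not a documented command: the argument roles are inferred from scripts/run.sh and scripts/run_reasoning.sh (shown below), the conf path and suite name are taken from the repo, the values 1000 and 8 are copied from scripts/llama-2-7b-chat.sh, and it assumes helm-run is already installed (e.g. from the helm submodule).

# Sketch only: $1 = run-spec conf, $2 = suite name, $3 = max eval instances,
# $4 = helm-run thread count (defaults to 8), $5 = optional flag forwarded to helm-run as --$5
bash scripts/run.sh \
    scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf \
    llama-2-7b-chat-agentinstruct \
    1000 8

Unlike run.sh, the per-model scripts (gpt-3.5-turbo.sh and llama-2-*-chat.sh) skip the instruction-generation step entirely: they appear to reuse the checked-in instructions under instructions/main by symlinking them to instructions/_latest before calling run_reasoning.sh.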
-------------------------------------------------------------------------------- /scripts/run_reasoning.sh: -------------------------------------------------------------------------------- 1 | if [ $# -ge 4 ]; then 2 | THREADS=$4 3 | else 4 | THREADS=8 5 | fi 6 | 7 | if [ "$5" ]; then 8 | PLACEHOLDER="--$5" 9 | fi 10 | 11 | helm-run --conf-paths $1 --suite $2 --max-eval-instances $3 -n $THREADS $PLACEHOLDER -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/llama-2-13b-chat-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # 
Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: 
"raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/llama-2-70b-chat-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: 
"civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 
72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: 
"aqua:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: 
"mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: 
"big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/vicuna-13b-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | 
{description: "gsm:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: 
"raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/simple-gpt-3.5-turbo.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | {description: "addsub:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,instructions=agentinstruct", priority: 1} 3 | ] -------------------------------------------------------------------------------- /scripts/run_specs/simple-llama-2-7b-chat.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 3 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/gpt-3.5-turbo-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=male", priority: 1} 14 | {description: 
"civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/llama-2-13b-chat-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=all", priority: 1} 13 | {description: 
"civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=local/llama-2-13b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/llama-2-70b-chat-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=male", priority: 1} 14 | {description: 
"civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # 
QuAC 74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/llama-2-7b-chat-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=female", priority: 1} 15 | {description: 
"civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: 
"raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/vicuna-13b-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: 
"civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0", priority: 1} 80 | {description: 
"raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/llama-2-13b-chat-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: 
"civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/llama-2-70b-chat-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: 
"addsub:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | 
{description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: 
"big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/llama-2-7b-chat-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: 
"big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | 
{description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/vicuna-13b-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: 
"civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/vicuna-13b.sh: -------------------------------------------------------------------------------- 1 | python scripts/replicate.py 2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/vicuna-13b-agentinstruct.conf vicuna-13b-agentinstruct 1000 8 3 | python src/agentinstruct/eval/format_results.py --suite vicuna-13b-agentinstruct 
-------------------------------------------------------------------------------- /src/agentinstruct/agent/agent_instr_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import requests 4 | 5 | from langchain.utilities import BingSearchAPIWrapper 6 | 7 | from langchain.document_loaders import WebBaseLoader 8 | 9 | from langchain.text_splitter import RecursiveCharacterTextSplitter 10 | from langchain.embeddings.openai import OpenAIEmbeddings 11 | from langchain.vectorstores import Chroma 12 | 13 | from langchain.chains import RetrievalQA 14 | 15 | from langchain.chat_models import ChatOpenAI 16 | from langchain.agents import Tool 17 | from langchain.agents import AgentType 18 | from langchain.agents import initialize_agent 19 | 20 | from helm.common.general import parse_hocon 21 | from langchain.load.dump import dumps 22 | 23 | import openai 24 | from tenacity import ( 25 | retry, 26 | stop_after_attempt, 27 | wait_random_exponential, 28 | ) 29 | 30 | os.environ["BING_SEARCH_URL"] = "https://api.bing.microsoft.com/v7.0/search" 31 | 32 | POWERFUL_MODEL = "gpt-4-0613" 33 | MINIMAL_TEMP = 0.3 34 | ZERO_TEMP = 0.0 35 | NUM_RESULTS = 5 36 | 37 | with open('prod_env/credentials.conf', 'r') as creds: 38 | credentials = parse_hocon(creds.read()) 39 | creds.close() 40 | 41 | openai_api_key = credentials.as_plain_ordered_dict().get('openaiApiKey') 42 | bing_subscription_key = credentials.as_plain_ordered_dict().get('bingSubscriptionKey') 43 | 44 | 45 | llm = ChatOpenAI(model=POWERFUL_MODEL, temperature=ZERO_TEMP, openai_api_key=openai_api_key) 46 | search = BingSearchAPIWrapper(bing_subscription_key=bing_subscription_key) 47 | 48 | def get_links(search_metadata): 49 | links = [] 50 | for result in search_metadata: 51 | links.append(result["link"]) 52 | return links 53 | 54 | def get_instructions(dataset_phrase, num_results=5): 55 | search_metadata = search.results(dataset_phrase, num_results) 56 | print(search_metadata) 57 | 58 | old_links = get_links(search_metadata) 59 | print(old_links) 60 | 61 | links = [] 62 | for link in old_links: 63 | try: 64 | requests.get(link, verify = True) 65 | links.append(link) 66 | except: 67 | continue 68 | print(links) 69 | 70 | website_loader = WebBaseLoader(links) 71 | data = website_loader.load() 72 | for doc in data: 73 | doc.page_content = doc.page_content 74 | doc.metadata = {"url": doc.metadata["source"], "source": doc.metadata["source"]} 75 | 76 | text_splitter = RecursiveCharacterTextSplitter() 77 | texts = text_splitter.split_documents(data) 78 | embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 79 | db = Chroma.from_documents(texts, embeddings) 80 | retriever = db.as_retriever() 81 | qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever) 82 | return qa, links 83 | 84 | def run_agent(dataset_phrase, instance_format, possible_outputs, onepass=False): 85 | possible_outputs_prompt = f"\nPossible outputs:\n{possible_outputs}" 86 | 87 | if onepass: 88 | out_dict = dict() 89 | out_dict["output"] = onepass_simpletips(dataset_phrase, instance_format, possible_outputs_prompt) 90 | return out_dict, None 91 | 92 | qa, links = get_instructions(dataset_phrase) 93 | 94 | tools = [ 95 | Tool( 96 | name = "Ask about dataset", 97 | func=lambda x: qa({"query": x}), 98 | description="useful for when you need to ask questions to get information about the dataset" 99 | ), 100 | ] 101 | chat = ChatOpenAI(model=POWERFUL_MODEL, temperature=MINIMAL_TEMP, 
openai_api_key=openai_api_key) 102 | agent_chain = initialize_agent(tools, chat, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, return_intermediate_steps=True) 103 | 104 | prompt = (f"{dataset_phrase}. Use your resources to ask a series of simple questions to create instructions for the dataset. These instructions will be prepended to the prompt template during inference to help a large language model answer the prompt correctly." + 105 | " Include detailed tips on what topics to know and steps on how to answer the questions." + 106 | " For each instance, the model will apply these instructions to create an explanation that guides it towards the correct answer." + 107 | "\nPrompt Template (use for reference but no need to include in the instructions):\n"+ instance_format + 108 | possible_outputs_prompt) 109 | 110 | print("Prompt: ", prompt) 111 | 112 | return agent_chain({"input": prompt}), links 113 | 114 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) 115 | def openai_generate(model, prompt, temperature=MINIMAL_TEMP): 116 | response = openai.ChatCompletion.create( 117 | model=model, 118 | temperature=temperature, 119 | messages=[ 120 | {"role": "user", "content": prompt}, 121 | ] 122 | ) 123 | return response['choices'][0]['message']['content'] 124 | 125 | def onepass_simpletips(dataset_phrase, instance_format, possible_outputs_prompt): 126 | 127 | prompt = (f"{dataset_phrase}. Create instructions for the dataset that will be prepended to the prompt template during inference to help a large language model answer the prompt correctly." + 128 | " Include detailed tips on what topics to know and steps on how to answer the questions." + 129 | " For each instance, the model will apply these instructions to create an explanation that guides it towards the correct answer." 
+ 130 | "\nPrompt Template (use for reference but no need to include in the instructions):\n"+ instance_format + 131 | possible_outputs_prompt) 132 | return openai_generate(POWERFUL_MODEL, prompt, temperature=MINIMAL_TEMP) 133 | 134 | def generate_and_save_instructions(working_directory_name, dataset_name, dataset_phrase, instance_format, possible_outputs, sources_dict, onepass=False): 135 | 136 | out_dict, links = run_agent(dataset_phrase, instance_format, possible_outputs, onepass=onepass) 137 | input_prompt = out_dict.get("input", None) 138 | intermediate_steps = dumps(out_dict.get("intermediate_steps", None)) 139 | instr = out_dict["output"][out_dict["output"].find("1."):] 140 | 141 | sources_dict[dataset_name] = { 142 | "all_links": links, 143 | "input_prompt": input_prompt, 144 | "intermediate_steps": intermediate_steps, 145 | } 146 | 147 | return instr, sources_dict 148 | -------------------------------------------------------------------------------- /src/agentinstruct/agent/agent_pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import openai 4 | import argparse 5 | 6 | from utils.dataset_preprocessing import dataset_preprocessing 7 | from agent_instr_generation import generate_and_save_instructions 8 | from helm.common.general import parse_hocon 9 | 10 | with open('prod_env/credentials.conf', 'r') as creds: 11 | credentials = parse_hocon(creds.read()) 12 | 13 | openai.api_key = credentials.as_plain_ordered_dict().get('openaiApiKey') 14 | 15 | __import__('pysqlite3') 16 | import sys 17 | sys.modules['sqlite3'] = sys.modules.pop('pysqlite3') 18 | 19 | def generate_and_place_all_instr(benchmark_output_dir): 20 | suite = benchmark_output_dir.split("/")[-1] 21 | inputs_dict = {} 22 | instr_dict = {} 23 | sources_dict = {} 24 | 25 | instr_dir_path = os.path.join("instructions", suite) 26 | os.makedirs(instr_dir_path, exist_ok="True") 27 | 28 | for dataset_dir in os.listdir(benchmark_output_dir): 29 | if os.path.isdir(os.path.join(benchmark_output_dir, dataset_dir)): 30 | scenario_state_path = os.path.join(benchmark_output_dir, dataset_dir, "scenario_state.json") 31 | if not os.path.exists(scenario_state_path): 32 | print(f"Scenario state does not exist for {dataset_dir}. 
Skipping.") 33 | continue 34 | dataset_name, dataset_phrase, instance_format, possible_outputs = dataset_preprocessing(scenario_state_path) 35 | inputs_dict[dataset_name] = { 36 | "dataset_phrase": dataset_phrase, 37 | "instance_format": instance_format, 38 | "possible_outputs": possible_outputs, 39 | } 40 | instr, sources_dict = generate_and_save_instructions(instr_dir_path, dataset_name, dataset_phrase, instance_format, possible_outputs, sources_dict, onepass=False) 41 | instr_dict[dataset_name] = { 42 | "instructions": instr, 43 | "task": possible_outputs 44 | } 45 | 46 | with open(os.path.join(instr_dir_path, "instructions.json"), "w") as f: 47 | json.dump(instr_dict, f, indent=4) 48 | with open(os.path.join(instr_dir_path, "inputs.json"), "w") as f: 49 | json.dump(inputs_dict, f, indent=4) 50 | with open(os.path.join(instr_dir_path, "metadata.json"), "w") as f: 51 | json.dump(sources_dict, f, indent=4) 52 | try: 53 | os.unlink(os.path.join(os.getcwd(), 'instructions/_latest')) 54 | except: 55 | pass 56 | os.symlink(os.path.join(os.getcwd(), f'instructions/{suite}'), os.path.join(os.getcwd(), 'instructions/_latest')) 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--benchmark_output_dir", type=str) 61 | args = parser.parse_args() 62 | generate_and_place_all_instr(args.benchmark_output_dir) 63 | -------------------------------------------------------------------------------- /src/agentinstruct/agent/utils/dataset_preprocessing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import tiktoken 4 | import backoff 5 | import openai 6 | 7 | def read_scenario_state(scenario_state_path): 8 | with open(scenario_state_path, "r") as f: 9 | scenario_state = json.load(f) 10 | dataset_name = scenario_state["adapter_spec"]["prompt_list"]["dataset_name"] 11 | possible_outputs = scenario_state["adapter_spec"]["method"] 12 | test_instances = [] 13 | labels = set() 14 | for state in scenario_state["request_states"]: 15 | test_instances.append(state["request"]["prompt"]) 16 | labels.add(state["instance"]["references"][0]["output"]["text"]) 17 | if len(labels) < len(test_instances) and possible_outputs == 'generation': 18 | possible_outputs = list(labels) 19 | return dataset_name, test_instances, possible_outputs 20 | 21 | def get_dataset_phrase(dataset_name): 22 | dataset_phrase = re.sub(r"(^(.*?):)", r"The dataset name is \1", dataset_name) 23 | if "The dataset name is" not in dataset_phrase: 24 | dataset_phrase = "The dataset name is " + dataset_phrase 25 | pattern = r"(,|:)(.*?)=(.*?)(,|$)" 26 | while re.search(pattern, dataset_phrase) is not None: 27 | dataset_phrase = re.sub(pattern, r" and the \2 is \3,", dataset_phrase) 28 | dataset_name = re.sub(r":$", "", dataset_name) 29 | dataset_phrase = re.sub(r"(,|:)$", "", dataset_phrase) 30 | return dataset_phrase 31 | 32 | def truncate_instances(instances, max_length=3600): 33 | 34 | encoding = tiktoken.get_encoding("cl100k_base") 35 | instance_num_tokens = [(instance, len(encoding.encode(instance))) for instance in instances] 36 | instance_num_tokens.sort(key=lambda x: x[1]) 37 | instances_str = instance_num_tokens[0][0] 38 | num_tokens = instance_num_tokens[0][1] 39 | for instance, num_tokens_instance in instance_num_tokens[1:]: 40 | if num_tokens + num_tokens_instance <= max_length: 41 | instances_str += "\n\n" + instance 42 | num_tokens += 1 + num_tokens_instance 43 | else: 44 | break 45 | return instances_str 46 | 47 | 
@backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_time=60) 48 | def get_instance_format(instances): 49 | 50 | output = openai.ChatCompletion.create( 51 | model="gpt-3.5-turbo", 52 | temperature=0, 53 | messages=[ 54 | {"role": "user", "content": f"Given the following instances from a dataset, please isolate the structure of each instance such that a general template is created. Do not include any specific information, just what each instance looks like before its specific information was filled in (the template should have empty brackets in the spots that are different for each instance). We will use this to write our own instances that must follow the same format. Remember to be as general as possible; there are likely some instances in the dataset that are quite different than the ones presented here.\nExample Instances:\n\n{instances}\n\nFormat:"}, 55 | ], 56 | max_tokens=256, 57 | ) 58 | return output["choices"][0]["message"]["content"] 59 | 60 | def get_full_instance_format(instances, verbose=False): 61 | if verbose: 62 | print("original instances: ", instances) 63 | instances = truncate_instances(instances[:5]) 64 | formatted_instances = get_instance_format(instances) 65 | return formatted_instances 66 | 67 | def dataset_preprocessing(scenario_state_path): 68 | dataset_name, test_instances, possible_outputs = read_scenario_state(scenario_state_path) 69 | dataset_phrase = get_dataset_phrase(dataset_name) 70 | instance_format = get_full_instance_format(test_instances, verbose=True) 71 | return dataset_name, dataset_phrase, instance_format, possible_outputs -------------------------------------------------------------------------------- /src/agentinstruct/eval/format_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import pandas as pd 5 | from letter_eval import letter_eval 6 | 7 | dataset_to_metric = { 8 | 'mmlu': 'exact_match', 9 | 'civil_comments': 'quasi_prefix_exact_match', 10 | 'raft': 'quasi_exact_match', 11 | 'big_bench': 'exact_match', 12 | 'summarization_cnndm': 'rouge_2', 13 | 'summarization_xsum': 'rouge_2', 14 | 'truthful_qa': 'exact_match', 15 | 'imdb': 'quasi_exact_match', 16 | 'narrative_qa': 'f1_score', 17 | 'boolq': 'quasi_prefix_exact_match', 18 | 'quac': 'f1_score', 19 | 'aqua': 'exact_match', 20 | 'news_qa': 'f1_score', 21 | 'natural_qa': 'f1_score', 22 | 'commonsense': 'exact_match', 23 | 'truthful_qa': 'exact_match', 24 | 'msmarco': 'RR@10', #switch for trec 25 | 'gsm': 'quasi_exact_match', 26 | 'multi_arith': 'quasi_exact_match', 27 | 'svamp' : 'quasi_exact_match', 28 | 'addsub': 'quasi_exact_match', 29 | 'singleeq': 'quasi_exact_match', 30 | 'letter': 'letter_eval', 31 | 'big_bench_hard': 'quasi_exact_match', 32 | 'coin': "quasi_exact_match", 33 | 'commonsense_qa': 'exact_match', 34 | } 35 | 36 | def main(args): 37 | results = {} 38 | for run in os.listdir(os.path.join('benchmark_output/runs', args.suite)): 39 | 40 | try: 41 | if 'letter' in run: 42 | score, num_instances = letter_eval(os.path.join('benchmark_output/runs', args.suite, run)) 43 | results[run] = {'score': score, 'num_instances': num_instances, 'metric': 'letter_eval'} 44 | continue 45 | 46 | with open(os.path.join('benchmark_output/runs', args.suite, run, 'stats.json'), 'r') as f: 47 | stats = json.load(f) 48 | f.close() 49 | 50 | with open(os.path.join('benchmark_output/runs', args.suite, run, 'scenario_state.json'), 'r') as f1: 51 | scenario_state = json.load(f1) 52 | f1.close() 
53 | 54 | dataset = run.split(':')[0].split(',')[0] if ',' in run.split(':')[0] else run.split(':')[0] 55 | metric = dataset_to_metric[dataset] 56 | 57 | if dataset == 'msmarco' and 'track=trec' in run: 58 | metric = 'NDCG@10' 59 | 60 | results[run] = {'score': None, 'num_instances': None, 'metric': metric} 61 | 62 | if 'civil_comments' in run: 63 | score = 0 64 | instances = 0 65 | for stat in stats: 66 | if stat['name']['name'] == metric and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 67 | score += stat['mean'] 68 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 69 | instances += stat['mean'] 70 | results[run]['score'] = score/2 71 | results[run]['num_instances'] = instances 72 | 73 | else: 74 | tmp = tmp1 = None 75 | for stat in stats: 76 | 77 | if stat['name']['name'] == metric and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 78 | results[run]['score'] = stat['mean'] 79 | 80 | if stat['name']['name'] == metric and stat['name']['split'] == 'valid' and 'perturbation' not in stat['name']: 81 | tmp = stat['mean'] 82 | 83 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 84 | results[run]['num_instances'] = stat['mean'] 85 | 86 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'valid' and 'perturbation' not in stat['name']: 87 | tmp1 = stat['mean'] 88 | 89 | if results[run]['score'] is None: 90 | if tmp is not None: 91 | results[run]['score'] = tmp 92 | results[run]['num_instances'] = tmp1 93 | else: 94 | print(f'Run {run} does not have a test or validation set.\n') 95 | 96 | except Exception as e: 97 | print(f'Skipping {run}.') 98 | 99 | keys = sorted(results) 100 | results = {key: results[key] for key in keys} 101 | df = pd.DataFrame.from_dict(results, columns = ['metric', 'num_instances', 'score'], orient='index') 102 | df.to_csv(f'benchmark_output/runs/{args.suite}/results.csv') 103 | 104 | if __name__ == '__main__': 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('--suite', type=str, required=True) 107 | main(parser.parse_args()) 108 | -------------------------------------------------------------------------------- /src/agentinstruct/eval/letter_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import string 5 | 6 | def letter_eval(path): 7 | 8 | def white_space_fix(text: str) -> str: 9 | return " ".join(text.split()) 10 | 11 | def remove_punc(text: str) -> str: 12 | exclude = set(string.punctuation) 13 | return "".join(ch for ch in text if ch not in exclude) 14 | 15 | def lower(text: str) -> str: 16 | return text.lower() 17 | 18 | with open(os.path.join(path, "scenario_state.json"), 'r') as f: 19 | states = json.load(f) 20 | 21 | count = 0 22 | 23 | if 'agentinstruct' in states["adapter_spec"]["prompt_list"]: 24 | mode = 'agentinstruct' if states["adapter_spec"]["prompt_list"]["agentinstruct"] else 'zeroshotcot' 25 | else: 26 | mode='zeroshot' 27 | 28 | for instance in states["request_states"]: 29 | gold = instance["instance"]["references"][0]["output"]["text"] 30 | if mode == 'zeroshotcot': 31 | pred = instance["result"]["full_text"].split('Therefore, the answer is')[-1].translate({ord(c): None for c in string.whitespace}) 32 | elif mode == 'agentinstruct': 33 | pred = instance["result"]["full_text"].split('Answer:')[-1].translate({ord(c): None for c in 
string.whitespace}) 34 | else: 35 | pred = instance["result"]["completions"][0]["text"].translate({ord(c): None for c in string.whitespace}) 36 | 37 | if pred and gold: 38 | if white_space_fix(remove_punc(lower(gold))) == white_space_fix(remove_punc(lower(pred)))[:2]: 39 | count += 1 40 | 41 | l = len(states["request_states"]) 42 | return count/l, l 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--path', type=str, required=True) 47 | args = parser.parse_args() 48 | print(letter_eval(args.path)) 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/benchmark_output/scenarios/coin/data/train: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "question": "A coin is heads up. Dan flips the coin. Earnest does not flip the coin. Agustin does not flip the coin. Kip does not flip the coin. Is the coin still heads up?", 5 | "answer": "no" 6 | }, 7 | { 8 | "id": 2, 9 | "question": "A coin is heads up. Milford flips the coin. Kathie does not flip the coin. Cathy flips the coin. Randy does not flip the coin. Is the coin still heads up?", 10 | "answer": "yes" 11 | }, 12 | { 13 | "id": 3, 14 | "question": "A coin is heads up. Donald flips the coin. Rosalind flips the coin. Madelyn flips the coin. Ida flips the coin. Is the coin still heads up?", 15 | "answer": "yes" 16 | }, 17 | { 18 | "id": 4, 19 | "question": "A coin is heads up. Kristen flips the coin. Clarice does not flip the coin. Thelma flips the coin. Maurice flips the coin. Is the coin still heads up?", 20 | "answer": "no" 21 | }, 22 | { 23 | "id": 5, 24 | "question": "A coin is heads up. Andy flips the coin. Clinton does not flip the coin. Hilda does not flip the coin. Katrina does not flip the coin. Is the coin still heads up?", 25 | "answer": "no" 26 | } 27 | ] -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/benchmark_output/scenarios/letter/data/train: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "question": "Take the last letters of each words in \"Phil Schmitt\" and concatenate them.", 5 | "answer": "lt" 6 | }, 7 | { 8 | "id": 2, 9 | "question": "Take the last letters of each words in \"Marta Faulkner\" and concatenate them.", 10 | "answer": "ar" 11 | }, 12 | { 13 | "id": 3, 14 | "question": "Take the last letters of each words in \"Eugenia Watson\" and concatenate them.", 15 | "answer": "an" 16 | }, 17 | { 18 | "id": 4, 19 | "question": "Take the last letters of each words in \"Danielle Barr\" and concatenate them.", 20 | "answer": "er" 21 | }, 22 | { 23 | "id": 5, 24 | "question": "Take the last letters of each words in \"Antwan Bates\" and concatenate them.", 25 | "answer": "ns" 26 | } 27 | ] -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Add any classes that need to be loaded dynamically via `create_object`. 
2 | 3 | # Scenarios 4 | from .scenarios import simple_scenarios # noqa 5 | from .scenarios import mmlu_scenario # noqa 6 | from .scenarios import interactive_qa_mmlu_scenario # noqa 7 | from .scenarios import msmarco_scenario # noqa 8 | from .scenarios import commonsense_scenario # noqa 9 | from .scenarios import twitter_aae_scenario # noqa 10 | from .scenarios import real_toxicity_prompts_scenario # noqa 11 | from .scenarios import math_scenario # noqa 12 | from .scenarios import the_pile_scenario # noqa 13 | from .scenarios import truthful_qa_scenario # noqa 14 | from .scenarios import wikifact_scenario # noqa 15 | from .scenarios import synthetic_reasoning_natural_scenario # noqa 16 | from .scenarios import copyright_scenario # noqa 17 | from .scenarios import disinformation_scenario # noqa 18 | from .scenarios import boolq_scenario # noqa 19 | from .scenarios import code_scenario # noqa 20 | from .scenarios import lsat_qa_scenario # noqa 21 | from .scenarios import gsm_scenario # noqa 22 | from .scenarios import natural_qa_scenario # noqa 23 | from .scenarios import quac_scenario # noqa 24 | from .scenarios import babi_qa_scenario # noqa 25 | from .scenarios import narrativeqa_scenario # noqa 26 | from .scenarios import raft_scenario # noqa 27 | from .scenarios import numeracy_scenario # noqa 28 | from .scenarios import ice_scenario # noqa 29 | from .scenarios import summarization_scenario # noqa 30 | from .scenarios import synthetic_efficiency_scenario # noqa 31 | from .scenarios import synthetic_reasoning_scenario # noqa 32 | from .scenarios import newsqa_scenario # noqa 33 | from .scenarios import wikitext_103_scenario # noqa 34 | from .scenarios import blimp_scenario # noqa 35 | from .scenarios import imdb_scenario # noqa 36 | from .scenarios import dialogue_scenarios # noqa 37 | from .scenarios import bbq_scenario # noqa 38 | from .scenarios import bold_scenario # noqa 39 | from .scenarios import civil_comments_scenario # noqa 40 | from .scenarios import dyck_language_scenario # noqa 41 | from .scenarios import legal_support_scenario # noqa 42 | from .scenarios import lex_glue_scenario # noqa 43 | from .scenarios import lextreme_scenario # noqa 44 | from .scenarios import entity_matching_scenario # noqa 45 | from .scenarios import entity_data_imputation_scenario # noqa 46 | from .scenarios import big_bench_scenario # noqa 47 | from .scenarios import opinions_qa_scenario # noqa 48 | from .scenarios import multi_arith_scenario 49 | from .scenarios import aqua_scenario 50 | from .scenarios import svamp_scenario 51 | from .scenarios import addsub_scenario 52 | from .scenarios import singleeq_scenario 53 | from .scenarios import coin_scenario 54 | from .scenarios import letter_scenario 55 | from .scenarios import big_bench_hard_scenario 56 | from .scenarios import commonsense_qa_scenario 57 | 58 | # Biomedical 59 | from .scenarios import covid_dialog_scenario # noqa 60 | from .scenarios import me_q_sum_scenario # noqa 61 | from .scenarios import med_dialog_scenario # noqa 62 | from .scenarios import med_mcqa_scenario # noqa 63 | from .scenarios import med_paragraph_simplification_scenario # noqa 64 | from .scenarios import med_qa_scenario # noqa 65 | from .scenarios import pubmed_qa_scenario # noqa 66 | from .scenarios import wmt_14_scenario # noqa 67 | 68 | # 69 | # Metrics 70 | from .metrics import basic_metrics # noqa 71 | from .metrics import bbq_metrics # noqa 72 | from .metrics import bias_metrics # noqa 73 | from .metrics import classification_metrics # noqa 74 | from .metrics 
import code_metrics # noqa 75 | from .metrics import copyright_metrics # noqa 76 | from .metrics import disinformation_metrics # noqa 77 | from .metrics import numeracy_metrics # noqa 78 | from .metrics import ranking_metrics # noqa 79 | from .metrics import summarization_metrics # noqa 80 | from .metrics import toxicity_metrics # noqa 81 | from .metrics import machine_translation_metrics # noqa 82 | 83 | # Perturbations for data augmentation 84 | from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa 85 | from .augmentations.misspelling_perturbation import MisspellingPerturbation # noqa 86 | from .augmentations.contraction_expansion_perturbation import ContractionPerturbation # noqa 87 | from .augmentations.contraction_expansion_perturbation import ExpansionPerturbation # noqa 88 | from .augmentations.typos_perturbation import TyposPerturbation # noqa 89 | from .augmentations.filler_words_perturbation import FillerWordsPerturbation # noqa 90 | from .augmentations.synonym_perturbation import SynonymPerturbation # noqa 91 | from .augmentations.contrast_sets_perturbation import ContrastSetsPerturbation # noqa 92 | from .augmentations.lowercase_perturbation import LowerCasePerturbation # noqa 93 | from .augmentations.space_perturbation import SpacePerturbation # noqa 94 | from .augmentations.mild_mix_perturbation import MildMixPerturbation # noqa 95 | from .augmentations.dialect_perturbation import DialectPerturbation # noqa 96 | from .augmentations.person_name_perturbation import PersonNamePerturbation # noqa 97 | from .augmentations.gender_perturbation import GenderPerturbation # noqa 98 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapter_spec.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Dict, Any 3 | 4 | 5 | @dataclass(frozen=True) 6 | class Substitution: 7 | """Represents a regular expression search/replace.""" 8 | 9 | source: str 10 | target: str 11 | 12 | 13 | @dataclass(frozen=True) 14 | class AdapterSpec: 15 | """ 16 | Specifies how to take a `Scenario` (a list of `Instance`s) and produce a 17 | `ScenarioState` (set of `Request`s ). Instead of having free-form prompt 18 | hacking, we try to make the process more declarative and systematic. 19 | Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`). 20 | """ 21 | 22 | # Method of adaptation 23 | method: str = "" 24 | 25 | # Prepend all prompts with this string. 26 | # For example, it is recommended to prefix all prompts with [NLG] for UL2. 27 | global_prefix: str = "" 28 | 29 | # Prompt starts with instructions 30 | instructions: str = "" 31 | 32 | # What goes before the input 33 | input_prefix: str = "Input: " 34 | 35 | # What goes after the input 36 | input_suffix: str = "\n" 37 | 38 | # What goes before the input (for multiple choice) 39 | reference_prefix: str = "A. 
" 40 | 41 | # What goes before the input (for multiple choice) 42 | reference_suffix: str = "\n" 43 | 44 | # What goes before the output 45 | output_prefix: str = "Output: " 46 | 47 | # What goes after the output 48 | output_suffix: str = "\n" 49 | 50 | # What goes between instruction and in-context example blocks in the constructed prompt 51 | instance_prefix: str = "\n" 52 | 53 | # List of regular expression substitutions that we perform 54 | substitutions: List[Substitution] = field(default_factory=list, hash=False) 55 | 56 | # Maximum number of (in-context) training instances to put into the prompt 57 | max_train_instances: int = 5 58 | 59 | # Maximum number of evaluation instances. For getting valid numbers, this 60 | # should be the entire dataset; only reduce this for piloting. 61 | max_eval_instances: Optional[int] = None 62 | 63 | # Generate this many outputs (which could be realized by `num_completions` 64 | # or `top_k_per_token`). 65 | num_outputs: int = 5 66 | 67 | # Number of trials, where in each trial we choose an independent, random 68 | # set of training instances. Used to compute error bars. 69 | num_train_trials: int = 1 70 | 71 | # If true, randomly sample N training examples; if false, select N consecutive training examples 72 | sample_train: bool = True 73 | 74 | # Decoding parameters (inherited by `Request`) 75 | 76 | # Model to make the request to (need to fill in) 77 | model: str = "" 78 | 79 | # Temperature to use 80 | temperature: float = 1 81 | 82 | # Maximum number of tokens to generate 83 | max_tokens: int = 100 84 | 85 | # When to stop (set hash=False to make `AdapterSpec` hashable) 86 | stop_sequences: List[str] = field(default_factory=list, hash=False) 87 | 88 | # Random string (used concretely to bypass cache / see diverse results) 89 | random: Optional[str] = None 90 | 91 | # Prompt List (for multiple calls to chatgpt) 92 | prompt_list: Dict[str, Any] = None 93 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional 2 | from helm.benchmark.adaptation.adapter_spec import AdapterSpec 3 | 4 | from helm.benchmark.adaptation.request_state import RequestState 5 | from helm.benchmark.scenarios.scenario import Instance 6 | from helm.benchmark.window_services.tokenizer_service import TokenizerService 7 | from helm.common.request import Request 8 | from .in_context_learning_adapter import InContextLearningAdapter 9 | from dataclasses import replace 10 | 11 | 12 | class MultipleChoiceJointAdapter(InContextLearningAdapter): 13 | """ 14 | Each `Instance` in a `Scenario` looks like this: 15 | 16 | -> 17 | 18 | [correct] 19 | 20 | 21 | We can define a label (e.g., letter) for each reference: 22 | 23 | 24 | 25 | # train 26 | A. 27 | B. 28 | C. 29 | D. 30 | Answer: C 31 | 32 | # test 33 | A. 34 | B. 35 | C. 36 | D. 37 | Answer: 38 | 39 | In general, each example is: 40 | 41 | 42 | """ 43 | 44 | def __init__(self, adapter_spec: AdapterSpec, tokenizer_service: TokenizerService): 45 | super().__init__(adapter_spec, tokenizer_service) 46 | 47 | @staticmethod 48 | def get_reference_prefix(prefix: str, i: int) -> str: 49 | """ 50 | Example: prefix = "\nA. ", i = 2, return "\nC. 
" 51 | """ 52 | return prefix.replace("A", chr(ord("A") + i)) 53 | 54 | def generate_requests(self, eval_instance: Instance) -> List[RequestState]: 55 | prompt = self.construct_prompt(self.train_instances, eval_instance, include_output=False, reference_index=None) 56 | output_mapping: Dict[str, str] = dict( 57 | (self.get_reference_prefix("A", reference_index), reference.output.text) 58 | for reference_index, reference in enumerate(eval_instance.references) 59 | ) 60 | request = Request( 61 | model=self.adapter_spec.model, 62 | prompt=prompt.text, 63 | num_completions=1, 64 | top_k_per_token=self.adapter_spec.num_outputs, 65 | temperature=self.adapter_spec.temperature, # usually this is 0 66 | max_tokens=self.adapter_spec.max_tokens, # usually this is 1 67 | stop_sequences=[], 68 | random=self.adapter_spec.random, 69 | ) 70 | request_state = RequestState( 71 | instance=eval_instance, 72 | reference_index=None, 73 | request_mode=None, 74 | train_trial_index=self.train_trial_index, 75 | output_mapping=output_mapping, 76 | request=request, 77 | result=None, 78 | num_train_instances=prompt.num_train_instances, 79 | prompt_truncated=prompt.truncated, 80 | ) 81 | return [request_state] 82 | 83 | def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str: 84 | """Return a list of lines corresponding to this example (part of the prompt).""" 85 | # Input 86 | result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix 87 | 88 | # Include the references 89 | output = "n/a" 90 | for reference_index, reference in enumerate(instance.references): 91 | prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index) 92 | result += prefix + reference.output.text + self.adapter_spec.reference_suffix 93 | if reference.is_correct and output == "n/a": 94 | output = self.get_reference_prefix("A", reference_index) 95 | 96 | if include_output: 97 | result += self.adapter_spec.output_prefix + output + self.adapter_spec.output_suffix 98 | else: 99 | result += self.adapter_spec.output_prefix.rstrip() 100 | 101 | return result 102 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/executor.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Dict, Any 2 | from dataclasses import dataclass, replace 3 | 4 | from helm.common.general import parallel_map 5 | from helm.common.hierarchical_logger import htrack, hlog 6 | from helm.common.request import RequestResult, Sequence 7 | from helm.common.authentication import Authentication 8 | from helm.proxy.services.remote_service import RemoteService 9 | from helm.proxy.services.server_service import ServerService 10 | from helm.proxy.services.service import Service 11 | from helm.benchmark.adaptation.scenario_state import ScenarioState 12 | from helm.benchmark.adaptation.request_state import RequestState 13 | 14 | 15 | class ExecutorError(Exception): 16 | pass 17 | 18 | 19 | @dataclass(frozen=True) 20 | class ExecutionSpec: 21 | # If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959). 22 | url: Optional[str] 23 | 24 | # Pass into the service 25 | auth: Authentication 26 | 27 | # Path where API credentials and cache is stored. 28 | # This path is the same as `--base-path` when launching the proxy server (see server.py). 29 | # Required when url is not set. 
30 | local_path: Optional[str] 31 | 32 | # How many threads to have at once 33 | parallelism: int 34 | 35 | # Whether to skip execution 36 | dry_run: bool = False 37 | 38 | # URL to the MongoDB database. 39 | # If non-empty, the MongoDB database will be used for caching instead of SQLite. 40 | # Example format: mongodb://[username:password@]host1[:port1]/[dbname] 41 | # For full format, see: https://www.mongodb.com/docs/manual/reference/connection-string/ 42 | mongo_uri: str = "" 43 | 44 | 45 | class Executor: 46 | """ 47 | An `Executor` takes a `ScenarioState` which has a bunch of requests. 48 | Issue them to the API and return the results. 49 | """ 50 | 51 | def __init__(self, execution_spec: ExecutionSpec): 52 | self.execution_spec = execution_spec 53 | 54 | self.service: Service 55 | if execution_spec.url: 56 | hlog(f"Running using remote API proxy server: {execution_spec.url}") 57 | self.service = RemoteService(execution_spec.url) 58 | elif execution_spec.local_path: 59 | hlog(f"Running in local mode with base path: {execution_spec.local_path}") 60 | self.service = ServerService( 61 | base_path=execution_spec.local_path, root_mode=True, mongo_uri=execution_spec.mongo_uri 62 | ) 63 | else: 64 | raise ValueError("Either the proxy server URL or the local path must be set") 65 | 66 | @htrack(None) 67 | def execute(self, scenario_state: ScenarioState) -> ScenarioState: 68 | if self.execution_spec.dry_run: 69 | hlog("Skipped execution.") 70 | return scenario_state 71 | 72 | # Fill in process with prompt list (accessible from ScenarioState) so it only has one variable 73 | process = lambda x: self.process(x, scenario_state.adapter_spec.prompt_list) 74 | 75 | # Do it! 76 | request_states = parallel_map( 77 | process, #self.process, 78 | scenario_state.request_states, 79 | parallelism=self.execution_spec.parallelism, 80 | ) 81 | 82 | hlog(f"Processed {len(request_states)} requests") 83 | return ScenarioState(scenario_state.adapter_spec, request_states) 84 | 85 | def process(self, state: RequestState, prompt_list: Dict[str, Any]) -> RequestState: 86 | try: 87 | result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request, prompt_list) 88 | except Exception as e: 89 | raise ExecutorError(f"{str(e)} Request: {state.request}") from e 90 | if not result.success: 91 | if result.error_flags and not result.error_flags.is_fatal: 92 | hlog(f"WARNING: Non-fatal error treated as empty completion: {result.error}") 93 | result.completions = [Sequence(text="", logprob=0, tokens=[])] 94 | else: 95 | raise ExecutorError(f"{str(result.error)} Request: {state.request}") 96 | return replace(state, result=result) 97 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/addsub_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class AddSubScenario(Scenario): 21 | 22 | name = "addsub" 23 | description = "AddSub Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def 
delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "iIndex": 0, 48 | "sQuestion": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?", 49 | "lSolutions": [39], 50 | }, 51 | { 52 | "iIndex": 1, 53 | "sQuestion": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?", 54 | "lSolutions": [6], 55 | }, 56 | { 57 | "iIndex": 2, 58 | "sQuestion": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 59 | "lSolutions": [5], 60 | }, 61 | { 62 | "iIndex": 3, 63 | "sQuestion": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?", 64 | "lSolutions": [9], 65 | }, 66 | { 67 | "iIndex": 4, 68 | "sQuestion": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 69 | "lSolutions": [33], 70 | }, 71 | ] 72 | 73 | with open(os.path.join(data_path, "train"), "w") as f: 74 | f.write(json.dumps(train, indent=4)) 75 | 76 | data_path: str = os.path.join(self.output_path, "data") 77 | ensure_directory_exists(data_path) 78 | 79 | url: str = "https://raw.githubusercontent.com/chuanyang-Zheng/Progressive-Hint/main/dataset/AddSub/AddSub.json" 80 | test_path: str = os.path.join(data_path, "test") 81 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 82 | 83 | make_train_set(data_path) 84 | 85 | instances: List[Instance] = [] 86 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 87 | 88 | for split, filename in split_to_filename.items(): 89 | target_path: str = os.path.join(data_path, filename) 90 | 91 | with open(target_path, "r") as f: 92 | data = json.load(f) 93 | for entry in data: 94 | question = entry["sQuestion"].strip() 95 | answer = str(entry["lSolutions"][0]) 96 | if answer[-2:] == ".0": 97 | answer = answer[:-2] 98 | instance: Instance = Instance( 99 | input=Input(text=question), 100 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 101 | split=split, 102 | ) 103 | instances.append(instance) 104 | 105 | return instances 106 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/aqua_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 6 | from .scenario import ( 7 | Scenario, 8 | Instance, 9 | Reference, 10 | TRAIN_SPLIT, 11 | VALID_SPLIT, 12 | TEST_SPLIT, 13 | CORRECT_TAG, 14 | Input, 15 | Output, 16 | ) 17 | 18 | 19 | class AQuAScenario(Scenario): 20 | 21 | name = "aqua" 22 | description = "AQuA Dataset" 23 | tags = ["question_answering"] 24 | 25 | def __init__(self): 26 | super().__init__() 27 | 28 | def get_instances(self) -> List[Instance]: 29 | data_path: str = os.path.join(self.output_path, "data") 30 | 
ensure_directory_exists(data_path) 31 | 32 | instances: List[Instance] = [] 33 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", VALID_SPLIT: "dev", TEST_SPLIT: "test"} 34 | 35 | for split, filename in split_to_filename.items(): 36 | url: str = f"https://raw.githubusercontent.com/deepmind/AQuA/master/{filename}.json" 37 | target_path: str = os.path.join(data_path, filename) 38 | ensure_file_downloaded(source_url=url, target_path=target_path, unpack=False) 39 | 40 | with open(target_path, "r") as f: 41 | data_lst = list(f) 42 | 43 | for data in data_lst: 44 | entry = json.loads(data) 45 | question = entry["question"] 46 | options = entry["options"] 47 | answer = ord(entry["correct"]) - ord("A") 48 | 49 | references: List[Reference] = [] 50 | for index, option in enumerate(options): 51 | tags = [CORRECT_TAG] if index == answer else [] 52 | references.append(Reference(Output(text=option[2:]), tags=tags)) 53 | 54 | instance: Instance = Instance( 55 | input=Input(text=question), 56 | references=references, 57 | split=split, 58 | ) 59 | instances.append(instance) 60 | 61 | return instances 62 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/big_bench_hard_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TEST_SPLIT, 13 | CORRECT_TAG, 14 | PassageQuestionInput, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class BigBenchHardScenario(Scenario): 21 | 22 | name = "big_bench_hard" 23 | description = "Big-Bench-Hard Benchmark" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self, dataset: str): 27 | super().__init__() 28 | self.dataset: str = dataset 29 | 30 | def get_instances(self) -> List[Instance]: 31 | data_path: str = os.path.join(self.output_path, self.dataset) 32 | ensure_directory_exists(data_path) 33 | 34 | instances: List[Instance] = [] 35 | split_to_filename: Dict[str, str] = {TEST_SPLIT: "test"} 36 | 37 | for split, filename in split_to_filename.items(): 38 | url: str = f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{self.dataset}.json" 39 | target_path: str = os.path.join(data_path, f"{self.dataset}_{filename}") 40 | ensure_file_downloaded(source_url=url, target_path=target_path, unpack=False) 41 | 42 | with open(target_path, "r") as f: 43 | data = json.load(f) 44 | 45 | for instance in data['examples']: 46 | question = instance['input'] 47 | answer = instance['target'] 48 | 49 | references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])] 50 | instance: Instance = Instance( 51 | input=Input(text=question), 52 | references=references, 53 | split=split, 54 | ) 55 | instances.append(instance) 56 | 57 | return instances 58 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/coin_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from .scenario import ( 6 | Scenario, 7 | Instance, 8 | Reference, 9 | TRAIN_SPLIT, 10 | TEST_SPLIT, 11 | CORRECT_TAG, 12 | Input, 13 | Output, 14 | ) 15 | 
16 | 17 | class CoinScenario(Scenario): 18 | 19 | name = "coin" 20 | description = "Coin Flip Dataset" 21 | tags = ["symbolic_reasoning"] 22 | 23 | def __init__(self): 24 | super().__init__() 25 | 26 | def get_instances(self) -> List[Instance]: 27 | data_path: str = os.path.join(self.output_path, "data") 28 | 29 | instances: List[Instance] = [] 30 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 31 | 32 | for split, filename in split_to_filename.items(): 33 | target_path: str = os.path.join(data_path, filename) 34 | 35 | with open(target_path, "r") as f: 36 | data = json.load(f) 37 | for entry in data: 38 | question = entry["question"] + ' Note that "flip" here means "reverse".' 39 | answer = entry["answer"] 40 | 41 | references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])] 42 | 43 | instance: Instance = Instance( 44 | input=Input(text=question), 45 | references=references, 46 | split=split, 47 | ) 48 | instances.append(instance) 49 | 50 | return instances 51 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/commonsense_qa_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.benchmark.scenarios.scenario import Instance 6 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 7 | 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | VALID_SPLIT, 14 | TEST_SPLIT, 15 | CORRECT_TAG, 16 | Input, 17 | Output, 18 | ) 19 | 20 | class CommonsenseQAScenario(Scenario): 21 | 22 | DATASET_DOWNLOAD_URL: str = ( 23 | "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl" 24 | ) 25 | 26 | name = "commonsense_qa" 27 | description = "CommonsenseQA Dataset" 28 | tags = ["question_answering"] 29 | 30 | def __init__(self): 31 | super().__init__() 32 | 33 | def get_instances(self) -> List[Instance]: 34 | 35 | data_path: str = os.path.join(self.output_path, "data") 36 | ensure_directory_exists(data_path) 37 | instances: List[Instance] = [] 38 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", VALID_SPLIT: "dev"} 39 | 40 | for split, filename in split_to_filename.items(): 41 | url: str = f"https://s3.amazonaws.com/commensenseqa/{filename}_rand_split.jsonl" 42 | target_path: str = os.path.join(data_path, filename) 43 | ensure_file_downloaded(source_url=url, target_path=target_path, unpack=False) 44 | 45 | with open(target_path, "r") as f: 46 | data_lst = list(f) 47 | 48 | for data in data_lst: 49 | entry = json.loads(data) 50 | 51 | question = entry["question"]["stem"] 52 | choices = entry["question"]["choices"] 53 | answer = ord(entry["answerKey"]) - ord("A") 54 | 55 | references: List[Reference] = [] 56 | for index, choice in enumerate(choices): 57 | tags = [CORRECT_TAG] if index == answer else [] 58 | references.append(Reference(Output(text=choice["text"]), tags=tags)) 59 | 60 | instance: Instance = Instance( 61 | input=Input(text=question), 62 | references=references, 63 | split=split, 64 | ) 65 | instances.append(instance) 66 | 67 | return instances 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/gsm_scenario.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 
| from typing import List 4 | 5 | from helm.common.general import ensure_file_downloaded 6 | from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output 7 | 8 | 9 | class GSM8KScenario(Scenario): 10 | """Task from "Training Verifiers to Solve Math Word Problems" (Cobbe et al. 2021): https://arxiv.org/abs/2110.14168 11 | 12 | Evaluates the capacity of a model to solve grade school math problems, when prompted to include reasoning. 13 | Encourages the model to work through the problem in a step-by-step way. 14 | 15 | Example from dataset (line breaks added for readability): 16 | 17 | ``` 18 | "question": 19 | "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. 20 | How many clips did Natalia sell altogether in April and May?", 21 | "answer": 22 | "Natalia sold 48/2 = <<48/2=24>>24 clips in May.\n 23 | Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n 24 | #### 72" 25 | ``` 26 | 27 | Also, incorporates prompting methods from "Chain of Thought Prompting Elicits Reasoning in Large Language Models" 28 | (Wei et al. 2021): https://arxiv.org/abs/2201.11903 29 | 30 | For example, we use "The answer is" before the answer, and remove line breaks within the answer. 31 | """ 32 | 33 | name = "gsm" 34 | description = "Grade school math dataset with 8.5K examples (GSM8K)." 35 | tags = ["reasoning", "math"] 36 | 37 | def get_instances(self) -> List[Instance]: 38 | splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT} 39 | base_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/" 40 | instances: List[Instance] = [] 41 | 42 | for split, split_tag in splits.items(): # Iterate over the splits 43 | source_url: str = f"{base_url}/{split}.jsonl" 44 | data_path: str = os.path.join(self.output_path, f"gsm_data_{split}") 45 | ensure_file_downloaded(source_url=source_url, target_path=data_path) 46 | 47 | with jsonlines.open(data_path) as reader: 48 | for example in reader: # Each example is a dictionary with a 'question' and 'answer' key 49 | answer: str = example["answer"].split("#### ")[1] 50 | instances.append( 51 | Instance( 52 | input=Input(text=example["question"]), 53 | references=[Reference(Output(text=answer), tags=[CORRECT_TAG])], 54 | split=split_tag, # Must assign split tag to instance. 
55 | ), 56 | ) 57 | return instances 58 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/letter_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from .scenario import ( 6 | Scenario, 7 | Instance, 8 | Reference, 9 | TRAIN_SPLIT, 10 | TEST_SPLIT, 11 | CORRECT_TAG, 12 | Input, 13 | Output, 14 | ) 15 | 16 | 17 | class LetterScenario(Scenario): 18 | 19 | name = "letter" 20 | description = "Last Letter Concatenation Dataset" 21 | tags = ["symbolic_reasoning"] 22 | 23 | def __init__(self): 24 | super().__init__() 25 | 26 | def get_instances(self) -> List[Instance]: 27 | data_path: str = os.path.join(self.output_path, "data") 28 | 29 | instances: List[Instance] = [] 30 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 31 | 32 | for split, filename in split_to_filename.items(): 33 | target_path: str = os.path.join(data_path, filename) 34 | 35 | with open(target_path, "r") as f: 36 | data = json.load(f) 37 | for entry in data: 38 | question = entry["question"] 39 | answer = entry["answer"] 40 | 41 | references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])] 42 | 43 | instance: Instance = Instance( 44 | input=Input(text=question), 45 | references=references, 46 | split=split, 47 | ) 48 | instances.append(instance) 49 | 50 | return instances 51 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/multi_arith_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class MultiArithScenario(Scenario): 21 | 22 | name = "multi_arith" 23 | description = "MultiArith Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "iIndex": 0, 48 | "sQuestion": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?", 49 | "lSolutions": [39], 50 | }, 51 | { 52 | "iIndex": 1, 53 | "sQuestion": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?", 54 | "lSolutions": [6], 55 | }, 56 | { 57 | "iIndex": 2, 58 | "sQuestion": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 59 | "lSolutions": [5], 60 | }, 61 | { 62 | "iIndex": 3, 63 | "sQuestion": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. 
How many toys does he have now?", 64 | "lSolutions": [9], 65 | }, 66 | { 67 | "iIndex": 4, 68 | "sQuestion": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 69 | "lSolutions": [33], 70 | }, 71 | ] 72 | 73 | with open(os.path.join(data_path, "train"), "w") as f: 74 | f.write(json.dumps(train, indent=4)) 75 | 76 | data_path: str = os.path.join(self.output_path, "data") 77 | ensure_directory_exists(data_path) 78 | 79 | url: str = ( 80 | "https://raw.githubusercontent.com/wangxr14/Algebraic-Word-Problem-Solver/master/data/MultiArith.json" 81 | ) 82 | test_path: str = os.path.join(data_path, "test") 83 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 84 | 85 | make_train_set(data_path) 86 | 87 | instances: List[Instance] = [] 88 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 89 | 90 | for split, filename in split_to_filename.items(): 91 | target_path: str = os.path.join(data_path, filename) 92 | 93 | with open(target_path, "r") as f: 94 | data = json.load(f) 95 | for entry in data: 96 | question = entry["sQuestion"].strip() 97 | answer = str(entry["lSolutions"][0]) 98 | if answer[-2:] == ".0": 99 | answer = answer[:-2] 100 | instance: Instance = Instance( 101 | input=Input(text=question), 102 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 103 | split=split, 104 | ) 105 | instances.append(instance) 106 | 107 | return instances 108 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/newsqa_scenario.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | from typing import Dict, List, Tuple 5 | 6 | from .scenario import ( 7 | Scenario, 8 | Instance, 9 | Reference, 10 | TRAIN_SPLIT, 11 | VALID_SPLIT, 12 | CORRECT_TAG, 13 | PassageQuestionInput, 14 | Input, 15 | Output, 16 | ) 17 | 18 | 19 | class NewsQAScenario(Scenario): 20 | """ 21 | The NewsQA dataset is from the paper: 22 | https://arxiv.org/abs/1611.09830 23 | 24 | Original repository can be found at: 25 | https://github.com/Maluuba/newsqa 26 | 27 | Note: The training dataset cannot be directly shared due to copyright issues, and needs to be downloaded by 28 | following the instructions in the repo above. These instructions are duplicated here for 29 | convenience. 30 | 31 | 1. Clone the repo (https://github.com/Maluuba/newsqa) 32 | 2. Download the data from (https://msropendata.com/datasets/939b1042-6402-4697-9c15-7a28de7e1321). 33 | You need to create a login account to download this data. 34 | 3. Download the CNN stories tar file from "https://cs.nyu.edu/~kcho/DMQA/" 35 | 4. Create the conda environment using the command (conda create --name newsqa python=2.7 "pandas>=0.19.2") 36 | 5. Install the requirements (conda activate newsqa && pip install --requirement requirements.txt) 37 | 38 | This should result in the creation of the file (combined-newsqa-data-v1.json) in the repo 39 | which is used in this scenario. 40 | 41 | NewsQA is a QA dataset containing 12,744 stories, 42 | and over 119,633 question-answer pairs. There are 92549 training qa pairs, 43 | 5166 qas in the dev set, and 5126 in the test set. 
44 | Particularly, given a news article from CNN, 45 | the goal is to answer questions with answers consisting of spans of text from the corresponding articles. 46 | All of the questions and answers are written by crowdsourced human annotators. 47 | For more details, see https://arxiv.org/abs/1611.09830. 48 | 49 | More concretely, we prompt models using the following format 50 | 51 | Passage: <passage> 52 | Question: <question> 53 | Answer: 54 | 55 | Note: Some of the questions do not have an answer in the context so the 56 | model needs to answer "No Answer". While this behavior might be tricky to 57 | learn in the few-shot setting, we still include these examples in the 58 | scenario. 59 | 60 | Using an example from the training dataset, we have: 61 | 62 | ``` 63 | NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman 64 | facing the death sentence for the killing of a teen in a case dubbed 'the house of horrors.' 65 | Moninder Singh Pandher was sentenced to death by a lower court in February... 66 | Question: Who was sentenced to death in February? 67 | Answer: 68 | ``` 69 | 70 | References 71 | 72 | ``` 73 | ['Moninder Singh Pandher'] 74 | ``` 75 | """ 76 | 77 | name = "newsqa" 78 | description = "Question answering using news articles." 79 | tags = ["question_answering"] 80 | 81 | def process_example(self, sample: dict) -> Tuple[Input, List[str]]: 82 | """ 83 | Given a sample from the dataset, create the prompt and the list of 84 | correct references. 85 | """ 86 | passage = sample["text"] 87 | all_questions = sample["questions"] 88 | question = random.sample(all_questions, 1)[0] 89 | prompt = PassageQuestionInput(passage=passage, question=question["q"], separator="\n\n") 90 | 91 | # add the answer with consensus 92 | # two checks below since the key "noAnswer" is not always present in the dictionary question["consensus"], 93 | # and when it is present it is not always True 94 | answers: List[str] = [] 95 | if ("noAnswer" in question["consensus"].keys()) and (question["consensus"]["noAnswer"] is True): 96 | answers.append("No Answer") 97 | else: 98 | start_point = question["consensus"]["s"] 99 | end_point = question["consensus"]["e"] 100 | answer_text = sample["text"][start_point:end_point] 101 | answers.append(answer_text) 102 | 103 | # add the other crowdworker answers 104 | for answer in question["answers"]: 105 | if "noAnswer" in answer["sourcerAnswers"][0].keys(): 106 | answer_text = "No Answer" 107 | # add to valid set of answers if it is not already present in the list 108 | if answer_text not in answers: 109 | answers.append(answer_text) 110 | else: 111 | start_point = answer["sourcerAnswers"][0]["s"] 112 | end_point = answer["sourcerAnswers"][0]["e"] 113 | answer_text = sample["text"][start_point:end_point] 114 | if answer_text not in answers: 115 | answers.append(answer_text) 116 | return prompt, answers 117 | 118 | def cleaned_samples(self, samples: List[Dict]) -> List[Dict]: 119 | """ 120 | Given the full dataset this function only retains news articles and QAs where there is 121 | at least one question that is valid. The question is valid if all crowdworkers believe that 122 | the question is valid and that the answer is present in text. 
123 | """ 124 | clean_samples: List = [] 125 | for sample in samples: 126 | # set of valid questions in the sample 127 | valid_questions = [] 128 | for question in sample["questions"]: 129 | add_question = True 130 | if ("isQuestionBad" in question.keys()) and (question["isQuestionBad"] != 0.0): 131 | add_question = False 132 | if ("badQuestion" in question["consensus"].keys()) and (question["consensus"]["badQuestion"] is True): 133 | add_question = False 134 | if add_question is True: 135 | valid_questions.append(question) 136 | clean = len(valid_questions) >= 1 137 | sample["questions"] = valid_questions 138 | if clean is True: 139 | clean_samples.append(sample) 140 | return clean_samples 141 | 142 | def get_file_instances(self, target_file: str, splits: Dict) -> List[Instance]: 143 | """ 144 | Helper for generating instances for a split. 145 | Args: 146 | target_file (str): Data file. 147 | splits (dict): Which splits to partition the data into. 148 | Returns: 149 | List[Instance]: Instances from the file for the specified split. 150 | """ 151 | file_instances: List[Instance] = [] 152 | with open(target_file, encoding="utf-8") as f: 153 | all_samples: List[Dict] = json.load(f)["data"] 154 | 155 | clean_samples: List[Dict] = self.cleaned_samples(all_samples) 156 | for sample in clean_samples: 157 | prompt, answers = self.process_example(sample) 158 | split = "train" if sample["type"] == "train" else "valid" 159 | instance = Instance( 160 | input=prompt, 161 | references=[Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in answers], 162 | split=splits[split], 163 | ) 164 | file_instances.append(instance) 165 | return file_instances 166 | 167 | def get_instances(self) -> List[Instance]: 168 | file_path: str = os.path.join("restricted", self.name, "combined-newsqa-data-v1.json") 169 | assert os.path.exists(file_path) 170 | splits = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT} 171 | random.seed(0) # randomness needed to pick question at random 172 | instances: List[Instance] = self.get_file_instances(target_file=file_path, splits=splits) 173 | return instances 174 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/singleeq_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class SingleEqScenario(Scenario): 21 | 22 | name = "singleeq" 23 | description = "SingleEq Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "iIndex": 0, 48 | "sQuestion": "Leah had 32 chocolates and her sister had 42. 
If they ate 35, how many pieces do they have left in total?", 49 | "lSolutions": [39], 50 | }, 51 | { 52 | "iIndex": 1, 53 | "sQuestion": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?", 54 | "lSolutions": [6], 55 | }, 56 | { 57 | "iIndex": 2, 58 | "sQuestion": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 59 | "lSolutions": [5], 60 | }, 61 | { 62 | "iIndex": 3, 63 | "sQuestion": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?", 64 | "lSolutions": [9], 65 | }, 66 | { 67 | "iIndex": 4, 68 | "sQuestion": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 69 | "lSolutions": [33], 70 | }, 71 | ] 72 | 73 | with open(os.path.join(data_path, "train"), "w") as f: 74 | f.write(json.dumps(train, indent=4)) 75 | 76 | data_path: str = os.path.join(self.output_path, "data") 77 | ensure_directory_exists(data_path) 78 | 79 | url: str = ( 80 | "https://raw.githubusercontent.com/chuanyang-Zheng/Progressive-Hint/main/dataset/SingleEq/SingleEq.json" 81 | ) 82 | test_path: str = os.path.join(data_path, "test") 83 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 84 | 85 | make_train_set(data_path) 86 | 87 | instances: List[Instance] = [] 88 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 89 | 90 | for split, filename in split_to_filename.items(): 91 | target_path: str = os.path.join(data_path, filename) 92 | 93 | with open(target_path, "r") as f: 94 | data = json.load(f) 95 | for entry in data: 96 | question = entry["sQuestion"].strip() 97 | answer = str(entry["lSolutions"][0]) 98 | if answer[-2:] == ".0": 99 | answer = answer[:-2] 100 | instance: Instance = Instance( 101 | input=Input(text=question), 102 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 103 | split=split, 104 | ) 105 | instances.append(instance) 106 | 107 | return instances 108 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/svamp_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class SVAMPScenario(Scenario): 21 | 22 | name = "svamp" 23 | description = "SVAMP Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "ID": "train-1", 48 | "Body": "Leah had 32 chocolates and her sister had 42. 
If they ate 35,", 49 | "Question": "how many pieces do they have left in total?", 50 | "Answer": 39, 51 | }, 52 | { 53 | "ID": "train-2", 54 | "Body": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees.", 55 | "Question": "How many trees did the grove workers plant today?", 56 | "Answer": 6, 57 | }, 58 | { 59 | "ID": "train-3", 60 | "Body": "If there are 3 cars in the parking lot and 2 more cars arrive,", 61 | "Question": "how many cars are in the parking lot?", 62 | "Answer": 5, 63 | }, 64 | { 65 | "ID": "train-4", 66 | "Body": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad.", 67 | "Question": "How many toys does he have now?", 68 | "Answer": 9, 69 | }, 70 | { 71 | "ID": "train-5", 72 | "Body": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more.", 73 | "Question": "How many golf balls did he have at the end of wednesday?", 74 | "Answer": 33, 75 | }, 76 | ] 77 | 78 | with open(os.path.join(data_path, "train"), "w") as f: 79 | f.write(json.dumps(train, indent=4)) 80 | 81 | data_path: str = os.path.join(self.output_path, "data") 82 | ensure_directory_exists(data_path) 83 | 84 | instances: List[Instance] = [] 85 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 86 | 87 | url: str = "https://raw.githubusercontent.com/arkilpatel/SVAMP/main/SVAMP.json" 88 | test_path: str = os.path.join(data_path, "test") 89 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 90 | 91 | make_train_set(data_path) 92 | 93 | for split, filename in split_to_filename.items(): 94 | target_path: str = os.path.join(data_path, filename) 95 | 96 | with open(target_path, "r") as f: 97 | data = json.load(f) 98 | for entry in data: 99 | question = entry["Body"].strip() + " " + entry["Question"].strip() 100 | answer = str(entry["Answer"]) 101 | if answer[-2:] == ".0": 102 | answer = answer[:-2] 103 | instance: Instance = Instance( 104 | input=Input(text=question), 105 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 106 | split=split, 107 | ) 108 | instances.append(instance) 109 | 110 | return instances 111 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_2_window_service.py: -------------------------------------------------------------------------------- 1 | from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig 2 | from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService 3 | from helm.benchmark.window_services.tokenizer_service import TokenizerService 4 | 5 | 6 | class Llama2WindowService(HuggingFaceWindowService): 7 | def __init__(self, service: TokenizerService): 8 | # Tokenizer name hf-internal-testing/llama-tokenizer is taken from: 9 | # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example 10 | model_config = HuggingFaceModelConfig( 11 | namespace="hf-internal-testing", model_name="llama-tokenizer", revision=None 12 | ) 13 | super().__init__(service, model_config) 14 | 15 | @property 16 | def max_sequence_length(self) -> int: 17 | return 4000 18 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_window_service.py: 
-------------------------------------------------------------------------------- 1 | from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig 2 | from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService 3 | from helm.benchmark.window_services.tokenizer_service import TokenizerService 4 | 5 | 6 | class LlamaWindowService(HuggingFaceWindowService): 7 | def __init__(self, service: TokenizerService): 8 | # Tokenizer name hf-internal-testing/llama-tokenizer is taken from: 9 | # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example 10 | model_config = HuggingFaceModelConfig( 11 | namespace="hf-internal-testing", model_name="llama-tokenizer", revision=None 12 | ) 13 | super().__init__(service, model_config) 14 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/window_service_factory.py: -------------------------------------------------------------------------------- 1 | from helm.proxy.models import ( 2 | get_model, 3 | get_model_names_with_tag, 4 | Model, 5 | AI21_WIDER_CONTEXT_WINDOW_TAG, 6 | WIDER_CONTEXT_WINDOW_TAG, 7 | ) 8 | from .ai21_window_service import AI21WindowService 9 | from .wider_ai21_window_service import WiderAI21WindowService 10 | from .anthropic_window_service import AnthropicWindowService 11 | from .cohere_window_service import CohereWindowService, CohereCommandWindowService 12 | from .luminous_window_service import ( 13 | LuminousBaseWindowService, 14 | LuminousExtendedWindowService, 15 | LuminousSupremeWindowService, 16 | LuminousWorldWindowService, 17 | ) 18 | from .openai_window_service import OpenAIWindowService 19 | from .wider_openai_window_service import WiderOpenAIWindowService 20 | from .mt_nlg_window_service import MTNLGWindowService 21 | from .bloom_window_service import BloomWindowService 22 | from .huggingface_window_service import HuggingFaceWindowService 23 | from .ice_window_service import ICEWindowService 24 | from .santacoder_window_service import SantaCoderWindowService 25 | from .gpt2_window_service import GPT2WindowService 26 | from .gptj_window_service import GPTJWindowService 27 | from .gptneox_window_service import GPTNeoXWindowService 28 | from .opt_window_service import OPTWindowService 29 | from .t0pp_window_service import T0ppWindowService 30 | from .t511b_window_service import T511bWindowService 31 | from .flan_t5_window_service import FlanT5WindowService 32 | from .ul2_window_service import UL2WindowService 33 | from .yalm_window_service import YaLMWindowService 34 | from .window_service import WindowService 35 | from .tokenizer_service import TokenizerService 36 | from .llama_window_service import LlamaWindowService 37 | from .llama_2_window_service import Llama2WindowService 38 | from helm.proxy.clients.huggingface_client import get_huggingface_model_config 39 | 40 | 41 | class WindowServiceFactory: 42 | @staticmethod 43 | def get_window_service(model_name: str, service: TokenizerService) -> WindowService: 44 | """ 45 | Returns a `WindowService` given the name of the model. 46 | Make sure this function returns instantaneously on repeated calls. 
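For example, in this repo's setup, "local/llama-2-7b-chat" maps to `Llama2WindowService` and "local/vicuna-13b" maps to `LlamaWindowService` (see the dispatch below).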
47 | """ 48 | model: Model = get_model(model_name) 49 | organization: str = model.organization 50 | engine: str = model.engine 51 | 52 | window_service: WindowService 53 | huggingface_model_config = get_huggingface_model_config(model_name) 54 | if huggingface_model_config: 55 | window_service = HuggingFaceWindowService(service=service, model_config=huggingface_model_config) 56 | elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG): 57 | window_service = WiderOpenAIWindowService(service) 58 | # For the Google models, we approximate with the OpenAIWindowService 59 | elif organization == "openai" or organization == "simple" or organization == "google": 60 | window_service = OpenAIWindowService(service) 61 | elif organization == "AlephAlpha": 62 | if engine == "luminous-base": 63 | window_service = LuminousBaseWindowService(service) 64 | elif engine == "luminous-extended": 65 | window_service = LuminousExtendedWindowService(service) 66 | elif engine == "luminous-supreme": 67 | window_service = LuminousSupremeWindowService(service) 68 | elif engine == "luminous-world": 69 | window_service = LuminousWorldWindowService(service) 70 | else: 71 | raise ValueError(f"Unhandled Aleph Alpha model: {engine}") 72 | elif organization == "microsoft": 73 | window_service = MTNLGWindowService(service) 74 | elif organization == "anthropic": 75 | window_service = AnthropicWindowService(service) 76 | elif engine == "santacoder": 77 | window_service = SantaCoderWindowService(service) 78 | elif model_name == "huggingface/gpt2": 79 | window_service = GPT2WindowService(service) 80 | elif model_name == "together/bloom": 81 | window_service = BloomWindowService(service) 82 | elif model_name == "together/glm": 83 | # From https://github.com/THUDM/GLM-130B, "the tokenizer is implemented based on 84 | # icetk---a unified multimodal tokenizer for images, Chinese, and English." 
85 | window_service = ICEWindowService(service) 86 | elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "gooseai/gpt-j-6b"]: 87 | window_service = GPTJWindowService(service) 88 | elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b", "together/gpt-neoxt-chat-base-20b"]: 89 | window_service = GPTNeoXWindowService(service) 90 | elif model_name == "together/h3-2.7b": 91 | window_service = GPT2WindowService(service) 92 | elif model_name in ["together/opt-66b", "together/opt-175b"]: 93 | window_service = OPTWindowService(service) 94 | elif model_name == "together/t0pp": 95 | window_service = T0ppWindowService(service) 96 | elif model_name == "together/t5-11b": 97 | window_service = T511bWindowService(service) 98 | elif model_name == "together/flan-t5-xxl": 99 | window_service = FlanT5WindowService(service) 100 | elif model_name == "together/ul2": 101 | window_service = UL2WindowService(service) 102 | elif model_name == "together/yalm": 103 | window_service = YaLMWindowService(service) 104 | elif model_name == "local/vicuna-13b": 105 | window_service = LlamaWindowService(service) 106 | elif model_name in ["local/llama-2-7b-chat", "local/llama-2-13b-chat", "local/llama-2-70b-chat"]: 107 | window_service = Llama2WindowService(service) 108 | elif organization == "cohere": 109 | if "command" in engine: 110 | window_service = CohereCommandWindowService(service) 111 | else: 112 | window_service = CohereWindowService(service) 113 | elif organization == "ai21": 114 | if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG): 115 | window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service)) 116 | else: 117 | window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service)) 118 | else: 119 | raise ValueError(f"Unhandled model name: {model_name}") 120 | 121 | return window_service 122 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/common/request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Dict 3 | 4 | from helm.proxy.models import Model, get_model 5 | from .general import indent_lines, format_text 6 | 7 | 8 | @dataclass(frozen=True) 9 | class Request: 10 | """ 11 | A `Request` specifies how to query a language model (given a prompt, 12 | complete it). It is the unified representation for communicating with 13 | various APIs (e.g., GPT-3, Jurassic). 14 | """ 15 | 16 | model: str = "openai/text-davinci-002" 17 | """Which model to query""" 18 | 19 | embedding: bool = False 20 | """Whether to query embedding instead of text response""" 21 | 22 | prompt: str = "" 23 | """What prompt to condition the language model on""" 24 | 25 | temperature: float = 1.0 26 | """Temperature parameter that governs diversity""" 27 | 28 | num_completions: int = 1 29 | """Generate this many completions (by sampling from the model)""" 30 | 31 | top_k_per_token: int = 1 32 | """Take this many highest probability candidates per token in the completion""" 33 | 34 | max_tokens: int = 100 35 | """Maximum number of tokens to generate (per completion)""" 36 | 37 | stop_sequences: List[str] = field(default_factory=list) 38 | """Stop generating once we hit one of these strings.""" 39 | 40 | echo_prompt: bool = False 41 | """Should `prompt` be included as a prefix of each completion?
(e.g., for 42 | evaluating perplexity of the prompt)""" 43 | 44 | top_p: float = 1 45 | """Sample from tokens that occupy this probability mass (nucleus sampling)""" 46 | 47 | presence_penalty: float = 0 48 | """Penalize repetition (OpenAI & Writer only)""" 49 | 50 | frequency_penalty: float = 0 51 | """Penalize repetition (OpenAI & Writer only)""" 52 | 53 | random: Optional[str] = None 54 | """Used to control randomness. Expect different responses for the same 55 | request but with different values for `random`.""" 56 | 57 | messages: Optional[List[Dict[str, str]]] = None 58 | """Used for chat models. (OpenAI only for now). 59 | If messages is specified for a chat model, the prompt is ignored. 60 | Otherwise, the client should convert the prompt into a message.""" 61 | 62 | @property 63 | def model_organization(self) -> str: 64 | """Example: 'openai/davinci' => 'openai'""" 65 | model: Model = get_model(self.model) 66 | return model.organization 67 | 68 | @property 69 | def model_engine(self) -> str: 70 | """Example: 'openai/davinci' => 'davinci'""" 71 | model: Model = get_model(self.model) 72 | return model.engine 73 | 74 | 75 | @dataclass(frozen=True) 76 | class Token: 77 | """ 78 | A `Token` represents one token position in a `Sequence`, which has the 79 | chosen `text` as well as the top probabilities under the model. 80 | 81 | Note: (text, logprob) could exist or not exist in `top_logprobs`. 82 | """ 83 | 84 | # Text that was chosen 85 | text: str 86 | 87 | # Log probability of generating that 88 | logprob: float 89 | 90 | # text -> log probability of generating that 91 | top_logprobs: Dict[str, float] 92 | 93 | def render_lines(self) -> List[str]: 94 | top_logprobs_entries = sorted(self.top_logprobs.items(), key=lambda entry: -entry[1]) 95 | top_logprobs_str = ( 96 | "{" + ", ".join(f"{format_text(text)}: {logprob}" for text, logprob in top_logprobs_entries) + "}" 97 | ) 98 | return [ 99 | f"{format_text(self.text)} logprob={self.logprob} top_logprobs={top_logprobs_str}", 100 | ] 101 | 102 | 103 | @dataclass(frozen=True) 104 | class Sequence: 105 | """A `Sequence` is a sequence of tokens.""" 106 | 107 | # The concatenation of all the tokens 108 | text: str 109 | 110 | # The sum of the log probabilities of all tokens 111 | logprob: float 112 | 113 | # The tokens 114 | tokens: List[Token] 115 | 116 | # Why did the sequence finish? 117 | finish_reason: Optional[Dict] = None 118 | 119 | def __add__(self, other: "Sequence") -> "Sequence": 120 | return Sequence(self.text + other.text, self.logprob + other.logprob, self.tokens + other.tokens) 121 | 122 | def render_lines(self) -> List[str]: 123 | result = [ 124 | f"text: {self.text}", 125 | f"log_prob: {self.logprob}", 126 | "tokens {", 127 | ] 128 | for token in self.tokens: 129 | result.extend(indent_lines(token.render_lines(), 2)) 130 | result.append("}") 131 | if self.finish_reason: 132 | result.append(f"finish_reason: {self.finish_reason}") 133 | return result 134 | 135 | 136 | @dataclass(frozen=True) 137 | class ErrorFlags: 138 | """Describes how to treat errors in the request.""" 139 | 140 | is_retriable: Optional[bool] = None 141 | """Whether the request is retriable or whether the error is permanent. 142 | If None, the error is treated as retriable.""" 143 | 144 | is_fatal: Optional[bool] = None 145 | """Whether the error is fatal, i.e. the run should be discarded.
146 | If None, the error is treated as fatal. 147 | """ 148 | 149 | @dataclass(frozen=False) 150 | class RequestResult: 151 | """What comes back due to a `Request`.""" 152 | 153 | success: bool 154 | """Whether the request was successful""" 155 | 156 | embedding: List[float] 157 | """Fixed dimensional embedding corresponding to the entire prompt""" 158 | 159 | completions: List[Sequence] 160 | """List of completions""" 161 | 162 | cached: bool 163 | """Whether the request was actually cached""" 164 | 165 | request_time: Optional[float] = None 166 | """How long did the request take?""" 167 | 168 | request_datetime: Optional[int] = None 169 | """When was the request sent? 170 | We keep track of when the request was made because the underlying model or inference procedure backing the API 171 | might change over time. The integer represents the current time in seconds since the Epoch (January 1, 1970).""" 172 | 173 | error: Optional[str] = None 174 | """If `success` is false, what was the error?""" 175 | 176 | error_flags: Optional[ErrorFlags] = None 177 | """Describes how to treat errors in the request.""" 178 | 179 | batch_size: Optional[int] = None 180 | """Batch size (`TogetherClient` only)""" 181 | 182 | batch_request_time: Optional[float] = None 183 | """How long did it take to process the batch? (`TogetherClient` only)""" 184 | 185 | full_text: Optional[str] = None 186 | 187 | cot: Optional[bool] = None 188 | 189 | def render_lines(self) -> List[str]: 190 | output = [ 191 | f"success: {self.success}", 192 | f"cached: {self.cached}", 193 | ] 194 | if self.request_time: 195 | output.append(f"request_time: {self.request_time}") 196 | if self.request_datetime: 197 | output.append(f"request_datetime: {self.request_datetime}") 198 | if self.error: 199 | output.append(f"error: {self.error}") 200 | 201 | output.append("completions {") 202 | for completion in self.completions: 203 | output.extend(indent_lines(completion.render_lines())) 204 | output.append("}") 205 | 206 | return output 207 | 208 | 209 | EMBEDDING_UNAVAILABLE_REQUEST_RESULT = RequestResult( 210 | success=False, 211 | cached=False, 212 | error="Computing the embedding is unavailable in this client", 213 | completions=[], 214 | embedding=[], 215 | ) 216 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/client.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from abc import ABC, abstractmethod 4 | from typing import Callable, Any, Dict, List 5 | 6 | from helm.common.hierarchical_logger import hlog 7 | from helm.common.request import Request, RequestResult, Sequence, Token 8 | from helm.common.tokenization_request import ( 9 | TokenizationRequest, 10 | TokenizationRequestResult, 11 | DecodeRequest, 12 | DecodeRequestResult, 13 | ) 14 | 15 | 16 | class Client(ABC): 17 | @staticmethod 18 | def make_cache_key(raw_request: Dict, request: Request) -> Dict: 19 | """ 20 | Construct the key for the cache using the raw request. 21 | Add `request.random` to the key, if defined.
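Example: raw_request = {"prompt": "hi"} and request.random = "seed-1" (an illustrative value) gives {"prompt": "hi", "random": "seed-1"}; if request.random is None, the raw request is returned unchanged.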
22 | """ 23 | if request.random is not None: 24 | assert "random" not in raw_request 25 | cache_key = {**raw_request, "random": request.random} 26 | else: 27 | cache_key = raw_request 28 | return cache_key 29 | 30 | @abstractmethod 31 | def make_request(self, request: Request, prompt_list: Dict[str, Any]) -> RequestResult: 32 | pass 33 | 34 | @abstractmethod 35 | def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult: 36 | pass 37 | 38 | @abstractmethod 39 | def decode(self, request: DecodeRequest) -> DecodeRequestResult: 40 | pass 41 | 42 | 43 | def wrap_request_time(compute: Callable[[], Any]) -> Callable[[], Any]: 44 | """Return a version of `compute` that puts `request_time` into its output.""" 45 | 46 | def wrapped_compute(): 47 | start_time = time.time() 48 | response = compute() 49 | end_time = time.time() 50 | response["request_time"] = end_time - start_time 51 | response["request_datetime"] = int(start_time) 52 | return response 53 | 54 | return wrapped_compute 55 | 56 | 57 | def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool = True) -> Sequence: 58 | """ 59 | Certain providers have bugs where they aren't respecting max_tokens, 60 | stop_sequences and the end of text token, so as a hack, we have to manually 61 | truncate the suffix of `sequence` and `tokens` as a post-hoc process. 62 | """ 63 | # TODO: if echo_prompt, then we should only ignore the prompt, but we don't 64 | # know how many tokens the prompt takes up. 65 | # In the benchmark, usually echo_prompt is only used for language modeling, 66 | # where max_tokens = 0, so there's nothing to truncate. 67 | if request.echo_prompt: 68 | if request.max_tokens != 0: 69 | hlog("WARNING: don't know how to handle echo_prompt and max_tokens > 0, not truncating") 70 | return sequence 71 | 72 | for stop in request.stop_sequences: 73 | # Find `stop` in the text 74 | try: 75 | new_text = sequence.text[: sequence.text.index(stop)] 76 | except ValueError: 77 | # The stop sequence doesn't exist, but it might exist in the list of tokens. 78 | new_text = sequence.text 79 | 80 | # Strip `stop` off the tokens 81 | new_tokens: List[Token] = [] 82 | # Need to start 83 | for token in sequence.tokens: 84 | # Note: we can only strip at token boundaries 85 | if token.text.startswith(stop): 86 | break 87 | new_tokens.append(token) 88 | 89 | if len(new_text) < len(sequence.text) and len(new_tokens) == len(sequence.tokens): 90 | hlog( 91 | f"WARNING: Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), " 92 | f"but wasn't able to strip the tokens" 93 | ) 94 | 95 | # Recompute log probability 96 | new_logprob = sum(token.logprob for token in new_tokens) 97 | 98 | # if print_warning: 99 | # hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}") 100 | 101 | sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens) 102 | 103 | # Truncate based on the max number of tokens. 104 | if len(sequence.tokens) > request.max_tokens: 105 | if print_warning: 106 | hlog(f"WARNING: truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}") 107 | new_tokens = sequence.tokens[: request.max_tokens] 108 | 109 | # This is imperfect stitching together of tokens, so just to make sure this is okay 110 | # TODO: should use the proper detokenizer since T5-style models. 111 | # Usually, in our benchmark, max_tokens is active when it's 1, so hopefully this isn't an issue. 
112 | new_text = "".join(token.text for token in new_tokens) 113 | if not sequence.text.startswith(new_text): 114 | hlog(f"WARNING: {json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}") 115 | 116 | new_logprob = sum(token.logprob for token in new_tokens) 117 | 118 | sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens) 119 | 120 | return sequence 121 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/huggingface_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict, Optional, Set 3 | 4 | from transformers import AutoTokenizer 5 | 6 | from helm.common.hierarchical_logger import htrack_block, hlog 7 | 8 | from helm.proxy.clients.huggingface_model_registry import get_huggingface_model_config 9 | 10 | 11 | # Tokenizer names where the HELM tokenizer name and the Hugging Face tokenizer name 12 | # are identical. 13 | _KNOWN_TOKENIZER_NAMES: Set[str] = { 14 | "EleutherAI/gpt-j-6B", # Not a typo: Named "gpt-j-6B" instead of "gpt-j-6b" in Hugging Face 15 | "EleutherAI/gpt-neox-20b", 16 | "bigscience/bloom", 17 | "bigscience/T0pp", 18 | "facebook/opt-66b", 19 | "google/ul2", 20 | "google/flan-t5-xxl", 21 | "bigcode/santacoder", 22 | "Writer/palmyra-base", 23 | "bigcode/starcoder", 24 | "hf-internal-testing/llama-tokenizer", 25 | } 26 | 27 | 28 | # Map of HELM tokenizer name to Hugging Face tokenizer name for tokenizers where they differ. 29 | _KNOWN_TOKENIZER_ALIASES: Dict[str, str] = { 30 | "huggingface/gpt2": "gpt2", 31 | "google/t5-11b": "t5-11b", 32 | } 33 | 34 | 35 | class HuggingFaceTokenizers: 36 | 37 | tokenizers: Dict[str, Any] = {} 38 | 39 | @staticmethod 40 | def get_tokenizer(tokenizer_name: str) -> Any: 41 | """ 42 | Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached. 43 | Returns the tokenizer. 44 | """ 45 | 46 | def load_tokenizer(hf_tokenizer_name: str, revision: Optional[str] = None): 47 | """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace.""" 48 | tokenizer_kwargs = {} 49 | if revision is not None: 50 | tokenizer_kwargs["revision"] = revision 51 | try: 52 | # From the Hugging Face documentation, "local_files_only(defaults to False) — 53 | # Whether or not to only look at local files". 54 | # Running `local_files_only=False` requires an internet connection even if the files are downloaded 55 | # and cached. We need to first run with `local_files_only=True` just in case the machine 56 | # we are running this code has connection issues. If the tokenizer files are not cached, 57 | # we attempt to download them from HuggingFace. 58 | # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside 59 | # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face 60 | # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available. 61 | return AutoTokenizer.from_pretrained( 62 | hf_tokenizer_name, local_files_only=True, use_fast=True, **tokenizer_kwargs 63 | ) 64 | except OSError: 65 | hlog(f"Local files do not exist for HuggingFace tokenizer: {hf_tokenizer_name}. 
Downloading...") 66 | return AutoTokenizer.from_pretrained( 67 | hf_tokenizer_name, local_files_only=False, use_fast=True, **tokenizer_kwargs 68 | ) 69 | 70 | if tokenizer_name not in HuggingFaceTokenizers.tokenizers: 71 | with htrack_block(f"Loading {tokenizer_name} with Hugging Face Transformers"): 72 | # To avoid deadlocks when using HuggingFace tokenizers with multiple processes 73 | os.environ["TOKENIZERS_PARALLELISM"] = "False" 74 | 75 | # Weights are cached at ~/.cache/huggingface/transformers. 76 | hf_tokenizer_name: str 77 | revision: Optional[str] = None 78 | model_config = get_huggingface_model_config(tokenizer_name) 79 | if model_config: 80 | hf_tokenizer_name = model_config.model_id 81 | revision = model_config.revision 82 | elif tokenizer_name in _KNOWN_TOKENIZER_NAMES: 83 | hf_tokenizer_name = tokenizer_name 84 | elif tokenizer_name in _KNOWN_TOKENIZER_ALIASES: 85 | hf_tokenizer_name = _KNOWN_TOKENIZER_ALIASES[tokenizer_name] 86 | else: 87 | raise ValueError(f"Unsupported HuggingFace tokenizer: {tokenizer_name}") 88 | 89 | # Keep the tokenizer in memory, so we don't recreate it for future requests 90 | HuggingFaceTokenizers.tokenizers[tokenizer_name] = load_tokenizer(hf_tokenizer_name, revision) 91 | 92 | return HuggingFaceTokenizers.tokenizers[tokenizer_name] 93 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/proxy/services/service.py: -------------------------------------------------------------------------------- 1 | import mako.template 2 | from abc import ABC, abstractmethod 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Tuple, Any 5 | 6 | from helm.common.general import parse_hocon 7 | from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult 8 | from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest 9 | from helm.common.tokenization_request import ( 10 | WindowServiceInfo, 11 | TokenizationRequest, 12 | TokenizationRequestResult, 13 | DecodeRequest, 14 | DecodeRequestResult, 15 | ) 16 | from helm.common.request import Request, RequestResult 17 | from helm.proxy.models import Model 18 | from helm.proxy.query import Query, QueryResult 19 | from helm.proxy.accounts import Authentication, Account 20 | 21 | VERSION = "1.0" 22 | CREDENTIALS_FILE = "credentials.conf" 23 | ACCOUNTS_FILE = "accounts.sqlite" 24 | CACHE_DIR = "cache" 25 | MONGO_URI = "mongo_uri" 26 | MAX_EXPANSION = 1000 27 | 28 | 29 | @dataclass(frozen=True) 30 | class GeneralInfo: 31 | version: str 32 | example_queries: List[Query] 33 | all_models: List[Model] 34 | 35 | 36 | def expand_environments(environments: Dict[str, List[str]]): 37 | """ 38 | `environments` is a map from variable names to a list of strings. 39 | Return: a list of environments, where for each variable, we choose one of its string. 
40 | """ 41 | output_environments: List[Dict[str, str]] = [] 42 | 43 | def recurse(old_items: List[Tuple[str, List[str]]], new_items: List[Tuple[str, str]]): 44 | if len(output_environments) >= MAX_EXPANSION: 45 | return 46 | if len(old_items) == 0: 47 | output_environments.append(dict(new_items)) 48 | else: 49 | item, rest_old_items = old_items[0], old_items[1:] 50 | key, list_value = item 51 | for elem_value in list_value: 52 | recurse(rest_old_items, new_items + [(key, elem_value)]) 53 | 54 | recurse(list(environments.items()), []) 55 | return output_environments 56 | 57 | 58 | def substitute_text(text: str, environment: Dict[str, str]) -> str: 59 | """ 60 | Example: 61 | text = "Hello {name}" 62 | environment = {"name": "Sue"} 63 | Return "Hello Sue" 64 | """ 65 | return mako.template.Template(text).render(**environment) 66 | 67 | 68 | def synthesize_request(prompt: str, settings: str, environment: Dict[str, str]) -> Request: 69 | """Substitute `environment` into `prompt` and `settings`.""" 70 | request: Dict[str, Any] = {} 71 | request["prompt"] = substitute_text(prompt, environment) 72 | request.update(parse_hocon(substitute_text(settings, environment))) 73 | return Request(**request) 74 | 75 | 76 | class Service(ABC): 77 | @abstractmethod 78 | def get_general_info(self) -> GeneralInfo: 79 | """Get general info.""" 80 | pass 81 | 82 | @abstractmethod 83 | def get_window_service_info(self, model_name: str) -> WindowServiceInfo: 84 | """Get window service info.""" 85 | pass 86 | 87 | @abstractmethod 88 | def expand_query(self, query: Query) -> QueryResult: 89 | """Turn the `query` into requests.""" 90 | pass 91 | 92 | @abstractmethod 93 | def make_request(self, auth: Authentication, request: Request, prompt_list: Dict[str, Any] = {}) -> RequestResult: 94 | """Actually make a request to an API.""" 95 | pass 96 | 97 | @abstractmethod 98 | def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult: 99 | """Tokenize via an API.""" 100 | pass 101 | 102 | @abstractmethod 103 | def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult: 104 | """Decodes to text.""" 105 | pass 106 | 107 | def is_toxicity_scoring_available(self) -> bool: 108 | """Whether toxicity score is available, i.e. whether the Perspective API key is set. 109 | Return: (is_available, error_message)""" 110 | return False 111 | 112 | @abstractmethod 113 | def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult: 114 | """Get toxicity scores for a batch of text. 
115 | Should only be called if `self.is_toxicity_scoring_available` is True.""" 116 | pass 117 | 118 | def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult: 119 | """Get responses to a critique request.""" 120 | pass 121 | 122 | @abstractmethod 123 | def create_account(self, auth: Authentication) -> Account: 124 | """Creates a new account.""" 125 | pass 126 | 127 | @abstractmethod 128 | def delete_account(self, auth: Authentication, api_key: str) -> Account: 129 | """Deletes an account.""" 130 | pass 131 | 132 | @abstractmethod 133 | def get_accounts(self, auth: Authentication) -> List[Account]: 134 | """Get list of accounts.""" 135 | pass 136 | 137 | @abstractmethod 138 | def get_account(self, auth: Authentication) -> Account: 139 | """Get information about an account.""" 140 | pass 141 | 142 | @abstractmethod 143 | def update_account(self, auth: Authentication, account: Account) -> Account: 144 | """Update account.""" 145 | pass 146 | 147 | @abstractmethod 148 | def rotate_api_key(self, auth: Authentication, account: Account) -> Account: 149 | """Generate a new API key for a given account.""" 150 | pass 151 | 152 | @abstractmethod 153 | def shutdown(self, auth: Authentication): 154 | """Shutdown server.""" 155 | pass 156 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/update_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script updates the helm module with AgentInstruct changes 4 | # should run the script from the top level of the AgentInstruct repo 5 | 6 | # linking our instructions to _latest 7 | python scripts/replicate.py 8 | 9 | # move benchmark output to top level with letter and coin data 10 | cp -r src/agentinstruct/reasoning/helm_updates/benchmark_output . 
11 | 12 | # creating prod_env at top level 13 | mkdir prod_env 14 | 15 | # creating credentials file 16 | touch prod_env/credentials.conf 17 | 18 | # removing helm/.github 19 | rm -rf src/agentinstruct/reasoning/helm/.github 20 | 21 | # removing helm/docs 22 | rm -rf src/agentinstruct/reasoning/helm/docs 23 | 24 | # added prompt dict to AdapterSpec class 25 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapter_spec.py src/agentinstruct/reasoning/helm/src/helm/benchmark/adaptation/adapter_spec.py 26 | 27 | # updating truncation in in_context_learning_adapter.py 28 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py src/agentinstruct/reasoning/helm/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py 29 | 30 | # add scenario imports to __init__.py 31 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/__init__.py src/agentinstruct/reasoning/helm/src/helm/benchmark/__init__.py 32 | 33 | # update the multiple_choice_joint_adapter 34 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py src/agentinstruct/reasoning/helm/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py 35 | 36 | # update executor.py with prompt_list 37 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/executor.py src/agentinstruct/reasoning/helm/src/helm/benchmark/executor.py 38 | 39 | # update basic_metrics.py to check for empty strings 40 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/metrics/basic_metrics.py src/agentinstruct/reasoning/helm/src/helm/benchmark/metrics/basic_metrics.py 41 | 42 | # handle --skip-expanders arg for zero-shot runs on run.py 43 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/run.py src/agentinstruct/reasoning/helm/src/helm/benchmark/run.py 44 | 45 | # update the run_expander with instruction expanders for agentinstruct and zeroshotcot 46 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/run_expander.py src/agentinstruct/reasoning/helm/src/helm/benchmark/run_expander.py 47 | 48 | # update the run_specs with new datasets 49 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/run_specs.py src/agentinstruct/reasoning/helm/src/helm/benchmark/run_specs.py 50 | 51 | # add addsub scenario 52 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/addsub_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 53 | 54 | # add aqua scenario 55 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/aqua_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 56 | 57 | # add big bench hard scenario 58 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/big_bench_hard_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 59 | 60 | # add coin scenario 61 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/coin_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 62 | 63 | # add commonsense_qa scenario 64 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/commonsense_qa_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 65 | 66 | # update gsm scenario 67 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/gsm_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios/gsm_scenario.py 68 |
69 | # add letter scenario 70 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/letter_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 71 | 72 | # add multi_arith_scenario 73 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/multi_arith_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 74 | 75 | # add singleeq scenario 76 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/singleeq_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 77 | 78 | # add svamp scenario 79 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/svamp_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 80 | 81 | # add llama window service 82 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_window_service.py src/agentinstruct/reasoning/helm/src/helm/benchmark/window_services 83 | 84 | # add llama-2 window service 85 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_2_window_service.py src/agentinstruct/reasoning/helm/src/helm/benchmark/window_services 86 | 87 | # update window_service_factory.py with llama-2 88 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/window_service_factory.py src/agentinstruct/reasoning/helm/src/helm/benchmark/window_services/window_service_factory.py 89 | 90 | # update dataset download procedure in general.py 91 | cp src/agentinstruct/reasoning/helm_updates/src/helm/common/general.py src/agentinstruct/reasoning/helm/src/helm/common/general.py 92 | 93 | # add full_text property to RequestResult class in order to store intermediate reasoning 94 | cp src/agentinstruct/reasoning/helm_updates/src/helm/common/request.py src/agentinstruct/reasoning/helm/src/helm/common/request.py 95 | 96 | # add local client to auto_client and pass through prompt_list 97 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/auto_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/auto_client.py 98 | 99 | # add prompt_list to abstractmethod make_request in client.py 100 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/client.py 101 | 102 | # add llama tokenizer to huggingface_tokenizer.py 103 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/huggingface_tokenizer.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/huggingface_tokenizer.py 104 | 105 | # update openai_client.py 106 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/openai_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/openai_client.py 107 | 108 | # add openai_automatic_prompt_tuning.py with agentinstruct process 109 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/openai_automatic_prompt_tuning.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/ 110 | 111 | # add local_client.py with agentinstruct process 112 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/local_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/ 113 | 114 | # update together_client.py with agentinstruct process 115 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/together_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/together_client.py 116 | 117 | # add new models to models.py 118 | cp
src/agentinstruct/reasoning/helm_updates/src/helm/proxy/models.py src/agentinstruct/reasoning/helm/src/helm/proxy/models.py 119 | 120 | # pass prompt_list through server_service.py 121 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/services/server_service.py src/agentinstruct/reasoning/helm/src/helm/proxy/services/server_service.py 122 | 123 | # pass prompt_list through service.py abstractmethod 124 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/services/service.py src/agentinstruct/reasoning/helm/src/helm/proxy/services/service.py 125 | 126 | 127 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/README.md: -------------------------------------------------------------------------------- 1 | # AgentInstruct Serve API Setup Guide 2 | 3 | ### Installation 4 | Our design follows the TorchServe API. TorchServe is best run within its official Docker container. Here we focus on Llama-2-7b-chat; the process is identical for the other Llama-2-chat models (see [here](https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md) for recovering the vicuna-13b v1.1 weights). You can download Llama-2-7b-chat [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main?clone=true), which requires a HuggingFace access token with approved access to Llama-2. If you don't have git lfs, make sure to install it first (e.g., using apt-get). 5 | ``` 6 | git lfs install 7 | git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf 8 | ``` 9 | 10 | Before you begin, ensure you have cloned the AgentInstruct repository as specified in the main README. Then, in a new shell, run the following to start a Docker container with the TorchServe API: 11 | ``` 12 | docker pull pytorch/torchserve:0.8.2-gpu 13 | docker run --network=mynetwork --name=serve-container -v ~/agentinstruct:/code/agentinstruct -v ~/Llama-2-7b-chat-hf:/code/Llama-2-7b-chat-hf -u root -it --gpus all -p 8081:8081 -p 8082:8082 -p 8083:8083 pytorch/torchserve:0.8.2-gpu bash 14 | cd /code/agentinstruct/src/agentinstruct/reasoning/serve 15 | ``` 16 | This container requires CUDA >= 11.8. See [here](https://hub.docker.com/r/pytorch/torchserve/tags) for additional tags, or follow the guide [here](https://github.com/pytorch/serve/blob/v0.8.2/docker/README.md) to create an image well-suited for your system. 17 | 18 | The image comes preinstalled with TorchServe and the required dependencies (torch, JDK17, etc.). Additional model-specific packages should be put in `model_store/requirements.txt`, and will be installed when a model is assigned to workers. 19 | 20 | ### Set Up the API 21 | Let's walk through setting up the API to serve inference requests to llama-2-7b-chat, step by step. 22 | 23 | #### Generating Runtime File 24 | 25 | To generate a runtime file for a model, run: 26 | ``` 27 | torch-model-archiver --model-name llama-2-7b-chat --version 1.0 --handler custom_handler/llama-2-7b-chat-handler.py -r model_store/requirements.txt -f -c model_store/llama-2-7b-chat-config.yaml --archive-format tgz --export-path model_store 28 | ``` 29 | 30 | #### Starting Up the API 31 | ``` 32 | export TEMP=/tmp # or some existing directory with write access 33 | torchserve --start --ncs --ts-config model_store/config.properties 34 | ``` 35 | This starts the API, but does not register any models or load any workers.
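As a quick sanity check (assuming the management address from `model_store/config.properties`, i.e. `http://serve-container:8082`, is reachable from your shell), you can list the registered models via TorchServe's management API; right after startup the list should be empty: ``` curl http://serve-container:8082/models ```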
36 | 37 | #### Registering Model and Loading Workers 38 | To load 8 copies of llama-2-7b-chat, one per GPU, run: 39 | ``` 40 | curl -X POST "http://serve-container:8082/models?url=llama-2-7b-chat.tar.gz&initial_workers=8" 41 | ``` 42 | 43 | #### Sending Inference Requests 44 | Now you're ready to start sending inference requests to the model over serve-container:8081. The model `local/llama-2-7b-chat` in HELM will send requests to this API. You can now continue following the instructions in the main README starting from the "Replicating Main Results" section. 45 | 46 | #### Stopping the API 47 | ``` 48 | export TEMP=/tmp # must be the same directory used during startup 49 | torchserve --stop 50 | ``` 51 | 52 | See [here](https://pytorch.org/serve/management_api.html) for more information on managing the API. 53 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/config.properties: -------------------------------------------------------------------------------- 1 | inference_address=http://serve-container:8081 2 | management_address=http://serve-container:8082 3 | metrics_address=http://serve-container:8083 4 | default_workers_per_model=1 5 | install_py_dep_per_model=true 6 | max_response_size=655350000 7 | default_response_timeout=5000 8 | model_store=/code/agentinstruct/src/agentinstruct/reasoning/serve/model_store 9 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/llama-2-13b-chat-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/Llama-2-13b-chat-hf" 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/llama-2-70b-chat-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/Llama-2-70b-chat-hf" 6 | quantize: "nf4" 7 | num_gpu_per_model: 1 8 | per_gpu_mem: 48000000000 #48GB 9 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/llama-2-7b-chat-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/Llama-2-7b-chat-hf" 6 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.32.* 2 | accelerate 3 | sentencepiece 4 | protobuf==3.20.* 5 | bitsandbytes 6 | scipy 7 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/vicuna-13b-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/vicuna-13b" 6 | --------------------------------------------------------------------------------