├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── instructions
│   └── main
│       └── instructions.json
├── requirements.txt
├── scripts
│   ├── generate_agent_instructions.sh
│   ├── gpt-3.5-turbo.sh
│   ├── llama-2-13b-chat.sh
│   ├── llama-2-70b-chat.sh
│   ├── llama-2-7b-chat.sh
│   ├── replicate.py
│   ├── run.sh
│   ├── run_reasoning.sh
│   ├── run_specs
│   │   ├── agentinstruct
│   │   │   ├── gpt-3.5-turbo-agentinstruct.conf
│   │   │   ├── llama-2-13b-chat-agentinstruct.conf
│   │   │   ├── llama-2-70b-chat-agentinstruct.conf
│   │   │   ├── llama-2-7b-chat-agentinstruct.conf
│   │   │   └── vicuna-13b-agentinstruct.conf
│   │   ├── simple-gpt-3.5-turbo.conf
│   │   ├── simple-llama-2-7b-chat.conf
│   │   ├── zeroshot
│   │   │   ├── gpt-3.5-turbo-zeroshot.conf
│   │   │   ├── llama-2-13b-chat-zeroshot.conf
│   │   │   ├── llama-2-70b-chat-zeroshot.conf
│   │   │   ├── llama-2-7b-chat-zeroshot.conf
│   │   │   └── vicuna-13b-zeroshot.conf
│   │   └── zeroshotcot
│   │       ├── gpt-3.5-turbo-zeroshotcot.conf
│   │       ├── llama-2-13b-chat-zeroshotcot.conf
│   │       ├── llama-2-70b-chat-zeroshotcot.conf
│   │       ├── llama-2-7b-chat-zeroshotcot.conf
│   │       └── vicuna-13b-zeroshotcot.conf
│   └── vicuna-13b.sh
└── src
    └── agentinstruct
        ├── agent
        │   ├── agent_instr_generation.py
        │   ├── agent_pipeline.py
        │   └── utils
        │       └── dataset_preprocessing.py
        ├── eval
        │   ├── format_results.py
        │   └── letter_eval.py
        └── reasoning
            ├── helm_updates
            │   ├── benchmark_output
            │   │   └── scenarios
            │   │       ├── coin
            │   │       │   └── data
            │   │       │       ├── test
            │   │       │       └── train
            │   │       └── letter
            │   │           └── data
            │   │               ├── test
            │   │               └── train
            │   ├── src
            │   │   └── helm
            │   │       ├── benchmark
            │   │       │   ├── __init__.py
            │   │       │   ├── adaptation
            │   │       │   │   ├── adapter_spec.py
            │   │       │   │   └── adapters
            │   │       │   │       ├── in_context_learning_adapter.py
            │   │       │   │       └── multiple_choice_joint_adapter.py
            │   │       │   ├── executor.py
            │   │       │   ├── metrics
            │   │       │   │   └── basic_metrics.py
            │   │       │   ├── run.py
            │   │       │   ├── run_expander.py
            │   │       │   ├── run_specs.py
            │   │       │   ├── scenarios
            │   │       │   │   ├── addsub_scenario.py
            │   │       │   │   ├── aqua_scenario.py
            │   │       │   │   ├── big_bench_hard_scenario.py
            │   │       │   │   ├── coin_scenario.py
            │   │       │   │   ├── commonsense_qa_scenario.py
            │   │       │   │   ├── gsm_scenario.py
            │   │       │   │   ├── letter_scenario.py
            │   │       │   │   ├── multi_arith_scenario.py
            │   │       │   │   ├── newsqa_scenario.py
            │   │       │   │   ├── singleeq_scenario.py
            │   │       │   │   └── svamp_scenario.py
            │   │       │   └── window_services
            │   │       │       ├── llama_2_window_service.py
            │   │       │       ├── llama_window_service.py
            │   │       │       └── window_service_factory.py
            │   │       ├── common
            │   │       │   ├── general.py
            │   │       │   └── request.py
            │   │       └── proxy
            │   │           ├── clients
            │   │           │   ├── auto_client.py
            │   │           │   ├── client.py
            │   │           │   ├── huggingface_tokenizer.py
            │   │           │   ├── local_client.py
            │   │           │   ├── openai_automatic_prompt_tuning.py
            │   │           │   ├── openai_client.py
            │   │           │   └── together_client.py
            │   │           ├── models.py
            │   │           └── services
            │   │               ├── server_service.py
            │   │               └── service.py
            │   └── update_helm.sh
            └── serve
                ├── README.md
                ├── custom_handler
                │   ├── llama-2-13b-chat-handler.py
                │   ├── llama-2-70b-chat-handler.py
                │   ├── llama-2-7b-chat-handler.py
                │   └── vicuna-13b-handler.py
                └── model_store
                    ├── config.properties
                    ├── llama-2-13b-chat-config.yaml
                    ├── llama-2-70b-chat-config.yaml
                    ├── llama-2-7b-chat-config.yaml
                    ├── requirements.txt
                    └── vicuna-13b-config.yaml
/.gitignore:
--------------------------------------------------------------------------------
1 | prod_env/
2 | restricted/
3 | *venv/
4 | _latest/
5 | benchmark_output/
6 | __pycache__
7 | *.egg-info
8 | .mypy_cache
9 | pip-wheel-metadata/
10 | .DS_Store
11 | .idea
12 | .vscode
13 | *.swp
14 | .nfs*
15 | .sif
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/agentinstruct/reasoning/helm"]
2 | path = src/agentinstruct/reasoning/helm
3 | url = https://github.com/stanford-crfm/helm.git
4 |
--------------------------------------------------------------------------------
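Note: the HELM fork referenced above is vendored as a git submodule at src/agentinstruct/reasoning/helm, so a fresh clone should initialize it before running the scripts below. A minimal sketch using standard git commands (the repository URL placeholder is not part of this dump):

    git clone --recurse-submodules <repo-url>
    # or, inside an existing clone:
    git submodule update --init src/agentinstruct/reasoning/helm
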
/requirements.txt:
--------------------------------------------------------------------------------
1 | backoff==2.2.1
2 | simple_slurm==0.2.6
3 | langchain==0.0.325
4 | pydantic==1.10.*
5 | pysqlite3==0.5.2
6 | pysqlite3-binary==0.5.2
7 | chromadb==0.4.15
8 | datasets==2.14.*
--------------------------------------------------------------------------------
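Note: these are the top-level Python dependencies pinned for the pipeline. A minimal setup sketch, assuming a virtual environment is already active:

    pip install -r requirements.txt
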
/scripts/generate_agent_instructions.sh:
--------------------------------------------------------------------------------
 1 | # Usage: generate_agent_instructions.sh <run-spec conf> <suite name>
 2 | if [ -d "benchmark_output/runs/$2" ]; then
 3 |     echo "Directory already exists: benchmark_output/runs/$2"
 4 |     exit 1
 5 | fi
 6 |
 7 | # Dry-run HELM on a few instances to materialize prompts, generate instructions with the agent pipeline, then remove the dry-run outputs.
 8 | helm-run --conf-paths $1 --suite $2 --max-eval-instances 5 --skip-expander --dry-run
 9 | python src/agentinstruct/agent/agent_pipeline.py --benchmark_output_dir benchmark_output/runs/$2
10 | rm -rf benchmark_output/runs/$2
--------------------------------------------------------------------------------
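Note: the script takes a run-spec conf path as $1 and a suite name as $2; it dry-runs HELM on five instances per scenario, hands the output to the agent pipeline, and then deletes the dry-run directory. A sketch of an invocation, using the bundled simple conf and a made-up suite name:

    bash scripts/generate_agent_instructions.sh scripts/run_specs/simple-llama-2-7b-chat.conf my-instructions
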
/scripts/gpt-3.5-turbo.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/gpt-3.5-turbo-agentinstruct.conf gpt-3.5-turbo-agentinstruct 1000 2
3 | python src/agentinstruct/eval/format_results.py --suite gpt-3.5-turbo-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-13b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-13b-chat-agentinstruct.conf llama-2-13b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-13b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-70b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-70b-chat-agentinstruct.conf llama-2-70b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-70b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-7b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf llama-2-7b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-7b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/replicate.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # Replace the instructions/_latest symlink with a link to instructions/main.
4 | try:
5 |     os.remove(os.path.join(os.getcwd(), 'instructions/_latest'))
6 | except OSError:
7 |     pass
8 | os.symlink(os.path.join(os.getcwd(), 'instructions/main'), os.path.join(os.getcwd(), 'instructions/_latest'))
--------------------------------------------------------------------------------
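Note: replicate.py only repoints the instructions/_latest symlink at the shipped instructions/main directory, presumably so the replication scripts reuse the pre-generated instructions rather than generating new ones. For illustration, a rough shell equivalent would be:

    ln -sfn "$(pwd)/instructions/main" "$(pwd)/instructions/_latest"
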
/scripts/run.sh:
--------------------------------------------------------------------------------
1 | # Full AgentInstruct pipeline: generate instructions, run reasoning, format results.
2 | ./scripts/generate_agent_instructions.sh $1 $2
3 | ./scripts/run_reasoning.sh $1 $2 $3 $4 $5
4 | python src/agentinstruct/eval/format_results.py --suite $2
--------------------------------------------------------------------------------
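Note: run.sh chains the three stages (instruction generation, reasoning run, result formatting) for one conf/suite pair; $3 is the eval-instance cap, $4 the thread count, and $5 an optional flag name forwarded to helm-run as --$5. A hedged end-to-end example, mirroring the per-model scripts but with a hypothetical suite name:

    bash scripts/run.sh scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf my-suite 1000 8
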
/scripts/run_reasoning.sh:
--------------------------------------------------------------------------------
 1 | # Usage: run_reasoning.sh <run-spec conf> <suite name> <max eval instances> [threads] [extra helm-run flag]
 2 | if [ $# -ge 4 ]; then
 3 |     THREADS=$4
 4 | else
 5 |     THREADS=8
 6 | fi
 7 |
 8 | # An optional fifth argument is forwarded to helm-run as a flag ("$5" becomes "--$5").
 9 | if [ "$5" ]; then
10 |     PLACEHOLDER="--$5"
11 | fi
12 |
13 | helm-run --conf-paths $1 --suite $2 --max-eval-instances $3 -n $THREADS $PLACEHOLDER
--------------------------------------------------------------------------------
/scripts/run_specs/agentinstruct/llama-2-13b-chat-agentinstruct.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1}
55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/agentinstruct/llama-2-70b-chat-agentinstruct.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1}
55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1}
55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/agentinstruct/vicuna-13b-agentinstruct.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1}
55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/simple-gpt-3.5-turbo.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | {description: "addsub:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,instructions=agentinstruct", priority: 1}
3 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/simple-llama-2-7b-chat.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1}
3 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshot/gpt-3.5-turbo-zeroshot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=all", priority: 1}
13 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=male", priority: 1}
14 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=female", priority: 1}
15 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=black", priority: 1}
20 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,track=regular,valid_topk=30", priority: 1}
55 | {description: "msmarco:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,track=trec,valid_topk=30", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
78 | {description: "raft:subset=banking_77,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
81 | {description: "raft:subset=overruling,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshot/llama-2-13b-chat-zeroshot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1}
55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshot/llama-2-70b-chat-zeroshot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1}
55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshot/llama-2-7b-chat-zeroshot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1}
55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshot/vicuna-13b-zeroshot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30", priority: 1}
55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0", priority: 1}
81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshotcot/llama-2-13b-chat-zeroshotcot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1}
55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshotcot/llama-2-70b-chat-zeroshotcot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1}
55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshotcot/llama-2-7b-chat-zeroshotcot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1}
55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
/scripts/run_specs/zeroshotcot/vicuna-13b-zeroshotcot.conf:
--------------------------------------------------------------------------------
1 | entries: [
2 | # AddSub
3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
4 |
5 | # AQuA
6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
7 |
8 | # BoolQ
9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
10 |
11 | # CivilComments
12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1}
13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1}
14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1}
15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1}
16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1}
17 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1}
18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1}
19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1}
20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1}
21 |
22 | # CNN/Daily Mail
23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
24 |
25 | # Coin Flip
26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
27 |
28 | # CommonsenseQA
29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
30 |
31 | # Date Understanding
32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1}
33 |
34 | # GSM8K
35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
36 |
37 | # HellaSwag
38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1}
39 |
40 | # IMDB
41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
42 |
43 | # Last Letter Concatenation
44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
45 |
46 | # MMLU
47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1}
48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1}
49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1}
50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1}
51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1}
52 |
53 | # MS MARCO
54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1}
55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1}
56 |
57 | # MultiArith
58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
59 |
60 | # NarrativeQA
61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
62 |
63 | # NaturalQA
64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1}
65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1}
66 |
67 | # NewsQA
68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
69 |
70 | # OpenbookQA
71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
72 |
73 | # QuAC
74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
75 |
76 | # RAFT
77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
88 |
89 | # Shuffled Objects
90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1}
91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1}
92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1}
93 |
94 | # SingleEq
95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
96 |
97 | # StrategyQA
98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1}
99 |
100 | # SVAMP
101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1}
102 |
103 | # TruthfulQA
104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1}
105 |
106 | # XSUM
107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1}
108 | ]
--------------------------------------------------------------------------------
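Note: the zero-shot and zero-shot-CoT run-spec conf files above are identical except that the zeroshotcot variants add an instructions=zeroshotcot field to every description. They use HELM's HOCON run-spec format, so an individual conf can be inspected outside of a full HELM run. A minimal sketch, assuming the helm submodule (and therefore its HOCON parser) is installed and reusing the same parse_hocon helper this repository already imports elsewhere; list_run_descriptions and the chosen conf path are illustrative and not part of the repository:

# Hypothetical helper: print the run descriptions declared in a run-spec conf.
from helm.common.general import parse_hocon

def list_run_descriptions(conf_path: str):
    with open(conf_path, "r") as f:
        conf = parse_hocon(f.read())
    # Each entry is a small mapping with "description" and "priority" keys.
    return [entry["description"] for entry in conf["entries"]]

if __name__ == "__main__":
    for description in list_run_descriptions("scripts/run_specs/zeroshot/vicuna-13b-zeroshot.conf"):
        print(description)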
/scripts/vicuna-13b.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/vicuna-13b-agentinstruct.conf vicuna-13b-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite vicuna-13b-agentinstruct
--------------------------------------------------------------------------------
/src/agentinstruct/agent/agent_instr_generation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import requests
4 |
5 | from langchain.utilities import BingSearchAPIWrapper
6 |
7 | from langchain.document_loaders import WebBaseLoader
8 |
9 | from langchain.text_splitter import RecursiveCharacterTextSplitter
10 | from langchain.embeddings.openai import OpenAIEmbeddings
11 | from langchain.vectorstores import Chroma
12 |
13 | from langchain.chains import RetrievalQA
14 |
15 | from langchain.chat_models import ChatOpenAI
16 | from langchain.agents import Tool
17 | from langchain.agents import AgentType
18 | from langchain.agents import initialize_agent
19 |
20 | from helm.common.general import parse_hocon
21 | from langchain.load.dump import dumps
22 |
23 | import openai
24 | from tenacity import (
25 | retry,
26 | stop_after_attempt,
27 | wait_random_exponential,
28 | )
29 |
30 | os.environ["BING_SEARCH_URL"] = "https://api.bing.microsoft.com/v7.0/search"
31 |
32 | POWERFUL_MODEL = "gpt-4-0613"
33 | MINIMAL_TEMP = 0.3
34 | ZERO_TEMP = 0.0
35 | NUM_RESULTS = 5
36 |
37 | with open('prod_env/credentials.conf', 'r') as creds:
38 | credentials = parse_hocon(creds.read())
39 |
40 |
41 | openai_api_key = credentials.as_plain_ordered_dict().get('openaiApiKey')
42 | bing_subscription_key = credentials.as_plain_ordered_dict().get('bingSubscriptionKey')
43 |
44 |
45 | llm = ChatOpenAI(model=POWERFUL_MODEL, temperature=ZERO_TEMP, openai_api_key=openai_api_key)
46 | search = BingSearchAPIWrapper(bing_subscription_key=bing_subscription_key)
47 |
48 | def get_links(search_metadata):
49 | links = []
50 | for result in search_metadata:
51 | links.append(result["link"])
52 | return links
53 |
54 | def get_instructions(dataset_phrase, num_results=5):
55 | search_metadata = search.results(dataset_phrase, num_results)
56 | print(search_metadata)
57 |
58 | old_links = get_links(search_metadata)
59 | print(old_links)
60 |
61 | links = []
62 | for link in old_links:
63 |         try:
64 |             requests.get(link, verify=True)  # keep only links that are actually reachable
65 |             links.append(link)
66 |         except requests.exceptions.RequestException:
67 |             continue
68 | print(links)
69 |
70 | website_loader = WebBaseLoader(links)
71 | data = website_loader.load()
72 | for doc in data:
73 | doc.page_content = doc.page_content
74 | doc.metadata = {"url": doc.metadata["source"], "source": doc.metadata["source"]}
75 |
76 | text_splitter = RecursiveCharacterTextSplitter()
77 | texts = text_splitter.split_documents(data)
78 | embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
79 | db = Chroma.from_documents(texts, embeddings)
80 | retriever = db.as_retriever()
81 | qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
82 | return qa, links
83 |
84 | def run_agent(dataset_phrase, instance_format, possible_outputs, onepass=False):
85 | possible_outputs_prompt = f"\nPossible outputs:\n{possible_outputs}"
86 |
87 | if onepass:
88 | out_dict = dict()
89 | out_dict["output"] = onepass_simpletips(dataset_phrase, instance_format, possible_outputs_prompt)
90 | return out_dict, None
91 |
92 | qa, links = get_instructions(dataset_phrase)
93 |
94 | tools = [
95 | Tool(
96 | name = "Ask about dataset",
97 | func=lambda x: qa({"query": x}),
98 | description="useful for when you need to ask questions to get information about the dataset"
99 | ),
100 | ]
101 | chat = ChatOpenAI(model=POWERFUL_MODEL, temperature=MINIMAL_TEMP, openai_api_key=openai_api_key)
102 | agent_chain = initialize_agent(tools, chat, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, return_intermediate_steps=True)
103 |
104 | prompt = (f"{dataset_phrase}. Use your resources to ask a series of simple questions to create instructions for the dataset. These instructions will be prepended to the prompt template during inference to help a large language model answer the prompt correctly." +
105 | " Include detailed tips on what topics to know and steps on how to answer the questions." +
106 | " For each instance, the model will apply these instructions to create an explanation that guides it towards the correct answer." +
107 | "\nPrompt Template (use for reference but no need to include in the instructions):\n"+ instance_format +
108 | possible_outputs_prompt)
109 |
110 | print("Prompt: ", prompt)
111 |
112 | return agent_chain({"input": prompt}), links
113 |
114 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
115 | def openai_generate(model, prompt, temperature=MINIMAL_TEMP):
116 | response = openai.ChatCompletion.create(
117 | model=model,
118 | temperature=temperature,
119 | messages=[
120 | {"role": "user", "content": prompt},
121 | ]
122 | )
123 | return response['choices'][0]['message']['content']
124 |
125 | def onepass_simpletips(dataset_phrase, instance_format, possible_outputs_prompt):
126 |
127 | prompt = (f"{dataset_phrase}. Create instructions for the dataset that will be prepended to the prompt template during inference to help a large language model answer the prompt correctly." +
128 | " Include detailed tips on what topics to know and steps on how to answer the questions." +
129 | " For each instance, the model will apply these instructions to create an explanation that guides it towards the correct answer." +
130 | "\nPrompt Template (use for reference but no need to include in the instructions):\n"+ instance_format +
131 | possible_outputs_prompt)
132 | return openai_generate(POWERFUL_MODEL, prompt, temperature=MINIMAL_TEMP)
133 |
134 | def generate_and_save_instructions(working_directory_name, dataset_name, dataset_phrase, instance_format, possible_outputs, sources_dict, onepass=False):
135 |
136 | out_dict, links = run_agent(dataset_phrase, instance_format, possible_outputs, onepass=onepass)
137 | input_prompt = out_dict.get("input", None)
138 | intermediate_steps = dumps(out_dict.get("intermediate_steps", None))
139 | instr = out_dict["output"][out_dict["output"].find("1."):]
140 |
141 | sources_dict[dataset_name] = {
142 | "all_links": links,
143 | "input_prompt": input_prompt,
144 | "intermediate_steps": intermediate_steps,
145 | }
146 |
147 | return instr, sources_dict
148 |
--------------------------------------------------------------------------------
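
A minimal sketch of how the helpers above chain together when called directly; the dataset phrase, instance format, and output list below are made-up examples, and normally agent_pipeline.py (next file) drives this per dataset:

    sources = {}
    instr, sources = generate_and_save_instructions(
        "instructions/demo",              # working directory name (illustrative)
        "coin",                           # dataset name (illustrative)
        "The dataset name is coin",       # dataset_phrase (illustrative)
        "Question: {question}\nAnswer:",  # instance_format (illustrative)
        "['yes', 'no']",                  # possible_outputs (illustrative)
        sources,                          # accumulates links and agent traces per dataset
        onepass=True,                     # skip the search agent; single call via onepass_simpletips
    )
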
/src/agentinstruct/agent/agent_pipeline.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import openai
4 | import argparse
5 |
6 | from utils.dataset_preprocessing import dataset_preprocessing
7 | from agent_instr_generation import generate_and_save_instructions
8 | from helm.common.general import parse_hocon
9 |
10 | with open('prod_env/credentials.conf', 'r') as creds:
11 | credentials = parse_hocon(creds.read())
12 |
13 | openai.api_key = credentials.as_plain_ordered_dict().get('openaiApiKey')
14 |
15 | __import__('pysqlite3')  # chromadb needs a newer sqlite3 than many systems ship; swap it in below
16 | import sys
17 | sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
18 |
19 | def generate_and_place_all_instr(benchmark_output_dir):
20 | suite = benchmark_output_dir.split("/")[-1]
21 | inputs_dict = {}
22 | instr_dict = {}
23 | sources_dict = {}
24 |
25 | instr_dir_path = os.path.join("instructions", suite)
26 |     os.makedirs(instr_dir_path, exist_ok=True)
27 |
28 | for dataset_dir in os.listdir(benchmark_output_dir):
29 | if os.path.isdir(os.path.join(benchmark_output_dir, dataset_dir)):
30 | scenario_state_path = os.path.join(benchmark_output_dir, dataset_dir, "scenario_state.json")
31 | if not os.path.exists(scenario_state_path):
32 | print(f"Scenario state does not exist for {dataset_dir}. Skipping.")
33 | continue
34 | dataset_name, dataset_phrase, instance_format, possible_outputs = dataset_preprocessing(scenario_state_path)
35 | inputs_dict[dataset_name] = {
36 | "dataset_phrase": dataset_phrase,
37 | "instance_format": instance_format,
38 | "possible_outputs": possible_outputs,
39 | }
40 | instr, sources_dict = generate_and_save_instructions(instr_dir_path, dataset_name, dataset_phrase, instance_format, possible_outputs, sources_dict, onepass=False)
41 | instr_dict[dataset_name] = {
42 | "instructions": instr,
43 | "task": possible_outputs
44 | }
45 |
46 | with open(os.path.join(instr_dir_path, "instructions.json"), "w") as f:
47 | json.dump(instr_dict, f, indent=4)
48 | with open(os.path.join(instr_dir_path, "inputs.json"), "w") as f:
49 | json.dump(inputs_dict, f, indent=4)
50 | with open(os.path.join(instr_dir_path, "metadata.json"), "w") as f:
51 | json.dump(sources_dict, f, indent=4)
52 | try:
53 | os.unlink(os.path.join(os.getcwd(), 'instructions/_latest'))
54 |     except OSError:
55 |         pass
56 | os.symlink(os.path.join(os.getcwd(), f'instructions/{suite}'), os.path.join(os.getcwd(), 'instructions/_latest'))
57 |
58 | if __name__ == "__main__":
59 | parser = argparse.ArgumentParser()
60 | parser.add_argument("--benchmark_output_dir", type=str)
61 | args = parser.parse_args()
62 | generate_and_place_all_instr(args.benchmark_output_dir)
63 |
--------------------------------------------------------------------------------
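
This script is presumably driven by scripts/generate_agent_instructions.sh; an equivalent direct call looks roughly like the sketch below (the suite name is a placeholder):

    # python src/agentinstruct/agent/agent_pipeline.py --benchmark_output_dir benchmark_output/runs/my-suite
    generate_and_place_all_instr("benchmark_output/runs/my-suite")  # "my-suite" is hypothetical
    # Afterwards instructions/my-suite/ holds instructions.json, inputs.json and metadata.json,
    # and instructions/_latest is re-pointed at that directory.
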
/src/agentinstruct/agent/utils/dataset_preprocessing.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import tiktoken
4 | import backoff
5 | import openai
6 |
7 | def read_scenario_state(scenario_state_path):
8 | with open(scenario_state_path, "r") as f:
9 | scenario_state = json.load(f)
10 | dataset_name = scenario_state["adapter_spec"]["prompt_list"]["dataset_name"]
11 | possible_outputs = scenario_state["adapter_spec"]["method"]
12 | test_instances = []
13 | labels = set()
14 | for state in scenario_state["request_states"]:
15 | test_instances.append(state["request"]["prompt"])
16 | labels.add(state["instance"]["references"][0]["output"]["text"])
17 | if len(labels) < len(test_instances) and possible_outputs == 'generation':
18 | possible_outputs = list(labels)
19 | return dataset_name, test_instances, possible_outputs
20 |
21 | def get_dataset_phrase(dataset_name):
22 | dataset_phrase = re.sub(r"(^(.*?):)", r"The dataset name is \1", dataset_name)
23 | if "The dataset name is" not in dataset_phrase:
24 | dataset_phrase = "The dataset name is " + dataset_phrase
25 | pattern = r"(,|:)(.*?)=(.*?)(,|$)"
26 | while re.search(pattern, dataset_phrase) is not None:
27 | dataset_phrase = re.sub(pattern, r" and the \2 is \3,", dataset_phrase)
28 | dataset_name = re.sub(r":$", "", dataset_name)
29 | dataset_phrase = re.sub(r"(,|:)$", "", dataset_phrase)
30 | return dataset_phrase
31 |
32 | def truncate_instances(instances, max_length=3600):
33 |
34 | encoding = tiktoken.get_encoding("cl100k_base")
35 | instance_num_tokens = [(instance, len(encoding.encode(instance))) for instance in instances]
36 | instance_num_tokens.sort(key=lambda x: x[1])
37 | instances_str = instance_num_tokens[0][0]
38 | num_tokens = instance_num_tokens[0][1]
39 | for instance, num_tokens_instance in instance_num_tokens[1:]:
40 | if num_tokens + num_tokens_instance <= max_length:
41 | instances_str += "\n\n" + instance
42 | num_tokens += 1 + num_tokens_instance
43 | else:
44 | break
45 | return instances_str
46 |
47 | @backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_time=60)
48 | def get_instance_format(instances):
49 |
50 | output = openai.ChatCompletion.create(
51 | model="gpt-3.5-turbo",
52 | temperature=0,
53 | messages=[
54 | {"role": "user", "content": f"Given the following instances from a dataset, please isolate the structure of each instance such that a general template is created. Do not include any specific information, just what each instance looks like before its specific information was filled in (the template should have empty brackets in the spots that are different for each instance). We will use this to write our own instances that must follow the same format. Remember to be as general as possible; there are likely some instances in the dataset that are quite different than the ones presented here.\nExample Instances:\n\n{instances}\n\nFormat:"},
55 | ],
56 | max_tokens=256,
57 | )
58 | return output["choices"][0]["message"]["content"]
59 |
60 | def get_full_instance_format(instances, verbose=False):
61 | if verbose:
62 | print("original instances: ", instances)
63 | instances = truncate_instances(instances[:5])
64 | formatted_instances = get_instance_format(instances)
65 | return formatted_instances
66 |
67 | def dataset_preprocessing(scenario_state_path):
68 | dataset_name, test_instances, possible_outputs = read_scenario_state(scenario_state_path)
69 | dataset_phrase = get_dataset_phrase(dataset_name)
70 | instance_format = get_full_instance_format(test_instances, verbose=True)
71 | return dataset_name, dataset_phrase, instance_format, possible_outputs
--------------------------------------------------------------------------------
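
As a worked example of the rewriting done by get_dataset_phrase above (the run-spec name is hypothetical), the regexes proceed as follows:

    # get_dataset_phrase("mmlu:subject=anatomy")
    #   "mmlu:subject=anatomy"
    #   -> "The dataset name is mmlu:subject=anatomy"              (prefix the dataset name)
    #   -> "The dataset name is mmlu and the subject is anatomy,"  (each key=value -> " and the <key> is <value>,")
    #   -> "The dataset name is mmlu and the subject is anatomy"   (trailing ',' or ':' stripped)
    print(get_dataset_phrase("mmlu:subject=anatomy"))
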
/src/agentinstruct/eval/format_results.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import json
4 | import pandas as pd
5 | from letter_eval import letter_eval
6 |
7 | dataset_to_metric = {
8 | 'mmlu': 'exact_match',
9 | 'civil_comments': 'quasi_prefix_exact_match',
10 | 'raft': 'quasi_exact_match',
11 | 'big_bench': 'exact_match',
12 | 'summarization_cnndm': 'rouge_2',
13 | 'summarization_xsum': 'rouge_2',
14 | 'truthful_qa': 'exact_match',
15 | 'imdb': 'quasi_exact_match',
16 | 'narrative_qa': 'f1_score',
17 | 'boolq': 'quasi_prefix_exact_match',
18 | 'quac': 'f1_score',
19 | 'aqua': 'exact_match',
20 | 'news_qa': 'f1_score',
21 | 'natural_qa': 'f1_score',
22 | 'commonsense': 'exact_match',
23 | 'truthful_qa': 'exact_match',
24 | 'msmarco': 'RR@10', #switch for trec
25 | 'gsm': 'quasi_exact_match',
26 | 'multi_arith': 'quasi_exact_match',
27 | 'svamp' : 'quasi_exact_match',
28 | 'addsub': 'quasi_exact_match',
29 | 'singleeq': 'quasi_exact_match',
30 | 'letter': 'letter_eval',
31 | 'big_bench_hard': 'quasi_exact_match',
32 | 'coin': "quasi_exact_match",
33 | 'commonsense_qa': 'exact_match',
34 | }
35 |
36 | def main(args):
37 | results = {}
38 | for run in os.listdir(os.path.join('benchmark_output/runs', args.suite)):
39 |
40 | try:
41 | if 'letter' in run:
42 | score, num_instances = letter_eval(os.path.join('benchmark_output/runs', args.suite, run))
43 | results[run] = {'score': score, 'num_instances': num_instances, 'metric': 'letter_eval'}
44 | continue
45 |
46 | with open(os.path.join('benchmark_output/runs', args.suite, run, 'stats.json'), 'r') as f:
47 | stats = json.load(f)
48 | f.close()
49 |
50 | with open(os.path.join('benchmark_output/runs', args.suite, run, 'scenario_state.json'), 'r') as f1:
51 | scenario_state = json.load(f1)
52 | f1.close()
53 |
54 |             dataset = run.split(':')[0].split(',')[0]
55 | metric = dataset_to_metric[dataset]
56 |
57 | if dataset == 'msmarco' and 'track=trec' in run:
58 | metric = 'NDCG@10'
59 |
60 | results[run] = {'score': None, 'num_instances': None, 'metric': metric}
61 |
62 | if 'civil_comments' in run:
63 | score = 0
64 | instances = 0
65 | for stat in stats:
66 | if stat['name']['name'] == metric and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']:
67 | score += stat['mean']
68 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']:
69 | instances += stat['mean']
70 | results[run]['score'] = score/2
71 | results[run]['num_instances'] = instances
72 |
73 | else:
74 |                 tmp, tmp1 = None, None
75 | for stat in stats:
76 |
77 | if stat['name']['name'] == metric and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']:
78 | results[run]['score'] = stat['mean']
79 |
80 | if stat['name']['name'] == metric and stat['name']['split'] == 'valid' and 'perturbation' not in stat['name']:
81 | tmp = stat['mean']
82 |
83 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']:
84 | results[run]['num_instances'] = stat['mean']
85 |
86 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'valid' and 'perturbation' not in stat['name']:
87 | tmp1 = stat['mean']
88 |
89 |                 if results[run]['score'] is None:
90 |                     if tmp is not None:
91 |                         results[run]['score'] = tmp
92 |                         results[run]['num_instances'] = tmp1
93 | else:
94 | print(f'Run {run} does not have a test or validation set.\n')
95 |
96 | except Exception as e:
97 |             print(f'Skipping {run}: {e}')
98 |
99 | keys = sorted(results)
100 | results = {key: results[key] for key in keys}
101 | df = pd.DataFrame.from_dict(results, columns = ['metric', 'num_instances', 'score'], orient='index')
102 | df.to_csv(f'benchmark_output/runs/{args.suite}/results.csv')
103 |
104 | if __name__ == '__main__':
105 | parser = argparse.ArgumentParser()
106 | parser.add_argument('--suite', type=str, required=True)
107 | main(parser.parse_args())
108 |
--------------------------------------------------------------------------------
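
For reference, the loop above keys off entries of roughly the following shape in each run's stats.json; the values here are illustrative, not taken from an actual run:

    # One entry from a hypothetical stats.json; the test-split branch above records "mean" as the score.
    example_stat = {
        "name": {"name": "quasi_exact_match", "split": "test"},  # no "perturbation" key present
        "mean": 0.42,                                            # made-up value
    }
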
/src/agentinstruct/eval/letter_eval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import argparse
4 | import string
5 |
6 | def letter_eval(path):
7 |
8 | def white_space_fix(text: str) -> str:
9 | return " ".join(text.split())
10 |
11 | def remove_punc(text: str) -> str:
12 | exclude = set(string.punctuation)
13 | return "".join(ch for ch in text if ch not in exclude)
14 |
15 | def lower(text: str) -> str:
16 | return text.lower()
17 |
18 | with open(os.path.join(path, "scenario_state.json"), 'r') as f:
19 | states = json.load(f)
20 |
21 | count = 0
22 |
23 | if 'agentinstruct' in states["adapter_spec"]["prompt_list"]:
24 | mode = 'agentinstruct' if states["adapter_spec"]["prompt_list"]["agentinstruct"] else 'zeroshotcot'
25 | else:
26 | mode='zeroshot'
27 |
28 | for instance in states["request_states"]:
29 | gold = instance["instance"]["references"][0]["output"]["text"]
30 | if mode == 'zeroshotcot':
31 | pred = instance["result"]["full_text"].split('Therefore, the answer is')[-1].translate({ord(c): None for c in string.whitespace})
32 | elif mode == 'agentinstruct':
33 | pred = instance["result"]["full_text"].split('Answer:')[-1].translate({ord(c): None for c in string.whitespace})
34 | else:
35 | pred = instance["result"]["completions"][0]["text"].translate({ord(c): None for c in string.whitespace})
36 |
37 | if pred and gold:
38 | if white_space_fix(remove_punc(lower(gold))) == white_space_fix(remove_punc(lower(pred)))[:2]:
39 | count += 1
40 |
41 |     total = len(states["request_states"])
42 |     return count / total, total
43 |
44 | if __name__ == '__main__':
45 | parser = argparse.ArgumentParser()
46 | parser.add_argument('--path', type=str, required=True)
47 | args = parser.parse_args()
48 | print(letter_eval(args.path))
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
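
A small illustration of the grading rule above (the prediction text is made up): whitespace and punctuation are stripped, everything is lowercased, and only the first two characters of the prediction are compared against the gold answer.

    import string

    gold = "lt"                                  # reference answer
    pred = "LT. (the last letters are l and t)"  # made-up model output
    pred = pred.translate({ord(c): None for c in string.whitespace})
    norm = lambda t: "".join(ch for ch in t.lower() if ch not in string.punctuation)
    print(norm(gold) == norm(pred)[:2])          # True -> counts toward `count` above
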
/src/agentinstruct/reasoning/helm_updates/benchmark_output/scenarios/coin/data/train:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "id": 1,
4 | "question": "A coin is heads up. Dan flips the coin. Earnest does not flip the coin. Agustin does not flip the coin. Kip does not flip the coin. Is the coin still heads up?",
5 | "answer": "no"
6 | },
7 | {
8 | "id": 2,
9 | "question": "A coin is heads up. Milford flips the coin. Kathie does not flip the coin. Cathy flips the coin. Randy does not flip the coin. Is the coin still heads up?",
10 | "answer": "yes"
11 | },
12 | {
13 | "id": 3,
14 | "question": "A coin is heads up. Donald flips the coin. Rosalind flips the coin. Madelyn flips the coin. Ida flips the coin. Is the coin still heads up?",
15 | "answer": "yes"
16 | },
17 | {
18 | "id": 4,
19 | "question": "A coin is heads up. Kristen flips the coin. Clarice does not flip the coin. Thelma flips the coin. Maurice flips the coin. Is the coin still heads up?",
20 | "answer": "no"
21 | },
22 | {
23 | "id": 5,
24 | "question": "A coin is heads up. Andy flips the coin. Clinton does not flip the coin. Hilda does not flip the coin. Katrina does not flip the coin. Is the coin still heads up?",
25 | "answer": "no"
26 | }
27 | ]
--------------------------------------------------------------------------------
/src/agentinstruct/reasoning/helm_updates/benchmark_output/scenarios/letter/data/train:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "id": 1,
4 | "question": "Take the last letters of each words in \"Phil Schmitt\" and concatenate them.",
5 | "answer": "lt"
6 | },
7 | {
8 | "id": 2,
9 | "question": "Take the last letters of each words in \"Marta Faulkner\" and concatenate them.",
10 | "answer": "ar"
11 | },
12 | {
13 | "id": 3,
14 | "question": "Take the last letters of each words in \"Eugenia Watson\" and concatenate them.",
15 | "answer": "an"
16 | },
17 | {
18 | "id": 4,
19 | "question": "Take the last letters of each words in \"Danielle Barr\" and concatenate them.",
20 | "answer": "er"
21 | },
22 | {
23 | "id": 5,
24 | "question": "Take the last letters of each words in \"Antwan Bates\" and concatenate them.",
25 | "answer": "ns"
26 | }
27 | ]
--------------------------------------------------------------------------------
/src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 | # Add any classes that need to be loaded dynamically via `create_object`.
2 |
3 | # Scenarios
4 | from .scenarios import simple_scenarios # noqa
5 | from .scenarios import mmlu_scenario # noqa
6 | from .scenarios import interactive_qa_mmlu_scenario # noqa
7 | from .scenarios import msmarco_scenario # noqa
8 | from .scenarios import commonsense_scenario # noqa
9 | from .scenarios import twitter_aae_scenario # noqa
10 | from .scenarios import real_toxicity_prompts_scenario # noqa
11 | from .scenarios import math_scenario # noqa
12 | from .scenarios import the_pile_scenario # noqa
13 | from .scenarios import truthful_qa_scenario # noqa
14 | from .scenarios import wikifact_scenario # noqa
15 | from .scenarios import synthetic_reasoning_natural_scenario # noqa
16 | from .scenarios import copyright_scenario # noqa
17 | from .scenarios import disinformation_scenario # noqa
18 | from .scenarios import boolq_scenario # noqa
19 | from .scenarios import code_scenario # noqa
20 | from .scenarios import lsat_qa_scenario # noqa
21 | from .scenarios import gsm_scenario # noqa
22 | from .scenarios import natural_qa_scenario # noqa
23 | from .scenarios import quac_scenario # noqa
24 | from .scenarios import babi_qa_scenario # noqa
25 | from .scenarios import narrativeqa_scenario # noqa
26 | from .scenarios import raft_scenario # noqa
27 | from .scenarios import numeracy_scenario # noqa
28 | from .scenarios import ice_scenario # noqa
29 | from .scenarios import summarization_scenario # noqa
30 | from .scenarios import synthetic_efficiency_scenario # noqa
31 | from .scenarios import synthetic_reasoning_scenario # noqa
32 | from .scenarios import newsqa_scenario # noqa
33 | from .scenarios import wikitext_103_scenario # noqa
34 | from .scenarios import blimp_scenario # noqa
35 | from .scenarios import imdb_scenario # noqa
36 | from .scenarios import dialogue_scenarios # noqa
37 | from .scenarios import bbq_scenario # noqa
38 | from .scenarios import bold_scenario # noqa
39 | from .scenarios import civil_comments_scenario # noqa
40 | from .scenarios import dyck_language_scenario # noqa
41 | from .scenarios import legal_support_scenario # noqa
42 | from .scenarios import lex_glue_scenario # noqa
43 | from .scenarios import lextreme_scenario # noqa
44 | from .scenarios import entity_matching_scenario # noqa
45 | from .scenarios import entity_data_imputation_scenario # noqa
46 | from .scenarios import big_bench_scenario # noqa
47 | from .scenarios import opinions_qa_scenario # noqa
48 | from .scenarios import multi_arith_scenario  # noqa
49 | from .scenarios import aqua_scenario  # noqa
50 | from .scenarios import svamp_scenario  # noqa
51 | from .scenarios import addsub_scenario  # noqa
52 | from .scenarios import singleeq_scenario  # noqa
53 | from .scenarios import coin_scenario  # noqa
54 | from .scenarios import letter_scenario  # noqa
55 | from .scenarios import big_bench_hard_scenario  # noqa
56 | from .scenarios import commonsense_qa_scenario  # noqa
57 |
58 | # Biomedical
59 | from .scenarios import covid_dialog_scenario # noqa
60 | from .scenarios import me_q_sum_scenario # noqa
61 | from .scenarios import med_dialog_scenario # noqa
62 | from .scenarios import med_mcqa_scenario # noqa
63 | from .scenarios import med_paragraph_simplification_scenario # noqa
64 | from .scenarios import med_qa_scenario # noqa
65 | from .scenarios import pubmed_qa_scenario # noqa
66 | from .scenarios import wmt_14_scenario # noqa
67 |
68 | #
69 | # Metrics
70 | from .metrics import basic_metrics # noqa
71 | from .metrics import bbq_metrics # noqa
72 | from .metrics import bias_metrics # noqa
73 | from .metrics import classification_metrics # noqa
74 | from .metrics import code_metrics # noqa
75 | from .metrics import copyright_metrics # noqa
76 | from .metrics import disinformation_metrics # noqa
77 | from .metrics import numeracy_metrics # noqa
78 | from .metrics import ranking_metrics # noqa
79 | from .metrics import summarization_metrics # noqa
80 | from .metrics import toxicity_metrics # noqa
81 | from .metrics import machine_translation_metrics # noqa
82 |
83 | # Perturbations for data augmentation
84 | from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa
85 | from .augmentations.misspelling_perturbation import MisspellingPerturbation # noqa
86 | from .augmentations.contraction_expansion_perturbation import ContractionPerturbation # noqa
87 | from .augmentations.contraction_expansion_perturbation import ExpansionPerturbation # noqa
88 | from .augmentations.typos_perturbation import TyposPerturbation # noqa
89 | from .augmentations.filler_words_perturbation import FillerWordsPerturbation # noqa
90 | from .augmentations.synonym_perturbation import SynonymPerturbation # noqa
91 | from .augmentations.contrast_sets_perturbation import ContrastSetsPerturbation # noqa
92 | from .augmentations.lowercase_perturbation import LowerCasePerturbation # noqa
93 | from .augmentations.space_perturbation import SpacePerturbation # noqa
94 | from .augmentations.mild_mix_perturbation import MildMixPerturbation # noqa
95 | from .augmentations.dialect_perturbation import DialectPerturbation # noqa
96 | from .augmentations.person_name_perturbation import PersonNamePerturbation # noqa
97 | from .augmentations.gender_perturbation import GenderPerturbation # noqa
98 |
--------------------------------------------------------------------------------
/src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapter_spec.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import List, Optional, Dict, Any
3 |
4 |
5 | @dataclass(frozen=True)
6 | class Substitution:
7 | """Represents a regular expression search/replace."""
8 |
9 | source: str
10 | target: str
11 |
12 |
13 | @dataclass(frozen=True)
14 | class AdapterSpec:
15 | """
16 | Specifies how to take a `Scenario` (a list of `Instance`s) and produce a
17 |     `ScenarioState` (set of `Request`s). Instead of having free-form prompt
18 | hacking, we try to make the process more declarative and systematic.
19 | Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
20 | """
21 |
22 | # Method of adaptation
23 | method: str = ""
24 |
25 | # Prepend all prompts with this string.
26 | # For example, it is recommended to prefix all prompts with [NLG] for UL2.
27 | global_prefix: str = ""
28 |
29 | # Prompt starts with instructions
30 | instructions: str = ""
31 |
32 | # What goes before the input
33 | input_prefix: str = "Input: "
34 |
35 | # What goes after the input
36 | input_suffix: str = "\n"
37 |
38 |     # What goes before each reference (for multiple choice)
39 | reference_prefix: str = "A. "
40 |
41 |     # What goes after each reference (for multiple choice)
42 | reference_suffix: str = "\n"
43 |
44 | # What goes before the output
45 | output_prefix: str = "Output: "
46 |
47 | # What goes after the output
48 | output_suffix: str = "\n"
49 |
50 | # What goes between instruction and in-context example blocks in the constructed prompt
51 | instance_prefix: str = "\n"
52 |
53 | # List of regular expression substitutions that we perform
54 | substitutions: List[Substitution] = field(default_factory=list, hash=False)
55 |
56 | # Maximum number of (in-context) training instances to put into the prompt
57 | max_train_instances: int = 5
58 |
59 | # Maximum number of evaluation instances. For getting valid numbers, this
60 | # should be the entire dataset; only reduce this for piloting.
61 | max_eval_instances: Optional[int] = None
62 |
63 | # Generate this many outputs (which could be realized by `num_completions`
64 | # or `top_k_per_token`).
65 | num_outputs: int = 5
66 |
67 | # Number of trials, where in each trial we choose an independent, random
68 | # set of training instances. Used to compute error bars.
69 | num_train_trials: int = 1
70 |
71 | # If true, randomly sample N training examples; if false, select N consecutive training examples
72 | sample_train: bool = True
73 |
74 | # Decoding parameters (inherited by `Request`)
75 |
76 | # Model to make the request to (need to fill in)
77 | model: str = ""
78 |
79 | # Temperature to use
80 | temperature: float = 1
81 |
82 | # Maximum number of tokens to generate
83 | max_tokens: int = 100
84 |
85 | # When to stop (set hash=False to make `AdapterSpec` hashable)
86 | stop_sequences: List[str] = field(default_factory=list, hash=False)
87 |
88 | # Random string (used concretely to bypass cache / see diverse results)
89 | random: Optional[str] = None
90 |
91 | # Prompt List (for multiple calls to chatgpt)
92 |     prompt_list: Optional[Dict[str, Any]] = None
93 |
--------------------------------------------------------------------------------
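
A minimal sketch of constructing an AdapterSpec for a zero-shot generation run; the concrete field values here are illustrative (the real specs are assembled in run_specs.py), and the class is assumed to be imported from helm.benchmark.adaptation.adapter_spec:

    spec = AdapterSpec(
        method="generation",            # adaptation method (illustrative)
        instructions="",                # nothing prepended before the examples
        input_prefix="Question: ",
        input_suffix="\n",
        output_prefix="Answer: ",
        max_train_instances=0,          # zero-shot: no in-context examples
        max_eval_instances=1000,        # illustrative cap; None evaluates everything
        model="openai/gpt-3.5-turbo",   # illustrative model identifier
        temperature=0,
        max_tokens=512,
        stop_sequences=["\n"],
    )
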
/src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional
2 | from helm.benchmark.adaptation.adapter_spec import AdapterSpec
3 |
4 | from helm.benchmark.adaptation.request_state import RequestState
5 | from helm.benchmark.scenarios.scenario import Instance
6 | from helm.benchmark.window_services.tokenizer_service import TokenizerService
7 | from helm.common.request import Request
8 | from .in_context_learning_adapter import InContextLearningAdapter
9 | from dataclasses import replace
10 |
11 |
12 | class MultipleChoiceJointAdapter(InContextLearningAdapter):
13 | """
14 | Each `Instance` in a `Scenario` looks like this:
15 |
16 |         <input> -> <reference1>
17 |                    <reference2>
18 |                    <reference3> [correct]
19 |                    <reference4>
20 | 
21 |     We can define a label (e.g., letter) for each reference:
22 | 
23 |         <instructions>
24 | 
25 |         <input>                  # train
26 |         A. <reference1>
27 |         B. <reference2>
28 |         C. <reference3>
29 |         D. <reference4>
30 |         Answer: C
31 | 
32 |         <input>                  # test
33 |         A. <reference1>
34 |         B. <reference2>
35 |         C. <reference3>
36 |         D. <reference4>
37 |         Answer:
38 | 
39 |     In general, each example is:
40 | 
41 |         <input_prefix><input><input_suffix><reference_prefixes[0]><reference><output_prefix><output>