├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── instructions └── main │ └── instructions.json ├── requirements.txt ├── scripts ├── generate_agent_instructions.sh ├── gpt-3.5-turbo.sh ├── llama-2-13b-chat.sh ├── llama-2-70b-chat.sh ├── llama-2-7b-chat.sh ├── replicate.py ├── run.sh ├── run_reasoning.sh ├── run_specs │ ├── agentinstruct │ │ ├── gpt-3.5-turbo-agentinstruct.conf │ │ ├── llama-2-13b-chat-agentinstruct.conf │ │ ├── llama-2-70b-chat-agentinstruct.conf │ │ ├── llama-2-7b-chat-agentinstruct.conf │ │ └── vicuna-13b-agentinstruct.conf │ ├── simple-gpt-3.5-turbo.conf │ ├── simple-llama-2-7b-chat.conf │ ├── zeroshot │ │ ├── gpt-3.5-turbo-zeroshot.conf │ │ ├── llama-2-13b-chat-zeroshot.conf │ │ ├── llama-2-70b-chat-zeroshot.conf │ │ ├── llama-2-7b-chat-zeroshot.conf │ │ └── vicuna-13b-zeroshot.conf │ └── zeroshotcot │ │ ├── gpt-3.5-turbo-zeroshotcot.conf │ │ ├── llama-2-13b-chat-zeroshotcot.conf │ │ ├── llama-2-70b-chat-zeroshotcot.conf │ │ ├── llama-2-7b-chat-zeroshotcot.conf │ │ └── vicuna-13b-zeroshotcot.conf └── vicuna-13b.sh └── src └── agentinstruct ├── agent ├── agent_instr_generation.py ├── agent_pipeline.py └── utils │ └── dataset_preprocessing.py ├── eval ├── format_results.py └── letter_eval.py └── reasoning ├── helm_updates ├── benchmark_output │ └── scenarios │ │ ├── coin │ │ └── data │ │ │ ├── test │ │ │ └── train │ │ └── letter │ │ └── data │ │ ├── test │ │ └── train ├── src │ └── helm │ │ ├── benchmark │ │ ├── __init__.py │ │ ├── adaptation │ │ │ ├── adapter_spec.py │ │ │ └── adapters │ │ │ │ ├── in_context_learning_adapter.py │ │ │ │ └── multiple_choice_joint_adapter.py │ │ ├── executor.py │ │ ├── metrics │ │ │ └── basic_metrics.py │ │ ├── run.py │ │ ├── run_expander.py │ │ ├── run_specs.py │ │ ├── scenarios │ │ │ ├── addsub_scenario.py │ │ │ ├── aqua_scenario.py │ │ │ ├── big_bench_hard_scenario.py │ │ │ ├── coin_scenario.py │ │ │ ├── commonsense_qa_scenario.py │ │ │ ├── gsm_scenario.py │ │ │ ├── letter_scenario.py │ │ │ ├── multi_arith_scenario.py │ │ │ ├── newsqa_scenario.py │ │ │ ├── singleeq_scenario.py │ │ │ └── svamp_scenario.py │ │ └── window_services │ │ │ ├── llama_2_window_service.py │ │ │ ├── llama_window_service.py │ │ │ └── window_service_factory.py │ │ ├── common │ │ ├── general.py │ │ └── request.py │ │ └── proxy │ │ ├── clients │ │ ├── auto_client.py │ │ ├── client.py │ │ ├── huggingface_tokenizer.py │ │ ├── local_client.py │ │ ├── openai_automatic_prompt_tuning.py │ │ ├── openai_client.py │ │ └── together_client.py │ │ ├── models.py │ │ └── services │ │ ├── server_service.py │ │ └── service.py └── update_helm.sh └── serve ├── README.md ├── custom_handler ├── llama-2-13b-chat-handler.py ├── llama-2-70b-chat-handler.py ├── llama-2-7b-chat-handler.py └── vicuna-13b-handler.py └── model_store ├── config.properties ├── llama-2-13b-chat-config.yaml ├── llama-2-70b-chat-config.yaml ├── llama-2-7b-chat-config.yaml ├── requirements.txt └── vicuna-13b-config.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | prod_env/ 2 | restricted/ 3 | *venv/ 4 | _latest/ 5 | benchmark_output/ 6 | __pycache__ 7 | *.egg-info 8 | .mypy_cache 9 | pip-wheel-metadata/ 10 | .DS_Store 11 | .idea 12 | .vscode 13 | *.swp 14 | .nfs* 15 | .sif -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/agentinstruct/reasoning/helm"] 2 | path = src/agentinstruct/reasoning/helm 3 | url = 
https://github.com/stanford-crfm/helm.git
4 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | backoff==2.2.1
2 | simple_slurm==0.2.6
3 | langchain==0.0.325
4 | pydantic==1.10.*
5 | pysqlite3==0.5.2
6 | pysqlite3-binary==0.5.2
7 | chromadb==0.4.15
8 | datasets==2.14.*
--------------------------------------------------------------------------------
/scripts/generate_agent_instructions.sh:
--------------------------------------------------------------------------------
1 | 
2 | if [ -d "benchmark_output/runs/$2" ]; then
3 |     echo "Directory already exists: benchmark_output/runs/$2"
4 |     exit 1
5 | fi
6 | 
7 | helm-run --conf-paths $1 --suite $2 --max-eval-instances 5 --skip-expander --dry-run
8 | python src/agentinstruct/agent/agent_pipeline.py --benchmark_output_dir benchmark_output/runs/$2
9 | rm -rf benchmark_output/runs/$2
--------------------------------------------------------------------------------
/scripts/gpt-3.5-turbo.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/gpt-3.5-turbo-agentinstruct.conf gpt-3.5-turbo-agentinstruct 1000 2
3 | python src/agentinstruct/eval/format_results.py --suite gpt-3.5-turbo-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-13b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-13b-chat-agentinstruct.conf llama-2-13b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-13b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-70b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-70b-chat-agentinstruct.conf llama-2-70b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-70b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/llama-2-7b-chat.sh:
--------------------------------------------------------------------------------
1 | python scripts/replicate.py
2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf llama-2-7b-chat-agentinstruct 1000 8
3 | python src/agentinstruct/eval/format_results.py --suite llama-2-7b-chat-agentinstruct
--------------------------------------------------------------------------------
/scripts/replicate.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | try:
4 |     os.remove(os.path.join(os.getcwd(), 'instructions/_latest'))
5 | except:
6 |     pass
7 | os.symlink(os.path.join(os.getcwd(), f'instructions/main'), os.path.join(os.getcwd(), 'instructions/_latest'))
--------------------------------------------------------------------------------
/scripts/run.sh:
--------------------------------------------------------------------------------
1 | ./scripts/generate_agent_instructions.sh $1 $2
2 | ./scripts/run_reasoning.sh $1 $2 $3 $4 $5
3 | python src/agentinstruct/eval/format_results.py --suite $2
4 | 
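The per-model scripts above share one pattern: refresh the instructions/_latest symlink via scripts/replicate.py, run helm-run over a run-spec conf with scripts/run_reasoning.sh, then format the results. The sketch below shows how the same pipeline could be driven end-to-end through scripts/run.sh; it is a hedged example, not a documented command: the argument roles are inferred from scripts/run.sh and scripts/run_reasoning.sh (shown below), the conf path and suite name are taken from the repo, the values 1000 and 8 are copied from scripts/llama-2-7b-chat.sh, and it assumes helm-run is already installed (e.g. from the helm submodule).

# Sketch only: $1 = run-spec conf, $2 = suite name, $3 = max eval instances,
# $4 = helm-run thread count (defaults to 8), $5 = optional flag forwarded to helm-run as --$5
bash scripts/run.sh \
    scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf \
    llama-2-7b-chat-agentinstruct \
    1000 8

Unlike run.sh, the per-model scripts (gpt-3.5-turbo.sh and llama-2-*-chat.sh) skip the instruction-generation step entirely: they appear to reuse the checked-in instructions under instructions/main by symlinking them to instructions/_latest before calling run_reasoning.sh.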
-------------------------------------------------------------------------------- /scripts/run_reasoning.sh: -------------------------------------------------------------------------------- 1 | if [ $# -ge 4 ]; then 2 | THREADS=$4 3 | else 4 | THREADS=8 5 | fi 6 | 7 | if [ "$5" ]; then 8 | PLACEHOLDER="--$5" 9 | fi 10 | 11 | helm-run --conf-paths $1 --suite $2 --max-eval-instances $3 -n $THREADS $PLACEHOLDER -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/llama-2-13b-chat-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # 
Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: 
"raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/llama-2-70b-chat-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: 
"civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 
72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/llama-2-7b-chat-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: 
"aqua:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: 
"mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: 
"big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/agentinstruct/vicuna-13b-agentinstruct.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | 
{description: "gsm:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30,instructions=agentinstruct", priority: 1} 55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30,instructions=agentinstruct", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 83 | {description: 
"raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,instructions=agentinstruct,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/simple-gpt-3.5-turbo.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | {description: "addsub:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,instructions=agentinstruct", priority: 1} 3 | ] -------------------------------------------------------------------------------- /scripts/run_specs/simple-llama-2-7b-chat.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=agentinstruct", priority: 1} 3 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/gpt-3.5-turbo-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=male", priority: 1} 14 | {description: 
"civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=openai/gpt-3.5-turbo-0301,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=openai/gpt-3.5-turbo-0301,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/llama-2-13b-chat-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=all", priority: 1} 13 | {description: 
"civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=local/llama-2-13b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/llama-2-70b-chat-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=male", priority: 1} 14 | {description: 
"civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # 
QuAC 74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/llama-2-7b-chat-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=female", priority: 1} 15 | {description: 
"civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: 
"raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshot/vicuna-13b-zeroshot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=christian", priority: 1} 17 | {description: 
"civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30", priority: 1} 55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0", priority: 1} 80 | {description: 
"raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0", priority: 1} 81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/llama-2-13b-chat-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: 
"civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/llama-2-13b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-13b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/llama-2-70b-chat-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: 
"addsub:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | 
{description: "mmlu:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/llama-2-70b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: 
"big_bench_hard:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-70b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/llama-2-7b-chat-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: 
"big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/llama-2-7b-chat,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: "commonsense:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | 
{description: "raft:subset=semiconductor_org_types,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/llama-2-7b-chat,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/run_specs/zeroshotcot/vicuna-13b-zeroshotcot.conf: -------------------------------------------------------------------------------- 1 | entries: [ 2 | # AddSub 3 | {description: "addsub:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 4 | 5 | # AQuA 6 | {description: "aqua:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 7 | 8 | # BoolQ 9 | {description: "boolq:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 10 | 11 | # CivilComments 12 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=all", priority: 1} 13 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=male", priority: 1} 14 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=female", priority: 1} 15 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=LGBTQ", priority: 1} 16 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=christian", priority: 1} 17 | {description: 
"civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=muslim", priority: 1} 18 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=other_religions", priority: 1} 19 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=black", priority: 1} 20 | {description: "civil_comments:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,demographic=white", priority: 1} 21 | 22 | # CNN/Daily Mail 23 | {description: "summarization_cnndm:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 24 | 25 | # Coin Flip 26 | {description: "coin:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 27 | 28 | # CommonsenseQA 29 | {description: "commonsense_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 30 | 31 | # Date Understanding 32 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=date_understanding", priority: 1} 33 | 34 | # GSM8K 35 | {description: "gsm:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 36 | 37 | # HellaSwag 38 | {description: "commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=hellaswag,method=multiple_choice_joint", priority: 1} 39 | 40 | # IMDB 41 | {description: "imdb:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 42 | 43 | # Last Letter Concatenation 44 | {description: "letter:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 45 | 46 | # MMLU 47 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=abstract_algebra", priority: 1} 48 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=college_chemistry", priority: 1} 49 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=computer_security", priority: 1} 50 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=econometrics", priority: 1} 51 | {description: "mmlu:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,subject=us_foreign_policy", priority: 1} 52 | 53 | # MS MARCO 54 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=regular,valid_topk=30,instructions=zeroshotcot", priority: 1} 55 | {description: "msmarco:model=local/vicuna-13b,max_train_instances=0,track=trec,valid_topk=30,instructions=zeroshotcot", priority: 1} 56 | 57 | # MultiArith 58 | {description: "multi_arith:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 59 | 60 | # NarrativeQA 61 | {description: "narrative_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 62 | 63 | # NaturalQA 64 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,mode=closedbook", priority: 1} 65 | {description: "natural_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,mode=openbook_longans", priority: 1} 66 | 67 | # NewsQA 68 | # {description: "news_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 69 | 70 | # OpenbookQA 71 | {description: 
"commonsense:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=openbookqa,method=multiple_choice_joint", priority: 1} 72 | 73 | # QuAC 74 | {description: "quac:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 75 | 76 | # RAFT 77 | {description: "raft:subset=ade_corpus_v2,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 78 | {description: "raft:subset=banking_77,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 79 | {description: "raft:subset=neurips_impact_statement_risks,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 80 | {description: "raft:subset=one_stop_english,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 81 | {description: "raft:subset=overruling,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 82 | {description: "raft:subset=semiconductor_org_types,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 83 | {description: "raft:subset=tweet_eval_hate,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 84 | {description: "raft:subset=twitter_complaints,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 85 | {description: "raft:subset=systematic_review_inclusion,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 86 | {description: "raft:subset=tai_safety_research,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 87 | {description: "raft:subset=terms_of_service,model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 88 | 89 | # Shuffled Objects 90 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_three_objects", priority: 1} 91 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_five_objects", priority: 1} 92 | {description: "big_bench_hard:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,dataset=tracking_shuffled_objects_seven_objects", priority: 1} 93 | 94 | # SingleEq 95 | {description: "singleeq:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 96 | 97 | # StrategyQA 98 | {description: "big_bench:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,task=strategyqa,subtask=", priority: 1} 99 | 100 | # SVAMP 101 | {description: "svamp:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot", priority: 1} 102 | 103 | # TruthfulQA 104 | {description: "truthful_qa:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,task=mc_single", priority: 1} 105 | 106 | # XSUM 107 | {description: "summarization_xsum_sampled:model=local/vicuna-13b,max_train_instances=0,instructions=zeroshotcot,temperature=0.3,device=cpu", priority: 1} 108 | ] -------------------------------------------------------------------------------- /scripts/vicuna-13b.sh: -------------------------------------------------------------------------------- 1 | python scripts/replicate.py 2 | bash scripts/run_reasoning.sh scripts/run_specs/agentinstruct/vicuna-13b-agentinstruct.conf vicuna-13b-agentinstruct 1000 8 3 | python src/agentinstruct/eval/format_results.py --suite vicuna-13b-agentinstruct 
-------------------------------------------------------------------------------- /src/agentinstruct/agent/agent_instr_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import requests 4 | 5 | from langchain.utilities import BingSearchAPIWrapper 6 | 7 | from langchain.document_loaders import WebBaseLoader 8 | 9 | from langchain.text_splitter import RecursiveCharacterTextSplitter 10 | from langchain.embeddings.openai import OpenAIEmbeddings 11 | from langchain.vectorstores import Chroma 12 | 13 | from langchain.chains import RetrievalQA 14 | 15 | from langchain.chat_models import ChatOpenAI 16 | from langchain.agents import Tool 17 | from langchain.agents import AgentType 18 | from langchain.agents import initialize_agent 19 | 20 | from helm.common.general import parse_hocon 21 | from langchain.load.dump import dumps 22 | 23 | import openai 24 | from tenacity import ( 25 | retry, 26 | stop_after_attempt, 27 | wait_random_exponential, 28 | ) 29 | 30 | os.environ["BING_SEARCH_URL"] = "https://api.bing.microsoft.com/v7.0/search" 31 | 32 | POWERFUL_MODEL = "gpt-4-0613" 33 | MINIMAL_TEMP = 0.3 34 | ZERO_TEMP = 0.0 35 | NUM_RESULTS = 5 36 | 37 | with open('prod_env/credentials.conf', 'r') as creds: 38 | credentials = parse_hocon(creds.read()) 39 | creds.close() 40 | 41 | openai_api_key = credentials.as_plain_ordered_dict().get('openaiApiKey') 42 | bing_subscription_key = credentials.as_plain_ordered_dict().get('bingSubscriptionKey') 43 | 44 | 45 | llm = ChatOpenAI(model=POWERFUL_MODEL, temperature=ZERO_TEMP, openai_api_key=openai_api_key) 46 | search = BingSearchAPIWrapper(bing_subscription_key=bing_subscription_key) 47 | 48 | def get_links(search_metadata): 49 | links = [] 50 | for result in search_metadata: 51 | links.append(result["link"]) 52 | return links 53 | 54 | def get_instructions(dataset_phrase, num_results=5): 55 | search_metadata = search.results(dataset_phrase, num_results) 56 | print(search_metadata) 57 | 58 | old_links = get_links(search_metadata) 59 | print(old_links) 60 | 61 | links = [] 62 | for link in old_links: 63 | try: 64 | requests.get(link, verify = True) 65 | links.append(link) 66 | except: 67 | continue 68 | print(links) 69 | 70 | website_loader = WebBaseLoader(links) 71 | data = website_loader.load() 72 | for doc in data: 73 | doc.page_content = doc.page_content 74 | doc.metadata = {"url": doc.metadata["source"], "source": doc.metadata["source"]} 75 | 76 | text_splitter = RecursiveCharacterTextSplitter() 77 | texts = text_splitter.split_documents(data) 78 | embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 79 | db = Chroma.from_documents(texts, embeddings) 80 | retriever = db.as_retriever() 81 | qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever) 82 | return qa, links 83 | 84 | def run_agent(dataset_phrase, instance_format, possible_outputs, onepass=False): 85 | possible_outputs_prompt = f"\nPossible outputs:\n{possible_outputs}" 86 | 87 | if onepass: 88 | out_dict = dict() 89 | out_dict["output"] = onepass_simpletips(dataset_phrase, instance_format, possible_outputs_prompt) 90 | return out_dict, None 91 | 92 | qa, links = get_instructions(dataset_phrase) 93 | 94 | tools = [ 95 | Tool( 96 | name = "Ask about dataset", 97 | func=lambda x: qa({"query": x}), 98 | description="useful for when you need to ask questions to get information about the dataset" 99 | ), 100 | ] 101 | chat = ChatOpenAI(model=POWERFUL_MODEL, temperature=MINIMAL_TEMP, 
openai_api_key=openai_api_key) 102 | agent_chain = initialize_agent(tools, chat, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, return_intermediate_steps=True) 103 | 104 | prompt = (f"{dataset_phrase}. Use your resources to ask a series of simple questions to create instructions for the dataset. These instructions will be prepended to the prompt template during inference to help a large language model answer the prompt correctly." + 105 | " Include detailed tips on what topics to know and steps on how to answer the questions." + 106 | " For each instance, the model will apply these instructions to create an explanation that guides it towards the correct answer." + 107 | "\nPrompt Template (use for reference but no need to include in the instructions):\n"+ instance_format + 108 | possible_outputs_prompt) 109 | 110 | print("Prompt: ", prompt) 111 | 112 | return agent_chain({"input": prompt}), links 113 | 114 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) 115 | def openai_generate(model, prompt, temperature=MINIMAL_TEMP): 116 | response = openai.ChatCompletion.create( 117 | model=model, 118 | temperature=temperature, 119 | messages=[ 120 | {"role": "user", "content": prompt}, 121 | ] 122 | ) 123 | return response['choices'][0]['message']['content'] 124 | 125 | def onepass_simpletips(dataset_phrase, instance_format, possible_outputs_prompt): 126 | 127 | prompt = (f"{dataset_phrase}. Create instructions for the dataset that will be prepended to the prompt template during inference to help a large language model answer the prompt correctly." + 128 | " Include detailed tips on what topics to know and steps on how to answer the questions." + 129 | " For each instance, the model will apply these instructions to create an explanation that guides it towards the correct answer." 
+ 130 | "\nPrompt Template (use for reference but no need to include in the instructions):\n"+ instance_format + 131 | possible_outputs_prompt) 132 | return openai_generate(POWERFUL_MODEL, prompt, temperature=MINIMAL_TEMP) 133 | 134 | def generate_and_save_instructions(working_directory_name, dataset_name, dataset_phrase, instance_format, possible_outputs, sources_dict, onepass=False): 135 | 136 | out_dict, links = run_agent(dataset_phrase, instance_format, possible_outputs, onepass=onepass) 137 | input_prompt = out_dict.get("input", None) 138 | intermediate_steps = dumps(out_dict.get("intermediate_steps", None)) 139 | instr = out_dict["output"][out_dict["output"].find("1."):] 140 | 141 | sources_dict[dataset_name] = { 142 | "all_links": links, 143 | "input_prompt": input_prompt, 144 | "intermediate_steps": intermediate_steps, 145 | } 146 | 147 | return instr, sources_dict 148 | -------------------------------------------------------------------------------- /src/agentinstruct/agent/agent_pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import openai 4 | import argparse 5 | 6 | from utils.dataset_preprocessing import dataset_preprocessing 7 | from agent_instr_generation import generate_and_save_instructions 8 | from helm.common.general import parse_hocon 9 | 10 | with open('prod_env/credentials.conf', 'r') as creds: 11 | credentials = parse_hocon(creds.read()) 12 | 13 | openai.api_key = credentials.as_plain_ordered_dict().get('openaiApiKey') 14 | 15 | __import__('pysqlite3') 16 | import sys 17 | sys.modules['sqlite3'] = sys.modules.pop('pysqlite3') 18 | 19 | def generate_and_place_all_instr(benchmark_output_dir): 20 | suite = benchmark_output_dir.split("/")[-1] 21 | inputs_dict = {} 22 | instr_dict = {} 23 | sources_dict = {} 24 | 25 | instr_dir_path = os.path.join("instructions", suite) 26 | os.makedirs(instr_dir_path, exist_ok="True") 27 | 28 | for dataset_dir in os.listdir(benchmark_output_dir): 29 | if os.path.isdir(os.path.join(benchmark_output_dir, dataset_dir)): 30 | scenario_state_path = os.path.join(benchmark_output_dir, dataset_dir, "scenario_state.json") 31 | if not os.path.exists(scenario_state_path): 32 | print(f"Scenario state does not exist for {dataset_dir}. 
Skipping.") 33 | continue 34 | dataset_name, dataset_phrase, instance_format, possible_outputs = dataset_preprocessing(scenario_state_path) 35 | inputs_dict[dataset_name] = { 36 | "dataset_phrase": dataset_phrase, 37 | "instance_format": instance_format, 38 | "possible_outputs": possible_outputs, 39 | } 40 | instr, sources_dict = generate_and_save_instructions(instr_dir_path, dataset_name, dataset_phrase, instance_format, possible_outputs, sources_dict, onepass=False) 41 | instr_dict[dataset_name] = { 42 | "instructions": instr, 43 | "task": possible_outputs 44 | } 45 | 46 | with open(os.path.join(instr_dir_path, "instructions.json"), "w") as f: 47 | json.dump(instr_dict, f, indent=4) 48 | with open(os.path.join(instr_dir_path, "inputs.json"), "w") as f: 49 | json.dump(inputs_dict, f, indent=4) 50 | with open(os.path.join(instr_dir_path, "metadata.json"), "w") as f: 51 | json.dump(sources_dict, f, indent=4) 52 | try: 53 | os.unlink(os.path.join(os.getcwd(), 'instructions/_latest')) 54 | except: 55 | pass 56 | os.symlink(os.path.join(os.getcwd(), f'instructions/{suite}'), os.path.join(os.getcwd(), 'instructions/_latest')) 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--benchmark_output_dir", type=str) 61 | args = parser.parse_args() 62 | generate_and_place_all_instr(args.benchmark_output_dir) 63 | -------------------------------------------------------------------------------- /src/agentinstruct/agent/utils/dataset_preprocessing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import tiktoken 4 | import backoff 5 | import openai 6 | 7 | def read_scenario_state(scenario_state_path): 8 | with open(scenario_state_path, "r") as f: 9 | scenario_state = json.load(f) 10 | dataset_name = scenario_state["adapter_spec"]["prompt_list"]["dataset_name"] 11 | possible_outputs = scenario_state["adapter_spec"]["method"] 12 | test_instances = [] 13 | labels = set() 14 | for state in scenario_state["request_states"]: 15 | test_instances.append(state["request"]["prompt"]) 16 | labels.add(state["instance"]["references"][0]["output"]["text"]) 17 | if len(labels) < len(test_instances) and possible_outputs == 'generation': 18 | possible_outputs = list(labels) 19 | return dataset_name, test_instances, possible_outputs 20 | 21 | def get_dataset_phrase(dataset_name): 22 | dataset_phrase = re.sub(r"(^(.*?):)", r"The dataset name is \1", dataset_name) 23 | if "The dataset name is" not in dataset_phrase: 24 | dataset_phrase = "The dataset name is " + dataset_phrase 25 | pattern = r"(,|:)(.*?)=(.*?)(,|$)" 26 | while re.search(pattern, dataset_phrase) is not None: 27 | dataset_phrase = re.sub(pattern, r" and the \2 is \3,", dataset_phrase) 28 | dataset_name = re.sub(r":$", "", dataset_name) 29 | dataset_phrase = re.sub(r"(,|:)$", "", dataset_phrase) 30 | return dataset_phrase 31 | 32 | def truncate_instances(instances, max_length=3600): 33 | 34 | encoding = tiktoken.get_encoding("cl100k_base") 35 | instance_num_tokens = [(instance, len(encoding.encode(instance))) for instance in instances] 36 | instance_num_tokens.sort(key=lambda x: x[1]) 37 | instances_str = instance_num_tokens[0][0] 38 | num_tokens = instance_num_tokens[0][1] 39 | for instance, num_tokens_instance in instance_num_tokens[1:]: 40 | if num_tokens + num_tokens_instance <= max_length: 41 | instances_str += "\n\n" + instance 42 | num_tokens += 1 + num_tokens_instance 43 | else: 44 | break 45 | return instances_str 46 | 47 | 
@backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_time=60) 48 | def get_instance_format(instances): 49 | 50 | output = openai.ChatCompletion.create( 51 | model="gpt-3.5-turbo", 52 | temperature=0, 53 | messages=[ 54 | {"role": "user", "content": f"Given the following instances from a dataset, please isolate the structure of each instance such that a general template is created. Do not include any specific information, just what each instance looks like before its specific information was filled in (the template should have empty brackets in the spots that are different for each instance). We will use this to write our own instances that must follow the same format. Remember to be as general as possible; there are likely some instances in the dataset that are quite different than the ones presented here.\nExample Instances:\n\n{instances}\n\nFormat:"}, 55 | ], 56 | max_tokens=256, 57 | ) 58 | return output["choices"][0]["message"]["content"] 59 | 60 | def get_full_instance_format(instances, verbose=False): 61 | if verbose: 62 | print("original instances: ", instances) 63 | instances = truncate_instances(instances[:5]) 64 | formatted_instances = get_instance_format(instances) 65 | return formatted_instances 66 | 67 | def dataset_preprocessing(scenario_state_path): 68 | dataset_name, test_instances, possible_outputs = read_scenario_state(scenario_state_path) 69 | dataset_phrase = get_dataset_phrase(dataset_name) 70 | instance_format = get_full_instance_format(test_instances, verbose=True) 71 | return dataset_name, dataset_phrase, instance_format, possible_outputs -------------------------------------------------------------------------------- /src/agentinstruct/eval/format_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import pandas as pd 5 | from letter_eval import letter_eval 6 | 7 | dataset_to_metric = { 8 | 'mmlu': 'exact_match', 9 | 'civil_comments': 'quasi_prefix_exact_match', 10 | 'raft': 'quasi_exact_match', 11 | 'big_bench': 'exact_match', 12 | 'summarization_cnndm': 'rouge_2', 13 | 'summarization_xsum': 'rouge_2', 14 | 'truthful_qa': 'exact_match', 15 | 'imdb': 'quasi_exact_match', 16 | 'narrative_qa': 'f1_score', 17 | 'boolq': 'quasi_prefix_exact_match', 18 | 'quac': 'f1_score', 19 | 'aqua': 'exact_match', 20 | 'news_qa': 'f1_score', 21 | 'natural_qa': 'f1_score', 22 | 'commonsense': 'exact_match', 23 | 'truthful_qa': 'exact_match', 24 | 'msmarco': 'RR@10', #switch for trec 25 | 'gsm': 'quasi_exact_match', 26 | 'multi_arith': 'quasi_exact_match', 27 | 'svamp' : 'quasi_exact_match', 28 | 'addsub': 'quasi_exact_match', 29 | 'singleeq': 'quasi_exact_match', 30 | 'letter': 'letter_eval', 31 | 'big_bench_hard': 'quasi_exact_match', 32 | 'coin': "quasi_exact_match", 33 | 'commonsense_qa': 'exact_match', 34 | } 35 | 36 | def main(args): 37 | results = {} 38 | for run in os.listdir(os.path.join('benchmark_output/runs', args.suite)): 39 | 40 | try: 41 | if 'letter' in run: 42 | score, num_instances = letter_eval(os.path.join('benchmark_output/runs', args.suite, run)) 43 | results[run] = {'score': score, 'num_instances': num_instances, 'metric': 'letter_eval'} 44 | continue 45 | 46 | with open(os.path.join('benchmark_output/runs', args.suite, run, 'stats.json'), 'r') as f: 47 | stats = json.load(f) 48 | f.close() 49 | 50 | with open(os.path.join('benchmark_output/runs', args.suite, run, 'scenario_state.json'), 'r') as f1: 51 | scenario_state = json.load(f1) 52 | f1.close() 
53 | 54 | dataset = run.split(':')[0].split(',')[0] if ',' in run.split(':')[0] else run.split(':')[0] 55 | metric = dataset_to_metric[dataset] 56 | 57 | if dataset == 'msmarco' and 'track=trec' in run: 58 | metric = 'NDCG@10' 59 | 60 | results[run] = {'score': None, 'num_instances': None, 'metric': metric} 61 | 62 | if 'civil_comments' in run: 63 | score = 0 64 | instances = 0 65 | for stat in stats: 66 | if stat['name']['name'] == metric and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 67 | score += stat['mean'] 68 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 69 | instances += stat['mean'] 70 | results[run]['score'] = score/2 71 | results[run]['num_instances'] = instances 72 | 73 | else: 74 | tmp = tmp1 = None 75 | for stat in stats: 76 | 77 | if stat['name']['name'] == metric and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 78 | results[run]['score'] = stat['mean'] 79 | 80 | if stat['name']['name'] == metric and stat['name']['split'] == 'valid' and 'perturbation' not in stat['name']: 81 | tmp = stat['mean'] 82 | 83 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'test' and 'perturbation' not in stat['name']: 84 | results[run]['num_instances'] = stat['mean'] 85 | 86 | if stat['name']['name'] == 'num_instances' and stat['name']['split'] == 'valid' and 'perturbation' not in stat['name']: 87 | tmp1 = stat['mean'] 88 | 89 | if results[run]['score'] is None: 90 | if tmp is not None: 91 | results[run]['score'] = tmp 92 | results[run]['num_instances'] = tmp1 93 | else: 94 | print(f'Run {run} does not have a test or validation set.\n') 95 | 96 | except Exception as e: 97 | print(f'Skipping {run}.') 98 | 99 | keys = sorted(results) 100 | results = {key: results[key] for key in keys} 101 | df = pd.DataFrame.from_dict(results, columns = ['metric', 'num_instances', 'score'], orient='index') 102 | df.to_csv(f'benchmark_output/runs/{args.suite}/results.csv') 103 | 104 | if __name__ == '__main__': 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('--suite', type=str, required=True) 107 | main(parser.parse_args()) 108 | -------------------------------------------------------------------------------- /src/agentinstruct/eval/letter_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import string 5 | 6 | def letter_eval(path): 7 | 8 | def white_space_fix(text: str) -> str: 9 | return " ".join(text.split()) 10 | 11 | def remove_punc(text: str) -> str: 12 | exclude = set(string.punctuation) 13 | return "".join(ch for ch in text if ch not in exclude) 14 | 15 | def lower(text: str) -> str: 16 | return text.lower() 17 | 18 | with open(os.path.join(path, "scenario_state.json"), 'r') as f: 19 | states = json.load(f) 20 | 21 | count = 0 22 | 23 | if 'agentinstruct' in states["adapter_spec"]["prompt_list"]: 24 | mode = 'agentinstruct' if states["adapter_spec"]["prompt_list"]["agentinstruct"] else 'zeroshotcot' 25 | else: 26 | mode='zeroshot' 27 | 28 | for instance in states["request_states"]: 29 | gold = instance["instance"]["references"][0]["output"]["text"] 30 | if mode == 'zeroshotcot': 31 | pred = instance["result"]["full_text"].split('Therefore, the answer is')[-1].translate({ord(c): None for c in string.whitespace}) 32 | elif mode == 'agentinstruct': 33 | pred = instance["result"]["full_text"].split('Answer:')[-1].translate({ord(c): None for c in 
string.whitespace}) 34 | else: 35 | pred = instance["result"]["completions"][0]["text"].translate({ord(c): None for c in string.whitespace}) 36 | 37 | if pred and gold: 38 | if white_space_fix(remove_punc(lower(gold))) == white_space_fix(remove_punc(lower(pred)))[:2]: 39 | count += 1 40 | 41 | l = len(states["request_states"]) 42 | return count/l, l 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--path', type=str, required=True) 47 | args = parser.parse_args() 48 | print(letter_eval(args.path)) 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/benchmark_output/scenarios/coin/data/train: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "question": "A coin is heads up. Dan flips the coin. Earnest does not flip the coin. Agustin does not flip the coin. Kip does not flip the coin. Is the coin still heads up?", 5 | "answer": "no" 6 | }, 7 | { 8 | "id": 2, 9 | "question": "A coin is heads up. Milford flips the coin. Kathie does not flip the coin. Cathy flips the coin. Randy does not flip the coin. Is the coin still heads up?", 10 | "answer": "yes" 11 | }, 12 | { 13 | "id": 3, 14 | "question": "A coin is heads up. Donald flips the coin. Rosalind flips the coin. Madelyn flips the coin. Ida flips the coin. Is the coin still heads up?", 15 | "answer": "yes" 16 | }, 17 | { 18 | "id": 4, 19 | "question": "A coin is heads up. Kristen flips the coin. Clarice does not flip the coin. Thelma flips the coin. Maurice flips the coin. Is the coin still heads up?", 20 | "answer": "no" 21 | }, 22 | { 23 | "id": 5, 24 | "question": "A coin is heads up. Andy flips the coin. Clinton does not flip the coin. Hilda does not flip the coin. Katrina does not flip the coin. Is the coin still heads up?", 25 | "answer": "no" 26 | } 27 | ] -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/benchmark_output/scenarios/letter/data/train: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "question": "Take the last letters of each words in \"Phil Schmitt\" and concatenate them.", 5 | "answer": "lt" 6 | }, 7 | { 8 | "id": 2, 9 | "question": "Take the last letters of each words in \"Marta Faulkner\" and concatenate them.", 10 | "answer": "ar" 11 | }, 12 | { 13 | "id": 3, 14 | "question": "Take the last letters of each words in \"Eugenia Watson\" and concatenate them.", 15 | "answer": "an" 16 | }, 17 | { 18 | "id": 4, 19 | "question": "Take the last letters of each words in \"Danielle Barr\" and concatenate them.", 20 | "answer": "er" 21 | }, 22 | { 23 | "id": 5, 24 | "question": "Take the last letters of each words in \"Antwan Bates\" and concatenate them.", 25 | "answer": "ns" 26 | } 27 | ] -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Add any classes that need to be loaded dynamically via `create_object`. 
2 | 3 | # Scenarios 4 | from .scenarios import simple_scenarios # noqa 5 | from .scenarios import mmlu_scenario # noqa 6 | from .scenarios import interactive_qa_mmlu_scenario # noqa 7 | from .scenarios import msmarco_scenario # noqa 8 | from .scenarios import commonsense_scenario # noqa 9 | from .scenarios import twitter_aae_scenario # noqa 10 | from .scenarios import real_toxicity_prompts_scenario # noqa 11 | from .scenarios import math_scenario # noqa 12 | from .scenarios import the_pile_scenario # noqa 13 | from .scenarios import truthful_qa_scenario # noqa 14 | from .scenarios import wikifact_scenario # noqa 15 | from .scenarios import synthetic_reasoning_natural_scenario # noqa 16 | from .scenarios import copyright_scenario # noqa 17 | from .scenarios import disinformation_scenario # noqa 18 | from .scenarios import boolq_scenario # noqa 19 | from .scenarios import code_scenario # noqa 20 | from .scenarios import lsat_qa_scenario # noqa 21 | from .scenarios import gsm_scenario # noqa 22 | from .scenarios import natural_qa_scenario # noqa 23 | from .scenarios import quac_scenario # noqa 24 | from .scenarios import babi_qa_scenario # noqa 25 | from .scenarios import narrativeqa_scenario # noqa 26 | from .scenarios import raft_scenario # noqa 27 | from .scenarios import numeracy_scenario # noqa 28 | from .scenarios import ice_scenario # noqa 29 | from .scenarios import summarization_scenario # noqa 30 | from .scenarios import synthetic_efficiency_scenario # noqa 31 | from .scenarios import synthetic_reasoning_scenario # noqa 32 | from .scenarios import newsqa_scenario # noqa 33 | from .scenarios import wikitext_103_scenario # noqa 34 | from .scenarios import blimp_scenario # noqa 35 | from .scenarios import imdb_scenario # noqa 36 | from .scenarios import dialogue_scenarios # noqa 37 | from .scenarios import bbq_scenario # noqa 38 | from .scenarios import bold_scenario # noqa 39 | from .scenarios import civil_comments_scenario # noqa 40 | from .scenarios import dyck_language_scenario # noqa 41 | from .scenarios import legal_support_scenario # noqa 42 | from .scenarios import lex_glue_scenario # noqa 43 | from .scenarios import lextreme_scenario # noqa 44 | from .scenarios import entity_matching_scenario # noqa 45 | from .scenarios import entity_data_imputation_scenario # noqa 46 | from .scenarios import big_bench_scenario # noqa 47 | from .scenarios import opinions_qa_scenario # noqa 48 | from .scenarios import multi_arith_scenario 49 | from .scenarios import aqua_scenario 50 | from .scenarios import svamp_scenario 51 | from .scenarios import addsub_scenario 52 | from .scenarios import singleeq_scenario 53 | from .scenarios import coin_scenario 54 | from .scenarios import letter_scenario 55 | from .scenarios import big_bench_hard_scenario 56 | from .scenarios import commonsense_qa_scenario 57 | 58 | # Biomedical 59 | from .scenarios import covid_dialog_scenario # noqa 60 | from .scenarios import me_q_sum_scenario # noqa 61 | from .scenarios import med_dialog_scenario # noqa 62 | from .scenarios import med_mcqa_scenario # noqa 63 | from .scenarios import med_paragraph_simplification_scenario # noqa 64 | from .scenarios import med_qa_scenario # noqa 65 | from .scenarios import pubmed_qa_scenario # noqa 66 | from .scenarios import wmt_14_scenario # noqa 67 | 68 | # 69 | # Metrics 70 | from .metrics import basic_metrics # noqa 71 | from .metrics import bbq_metrics # noqa 72 | from .metrics import bias_metrics # noqa 73 | from .metrics import classification_metrics # noqa 74 | from .metrics 
import code_metrics # noqa 75 | from .metrics import copyright_metrics # noqa 76 | from .metrics import disinformation_metrics # noqa 77 | from .metrics import numeracy_metrics # noqa 78 | from .metrics import ranking_metrics # noqa 79 | from .metrics import summarization_metrics # noqa 80 | from .metrics import toxicity_metrics # noqa 81 | from .metrics import machine_translation_metrics # noqa 82 | 83 | # Perturbations for data augmentation 84 | from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa 85 | from .augmentations.misspelling_perturbation import MisspellingPerturbation # noqa 86 | from .augmentations.contraction_expansion_perturbation import ContractionPerturbation # noqa 87 | from .augmentations.contraction_expansion_perturbation import ExpansionPerturbation # noqa 88 | from .augmentations.typos_perturbation import TyposPerturbation # noqa 89 | from .augmentations.filler_words_perturbation import FillerWordsPerturbation # noqa 90 | from .augmentations.synonym_perturbation import SynonymPerturbation # noqa 91 | from .augmentations.contrast_sets_perturbation import ContrastSetsPerturbation # noqa 92 | from .augmentations.lowercase_perturbation import LowerCasePerturbation # noqa 93 | from .augmentations.space_perturbation import SpacePerturbation # noqa 94 | from .augmentations.mild_mix_perturbation import MildMixPerturbation # noqa 95 | from .augmentations.dialect_perturbation import DialectPerturbation # noqa 96 | from .augmentations.person_name_perturbation import PersonNamePerturbation # noqa 97 | from .augmentations.gender_perturbation import GenderPerturbation # noqa 98 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapter_spec.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Dict, Any 3 | 4 | 5 | @dataclass(frozen=True) 6 | class Substitution: 7 | """Represents a regular expression search/replace.""" 8 | 9 | source: str 10 | target: str 11 | 12 | 13 | @dataclass(frozen=True) 14 | class AdapterSpec: 15 | """ 16 | Specifies how to take a `Scenario` (a list of `Instance`s) and produce a 17 | `ScenarioState` (set of `Request`s ). Instead of having free-form prompt 18 | hacking, we try to make the process more declarative and systematic. 19 | Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`). 20 | """ 21 | 22 | # Method of adaptation 23 | method: str = "" 24 | 25 | # Prepend all prompts with this string. 26 | # For example, it is recommended to prefix all prompts with [NLG] for UL2. 27 | global_prefix: str = "" 28 | 29 | # Prompt starts with instructions 30 | instructions: str = "" 31 | 32 | # What goes before the input 33 | input_prefix: str = "Input: " 34 | 35 | # What goes after the input 36 | input_suffix: str = "\n" 37 | 38 | # What goes before the input (for multiple choice) 39 | reference_prefix: str = "A. 
" 40 | 41 | # What goes before the input (for multiple choice) 42 | reference_suffix: str = "\n" 43 | 44 | # What goes before the output 45 | output_prefix: str = "Output: " 46 | 47 | # What goes after the output 48 | output_suffix: str = "\n" 49 | 50 | # What goes between instruction and in-context example blocks in the constructed prompt 51 | instance_prefix: str = "\n" 52 | 53 | # List of regular expression substitutions that we perform 54 | substitutions: List[Substitution] = field(default_factory=list, hash=False) 55 | 56 | # Maximum number of (in-context) training instances to put into the prompt 57 | max_train_instances: int = 5 58 | 59 | # Maximum number of evaluation instances. For getting valid numbers, this 60 | # should be the entire dataset; only reduce this for piloting. 61 | max_eval_instances: Optional[int] = None 62 | 63 | # Generate this many outputs (which could be realized by `num_completions` 64 | # or `top_k_per_token`). 65 | num_outputs: int = 5 66 | 67 | # Number of trials, where in each trial we choose an independent, random 68 | # set of training instances. Used to compute error bars. 69 | num_train_trials: int = 1 70 | 71 | # If true, randomly sample N training examples; if false, select N consecutive training examples 72 | sample_train: bool = True 73 | 74 | # Decoding parameters (inherited by `Request`) 75 | 76 | # Model to make the request to (need to fill in) 77 | model: str = "" 78 | 79 | # Temperature to use 80 | temperature: float = 1 81 | 82 | # Maximum number of tokens to generate 83 | max_tokens: int = 100 84 | 85 | # When to stop (set hash=False to make `AdapterSpec` hashable) 86 | stop_sequences: List[str] = field(default_factory=list, hash=False) 87 | 88 | # Random string (used concretely to bypass cache / see diverse results) 89 | random: Optional[str] = None 90 | 91 | # Prompt List (for multiple calls to chatgpt) 92 | prompt_list: Dict[str, Any] = None 93 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional 2 | from helm.benchmark.adaptation.adapter_spec import AdapterSpec 3 | 4 | from helm.benchmark.adaptation.request_state import RequestState 5 | from helm.benchmark.scenarios.scenario import Instance 6 | from helm.benchmark.window_services.tokenizer_service import TokenizerService 7 | from helm.common.request import Request 8 | from .in_context_learning_adapter import InContextLearningAdapter 9 | from dataclasses import replace 10 | 11 | 12 | class MultipleChoiceJointAdapter(InContextLearningAdapter): 13 | """ 14 | Each `Instance` in a `Scenario` looks like this: 15 | 16 | -> 17 | 18 | [correct] 19 | 20 | 21 | We can define a label (e.g., letter) for each reference: 22 | 23 | 24 | 25 | # train 26 | A. 27 | B. 28 | C. 29 | D. 30 | Answer: C 31 | 32 | # test 33 | A. 34 | B. 35 | C. 36 | D. 37 | Answer: 38 | 39 | In general, each example is: 40 | 41 | 42 | """ 43 | 44 | def __init__(self, adapter_spec: AdapterSpec, tokenizer_service: TokenizerService): 45 | super().__init__(adapter_spec, tokenizer_service) 46 | 47 | @staticmethod 48 | def get_reference_prefix(prefix: str, i: int) -> str: 49 | """ 50 | Example: prefix = "\nA. ", i = 2, return "\nC. 
" 51 | """ 52 | return prefix.replace("A", chr(ord("A") + i)) 53 | 54 | def generate_requests(self, eval_instance: Instance) -> List[RequestState]: 55 | prompt = self.construct_prompt(self.train_instances, eval_instance, include_output=False, reference_index=None) 56 | output_mapping: Dict[str, str] = dict( 57 | (self.get_reference_prefix("A", reference_index), reference.output.text) 58 | for reference_index, reference in enumerate(eval_instance.references) 59 | ) 60 | request = Request( 61 | model=self.adapter_spec.model, 62 | prompt=prompt.text, 63 | num_completions=1, 64 | top_k_per_token=self.adapter_spec.num_outputs, 65 | temperature=self.adapter_spec.temperature, # usually this is 0 66 | max_tokens=self.adapter_spec.max_tokens, # usually this is 1 67 | stop_sequences=[], 68 | random=self.adapter_spec.random, 69 | ) 70 | request_state = RequestState( 71 | instance=eval_instance, 72 | reference_index=None, 73 | request_mode=None, 74 | train_trial_index=self.train_trial_index, 75 | output_mapping=output_mapping, 76 | request=request, 77 | result=None, 78 | num_train_instances=prompt.num_train_instances, 79 | prompt_truncated=prompt.truncated, 80 | ) 81 | return [request_state] 82 | 83 | def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str: 84 | """Return a list of lines corresponding to this example (part of the prompt).""" 85 | # Input 86 | result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix 87 | 88 | # Include the references 89 | output = "n/a" 90 | for reference_index, reference in enumerate(instance.references): 91 | prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index) 92 | result += prefix + reference.output.text + self.adapter_spec.reference_suffix 93 | if reference.is_correct and output == "n/a": 94 | output = self.get_reference_prefix("A", reference_index) 95 | 96 | if include_output: 97 | result += self.adapter_spec.output_prefix + output + self.adapter_spec.output_suffix 98 | else: 99 | result += self.adapter_spec.output_prefix.rstrip() 100 | 101 | return result 102 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/executor.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Dict, Any 2 | from dataclasses import dataclass, replace 3 | 4 | from helm.common.general import parallel_map 5 | from helm.common.hierarchical_logger import htrack, hlog 6 | from helm.common.request import RequestResult, Sequence 7 | from helm.common.authentication import Authentication 8 | from helm.proxy.services.remote_service import RemoteService 9 | from helm.proxy.services.server_service import ServerService 10 | from helm.proxy.services.service import Service 11 | from helm.benchmark.adaptation.scenario_state import ScenarioState 12 | from helm.benchmark.adaptation.request_state import RequestState 13 | 14 | 15 | class ExecutorError(Exception): 16 | pass 17 | 18 | 19 | @dataclass(frozen=True) 20 | class ExecutionSpec: 21 | # If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959). 22 | url: Optional[str] 23 | 24 | # Pass into the service 25 | auth: Authentication 26 | 27 | # Path where API credentials and cache is stored. 28 | # This path is the same as `--base-path` when launching the proxy server (see server.py). 29 | # Required when url is not set. 
30 | local_path: Optional[str] 31 | 32 | # How many threads to have at once 33 | parallelism: int 34 | 35 | # Whether to skip execution 36 | dry_run: bool = False 37 | 38 | # URL to the MongoDB database. 39 | # If non-empty, the MongoDB database will be used for caching instead of SQLite. 40 | # Example format: mongodb://[username:password@]host1[:port1]/[dbname] 41 | # For full format, see: https://www.mongodb.com/docs/manual/reference/connection-string/ 42 | mongo_uri: str = "" 43 | 44 | 45 | class Executor: 46 | """ 47 | An `Executor` takes a `ScenarioState` which has a bunch of requests. 48 | Issue them to the API and return the results. 49 | """ 50 | 51 | def __init__(self, execution_spec: ExecutionSpec): 52 | self.execution_spec = execution_spec 53 | 54 | self.service: Service 55 | if execution_spec.url: 56 | hlog(f"Running using remote API proxy server: {execution_spec.url}") 57 | self.service = RemoteService(execution_spec.url) 58 | elif execution_spec.local_path: 59 | hlog(f"Running in local mode with base path: {execution_spec.local_path}") 60 | self.service = ServerService( 61 | base_path=execution_spec.local_path, root_mode=True, mongo_uri=execution_spec.mongo_uri 62 | ) 63 | else: 64 | raise ValueError("Either the proxy server URL or the local path must be set") 65 | 66 | @htrack(None) 67 | def execute(self, scenario_state: ScenarioState) -> ScenarioState: 68 | if self.execution_spec.dry_run: 69 | hlog("Skipped execution.") 70 | return scenario_state 71 | 72 | # Fill in process with prompt list (accessible from ScenarioState) so it only has one variable 73 | process = lambda x: self.process(x, scenario_state.adapter_spec.prompt_list) 74 | 75 | # Do it! 76 | request_states = parallel_map( 77 | process, #self.process, 78 | scenario_state.request_states, 79 | parallelism=self.execution_spec.parallelism, 80 | ) 81 | 82 | hlog(f"Processed {len(request_states)} requests") 83 | return ScenarioState(scenario_state.adapter_spec, request_states) 84 | 85 | def process(self, state: RequestState, prompt_list: Dict[str, Any]) -> RequestState: 86 | try: 87 | result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request, prompt_list) 88 | except Exception as e: 89 | raise ExecutorError(f"{str(e)} Request: {state.request}") from e 90 | if not result.success: 91 | if result.error_flags and not result.error_flags.is_fatal: 92 | hlog(f"WARNING: Non-fatal error treated as empty completion: {result.error}") 93 | result.completions = [Sequence(text="", logprob=0, tokens=[])] 94 | else: 95 | raise ExecutorError(f"{str(result.error)} Request: {state.request}") 96 | return replace(state, result=result) 97 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/addsub_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class AddSubScenario(Scenario): 21 | 22 | name = "addsub" 23 | description = "AddSub Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def 
delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "iIndex": 0, 48 | "sQuestion": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?", 49 | "lSolutions": [39], 50 | }, 51 | { 52 | "iIndex": 1, 53 | "sQuestion": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?", 54 | "lSolutions": [6], 55 | }, 56 | { 57 | "iIndex": 2, 58 | "sQuestion": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 59 | "lSolutions": [5], 60 | }, 61 | { 62 | "iIndex": 3, 63 | "sQuestion": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?", 64 | "lSolutions": [9], 65 | }, 66 | { 67 | "iIndex": 4, 68 | "sQuestion": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 69 | "lSolutions": [33], 70 | }, 71 | ] 72 | 73 | with open(os.path.join(data_path, "train"), "w") as f: 74 | f.write(json.dumps(train, indent=4)) 75 | 76 | data_path: str = os.path.join(self.output_path, "data") 77 | ensure_directory_exists(data_path) 78 | 79 | url: str = "https://raw.githubusercontent.com/chuanyang-Zheng/Progressive-Hint/main/dataset/AddSub/AddSub.json" 80 | test_path: str = os.path.join(data_path, "test") 81 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 82 | 83 | make_train_set(data_path) 84 | 85 | instances: List[Instance] = [] 86 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 87 | 88 | for split, filename in split_to_filename.items(): 89 | target_path: str = os.path.join(data_path, filename) 90 | 91 | with open(target_path, "r") as f: 92 | data = json.load(f) 93 | for entry in data: 94 | question = entry["sQuestion"].strip() 95 | answer = str(entry["lSolutions"][0]) 96 | if answer[-2:] == ".0": 97 | answer = answer[:-2] 98 | instance: Instance = Instance( 99 | input=Input(text=question), 100 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 101 | split=split, 102 | ) 103 | instances.append(instance) 104 | 105 | return instances 106 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/aqua_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 6 | from .scenario import ( 7 | Scenario, 8 | Instance, 9 | Reference, 10 | TRAIN_SPLIT, 11 | VALID_SPLIT, 12 | TEST_SPLIT, 13 | CORRECT_TAG, 14 | Input, 15 | Output, 16 | ) 17 | 18 | 19 | class AQuAScenario(Scenario): 20 | 21 | name = "aqua" 22 | description = "AQuA Dataset" 23 | tags = ["question_answering"] 24 | 25 | def __init__(self): 26 | super().__init__() 27 | 28 | def get_instances(self) -> List[Instance]: 29 | data_path: str = os.path.join(self.output_path, "data") 30 | 
ensure_directory_exists(data_path) 31 | 32 | instances: List[Instance] = [] 33 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", VALID_SPLIT: "dev", TEST_SPLIT: "test"} 34 | 35 | for split, filename in split_to_filename.items(): 36 | url: str = f"https://raw.githubusercontent.com/deepmind/AQuA/master/{filename}.json" 37 | target_path: str = os.path.join(data_path, filename) 38 | ensure_file_downloaded(source_url=url, target_path=target_path, unpack=False) 39 | 40 | with open(target_path, "r") as f: 41 | data_lst = list(f) 42 | 43 | for data in data_lst: 44 | entry = json.loads(data) 45 | question = entry["question"] 46 | options = entry["options"] 47 | answer = ord(entry["correct"]) - ord("A") 48 | 49 | references: List[Reference] = [] 50 | for index, option in enumerate(options): 51 | tags = [CORRECT_TAG] if index == answer else [] 52 | references.append(Reference(Output(text=option[2:]), tags=tags)) 53 | 54 | instance: Instance = Instance( 55 | input=Input(text=question), 56 | references=references, 57 | split=split, 58 | ) 59 | instances.append(instance) 60 | 61 | return instances 62 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/big_bench_hard_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TEST_SPLIT, 13 | CORRECT_TAG, 14 | PassageQuestionInput, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class BigBenchHardScenario(Scenario): 21 | 22 | name = "big_bench_hard" 23 | description = "Big-Bench-Hard Benchmark" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self, dataset: str): 27 | super().__init__() 28 | self.dataset: str = dataset 29 | 30 | def get_instances(self) -> List[Instance]: 31 | data_path: str = os.path.join(self.output_path, self.dataset) 32 | ensure_directory_exists(data_path) 33 | 34 | instances: List[Instance] = [] 35 | split_to_filename: Dict[str, str] = {TEST_SPLIT: "test"} 36 | 37 | for split, filename in split_to_filename.items(): 38 | url: str = f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{self.dataset}.json" 39 | target_path: str = os.path.join(data_path, f"{self.dataset}_{filename}") 40 | ensure_file_downloaded(source_url=url, target_path=target_path, unpack=False) 41 | 42 | with open(target_path, "r") as f: 43 | data = json.load(f) 44 | 45 | for instance in data['examples']: 46 | question = instance['input'] 47 | answer = instance['target'] 48 | 49 | references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])] 50 | instance: Instance = Instance( 51 | input=Input(text=question), 52 | references=references, 53 | split=split, 54 | ) 55 | instances.append(instance) 56 | 57 | return instances 58 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/coin_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from .scenario import ( 6 | Scenario, 7 | Instance, 8 | Reference, 9 | TRAIN_SPLIT, 10 | TEST_SPLIT, 11 | CORRECT_TAG, 12 | Input, 13 | Output, 14 | ) 15 | 
16 | 17 | class CoinScenario(Scenario): 18 | 19 | name = "coin" 20 | description = "Coin Flip Dataset" 21 | tags = ["symbolic_reasoning"] 22 | 23 | def __init__(self): 24 | super().__init__() 25 | 26 | def get_instances(self) -> List[Instance]: 27 | data_path: str = os.path.join(self.output_path, "data") 28 | 29 | instances: List[Instance] = [] 30 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 31 | 32 | for split, filename in split_to_filename.items(): 33 | target_path: str = os.path.join(data_path, filename) 34 | 35 | with open(target_path, "r") as f: 36 | data = json.load(f) 37 | for entry in data: 38 | question = entry["question"] + ' Note that "flip" here means "reverse".' 39 | answer = entry["answer"] 40 | 41 | references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])] 42 | 43 | instance: Instance = Instance( 44 | input=Input(text=question), 45 | references=references, 46 | split=split, 47 | ) 48 | instances.append(instance) 49 | 50 | return instances 51 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/commonsense_qa_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.benchmark.scenarios.scenario import Instance 6 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 7 | 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | VALID_SPLIT, 14 | TEST_SPLIT, 15 | CORRECT_TAG, 16 | Input, 17 | Output, 18 | ) 19 | 20 | class CommonsenseQAScenario(Scenario): 21 | 22 | DATASET_DOWNLOAD_URL: str = ( 23 | "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl" 24 | ) 25 | 26 | name = "commonsense_qa" 27 | description = "CommonsenseQA Dataset" 28 | tags = ["question_answering"] 29 | 30 | def __init__(self): 31 | super().__init__() 32 | 33 | def get_instances(self) -> List[Instance]: 34 | 35 | data_path: str = os.path.join(self.output_path, "data") 36 | ensure_directory_exists(data_path) 37 | instances: List[Instance] = [] 38 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", VALID_SPLIT: "dev"} 39 | 40 | for split, filename in split_to_filename.items(): 41 | url: str = f"https://s3.amazonaws.com/commensenseqa/{filename}_rand_split.jsonl" 42 | target_path: str = os.path.join(data_path, filename) 43 | ensure_file_downloaded(source_url=url, target_path=target_path, unpack=False) 44 | 45 | with open(target_path, "r") as f: 46 | data_lst = list(f) 47 | 48 | for data in data_lst: 49 | entry = json.loads(data) 50 | 51 | question = entry["question"]["stem"] 52 | choices = entry["question"]["choices"] 53 | answer = ord(entry["answerKey"]) - ord("A") 54 | 55 | references: List[Reference] = [] 56 | for index, choice in enumerate(choices): 57 | tags = [CORRECT_TAG] if index == answer else [] 58 | references.append(Reference(Output(text=choice["text"]), tags=tags)) 59 | 60 | instance: Instance = Instance( 61 | input=Input(text=question), 62 | references=references, 63 | split=split, 64 | ) 65 | instances.append(instance) 66 | 67 | return instances 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/gsm_scenario.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 
| from typing import List 4 | 5 | from helm.common.general import ensure_file_downloaded 6 | from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output 7 | 8 | 9 | class GSM8KScenario(Scenario): 10 | """Task from "Training Verifiers to Solve Math Word Problems" (Cobbe et al. 2021): https://arxiv.org/abs/2110.14168 11 | 12 | Evaluates the capacity of a model to solve grade school math problems, when prompted to include reasoning. 13 | Encourages the model to work through the problem in a step-by-step way. 14 | 15 | Example from dataset (line breaks added for readability): 16 | 17 | ``` 18 | "question": 19 | "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. 20 | How many clips did Natalia sell altogether in April and May?", 21 | "answer": 22 | "Natalia sold 48/2 = <<48/2=24>>24 clips in May.\n 23 | Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n 24 | #### 72" 25 | ``` 26 | 27 | Also, incorporates prompting methods from "Chain of Thought Prompting Elicits Reasoning in Large Language Models" 28 | (Wei et al. 2021): https://arxiv.org/abs/2201.11903 29 | 30 | For example, we use "The answer is" before the answer, and remove line breaks within the answer. 31 | """ 32 | 33 | name = "gsm" 34 | description = "Grade school math dataset with 8.5K examples (GSM8K)." 35 | tags = ["reasoning", "math"] 36 | 37 | def get_instances(self) -> List[Instance]: 38 | splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT} 39 | base_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/" 40 | instances: List[Instance] = [] 41 | 42 | for split, split_tag in splits.items(): # Iterate over the splits 43 | source_url: str = f"{base_url}/{split}.jsonl" 44 | data_path: str = os.path.join(self.output_path, f"gsm_data_{split}") 45 | ensure_file_downloaded(source_url=source_url, target_path=data_path) 46 | 47 | with jsonlines.open(data_path) as reader: 48 | for example in reader: # Each example is a dictionary with a 'question' and 'answer' key 49 | answer: str = example["answer"].split("#### ")[1] 50 | instances.append( 51 | Instance( 52 | input=Input(text=example["question"]), 53 | references=[Reference(Output(text=answer), tags=[CORRECT_TAG])], 54 | split=split_tag, # Must assign split tag to instance. 
55 | ), 56 | ) 57 | return instances 58 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/letter_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from .scenario import ( 6 | Scenario, 7 | Instance, 8 | Reference, 9 | TRAIN_SPLIT, 10 | TEST_SPLIT, 11 | CORRECT_TAG, 12 | Input, 13 | Output, 14 | ) 15 | 16 | 17 | class LetterScenario(Scenario): 18 | 19 | name = "letter" 20 | description = "Last Letter Concatenation Dataset" 21 | tags = ["symbolic_reasoning"] 22 | 23 | def __init__(self): 24 | super().__init__() 25 | 26 | def get_instances(self) -> List[Instance]: 27 | data_path: str = os.path.join(self.output_path, "data") 28 | 29 | instances: List[Instance] = [] 30 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 31 | 32 | for split, filename in split_to_filename.items(): 33 | target_path: str = os.path.join(data_path, filename) 34 | 35 | with open(target_path, "r") as f: 36 | data = json.load(f) 37 | for entry in data: 38 | question = entry["question"] 39 | answer = entry["answer"] 40 | 41 | references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])] 42 | 43 | instance: Instance = Instance( 44 | input=Input(text=question), 45 | references=references, 46 | split=split, 47 | ) 48 | instances.append(instance) 49 | 50 | return instances 51 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/multi_arith_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class MultiArithScenario(Scenario): 21 | 22 | name = "multi_arith" 23 | description = "MultiArith Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "iIndex": 0, 48 | "sQuestion": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?", 49 | "lSolutions": [39], 50 | }, 51 | { 52 | "iIndex": 1, 53 | "sQuestion": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?", 54 | "lSolutions": [6], 55 | }, 56 | { 57 | "iIndex": 2, 58 | "sQuestion": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 59 | "lSolutions": [5], 60 | }, 61 | { 62 | "iIndex": 3, 63 | "sQuestion": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. 
How many toys does he have now?", 64 | "lSolutions": [9], 65 | }, 66 | { 67 | "iIndex": 4, 68 | "sQuestion": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 69 | "lSolutions": [33], 70 | }, 71 | ] 72 | 73 | with open(os.path.join(data_path, "train"), "w") as f: 74 | f.write(json.dumps(train, indent=4)) 75 | 76 | data_path: str = os.path.join(self.output_path, "data") 77 | ensure_directory_exists(data_path) 78 | 79 | url: str = ( 80 | "https://raw.githubusercontent.com/wangxr14/Algebraic-Word-Problem-Solver/master/data/MultiArith.json" 81 | ) 82 | test_path: str = os.path.join(data_path, "test") 83 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 84 | 85 | make_train_set(data_path) 86 | 87 | instances: List[Instance] = [] 88 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 89 | 90 | for split, filename in split_to_filename.items(): 91 | target_path: str = os.path.join(data_path, filename) 92 | 93 | with open(target_path, "r") as f: 94 | data = json.load(f) 95 | for entry in data: 96 | question = entry["sQuestion"].strip() 97 | answer = str(entry["lSolutions"][0]) 98 | if answer[-2:] == ".0": 99 | answer = answer[:-2] 100 | instance: Instance = Instance( 101 | input=Input(text=question), 102 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 103 | split=split, 104 | ) 105 | instances.append(instance) 106 | 107 | return instances 108 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/newsqa_scenario.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | from typing import Dict, List, Tuple 5 | 6 | from .scenario import ( 7 | Scenario, 8 | Instance, 9 | Reference, 10 | TRAIN_SPLIT, 11 | VALID_SPLIT, 12 | CORRECT_TAG, 13 | PassageQuestionInput, 14 | Input, 15 | Output, 16 | ) 17 | 18 | 19 | class NewsQAScenario(Scenario): 20 | """ 21 | The NewsQA dataset is from the paper: 22 | https://arxiv.org/abs/1611.09830 23 | 24 | Original repository can be found at: 25 | https://github.com/Maluuba/newsqa 26 | 27 | Note: The training dataset cannot be directly shared due to copyright issues, and needs to be downloaded by 28 | following the instructions in the repo above. These instructions are duplicated here for 29 | convenience. 30 | 31 | 1. Clone the repo (https://github.com/Maluuba/newsqa) 32 | 2. Download the data from (https://msropendata.com/datasets/939b1042-6402-4697-9c15-7a28de7e1321). 33 | You need to create a login account to download this data. 34 | 3. Download the CNN stories tar file from "https://cs.nyu.edu/~kcho/DMQA/" 35 | 4. Create the conda environment using the command (conda create --name newsqa python=2.7 "pandas>=0.19.2") 36 | 5. Install the requirements (conda activate newsqa && pip install --requirement requirements.txt) 37 | 38 | This should result in the creation of the file (combined-newsqa-data-v1.json) in the repo 39 | which is used in this scenario. 40 | 41 | NewsQA is a QA dataset containing 12,744 stories, 42 | and over 119,633 question-answer pairs. There are 92549 training qa pairs, 43 | 5166 qas in the dev set, and 5126 in the test set. 
44 | Particularly, given a news article from CNN, 45 | the goal is to answer questions with answers consisting of spans of text from the corresponding articles. 46 | All of the questions and answers are written by crowdsourced human annotators. 47 | For more details, see https://arxiv.org/abs/1611.09830. 48 | 49 | More concretely, we prompt models using the following format 50 | 51 | Passage: <passage> 52 | Question: <question> 53 | Answer: 54 | 55 | Note: Some of the questions do not have an answer in the context so the 56 | model needs to answer "No Answer". While this behavior might be tricky to 57 | learn in the few-shot setting, we still include these examples in the 58 | scenario. 59 | 60 | Using an example from the training dataset, we have: 61 | 62 | ``` 63 | NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman 64 | facing the death sentence for the killing of a teen in a case dubbed 'the house of horrors.' 65 | Moninder Singh Pandher was sentenced to death by a lower court in February... 66 | Question: Who was sentenced to death in February? 67 | Answer: 68 | ``` 69 | 70 | References 71 | 72 | ``` 73 | ['Moninder Singh Pandher'] 74 | ``` 75 | """ 76 | 77 | name = "newsqa" 78 | description = "Question answering using news articles." 79 | tags = ["question_answering"] 80 | 81 | def process_example(self, sample: dict) -> Tuple[Input, List[str]]: 82 | """ 83 | Given a sample from the dataset, create the prompt and the list of 84 | correct references. 85 | """ 86 | passage = sample["text"] 87 | all_questions = sample["questions"] 88 | question = random.sample(all_questions, 1)[0] 89 | prompt = PassageQuestionInput(passage=passage, question=question["q"], separator="\n\n") 90 | 91 | # add the answer with consensus 92 | # two checks below since the key "noAnswer" is not always present in the dictionary question["consensus"], 93 | # and when it is present it is not always True 94 | answers: List[str] = [] 95 | if ("noAnswer" in question["consensus"].keys()) and (question["consensus"]["noAnswer"] is True): 96 | answers.append("No Answer") 97 | else: 98 | start_point = question["consensus"]["s"] 99 | end_point = question["consensus"]["e"] 100 | answer_text = sample["text"][start_point:end_point] 101 | answers.append(answer_text) 102 | 103 | # add the other crowdworker answers 104 | for answer in question["answers"]: 105 | if "noAnswer" in answer["sourcerAnswers"][0].keys(): 106 | answer_text = "No Answer" 107 | # add to valid set of answers if it is not already present in the list 108 | if answer_text not in answers: 109 | answers.append(answer_text) 110 | else: 111 | start_point = answer["sourcerAnswers"][0]["s"] 112 | end_point = answer["sourcerAnswers"][0]["e"] 113 | answer_text = sample["text"][start_point:end_point] 114 | if answer_text not in answers: 115 | answers.append(answer_text) 116 | return prompt, answers 117 | 118 | def cleaned_samples(self, samples: List[Dict]) -> List[Dict]: 119 | """ 120 | Given the full dataset this function only retains news articles and QAs where there is 121 | at least one question that is valid. The question is valid if all crowdworkers believe that 122 | the question is valid and that the answer is present in text. 
123 | """ 124 | clean_samples: List = [] 125 | for sample in samples: 126 | # set of valid questions in the sample 127 | valid_questions = [] 128 | for question in sample["questions"]: 129 | add_question = True 130 | if ("isQuestionBad" in question.keys()) and (question["isQuestionBad"] != 0.0): 131 | add_question = False 132 | if ("badQuestion" in question["consensus"].keys()) and (question["consensus"]["badQuestion"] is True): 133 | add_question = False 134 | if add_question is True: 135 | valid_questions.append(question) 136 | clean = len(valid_questions) >= 1 137 | sample["questions"] = valid_questions 138 | if clean is True: 139 | clean_samples.append(sample) 140 | return clean_samples 141 | 142 | def get_file_instances(self, target_file: str, splits: Dict) -> List[Instance]: 143 | """ 144 | Helper for generating instances for a split. 145 | Args: 146 | target_file (str): Data file. 147 | splits (dict): Which splits to partition the data into. 148 | Returns: 149 | List[Instance]: Instances from the file for the specified split. 150 | """ 151 | file_instances: List[Instance] = [] 152 | with open(target_file, encoding="utf-8") as f: 153 | all_samples: List[Dict] = json.load(f)["data"] 154 | 155 | clean_samples: List[Dict] = self.cleaned_samples(all_samples) 156 | for sample in clean_samples: 157 | prompt, answers = self.process_example(sample) 158 | split = "train" if sample["type"] == "train" else "valid" 159 | instance = Instance( 160 | input=prompt, 161 | references=[Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in answers], 162 | split=splits[split], 163 | ) 164 | file_instances.append(instance) 165 | return file_instances 166 | 167 | def get_instances(self) -> List[Instance]: 168 | file_path: str = os.path.join("restricted", self.name, "combined-newsqa-data-v1.json") 169 | assert os.path.exists(file_path) 170 | splits = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT} 171 | random.seed(0) # randomness needed to pick question at random 172 | instances: List[Instance] = self.get_file_instances(target_file=file_path, splits=splits) 173 | return instances 174 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/singleeq_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class SingleEqScenario(Scenario): 21 | 22 | name = "singleeq" 23 | description = "SingleEq Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "iIndex": 0, 48 | "sQuestion": "Leah had 32 chocolates and her sister had 42. 
If they ate 35, how many pieces do they have left in total?", 49 | "lSolutions": [39], 50 | }, 51 | { 52 | "iIndex": 1, 53 | "sQuestion": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?", 54 | "lSolutions": [6], 55 | }, 56 | { 57 | "iIndex": 2, 58 | "sQuestion": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", 59 | "lSolutions": [5], 60 | }, 61 | { 62 | "iIndex": 3, 63 | "sQuestion": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?", 64 | "lSolutions": [9], 65 | }, 66 | { 67 | "iIndex": 4, 68 | "sQuestion": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?", 69 | "lSolutions": [33], 70 | }, 71 | ] 72 | 73 | with open(os.path.join(data_path, "train"), "w") as f: 74 | f.write(json.dumps(train, indent=4)) 75 | 76 | data_path: str = os.path.join(self.output_path, "data") 77 | ensure_directory_exists(data_path) 78 | 79 | url: str = ( 80 | "https://raw.githubusercontent.com/chuanyang-Zheng/Progressive-Hint/main/dataset/SingleEq/SingleEq.json" 81 | ) 82 | test_path: str = os.path.join(data_path, "test") 83 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 84 | 85 | make_train_set(data_path) 86 | 87 | instances: List[Instance] = [] 88 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 89 | 90 | for split, filename in split_to_filename.items(): 91 | target_path: str = os.path.join(data_path, filename) 92 | 93 | with open(target_path, "r") as f: 94 | data = json.load(f) 95 | for entry in data: 96 | question = entry["sQuestion"].strip() 97 | answer = str(entry["lSolutions"][0]) 98 | if answer[-2:] == ".0": 99 | answer = answer[:-2] 100 | instance: Instance = Instance( 101 | input=Input(text=question), 102 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 103 | split=split, 104 | ) 105 | instances.append(instance) 106 | 107 | return instances 108 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/svamp_scenario.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List, Dict 4 | 5 | from helm.common.hierarchical_logger import hlog 6 | 7 | from helm.common.general import ensure_file_downloaded, ensure_directory_exists 8 | from .scenario import ( 9 | Scenario, 10 | Instance, 11 | Reference, 12 | TRAIN_SPLIT, 13 | TEST_SPLIT, 14 | CORRECT_TAG, 15 | Input, 16 | Output, 17 | ) 18 | 19 | 20 | class SVAMPScenario(Scenario): 21 | 22 | name = "svamp" 23 | description = "SVAMP Dataset" 24 | tags = ["question_answering"] 25 | 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def get_instances(self) -> List[Instance]: 30 | def delete_extra_zero(n): 31 | try: 32 | n = float(n) 33 | except: 34 | hlog(f"None {n}") 35 | return n 36 | if isinstance(n, int): 37 | return str(n) 38 | if isinstance(n, float): 39 | n = str(n).rstrip("0") 40 | n = int(n.rstrip(".")) if n.endswith(".") else float(n) 41 | n = str(n) 42 | return n 43 | 44 | def make_train_set(data_path: str): 45 | train = [ 46 | { 47 | "ID": "train-1", 48 | "Body": "Leah had 32 chocolates and her sister had 42. 
If they ate 35,", 49 | "Question": "how many pieces do they have left in total?", 50 | "Answer": 39, 51 | }, 52 | { 53 | "ID": "train-2", 54 | "Body": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees.", 55 | "Question": "How many trees did the grove workers plant today?", 56 | "Answer": 6, 57 | }, 58 | { 59 | "ID": "train-3", 60 | "Body": "If there are 3 cars in the parking lot and 2 more cars arrive,", 61 | "Question": "how many cars are in the parking lot?", 62 | "Answer": 5, 63 | }, 64 | { 65 | "ID": "train-4", 66 | "Body": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad.", 67 | "Question": "How many toys does he have now?", 68 | "Answer": 9, 69 | }, 70 | { 71 | "ID": "train-5", 72 | "Body": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more.", 73 | "Question": "How many golf balls did he have at the end of wednesday?", 74 | "Answer": 33, 75 | }, 76 | ] 77 | 78 | with open(os.path.join(data_path, "train"), "w") as f: 79 | f.write(json.dumps(train, indent=4)) 80 | 81 | data_path: str = os.path.join(self.output_path, "data") 82 | ensure_directory_exists(data_path) 83 | 84 | instances: List[Instance] = [] 85 | split_to_filename: Dict[str, str] = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} 86 | 87 | url: str = "https://raw.githubusercontent.com/arkilpatel/SVAMP/main/SVAMP.json" 88 | test_path: str = os.path.join(data_path, "test") 89 | ensure_file_downloaded(source_url=url, target_path=test_path, unpack=False) 90 | 91 | make_train_set(data_path) 92 | 93 | for split, filename in split_to_filename.items(): 94 | target_path: str = os.path.join(data_path, filename) 95 | 96 | with open(target_path, "r") as f: 97 | data = json.load(f) 98 | for entry in data: 99 | question = entry["Body"].strip() + " " + entry["Question"].strip() 100 | answer = str(entry["Answer"]) 101 | if answer[-2:] == ".0": 102 | answer = answer[:-2] 103 | instance: Instance = Instance( 104 | input=Input(text=question), 105 | references=[Reference(Output(text=delete_extra_zero(answer)), tags=[CORRECT_TAG])], 106 | split=split, 107 | ) 108 | instances.append(instance) 109 | 110 | return instances 111 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_2_window_service.py: -------------------------------------------------------------------------------- 1 | from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig 2 | from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService 3 | from helm.benchmark.window_services.tokenizer_service import TokenizerService 4 | 5 | 6 | class Llama2WindowService(HuggingFaceWindowService): 7 | def __init__(self, service: TokenizerService): 8 | # Tokenizer name hf-internal-testing/llama-tokenizer is taken from: 9 | # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example 10 | model_config = HuggingFaceModelConfig( 11 | namespace="hf-internal-testing", model_name="llama-tokenizer", revision=None 12 | ) 13 | super().__init__(service, model_config) 14 | 15 | @property 16 | def max_sequence_length(self) -> int: 17 | return 4000 18 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_window_service.py: 
-------------------------------------------------------------------------------- 1 | from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig 2 | from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService 3 | from helm.benchmark.window_services.tokenizer_service import TokenizerService 4 | 5 | 6 | class LlamaWindowService(HuggingFaceWindowService): 7 | def __init__(self, service: TokenizerService): 8 | # Tokenizer name hf-internal-testing/llama-tokenizer is taken from: 9 | # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example 10 | model_config = HuggingFaceModelConfig( 11 | namespace="hf-internal-testing", model_name="llama-tokenizer", revision=None 12 | ) 13 | super().__init__(service, model_config) 14 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/window_service_factory.py: -------------------------------------------------------------------------------- 1 | from helm.proxy.models import ( 2 | get_model, 3 | get_model_names_with_tag, 4 | Model, 5 | AI21_WIDER_CONTEXT_WINDOW_TAG, 6 | WIDER_CONTEXT_WINDOW_TAG, 7 | ) 8 | from .ai21_window_service import AI21WindowService 9 | from .wider_ai21_window_service import WiderAI21WindowService 10 | from .anthropic_window_service import AnthropicWindowService 11 | from .cohere_window_service import CohereWindowService, CohereCommandWindowService 12 | from .luminous_window_service import ( 13 | LuminousBaseWindowService, 14 | LuminousExtendedWindowService, 15 | LuminousSupremeWindowService, 16 | LuminousWorldWindowService, 17 | ) 18 | from .openai_window_service import OpenAIWindowService 19 | from .wider_openai_window_service import WiderOpenAIWindowService 20 | from .mt_nlg_window_service import MTNLGWindowService 21 | from .bloom_window_service import BloomWindowService 22 | from .huggingface_window_service import HuggingFaceWindowService 23 | from .ice_window_service import ICEWindowService 24 | from .santacoder_window_service import SantaCoderWindowService 25 | from .gpt2_window_service import GPT2WindowService 26 | from .gptj_window_service import GPTJWindowService 27 | from .gptneox_window_service import GPTNeoXWindowService 28 | from .opt_window_service import OPTWindowService 29 | from .t0pp_window_service import T0ppWindowService 30 | from .t511b_window_service import T511bWindowService 31 | from .flan_t5_window_service import FlanT5WindowService 32 | from .ul2_window_service import UL2WindowService 33 | from .yalm_window_service import YaLMWindowService 34 | from .window_service import WindowService 35 | from .tokenizer_service import TokenizerService 36 | from .llama_window_service import LlamaWindowService 37 | from .llama_2_window_service import Llama2WindowService 38 | from helm.proxy.clients.huggingface_client import get_huggingface_model_config 39 | 40 | 41 | class WindowServiceFactory: 42 | @staticmethod 43 | def get_window_service(model_name: str, service: TokenizerService) -> WindowService: 44 | """ 45 | Returns a `WindowService` given the name of the model. 46 | Make sure this function returns instantaneously on repeated calls. 
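For example, in this repo's setup, "local/llama-2-7b-chat" maps to `Llama2WindowService` and "local/vicuna-13b" maps to `LlamaWindowService` (see the dispatch below).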
47 | """ 48 | model: Model = get_model(model_name) 49 | organization: str = model.organization 50 | engine: str = model.engine 51 | 52 | window_service: WindowService 53 | huggingface_model_config = get_huggingface_model_config(model_name) 54 | if huggingface_model_config: 55 | window_service = HuggingFaceWindowService(service=service, model_config=huggingface_model_config) 56 | elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG): 57 | window_service = WiderOpenAIWindowService(service) 58 | # For the Google models, we approximate with the OpenAIWindowService 59 | elif organization == "openai" or organization == "simple" or organization == "google": 60 | window_service = OpenAIWindowService(service) 61 | elif organization == "AlephAlpha": 62 | if engine == "luminous-base": 63 | window_service = LuminousBaseWindowService(service) 64 | elif engine == "luminous-extended": 65 | window_service = LuminousExtendedWindowService(service) 66 | elif engine == "luminous-supreme": 67 | window_service = LuminousSupremeWindowService(service) 68 | elif engine == "luminous-world": 69 | window_service = LuminousWorldWindowService(service) 70 | else: 71 | raise ValueError(f"Unhandled Aleph Alpha model: {engine}") 72 | elif organization == "microsoft": 73 | window_service = MTNLGWindowService(service) 74 | elif organization == "anthropic": 75 | window_service = AnthropicWindowService(service) 76 | elif engine == "santacoder": 77 | window_service = SantaCoderWindowService(service) 78 | elif model_name == "huggingface/gpt2": 79 | window_service = GPT2WindowService(service) 80 | elif model_name == "together/bloom": 81 | window_service = BloomWindowService(service) 82 | elif model_name == "together/glm": 83 | # From https://github.com/THUDM/GLM-130B, "the tokenizer is implemented based on 84 | # icetk---a unified multimodal tokenizer for images, Chinese, and English." 
85 | window_service = ICEWindowService(service) 86 | elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "gooseai/gpt-j-6b"]: 87 | window_service = GPTJWindowService(service) 88 | elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b", "together/gpt-neoxt-chat-base-20b"]: 89 | window_service = GPTNeoXWindowService(service) 90 | elif model_name == "together/h3-2.7b": 91 | window_service = GPT2WindowService(service) 92 | elif model_name in ["together/opt-66b", "together/opt-175b"]: 93 | window_service = OPTWindowService(service) 94 | elif model_name == "together/t0pp": 95 | window_service = T0ppWindowService(service) 96 | elif model_name == "together/t5-11b": 97 | window_service = T511bWindowService(service) 98 | elif model_name == "together/flan-t5-xxl": 99 | window_service = FlanT5WindowService(service) 100 | elif model_name == "together/ul2": 101 | window_service = UL2WindowService(service) 102 | elif model_name == "together/yalm": 103 | window_service = YaLMWindowService(service) 104 | elif model_name == "local/vicuna-13b": 105 | window_service = LlamaWindowService(service) 106 | elif model_name in ["local/llama-2-7b-chat", "local/llama-2-13b-chat", "local/llama-2-70b-chat"]: 107 | window_service = Llama2WindowService(service) 108 | elif organization == "cohere": 109 | if "command" in engine: 110 | window_service = CohereCommandWindowService(service) 111 | else: 112 | window_service = CohereWindowService(service) 113 | elif organization == "ai21": 114 | if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG): 115 | window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service)) 116 | else: 117 | window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service)) 118 | else: 119 | raise ValueError(f"Unhandled model name: {model_name}") 120 | 121 | return window_service 122 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/common/request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Dict 3 | 4 | from helm.proxy.models import Model, get_model 5 | from .general import indent_lines, format_text 6 | 7 | 8 | @dataclass(frozen=True) 9 | class Request: 10 | """ 11 | A `Request` specifies how to query a language model (given a prompt, 12 | complete it). It is the unified representation for communicating with 13 | various APIs (e.g., GPT-3, Jurassic). 14 | """ 15 | 16 | model: str = "openai/text-davinci-002" 17 | """Which model to query""" 18 | 19 | embedding: bool = False 20 | """Whether to query embedding instead of text response""" 21 | 22 | prompt: str = "" 23 | """What prompt to condition the language model on""" 24 | 25 | temperature: float = 1.0 26 | """Temperature parameter that governs diversity""" 27 | 28 | num_completions: int = 1 29 | """Generate this many completions (by sampling from the model)""" 30 | 31 | top_k_per_token: int = 1 32 | """Take this many highest probability candidates per token in the completion""" 33 | 34 | max_tokens: int = 100 35 | """Maximum number of tokens to generate (per completion)""" 36 | 37 | stop_sequences: List[str] = field(default_factory=list) 38 | """Stop generating once we hit one of these strings.""" 39 | 40 | echo_prompt: bool = False 41 | """Should `prompt` be included as a prefix of each completion?
(e.g., for 42 | evaluating perplexity of the prompt)""" 43 | 44 | top_p: float = 1 45 | """Sample from tokens that occupy this probability mass (nucleus sampling)""" 46 | 47 | presence_penalty: float = 0 48 | """Penalize repetition (OpenAI & Writer only)""" 49 | 50 | frequency_penalty: float = 0 51 | """Penalize repetition (OpenAI & Writer only)""" 52 | 53 | random: Optional[str] = None 54 | """Used to control randomness. Expect different responses for the same 55 | request but with different values for `random`.""" 56 | 57 | messages: Optional[List[Dict[str, str]]] = None 58 | """Used for chat models. (OpenAI only for now). 59 | If messages is specified for a chat model, the prompt is ignored. 60 | Otherwise, the client should convert the prompt into a message.""" 61 | 62 | @property 63 | def model_organization(self) -> str: 64 | """Example: 'openai/davinci' => 'openai'""" 65 | model: Model = get_model(self.model) 66 | return model.organization 67 | 68 | @property 69 | def model_engine(self) -> str: 70 | """Example: 'openai/davinci' => 'davinci'""" 71 | model: Model = get_model(self.model) 72 | return model.engine 73 | 74 | 75 | @dataclass(frozen=True) 76 | class Token: 77 | """ 78 | A `Token` represents one token position in a `Sequence`, which has the 79 | chosen `text` as well as the top probabilities under the model. 80 | 81 | Note: (text, logprob) could exist or not exist in `top_logprobs`. 82 | """ 83 | 84 | # Text that was chosen 85 | text: str 86 | 87 | # Log probability of generating that 88 | logprob: float 89 | 90 | # text -> log probability of generating that 91 | top_logprobs: Dict[str, float] 92 | 93 | def render_lines(self) -> List[str]: 94 | top_logprobs_entries = sorted(self.top_logprobs.items(), key=lambda entry: -entry[1]) 95 | top_logprobs_str = ( 96 | "{" + ", ".join(f"{format_text(text)}: {logprob}" for text, logprob in top_logprobs_entries) + "}" 97 | ) 98 | return [ 99 | f"{format_text(self.text)} logprob={self.logprob} top_logprobs={top_logprobs_str}", 100 | ] 101 | 102 | 103 | @dataclass(frozen=True) 104 | class Sequence: 105 | """A `Sequence` is a sequence of tokens.""" 106 | 107 | # The concatenation of all the tokens 108 | text: str 109 | 110 | # The sum of the log probabilities of all tokens 111 | logprob: float 112 | 113 | # The tokens 114 | tokens: List[Token] 115 | 116 | # Why did the sequence finish? 117 | finish_reason: Optional[Dict] = None 118 | 119 | def __add__(self, other: "Sequence") -> "Sequence": 120 | return Sequence(self.text + other.text, self.logprob + other.logprob, self.tokens + other.tokens) 121 | 122 | def render_lines(self) -> List[str]: 123 | result = [ 124 | f"text: {self.text}", 125 | f"log_prob: {self.logprob}", 126 | "tokens {", 127 | ] 128 | for token in self.tokens: 129 | result.extend(indent_lines(token.render_lines(), 2)) 130 | result.append("}") 131 | if self.finish_reason: 132 | result.append(f"finish_reason: {self.finish_reason}") 133 | return result 134 | 135 | 136 | @dataclass(frozen=True) 137 | class ErrorFlags: 138 | """Describes how to treat errors in the request.""" 139 | 140 | is_retriable: Optional[bool] = None 141 | """Whether the request is retriable or whether the error is permanent. 142 | If None, the error is treated as retriable.""" 143 | 144 | is_fatal: Optional[bool] = None 145 | """Whether the error is fatal, i.e. the run should be discarded.
146 | If None, the error is treated as fatal. 147 | """ 148 | 149 | @dataclass(frozen=False) 150 | class RequestResult: 151 | """What comes back due to a `Request`.""" 152 | 153 | success: bool 154 | """Whether the request was successful""" 155 | 156 | embedding: List[float] 157 | """Fixed dimensional embedding corresponding to the entire prompt""" 158 | 159 | completions: List[Sequence] 160 | """List of completions""" 161 | 162 | cached: bool 163 | """Whether the request was actually cached""" 164 | 165 | request_time: Optional[float] = None 166 | """How long did the request take?""" 167 | 168 | request_datetime: Optional[int] = None 169 | """When was the request sent? 170 | We keep track of when the request was made because the underlying model or inference procedure backing the API 171 | might change over time. The integer represents the current time in seconds since the Epoch (January 1, 1970).""" 172 | 173 | error: Optional[str] = None 174 | """If `success` is false, what was the error?""" 175 | 176 | error_flags: Optional[ErrorFlags] = None 177 | """Describes how to treat errors in the request.""" 178 | 179 | batch_size: Optional[int] = None 180 | """Batch size (`TogetherClient` only)""" 181 | 182 | batch_request_time: Optional[float] = None 183 | """How long did it take to process the batch? (`TogetherClient` only)""" 184 | 185 | full_text: Optional[str] = None 186 | 187 | cot: Optional[bool] = None 188 | 189 | def render_lines(self) -> List[str]: 190 | output = [ 191 | f"success: {self.success}", 192 | f"cached: {self.cached}", 193 | ] 194 | if self.request_time: 195 | output.append(f"request_time: {self.request_time}") 196 | if self.request_datetime: 197 | output.append(f"request_datetime: {self.request_datetime}") 198 | if self.error: 199 | output.append(f"error: {self.error}") 200 | 201 | output.append("completions {") 202 | for completion in self.completions: 203 | output.extend(indent_lines(completion.render_lines())) 204 | output.append("}") 205 | 206 | return output 207 | 208 | 209 | EMBEDDING_UNAVAILABLE_REQUEST_RESULT = RequestResult( 210 | success=False, 211 | cached=False, 212 | error="Computing the embedding is unavailable in this client", 213 | completions=[], 214 | embedding=[], 215 | ) 216 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/client.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from abc import ABC, abstractmethod 4 | from typing import Callable, Any, Dict, List 5 | 6 | from helm.common.hierarchical_logger import hlog 7 | from helm.common.request import Request, RequestResult, Sequence, Token 8 | from helm.common.tokenization_request import ( 9 | TokenizationRequest, 10 | TokenizationRequestResult, 11 | DecodeRequest, 12 | DecodeRequestResult, 13 | ) 14 | 15 | 16 | class Client(ABC): 17 | @staticmethod 18 | def make_cache_key(raw_request: Dict, request: Request) -> Dict: 19 | """ 20 | Construct the key for the cache using the raw request. 21 | Add `request.random` to the key, if defined.
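Example: raw_request = {"prompt": "hi"} and request.random = "seed-1" (an illustrative value) gives {"prompt": "hi", "random": "seed-1"}; if request.random is None, the raw request is returned unchanged.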
22 | """ 23 | if request.random is not None: 24 | assert "random" not in raw_request 25 | cache_key = {**raw_request, "random": request.random} 26 | else: 27 | cache_key = raw_request 28 | return cache_key 29 | 30 | @abstractmethod 31 | def make_request(self, request: Request, prompt_list: Dict[str, Any]) -> RequestResult: 32 | pass 33 | 34 | @abstractmethod 35 | def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult: 36 | pass 37 | 38 | @abstractmethod 39 | def decode(self, request: DecodeRequest) -> DecodeRequestResult: 40 | pass 41 | 42 | 43 | def wrap_request_time(compute: Callable[[], Any]) -> Callable[[], Any]: 44 | """Return a version of `compute` that puts `request_time` into its output.""" 45 | 46 | def wrapped_compute(): 47 | start_time = time.time() 48 | response = compute() 49 | end_time = time.time() 50 | response["request_time"] = end_time - start_time 51 | response["request_datetime"] = int(start_time) 52 | return response 53 | 54 | return wrapped_compute 55 | 56 | 57 | def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool = True) -> Sequence: 58 | """ 59 | Certain providers have bugs where they aren't respecting max_tokens, 60 | stop_sequences and the end of text token, so as a hack, we have to manually 61 | truncate the suffix of `sequence` and `tokens` as a post-hoc process. 62 | """ 63 | # TODO: if echo_prompt, then we should only ignore the prompt, but we don't 64 | # know how many tokens the prompt takes up. 65 | # In the benchmark, usually echo_prompt is only used for language modeling, 66 | # where max_tokens = 0, so there's nothing to truncate. 67 | if request.echo_prompt: 68 | if request.max_tokens != 0: 69 | hlog("WARNING: don't know how to handle echo_prompt and max_tokens > 0, not truncating") 70 | return sequence 71 | 72 | for stop in request.stop_sequences: 73 | # Find `stop` in the text 74 | try: 75 | new_text = sequence.text[: sequence.text.index(stop)] 76 | except ValueError: 77 | # The stop sequence doesn't exist, but it might exist in the list of tokens. 78 | new_text = sequence.text 79 | 80 | # Strip `stop` off the tokens 81 | new_tokens: List[Token] = [] 82 | # Need to start 83 | for token in sequence.tokens: 84 | # Note: we can only strip at token boundaries 85 | if token.text.startswith(stop): 86 | break 87 | new_tokens.append(token) 88 | 89 | if len(new_text) < len(sequence.text) and len(new_tokens) == len(sequence.tokens): 90 | hlog( 91 | f"WARNING: Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), " 92 | f"but wasn't able to strip the tokens" 93 | ) 94 | 95 | # Recompute log probability 96 | new_logprob = sum(token.logprob for token in new_tokens) 97 | 98 | # if print_warning: 99 | # hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}") 100 | 101 | sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens) 102 | 103 | # Truncate based on the max number of tokens. 104 | if len(sequence.tokens) > request.max_tokens: 105 | if print_warning: 106 | hlog(f"WARNING: truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}") 107 | new_tokens = sequence.tokens[: request.max_tokens] 108 | 109 | # This is imperfect stitching together of tokens, so just to make sure this is okay 110 | # TODO: should use the proper detokenizer since T5-style models. 111 | # Usually, in our benchmark, max_tokens is active when it's 1, so hopefully this isn't an issue. 
112 | new_text = "".join(token.text for token in new_tokens) 113 | if not sequence.text.startswith(new_text): 114 | hlog(f"WARNING: {json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}") 115 | 116 | new_logprob = sum(token.logprob for token in new_tokens) 117 | 118 | sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens) 119 | 120 | return sequence 121 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/huggingface_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict, Optional, Set 3 | 4 | from transformers import AutoTokenizer 5 | 6 | from helm.common.hierarchical_logger import htrack_block, hlog 7 | 8 | from helm.proxy.clients.huggingface_model_registry import get_huggingface_model_config 9 | 10 | 11 | # Tokenizer names where the HELM tokenizer name and the Hugging Face tokenizer name 12 | # are identical. 13 | _KNOWN_TOKENIZER_NAMES: Set[str] = { 14 | "EleutherAI/gpt-j-6B", # Not a typo: Named "gpt-j-6B" instead of "gpt-j-6b" in Hugging Face 15 | "EleutherAI/gpt-neox-20b", 16 | "bigscience/bloom", 17 | "bigscience/T0pp", 18 | "facebook/opt-66b", 19 | "google/ul2", 20 | "google/flan-t5-xxl", 21 | "bigcode/santacoder", 22 | "Writer/palmyra-base", 23 | "bigcode/starcoder", 24 | "hf-internal-testing/llama-tokenizer", 25 | } 26 | 27 | 28 | # Map of HELM tokenizer name to Hugging Face tokenizer name for tokenizers where they differ. 29 | _KNOWN_TOKENIZER_ALIASES: Dict[str, str] = { 30 | "huggingface/gpt2": "gpt2", 31 | "google/t5-11b": "t5-11b", 32 | } 33 | 34 | 35 | class HuggingFaceTokenizers: 36 | 37 | tokenizers: Dict[str, Any] = {} 38 | 39 | @staticmethod 40 | def get_tokenizer(tokenizer_name: str) -> Any: 41 | """ 42 | Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached. 43 | Returns the tokenizer. 44 | """ 45 | 46 | def load_tokenizer(hf_tokenizer_name: str, revision: Optional[str] = None): 47 | """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace.""" 48 | tokenizer_kwargs = {} 49 | if revision is not None: 50 | tokenizer_kwargs["revision"] = revision 51 | try: 52 | # From the Hugging Face documentation, "local_files_only(defaults to False) — 53 | # Whether or not to only look at local files". 54 | # Running `local_files_only=False` requires an internet connection even if the files are downloaded 55 | # and cached. We need to first run with `local_files_only=True` just in case the machine 56 | # we are running this code has connection issues. If the tokenizer files are not cached, 57 | # we attempt to download them from HuggingFace. 58 | # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside 59 | # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face 60 | # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available. 61 | return AutoTokenizer.from_pretrained( 62 | hf_tokenizer_name, local_files_only=True, use_fast=True, **tokenizer_kwargs 63 | ) 64 | except OSError: 65 | hlog(f"Local files do not exist for HuggingFace tokenizer: {hf_tokenizer_name}. 
Downloading...") 66 | return AutoTokenizer.from_pretrained( 67 | hf_tokenizer_name, local_files_only=False, use_fast=True, **tokenizer_kwargs 68 | ) 69 | 70 | if tokenizer_name not in HuggingFaceTokenizers.tokenizers: 71 | with htrack_block(f"Loading {tokenizer_name} with Hugging Face Transformers"): 72 | # To avoid deadlocks when using HuggingFace tokenizers with multiple processes 73 | os.environ["TOKENIZERS_PARALLELISM"] = "False" 74 | 75 | # Weights are cached at ~/.cache/huggingface/transformers. 76 | hf_tokenizer_name: str 77 | revision: Optional[str] = None 78 | model_config = get_huggingface_model_config(tokenizer_name) 79 | if model_config: 80 | hf_tokenizer_name = model_config.model_id 81 | revision = model_config.revision 82 | elif tokenizer_name in _KNOWN_TOKENIZER_NAMES: 83 | hf_tokenizer_name = tokenizer_name 84 | elif tokenizer_name in _KNOWN_TOKENIZER_ALIASES: 85 | hf_tokenizer_name = _KNOWN_TOKENIZER_ALIASES[tokenizer_name] 86 | else: 87 | raise ValueError(f"Unsupported HuggingFace tokenizer: {tokenizer_name}") 88 | 89 | # Keep the tokenizer in memory, so we don't recreate it for future requests 90 | HuggingFaceTokenizers.tokenizers[tokenizer_name] = load_tokenizer(hf_tokenizer_name, revision) 91 | 92 | return HuggingFaceTokenizers.tokenizers[tokenizer_name] 93 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/src/helm/proxy/services/service.py: -------------------------------------------------------------------------------- 1 | import mako.template 2 | from abc import ABC, abstractmethod 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Tuple, Any 5 | 6 | from helm.common.general import parse_hocon 7 | from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult 8 | from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest 9 | from helm.common.tokenization_request import ( 10 | WindowServiceInfo, 11 | TokenizationRequest, 12 | TokenizationRequestResult, 13 | DecodeRequest, 14 | DecodeRequestResult, 15 | ) 16 | from helm.common.request import Request, RequestResult 17 | from helm.proxy.models import Model 18 | from helm.proxy.query import Query, QueryResult 19 | from helm.proxy.accounts import Authentication, Account 20 | 21 | VERSION = "1.0" 22 | CREDENTIALS_FILE = "credentials.conf" 23 | ACCOUNTS_FILE = "accounts.sqlite" 24 | CACHE_DIR = "cache" 25 | MONGO_URI = "mongo_uri" 26 | MAX_EXPANSION = 1000 27 | 28 | 29 | @dataclass(frozen=True) 30 | class GeneralInfo: 31 | version: str 32 | example_queries: List[Query] 33 | all_models: List[Model] 34 | 35 | 36 | def expand_environments(environments: Dict[str, List[str]]): 37 | """ 38 | `environments` is a map from variable names to a list of strings. 39 | Return: a list of environments, where for each variable, we choose one of its string. 
40 | """ 41 | output_environments: List[Dict[str, str]] = [] 42 | 43 | def recurse(old_items: List[Tuple[str, List[str]]], new_items: List[Tuple[str, str]]): 44 | if len(output_environments) >= MAX_EXPANSION: 45 | return 46 | if len(old_items) == 0: 47 | output_environments.append(dict(new_items)) 48 | else: 49 | item, rest_old_items = old_items[0], old_items[1:] 50 | key, list_value = item 51 | for elem_value in list_value: 52 | recurse(rest_old_items, new_items + [(key, elem_value)]) 53 | 54 | recurse(list(environments.items()), []) 55 | return output_environments 56 | 57 | 58 | def substitute_text(text: str, environment: Dict[str, str]) -> str: 59 | """ 60 | Example: 61 | text = "Hello {name}" 62 | environment = {"name": "Sue"} 63 | Return "Hello Sue" 64 | """ 65 | return mako.template.Template(text).render(**environment) 66 | 67 | 68 | def synthesize_request(prompt: str, settings: str, environment: Dict[str, str]) -> Request: 69 | """Substitute `environment` into `prompt` and `settings`.""" 70 | request: Dict[str, Any] = {} 71 | request["prompt"] = substitute_text(prompt, environment) 72 | request.update(parse_hocon(substitute_text(settings, environment))) 73 | return Request(**request) 74 | 75 | 76 | class Service(ABC): 77 | @abstractmethod 78 | def get_general_info(self) -> GeneralInfo: 79 | """Get general info.""" 80 | pass 81 | 82 | @abstractmethod 83 | def get_window_service_info(self, model_name: str) -> WindowServiceInfo: 84 | """Get window service info.""" 85 | pass 86 | 87 | @abstractmethod 88 | def expand_query(self, query: Query) -> QueryResult: 89 | """Turn the `query` into requests.""" 90 | pass 91 | 92 | @abstractmethod 93 | def make_request(self, auth: Authentication, request: Request, prompt_list: Dict[str, Any] = {}) -> RequestResult: 94 | """Actually make a request to an API.""" 95 | pass 96 | 97 | @abstractmethod 98 | def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult: 99 | """Tokenize via an API.""" 100 | pass 101 | 102 | @abstractmethod 103 | def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult: 104 | """Decodes to text.""" 105 | pass 106 | 107 | def is_toxicity_scoring_available(self) -> bool: 108 | """Whether toxicity score is available, i.e. whether the Perspective API key is set. 109 | Return: (is_available, error_message)""" 110 | return False 111 | 112 | @abstractmethod 113 | def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult: 114 | """Get toxicity scores for a batch of text. 
115 | Should only be called if `self.is_toxicity_scoring_available` is True.""" 116 | pass 117 | 118 | def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult: 119 | """Get responses to a critique request.""" 120 | pass 121 | 122 | @abstractmethod 123 | def create_account(self, auth: Authentication) -> Account: 124 | """Creates a new account.""" 125 | pass 126 | 127 | @abstractmethod 128 | def delete_account(self, auth: Authentication, api_key: str) -> Account: 129 | """Deletes an account.""" 130 | pass 131 | 132 | @abstractmethod 133 | def get_accounts(self, auth: Authentication) -> List[Account]: 134 | """Get list of accounts.""" 135 | pass 136 | 137 | @abstractmethod 138 | def get_account(self, auth: Authentication) -> Account: 139 | """Get information about an account.""" 140 | pass 141 | 142 | @abstractmethod 143 | def update_account(self, auth: Authentication, account: Account) -> Account: 144 | """Update account.""" 145 | pass 146 | 147 | @abstractmethod 148 | def rotate_api_key(self, auth: Authentication, account: Account) -> Account: 149 | """Generate a new API key for a given account.""" 150 | pass 151 | 152 | @abstractmethod 153 | def shutdown(self, auth: Authentication): 154 | """Shutdown server.""" 155 | pass 156 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/helm_updates/update_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script updates the helm module with AgentInstruct changes 4 | # should run the script from the top level of the AgentInstruct repo 5 | 6 | # linking our instructions to _latest 7 | python scripts/replicate.py 8 | 9 | # move benchmark output to top level with letter and coin data 10 | cp -r src/agentinstruct/reasoning/helm_updates/benchmark_output . 
11 | 12 | # creating prod_env at top level 13 | mkdir prod_env 14 | 15 | # creating credentials file 16 | touch prod_env/credentials.conf 17 | 18 | # removing helm/.github 19 | rm -rf src/agentinstruct/reasoning/helm/.github 20 | 21 | # removing helm/docs 22 | rm -rf src/agentinstruct/reasoning/helm/docs 23 | 24 | # added prompt dict to AdapterSpec class 25 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapter_spec.py src/agentinstruct/reasoning/helm/src/helm/benchmark/adaptation/adapter_spec.py 26 | 27 | # updating truncation in in_context_learning_adapter.py 28 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py src/agentinstruct/reasoning/helm/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py 29 | 30 | # add scenario imports to __init__.py 31 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/__init__.py src/agentinstruct/reasoning/helm/src/helm/benchmark/__init__.py 32 | 33 | # update the multiple_choice_joint_adapter 34 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py src/agentinstruct/reasoning/helm/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py 35 | 36 | # update executor.py with prompt_list 37 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/executor.py src/agentinstruct/reasoning/helm/src/helm/benchmark/executor.py 38 | 39 | # update basic_metrics.py to check for empty strings 40 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/metrics/basic_metrics.py src/agentinstruct/reasoning/helm/src/helm/benchmark/metrics/basic_metrics.py 41 | 42 | # handle --skip-expanders arg for zero-shot runs on run.py 43 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/run.py src/agentinstruct/reasoning/helm/src/helm/benchmark/run.py 44 | 45 | # update the run_expander with instruction expanders for agentinstruct and zeroshotcot 46 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/run_expander.py src/agentinstruct/reasoning/helm/src/helm/benchmark/run_expander.py 47 | 48 | # update the run_specs with new datasets 49 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/run_specs.py src/agentinstruct/reasoning/helm/src/helm/benchmark/run_specs.py 50 | 51 | # add addsub scenario 52 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/addsub_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 53 | 54 | # add aqua scenario 55 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/aqua_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 56 | 57 | # add big bench hard scenario 58 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/big_bench_hard_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 59 | 60 | # add coin scenario 61 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/coin_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 62 | 63 | # add commonsense_qa scenario 64 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/commonsense_qa_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 65 | 66 | # update gsm scenario 67 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/gsm_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios/gsm_scenario.py 68 |
69 | # add letter scenario 70 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/letter_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 71 | 72 | # add multi_arith_scenario 73 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/multi_arith_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 74 | 75 | # add singleeq scenario 76 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/singleeq_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 77 | 78 | # add svamp scenario 79 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/scenarios/svamp_scenario.py src/agentinstruct/reasoning/helm/src/helm/benchmark/scenarios 80 | 81 | # add llama window service 82 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_window_service.py src/agentinstruct/reasoning/helm/src/helm/benchmark/window_services 83 | 84 | # add llama-2 window service 85 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/llama_2_window_service.py src/agentinstruct/reasoning/helm/src/helm/benchmark/window_services 86 | 87 | # update window_service_factory.py with llama-2 88 | cp src/agentinstruct/reasoning/helm_updates/src/helm/benchmark/window_services/window_service_factory.py src/agentinstruct/reasoning/helm/src/helm/benchmark/window_services/window_service_factory.py 89 | 90 | # update dataset download procedure in general.py 91 | cp src/agentinstruct/reasoning/helm_updates/src/helm/common/general.py src/agentinstruct/reasoning/helm/src/helm/common/general.py 92 | 93 | # add full_text property to RequestResult class in order to store intermediate reasoning 94 | cp src/agentinstruct/reasoning/helm_updates/src/helm/common/request.py src/agentinstruct/reasoning/helm/src/helm/common/request.py 95 | 96 | # add local client to auto_client and pass through prompt_list 97 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/auto_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/auto_client.py 98 | 99 | # add prompt_list to abstractmethod make_request in client.py 100 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/client.py 101 | 102 | # add llama tokenizer to huggingface_tokenizer.py 103 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/huggingface_tokenizer.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/huggingface_tokenizer.py 104 | 105 | # update openai_client.py 106 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/openai_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/openai_client.py 107 | 108 | # add openai_automatic_prompt_tuning.py with agentinstruct process 109 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/openai_automatic_prompt_tuning.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/ 110 | 111 | # add local_client.py with agentinstruct process 112 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/local_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/ 113 | 114 | # update together_client.py with agentinstruct process 115 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/clients/together_client.py src/agentinstruct/reasoning/helm/src/helm/proxy/clients/together_client.py 116 | 117 | # add new models to models.py 118 | cp
src/agentinstruct/reasoning/helm_updates/src/helm/proxy/models.py src/agentinstruct/reasoning/helm/src/helm/proxy/models.py 119 | 120 | # pass prompt_list through server_service.py 121 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/services/server_service.py src/agentinstruct/reasoning/helm/src/helm/proxy/services/server_service.py 122 | 123 | # pass prompt_list through service.py abstractmethod 124 | cp src/agentinstruct/reasoning/helm_updates/src/helm/proxy/services/service.py src/agentinstruct/reasoning/helm/src/helm/proxy/services/service.py 125 | 126 | 127 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/README.md: -------------------------------------------------------------------------------- 1 | # AgentInstruct Serve API Setup Guide 2 | 3 | ### Installation 4 | Our design follows the TorchServe API. TorchServe is best run within its official Docker container. Here we focus on Llama-2-7b-chat; the process is identical for the other Llama-2-chat models (see [here](https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md) for recovering the vicuna-13b v1.1 weights). You can download Llama-2-7b-chat [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main?clone=true), which requires a HuggingFace access token with approved access to Llama-2. If you don't have git lfs, make sure to install it first (e.g., using apt-get). 5 | ``` 6 | git lfs install 7 | git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf 8 | ``` 9 | 10 | Before you begin, ensure you have cloned the AgentInstruct repository as specified in the main README. Then, in a new shell, run the following to start a Docker container with the TorchServe API: 11 | ``` 12 | docker pull pytorch/torchserve:0.8.2-gpu 13 | docker run --network=mynetwork --name=serve-container -v ~/agentinstruct:/code/agentinstruct -v ~/Llama-2-7b-chat-hf:/code/Llama-2-7b-chat-hf -u root -it --gpus all -p 8081:8081 -p 8082:8082 -p 8083:8083 pytorch/torchserve:0.8.2-gpu bash 14 | cd /code/agentinstruct/src/agentinstruct/reasoning/serve 15 | ``` 16 | This container requires CUDA >= 11.8. See [here](https://hub.docker.com/r/pytorch/torchserve/tags) for additional tags, or follow the guide [here](https://github.com/pytorch/serve/blob/v0.8.2/docker/README.md) to create an image well-suited for your system. 17 | 18 | The image comes preinstalled with TorchServe and the required dependencies (torch, JDK17, etc.). Additional model-specific packages should be put in `model_store/requirements.txt`, and will be installed when a model is assigned to workers. 19 | 20 | ### Set Up the API 21 | Let's walk through setting up the API to serve inference requests to llama-2-7b-chat, step by step. 22 | 23 | #### Generating Runtime File 24 | 25 | To generate a runtime file for a model, run: 26 | ``` 27 | torch-model-archiver --model-name llama-2-7b-chat --version 1.0 --handler custom_handler/llama-2-7b-chat-handler.py -r model_store/requirements.txt -f -c model_store/llama-2-7b-chat-config.yaml --archive-format tgz --export-path model_store 28 | ``` 29 | 30 | #### Starting Up the API 31 | ``` 32 | export TEMP=/tmp # or some existing directory with write access 33 | torchserve --start --ncs --ts-config model_store/config.properties 34 | ``` 35 | This starts the API, but does not register any models or load any workers.
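As a quick sanity check (assuming the management address from `model_store/config.properties`, i.e. `http://serve-container:8082`, is reachable from your shell), you can list the registered models via TorchServe's management API; right after startup the list should be empty: ``` curl http://serve-container:8082/models ```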
36 | 37 | #### Registering Model and Loading Workers 38 | To load 8 copies of llama-2-7b-chat, one per GPU, run: 39 | ``` 40 | curl -X POST "http://serve-container:8082/models?url=llama-2-7b-chat.tar.gz&initial_workers=8" 41 | ``` 42 | 43 | #### Sending Inference Requests 44 | Now you're ready to start sending inference requests to the model over serve-container:8081. The model `local/llama-2-7b-chat` in HELM will send requests to this API. You can now continue following the instructions in the main README starting from the "Replicating Main Results" section. 45 | 46 | #### Stopping the API 47 | ``` 48 | export TEMP=/tmp # must be the same directory used during startup 49 | torchserve --stop 50 | ``` 51 | 52 | See [here](https://pytorch.org/serve/management_api.html) for more information on managing the API. 53 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/config.properties: -------------------------------------------------------------------------------- 1 | inference_address=http://serve-container:8081 2 | management_address=http://serve-container:8082 3 | metrics_address=http://serve-container:8083 4 | default_workers_per_model=1 5 | install_py_dep_per_model=true 6 | max_response_size=655350000 7 | default_response_timeout=5000 8 | model_store=/code/agentinstruct/src/agentinstruct/reasoning/serve/model_store 9 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/llama-2-13b-chat-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/Llama-2-13b-chat-hf" 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/llama-2-70b-chat-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/Llama-2-70b-chat-hf" 6 | quantize: "nf4" 7 | num_gpu_per_model: 1 8 | per_gpu_mem: 48000000000 #48GB 9 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/llama-2-7b-chat-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/Llama-2-7b-chat-hf" 6 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.32.* 2 | accelerate 3 | sentencepiece 4 | protobuf==3.20.* 5 | bitsandbytes 6 | scipy 7 | -------------------------------------------------------------------------------- /src/agentinstruct/reasoning/serve/model_store/vicuna-13b-config.yaml: -------------------------------------------------------------------------------- 1 | responseTimeout: 5000 2 | torchrun: 3 | nproc-per-node: 1 4 | handler: 5 | model_path: "/code/vicuna-13b" 6 | --------------------------------------------------------------------------------